From f0a3465486b1508a874885ef6906dbd11ed6be06 Mon Sep 17 00:00:00 2001
From: Luke Benstead
Date: Sun, 5 Mar 2023 21:16:12 +0000
Subject: [PATCH] Use Moops memcpy

---
Review notes (this text sits after the "---" separator and before the first
"diff --git" line, i.e. outside the commit message):

* GL/platforms/sh4.h gains memcpy_fast(dest, src, len). It returns dest
  immediately when len is 0. Otherwise an inline-asm loop copies one byte per
  iteration: "dt" counts len down, "mov.b @in+" loads a byte and
  post-increments the source pointer, and the second "mov.b" stores that byte
  at (source pointer + diff), where diff = dest - (src + 1) compensates for
  the post-increment. dest is returned unchanged.
* The else branch of FASTCPY now calls memcpy_fast instead of memcpy, and
  MEMCPY4 now expands to memcpy_fast instead of memcpy4.
* GL/types.h adds __attribute__ ((aligned (32))) to the Vertex typedef.

NOTE(review): MEMCPY4 used to map to memcpy4 (presumably a word-granular
copy - confirm); the replacement moves single bytes. Verify that this
throughput trade-off is intended.
NOTE(review): PREFETCH in this header spells the keyword __asm__, whereas
memcpy_fast uses plain asm and passes cast expressions as asm output
operands. Confirm that the project's compiler and C dialect accept both.
NOTE(review): the loop always copies front to back; overlapping
source/destination ranges get no special handling.
NOTE(review): leading whitespace on the hunk lines below could not be checked
against the target files; if context fails to match, try
"git apply --ignore-whitespace".

 GL/platforms/sh4.h | 33 +++++++++++++++++++++++++++++++--
 GL/types.h         |  2 +-
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/GL/platforms/sh4.h b/GL/platforms/sh4.h
index 87f0f91..6bc89c1 100644
--- a/GL/platforms/sh4.h
+++ b/GL/platforms/sh4.h
@@ -26,6 +26,34 @@
 
 #define PREFETCH(addr) __asm__("pref @%0" : : "r"((addr)))
 
+GL_FORCE_INLINE void* memcpy_fast(void *dest, const void *src, size_t len) {
+    if(!len) {
+        return dest;
+    }
+
+    const uint8_t *s = (uint8_t *)src;
+    uint8_t *d = (uint8_t *)dest;
+
+    uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because input gets incremented before output is calculated
+    // Underflow would be like adding a negative offset
+
+    // Can use 'd' as a scratch reg now
+    asm volatile (
+        "clrs\n" // Align for parallelism (CO) - SH4a use "stc SR, Rn" instead with a dummy Rn
+    ".align 2\n"
+    "0:\n\t"
+        "dt %[size]\n\t" // (--len) ? 0 -> T : 1 -> T (EX 1)
+        "mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2)
+        "bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2)
+        " mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*) ((char*)s + diff) = scratch, where src + diff = dest (LS 1)
+        : [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs
+        : [offset] "z" (diff) // inputs
+        : "t", "memory" // clobbers
+    );
+
+    return dest;
+}
+
 /* We use sq_cpy if the src and size is properly aligned. We control that the
  * destination is properly aligned so we assert that. */
 #define FASTCPY(dst, src, bytes) \
@@ -34,11 +62,12 @@
             gl_assert(((uintptr_t) dst) % 32 == 0); \
             sq_cpy(dst, src, bytes); \
         } else { \
-            memcpy(dst, src, bytes); \
+            memcpy_fast(dst, src, bytes); \
         } \
     } while(0)
 
-#define MEMCPY4(dst, src, bytes) memcpy4(dst, src, bytes)
+
+#define MEMCPY4(dst, src, bytes) memcpy_fast(dst, src, bytes)
 #define MEMSET4(dst, v, size) memset4((dst), (v), (size))
 
 
diff --git a/GL/types.h b/GL/types.h
index 99758e2..85df8ba 100644
--- a/GL/types.h
+++ b/GL/types.h
@@ -13,4 +13,4 @@ typedef struct {
      * but we're not using that for now, so having W here makes the code
      * simpler */
     float w;
-} Vertex;
+} __attribute__ ((aligned (32))) Vertex;