Use Moop's memcpy

Luke Benstead 2023-03-05 21:16:12 +00:00
parent b08bbebf12
commit f0a3465486
2 changed files with 32 additions and 3 deletions


@@ -26,6 +26,34 @@
 #define PREFETCH(addr) __asm__("pref @%0" : : "r"((addr)))
 
+GL_FORCE_INLINE void* memcpy_fast(void *dest, const void *src, size_t len) {
+    if(!len) {
+        return dest;
+    }
+
+    const uint8_t *s = (const uint8_t *)src;
+    uint8_t *d = (uint8_t *)dest;
+
+    uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because the input pointer is incremented before the output address is calculated
+    // Underflow would be like adding a negative offset
+
+    // Can use 'd' as a scratch reg now
+    asm volatile (
+        "clrs\n" // Align for parallelism (CO) - on SH4a use "stc SR, Rn" with a dummy Rn instead
+        ".align 2\n"
+        "0:\n\t"
+        "dt %[size]\n\t" // (--len) ? 0 -> T : 1 -> T (EX 1)
+        "mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2)
+        "bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2)
+        " mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*)((char*)s + diff) = scratch, where src + diff = dest (LS 1)
+        : [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs
+        : [offset] "z" (diff) // inputs
+        : "t", "memory" // clobbers
+    );
+
+    return dest;
+}
+
 /* We use sq_cpy if the src and size are properly aligned. We control the
  * destination's alignment ourselves, so we assert it. */
 #define FASTCPY(dst, src, bytes) \
@@ -34,11 +62,12 @@
             gl_assert(((uintptr_t) dst) % 32 == 0); \
             sq_cpy(dst, src, bytes); \
         } else { \
-            memcpy(dst, src, bytes); \
+            memcpy_fast(dst, src, bytes); \
         } \
     } while(0)
 
-#define MEMCPY4(dst, src, bytes) memcpy4(dst, src, bytes)
+#define MEMCPY4(dst, src, bytes) memcpy_fast(dst, src, bytes)
 
 #define MEMSET4(dst, v, size) memset4((dst), (v), (size))
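
For reference, a plain-C sketch of what the assembly loop above computes. This is illustrative only, not part of the commit (the function name is made up, and it assumes 32-bit pointers as on the SH4). The trick is that only the source pointer is incremented: every store rebuilds the destination address as source + diff, and diff is precomputed as dest - (src + 1) because the source pointer has already been post-incremented by the time the store address is formed.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical C equivalent of the asm loop in memcpy_fast */
static void *memcpy_fast_sketch(void *dest, const void *src, size_t len) {
    if(!len) {
        return dest;
    }

    const uint8_t *s = (const uint8_t *)src;
    /* dest - (src + 1): the +1 cancels the post-increment of s below */
    uint32_t diff = (uint32_t)dest - ((uint32_t)src + 1);

    while(len--) {                                  /* dt %[size]                */
        uint8_t scratch = *s++;                     /* mov.b @%[in]+, %[scratch] */
        *(uint8_t *)((uint32_t)s + diff) = scratch; /* mov.b %[scratch], @(...)  */
    }

    return dest;
}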
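
The test that picks between the two FASTCPY paths falls between the two hunks, so it is not visible here. A hypothetical sketch of the dispatch the comment describes follows; the condition shown is an assumption, not the commit's code.

/* Hypothetical expansion of FASTCPY's dispatch: the store-queue path
 * wants a size that is a multiple of 32 and a word-aligned source;
 * anything else falls back to memcpy_fast. (uintptr_t from <stdint.h>) */
#define FASTCPY_SKETCH(dst, src, bytes) \
    do { \
        if((((uintptr_t)(src)) % 4 == 0) && ((bytes) % 32 == 0)) { \
            gl_assert(((uintptr_t)(dst)) % 32 == 0); \
            sq_cpy((dst), (src), (bytes)); \
        } else { \
            memcpy_fast((dst), (src), (bytes)); \
        } \
    } while(0)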


@@ -13,4 +13,4 @@ typedef struct {
      * but we're not using that for now, so having W here makes the code
      * simpler */
     float w;
-} Vertex;
+} __attribute__ ((aligned (32))) Vertex;
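
The new aligned(32) on Vertex ties into the FASTCPY path above: SH4 store-queue copies move whole 32-byte lines, and the macro asserts a 32-byte-aligned destination. A minimal sketch of what the attribute guarantees; the type name is illustrative, not from the commit.

/* Stand-in for the real Vertex, which has more members */
typedef struct {
    float w;
} __attribute__ ((aligned (32))) aligned_example_t;

/* aligned(32) rounds both the address and the padded sizeof up to a
 * 32-byte boundary, so arrays of the type stay store-queue friendly */
_Static_assert(_Alignof(aligned_example_t) == 32, "starts on an SQ line");
_Static_assert(sizeof(aligned_example_t) == 32, "padded to one SQ line");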