Use Moops memcpy
This commit is contained in:
parent
b08bbebf12
commit
f0a3465486
|
@ -26,6 +26,34 @@
|
|||
|
||||
#define PREFETCH(addr) __asm__("pref @%0" : : "r"((addr)))
|
||||
|
||||
GL_FORCE_INLINE void* memcpy_fast(void *dest, const void *src, size_t len) {
|
||||
if(!len) {
|
||||
return dest;
|
||||
}
|
||||
|
||||
const uint8_t *s = (uint8_t *)src;
|
||||
uint8_t *d = (uint8_t *)dest;
|
||||
|
||||
uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because input gets incremented before output is calculated
|
||||
// Underflow would be like adding a negative offset
|
||||
|
||||
// Can use 'd' as a scratch reg now
|
||||
asm volatile (
|
||||
"clrs\n" // Align for parallelism (CO) - SH4a use "stc SR, Rn" instead with a dummy Rn
|
||||
".align 2\n"
|
||||
"0:\n\t"
|
||||
"dt %[size]\n\t" // (--len) ? 0 -> T : 1 -> T (EX 1)
|
||||
"mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2)
|
||||
"bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2)
|
||||
" mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*) ((char*)s + diff) = scratch, where src + diff = dest (LS 1)
|
||||
: [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs
|
||||
: [offset] "z" (diff) // inputs
|
||||
: "t", "memory" // clobbers
|
||||
);
|
||||
|
||||
return dest;
|
||||
}
|
||||
|
||||
/* We use sq_cpy if the src and size is properly aligned. We control that the
|
||||
* destination is properly aligned so we assert that. */
|
||||
#define FASTCPY(dst, src, bytes) \
|
||||
|
@ -34,11 +62,12 @@
|
|||
gl_assert(((uintptr_t) dst) % 32 == 0); \
|
||||
sq_cpy(dst, src, bytes); \
|
||||
} else { \
|
||||
memcpy(dst, src, bytes); \
|
||||
memcpy_fast(dst, src, bytes); \
|
||||
} \
|
||||
} while(0)
|
||||
|
||||
#define MEMCPY4(dst, src, bytes) memcpy4(dst, src, bytes)
|
||||
|
||||
#define MEMCPY4(dst, src, bytes) memcpy_fast(dst, src, bytes)
|
||||
|
||||
#define MEMSET4(dst, v, size) memset4((dst), (v), (size))
|
||||
|
||||
|
|
|
@ -13,4 +13,4 @@ typedef struct {
|
|||
* but we're not using that for now, so having W here makes the code
|
||||
* simpler */
|
||||
float w;
|
||||
} Vertex;
|
||||
} __attribute__ ((aligned (32))) Vertex;
|
||||
|
|
Loading…
Reference in New Issue
Block a user