Faster memcpy

Luke Benstead 2023-03-06 14:05:14 +00:00
parent be4c1bc14c
commit c195d471e1


@@ -18,9 +18,48 @@ static inline void* memalign(size_t alignment, size_t size) {
#include <malloc.h>
#endif
#ifdef __cplusplus
#define AV_FORCE_INLINE static inline
#else
#define AV_NO_INSTRUMENT inline __attribute__((no_instrument_function))
#define AV_INLINE_DEBUG AV_NO_INSTRUMENT __attribute__((always_inline))
#define AV_FORCE_INLINE static AV_INLINE_DEBUG
#endif
#ifdef __DREAMCAST__
#include <kos/string.h>
#define AV_MEMCPY4 memcpy4
AV_FORCE_INLINE void *AV_MEMCPY4(void *dest, const void *src, size_t len)
{
    if(!len)
    {
        return dest;
    }

    const uint8_t *s = (uint8_t *)src;
    uint8_t *d = (uint8_t *)dest;

    uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because input gets incremented before output is calculated
    // Underflow would be like adding a negative offset

    // Can use 'd' as a scratch reg now
    asm volatile (
        "clrs\n" // Align for parallelism (CO) - SH4a use "stc SR, Rn" instead with a dummy Rn
        ".align 2\n"
        "0:\n\t"
        "dt %[size]\n\t" // (--len) ? 0 -> T : 1 -> T (EX 1)
        "mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2)
        "bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2)
        " mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*) ((char*)s + diff) = scratch, where src + diff = dest (LS 1)
        : [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs
        : [offset] "z" (diff) // inputs
        : "t", "memory" // clobbers
    );

    return dest;
}
#else
#define AV_MEMCPY4 memcpy
#endif
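
Note on the SH4 loop above (not part of the commit): it copies one byte per iteration while advancing only the source pointer; the destination address is recovered by adding a fixed offset (dest minus the already-incremented source), which is why diff is computed against s + 1 and passed through the R0 ("z") operand for @(R0, Rn) addressing. A minimal plain-C sketch of the same single-induction-variable idea follows; the function name is hypothetical.

/* Hedged sketch: plain-C equivalent of the SH4 byte-copy loop above.
 * Only the source pointer is incremented; the destination byte is reached
 * via a fixed offset, mirroring mov.b @%[in]+ / mov.b %[scratch], @(%[offset], %[in]). */
#include <stddef.h>
#include <stdint.h>

static void *memcpy_offset_sketch(void *dest, const void *src, size_t len) {
    const uint8_t *s = (const uint8_t *)src;
    uint8_t *d = (uint8_t *)dest;
    ptrdiff_t diff = d - (s + 1); /* offset from the post-incremented source */

    while (len--) {
        uint8_t scratch = *s++;            /* mov.b @%[in]+, %[scratch]              */
        *(uint8_t *)(s + diff) = scratch;  /* mov.b %[scratch], @(%[offset], %[in])  */
    }
    return dest;
}
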
@@ -34,13 +73,6 @@ typedef struct {
#define ALIGNED_VECTOR_CHUNK_SIZE 256u
#ifdef __cplusplus
#define AV_FORCE_INLINE static inline
#else
#define AV_NO_INSTRUMENT inline __attribute__((no_instrument_function))
#define AV_INLINE_DEBUG AV_NO_INSTRUMENT __attribute__((always_inline))
#define AV_FORCE_INLINE static AV_INLINE_DEBUG
#endif
#define ROUND_TO_CHUNK_SIZE(v) \
    ((((v) + ALIGNED_VECTOR_CHUNK_SIZE - 1) / ALIGNED_VECTOR_CHUNK_SIZE) * ALIGNED_VECTOR_CHUNK_SIZE)
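
For context, ROUND_TO_CHUNK_SIZE (unchanged by this commit) rounds a byte count up to the next multiple of ALIGNED_VECTOR_CHUNK_SIZE (256). A small standalone check, not part of the commit:

/* Hedged illustration of the rounding macro shown in the diff context above. */
#include <assert.h>

#define ALIGNED_VECTOR_CHUNK_SIZE 256u
#define ROUND_TO_CHUNK_SIZE(v) \
    ((((v) + ALIGNED_VECTOR_CHUNK_SIZE - 1) / ALIGNED_VECTOR_CHUNK_SIZE) * ALIGNED_VECTOR_CHUNK_SIZE)

int main(void) {
    assert(ROUND_TO_CHUNK_SIZE(1)   == 256u);
    assert(ROUND_TO_CHUNK_SIZE(256) == 256u); /* exact multiples are unchanged */
    assert(ROUND_TO_CHUNK_SIZE(300) == 512u); /* (300 + 255) / 256 = 2 chunks  */
    return 0;
}
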