diff --git a/containers/aligned_vector.h b/containers/aligned_vector.h
index c9d1b53..3400fe1 100644
--- a/containers/aligned_vector.h
+++ b/containers/aligned_vector.h
@@ -18,9 +18,48 @@ static inline void* memalign(size_t alignment, size_t size) {
 #include <malloc.h>
 #endif
 
+#ifdef __cplusplus
+#define AV_FORCE_INLINE static inline
+#else
+#define AV_NO_INSTRUMENT inline __attribute__((no_instrument_function))
+#define AV_INLINE_DEBUG AV_NO_INSTRUMENT __attribute__((always_inline))
+#define AV_FORCE_INLINE static AV_INLINE_DEBUG
+#endif
+
+
 #ifdef __DREAMCAST__
 #include <kos/string.h>
-#define AV_MEMCPY4 memcpy4
+
+AV_FORCE_INLINE void *AV_MEMCPY4(void *dest, const void *src, size_t len)
+{
+    if(!len)
+    {
+        return dest;
+    }
+
+    const uint8_t *s = (uint8_t *)src;
+    uint8_t *d = (uint8_t *)dest;
+
+    uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because input gets incremented before output is calculated
+    // Underflow would be like adding a negative offset
+
+    // Can use 'd' as a scratch reg now
+    asm volatile (
+        "clrs\n" // Align for parallelism (CO) - SH4a use "stc SR, Rn" instead with a dummy Rn
+        ".align 2\n"
+        "0:\n\t"
+        "dt %[size]\n\t" // (--len) ? 0 -> T : 1 -> T (EX 1)
+        "mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2)
+        "bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2)
+        " mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*) ((char*)s + diff) = scratch, where src + diff = dest (LS 1)
+        : [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs
+        : [offset] "z" (diff) // inputs
+        : "t", "memory" // clobbers
+    );
+
+    return dest;
+}
+
 #else
 #define AV_MEMCPY4 memcpy
 #endif
@@ -34,13 +73,6 @@ typedef struct {
 
 #define ALIGNED_VECTOR_CHUNK_SIZE 256u
 
-#ifdef __cplusplus
-#define AV_FORCE_INLINE static inline
-#else
-#define AV_NO_INSTRUMENT inline __attribute__((no_instrument_function))
-#define AV_INLINE_DEBUG AV_NO_INSTRUMENT __attribute__((always_inline))
-#define AV_FORCE_INLINE static AV_INLINE_DEBUG
-#endif
 
 #define ROUND_TO_CHUNK_SIZE(v) \
     ((((v) + ALIGNED_VECTOR_CHUNK_SIZE - 1) / ALIGNED_VECTOR_CHUNK_SIZE) * ALIGNED_VECTOR_CHUNK_SIZE)
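
Note (not part of the patch): a minimal portable C sketch of what the SH4 loop above does, using a hypothetical helper name av_memcpy_ref. It mirrors the asm one byte per iteration: a post-increment load, then a store through the fixed dest-minus-(src+1) offset kept in R0 (the "z" constraint). Because bf.s always executes its delay slot, the store still runs on the final pass after dt clears the T bit, so the last byte is written.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical reference implementation; this function and its name are
 * illustrative only and do not appear in the patch. */
static void *av_memcpy_ref(void *dest, const void *src, size_t len)
{
    if(!len)
    {
        return dest;
    }

    const uint8_t *s = (const uint8_t *)src;
    uint8_t *d = (uint8_t *)dest;

    /* Same trick as the asm: offset from the already-incremented source
     * pointer to the destination, which frees 'd' up as a scratch value. */
    uintptr_t diff = (uintptr_t)d - (uintptr_t)(s + 1);

    do {
        uint8_t scratch = *s++;                       /* mov.b @%[in]+, %[scratch] */
        *(uint8_t *)((uintptr_t)s + diff) = scratch;  /* mov.b %[scratch], @(%[offset], %[in]) */
    } while(--len);                                   /* dt %[size] / bf.s 0b */

    return dest;
}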