Use Moop's memcpy

Luke Benstead 2023-03-05 21:16:12 +00:00
parent b08bbebf12
commit f0a3465486
2 changed files with 32 additions and 3 deletions


@@ -26,6 +26,34 @@
 #define PREFETCH(addr) __asm__("pref @%0" : : "r"((addr)))
 
+GL_FORCE_INLINE void* memcpy_fast(void *dest, const void *src, size_t len) {
+    if(!len) {
+        return dest;
+    }
+
+    const uint8_t *s = (const uint8_t *)src;
+    uint8_t *d = (uint8_t *)dest;
+
+    uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because the input pointer is incremented before the output address is calculated
+    // Underflow would be like adding a negative offset
+
+    // Can use 'd' as a scratch reg now
+    asm volatile (
+        "clrs\n" // Align for parallelism (CO) - on SH4a use "stc SR, Rn" with a dummy Rn instead
+        ".align 2\n"
+        "0:\n\t"
+        "dt %[size]\n\t" // (--len) ? 0 -> T : 1 -> T (EX 1)
+        "mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2)
+        "bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2)
+        " mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*)((char*)s + diff) = scratch, where src + diff = dest (LS 1)
+        : [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs
+        : [offset] "z" (diff) // inputs
+        : "t", "memory" // clobbers
+    );
+
+    return dest;
+}
+
 /* We use sq_cpy if the src and size are properly aligned. We control the
  * destination's alignment ourselves, so we assert it. */
 #define FASTCPY(dst, src, bytes) \
@@ -34,11 +62,12 @@
             gl_assert(((uintptr_t) dst) % 32 == 0); \
             sq_cpy(dst, src, bytes); \
         } else { \
-            memcpy(dst, src, bytes); \
+            memcpy_fast(dst, src, bytes); \
         } \
     } while(0)
 
-#define MEMCPY4(dst, src, bytes) memcpy4(dst, src, bytes)
+#define MEMCPY4(dst, src, bytes) memcpy_fast(dst, src, bytes)
 
 #define MEMSET4(dst, v, size) memset4((dst), (v), (size))
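
For reference, a plain-C sketch of what the assembly loop above computes. This is illustrative only, not part of the commit (the function name is made up, and it assumes 32-bit pointers as on the SH4). The trick is that only the source pointer is incremented: every store rebuilds the destination address as source + diff, and diff is precomputed as dest - (src + 1) because the source pointer has already been post-incremented by the time the store address is formed.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical C equivalent of the asm loop in memcpy_fast */
static void *memcpy_fast_sketch(void *dest, const void *src, size_t len) {
    if(!len) {
        return dest;
    }

    const uint8_t *s = (const uint8_t *)src;
    /* dest - (src + 1): the +1 cancels the post-increment of s below */
    uint32_t diff = (uint32_t)dest - ((uint32_t)src + 1);

    while(len--) {                                  /* dt %[size]                */
        uint8_t scratch = *s++;                     /* mov.b @%[in]+, %[scratch] */
        *(uint8_t *)((uint32_t)s + diff) = scratch; /* mov.b %[scratch], @(...)  */
    }

    return dest;
}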
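
The test that picks between the two FASTCPY paths falls between the two hunks, so it is not visible here. A hypothetical sketch of the dispatch the comment describes follows; the condition shown is an assumption, not the commit's code.

/* Hypothetical expansion of FASTCPY's dispatch: the store-queue path
 * wants a size that is a multiple of 32 and a word-aligned source;
 * anything else falls back to memcpy_fast. (uintptr_t from <stdint.h>) */
#define FASTCPY_SKETCH(dst, src, bytes) \
    do { \
        if((((uintptr_t)(src)) % 4 == 0) && ((bytes) % 32 == 0)) { \
            gl_assert(((uintptr_t)(dst)) % 32 == 0); \
            sq_cpy((dst), (src), (bytes)); \
        } else { \
            memcpy_fast((dst), (src), (bytes)); \
        } \
    } while(0)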


@@ -13,4 +13,4 @@ typedef struct {
      * but we're not using that for now, so having W here makes the code
      * simpler */
     float w;
-} Vertex;
+} __attribute__ ((aligned (32))) Vertex;
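
The new aligned(32) on Vertex ties into the FASTCPY path above: SH4 store-queue copies move whole 32-byte lines, and the macro asserts a 32-byte-aligned destination. A minimal sketch of what the attribute guarantees; the type name is illustrative, not from the commit.

/* Stand-in for the real Vertex, which has more members */
typedef struct {
    float w;
} __attribute__ ((aligned (32))) aligned_example_t;

/* aligned(32) rounds both the address and the padded sizeof up to a
 * 32-byte boundary, so arrays of the type stay store-queue friendly */
_Static_assert(_Alignof(aligned_example_t) == 32, "starts on an SQ line");
_Static_assert(sizeof(aligned_example_t) == 32, "padded to one SQ line");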