diff --git a/GL/matrix.c b/GL/matrix.c
index 275b607..ee71a94 100644
--- a/GL/matrix.c
+++ b/GL/matrix.c
@@ -49,8 +49,8 @@ void _glInitMatrices() {
     stack_push(&MATRIX_STACKS[1], IDENTITY);
     stack_push(&MATRIX_STACKS[2], IDENTITY);
 
-    FASTCPY4(NORMAL_MATRIX, IDENTITY, sizeof(Matrix4x4));
-    FASTCPY4(SCREENVIEW_MATRIX, IDENTITY, sizeof(Matrix4x4));
+    MEMCPY4(NORMAL_MATRIX, IDENTITY, sizeof(Matrix4x4));
+    MEMCPY4(SCREENVIEW_MATRIX, IDENTITY, sizeof(Matrix4x4));
 
     const VideoMode* vid_mode = GetVideoMode();
 
@@ -96,7 +96,7 @@ static void transpose(GLfloat* m) {
 }
 
 static void recalculateNormalMatrix() {
-    FASTCPY4(NORMAL_MATRIX, stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)), sizeof(Matrix4x4));
+    MEMCPY4(NORMAL_MATRIX, stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)), sizeof(Matrix4x4));
     inverse((GLfloat*) NORMAL_MATRIX);
     transpose((GLfloat*) NORMAL_MATRIX);
 }
@@ -290,7 +290,7 @@ void APIENTRY glFrustum(GLfloat left, GLfloat right,
 /* Multiply the current matrix by an arbitrary matrix */
 void glMultMatrixf(const GLfloat *m) {
     Matrix4x4 TEMP;
-    FASTCPY4(TEMP, m, sizeof(Matrix4x4));
+    MEMCPY4(TEMP, m, sizeof(Matrix4x4));
 
     UploadMatrix4x4(stack_top(MATRIX_STACKS + MATRIX_IDX));
     MultiplyMatrix4x4((const Matrix4x4*) &TEMP);
diff --git a/GL/platforms/sh4.h b/GL/platforms/sh4.h
index 0bc7dd6..a2e3446 100644
--- a/GL/platforms/sh4.h
+++ b/GL/platforms/sh4.h
@@ -10,11 +10,27 @@
 #include "../types.h"
 #include "sh4_math.h"
 
-#define FASTCPY(dst, src, bytes) \
-    (bytes % 32 == 0) ? sq_cpy(dst, src, bytes) : memcpy(dst, src, bytes)
+#ifndef NDEBUG
+#define PERF_WARNING() printf("[PERF] Unaligned data passed to glTexImage2D\n")
+#else
+#define PERF_WARNING() (void) 0
+#endif
 
-#define FASTCPY4(dst, src, bytes) \
-    (bytes % 32 == 0) ? sq_cpy(dst, src, bytes) : memcpy4(dst, src, bytes)
+
+/* Copy via sq_cpy when src is 32-byte aligned and bytes is a multiple of 32 (dst is ours, so its
+ * alignment is only asserted); otherwise memcpy. Arguments may be evaluated more than once. */
+#define FASTCPY(dst, src, bytes) \
+    do { \
+        if((bytes) % 32 == 0 && ((uintptr_t) (src)) % 32 == 0) { \
+            assert(((uintptr_t) (dst)) % 32 == 0); \
+            sq_cpy((dst), (src), (bytes)); \
+        } else { \
+            PERF_WARNING(); \
+            memcpy((dst), (src), (bytes)); \
+        } \
+    } while(0)
+
+#define MEMCPY4(dst, src, bytes) memcpy4(dst, src, bytes)
 
 #define MEMSET4(dst, v, size) memset4((dst), (v), (size))
 
diff --git a/GL/platforms/software.h b/GL/platforms/software.h
index 5bac618..47fa9a6 100644
--- a/GL/platforms/software.h
+++ b/GL/platforms/software.h
@@ -12,7 +12,9 @@
 #define MATH_Fast_Invert(x) (1.0f / (x))
 
 #define FASTCPY(dst, src, bytes) memcpy(dst, src, bytes)
-#define FASTCPY4(dst, src, bytes) memcpy(dst, src, bytes)
+#define MEMCPY(dst, src, bytes) memcpy(dst, src, bytes)
+#define MEMCPY4(dst, src, bytes) memcpy(dst, src, bytes)
+
 #define MEMSET4(dst, v, size) memset((dst), (v), (size))
 
 #define VEC3_NORMALIZE(x, y, z) \
diff --git a/GL/state.c b/GL/state.c
index 908a282..020508d 100644
--- a/GL/state.c
+++ b/GL/state.c
@@ -702,10 +702,10 @@ void APIENTRY glGetBooleanv(GLenum pname, GLboolean* params) {
 void APIENTRY glGetFloatv(GLenum pname, GLfloat* params) {
     switch(pname) {
         case GL_PROJECTION_MATRIX:
-            FASTCPY4(params, _glGetProjectionMatrix(), sizeof(float) * 16);
+            MEMCPY4(params, _glGetProjectionMatrix(), sizeof(float) * 16);
         break;
         case GL_MODELVIEW_MATRIX:
-            FASTCPY4(params, _glGetModelViewMatrix(), sizeof(float) * 16);
+            MEMCPY4(params, _glGetModelViewMatrix(), sizeof(float) * 16);
         break;
         case GL_POLYGON_OFFSET_FACTOR:
             *params = OFFSET_FACTOR;