diff --git a/GL/matrix.c b/GL/matrix.c
index 275b607..ee71a94 100644
--- a/GL/matrix.c
+++ b/GL/matrix.c
@@ -49,8 +49,8 @@ void _glInitMatrices() {
     stack_push(&MATRIX_STACKS[1], IDENTITY);
     stack_push(&MATRIX_STACKS[2], IDENTITY);
 
-    FASTCPY4(NORMAL_MATRIX, IDENTITY, sizeof(Matrix4x4));
-    FASTCPY4(SCREENVIEW_MATRIX, IDENTITY, sizeof(Matrix4x4));
+    MEMCPY4(NORMAL_MATRIX, IDENTITY, sizeof(Matrix4x4));
+    MEMCPY4(SCREENVIEW_MATRIX, IDENTITY, sizeof(Matrix4x4));
 
     const VideoMode* vid_mode = GetVideoMode();
 
@@ -96,7 +96,7 @@ static void transpose(GLfloat* m) {
 }
 
 static void recalculateNormalMatrix() {
-    FASTCPY4(NORMAL_MATRIX, stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)), sizeof(Matrix4x4));
+    MEMCPY4(NORMAL_MATRIX, stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)), sizeof(Matrix4x4));
     inverse((GLfloat*) NORMAL_MATRIX);
     transpose((GLfloat*) NORMAL_MATRIX);
 }
@@ -290,7 +290,7 @@ void APIENTRY glFrustum(GLfloat left, GLfloat right,
 /* Multiply the current matrix by an arbitrary matrix */
 void glMultMatrixf(const GLfloat *m) {
     Matrix4x4 TEMP;
-    FASTCPY4(TEMP, m, sizeof(Matrix4x4));
+    MEMCPY4(TEMP, m, sizeof(Matrix4x4));
 
     UploadMatrix4x4(stack_top(MATRIX_STACKS + MATRIX_IDX));
     MultiplyMatrix4x4((const Matrix4x4*) &TEMP);
diff --git a/GL/platforms/sh4.h b/GL/platforms/sh4.h
index 0bc7dd6..a2e3446 100644
--- a/GL/platforms/sh4.h
+++ b/GL/platforms/sh4.h
@@ -10,11 +10,28 @@
 #include "../types.h"
 #include "sh4_math.h"
 
-#define FASTCPY(dst, src, bytes) \
-    (bytes % 32 == 0) ? sq_cpy(dst, src, bytes) : memcpy(dst, src, bytes)
+#ifndef NDEBUG
+#define PERF_WARNING() printf("[PERF] Unaligned data passed to glTexImage2D\n")
+#else
+#define PERF_WARNING() (void) 0
+#endif
 
-#define FASTCPY4(dst, src, bytes) \
-    (bytes % 32 == 0) ? sq_cpy(dst, src, bytes) : memcpy4(dst, src, bytes)
+
+/* Copy `bytes` from `src` to `dst`. sq_cpy is used only when both `src` and
+ * `bytes` are multiples of 32; we control `dst`, so its alignment is asserted.
+ * Anything else falls back to memcpy. Arguments are evaluated more than once. */
+#define FASTCPY(dst, src, bytes) \
+    do { \
+        if((bytes) % 32 == 0 && ((uintptr_t) (src)) % 32 == 0) { \
+            assert(((uintptr_t) (dst)) % 32 == 0); \
+            sq_cpy(dst, src, bytes); \
+        } else { \
+            PERF_WARNING(); \
+            memcpy(dst, src, bytes); \
+        } \
+    } while(0)
+
+#define MEMCPY4(dst, src, bytes) memcpy4(dst, src, bytes)
 
 #define MEMSET4(dst, v, size) memset4((dst), (v), (size))
 
diff --git a/GL/platforms/software.h b/GL/platforms/software.h
index 5bac618..47fa9a6 100644
--- a/GL/platforms/software.h
+++ b/GL/platforms/software.h
@@ -12,7 +12,9 @@
 #define MATH_Fast_Invert(x) (1.0f / (x))
 
 #define FASTCPY(dst, src, bytes) memcpy(dst, src, bytes)
-#define FASTCPY4(dst, src, bytes) memcpy(dst, src, bytes)
+#define MEMCPY(dst, src, bytes) memcpy(dst, src, bytes)
+#define MEMCPY4(dst, src, bytes) memcpy(dst, src, bytes)
+
 #define MEMSET4(dst, v, size) memset((dst), (v), (size))
 
 #define VEC3_NORMALIZE(x, y, z) \
diff --git a/GL/state.c b/GL/state.c
index 908a282..020508d 100644
--- a/GL/state.c
+++ b/GL/state.c
@@ -702,10 +702,10 @@ void APIENTRY glGetBooleanv(GLenum pname, GLboolean* params) {
 void APIENTRY glGetFloatv(GLenum pname, GLfloat* params) {
     switch(pname) {
         case GL_PROJECTION_MATRIX:
-            FASTCPY4(params, _glGetProjectionMatrix(), sizeof(float) * 16);
+            MEMCPY4(params, _glGetProjectionMatrix(), sizeof(float) * 16);
         break;
         case GL_MODELVIEW_MATRIX:
-            FASTCPY4(params, _glGetModelViewMatrix(), sizeof(float) * 16);
+            MEMCPY4(params, _glGetModelViewMatrix(), sizeof(float) * 16);
         break;
         case GL_POLYGON_OFFSET_FACTOR:
             *params = OFFSET_FACTOR;