From c222c23ae3b4ee78a88f6d69aa0724d9573a313e Mon Sep 17 00:00:00 2001
From: Hayden Kowalchuk <hayden@hkowsoftware.com>
Date: Fri, 4 Oct 2019 21:32:54 -0400
Subject: [PATCH 1/2] fixes: rollup of minor stuff

---
 GL/clip.c                   |  5 +--
 GL/draw.c                   | 49 +++++++++++++++--------------
 GL/matrix.c                 |  8 ++---
 GL/private.h                | 62 +++++--------------------------------
 GL/texture.c                | 10 ++----
 Makefile                    |  2 +-
 containers/aligned_vector.c | 15 +++++++--
 7 files changed, 54 insertions(+), 97 deletions(-)

diff --git a/GL/clip.c b/GL/clip.c
index c140c01..e4a8307 100644
--- a/GL/clip.c
+++ b/GL/clip.c
@@ -91,9 +91,6 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission
     const Vertex* vertices = triangle->vertex;
     const VertexExtra* extras = triangle->extra;
 
-    /* Used when flat shading is enabled */
-    uint32_t finalColour = *((uint32_t*) vertices[2].bgra);
-
     for(i = 0; i < 4; ++i) {
         uint8_t thisIndex = (i == 3) ? 0 : i;
 
@@ -123,7 +120,7 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission
                 interpolateVec2(ve1->st, ve2->st, t, veNext.st);
 
                 if(flatShade) {
-                    *((uint32_t*) next.bgra) = finalColour;
+                    *((uint32_t*) next.bgra) = *((uint32_t*) vertices[2].bgra);
                 } else {
                     interpolateColour(v1->bgra, v2->bgra, t, next.bgra);
                 }
diff --git a/GL/draw.c b/GL/draw.c
index cfba2f8..68331f4 100644
--- a/GL/draw.c
+++ b/GL/draw.c
@@ -798,31 +798,34 @@ static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei
         
         sq_cpy(start, VERTEX_POINTER.ptr - ( sizeof(unsigned int) * 1 ), count * sizeof(Vertex) );
 
-        ITERATE(count) {
-            it->flags = PVR_CMD_VERTEX;
-            ++it;
-        }
+        if(start->flags == 0){
 
-        profiler_checkpoint("flags");
+            ITERATE(count) {
+                it->flags = PVR_CMD_VERTEX;
+                ++it;
+            }
 
-        // Drawing arrays
-        switch(mode) {
-        case GL_TRIANGLES:
-            genTriangles(start, count);
-            break;
-        case GL_QUADS:
-            genQuads(start, count);
-            break;
-        case GL_TRIANGLE_FAN:
-            genTriangleFan(start, count);
-            break;
-        case GL_TRIANGLE_STRIP:
-            genTriangleStrip(_glSubmissionTargetStart(target), count);
-            break;
-        default:
-            printf("mode: 0x%08x\n", mode);
-            fflush(stdout);
-            assert(0 && "Not Implemented");
+            profiler_checkpoint("flags");
+
+            // Drawing arrays
+            switch(mode) {
+            case GL_TRIANGLES:
+                genTriangles(start, count);
+                break;
+            case GL_QUADS:
+                genQuads(start, count);
+                break;
+            case GL_TRIANGLE_FAN:
+                genTriangleFan(start, count);
+                break;
+            case GL_TRIANGLE_STRIP:
+                genTriangleStrip(_glSubmissionTargetStart(target), count);
+                break;
+            default:
+                printf("mode: 0x%08x\n", mode);
+                fflush(stdout);
+                assert(0 && "Not Implemented");
+            }
         }
 
         profiler_checkpoint("quads");
diff --git a/GL/matrix.c b/GL/matrix.c
index 28b8177..75b1e56 100644
--- a/GL/matrix.c
+++ b/GL/matrix.c
@@ -13,7 +13,7 @@
 #define DEG2RAD (0.01745329251994329576923690768489)
 
 /* Viewport mapping */
-static GLfloat gl_viewport_scale[3], gl_viewport_offset[3];
+//static GLfloat gl_viewport_scale[3], gl_viewport_offset[3];
 
 /* Depth range */
 GLfloat DEPTH_RANGE_MULTIPLIER_L = (1 - 0) / 2;
@@ -78,8 +78,8 @@ void _glInitMatrices() {
     stack_push(&MATRIX_STACKS[1], IDENTITY);
     stack_push(&MATRIX_STACKS[2], IDENTITY);
 
-    memcpy(NORMAL_MATRIX, IDENTITY, sizeof(Matrix4x4));
-    memcpy(SCREENVIEW_MATRIX, IDENTITY, sizeof(Matrix4x4));
+    sq_cpy(NORMAL_MATRIX, IDENTITY, sizeof(Matrix4x4));
+    sq_cpy(SCREENVIEW_MATRIX, IDENTITY, sizeof(Matrix4x4));
 
     glDepthRange(0.0f, 1.0f);
     glViewport(0, 0, vid_mode->width, vid_mode->height);
@@ -123,7 +123,7 @@ static void transpose(GLfloat* m) {
 }
 
 static void recalculateNormalMatrix() {
-    memcpy(NORMAL_MATRIX, stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)), sizeof(Matrix4x4));
+    sq_cpy(NORMAL_MATRIX, stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)), sizeof(Matrix4x4));
     inverse((GLfloat*) NORMAL_MATRIX);
     transpose((GLfloat*) NORMAL_MATRIX);
 }
diff --git a/GL/private.h b/GL/private.h
index 4cd2acb..6987723 100644
--- a/GL/private.h
+++ b/GL/private.h
@@ -7,6 +7,8 @@
 #include "../containers/aligned_vector.h"
 #include "../containers/named_array.h"
 
+extern void* memcpy4 (void *dest, const void *src, size_t count);
+
 #define TRACE_ENABLED 0
 #define TRACE() if(TRACE_ENABLED) {fprintf(stderr, "%s\n", __func__);}
 
@@ -158,64 +160,14 @@ typedef struct {
     float w;
 } Vertex;
 
-/* FIXME: SH4 has a swap.w instruction, we should leverage it here! */
-#define _SWAP32(x, y) \
+#define swapVertex(a, b) /* swap the Vertex structs that pointers a and b refer to */ \
 do { \
-    uint32_t t = *((uint32_t*) &x); \
-    *((uint32_t*) &x) = *((uint32_t*) &y); \
-    *((uint32_t*) &y) = t; \
+    Vertex temp __attribute__((aligned(32))); /* scratch copy of *b */ \
+    memcpy4(&temp, (b), sizeof(Vertex)); \
+    memcpy4((b), (a), sizeof(Vertex)); \
+    memcpy4((a), &temp, sizeof(Vertex)); /* completes the swap: *a receives saved *b */ \
 } while(0)
 
-/*
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \
-    *((uint32_t*) &y) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); */
-
-
-#define swapVertex(a, b)   \
-do {                 \
-    _SWAP32(a->flags, b->flags); \
-    _SWAP32(a->xyz[0], b->xyz[0]); \
-    _SWAP32(a->xyz[1], b->xyz[1]); \
-    _SWAP32(a->xyz[2], b->xyz[2]); \
-    _SWAP32(a->uv[0], b->uv[0]); \
-    _SWAP32(a->uv[1], b->uv[1]); \
-    _SWAP32(a->bgra, b->bgra); \
-    _SWAP32(a->w, b->w); \
-} while(0)
-
-#if 0
-/* FIXME: SH4 has a swap.w instruction, we should leverage it here! */
-inline void _SWAP32( void* x, void* y) 
-{ 
-    #pragma GCC diagnostic push 
-    #pragma GCC diagnostic ignored "-Wstrict-aliasing"
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); 
-    *((uint32_t*) &y) = *((uint32_t*) &x) ^ *((uint32_t*) &y); 
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); 
-    #pragma GCC diagnostic pop 
-}
-
-/*
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \
-    *((uint32_t*) &y) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); */
-
- 
-
-#define swapVertex(a, b)   \
-{      \
-    _SWAP32(&a->flags,  &b->flags); \
-    _SWAP32(&a->xyz[0], &b->xyz[0]); \
-    _SWAP32(&a->xyz[1], &b->xyz[1]); \
-    _SWAP32(&a->xyz[2], &b->xyz[2]); \
-    _SWAP32(&a->uv[0],  &b->uv[0]); \
-    _SWAP32(&a->uv[1],  &b->uv[1]); \
-    _SWAP32(&a->bgra,   &b->bgra); \
-    _SWAP32(&a->w,      &b->w); \
-}
-#endif
-
 /* ClipVertex doesn't have room for these, so we need to parse them
  * out separately. Potentially 'w' will be housed here if we support oargb */
 typedef struct {
diff --git a/GL/texture.c b/GL/texture.c
index 31eb852..a984464 100644
--- a/GL/texture.c
+++ b/GL/texture.c
@@ -917,12 +917,12 @@ void _glAllocateSpaceForMipmaps(TextureObject* active) {
      * then free the original
     */
     GLubyte* temp = malloc(size);
-    memcpy(temp, active->data, size);
+    memcpy4(temp, active->data, size);
     pvr_mem_free(active->data);
     active->data = pvr_mem_malloc(_glGetMipmapDataSize(active));
 
     /* If there was existing data, then copy it where it should go */
-    memcpy(_glGetMipmapLocation(active,0), temp, size);
+    memcpy4(_glGetMipmapLocation(active,0), temp, size);
     free(temp);
 }
 
@@ -1122,11 +1122,7 @@ void APIENTRY glTexImage2D(GLenum target, GLint level, GLint internalFormat,
             }
         } else {
             /* No conversion? Just copy the data, and the pvr_format is correct */
-            if(bytes % 32 == 0){
-                sq_cpy(targetData, data, bytes);
-            } else {
-                memcpy(targetData, data, bytes);
-            }
+            FASTCPY(targetData, data, bytes);
         }
 
         return;
diff --git a/Makefile b/Makefile
index 61b2478..b24acae 100644
--- a/Makefile
+++ b/Makefile
@@ -13,7 +13,7 @@ OBJS += containers/stack.o containers/named_array.o containers/aligned_vector.o
 
 SUBDIRS =
 
-EXTRA_CFLAGS=  -Wall -Wextra -Wstrict-aliasing=0
+EXTRA_CFLAGS=  -Wall -Wextra 
 KOS_CFLAGS += -ffast-math -O2 -funroll-loops -fsingle-precision-constant -Iinclude -funsafe-math-optimizations -DBUILD_LIBGL $(EXTRA_CFLAGS)
 #KOS_CFLAGS += -O1 -mlra -Iinclude -DBUILD_LIBGL -Wall -Wextra
 #GCC5_FLAGS = -mfsca -mfsrra -mlra
diff --git a/containers/aligned_vector.c b/containers/aligned_vector.c
index a949310..708b187 100644
--- a/containers/aligned_vector.c
+++ b/containers/aligned_vector.c
@@ -54,7 +54,10 @@ void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) {
     assert(vector->data);
 
     if(original_data) {
-        memcpy(vector->data, original_data, original_byte_size);
+        if(vector->element_size == 32){
+            sq_cpy(vector->data, original_data, original_byte_size);
+        } else
+            memcpy4(vector->data, original_data, original_byte_size);
         free(original_data);
     }
 
@@ -74,7 +77,10 @@ void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned
     unsigned char* dest = vector->data + (vector->element_size * initial_size);
 
     /* Copy the objects in */
-    memcpy(dest, objs, vector->element_size * count);
+    if(vector->element_size == 32){
+        sq_cpy(dest, objs, vector->element_size * count);
+    } else
+        memcpy4(dest, objs, vector->element_size * count);
 
     return dest;
 }
@@ -139,7 +145,10 @@ void aligned_vector_shrink_to_fit(AlignedVector* vector) {
         vector->data = (unsigned char*) memalign(0x20, new_byte_size);
 
         if(original_data) {
-            memcpy(vector->data, original_data, new_byte_size);
+            if(vector->element_size == 32){
+                sq_cpy(vector->data, original_data, new_byte_size);
+            } else
+                memcpy4(vector->data, original_data, new_byte_size);
             free(original_data);
         }
 

From 74516601ee9f2f73f82ba8b7968938aedcef917d Mon Sep 17 00:00:00 2001
From: Hayden Kowalchuk <hayden@hkowsoftware.com>
Date: Wed, 20 Nov 2019 09:13:53 -0500
Subject: [PATCH 2/2] fix: change modulate to support alpha too

---
 GL/texture.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GL/texture.c b/GL/texture.c
index a984464..7160e94 100644
--- a/GL/texture.c
+++ b/GL/texture.c
@@ -358,7 +358,7 @@ static void _glInitializeTextureObject(TextureObject* txr, unsigned int id) {
     txr->width = txr->height = 0;
     txr->mipmap = 0;
     txr->uv_clamp = 0;
-    txr->env = PVR_TXRENV_MODULATE;
+    txr->env = PVR_TXRENV_MODULATEALPHA;
     txr->data = NULL;
     txr->mipmapCount = 0;
     txr->minFilter = GL_NEAREST;
@@ -474,7 +474,7 @@ void APIENTRY glTexEnvi(GLenum target, GLenum pname, GLint param) {
 
             switch(param) {
                 case GL_MODULATE:
-                    active->env = PVR_TXRENV_MODULATE;
+                    active->env = PVR_TXRENV_MODULATEALPHA;
                 break;
                 case GL_DECAL:
                     active->env = PVR_TXRENV_DECAL;