Merge branch 'dma_exploration' into 'master'

Fixes rollup. See merge request HaydenKow/GLdc!3
2019-11-20 14:16:47 +00:00 · 2019-11-20 14:16:47 +00:00 · 082c381667
commit 082c381667
parent 7aabea010d 74516601ee
7 changed files with 56 additions and 99 deletions
--- a/GL/clip.c
+++ b/GL/clip.c
@ -91,9 +91,6 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission
    const Vertex* vertices = triangle->vertex;
    const VertexExtra* extras = triangle->extra;

-    /* Used when flat shading is enabled */
-    uint32_t finalColour = *((uint32_t*) vertices[2].bgra);
-
    for(i = 0; i < 4; ++i) {
        uint8_t thisIndex = (i == 3) ? 0 : i;

@ -123,7 +120,7 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission
                interpolateVec2(ve1->st, ve2->st, t, veNext.st);

                if(flatShade) {
-                    *((uint32_t*) next.bgra) = finalColour;
+                    *((uint32_t*) next.bgra) = *((uint32_t*) vertices[2].bgra);
                } else {
                    interpolateColour(v1->bgra, v2->bgra, t, next.bgra);
                }
--- a/GL/draw.c
+++ b/GL/draw.c
@ -798,31 +798,34 @@ static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei
        
        sq_cpy(start, VERTEX_POINTER.ptr - ( sizeof(unsigned int) * 1 ), count * sizeof(Vertex) );

-        ITERATE(count) {
-            it->flags = PVR_CMD_VERTEX;
-            ++it;
-        }
+        if(start->flags == 0){

-        profiler_checkpoint("flags");
+            ITERATE(count) {
+                it->flags = PVR_CMD_VERTEX;
+                ++it;
+            }

-        // Drawing arrays
-        switch(mode) {
-        case GL_TRIANGLES:
-            genTriangles(start, count);
-            break;
-        case GL_QUADS:
-            genQuads(start, count);
-            break;
-        case GL_TRIANGLE_FAN:
-            genTriangleFan(start, count);
-            break;
-        case GL_TRIANGLE_STRIP:
-            genTriangleStrip(_glSubmissionTargetStart(target), count);
-            break;
-        default:
-            printf("mode: 0x%08x\n", mode);
-            fflush(stdout);
-            assert(0 && "Not Implemented");
+            profiler_checkpoint("flags");
+
+            // Drawing arrays
+            switch(mode) {
+            case GL_TRIANGLES:
+                genTriangles(start, count);
+                break;
+            case GL_QUADS:
+                genQuads(start, count);
+                break;
+            case GL_TRIANGLE_FAN:
+                genTriangleFan(start, count);
+                break;
+            case GL_TRIANGLE_STRIP:
+                genTriangleStrip(_glSubmissionTargetStart(target), count);
+                break;
+            default:
+                printf("mode: 0x%08x\n", mode);
+                fflush(stdout);
+                assert(0 && "Not Implemented");
+            }
        }

        profiler_checkpoint("quads");
--- a/GL/matrix.c
+++ b/GL/matrix.c
@ -13,7 +13,7 @@
 #define DEG2RAD (0.01745329251994329576923690768489)

 /* Viewport mapping */
-static GLfloat gl_viewport_scale[3], gl_viewport_offset[3];
+//static GLfloat gl_viewport_scale[3], gl_viewport_offset[3];

 /* Depth range */
 GLfloat DEPTH_RANGE_MULTIPLIER_L = (1 - 0) / 2;
@ -78,8 +78,8 @@ void _glInitMatrices() {
    stack_push(&MATRIX_STACKS[1], IDENTITY);
    stack_push(&MATRIX_STACKS[2], IDENTITY);

-    memcpy(NORMAL_MATRIX, IDENTITY, sizeof(Matrix4x4));
-    memcpy(SCREENVIEW_MATRIX, IDENTITY, sizeof(Matrix4x4));
+    sq_cpy(NORMAL_MATRIX, IDENTITY, sizeof(Matrix4x4));
+    sq_cpy(SCREENVIEW_MATRIX, IDENTITY, sizeof(Matrix4x4));

    glDepthRange(0.0f, 1.0f);
    glViewport(0, 0, vid_mode->width, vid_mode->height);
@ -123,7 +123,7 @@ static void transpose(GLfloat* m) {
 }

 static void recalculateNormalMatrix() {
-    memcpy(NORMAL_MATRIX, stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)), sizeof(Matrix4x4));
+    sq_cpy(NORMAL_MATRIX, stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)), sizeof(Matrix4x4)); /* Seed NORMAL_MATRIX from the top of the modelview stack; it is then inverted and transposed in place below. NOTE(review): sq_cpy replaces memcpy here - confirm that any alignment/size preconditions of sq_cpy hold for both operands. */
    inverse((GLfloat*) NORMAL_MATRIX);
    transpose((GLfloat*) NORMAL_MATRIX);
 }
--- a/GL/private.h
+++ b/GL/private.h
@ -7,6 +7,8 @@
 #include "../containers/aligned_vector.h"
 #include "../containers/named_array.h"

+extern void* memcpy4 (void *dest, const void *src, size_t count);
+
 #define TRACE_ENABLED 0
 #define TRACE() if(TRACE_ENABLED) {fprintf(stderr, "%s\n", __func__);}

@ -158,64 +160,14 @@ typedef struct {
    float w;
 } Vertex;

-/* FIXME: SH4 has a swap.w instruction, we should leverage it here! */
-#define _SWAP32(x, y) \
+#define swapVertex(a, b)   /* swap the two Vertex objects pointed to by a and b (both are Vertex*, as in the previous field-wise macro) */ \
 do { \
-    uint32_t t = *((uint32_t*) &x); \
-    *((uint32_t*) &x) = *((uint32_t*) &y); \
-    *((uint32_t*) &y) = t; \
+    Vertex temp __attribute__((aligned(32))); /* scratch copy of *b */ \
+     memcpy4(&temp, (b), sizeof(Vertex)); /* temp = *b */ \
+     memcpy4((b), (a), sizeof(Vertex));   /* *b = *a */ \
+     memcpy4((a), &temp, sizeof(Vertex)); /* *a = saved *b; the earlier version wrote back into b, so nothing was swapped */ \
 } while(0)

-/*
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \
-    *((uint32_t*) &y) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); */
-
-
-#define swapVertex(a, b)   \
-do {                 \
-    _SWAP32(a->flags, b->flags); \
-    _SWAP32(a->xyz[0], b->xyz[0]); \
-    _SWAP32(a->xyz[1], b->xyz[1]); \
-    _SWAP32(a->xyz[2], b->xyz[2]); \
-    _SWAP32(a->uv[0], b->uv[0]); \
-    _SWAP32(a->uv[1], b->uv[1]); \
-    _SWAP32(a->bgra, b->bgra); \
-    _SWAP32(a->w, b->w); \
-} while(0)
-
-#if 0
-/* FIXME: SH4 has a swap.w instruction, we should leverage it here! */
-inline void _SWAP32( void* x, void* y) 
-{ 
-    #pragma GCC diagnostic push 
-    #pragma GCC diagnostic ignored "-Wstrict-aliasing"
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); 
-    *((uint32_t*) &y) = *((uint32_t*) &x) ^ *((uint32_t*) &y); 
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); 
-    #pragma GCC diagnostic pop 
-}
-
-/*
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \
-    *((uint32_t*) &y) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \
-    *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); */
-
- 
-
-#define swapVertex(a, b)   \
-{      \
-    _SWAP32(&a->flags,  &b->flags); \
-    _SWAP32(&a->xyz[0], &b->xyz[0]); \
-    _SWAP32(&a->xyz[1], &b->xyz[1]); \
-    _SWAP32(&a->xyz[2], &b->xyz[2]); \
-    _SWAP32(&a->uv[0],  &b->uv[0]); \
-    _SWAP32(&a->uv[1],  &b->uv[1]); \
-    _SWAP32(&a->bgra,   &b->bgra); \
-    _SWAP32(&a->w,      &b->w); \
-}
-#endif
-
 /* ClipVertex doesn't have room for these, so we need to parse them
 * out separately. Potentially 'w' will be housed here if we support oargb */
 typedef struct {
--- a/GL/texture.c
+++ b/GL/texture.c
@ -358,7 +358,7 @@ static void _glInitializeTextureObject(TextureObject* txr, unsigned int id) {
    txr->width = txr->height = 0;
    txr->mipmap = 0;
    txr->uv_clamp = 0;
-    txr->env = PVR_TXRENV_MODULATE;
+    txr->env = PVR_TXRENV_MODULATEALPHA;
    txr->data = NULL;
    txr->mipmapCount = 0;
    txr->minFilter = GL_NEAREST;
@ -474,7 +474,7 @@ void APIENTRY glTexEnvi(GLenum target, GLenum pname, GLint param) {

            switch(param) {
                case GL_MODULATE:
-                    active->env = PVR_TXRENV_MODULATE;
+                    active->env = PVR_TXRENV_MODULATEALPHA;
                break;
                case GL_DECAL:
                    active->env = PVR_TXRENV_DECAL;
@ -917,12 +917,12 @@ void _glAllocateSpaceForMipmaps(TextureObject* active) {
     * then free the original
    */
    GLubyte* temp = malloc(size);
-    memcpy(temp, active->data, size);
+    memcpy4(temp, active->data, size);
    pvr_mem_free(active->data);
    active->data = pvr_mem_malloc(_glGetMipmapDataSize(active));

    /* If there was existing data, then copy it where it should go */
-    memcpy(_glGetMipmapLocation(active,0), temp, size);
+    memcpy4(_glGetMipmapLocation(active,0), temp, size);
    free(temp);
 }

@ -1122,11 +1122,7 @@ void APIENTRY glTexImage2D(GLenum target, GLint level, GLint internalFormat,
            }
        } else {
            /* No conversion? Just copy the data, and the pvr_format is correct */
-            if(bytes % 32 == 0){
-                sq_cpy(targetData, data, bytes);
-            } else {
-                memcpy(targetData, data, bytes);
-            }
+            FASTCPY(targetData, data, bytes);
        }

        return;
--- a/2
+++ b/2
@ -13,7 +13,7 @@ OBJS += containers/stack.o containers/named_array.o containers/aligned_vector.o

 SUBDIRS =

-EXTRA_CFLAGS=  -Wall -Wextra -Wstrict-aliasing=0
+EXTRA_CFLAGS=  -Wall -Wextra 
 KOS_CFLAGS += -ffast-math -O2 -funroll-loops -fsingle-precision-constant -Iinclude -funsafe-math-optimizations -DBUILD_LIBGL $(EXTRA_CFLAGS)
 #KOS_CFLAGS += -O1 -mlra -Iinclude -DBUILD_LIBGL -Wall -Wextra
 #GCC5_FLAGS = -mfsca -mfsrra -mlra
--- a/containers/aligned_vector.c
+++ b/containers/aligned_vector.c
@ -54,7 +54,10 @@ void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) {
    assert(vector->data);

    if(original_data) {
-        memcpy(vector->data, original_data, original_byte_size);
+        if(vector->element_size == 32){
+            sq_cpy(vector->data, original_data, original_byte_size);
+        } else
+            memcpy4(vector->data, original_data, original_byte_size);
        free(original_data);
    }

@ -74,7 +77,10 @@ void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned
    unsigned char* dest = vector->data + (vector->element_size * initial_size);

    /* Copy the objects in */
-    memcpy(dest, objs, vector->element_size * count);
+    if(vector->element_size == 32){
+        sq_cpy(dest, objs, vector->element_size * count);
+    } else
+        memcpy4(dest, objs, vector->element_size * count);

    return dest;
 }
@ -139,7 +145,10 @@ void aligned_vector_shrink_to_fit(AlignedVector* vector) {
        vector->data = (unsigned char*) memalign(0x20, new_byte_size);

        if(original_data) {
-            memcpy(vector->data, original_data, new_byte_size);
+            if(vector->element_size == 32){
+                sq_cpy(vector->data, original_data, new_byte_size);
+            } else
+                memcpy4(vector->data, original_data, new_byte_size);
            free(original_data);
        }