From c222c23ae3b4ee78a88f6d69aa0724d9573a313e Mon Sep 17 00:00:00 2001 From: Hayden Kowalchuk Date: Fri, 4 Oct 2019 21:32:54 -0400 Subject: [PATCH 1/2] fixes: rollup of minor stuff --- GL/clip.c | 5 +-- GL/draw.c | 49 +++++++++++++++-------------- GL/matrix.c | 8 ++--- GL/private.h | 62 +++++-------------------------------- GL/texture.c | 10 ++---- Makefile | 2 +- containers/aligned_vector.c | 15 +++++++-- 7 files changed, 54 insertions(+), 97 deletions(-) diff --git a/GL/clip.c b/GL/clip.c index c140c01..e4a8307 100644 --- a/GL/clip.c +++ b/GL/clip.c @@ -91,9 +91,6 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission const Vertex* vertices = triangle->vertex; const VertexExtra* extras = triangle->extra; - /* Used when flat shading is enabled */ - uint32_t finalColour = *((uint32_t*) vertices[2].bgra); - for(i = 0; i < 4; ++i) { uint8_t thisIndex = (i == 3) ? 0 : i; @@ -123,7 +120,7 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission interpolateVec2(ve1->st, ve2->st, t, veNext.st); if(flatShade) { - *((uint32_t*) next.bgra) = finalColour; + *((uint32_t*) next.bgra) = *((uint32_t*) vertices[2].bgra); } else { interpolateColour(v1->bgra, v2->bgra, t, next.bgra); } diff --git a/GL/draw.c b/GL/draw.c index cfba2f8..68331f4 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -798,31 +798,34 @@ static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei sq_cpy(start, VERTEX_POINTER.ptr - ( sizeof(unsigned int) * 1 ), count * sizeof(Vertex) ); - ITERATE(count) { - it->flags = PVR_CMD_VERTEX; - ++it; - } + if(start->flags == 0){ - profiler_checkpoint("flags"); + ITERATE(count) { + it->flags = PVR_CMD_VERTEX; + ++it; + } - // Drawing arrays - switch(mode) { - case GL_TRIANGLES: - genTriangles(start, count); - break; - case GL_QUADS: - genQuads(start, count); - break; - case GL_TRIANGLE_FAN: - genTriangleFan(start, count); - break; - case GL_TRIANGLE_STRIP: - 
genTriangleStrip(_glSubmissionTargetStart(target), count); - break; - default: - printf("mode: 0x%08x\n", mode); - fflush(stdout); - assert(0 && "Not Implemented"); + profiler_checkpoint("flags"); + + // Drawing arrays + switch(mode) { + case GL_TRIANGLES: + genTriangles(start, count); + break; + case GL_QUADS: + genQuads(start, count); + break; + case GL_TRIANGLE_FAN: + genTriangleFan(start, count); + break; + case GL_TRIANGLE_STRIP: + genTriangleStrip(_glSubmissionTargetStart(target), count); + break; + default: + printf("mode: 0x%08x\n", mode); + fflush(stdout); + assert(0 && "Not Implemented"); + } } profiler_checkpoint("quads"); diff --git a/GL/matrix.c b/GL/matrix.c index 28b8177..75b1e56 100644 --- a/GL/matrix.c +++ b/GL/matrix.c @@ -13,7 +13,7 @@ #define DEG2RAD (0.01745329251994329576923690768489) /* Viewport mapping */ -static GLfloat gl_viewport_scale[3], gl_viewport_offset[3]; +//static GLfloat gl_viewport_scale[3], gl_viewport_offset[3]; /* Depth range */ GLfloat DEPTH_RANGE_MULTIPLIER_L = (1 - 0) / 2; @@ -78,8 +78,8 @@ void _glInitMatrices() { stack_push(&MATRIX_STACKS[1], IDENTITY); stack_push(&MATRIX_STACKS[2], IDENTITY); - memcpy(NORMAL_MATRIX, IDENTITY, sizeof(Matrix4x4)); - memcpy(SCREENVIEW_MATRIX, IDENTITY, sizeof(Matrix4x4)); + sq_cpy(NORMAL_MATRIX, IDENTITY, sizeof(Matrix4x4)); + sq_cpy(SCREENVIEW_MATRIX, IDENTITY, sizeof(Matrix4x4)); glDepthRange(0.0f, 1.0f); glViewport(0, 0, vid_mode->width, vid_mode->height); @@ -123,7 +123,7 @@ static void transpose(GLfloat* m) { } static void recalculateNormalMatrix() { - memcpy(NORMAL_MATRIX, stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)), sizeof(Matrix4x4)); + sq_cpy(NORMAL_MATRIX, stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)), sizeof(Matrix4x4)); inverse((GLfloat*) NORMAL_MATRIX); transpose((GLfloat*) NORMAL_MATRIX); } diff --git a/GL/private.h b/GL/private.h index 4cd2acb..6987723 100644 --- a/GL/private.h +++ b/GL/private.h @@ -7,6 +7,8 @@ #include "../containers/aligned_vector.h" #include 
"../containers/named_array.h" +extern void* memcpy4 (void *dest, const void *src, size_t count); + #define TRACE_ENABLED 0 #define TRACE() if(TRACE_ENABLED) {fprintf(stderr, "%s\n", __func__);} @@ -158,64 +160,14 @@ typedef struct { float w; } Vertex; -/* FIXME: SH4 has a swap.w instruction, we should leverage it here! */ -#define _SWAP32(x, y) \ +#define swapVertex(a, b) \ do { \ - uint32_t t = *((uint32_t*) &x); \ - *((uint32_t*) &x) = *((uint32_t*) &y); \ - *((uint32_t*) &y) = t; \ + Vertex temp __attribute__((aligned(32))); \ + memcpy4(&temp, (a), sizeof(Vertex)); \ + memcpy4((a), (b), sizeof(Vertex)); \ + memcpy4((b), &temp, sizeof(Vertex)); \ } while(0) -/* - *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \ - *((uint32_t*) &y) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \ - *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); */ - - -#define swapVertex(a, b) \ -do { \ - _SWAP32(a->flags, b->flags); \ - _SWAP32(a->xyz[0], b->xyz[0]); \ - _SWAP32(a->xyz[1], b->xyz[1]); \ - _SWAP32(a->xyz[2], b->xyz[2]); \ - _SWAP32(a->uv[0], b->uv[0]); \ - _SWAP32(a->uv[1], b->uv[1]); \ - _SWAP32(a->bgra, b->bgra); \ - _SWAP32(a->w, b->w); \ -} while(0) - -#if 0 -/* FIXME: SH4 has a swap.w instruction, we should leverage it here! 
*/ -inline void _SWAP32( void* x, void* y) -{ - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wstrict-aliasing" - *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); - *((uint32_t*) &y) = *((uint32_t*) &x) ^ *((uint32_t*) &y); - *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); - #pragma GCC diagnostic pop -} - -/* - *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \ - *((uint32_t*) &y) = *((uint32_t*) &x) ^ *((uint32_t*) &y); \ - *((uint32_t*) &x) = *((uint32_t*) &x) ^ *((uint32_t*) &y); */ - - - -#define swapVertex(a, b) \ -{ \ - _SWAP32(&a->flags, &b->flags); \ - _SWAP32(&a->xyz[0], &b->xyz[0]); \ - _SWAP32(&a->xyz[1], &b->xyz[1]); \ - _SWAP32(&a->xyz[2], &b->xyz[2]); \ - _SWAP32(&a->uv[0], &b->uv[0]); \ - _SWAP32(&a->uv[1], &b->uv[1]); \ - _SWAP32(&a->bgra, &b->bgra); \ - _SWAP32(&a->w, &b->w); \ -} -#endif - /* ClipVertex doesn't have room for these, so we need to parse them * out separately. Potentially 'w' will be housed here if we support oargb */ typedef struct { diff --git a/GL/texture.c b/GL/texture.c index 31eb852..a984464 100644 --- a/GL/texture.c +++ b/GL/texture.c @@ -917,12 +917,12 @@ void _glAllocateSpaceForMipmaps(TextureObject* active) { * then free the original */ GLubyte* temp = malloc(size); - memcpy(temp, active->data, size); + memcpy4(temp, active->data, size); pvr_mem_free(active->data); active->data = pvr_mem_malloc(_glGetMipmapDataSize(active)); /* If there was existing data, then copy it where it should go */ - memcpy(_glGetMipmapLocation(active,0), temp, size); + memcpy4(_glGetMipmapLocation(active,0), temp, size); free(temp); } @@ -1122,11 +1122,7 @@ void APIENTRY glTexImage2D(GLenum target, GLint level, GLint internalFormat, } } else { /* No conversion? 
Just copy the data, and the pvr_format is correct */ - if(bytes % 32 == 0){ - sq_cpy(targetData, data, bytes); - } else { - memcpy(targetData, data, bytes); - } + FASTCPY(targetData, data, bytes); } return; diff --git a/Makefile b/Makefile index 61b2478..b24acae 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ OBJS += containers/stack.o containers/named_array.o containers/aligned_vector.o SUBDIRS = -EXTRA_CFLAGS= -Wall -Wextra -Wstrict-aliasing=0 +EXTRA_CFLAGS= -Wall -Wextra KOS_CFLAGS += -ffast-math -O2 -funroll-loops -fsingle-precision-constant -Iinclude -funsafe-math-optimizations -DBUILD_LIBGL $(EXTRA_CFLAGS) #KOS_CFLAGS += -O1 -mlra -Iinclude -DBUILD_LIBGL -Wall -Wextra #GCC5_FLAGS = -mfsca -mfsrra -mlra diff --git a/containers/aligned_vector.c b/containers/aligned_vector.c index a949310..708b187 100644 --- a/containers/aligned_vector.c +++ b/containers/aligned_vector.c @@ -54,7 +54,10 @@ void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) { assert(vector->data); if(original_data) { - memcpy(vector->data, original_data, original_byte_size); + if(vector->element_size == 32){ + sq_cpy(vector->data, original_data, original_byte_size); + } else + memcpy4(vector->data, original_data, original_byte_size); free(original_data); } @@ -74,7 +77,10 @@ void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned unsigned char* dest = vector->data + (vector->element_size * initial_size); /* Copy the objects in */ - memcpy(dest, objs, vector->element_size * count); + if(vector->element_size == 32){ + sq_cpy(dest, objs, vector->element_size * count); + } else + memcpy4(dest, objs, vector->element_size * count); return dest; } @@ -139,7 +145,10 @@ void aligned_vector_shrink_to_fit(AlignedVector* vector) { vector->data = (unsigned char*) memalign(0x20, new_byte_size); if(original_data) { - memcpy(vector->data, original_data, new_byte_size); + if(vector->element_size == 32){ + sq_cpy(vector->data, original_data, 
new_byte_size); + } else + memcpy4(vector->data, original_data, new_byte_size); free(original_data); } From 74516601ee9f2f73f82ba8b7968938aedcef917d Mon Sep 17 00:00:00 2001 From: Hayden Kowalchuk Date: Wed, 20 Nov 2019 09:13:53 -0500 Subject: [PATCH 2/2] fix: change modulate to support alpha too --- GL/texture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GL/texture.c b/GL/texture.c index a984464..7160e94 100644 --- a/GL/texture.c +++ b/GL/texture.c @@ -358,7 +358,7 @@ static void _glInitializeTextureObject(TextureObject* txr, unsigned int id) { txr->width = txr->height = 0; txr->mipmap = 0; txr->uv_clamp = 0; - txr->env = PVR_TXRENV_MODULATE; + txr->env = PVR_TXRENV_MODULATEALPHA; txr->data = NULL; txr->mipmapCount = 0; txr->minFilter = GL_NEAREST; @@ -474,7 +474,7 @@ void APIENTRY glTexEnvi(GLenum target, GLenum pname, GLint param) { switch(param) { case GL_MODULATE: - active->env = PVR_TXRENV_MODULATE; + active->env = PVR_TXRENV_MODULATEALPHA; break; case GL_DECAL: active->env = PVR_TXRENV_DECAL;