From 36fe13095c98385a4861eb3d67386a26c85e200b Mon Sep 17 00:00:00 2001 From: Luke Benstead Date: Sun, 18 Oct 2020 20:39:34 +0100 Subject: [PATCH] Use fat 64 byte vertices to improve data locality --- GL/clip.c | 58 ++++++++++++---------------------------------------- GL/draw.c | 49 +++++++++++++++----------------------------- GL/flush.c | 5 ++++- GL/private.h | 27 ++++++++++++++---------- 4 files changed, 49 insertions(+), 90 deletions(-) diff --git a/GL/clip.c b/GL/clip.c index 259cac1..2b18e5a 100644 --- a/GL/clip.c +++ b/GL/clip.c @@ -85,16 +85,12 @@ const uint32_t VERTEX_CMD = 0xe0000000; typedef struct { Vertex vertex[3]; - VertexExtra extra[3]; uint8_t visible; } Triangle; void _glClipTriangle(const Triangle* triangle, const uint8_t visible, SubmissionTarget* target, const uint8_t flatShade) { Vertex* last = NULL; - VertexExtra* veLast = NULL; - const Vertex* vertices = triangle->vertex; - const VertexExtra* extras = triangle->extra; char* bgra = (char*) vertices[2].bgra; @@ -102,25 +98,22 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission uint32_t finalColour = *((uint32_t*) bgra); Vertex tmp; - VertexExtra veTmp; - uint8_t pushedCount = 0; #define IS_VISIBLE(x) (visible & (1 << (2 - (x)))) > 0 -#define PUSH_VERT(vert, ve) \ +#define PUSH_VERT(vert) \ last = aligned_vector_push_back(&target->output->vector, vert, 1); \ last->flags = VERTEX_CMD; \ - veLast = aligned_vector_push_back(target->extras, ve, 1); \ ++pushedCount; -#define CLIP_TO_PLANE(vert1, ve1, vert2, ve2) \ +#define CLIP_TO_PLANE(vert1, vert2) \ do { \ float t = _glClipLineToNearZ((vert1), (vert2), &tmp); \ interpolateFloat((vert1)->w, (vert2)->w, t, &tmp.w); \ interpolateVec2((vert1)->uv, (vert2)->uv, t, tmp.uv); \ - interpolateVec3((ve1)->nxyz, (ve2)->nxyz, t, veTmp.nxyz); \ - interpolateVec2((ve1)->st, (ve2)->st, t, veTmp.st); \ + interpolateVec3((vert1)->nxyz, (vert2)->nxyz, t, tmp.nxyz); \ + interpolateVec2((vert1)->st, (vert2)->st, t, tmp.st); \ if(flatShade) { \ interpolateColour((const uint8_t*) &finalColour, (const uint8_t*) &finalColour, t, tmp.bgra); \ } else { interpolateColour((vert1)->bgra, (vert2)->bgra, t, tmp.bgra); } \ @@ -130,44 +123,38 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission uint8_t v1 = IS_VISIBLE(1); uint8_t v2 = IS_VISIBLE(2); if(v0) { - PUSH_VERT(&vertices[0], &extras[0]); + PUSH_VERT(&vertices[0]); } if(v0 != v1) { - CLIP_TO_PLANE(&vertices[0], &extras[0], &vertices[1], &extras[1]); - PUSH_VERT(&tmp, &veTmp); + CLIP_TO_PLANE(&vertices[0], &vertices[1]); + PUSH_VERT(&tmp); } if(v1) { - PUSH_VERT(&vertices[1], &extras[1]); + PUSH_VERT(&vertices[1]); } if(v1 != v2) { - CLIP_TO_PLANE(&vertices[1], &extras[1], &vertices[2], &extras[2]); - PUSH_VERT(&tmp, &veTmp); + CLIP_TO_PLANE(&vertices[1], &vertices[2]); + PUSH_VERT(&tmp); } if(v2) { - PUSH_VERT(&vertices[2], &extras[2]); + PUSH_VERT(&vertices[2]); } if(v2 != v0) { - CLIP_TO_PLANE(&vertices[2], &extras[2], &vertices[0], &extras[0]); - PUSH_VERT(&tmp, &veTmp); + CLIP_TO_PLANE(&vertices[2], &vertices[0]); + PUSH_VERT(&tmp); } if(pushedCount == 4) { Vertex* prev = last - 1; - VertexExtra* prevVe = veLast - 1; - tmp = *prev; - veTmp = *prevVe; *prev = *last; - *prevVe = *veLast; - *last = tmp; - *veLast = veTmp; prev->flags = VERTEX_CMD; last->flags = VERTEX_CMD_EOL; @@ -309,15 +296,6 @@ void _glClipTriangleStrip(SubmissionTarget* target, uint8_t fladeShade) { TO_CLIP[CLIP_COUNT].vertex[0] = *v1; TO_CLIP[CLIP_COUNT].vertex[1] = *v2; TO_CLIP[CLIP_COUNT].vertex[2] = *v3; - - VertexExtra* ve1 = (VertexExtra*) aligned_vector_at(target->extras, vi1); - VertexExtra* ve2 = (VertexExtra*) aligned_vector_at(target->extras, vi2); - VertexExtra* ve3 = (VertexExtra*) aligned_vector_at(target->extras, vi3); - - TO_CLIP[CLIP_COUNT].extra[0] = *ve1; - TO_CLIP[CLIP_COUNT].extra[1] = *ve2; - TO_CLIP[CLIP_COUNT].extra[2] = *ve3; - TO_CLIP[CLIP_COUNT].visible = visible; ++CLIP_COUNT; @@ -359,11 +337,6 @@ void _glClipTriangleStrip(SubmissionTarget* target, uint8_t fladeShade) { TO_CLIP[CLIP_COUNT].vertex[1] = *v2; TO_CLIP[CLIP_COUNT].vertex[2] = *v4; - VertexExtra* ve4 = (VertexExtra*) aligned_vector_at(target->extras, vi4); - TO_CLIP[CLIP_COUNT].extra[0] = *(VertexExtra*) aligned_vector_at(target->extras, vi3); - TO_CLIP[CLIP_COUNT].extra[1] = *(VertexExtra*) aligned_vector_at(target->extras, vi2); - TO_CLIP[CLIP_COUNT].extra[2] = *ve4; - visible = (_VERT_VISIBLE(v3) ? 4 : 0) | (_VERT_VISIBLE(v2) ? 2 : 0) | (_VERT_VISIBLE(v4) ? 1 : 0); @@ -385,11 +358,6 @@ void _glClipTriangleStrip(SubmissionTarget* target, uint8_t fladeShade) { swapVertex(v3, v4); v3->flags = VERTEX_CMD; v4->flags = VERTEX_CMD; - - /* Swap the extra data too */ - VertexExtra t = *ve4; - *ve3 = *ve4; - *ve4 = t; } } break; diff --git a/GL/draw.c b/GL/draw.c index fc99dee..04e2c3e 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -651,35 +651,35 @@ GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, Vertex* } } -GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, VertexExtra* extra) { +GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, Vertex* outpu) { const GLubyte ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); const void* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride)); ReadUVFunc func = calcReadUVFunc(); - GLubyte* out = (GLubyte*) extra[0].st; + GLubyte* out = (GLubyte*) outpu[0].st; ITERATE(count) { func(stptr, out); stptr += ststride; - out += sizeof(VertexExtra); + out += sizeof(Vertex); } } -GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, VertexExtra* extra) { +GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, Vertex* output) { const GLuint nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); const void* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride)); ReadNormalFunc func = calcReadNormalFunc(); - GLubyte* out = (GLubyte*) extra[0].nxyz; + GLubyte* out = (GLubyte*) output[0].nxyz; ITERATE(count) { func(nptr, out); nptr += nstride; - out += sizeof(VertexExtra); + out += sizeof(Vertex); } if(_glIsNormalizeEnabled()) { - GLubyte* ptr = (GLubyte*) extra->nxyz; + GLubyte* ptr = (GLubyte*) output->nxyz; ITERATE(count) { GLfloat* n = (GLfloat*) ptr; float temp = n[0] * n[0]; @@ -691,7 +691,7 @@ GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, Ver n[1] *= ilength; n[2] *= ilength; - ptr += sizeof(VertexExtra); + ptr += sizeof(Vertex); } } } @@ -724,7 +724,6 @@ static void generateElements( GLubyte* nxyz; Vertex* output = _glSubmissionTargetStart(target); - VertexExtra* ve = aligned_vector_at(target->extras, 0); uint32_t i = first; uint32_t idx = 0; @@ -762,12 +761,11 @@ static void generateElements( pos_func(xyz, (GLubyte*) output->xyz); uv_func(uv, (GLubyte*) output->uv); diffuse_func(bgra, output->bgra); - st_func(st, (GLubyte*) ve->st); - normal_func(nxyz, (GLubyte*) ve->nxyz); + st_func(st, (GLubyte*) output->st); + normal_func(nxyz, (GLubyte*) output->nxyz); output->flags = PVR_CMD_VERTEX; ++output; - ++ve; } } @@ -804,10 +802,8 @@ static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei } } - VertexExtra* ve = aligned_vector_at(target->extras, 0); - - _readNormalData(first, count, ve); - _readSTData(first, count, ve); + _readNormalData(first, count, start); + _readSTData(first, count, start); } else { generateElements( @@ -917,14 +913,13 @@ static void light(SubmissionTarget* target) { /* Perform lighting calculations and manipulate the colour */ Vertex* vertex = _glSubmissionTargetStart(target); - VertexExtra* extra = aligned_vector_at(target->extras, 0); EyeSpaceData* eye_space = (EyeSpaceData*) eye_space_data->data; _glMatrixLoadModelView(); mat_transform3(vertex->xyz, eye_space->xyz, target->count, sizeof(Vertex), sizeof(EyeSpaceData)); _glMatrixLoadNormal(); - mat_transform_normal3(extra->nxyz, eye_space->n, target->count, sizeof(VertexExtra), sizeof(EyeSpaceData)); + mat_transform_normal3(vertex->nxyz, eye_space->n, target->count, sizeof(Vertex), sizeof(EyeSpaceData)); EyeSpaceData* ES = aligned_vector_at(eye_space_data, 0); _glPerformLighting(vertex, ES, target->count); @@ -1005,18 +1000,13 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL } static SubmissionTarget* target = NULL; - static AlignedVector extras; - /* Initialization of the target and extras */ + /* Initialization of the target */ if(!target) { target = (SubmissionTarget*) malloc(sizeof(SubmissionTarget)); - target->extras = NULL; target->count = 0; target->output = NULL; target->header_offset = target->start_offset = 0; - - aligned_vector_init(&extras, sizeof(VertexExtra)); - target->extras = &extras; } GLboolean doMultitexture, doTexture, doLighting; @@ -1057,9 +1047,6 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL assert(target->count); - /* Make sure we have enough room for all the "extra" data */ - aligned_vector_resize(&extras, target->count); - /* Make room for the vertices and header */ aligned_vector_extend(&target->output->vector, target->count + 1); generate(target, mode, first, count, (GLubyte*) indices, type); @@ -1087,8 +1074,6 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL clip(target); - assert(extras.size == target->count); - #if DEBUG_CLIPPING fprintf(stderr, "--------\n"); for(i = 0; i < target->count; ++i) { @@ -1139,12 +1124,10 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL PVRHeader* mtHeader = (PVRHeader*) vertex++; /* Replace the UV coordinates with the ST ones */ - VertexExtra* ve = aligned_vector_at(target->extras, 0); ITERATE(target->count) { - vertex->uv[0] = ve->st[0]; - vertex->uv[1] = ve->st[1]; + vertex->uv[0] = vertex->st[0]; + vertex->uv[1] = vertex->st[1]; ++vertex; - ++ve; } /* Send the buffer again to the transparent list */ diff --git a/GL/flush.c b/GL/flush.c index 95a6dde..e06b13c 100644 --- a/GL/flush.c +++ b/GL/flush.c @@ -15,13 +15,15 @@ static PolyList OP_LIST; static PolyList PT_LIST; static PolyList TR_LIST; +static const int STRIDE = sizeof(Vertex) / sizeof(GLuint); + static void pvr_list_submit(void *src, int n) { GLuint *d = TA_SQ_ADDR; GLuint *s = src; /* fill/write queues as many times necessary */ while(n--) { - __asm__("pref @%0" : : "r"(s + 8)); /* prefetch 32 bytes for next loop */ + __asm__("pref @%0" : : "r"(s + STRIDE)); /* prefetch 64 bytes for next loop */ d[0] = *(s++); d[1] = *(s++); d[2] = *(s++); @@ -32,6 +34,7 @@ static void pvr_list_submit(void *src, int n) { d[7] = *(s++); __asm__("pref @%0" : : "r"(d)); d += 8; + s += (STRIDE - 8); } /* Wait for both store queues to complete */ diff --git a/GL/private.h b/GL/private.h index dad060b..a0a0f59 100644 --- a/GL/private.h +++ b/GL/private.h @@ -198,11 +198,23 @@ typedef struct { float xyz[3]; float uv[2]; uint8_t bgra[4]; + uint8_t obgra[4]; + /* End 32 pvr_vertex_t */ - /* In the pvr_vertex_t structure, this next 4 bytes is oargb - * but we're not using that for now, so having W here makes the code - * simpler */ - float w; + /* + * The following are necessary for our purposes + * W - W coordinate - for clipping + * ST - ST coordinate for multitexture + * NXYZ - Normal + */ + + float w; // 4 + float st[2]; // +8 (12) + float nxyz[3]; // +12 (24) + uint8_t visible; // +1 (25) + + uint8_t padding0[3]; // +3 (28) + uint32_t padding1; // +4 (32) } Vertex; @@ -242,13 +254,6 @@ do { \ *b = c; \ } while(0) -/* ClipVertex doesn't have room for these, so we need to parse them - * out separately. Potentially 'w' will be housed here if we support oargb */ -typedef struct { - float nxyz[3]; - float st[2]; -} VertexExtra; - /* Generating PVR vertices from the user-submitted data gets complicated, particularly * when a realloc could invalidate pointers. This structure holds all the information * we need on the target vertex array to allow passing around to the various stages (e.g. generate/clip etc.)