Use fat 64-byte vertices to improve data locality

This commit is contained in:
Luke Benstead 2020-10-18 20:39:34 +01:00
parent 8c187f67de
commit 36fe13095c
4 changed files with 49 additions and 90 deletions

View File

@ -85,16 +85,12 @@ const uint32_t VERTEX_CMD = 0xe0000000;
typedef struct { typedef struct {
Vertex vertex[3]; Vertex vertex[3];
VertexExtra extra[3];
uint8_t visible; uint8_t visible;
} Triangle; } Triangle;
void _glClipTriangle(const Triangle* triangle, const uint8_t visible, SubmissionTarget* target, const uint8_t flatShade) { void _glClipTriangle(const Triangle* triangle, const uint8_t visible, SubmissionTarget* target, const uint8_t flatShade) {
Vertex* last = NULL; Vertex* last = NULL;
VertexExtra* veLast = NULL;
const Vertex* vertices = triangle->vertex; const Vertex* vertices = triangle->vertex;
const VertexExtra* extras = triangle->extra;
char* bgra = (char*) vertices[2].bgra; char* bgra = (char*) vertices[2].bgra;
@ -102,25 +98,22 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission
uint32_t finalColour = *((uint32_t*) bgra); uint32_t finalColour = *((uint32_t*) bgra);
Vertex tmp; Vertex tmp;
VertexExtra veTmp;
uint8_t pushedCount = 0; uint8_t pushedCount = 0;
#define IS_VISIBLE(x) (visible & (1 << (2 - (x)))) > 0 #define IS_VISIBLE(x) (visible & (1 << (2 - (x)))) > 0
#define PUSH_VERT(vert, ve) \ #define PUSH_VERT(vert) \
last = aligned_vector_push_back(&target->output->vector, vert, 1); \ last = aligned_vector_push_back(&target->output->vector, vert, 1); \
last->flags = VERTEX_CMD; \ last->flags = VERTEX_CMD; \
veLast = aligned_vector_push_back(target->extras, ve, 1); \
++pushedCount; ++pushedCount;
#define CLIP_TO_PLANE(vert1, ve1, vert2, ve2) \ #define CLIP_TO_PLANE(vert1, vert2) \
do { \ do { \
float t = _glClipLineToNearZ((vert1), (vert2), &tmp); \ float t = _glClipLineToNearZ((vert1), (vert2), &tmp); \
interpolateFloat((vert1)->w, (vert2)->w, t, &tmp.w); \ interpolateFloat((vert1)->w, (vert2)->w, t, &tmp.w); \
interpolateVec2((vert1)->uv, (vert2)->uv, t, tmp.uv); \ interpolateVec2((vert1)->uv, (vert2)->uv, t, tmp.uv); \
interpolateVec3((ve1)->nxyz, (ve2)->nxyz, t, veTmp.nxyz); \ interpolateVec3((vert1)->nxyz, (vert2)->nxyz, t, tmp.nxyz); \
interpolateVec2((ve1)->st, (ve2)->st, t, veTmp.st); \ interpolateVec2((vert1)->st, (vert2)->st, t, tmp.st); \
if(flatShade) { \ if(flatShade) { \
interpolateColour((const uint8_t*) &finalColour, (const uint8_t*) &finalColour, t, tmp.bgra); \ interpolateColour((const uint8_t*) &finalColour, (const uint8_t*) &finalColour, t, tmp.bgra); \
} else { interpolateColour((vert1)->bgra, (vert2)->bgra, t, tmp.bgra); } \ } else { interpolateColour((vert1)->bgra, (vert2)->bgra, t, tmp.bgra); } \
@ -130,44 +123,38 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission
uint8_t v1 = IS_VISIBLE(1); uint8_t v1 = IS_VISIBLE(1);
uint8_t v2 = IS_VISIBLE(2); uint8_t v2 = IS_VISIBLE(2);
if(v0) { if(v0) {
PUSH_VERT(&vertices[0], &extras[0]); PUSH_VERT(&vertices[0]);
} }
if(v0 != v1) { if(v0 != v1) {
CLIP_TO_PLANE(&vertices[0], &extras[0], &vertices[1], &extras[1]); CLIP_TO_PLANE(&vertices[0], &vertices[1]);
PUSH_VERT(&tmp, &veTmp); PUSH_VERT(&tmp);
} }
if(v1) { if(v1) {
PUSH_VERT(&vertices[1], &extras[1]); PUSH_VERT(&vertices[1]);
} }
if(v1 != v2) { if(v1 != v2) {
CLIP_TO_PLANE(&vertices[1], &extras[1], &vertices[2], &extras[2]); CLIP_TO_PLANE(&vertices[1], &vertices[2]);
PUSH_VERT(&tmp, &veTmp); PUSH_VERT(&tmp);
} }
if(v2) { if(v2) {
PUSH_VERT(&vertices[2], &extras[2]); PUSH_VERT(&vertices[2]);
} }
if(v2 != v0) { if(v2 != v0) {
CLIP_TO_PLANE(&vertices[2], &extras[2], &vertices[0], &extras[0]); CLIP_TO_PLANE(&vertices[2], &vertices[0]);
PUSH_VERT(&tmp, &veTmp); PUSH_VERT(&tmp);
} }
if(pushedCount == 4) { if(pushedCount == 4) {
Vertex* prev = last - 1; Vertex* prev = last - 1;
VertexExtra* prevVe = veLast - 1;
tmp = *prev; tmp = *prev;
veTmp = *prevVe;
*prev = *last; *prev = *last;
*prevVe = *veLast;
*last = tmp; *last = tmp;
*veLast = veTmp;
prev->flags = VERTEX_CMD; prev->flags = VERTEX_CMD;
last->flags = VERTEX_CMD_EOL; last->flags = VERTEX_CMD_EOL;
@ -309,15 +296,6 @@ void _glClipTriangleStrip(SubmissionTarget* target, uint8_t fladeShade) {
TO_CLIP[CLIP_COUNT].vertex[0] = *v1; TO_CLIP[CLIP_COUNT].vertex[0] = *v1;
TO_CLIP[CLIP_COUNT].vertex[1] = *v2; TO_CLIP[CLIP_COUNT].vertex[1] = *v2;
TO_CLIP[CLIP_COUNT].vertex[2] = *v3; TO_CLIP[CLIP_COUNT].vertex[2] = *v3;
VertexExtra* ve1 = (VertexExtra*) aligned_vector_at(target->extras, vi1);
VertexExtra* ve2 = (VertexExtra*) aligned_vector_at(target->extras, vi2);
VertexExtra* ve3 = (VertexExtra*) aligned_vector_at(target->extras, vi3);
TO_CLIP[CLIP_COUNT].extra[0] = *ve1;
TO_CLIP[CLIP_COUNT].extra[1] = *ve2;
TO_CLIP[CLIP_COUNT].extra[2] = *ve3;
TO_CLIP[CLIP_COUNT].visible = visible; TO_CLIP[CLIP_COUNT].visible = visible;
++CLIP_COUNT; ++CLIP_COUNT;
@ -359,11 +337,6 @@ void _glClipTriangleStrip(SubmissionTarget* target, uint8_t fladeShade) {
TO_CLIP[CLIP_COUNT].vertex[1] = *v2; TO_CLIP[CLIP_COUNT].vertex[1] = *v2;
TO_CLIP[CLIP_COUNT].vertex[2] = *v4; TO_CLIP[CLIP_COUNT].vertex[2] = *v4;
VertexExtra* ve4 = (VertexExtra*) aligned_vector_at(target->extras, vi4);
TO_CLIP[CLIP_COUNT].extra[0] = *(VertexExtra*) aligned_vector_at(target->extras, vi3);
TO_CLIP[CLIP_COUNT].extra[1] = *(VertexExtra*) aligned_vector_at(target->extras, vi2);
TO_CLIP[CLIP_COUNT].extra[2] = *ve4;
visible = (_VERT_VISIBLE(v3) ? 4 : 0) | visible = (_VERT_VISIBLE(v3) ? 4 : 0) |
(_VERT_VISIBLE(v2) ? 2 : 0) | (_VERT_VISIBLE(v2) ? 2 : 0) |
(_VERT_VISIBLE(v4) ? 1 : 0); (_VERT_VISIBLE(v4) ? 1 : 0);
@ -385,11 +358,6 @@ void _glClipTriangleStrip(SubmissionTarget* target, uint8_t fladeShade) {
swapVertex(v3, v4); swapVertex(v3, v4);
v3->flags = VERTEX_CMD; v3->flags = VERTEX_CMD;
v4->flags = VERTEX_CMD; v4->flags = VERTEX_CMD;
/* Swap the extra data too */
VertexExtra t = *ve4;
*ve3 = *ve4;
*ve4 = t;
} }
} }
break; break;

View File

@ -651,35 +651,35 @@ GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, Vertex*
} }
} }
GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, VertexExtra* extra) { GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, Vertex* outpu) {
const GLubyte ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); const GLubyte ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
const void* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride)); const void* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride));
ReadUVFunc func = calcReadUVFunc(); ReadUVFunc func = calcReadUVFunc();
GLubyte* out = (GLubyte*) extra[0].st; GLubyte* out = (GLubyte*) outpu[0].st;
ITERATE(count) { ITERATE(count) {
func(stptr, out); func(stptr, out);
stptr += ststride; stptr += ststride;
out += sizeof(VertexExtra); out += sizeof(Vertex);
} }
} }
GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, VertexExtra* extra) { GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, Vertex* output) {
const GLuint nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); const GLuint nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
const void* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride)); const void* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride));
ReadNormalFunc func = calcReadNormalFunc(); ReadNormalFunc func = calcReadNormalFunc();
GLubyte* out = (GLubyte*) extra[0].nxyz; GLubyte* out = (GLubyte*) output[0].nxyz;
ITERATE(count) { ITERATE(count) {
func(nptr, out); func(nptr, out);
nptr += nstride; nptr += nstride;
out += sizeof(VertexExtra); out += sizeof(Vertex);
} }
if(_glIsNormalizeEnabled()) { if(_glIsNormalizeEnabled()) {
GLubyte* ptr = (GLubyte*) extra->nxyz; GLubyte* ptr = (GLubyte*) output->nxyz;
ITERATE(count) { ITERATE(count) {
GLfloat* n = (GLfloat*) ptr; GLfloat* n = (GLfloat*) ptr;
float temp = n[0] * n[0]; float temp = n[0] * n[0];
@ -691,7 +691,7 @@ GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, Ver
n[1] *= ilength; n[1] *= ilength;
n[2] *= ilength; n[2] *= ilength;
ptr += sizeof(VertexExtra); ptr += sizeof(Vertex);
} }
} }
} }
@ -724,7 +724,6 @@ static void generateElements(
GLubyte* nxyz; GLubyte* nxyz;
Vertex* output = _glSubmissionTargetStart(target); Vertex* output = _glSubmissionTargetStart(target);
VertexExtra* ve = aligned_vector_at(target->extras, 0);
uint32_t i = first; uint32_t i = first;
uint32_t idx = 0; uint32_t idx = 0;
@ -762,12 +761,11 @@ static void generateElements(
pos_func(xyz, (GLubyte*) output->xyz); pos_func(xyz, (GLubyte*) output->xyz);
uv_func(uv, (GLubyte*) output->uv); uv_func(uv, (GLubyte*) output->uv);
diffuse_func(bgra, output->bgra); diffuse_func(bgra, output->bgra);
st_func(st, (GLubyte*) ve->st); st_func(st, (GLubyte*) output->st);
normal_func(nxyz, (GLubyte*) ve->nxyz); normal_func(nxyz, (GLubyte*) output->nxyz);
output->flags = PVR_CMD_VERTEX; output->flags = PVR_CMD_VERTEX;
++output; ++output;
++ve;
} }
} }
@ -804,10 +802,8 @@ static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei
} }
} }
VertexExtra* ve = aligned_vector_at(target->extras, 0); _readNormalData(first, count, start);
_readSTData(first, count, start);
_readNormalData(first, count, ve);
_readSTData(first, count, ve);
} else { } else {
generateElements( generateElements(
@ -917,14 +913,13 @@ static void light(SubmissionTarget* target) {
/* Perform lighting calculations and manipulate the colour */ /* Perform lighting calculations and manipulate the colour */
Vertex* vertex = _glSubmissionTargetStart(target); Vertex* vertex = _glSubmissionTargetStart(target);
VertexExtra* extra = aligned_vector_at(target->extras, 0);
EyeSpaceData* eye_space = (EyeSpaceData*) eye_space_data->data; EyeSpaceData* eye_space = (EyeSpaceData*) eye_space_data->data;
_glMatrixLoadModelView(); _glMatrixLoadModelView();
mat_transform3(vertex->xyz, eye_space->xyz, target->count, sizeof(Vertex), sizeof(EyeSpaceData)); mat_transform3(vertex->xyz, eye_space->xyz, target->count, sizeof(Vertex), sizeof(EyeSpaceData));
_glMatrixLoadNormal(); _glMatrixLoadNormal();
mat_transform_normal3(extra->nxyz, eye_space->n, target->count, sizeof(VertexExtra), sizeof(EyeSpaceData)); mat_transform_normal3(vertex->nxyz, eye_space->n, target->count, sizeof(Vertex), sizeof(EyeSpaceData));
EyeSpaceData* ES = aligned_vector_at(eye_space_data, 0); EyeSpaceData* ES = aligned_vector_at(eye_space_data, 0);
_glPerformLighting(vertex, ES, target->count); _glPerformLighting(vertex, ES, target->count);
@ -1005,18 +1000,13 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
} }
static SubmissionTarget* target = NULL; static SubmissionTarget* target = NULL;
static AlignedVector extras;
/* Initialization of the target and extras */ /* Initialization of the target */
if(!target) { if(!target) {
target = (SubmissionTarget*) malloc(sizeof(SubmissionTarget)); target = (SubmissionTarget*) malloc(sizeof(SubmissionTarget));
target->extras = NULL;
target->count = 0; target->count = 0;
target->output = NULL; target->output = NULL;
target->header_offset = target->start_offset = 0; target->header_offset = target->start_offset = 0;
aligned_vector_init(&extras, sizeof(VertexExtra));
target->extras = &extras;
} }
GLboolean doMultitexture, doTexture, doLighting; GLboolean doMultitexture, doTexture, doLighting;
@ -1057,9 +1047,6 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
assert(target->count); assert(target->count);
/* Make sure we have enough room for all the "extra" data */
aligned_vector_resize(&extras, target->count);
/* Make room for the vertices and header */ /* Make room for the vertices and header */
aligned_vector_extend(&target->output->vector, target->count + 1); aligned_vector_extend(&target->output->vector, target->count + 1);
generate(target, mode, first, count, (GLubyte*) indices, type); generate(target, mode, first, count, (GLubyte*) indices, type);
@ -1087,8 +1074,6 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
clip(target); clip(target);
assert(extras.size == target->count);
#if DEBUG_CLIPPING #if DEBUG_CLIPPING
fprintf(stderr, "--------\n"); fprintf(stderr, "--------\n");
for(i = 0; i < target->count; ++i) { for(i = 0; i < target->count; ++i) {
@ -1139,12 +1124,10 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
PVRHeader* mtHeader = (PVRHeader*) vertex++; PVRHeader* mtHeader = (PVRHeader*) vertex++;
/* Replace the UV coordinates with the ST ones */ /* Replace the UV coordinates with the ST ones */
VertexExtra* ve = aligned_vector_at(target->extras, 0);
ITERATE(target->count) { ITERATE(target->count) {
vertex->uv[0] = ve->st[0]; vertex->uv[0] = vertex->st[0];
vertex->uv[1] = ve->st[1]; vertex->uv[1] = vertex->st[1];
++vertex; ++vertex;
++ve;
} }
/* Send the buffer again to the transparent list */ /* Send the buffer again to the transparent list */

View File

@ -15,13 +15,15 @@ static PolyList OP_LIST;
static PolyList PT_LIST; static PolyList PT_LIST;
static PolyList TR_LIST; static PolyList TR_LIST;
static const int STRIDE = sizeof(Vertex) / sizeof(GLuint);
static void pvr_list_submit(void *src, int n) { static void pvr_list_submit(void *src, int n) {
GLuint *d = TA_SQ_ADDR; GLuint *d = TA_SQ_ADDR;
GLuint *s = src; GLuint *s = src;
/* fill/write queues as many times necessary */ /* fill/write queues as many times necessary */
while(n--) { while(n--) {
__asm__("pref @%0" : : "r"(s + 8)); /* prefetch 32 bytes for next loop */ __asm__("pref @%0" : : "r"(s + STRIDE)); /* prefetch 64 bytes for next loop */
d[0] = *(s++); d[0] = *(s++);
d[1] = *(s++); d[1] = *(s++);
d[2] = *(s++); d[2] = *(s++);
@ -32,6 +34,7 @@ static void pvr_list_submit(void *src, int n) {
d[7] = *(s++); d[7] = *(s++);
__asm__("pref @%0" : : "r"(d)); __asm__("pref @%0" : : "r"(d));
d += 8; d += 8;
s += (STRIDE - 8);
} }
/* Wait for both store queues to complete */ /* Wait for both store queues to complete */

View File

@ -198,11 +198,23 @@ typedef struct {
float xyz[3]; float xyz[3];
float uv[2]; float uv[2];
uint8_t bgra[4]; uint8_t bgra[4];
uint8_t obgra[4];
/* End 32 pvr_vertex_t */
/* In the pvr_vertex_t structure, this next 4 bytes is oargb /*
* but we're not using that for now, so having W here makes the code * The following are necessary for our purposes
* simpler */ * W - W coordinate - for clipping
float w; * ST - ST coordinate for multitexture
* NXYZ - Normal
*/
float w; // 4
float st[2]; // +8 (12)
float nxyz[3]; // +12 (24)
uint8_t visible; // +1 (25)
uint8_t padding0[3]; // +3 (28)
uint32_t padding1; // +4 (32)
} Vertex; } Vertex;
@ -242,13 +254,6 @@ do { \
*b = c; \ *b = c; \
} while(0) } while(0)
/* Per-vertex attributes that are kept in a separate structure from the
 * main vertex data.
 *
 * ClipVertex doesn't have room for these, so we need to parse them
 * out separately. Potentially 'w' will be housed here if we support oargb
 *
 * NOTE(review): "ClipVertex" presumably refers to the struct named Vertex
 * elsewhere in this code - confirm and update the name if so. */
typedef struct {
float nxyz[3]; /* normal vector (x, y, z) */
float st[2]; /* ST texture coordinate, used for multitexture */
} VertexExtra;
/* Generating PVR vertices from the user-submitted data gets complicated, particularly /* Generating PVR vertices from the user-submitted data gets complicated, particularly
* when a realloc could invalidate pointers. This structure holds all the information * when a realloc could invalidate pointers. This structure holds all the information
* we need on the target vertex array to allow passing around to the various stages (e.g. generate/clip etc.) * we need on the target vertex array to allow passing around to the various stages (e.g. generate/clip etc.)