Use fat 64-byte vertices to improve data locality

This commit is contained in:
Luke Benstead 2020-10-18 20:39:34 +01:00
parent 8c187f67de
commit 36fe13095c
4 changed files with 49 additions and 90 deletions

View File

@ -85,16 +85,12 @@ const uint32_t VERTEX_CMD = 0xe0000000;
/* One triangle's worth of data as stored in TO_CLIP by _glClipTriangleStrip:
 * copies of its three corners plus a per-corner visibility mask. */
typedef struct {
Vertex vertex[3];
VertexExtra extra[3]; /* VertexExtra record (nxyz, st) copied alongside vertex[i] */
/* Bitmask built as (corner 0 ? 4 : 0) | (corner 1 ? 2 : 0) | (corner 2 ? 1 : 0),
 * i.e. corner x is tested with (visible & (1 << (2 - x))). */
uint8_t visible;
} Triangle;
void _glClipTriangle(const Triangle* triangle, const uint8_t visible, SubmissionTarget* target, const uint8_t flatShade) {
Vertex* last = NULL;
VertexExtra* veLast = NULL;
const Vertex* vertices = triangle->vertex;
const VertexExtra* extras = triangle->extra;
char* bgra = (char*) vertices[2].bgra;
@ -102,25 +98,22 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission
uint32_t finalColour = *((uint32_t*) bgra);
Vertex tmp;
VertexExtra veTmp;
uint8_t pushedCount = 0;
#define IS_VISIBLE(x) (visible & (1 << (2 - (x)))) > 0
#define PUSH_VERT(vert, ve) \
#define PUSH_VERT(vert) \
last = aligned_vector_push_back(&target->output->vector, vert, 1); \
last->flags = VERTEX_CMD; \
veLast = aligned_vector_push_back(target->extras, ve, 1); \
++pushedCount;
#define CLIP_TO_PLANE(vert1, ve1, vert2, ve2) \
#define CLIP_TO_PLANE(vert1, vert2) \
do { \
float t = _glClipLineToNearZ((vert1), (vert2), &tmp); \
interpolateFloat((vert1)->w, (vert2)->w, t, &tmp.w); \
interpolateVec2((vert1)->uv, (vert2)->uv, t, tmp.uv); \
interpolateVec3((ve1)->nxyz, (ve2)->nxyz, t, veTmp.nxyz); \
interpolateVec2((ve1)->st, (ve2)->st, t, veTmp.st); \
interpolateVec3((vert1)->nxyz, (vert2)->nxyz, t, tmp.nxyz); \
interpolateVec2((vert1)->st, (vert2)->st, t, tmp.st); \
if(flatShade) { \
interpolateColour((const uint8_t*) &finalColour, (const uint8_t*) &finalColour, t, tmp.bgra); \
} else { interpolateColour((vert1)->bgra, (vert2)->bgra, t, tmp.bgra); } \
@ -130,44 +123,38 @@ void _glClipTriangle(const Triangle* triangle, const uint8_t visible, Submission
uint8_t v1 = IS_VISIBLE(1);
uint8_t v2 = IS_VISIBLE(2);
if(v0) {
PUSH_VERT(&vertices[0], &extras[0]);
PUSH_VERT(&vertices[0]);
}
if(v0 != v1) {
CLIP_TO_PLANE(&vertices[0], &extras[0], &vertices[1], &extras[1]);
PUSH_VERT(&tmp, &veTmp);
CLIP_TO_PLANE(&vertices[0], &vertices[1]);
PUSH_VERT(&tmp);
}
if(v1) {
PUSH_VERT(&vertices[1], &extras[1]);
PUSH_VERT(&vertices[1]);
}
if(v1 != v2) {
CLIP_TO_PLANE(&vertices[1], &extras[1], &vertices[2], &extras[2]);
PUSH_VERT(&tmp, &veTmp);
CLIP_TO_PLANE(&vertices[1], &vertices[2]);
PUSH_VERT(&tmp);
}
if(v2) {
PUSH_VERT(&vertices[2], &extras[2]);
PUSH_VERT(&vertices[2]);
}
if(v2 != v0) {
CLIP_TO_PLANE(&vertices[2], &extras[2], &vertices[0], &extras[0]);
PUSH_VERT(&tmp, &veTmp);
CLIP_TO_PLANE(&vertices[2], &vertices[0]);
PUSH_VERT(&tmp);
}
if(pushedCount == 4) {
Vertex* prev = last - 1;
VertexExtra* prevVe = veLast - 1;
tmp = *prev;
veTmp = *prevVe;
*prev = *last;
*prevVe = *veLast;
*last = tmp;
*veLast = veTmp;
prev->flags = VERTEX_CMD;
last->flags = VERTEX_CMD_EOL;
@ -309,15 +296,6 @@ void _glClipTriangleStrip(SubmissionTarget* target, uint8_t fladeShade) {
TO_CLIP[CLIP_COUNT].vertex[0] = *v1;
TO_CLIP[CLIP_COUNT].vertex[1] = *v2;
TO_CLIP[CLIP_COUNT].vertex[2] = *v3;
VertexExtra* ve1 = (VertexExtra*) aligned_vector_at(target->extras, vi1);
VertexExtra* ve2 = (VertexExtra*) aligned_vector_at(target->extras, vi2);
VertexExtra* ve3 = (VertexExtra*) aligned_vector_at(target->extras, vi3);
TO_CLIP[CLIP_COUNT].extra[0] = *ve1;
TO_CLIP[CLIP_COUNT].extra[1] = *ve2;
TO_CLIP[CLIP_COUNT].extra[2] = *ve3;
TO_CLIP[CLIP_COUNT].visible = visible;
++CLIP_COUNT;
@ -359,11 +337,6 @@ void _glClipTriangleStrip(SubmissionTarget* target, uint8_t fladeShade) {
TO_CLIP[CLIP_COUNT].vertex[1] = *v2;
TO_CLIP[CLIP_COUNT].vertex[2] = *v4;
VertexExtra* ve4 = (VertexExtra*) aligned_vector_at(target->extras, vi4);
TO_CLIP[CLIP_COUNT].extra[0] = *(VertexExtra*) aligned_vector_at(target->extras, vi3);
TO_CLIP[CLIP_COUNT].extra[1] = *(VertexExtra*) aligned_vector_at(target->extras, vi2);
TO_CLIP[CLIP_COUNT].extra[2] = *ve4;
visible = (_VERT_VISIBLE(v3) ? 4 : 0) |
(_VERT_VISIBLE(v2) ? 2 : 0) |
(_VERT_VISIBLE(v4) ? 1 : 0);
@ -385,11 +358,6 @@ void _glClipTriangleStrip(SubmissionTarget* target, uint8_t fladeShade) {
swapVertex(v3, v4);
v3->flags = VERTEX_CMD;
v4->flags = VERTEX_CMD;
/* Swap the extra data too */
VertexExtra t = *ve4;
*ve3 = *ve4;
*ve4 = t;
}
}
break;

View File

@ -651,35 +651,35 @@ GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, Vertex*
}
}
/* Walks `count` entries of ST_POINTER starting at index `first` and hands each
 * one to the reader returned by calcReadUVFunc(), together with the `st` field
 * of the matching destination record.
 *
 * The source pointer advances by ST_POINTER.stride, or by
 * size * byte_size(type) when the stride is 0. The destination pointer advances
 * by one whole destination record per entry, so only the `st` field of each
 * record is targeted.
 *
 * NOTE(review): ststride is a GLubyte whereas _readNormalData keeps nstride in a
 * GLuint; a stride above 255 would be truncated here - confirm this is intended.
 * NOTE(review): `stptr += ststride` is arithmetic on a `const void*`, which
 * relies on a compiler extension - confirm the toolchain allows it. */
GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, VertexExtra* extra) {
GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, Vertex* outpu) {
const GLubyte ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
/* Start of the first requested entry in the client array */
const void* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride));
ReadUVFunc func = calcReadUVFunc();
GLubyte* out = (GLubyte*) extra[0].st;
GLubyte* out = (GLubyte*) outpu[0].st;
ITERATE(count) {
func(stptr, out);
stptr += ststride;
/* Step to the `st` field of the next destination record */
out += sizeof(VertexExtra);
out += sizeof(Vertex);
}
}
GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, VertexExtra* extra) {
GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, Vertex* output) {
const GLuint nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
const void* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride));
ReadNormalFunc func = calcReadNormalFunc();
GLubyte* out = (GLubyte*) extra[0].nxyz;
GLubyte* out = (GLubyte*) output[0].nxyz;
ITERATE(count) {
func(nptr, out);
nptr += nstride;
out += sizeof(VertexExtra);
out += sizeof(Vertex);
}
if(_glIsNormalizeEnabled()) {
GLubyte* ptr = (GLubyte*) extra->nxyz;
GLubyte* ptr = (GLubyte*) output->nxyz;
ITERATE(count) {
GLfloat* n = (GLfloat*) ptr;
float temp = n[0] * n[0];
@ -691,7 +691,7 @@ GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, Ver
n[1] *= ilength;
n[2] *= ilength;
ptr += sizeof(VertexExtra);
ptr += sizeof(Vertex);
}
}
}
@ -724,7 +724,6 @@ static void generateElements(
GLubyte* nxyz;
Vertex* output = _glSubmissionTargetStart(target);
VertexExtra* ve = aligned_vector_at(target->extras, 0);
uint32_t i = first;
uint32_t idx = 0;
@ -762,12 +761,11 @@ static void generateElements(
pos_func(xyz, (GLubyte*) output->xyz);
uv_func(uv, (GLubyte*) output->uv);
diffuse_func(bgra, output->bgra);
st_func(st, (GLubyte*) ve->st);
normal_func(nxyz, (GLubyte*) ve->nxyz);
st_func(st, (GLubyte*) output->st);
normal_func(nxyz, (GLubyte*) output->nxyz);
output->flags = PVR_CMD_VERTEX;
++output;
++ve;
}
}
@ -804,10 +802,8 @@ static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei
}
}
VertexExtra* ve = aligned_vector_at(target->extras, 0);
_readNormalData(first, count, ve);
_readSTData(first, count, ve);
_readNormalData(first, count, start);
_readSTData(first, count, start);
} else {
generateElements(
@ -917,14 +913,13 @@ static void light(SubmissionTarget* target) {
/* Perform lighting calculations and manipulate the colour */
Vertex* vertex = _glSubmissionTargetStart(target);
VertexExtra* extra = aligned_vector_at(target->extras, 0);
EyeSpaceData* eye_space = (EyeSpaceData*) eye_space_data->data;
_glMatrixLoadModelView();
mat_transform3(vertex->xyz, eye_space->xyz, target->count, sizeof(Vertex), sizeof(EyeSpaceData));
_glMatrixLoadNormal();
mat_transform_normal3(extra->nxyz, eye_space->n, target->count, sizeof(VertexExtra), sizeof(EyeSpaceData));
mat_transform_normal3(vertex->nxyz, eye_space->n, target->count, sizeof(Vertex), sizeof(EyeSpaceData));
EyeSpaceData* ES = aligned_vector_at(eye_space_data, 0);
_glPerformLighting(vertex, ES, target->count);
@ -1005,18 +1000,13 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
}
static SubmissionTarget* target = NULL;
static AlignedVector extras;
/* Initialization of the target and extras */
/* Initialization of the target */
if(!target) {
target = (SubmissionTarget*) malloc(sizeof(SubmissionTarget));
target->extras = NULL;
target->count = 0;
target->output = NULL;
target->header_offset = target->start_offset = 0;
aligned_vector_init(&extras, sizeof(VertexExtra));
target->extras = &extras;
}
GLboolean doMultitexture, doTexture, doLighting;
@ -1057,9 +1047,6 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
assert(target->count);
/* Make sure we have enough room for all the "extra" data */
aligned_vector_resize(&extras, target->count);
/* Make room for the vertices and header */
aligned_vector_extend(&target->output->vector, target->count + 1);
generate(target, mode, first, count, (GLubyte*) indices, type);
@ -1087,8 +1074,6 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
clip(target);
assert(extras.size == target->count);
#if DEBUG_CLIPPING
fprintf(stderr, "--------\n");
for(i = 0; i < target->count; ++i) {
@ -1139,12 +1124,10 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
PVRHeader* mtHeader = (PVRHeader*) vertex++;
/* Replace the UV coordinates with the ST ones */
VertexExtra* ve = aligned_vector_at(target->extras, 0);
ITERATE(target->count) {
vertex->uv[0] = ve->st[0];
vertex->uv[1] = ve->st[1];
vertex->uv[0] = vertex->st[0];
vertex->uv[1] = vertex->st[1];
++vertex;
++ve;
}
/* Send the buffer again to the transparent list */

View File

@ -15,13 +15,15 @@ static PolyList OP_LIST;
static PolyList PT_LIST;
static PolyList TR_LIST;
static const int STRIDE = sizeof(Vertex) / sizeof(GLuint);
static void pvr_list_submit(void *src, int n) {
GLuint *d = TA_SQ_ADDR;
GLuint *s = src;
/* fill/write queues as many times necessary */
while(n--) {
__asm__("pref @%0" : : "r"(s + 8)); /* prefetch 32 bytes for next loop */
__asm__("pref @%0" : : "r"(s + STRIDE)); /* prefetch 64 bytes for next loop */
d[0] = *(s++);
d[1] = *(s++);
d[2] = *(s++);
@ -32,6 +34,7 @@ static void pvr_list_submit(void *src, int n) {
d[7] = *(s++);
__asm__("pref @%0" : : "r"(d));
d += 8;
s += (STRIDE - 8);
}
/* Wait for both store queues to complete */

View File

@ -198,11 +198,23 @@ typedef struct {
float xyz[3];
float uv[2];
uint8_t bgra[4];
uint8_t obgra[4];
/* End 32 pvr_vertex_t */
/* In the pvr_vertex_t structure, this next 4 bytes is oargb
* but we're not using that for now, so having W here makes the code
* simpler */
float w;
/*
* The following are necessary for our purposes
* W - W coordinate - for clipping
* ST - ST coordinate for multitexture
* NXYZ - Normal
*/
float w; // 4
float st[2]; // +8 (12)
float nxyz[3]; // +12 (24)
uint8_t visible; // +1 (25)
uint8_t padding0[3]; // +3 (28)
uint32_t padding1; // +4 (32)
} Vertex;
@ -242,13 +254,6 @@ do { \
*b = c; \
} while(0)
/* ClipVertex doesn't have room for these, so we need to parse them
* out separately. Potentially 'w' will be housed here if we support oargb */
typedef struct {
float nxyz[3]; /* Normal */
float st[2]; /* ST coordinate for multitexture */
} VertexExtra;
/* Generating PVR vertices from the user-submitted data gets complicated, particularly
* when a realloc could invalidate pointers. This structure holds all the information
* we need on the target vertex array to allow passing around to the various stages (e.g. generate/clip etc.)