More performance work

This commit is contained in:
Luke Benstead 2021-04-21 15:34:28 +01:00
parent 2547459ef3
commit 26c9a454e4
2 changed files with 59 additions and 28 deletions

View File

@ -797,40 +797,49 @@ static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first
/* Copy the pos, uv and color directly in one go */
const GLubyte* pos = VERTEX_POINTER.ptr + (first * vstride);
const GLubyte* uv = UV_POINTER.ptr + (first * uvstride);
const GLubyte* col = DIFFUSE_POINTER.ptr + (first * dstride);
const GLubyte* st = ST_POINTER.ptr + (first * ststride);
const GLubyte* n = NORMAL_POINTER.ptr + (first * nstride);
typedef struct {
float x, y, z;
} V3;
typedef struct {
float u, v;
} V2;
const GLubyte* pos = (ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG) ? VERTEX_POINTER.ptr + (first * vstride) : NULL;
const GLubyte* uv = (ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) ? UV_POINTER.ptr + (first * uvstride) : NULL;
const GLubyte* col = (ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) ? DIFFUSE_POINTER.ptr + (first * dstride) : NULL;
const GLubyte* st = (ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) ? ST_POINTER.ptr + (first * ststride) : NULL;
const GLubyte* n = (ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) ? NORMAL_POINTER.ptr + (first * nstride) : NULL;
VertexExtra* ve = aligned_vector_at(target->extras, 0);
Vertex* it = start;
ITERATE(count) {
const float w = 1.0f;
uint32_t i = count;
while(i--) {
it->flags = GPU_CMD_VERTEX;
*((V3*) it->xyz) = *((V3*) pos);
*((V2*) it->uv) = *((V2*) uv);
*((uint32_t*) it->bgra) = *((uint32_t*) col);
if(pos) {
TransformVertex((const float*) pos, &w, it->xyz, &it->w);
pos += vstride;
}
*((V2*) ve->st) = *((V2*) st);
*((V3*) ve->nxyz) = *((V3*) n);
if(uv) {
MEMCPY4(it->uv, uv, sizeof(float) * 2);
uv += uvstride;
}
if(col) {
MEMCPY4(it->bgra, col, sizeof(uint32_t));
col += dstride;
}
if(st) {
MEMCPY4(ve->st, st, sizeof(float) * 2);
st += ststride;
}
if(n) {
MEMCPY4(ve->nxyz, n, sizeof(float) * 3);
n += nstride;
}
it++;
ve++;
pos += vstride;
uv += uvstride;
col += dstride;
st += ststride;
n += nstride;
}
}
@ -890,8 +899,6 @@ static void transform(SubmissionTarget* target) {
/* Perform modelview transform, storing W */
Vertex* vertex = _glSubmissionTargetStart(target);
_glApplyRenderMatrix(); /* Apply the Render Matrix Stack */
TransformVertices(vertex, target->count);
}
@ -1095,13 +1102,18 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
/* Make room for the vertices and header */
aligned_vector_extend(&target->output->vector, target->count + 1);
_glApplyRenderMatrix(); /* Apply the Render Matrix Stack */
generate(target, mode, first, count, (GLubyte*) indices, type);
if(doLighting){
light(target);
}
transform(target);
if(!FAST_PATH_ENABLED) {
transform(target);
}
if(_glIsClippingEnabled()) {
#if DEBUG_CLIPPING

View File

@ -69,6 +69,25 @@ inline void TransformVec4(float* x) {
}
/*
 * Transform a single vertex position using the SH4 `ftrv` instruction, which
 * multiplies the fv12 vector (fr12..fr15) by the matrix held in XMTRX.
 *
 * xyz:  input position; xyz[0], xyz[1] and xyz[2] are read.
 * w:    input W component; *w is read into fr15 before the asm block.
 * oxyz: output; oxyz[0..2] receive the contents of fr12..fr14 after `ftrv`.
 * ow:   output; *ow receives the contents of fr15 after `ftrv`.
 *
 * This function does not load XMTRX itself; presumably the caller has already
 * loaded the desired matrix - TODO confirm against call sites.
 *
 * NOTE(review): the asm begins with `fldi1 fr15`, which loads 1.0 into fr15
 * and so appears to override whatever value was read from *w. If that is
 * intentional (W is always treated as 1.0), the `w` parameter is effectively
 * unused - confirm.
 */
static inline void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) {
/* Pin each component to the specific FPU register that `ftrv ...,fv12`
 * operates on (fv12 is the fr12..fr15 group). */
register float __x __asm__("fr12") = (xyz[0]);
register float __y __asm__("fr13") = (xyz[1]);
register float __z __asm__("fr14") = (xyz[2]);
register float __w __asm__("fr15") = (*w);
/* fldi1: set fr15 to 1.0; ftrv: fv12 = XMTRX * fv12.
 * The "0".."3" input constraints tie each input to the same register as the
 * corresponding output, so the transform happens in place. */
__asm__ __volatile__(
"fldi1 fr15\n"
"ftrv xmtrx,fv12\n"
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
: "0" (__x), "1" (__y), "2" (__z), "3" (__w)
);
/* Copy the transformed components out of the pinned registers. */
oxyz[0] = __x;
oxyz[1] = __y;
oxyz[2] = __z;
*ow = __w;
}
static inline void TransformVertices(Vertex* vertices, const int count) {
Vertex* it = vertices;
for(int i = 0; i < count; ++i, ++it) {