Optimise vertex transform in non-fast path to avoid storing xyz to memory and then loading it again

This commit is contained in:
UnknownShadow200 2025-02-02 14:36:40 +11:00
parent 9d717800bd
commit 3b2e549934
6 changed files with 66 additions and 52 deletions

View File

@ -37,67 +37,89 @@ GLuint* _glGetEnabledAttributes() {
/*
 * Position reader for 3-component float input.
 *
 * in  - points at three consecutive floats (x, y, z).
 * out - points at the destination Vertex.
 *
 * The components are loaded into locals and passed straight to
 * TransformVertex together with w = 1.0f; the results are written to
 * it->xyz and it->w.  No untransformed copy is stored into `out` first:
 * `out` is a Vertex, not a bare float[3], so such a store would be both
 * redundant and aimed at the start of the struct rather than at xyz.
 */
static void _readPosition3f3f(const GLubyte* __restrict__ in, GLubyte* __restrict__ out) {
    const float* input = (const float*) in;
    Vertex* it = (Vertex*) out;

    float x = input[0];
    float y = input[1];
    float z = input[2];
    float w = 1.0f;

    TransformVertex(x, y, z, w, it->xyz, &it->w);
}
/*
 * Position reader for 3-component unsigned-byte input.
 *
 * input - points at three consecutive bytes (x, y, z).
 * out   - points at the destination Vertex.
 *
 * Each byte is scaled by ONE_OVER_TWO_FIVE_FIVE (presumably 1/255 -
 * confirm against its definition) and the scaled values are passed to
 * TransformVertex with w = 1.0f; results land in it->xyz and it->w.
 * Nothing is written to `out` through a float pointer beforehand, so the
 * Vertex is only touched via its named fields.
 */
static void _readPosition3ub3f(const GLubyte* input, GLubyte* out) {
    Vertex* it = (Vertex*) out;

    float x = input[0] * ONE_OVER_TWO_FIVE_FIVE;
    float y = input[1] * ONE_OVER_TWO_FIVE_FIVE;
    float z = input[2] * ONE_OVER_TWO_FIVE_FIVE;
    float w = 1.0f;

    TransformVertex(x, y, z, w, it->xyz, &it->w);
}
/*
 * Position reader for 3-component unsigned-short input.
 *
 * in  - points at three consecutive GLushort values (x, y, z).
 * out - points at the destination Vertex.
 *
 * The shorts are converted to float (no scaling is applied) and passed to
 * TransformVertex with w = 1.0f; results are written to it->xyz and
 * it->w.  The Vertex is only written through those two fields.
 */
static void _readPosition3us3f(const GLubyte* in, GLubyte* out) {
    const GLushort* input = (const GLushort*) in;
    Vertex* it = (Vertex*) out;

    float x = input[0];
    float y = input[1];
    float z = input[2];
    float w = 1.0f;

    TransformVertex(x, y, z, w, it->xyz, &it->w);
}
/*
 * Position reader for 3-component unsigned-int input.
 *
 * in  - points at three consecutive GLuint values (x, y, z).
 * out - points at the destination Vertex.
 *
 * The integers are converted to float (no scaling is applied) and passed
 * to TransformVertex with w = 1.0f; results are written to it->xyz and
 * it->w.  The Vertex is only written through those two fields.
 */
static void _readPosition3ui3f(const GLubyte* in, GLubyte* out) {
    const GLuint* input = (const GLuint*) in;
    Vertex* it = (Vertex*) out;

    float x = input[0];
    float y = input[1];
    float z = input[2];
    float w = 1.0f;

    TransformVertex(x, y, z, w, it->xyz, &it->w);
}
/*
 * Position reader for 2-component float input.
 *
 * in  - points at two consecutive floats (x, y).
 * out - points at the destination Vertex.
 *
 * z is fixed at 0.0f and w at 1.0f.  The four values are passed to
 * TransformVertex, which writes it->xyz and it->w.  No untransformed
 * copy is stored into `out` first.
 */
static void _readPosition2f3f(const GLubyte* in, GLubyte* out) {
    const float* input = (const float*) in;
    Vertex* it = (Vertex*) out;

    float x = input[0];
    float y = input[1];
    float z = 0.0f;
    float w = 1.0f;

    TransformVertex(x, y, z, w, it->xyz, &it->w);
}
/*
 * Position reader for 2-component unsigned-byte input.
 *
 * input - points at two consecutive bytes (x, y).
 * out   - points at the destination Vertex.
 *
 * x and y are scaled by ONE_OVER_TWO_FIVE_FIVE (presumably 1/255 -
 * confirm against its definition); z is fixed at 0.0f and w at 1.0f.
 * The values are passed to TransformVertex, which writes it->xyz and
 * it->w.  No untransformed copy is stored into `out` first.
 */
static void _readPosition2ub3f(const GLubyte* input, GLubyte* out) {
    Vertex* it = (Vertex*) out;

    float x = input[0] * ONE_OVER_TWO_FIVE_FIVE;
    float y = input[1] * ONE_OVER_TWO_FIVE_FIVE;
    float z = 0.0f;
    float w = 1.0f;

    TransformVertex(x, y, z, w, it->xyz, &it->w);
}
/*
 * Position reader for 2-component unsigned-short input.
 *
 * in  - points at two consecutive GLushort values (x, y).
 * out - points at the destination Vertex.
 *
 * x and y are converted to float without scaling; z is fixed at 0.0f and
 * w at 1.0f.  The values are passed to TransformVertex, which writes
 * it->xyz and it->w.  No untransformed copy is stored into `out` first.
 */
static void _readPosition2us3f(const GLubyte* in, GLubyte* out) {
    const GLushort* input = (const GLushort*) in;
    Vertex* it = (Vertex*) out;

    float x = input[0];
    float y = input[1];
    float z = 0.0f;
    float w = 1.0f;

    TransformVertex(x, y, z, w, it->xyz, &it->w);
}
/*
 * Position reader for 2-component unsigned-int input.
 *
 * in  - points at two consecutive GLuint values (x, y).
 * out - points at the destination Vertex.
 *
 * x and y are converted to float without scaling; z is fixed at 0.0f and
 * w at 1.0f.  The values are passed to TransformVertex, which writes
 * it->xyz and it->w.  No untransformed copy is stored into `out` first.
 */
static void _readPosition2ui3f(const GLubyte* in, GLubyte* out) {
    const GLuint* input = (const GLuint*) in;
    Vertex* it = (Vertex*) out;

    float x = input[0];
    float y = input[1];
    float z = 0.0f;
    float w = 1.0f;

    TransformVertex(x, y, z, w, it->xyz, &it->w);
}
static ReadAttributeFunc calcReadPositionFunc() {

View File

@ -289,13 +289,10 @@ static void _readPositionData(const GLuint first, const GLuint count, Vertex* it
const GLsizei vstride = ATTRIB_LIST.vertex.stride;
const GLubyte* vptr = ((GLubyte*) ATTRIB_LIST.vertex.ptr + (first * vstride));
float pos[3], w = 1.0f;
ITERATE(count) {
PREFETCH(vptr + vstride);
func(vptr, (GLubyte*) pos);
func(vptr, (GLubyte*) it);
it->flags = GPU_CMD_VERTEX;
TransformVertex(pos, &w, it->xyz, &it->w);
vptr += vstride;
++it;
@ -411,8 +408,7 @@ static void generateElements(
st = (GLubyte*) ATTRIB_LIST.st.ptr + (idx * ststride);
nxyz = (GLubyte*) ATTRIB_LIST.normal.ptr + (idx * nstride);
pos_func(xyz, (GLubyte*) pos);
TransformVertex((const float*) pos, &w, output->xyz, &output->w);
pos_func(xyz, (GLubyte*) output);
uv_func(uv, (GLubyte*) output->uv);
diffuse_func(bgra, output->bgra);
st_func(st, (GLubyte*) ve->st);
@ -460,8 +456,6 @@ static void generateElementsFastPath(
VertexExtra* ve = aligned_vector_at(target->extras, 0);
Vertex* it = start;
const float w = 1.0f;
if(!pos) {
return;
}
@ -472,7 +466,7 @@ static void generateElementsFastPath(
it->flags = GPU_CMD_VERTEX;
pos = (GLubyte*) ATTRIB_LIST.vertex.ptr + (idx * vstride);
TransformVertex((const float*) pos, &w, it->xyz, &it->w);
TransformVertex(((float*) pos)[0], ((float*) pos)[1], ((float*) pos)[2], 1.0f, it->xyz, &it->w);
if(uv) {
uv = (GLubyte*) ATTRIB_LIST.uv.ptr + (idx * uvstride);

View File

@ -5,7 +5,6 @@
MAKE_FUNC(POLYMODE)
{
static const float w = 1.0f;
if(!(ATTRIB_LIST.enabled & VERTEX_ENABLED_FLAG)) {
/* If we don't have vertices, do nothing */
return;
@ -75,7 +74,7 @@ MAKE_FUNC(POLYMODE)
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
PREFETCH(ptr + stride);
TransformVertex((const float*) ptr, &w, it->xyz, &it->w);
TransformVertex(((float*) ptr)[0], ((float*) ptr)[1], ((float*) ptr)[2], 1.0f, it->xyz, &it->w);
PROCESS_VERTEX_FLAGS(it, min + i);
ptr += stride;
}

View File

@ -106,15 +106,14 @@ inline void TransformVec4(float* x) {
}
GL_FORCE_INLINE void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) {
register float __x __asm__("fr12") = (xyz[0]);
register float __y __asm__("fr13") = (xyz[1]);
register float __z __asm__("fr14") = (xyz[2]);
register float __w __asm__("fr15") = (*w);
GL_FORCE_INLINE void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow) {
register float __x __asm__("fr4") = x;
register float __y __asm__("fr5") = y;
register float __z __asm__("fr6") = z;
register float __w __asm__("fr7") = w;
__asm__ __volatile__(
"fldi1 fr15\n"
"ftrv xmtrx,fv12\n"
"ftrv xmtrx,fv4\n"
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
: "0" (__x), "1" (__y), "2" (__z), "3" (__w)
);

View File

@ -636,12 +636,12 @@ void TransformVertices(Vertex* vertices, const int count) {
}
}
void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) {
void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow) {
float ret[4];
ret[0] = xyz[0];
ret[1] = xyz[1];
ret[2] = xyz[2];
ret[3] = *w;
ret[0] = x;
ret[1] = y;
ret[2] = z;
ret[3] = w;
TransformVec4(ret);

View File

@ -53,7 +53,7 @@ static inline void TransformNormalNoMod(const float* xIn, float* xOut) {
}
void TransformVertices(Vertex* vertices, const int count);
/*
 * Transforms a single vertex position.
 *
 * x, y, z, w - input position components, passed by value.
 * oxyz       - receives the output x, y, z.
 * ow         - receives the output w.
 *
 * Only this by-value signature is declared: a second, pointer-based
 * declaration of the same name would be an incompatible redeclaration.
 */
void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow);
void InitGPU(_Bool autosort, _Bool fsaa);