From 9d717800bd63be3ae64e4c77f4eccacc1e07f982 Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Sun, 2 Feb 2025 13:51:30 +1100 Subject: [PATCH 1/3] Reorganise attribute parsing functions --- GL/attributes.c | 625 +++++++++++++++++++++++++----------------------- 1 file changed, 329 insertions(+), 296 deletions(-) diff --git a/GL/attributes.c b/GL/attributes.c index 607e2f3..703d562 100644 --- a/GL/attributes.c +++ b/GL/attributes.c @@ -36,12 +36,333 @@ GLuint* _glGetEnabledAttributes() { } -static void _readVertexData3f3f(const GLubyte* __restrict__ in, GLubyte* __restrict__ out) { +static void _readPosition3f3f(const GLubyte* __restrict__ in, GLubyte* __restrict__ out) { vec3cpy(out, in); } +static void _readPosition3ub3f(const GLubyte* input, GLubyte* out) { + float* output = (float*) out; + + output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE; + output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE; + output[2] = input[2] * ONE_OVER_TWO_FIVE_FIVE; +} + +static void _readPosition3us3f(const GLubyte* in, GLubyte* out) { + const GLushort* input = (const GLushort*) in; + float* output = (float*) out; + + output[0] = input[0]; + output[1] = input[1]; + output[2] = input[2]; +} + +static void _readPosition3ui3f(const GLubyte* in, GLubyte* out) { + const GLuint* input = (const GLuint*) in; + float* output = (float*) out; + + output[0] = input[0]; + output[1] = input[1]; + output[2] = input[2]; +} + +static void _readPosition2f3f(const GLubyte* in, GLubyte* out) { + const float* input = (const float*) in; + float* output = (float*) out; + + vec2cpy(output, input); + output[2] = 0.0f; +} + +static void _readPosition2ub3f(const GLubyte* input, GLubyte* out) { + float* output = (float*) out; + + output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE; + output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE; + output[2] = 0.0f; +} + +static void _readPosition2us3f(const GLubyte* in, GLubyte* out) { + const GLushort* input = (const GLushort*) in; + float* output = (float*) out; + + output[0] = input[0]; + output[1] = input[1]; + output[2] = 0.0f; +} + +static void _readPosition2ui3f(const GLubyte* in, GLubyte* out) { + const GLuint* input = (const GLuint*) in; + float* output = (float*) out; + + output[0] = input[0]; + output[1] = input[1]; + output[2] = 0.0f; +} + +static ReadAttributeFunc calcReadPositionFunc() { + switch(ATTRIB_LIST.vertex.type) { + default: + case GL_DOUBLE: + case GL_FLOAT: + return (ATTRIB_LIST.vertex.size == 3) ? _readPosition3f3f: + _readPosition2f3f; + case GL_BYTE: + case GL_UNSIGNED_BYTE: + return (ATTRIB_LIST.vertex.size == 3) ? _readPosition3ub3f: + _readPosition2ub3f; + case GL_SHORT: + case GL_UNSIGNED_SHORT: + return (ATTRIB_LIST.vertex.size == 3) ? _readPosition3us3f: + _readPosition2us3f; + case GL_INT: + case GL_UNSIGNED_INT: + return (ATTRIB_LIST.vertex.size == 3) ? _readPosition3ui3f: + _readPosition2ui3f; + } +} + + +static void _fillWhiteARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) { + _GL_UNUSED(input); + *((uint32_t*) output) = ~0; +} + +static void _readColour4ubARGB(const GLubyte* input, GLubyte* output) { + output[R8IDX] = input[0]; + output[G8IDX] = input[1]; + output[B8IDX] = input[2]; + output[A8IDX] = input[3]; +} + +static void _readColour4fARGB(const GLubyte* in, GLubyte* output) { + const float* input = (const float*) in; + + output[R8IDX] = (GLubyte) clamp(input[0] * 255.0f, 0, 255); + output[G8IDX] = (GLubyte) clamp(input[1] * 255.0f, 0, 255); + output[B8IDX] = (GLubyte) clamp(input[2] * 255.0f, 0, 255); + output[A8IDX] = (GLubyte) clamp(input[3] * 255.0f, 0, 255); +} + +static void _readColour3fARGB(const GLubyte* in, GLubyte* output) { + const float* input = (const float*) in; + + output[R8IDX] = (GLubyte) clamp(input[0] * 255.0f, 0, 255); + output[G8IDX] = (GLubyte) clamp(input[1] * 255.0f, 0, 255); + output[B8IDX] = (GLubyte) clamp(input[2] * 255.0f, 0, 255); + output[A8IDX] = 255; +} + +static void _readColour3ubARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) { + output[R8IDX] = input[0]; + output[G8IDX] = input[1]; + output[B8IDX] = input[2]; + output[A8IDX] = 255; +} + +static void _readColour4ubRevARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) { + argbcpy(output, input); +} + +static void _readColour4fRevARGB(const GLubyte* __restrict__ in, GLubyte* __restrict__ output) { + const float* input = (const float*) in; + + output[0] = (GLubyte) clamp(input[0] * 255.0f, 0, 255); + output[1] = (GLubyte) clamp(input[1] * 255.0f, 0, 255); + output[2] = (GLubyte) clamp(input[2] * 255.0f, 0, 255); + output[3] = (GLubyte) clamp(input[3] * 255.0f, 0, 255); +} + +static void _readColour3usARGB(const GLubyte* input, GLubyte* output) { + _GL_UNUSED(input); + _GL_UNUSED(output); + gl_assert(0 && "Not Implemented"); +} + +static void _readColour3uiARGB(const GLubyte* input, GLubyte* output) { + _GL_UNUSED(input); + _GL_UNUSED(output); + gl_assert(0 && "Not Implemented"); +} + +static void _readColour4usARGB(const GLubyte* input, GLubyte* output) { + _GL_UNUSED(input); + _GL_UNUSED(output); + gl_assert(0 && "Not Implemented"); +} + +static void _readColour4uiARGB(const GLubyte* input, GLubyte* output) { + _GL_UNUSED(input); + _GL_UNUSED(output); + gl_assert(0 && "Not Implemented"); +} + +static void _readColour4usRevARGB(const GLubyte* input, GLubyte* output) { + _GL_UNUSED(input); + _GL_UNUSED(output); + gl_assert(0 && "Not Implemented"); +} + +static void _readColour4uiRevARGB(const GLubyte* input, GLubyte* output) { + _GL_UNUSED(input); + _GL_UNUSED(output); + gl_assert(0 && "Not Implemented"); +} + +static ReadAttributeFunc calcReadDiffuseFunc() { + if((ATTRIB_LIST.enabled & DIFFUSE_ENABLED_FLAG) != DIFFUSE_ENABLED_FLAG) { + /* Just fill the whole thing white if the attribute is disabled */ + return _fillWhiteARGB; + } + + switch(ATTRIB_LIST.colour.type) { + default: + case GL_DOUBLE: + case GL_FLOAT: + return (ATTRIB_LIST.colour.size == 3) ? _readColour3fARGB: + (ATTRIB_LIST.colour.size == 4) ? _readColour4fARGB: + _readColour4fRevARGB; + case GL_BYTE: + case GL_UNSIGNED_BYTE: + return (ATTRIB_LIST.colour.size == 3) ? _readColour3ubARGB: + (ATTRIB_LIST.colour.size == 4) ? _readColour4ubARGB: + _readColour4ubRevARGB; + case GL_SHORT: + case GL_UNSIGNED_SHORT: + return (ATTRIB_LIST.colour.size == 3) ? _readColour3usARGB: + (ATTRIB_LIST.colour.size == 4) ? _readColour4usARGB: + _readColour4usRevARGB; + case GL_INT: + case GL_UNSIGNED_INT: + return (ATTRIB_LIST.colour.size == 3) ? _readColour3uiARGB: + (ATTRIB_LIST.colour.size == 4) ? _readColour4uiARGB: + _readColour4uiRevARGB; + } +} + + +static void _fillZero2f(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) { + _GL_UNUSED(input); + //memset(out, 0, sizeof(float) * 2); + // memset does 8 byte writes - faster to manually write as uint32 + uint32_t* dst = (uint32_t*)out; + dst[0] = 0; + dst[1] = 0; +} + +static void _readTexcoord2f2f(const GLubyte* in, GLubyte* out) { + vec2cpy(out, in); +} + +static void _readTexcoord2ub2f(const GLubyte* input, GLubyte* out) { + float* output = (float*) out; + + output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE; + output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE; +} + +static void _readTexcoord2us2f(const GLubyte* in, GLubyte* out) { + const GLushort* input = (const GLushort*) in; + float* output = (float*) out; + + output[0] = (float)input[0] / SHRT_MAX; + output[1] = (float)input[1] / SHRT_MAX; +} + +static void _readTexcoord2ui2f(const GLubyte* in, GLubyte* out) { + const GLuint* input = (const GLuint*) in; + float* output = (float*) out; + + output[0] = input[0]; + output[1] = input[1]; +} + +static ReadAttributeFunc calcReadUVFunc() { + if((ATTRIB_LIST.enabled & UV_ENABLED_FLAG) != UV_ENABLED_FLAG) { + return _fillZero2f; + } + + switch(ATTRIB_LIST.uv.type) { + default: + case GL_DOUBLE: + case GL_FLOAT: + return _readTexcoord2f2f; + case GL_BYTE: + case GL_UNSIGNED_BYTE: + return _readTexcoord2ub2f; + case GL_SHORT: + case GL_UNSIGNED_SHORT: + return _readTexcoord2us2f; + case GL_INT: + case GL_UNSIGNED_INT: + return _readTexcoord2ui2f; + } +} + +static ReadAttributeFunc calcReadSTFunc() { + if((ATTRIB_LIST.enabled & ST_ENABLED_FLAG) != ST_ENABLED_FLAG) { + return _fillZero2f; + } + + switch(ATTRIB_LIST.st.type) { + default: + case GL_DOUBLE: + case GL_FLOAT: + return _readTexcoord2f2f; + case GL_BYTE: + case GL_UNSIGNED_BYTE: + return _readTexcoord2ub2f; + case GL_SHORT: + case GL_UNSIGNED_SHORT: + return _readTexcoord2us2f; + case GL_INT: + case GL_UNSIGNED_INT: + return _readTexcoord2ui2f; + } +} + + +static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) { + _GL_UNUSED(input); + typedef struct { float x, y, z; } V; + + static const V NegZ = {0.0f, 0.0f, -1.0f}; + + *((V*) out) = NegZ; +} + +static void _readNormal3f3f(const GLubyte* __restrict__ in, GLubyte* __restrict__ out) { + vec3cpy(out, in); +} + +static void _readNormal3ub3f(const GLubyte* input, GLubyte* out) { + float* output = (float*) out; + + output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE; + output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE; + output[2] = input[2] * ONE_OVER_TWO_FIVE_FIVE; +} + +static void _readNormal3us3f(const GLubyte* in, GLubyte* out) { + const GLushort* input = (const GLushort*) in; + float* output = (float*) out; + + output[0] = input[0]; + output[1] = input[1]; + output[2] = input[2]; +} + +static void _readNormal3ui3f(const GLubyte* in, GLubyte* out) { + const GLuint* input = (const GLuint*) in; + float* output = (float*) out; + + output[0] = input[0]; + output[1] = input[1]; + output[2] = input[2]; +} + // 10:10:10:2REV format -static void _readVertexData1i3f(const GLubyte* in, GLubyte* out) { +static void _readNormal1i3f(const GLubyte* in, GLubyte* out) { static const float MULTIPLIER = 1.0f / 1023.0f; GLfloat* output = (GLfloat*) out; @@ -63,295 +384,6 @@ static void _readVertexData1i3f(const GLubyte* in, GLubyte* out) { output[2] = (2.0f * (float) input.bits.z + 1.0f) * MULTIPLIER; } -static void _readVertexData3us3f(const GLubyte* in, GLubyte* out) { - const GLushort* input = (const GLushort*) in; - float* output = (float*) out; - - output[0] = input[0]; - output[1] = input[1]; - output[2] = input[2]; -} - -static void _readVertexData3ui3f(const GLubyte* in, GLubyte* out) { - const GLuint* input = (const GLuint*) in; - float* output = (float*) out; - - output[0] = input[0]; - output[1] = input[1]; - output[2] = input[2]; -} - - -static void _readVertexData3ub3f(const GLubyte* input, GLubyte* out) { - float* output = (float*) out; - - output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE; - output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE; - output[2] = input[2] * ONE_OVER_TWO_FIVE_FIVE; -} - -static void _readVertexData2f2f(const GLubyte* in, GLubyte* out) { - vec2cpy(out, in); -} - -static void _readVertexData2f3f(const GLubyte* in, GLubyte* out) { - const float* input = (const float*) in; - float* output = (float*) out; - - vec2cpy(output, input); - output[2] = 0.0f; -} - -static void _readVertexData2ub3f(const GLubyte* input, GLubyte* out) { - float* output = (float*) out; - - output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE; - output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE; - output[2] = 0.0f; -} - -static void _readVertexData2us3f(const GLubyte* in, GLubyte* out) { - const GLushort* input = (const GLushort*) in; - float* output = (float*) out; - - output[0] = input[0]; - output[1] = input[1]; - output[2] = 0.0f; -} - -static void _readVertexData2us2f(const GLubyte* in, GLubyte* out) { - const GLushort* input = (const GLushort*) in; - float* output = (float*) out; - - output[0] = (float)input[0] / SHRT_MAX; - output[1] = (float)input[1] / SHRT_MAX; -} - -static void _readVertexData2ui2f(const GLubyte* in, GLubyte* out) { - const GLuint* input = (const GLuint*) in; - float* output = (float*) out; - - output[0] = input[0]; - output[1] = input[1]; -} - -static void _readVertexData2ub2f(const GLubyte* input, GLubyte* out) { - float* output = (float*) out; - - output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE; - output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE; -} - -static void _readVertexData2ui3f(const GLubyte* in, GLubyte* out) { - const GLuint* input = (const GLuint*) in; - float* output = (float*) out; - - output[0] = input[0]; - output[1] = input[1]; - output[2] = 0.0f; -} - -static void _readVertexData4ubARGB(const GLubyte* input, GLubyte* output) { - output[R8IDX] = input[0]; - output[G8IDX] = input[1]; - output[B8IDX] = input[2]; - output[A8IDX] = input[3]; -} - -static void _readVertexData4fARGB(const GLubyte* in, GLubyte* output) { - const float* input = (const float*) in; - - output[R8IDX] = (GLubyte) clamp(input[0] * 255.0f, 0, 255); - output[G8IDX] = (GLubyte) clamp(input[1] * 255.0f, 0, 255); - output[B8IDX] = (GLubyte) clamp(input[2] * 255.0f, 0, 255); - output[A8IDX] = (GLubyte) clamp(input[3] * 255.0f, 0, 255); -} - -static void _readVertexData3fARGB(const GLubyte* in, GLubyte* output) { - const float* input = (const float*) in; - - output[R8IDX] = (GLubyte) clamp(input[0] * 255.0f, 0, 255); - output[G8IDX] = (GLubyte) clamp(input[1] * 255.0f, 0, 255); - output[B8IDX] = (GLubyte) clamp(input[2] * 255.0f, 0, 255); - output[A8IDX] = 255; -} - -static void _readVertexData3ubARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) { - output[R8IDX] = input[0]; - output[G8IDX] = input[1]; - output[B8IDX] = input[2]; - output[A8IDX] = 255; -} - -static void _readVertexData4ubRevARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) { - argbcpy(output, input); -} - -static void _readVertexData4fRevARGB(const GLubyte* __restrict__ in, GLubyte* __restrict__ output) { - const float* input = (const float*) in; - - output[0] = (GLubyte) clamp(input[0] * 255.0f, 0, 255); - output[1] = (GLubyte) clamp(input[1] * 255.0f, 0, 255); - output[2] = (GLubyte) clamp(input[2] * 255.0f, 0, 255); - output[3] = (GLubyte) clamp(input[3] * 255.0f, 0, 255); -} - -static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) { - _GL_UNUSED(input); - typedef struct { float x, y, z; } V; - - static const V NegZ = {0.0f, 0.0f, -1.0f}; - - *((V*) out) = NegZ; -} - -static void _fillWhiteARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) { - _GL_UNUSED(input); - *((uint32_t*) output) = ~0; -} - -static void _fillZero2f(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) { - _GL_UNUSED(input); - //memset(out, 0, sizeof(float) * 2); - // memset does 8 byte writes - faster to manually write as uint32 - uint32_t* dst = (uint32_t*)out; - dst[0] = 0; - dst[1] = 0; -} - -static void _readVertexData3usARGB(const GLubyte* input, GLubyte* output) { - _GL_UNUSED(input); - _GL_UNUSED(output); - gl_assert(0 && "Not Implemented"); -} - -static void _readVertexData3uiARGB(const GLubyte* input, GLubyte* output) { - _GL_UNUSED(input); - _GL_UNUSED(output); - gl_assert(0 && "Not Implemented"); -} - -static void _readVertexData4usARGB(const GLubyte* input, GLubyte* output) { - _GL_UNUSED(input); - _GL_UNUSED(output); - gl_assert(0 && "Not Implemented"); -} - -static void _readVertexData4uiARGB(const GLubyte* input, GLubyte* output) { - _GL_UNUSED(input); - _GL_UNUSED(output); - gl_assert(0 && "Not Implemented"); -} - -static void _readVertexData4usRevARGB(const GLubyte* input, GLubyte* output) { - _GL_UNUSED(input); - _GL_UNUSED(output); - gl_assert(0 && "Not Implemented"); -} - -static void _readVertexData4uiRevARGB(const GLubyte* input, GLubyte* output) { - _GL_UNUSED(input); - _GL_UNUSED(output); - gl_assert(0 && "Not Implemented"); -} - -static ReadAttributeFunc calcReadDiffuseFunc() { - if((ATTRIB_LIST.enabled & DIFFUSE_ENABLED_FLAG) != DIFFUSE_ENABLED_FLAG) { - /* Just fill the whole thing white if the attribute is disabled */ - return _fillWhiteARGB; - } - - switch(ATTRIB_LIST.colour.type) { - default: - case GL_DOUBLE: - case GL_FLOAT: - return (ATTRIB_LIST.colour.size == 3) ? _readVertexData3fARGB: - (ATTRIB_LIST.colour.size == 4) ? _readVertexData4fARGB: - _readVertexData4fRevARGB; - case GL_BYTE: - case GL_UNSIGNED_BYTE: - return (ATTRIB_LIST.colour.size == 3) ? _readVertexData3ubARGB: - (ATTRIB_LIST.colour.size == 4) ? _readVertexData4ubARGB: - _readVertexData4ubRevARGB; - case GL_SHORT: - case GL_UNSIGNED_SHORT: - return (ATTRIB_LIST.colour.size == 3) ? _readVertexData3usARGB: - (ATTRIB_LIST.colour.size == 4) ? _readVertexData4usARGB: - _readVertexData4usRevARGB; - case GL_INT: - case GL_UNSIGNED_INT: - return (ATTRIB_LIST.colour.size == 3) ? _readVertexData3uiARGB: - (ATTRIB_LIST.colour.size == 4) ? _readVertexData4uiARGB: - _readVertexData4uiRevARGB; - } -} - -static ReadAttributeFunc calcReadPositionFunc() { - switch(ATTRIB_LIST.vertex.type) { - default: - case GL_DOUBLE: - case GL_FLOAT: - return (ATTRIB_LIST.vertex.size == 3) ? _readVertexData3f3f: - _readVertexData2f3f; - case GL_BYTE: - case GL_UNSIGNED_BYTE: - return (ATTRIB_LIST.vertex.size == 3) ? _readVertexData3ub3f: - _readVertexData2ub3f; - case GL_SHORT: - case GL_UNSIGNED_SHORT: - return (ATTRIB_LIST.vertex.size == 3) ? _readVertexData3us3f: - _readVertexData2us3f; - case GL_INT: - case GL_UNSIGNED_INT: - return (ATTRIB_LIST.vertex.size == 3) ? _readVertexData3ui3f: - _readVertexData2ui3f; - } -} - -static ReadAttributeFunc calcReadUVFunc() { - if((ATTRIB_LIST.enabled & UV_ENABLED_FLAG) != UV_ENABLED_FLAG) { - return _fillZero2f; - } - - switch(ATTRIB_LIST.uv.type) { - default: - case GL_DOUBLE: - case GL_FLOAT: - return _readVertexData2f2f; - case GL_BYTE: - case GL_UNSIGNED_BYTE: - return _readVertexData2ub2f; - case GL_SHORT: - case GL_UNSIGNED_SHORT: - return _readVertexData2us2f; - case GL_INT: - case GL_UNSIGNED_INT: - return _readVertexData2ui2f; - } -} - -static ReadAttributeFunc calcReadSTFunc() { - if((ATTRIB_LIST.enabled & ST_ENABLED_FLAG) != ST_ENABLED_FLAG) { - return _fillZero2f; - } - - switch(ATTRIB_LIST.st.type) { - default: - case GL_DOUBLE: - case GL_FLOAT: - return _readVertexData2f2f; - case GL_BYTE: - case GL_UNSIGNED_BYTE: - return _readVertexData2ub2f; - case GL_SHORT: - case GL_UNSIGNED_SHORT: - return _readVertexData2us2f; - case GL_INT: - case GL_UNSIGNED_INT: - return _readVertexData2ui2f; - } -} - static ReadAttributeFunc calcReadNormalFunc() { if((ATTRIB_LIST.enabled & NORMAL_ENABLED_FLAG) != NORMAL_ENABLED_FLAG) { return _fillWithNegZVE; @@ -361,26 +393,27 @@ static ReadAttributeFunc calcReadNormalFunc() { default: case GL_DOUBLE: case GL_FLOAT: - return _readVertexData3f3f; + return _readNormal3f3f; break; case GL_BYTE: case GL_UNSIGNED_BYTE: - return _readVertexData3ub3f; + return _readNormal3ub3f; break; case GL_SHORT: case GL_UNSIGNED_SHORT: - return _readVertexData3us3f; + return _readNormal3us3f; break; case GL_INT: case GL_UNSIGNED_INT: - return _readVertexData3ui3f; + return _readNormal3ui3f; break; case GL_UNSIGNED_INT_2_10_10_10_REV: - return _readVertexData1i3f; + return _readNormal1i3f; break; } } + void APIENTRY glEnableClientState(GLenum cap) { TRACE(); From 3b2e5499347aa9e92d8431d01229b33443e33734 Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Sun, 2 Feb 2025 14:36:40 +1100 Subject: [PATCH 2/3] Optimise vertex transform in non-fast path to avoid storing xyz to memory and then loading it again --- GL/attributes.c | 78 ++++++++++++++++++++++++++--------------- GL/draw.c | 12 ++----- GL/draw_fastpath.inc | 3 +- GL/platforms/sh4.h | 13 ++++--- GL/platforms/software.c | 10 +++--- GL/platforms/software.h | 2 +- 6 files changed, 66 insertions(+), 52 deletions(-) diff --git a/GL/attributes.c b/GL/attributes.c index 703d562..f12f1ed 100644 --- a/GL/attributes.c +++ b/GL/attributes.c @@ -37,67 +37,89 @@ GLuint* _glGetEnabledAttributes() { static void _readPosition3f3f(const GLubyte* __restrict__ in, GLubyte* __restrict__ out) { - vec3cpy(out, in); + const float* input = (const float*) in; + Vertex* it = (Vertex*) out; + + float x = input[0]; + float y = input[1]; + float z = input[2]; + float w = 1.0f; + TransformVertex(x, y, z, w, it->xyz, &it->w); } static void _readPosition3ub3f(const GLubyte* input, GLubyte* out) { - float* output = (float*) out; + Vertex* it = (Vertex*)out; - output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE; - output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE; - output[2] = input[2] * ONE_OVER_TWO_FIVE_FIVE; + float x = input[0] * ONE_OVER_TWO_FIVE_FIVE; + float y = input[1] * ONE_OVER_TWO_FIVE_FIVE; + float z = input[2] * ONE_OVER_TWO_FIVE_FIVE; + float w = 1.0f; + TransformVertex(x, y, z, w, it->xyz, &it->w); } static void _readPosition3us3f(const GLubyte* in, GLubyte* out) { const GLushort* input = (const GLushort*) in; - float* output = (float*) out; + Vertex* it = (Vertex*) out; - output[0] = input[0]; - output[1] = input[1]; - output[2] = input[2]; + float x = input[0]; + float y = input[1]; + float z = input[2]; + float w = 1.0f; + TransformVertex(x, y, z, w, it->xyz, &it->w); } static void _readPosition3ui3f(const GLubyte* in, GLubyte* out) { const GLuint* input = (const GLuint*) in; - float* output = (float*) out; + Vertex* it = (Vertex*) out; - output[0] = input[0]; - output[1] = input[1]; - output[2] = input[2]; + float x = input[0]; + float y = input[1]; + float z = input[2]; + float w = 1.0f; + TransformVertex(x, y, z, w, it->xyz, &it->w); } static void _readPosition2f3f(const GLubyte* in, GLubyte* out) { const float* input = (const float*) in; - float* output = (float*) out; + Vertex* it = (Vertex*) out; - vec2cpy(output, input); - output[2] = 0.0f; + float x = input[0]; + float y = input[1]; + float z = 0.0f; + float w = 1.0f; + TransformVertex(x, y, z, w, it->xyz, &it->w); } static void _readPosition2ub3f(const GLubyte* input, GLubyte* out) { - float* output = (float*) out; + Vertex* it = (Vertex*) out; - output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE; - output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE; - output[2] = 0.0f; + float x = input[0] * ONE_OVER_TWO_FIVE_FIVE; + float y = input[1] * ONE_OVER_TWO_FIVE_FIVE; + float z = 0.0f; + float w = 1.0f; + TransformVertex(x, y, z, w, it->xyz, &it->w); } static void _readPosition2us3f(const GLubyte* in, GLubyte* out) { const GLushort* input = (const GLushort*) in; - float* output = (float*) out; + Vertex* it = (Vertex*) out; - output[0] = input[0]; - output[1] = input[1]; - output[2] = 0.0f; + float x = input[0]; + float y = input[1]; + float z = 0.0f; + float w = 1.0f; + TransformVertex(x, y, z, w, it->xyz, &it->w); } static void _readPosition2ui3f(const GLubyte* in, GLubyte* out) { const GLuint* input = (const GLuint*) in; - float* output = (float*) out; + Vertex* it = (Vertex*)out; - output[0] = input[0]; - output[1] = input[1]; - output[2] = 0.0f; + float x = input[0]; + float y = input[1]; + float z = 0.0f; + float w = 1.0f; + TransformVertex(x, y, z, w, it->xyz, &it->w); } static ReadAttributeFunc calcReadPositionFunc() { diff --git a/GL/draw.c b/GL/draw.c index 98b5cfd..b6e5086 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -289,13 +289,10 @@ static void _readPositionData(const GLuint first, const GLuint count, Vertex* it const GLsizei vstride = ATTRIB_LIST.vertex.stride; const GLubyte* vptr = ((GLubyte*) ATTRIB_LIST.vertex.ptr + (first * vstride)); - float pos[3], w = 1.0f; - ITERATE(count) { PREFETCH(vptr + vstride); - func(vptr, (GLubyte*) pos); + func(vptr, (GLubyte*) it); it->flags = GPU_CMD_VERTEX; - TransformVertex(pos, &w, it->xyz, &it->w); vptr += vstride; ++it; @@ -411,8 +408,7 @@ static void generateElements( st = (GLubyte*) ATTRIB_LIST.st.ptr + (idx * ststride); nxyz = (GLubyte*) ATTRIB_LIST.normal.ptr + (idx * nstride); - pos_func(xyz, (GLubyte*) pos); - TransformVertex((const float*) pos, &w, output->xyz, &output->w); + pos_func(xyz, (GLubyte*) output); uv_func(uv, (GLubyte*) output->uv); diffuse_func(bgra, output->bgra); st_func(st, (GLubyte*) ve->st); @@ -460,8 +456,6 @@ static void generateElementsFastPath( VertexExtra* ve = aligned_vector_at(target->extras, 0); Vertex* it = start; - const float w = 1.0f; - if(!pos) { return; } @@ -472,7 +466,7 @@ static void generateElementsFastPath( it->flags = GPU_CMD_VERTEX; pos = (GLubyte*) ATTRIB_LIST.vertex.ptr + (idx * vstride); - TransformVertex((const float*) pos, &w, it->xyz, &it->w); + TransformVertex(((float*) pos)[0], ((float*) pos)[1], ((float*) pos)[2], 1.0f, it->xyz, &it->w); if(uv) { uv = (GLubyte*) ATTRIB_LIST.uv.ptr + (idx * uvstride); diff --git a/GL/draw_fastpath.inc b/GL/draw_fastpath.inc index b26e6a9..fe3b527 100644 --- a/GL/draw_fastpath.inc +++ b/GL/draw_fastpath.inc @@ -5,7 +5,6 @@ MAKE_FUNC(POLYMODE) { - static const float w = 1.0f; if(!(ATTRIB_LIST.enabled & VERTEX_ENABLED_FLAG)) { /* If we don't have vertices, do nothing */ return; @@ -75,7 +74,7 @@ MAKE_FUNC(POLYMODE) PREFETCH(ptr); for(int_fast32_t i = 0; i < loop; ++i, ++it) { PREFETCH(ptr + stride); - TransformVertex((const float*) ptr, &w, it->xyz, &it->w); + TransformVertex(((float*) ptr)[0], ((float*) ptr)[1], ((float*) ptr)[2], 1.0f, it->xyz, &it->w); PROCESS_VERTEX_FLAGS(it, min + i); ptr += stride; } diff --git a/GL/platforms/sh4.h b/GL/platforms/sh4.h index 5cbcc08..6b9bad0 100644 --- a/GL/platforms/sh4.h +++ b/GL/platforms/sh4.h @@ -106,15 +106,14 @@ inline void TransformVec4(float* x) { } -GL_FORCE_INLINE void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) { - register float __x __asm__("fr12") = (xyz[0]); - register float __y __asm__("fr13") = (xyz[1]); - register float __z __asm__("fr14") = (xyz[2]); - register float __w __asm__("fr15") = (*w); +GL_FORCE_INLINE void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow) { + register float __x __asm__("fr4") = x; + register float __y __asm__("fr5") = y; + register float __z __asm__("fr6") = z; + register float __w __asm__("fr7") = w; __asm__ __volatile__( - "fldi1 fr15\n" - "ftrv xmtrx,fv12\n" + "ftrv xmtrx,fv4\n" : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) : "0" (__x), "1" (__y), "2" (__z), "3" (__w) ); diff --git a/GL/platforms/software.c b/GL/platforms/software.c index bf9ba47..9a2b4e5 100644 --- a/GL/platforms/software.c +++ b/GL/platforms/software.c @@ -636,12 +636,12 @@ void TransformVertices(Vertex* vertices, const int count) { } } -void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) { +void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow) { float ret[4]; - ret[0] = xyz[0]; - ret[1] = xyz[1]; - ret[2] = xyz[2]; - ret[3] = *w; + ret[0] = x; + ret[1] = y; + ret[2] = z; + ret[3] = w; TransformVec4(ret); diff --git a/GL/platforms/software.h b/GL/platforms/software.h index 5fee317..cc89f97 100644 --- a/GL/platforms/software.h +++ b/GL/platforms/software.h @@ -53,7 +53,7 @@ static inline void TransformNormalNoMod(const float* xIn, float* xOut) { } void TransformVertices(Vertex* vertices, const int count); -void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow); +void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow); void InitGPU(_Bool autosort, _Bool fsaa); From a566bba082d96ad5bd64615b77450c8b4e7e79eb Mon Sep 17 00:00:00 2001 From: UnknownShadow200 Date: Sun, 2 Feb 2025 15:58:19 +1100 Subject: [PATCH 3/3] Call TransformVertex directly instead of using TransformVertices --- GL/draw.c | 8 ++++++-- GL/platforms/sh4.h | 22 ---------------------- GL/platforms/software.c | 31 +++++++------------------------ GL/platforms/software.h | 1 - 4 files changed, 13 insertions(+), 49 deletions(-) diff --git a/GL/draw.c b/GL/draw.c index b6e5086..a8dade5 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -615,9 +615,13 @@ static void transform(SubmissionTarget* target) { TRACE(); /* Perform modelview transform, storing W */ - Vertex* vertex = _glSubmissionTargetStart(target); + Vertex* it = _glSubmissionTargetStart(target); + int count = target->count; - TransformVertices(vertex, target->count); + for(int i = 0; i < count; ++i, ++it) { + TransformVertex(it->xyz[0], it->xyz[1], it->xyz[2], it->w, + it->xyz, &it->w); + } } static void mat_transform_normal3(const float* xyz, const float* xyzOut, const uint32_t count, const uint32_t inStride, const uint32_t outStride) { diff --git a/GL/platforms/sh4.h b/GL/platforms/sh4.h index 6b9bad0..1139283 100644 --- a/GL/platforms/sh4.h +++ b/GL/platforms/sh4.h @@ -124,28 +124,6 @@ GL_FORCE_INLINE void TransformVertex(float x, float y, float z, float w, float* *ow = __w; } -static inline void TransformVertices(Vertex* vertices, const int count) { - Vertex* it = vertices; - for(int i = 0; i < count; ++i, ++it) { - register float __x __asm__("fr12") = (it->xyz[0]); - register float __y __asm__("fr13") = (it->xyz[1]); - register float __z __asm__("fr14") = (it->xyz[2]); - register float __w __asm__("fr15") = (it->w); - - __asm__ __volatile__( - "fldi1 fr15\n" - "ftrv xmtrx,fv12\n" - : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) - : "0" (__x), "1" (__y), "2" (__z), "3" (__w) - ); - - it->xyz[0] = __x; - it->xyz[1] = __y; - it->xyz[2] = __z; - it->w = __w; - } -} - void InitGPU(_Bool autosort, _Bool fsaa); static inline size_t GPUMemoryAvailable() { diff --git a/GL/platforms/software.c b/GL/platforms/software.c index 9a2b4e5..57940b6 100644 --- a/GL/platforms/software.c +++ b/GL/platforms/software.c @@ -619,34 +619,17 @@ void TransformVec4(float* v) { FASTCPY(v, ret, sizeof(float) * 4); } -void TransformVertices(Vertex* vertices, const int count) { - float ret[4]; - for(int i = 0; i < count; ++i, ++vertices) { - ret[0] = vertices->xyz[0]; - ret[1] = vertices->xyz[1]; - ret[2] = vertices->xyz[2]; - ret[3] = 1.0f; - - TransformVec4(ret); - - vertices->xyz[0] = ret[0]; - vertices->xyz[1] = ret[1]; - vertices->xyz[2] = ret[2]; - vertices->w = ret[3]; - } -} - void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow) { - float ret[4]; - ret[0] = x; - ret[1] = y; - ret[2] = z; - ret[3] = w; + float vec[4], ret[4]; + vec[0] = x; + vec[1] = y; + vec[2] = z; + vec[3] = w; - TransformVec4(ret); + TransformVec4NoMod(vec, ret); oxyz[0] = ret[0]; oxyz[1] = ret[1]; oxyz[2] = ret[2]; - *ow = ret[3]; + *ow = ret[3]; } diff --git a/GL/platforms/software.h b/GL/platforms/software.h index cc89f97..744a9bc 100644 --- a/GL/platforms/software.h +++ b/GL/platforms/software.h @@ -52,7 +52,6 @@ static inline void TransformNormalNoMod(const float* xIn, float* xOut) { (void) xOut; } -void TransformVertices(Vertex* vertices, const int count); void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow); void InitGPU(_Bool autosort, _Bool fsaa);