Merge branch 'quicker_transform' into 'master'

Quicker vertex transform in non-fast path

See merge request simulant/GLdc!144
This commit is contained in:
Luke Benstead 2025-02-02 08:34:36 +00:00
commit acee4b5c37
6 changed files with 376 additions and 365 deletions

View File

@ -36,12 +36,355 @@ GLuint* _glGetEnabledAttributes() {
}
static void _readVertexData3f3f(const GLubyte* __restrict__ in, GLubyte* __restrict__ out) {
/* Reads three floats (x, y, z) from `in`, passes them with w = 1.0 through
 * TransformVertex and stores the result in the Vertex that `out` points at
 * (its xyz and w members). */
static void _readPosition3f3f(const GLubyte* __restrict__ in, GLubyte* __restrict__ out) {
    const float* src = (const float*) in;
    Vertex* vertex = (Vertex*) out;

    TransformVertex(src[0], src[1], src[2], 1.0f, vertex->xyz, &vertex->w);
}
/* Reads three bytes from `input`, scales each by ONE_OVER_TWO_FIVE_FIVE,
 * and writes the TransformVertex result (with w = 1.0) into the Vertex at
 * `out`. */
static void _readPosition3ub3f(const GLubyte* input, GLubyte* out) {
    Vertex* vertex = (Vertex*) out;

    const float px = input[0] * ONE_OVER_TWO_FIVE_FIVE;
    const float py = input[1] * ONE_OVER_TWO_FIVE_FIVE;
    const float pz = input[2] * ONE_OVER_TWO_FIVE_FIVE;

    TransformVertex(px, py, pz, 1.0f, vertex->xyz, &vertex->w);
}
/* Reads three GLushort values from `in`, converts them to float unscaled,
 * and writes the TransformVertex result (with w = 1.0) into the Vertex at
 * `out`. */
static void _readPosition3us3f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    Vertex* vertex = (Vertex*) out;

    const float px = src[0];
    const float py = src[1];
    const float pz = src[2];

    TransformVertex(px, py, pz, 1.0f, vertex->xyz, &vertex->w);
}
/* Reads three GLuint values from `in`, converts them to float unscaled,
 * and writes the TransformVertex result (with w = 1.0) into the Vertex at
 * `out`. */
static void _readPosition3ui3f(const GLubyte* in, GLubyte* out) {
    const GLuint* src = (const GLuint*) in;
    Vertex* vertex = (Vertex*) out;

    const float px = src[0];
    const float py = src[1];
    const float pz = src[2];

    TransformVertex(px, py, pz, 1.0f, vertex->xyz, &vertex->w);
}
/* Reads two floats (x, y) from `in`, supplies z = 0.0 and w = 1.0, and
 * writes the TransformVertex result into the Vertex at `out`. */
static void _readPosition2f3f(const GLubyte* in, GLubyte* out) {
    const float* src = (const float*) in;
    Vertex* vertex = (Vertex*) out;

    TransformVertex(src[0], src[1], 0.0f, 1.0f, vertex->xyz, &vertex->w);
}
/* Reads two bytes from `input`, scales each by ONE_OVER_TWO_FIVE_FIVE,
 * supplies z = 0.0 and w = 1.0, and writes the TransformVertex result into
 * the Vertex at `out`. */
static void _readPosition2ub3f(const GLubyte* input, GLubyte* out) {
    Vertex* vertex = (Vertex*) out;

    const float px = input[0] * ONE_OVER_TWO_FIVE_FIVE;
    const float py = input[1] * ONE_OVER_TWO_FIVE_FIVE;

    TransformVertex(px, py, 0.0f, 1.0f, vertex->xyz, &vertex->w);
}
/* Reads two GLushort values from `in` (converted to float unscaled),
 * supplies z = 0.0 and w = 1.0, and writes the TransformVertex result into
 * the Vertex at `out`. */
static void _readPosition2us3f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    Vertex* vertex = (Vertex*) out;

    const float px = src[0];
    const float py = src[1];

    TransformVertex(px, py, 0.0f, 1.0f, vertex->xyz, &vertex->w);
}
/* Reads two GLuint values from `in` (converted to float unscaled),
 * supplies z = 0.0 and w = 1.0, and writes the TransformVertex result into
 * the Vertex at `out`. */
static void _readPosition2ui3f(const GLubyte* in, GLubyte* out) {
    const GLuint* src = (const GLuint*) in;
    Vertex* vertex = (Vertex*) out;

    const float px = src[0];
    const float py = src[1];

    TransformVertex(px, py, 0.0f, 1.0f, vertex->xyz, &vertex->w);
}
/* Picks the position reader matching ATTRIB_LIST.vertex.type and size.
 * A size of 3 selects the three-component reader; any other size selects
 * the two-component reader. Signed and unsigned variants of a type share
 * one reader, and GL_DOUBLE / unknown types use the float readers. */
static ReadAttributeFunc calcReadPositionFunc() {
    switch(ATTRIB_LIST.vertex.type) {
        case GL_BYTE:
        case GL_UNSIGNED_BYTE:
            if(ATTRIB_LIST.vertex.size == 3) {
                return _readPosition3ub3f;
            }
            return _readPosition2ub3f;
        case GL_SHORT:
        case GL_UNSIGNED_SHORT:
            if(ATTRIB_LIST.vertex.size == 3) {
                return _readPosition3us3f;
            }
            return _readPosition2us3f;
        case GL_INT:
        case GL_UNSIGNED_INT:
            if(ATTRIB_LIST.vertex.size == 3) {
                return _readPosition3ui3f;
            }
            return _readPosition2ui3f;
        case GL_DOUBLE:
        case GL_FLOAT:
        default:
            if(ATTRIB_LIST.vertex.size == 3) {
                return _readPosition3f3f;
            }
            return _readPosition2f3f;
    }
}
/* Ignores `input` and sets all 32 bits of the colour at `output`,
 * i.e. every 8-bit channel becomes 255. */
static void _fillWhiteARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) {
    uint32_t* colour = (uint32_t*) output;

    _GL_UNUSED(input);
    *colour = ~0;
}
/* Copies a 4-byte colour from `input` (components 0..3 taken as R, G, B, A)
 * into `output` at the channel positions given by R8IDX/G8IDX/B8IDX/A8IDX. */
static void _readColour4ubARGB(const GLubyte* input, GLubyte* output) {
    /* Destination slot for each source component, in source order. */
    const int slot[4] = {R8IDX, G8IDX, B8IDX, A8IDX};

    for(int i = 0; i < 4; ++i) {
        output[slot[i]] = input[i];
    }
}
/* Converts four float colour components at `in` to bytes: each is scaled
 * by 255, clamped to [0, 255] and stored at R8IDX/G8IDX/B8IDX/A8IDX of
 * `output`. */
static void _readColour4fARGB(const GLubyte* in, GLubyte* output) {
    const float* rgba = (const float*) in;

    output[R8IDX] = (GLubyte) clamp(rgba[0] * 255.0f, 0, 255);
    output[G8IDX] = (GLubyte) clamp(rgba[1] * 255.0f, 0, 255);
    output[B8IDX] = (GLubyte) clamp(rgba[2] * 255.0f, 0, 255);
    output[A8IDX] = (GLubyte) clamp(rgba[3] * 255.0f, 0, 255);
}
/* Converts three float colour components at `in` to bytes (scaled by 255
 * and clamped to [0, 255]) and stores them at R8IDX/G8IDX/B8IDX of
 * `output`; alpha is forced to 255. */
static void _readColour3fARGB(const GLubyte* in, GLubyte* output) {
    const float* rgb = (const float*) in;

    output[R8IDX] = (GLubyte) clamp(rgb[0] * 255.0f, 0, 255);
    output[G8IDX] = (GLubyte) clamp(rgb[1] * 255.0f, 0, 255);
    output[B8IDX] = (GLubyte) clamp(rgb[2] * 255.0f, 0, 255);
    output[A8IDX] = 255;
}
/* Copies a 3-byte colour from `input` into R8IDX/G8IDX/B8IDX of `output`
 * and sets the alpha channel to 255. */
static void _readColour3ubARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) {
    const int slot[3] = {R8IDX, G8IDX, B8IDX};

    for(int i = 0; i < 3; ++i) {
        output[slot[i]] = input[i];
    }

    output[A8IDX] = 255;
}
/* Reads a 4-byte colour whose source layout needs no channel reordering:
 * the value is handed straight to argbcpy(output, input).
 * NOTE(review): presumably the "Rev" source order already matches the
 * destination order - confirm against argbcpy's definition. */
static void _readColour4ubRevARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) {
argbcpy(output, input);
}
/* Converts four float components at `in` to bytes (scaled by 255 and
 * clamped to [0, 255]), keeping the source component order: component i
 * lands in output[i]. */
static void _readColour4fRevARGB(const GLubyte* __restrict__ in, GLubyte* __restrict__ output) {
    const float* src = (const float*) in;

    for(int i = 0; i < 4; ++i) {
        output[i] = (GLubyte) clamp(src[i] * 255.0f, 0, 255);
    }
}
/* Placeholder for 3-component 16-bit colours: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readColour3usARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Placeholder for 3-component 32-bit colours: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readColour3uiARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Placeholder for 4-component 16-bit colours: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readColour4usARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Placeholder for 4-component 32-bit colours: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readColour4uiARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Placeholder for the "Rev" 16-bit colour layout: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readColour4usRevARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Placeholder for the "Rev" 32-bit colour layout: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readColour4uiRevARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Picks the colour reader for the current diffuse attribute.
 * - Attribute disabled: a filler that writes opaque white.
 * - Otherwise chosen by ATTRIB_LIST.colour.type, then by size: 3 selects
 *   the RGB reader, 4 the RGBA reader, and any other size the "Rev" reader.
 * Signed/unsigned variants share a reader; GL_DOUBLE and unknown types use
 * the float readers. */
static ReadAttributeFunc calcReadDiffuseFunc() {
    if((ATTRIB_LIST.enabled & DIFFUSE_ENABLED_FLAG) != DIFFUSE_ENABLED_FLAG) {
        /* No colour array bound, so every vertex is white */
        return _fillWhiteARGB;
    }

    switch(ATTRIB_LIST.colour.type) {
        case GL_BYTE:
        case GL_UNSIGNED_BYTE:
            if(ATTRIB_LIST.colour.size == 3) {
                return _readColour3ubARGB;
            }
            if(ATTRIB_LIST.colour.size == 4) {
                return _readColour4ubARGB;
            }
            return _readColour4ubRevARGB;
        case GL_SHORT:
        case GL_UNSIGNED_SHORT:
            if(ATTRIB_LIST.colour.size == 3) {
                return _readColour3usARGB;
            }
            if(ATTRIB_LIST.colour.size == 4) {
                return _readColour4usARGB;
            }
            return _readColour4usRevARGB;
        case GL_INT:
        case GL_UNSIGNED_INT:
            if(ATTRIB_LIST.colour.size == 3) {
                return _readColour3uiARGB;
            }
            if(ATTRIB_LIST.colour.size == 4) {
                return _readColour4uiARGB;
            }
            return _readColour4uiRevARGB;
        case GL_DOUBLE:
        case GL_FLOAT:
        default:
            if(ATTRIB_LIST.colour.size == 3) {
                return _readColour3fARGB;
            }
            if(ATTRIB_LIST.colour.size == 4) {
                return _readColour4fARGB;
            }
            return _readColour4fRevARGB;
    }
}
/* Ignores `input` and zeroes the two 32-bit words at `out` (a pair of
 * floats set to 0.0). */
static void _fillZero2f(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) {
    /* Two explicit 32-bit stores are used instead of memset, which does
     * 8 byte writes and is slower here. */
    uint32_t* words = (uint32_t*) out;

    _GL_UNUSED(input);

    words[0] = 0;
    words[1] = 0;
}
/* Reads a two-float texture coordinate: forwards to vec2cpy(out, in),
 * which presumably copies two floats from `in` to `out` unchanged -
 * confirm against vec2cpy's definition. */
static void _readTexcoord2f2f(const GLubyte* in, GLubyte* out) {
vec2cpy(out, in);
}
/* Converts a two-byte texture coordinate to floats, scaling each byte by
 * ONE_OVER_TWO_FIVE_FIVE. */
static void _readTexcoord2ub2f(const GLubyte* input, GLubyte* out) {
    float* uv = (float*) out;

    for(int i = 0; i < 2; ++i) {
        uv[i] = input[i] * ONE_OVER_TWO_FIVE_FIVE;
    }
}
/* Converts two GLushort texture coordinates to floats by dividing each by
 * SHRT_MAX.
 * NOTE(review): the source is read as unsigned but scaled by the signed
 * maximum, so values above SHRT_MAX map beyond 1.0 - confirm intended. */
static void _readTexcoord2us2f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    float* uv = (float*) out;

    for(int i = 0; i < 2; ++i) {
        uv[i] = (float)src[i] / SHRT_MAX;
    }
}
/* Converts two GLuint texture coordinates to floats without any scaling. */
static void _readTexcoord2ui2f(const GLubyte* in, GLubyte* out) {
    const GLuint* src = (const GLuint*) in;
    float* uv = (float*) out;

    for(int i = 0; i < 2; ++i) {
        uv[i] = src[i];
    }
}
/* Picks the reader for the primary texture coordinate array based on
 * ATTRIB_LIST.uv.type, or a zero-filler when the UV attribute is disabled.
 * Signed/unsigned variants share a reader; GL_DOUBLE and unknown types use
 * the float reader. */
static ReadAttributeFunc calcReadUVFunc() {
    const int enabled = (ATTRIB_LIST.enabled & UV_ENABLED_FLAG) == UV_ENABLED_FLAG;

    if(!enabled) {
        return _fillZero2f;
    }

    switch(ATTRIB_LIST.uv.type) {
        case GL_BYTE:
        case GL_UNSIGNED_BYTE:
            return _readTexcoord2ub2f;
        case GL_SHORT:
        case GL_UNSIGNED_SHORT:
            return _readTexcoord2us2f;
        case GL_INT:
        case GL_UNSIGNED_INT:
            return _readTexcoord2ui2f;
        case GL_DOUBLE:
        case GL_FLOAT:
        default:
            return _readTexcoord2f2f;
    }
}
/* Picks the reader for the ST texture coordinate array based on
 * ATTRIB_LIST.st.type, or a zero-filler when the ST attribute is disabled.
 * Signed/unsigned variants share a reader; GL_DOUBLE and unknown types use
 * the float reader. */
static ReadAttributeFunc calcReadSTFunc() {
    const int enabled = (ATTRIB_LIST.enabled & ST_ENABLED_FLAG) == ST_ENABLED_FLAG;

    if(!enabled) {
        return _fillZero2f;
    }

    switch(ATTRIB_LIST.st.type) {
        case GL_BYTE:
        case GL_UNSIGNED_BYTE:
            return _readTexcoord2ub2f;
        case GL_SHORT:
        case GL_UNSIGNED_SHORT:
            return _readTexcoord2us2f;
        case GL_INT:
        case GL_UNSIGNED_INT:
            return _readTexcoord2ui2f;
        case GL_DOUBLE:
        case GL_FLOAT:
        default:
            return _readTexcoord2f2f;
    }
}
/* Ignores `input` and writes the fixed vector (0, 0, -1) as three floats
 * at `out`. */
static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) {
    float* normal = (float*) out;

    _GL_UNUSED(input);

    normal[0] = 0.0f;
    normal[1] = 0.0f;
    normal[2] = -1.0f;
}
/* Reads a three-float normal: forwards to vec3cpy(out, in), which
 * presumably copies three floats from `in` to `out` unchanged - confirm
 * against vec3cpy's definition. No transform is applied here. */
static void _readNormal3f3f(const GLubyte* __restrict__ in, GLubyte* __restrict__ out) {
vec3cpy(out, in);
}
/* Converts a three-byte normal to floats, scaling each byte by
 * ONE_OVER_TWO_FIVE_FIVE. */
static void _readNormal3ub3f(const GLubyte* input, GLubyte* out) {
    float* normal = (float*) out;

    for(int i = 0; i < 3; ++i) {
        normal[i] = input[i] * ONE_OVER_TWO_FIVE_FIVE;
    }
}
/* Converts three GLushort normal components to floats without scaling. */
static void _readNormal3us3f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    float* normal = (float*) out;

    for(int i = 0; i < 3; ++i) {
        normal[i] = src[i];
    }
}
/* Converts three GLuint normal components to floats without scaling. */
static void _readNormal3ui3f(const GLubyte* in, GLubyte* out) {
    const GLuint* src = (const GLuint*) in;
    float* normal = (float*) out;

    for(int i = 0; i < 3; ++i) {
        normal[i] = src[i];
    }
}
// 10:10:10:2REV format
static void _readVertexData1i3f(const GLubyte* in, GLubyte* out) {
static void _readNormal1i3f(const GLubyte* in, GLubyte* out) {
static const float MULTIPLIER = 1.0f / 1023.0f;
GLfloat* output = (GLfloat*) out;
@ -63,295 +406,6 @@ static void _readVertexData1i3f(const GLubyte* in, GLubyte* out) {
output[2] = (2.0f * (float) input.bits.z + 1.0f) * MULTIPLIER;
}
/* Converts three GLushort components at `in` to floats at `out` without
 * scaling. */
static void _readVertexData3us3f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    float* dst = (float*) out;

    for(int i = 0; i < 3; ++i) {
        dst[i] = src[i];
    }
}
/* Converts three GLuint components at `in` to floats at `out` without
 * scaling. */
static void _readVertexData3ui3f(const GLubyte* in, GLubyte* out) {
    const GLuint* src = (const GLuint*) in;
    float* dst = (float*) out;

    for(int i = 0; i < 3; ++i) {
        dst[i] = src[i];
    }
}
/* Converts three bytes at `input` to floats at `out`, scaling each by
 * ONE_OVER_TWO_FIVE_FIVE. */
static void _readVertexData3ub3f(const GLubyte* input, GLubyte* out) {
    float* dst = (float*) out;

    for(int i = 0; i < 3; ++i) {
        dst[i] = input[i] * ONE_OVER_TWO_FIVE_FIVE;
    }
}
/* Reads a two-float attribute: forwards to vec2cpy(out, in), which
 * presumably copies two floats from `in` to `out` unchanged - confirm
 * against vec2cpy's definition. */
static void _readVertexData2f2f(const GLubyte* in, GLubyte* out) {
vec2cpy(out, in);
}
/* Expands a two-float attribute to three floats: the first two components
 * are copied with vec2cpy and the third is set to 0.0. */
static void _readVertexData2f3f(const GLubyte* in, GLubyte* out) {
    const float* src = (const float*) in;
    float* dst = (float*) out;

    vec2cpy(dst, src);
    dst[2] = 0.0f;
}
/* Expands a two-byte attribute to three floats: each byte is scaled by
 * ONE_OVER_TWO_FIVE_FIVE and the third component is set to 0.0. */
static void _readVertexData2ub3f(const GLubyte* input, GLubyte* out) {
    float* dst = (float*) out;

    for(int i = 0; i < 2; ++i) {
        dst[i] = input[i] * ONE_OVER_TWO_FIVE_FIVE;
    }

    dst[2] = 0.0f;
}
/* Expands two GLushort components to three floats: the values are
 * converted unscaled and the third component is set to 0.0. */
static void _readVertexData2us3f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    float* dst = (float*) out;

    for(int i = 0; i < 2; ++i) {
        dst[i] = src[i];
    }

    dst[2] = 0.0f;
}
/* Converts two GLushort components to floats by dividing each by SHRT_MAX.
 * NOTE(review): the source is read as unsigned but scaled by the signed
 * maximum, so values above SHRT_MAX map beyond 1.0 - confirm intended. */
static void _readVertexData2us2f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    float* dst = (float*) out;

    for(int i = 0; i < 2; ++i) {
        dst[i] = (float)src[i] / SHRT_MAX;
    }
}
/* Converts two GLuint components at `in` to floats at `out` without
 * scaling. */
static void _readVertexData2ui2f(const GLubyte* in, GLubyte* out) {
    const GLuint* src = (const GLuint*) in;
    float* dst = (float*) out;

    for(int i = 0; i < 2; ++i) {
        dst[i] = src[i];
    }
}
/* Converts two bytes at `input` to floats at `out`, scaling each by
 * ONE_OVER_TWO_FIVE_FIVE. */
static void _readVertexData2ub2f(const GLubyte* input, GLubyte* out) {
    float* dst = (float*) out;

    for(int i = 0; i < 2; ++i) {
        dst[i] = input[i] * ONE_OVER_TWO_FIVE_FIVE;
    }
}
/* Expands two GLuint components to three floats: the values are converted
 * unscaled and the third component is set to 0.0. */
static void _readVertexData2ui3f(const GLubyte* in, GLubyte* out) {
    const GLuint* src = (const GLuint*) in;
    float* dst = (float*) out;

    for(int i = 0; i < 2; ++i) {
        dst[i] = src[i];
    }

    dst[2] = 0.0f;
}
/* Copies a 4-byte colour from `input` (components 0..3 taken as R, G, B, A)
 * into `output` at the channel positions given by R8IDX/G8IDX/B8IDX/A8IDX. */
static void _readVertexData4ubARGB(const GLubyte* input, GLubyte* output) {
    /* Destination slot for each source component, in source order. */
    const int slot[4] = {R8IDX, G8IDX, B8IDX, A8IDX};

    for(int i = 0; i < 4; ++i) {
        output[slot[i]] = input[i];
    }
}
/* Converts four float colour components at `in` to bytes: each is scaled
 * by 255, clamped to [0, 255] and stored at R8IDX/G8IDX/B8IDX/A8IDX of
 * `output`. */
static void _readVertexData4fARGB(const GLubyte* in, GLubyte* output) {
    const float* rgba = (const float*) in;

    output[R8IDX] = (GLubyte) clamp(rgba[0] * 255.0f, 0, 255);
    output[G8IDX] = (GLubyte) clamp(rgba[1] * 255.0f, 0, 255);
    output[B8IDX] = (GLubyte) clamp(rgba[2] * 255.0f, 0, 255);
    output[A8IDX] = (GLubyte) clamp(rgba[3] * 255.0f, 0, 255);
}
/* Converts three float colour components at `in` to bytes (scaled by 255
 * and clamped to [0, 255]) and stores them at R8IDX/G8IDX/B8IDX of
 * `output`; alpha is forced to 255. */
static void _readVertexData3fARGB(const GLubyte* in, GLubyte* output) {
    const float* rgb = (const float*) in;

    output[R8IDX] = (GLubyte) clamp(rgb[0] * 255.0f, 0, 255);
    output[G8IDX] = (GLubyte) clamp(rgb[1] * 255.0f, 0, 255);
    output[B8IDX] = (GLubyte) clamp(rgb[2] * 255.0f, 0, 255);
    output[A8IDX] = 255;
}
/* Copies a 3-byte colour from `input` into R8IDX/G8IDX/B8IDX of `output`
 * and sets the alpha channel to 255. */
static void _readVertexData3ubARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) {
    const int slot[3] = {R8IDX, G8IDX, B8IDX};

    for(int i = 0; i < 3; ++i) {
        output[slot[i]] = input[i];
    }

    output[A8IDX] = 255;
}
/* Reads a 4-byte colour whose source layout needs no channel reordering:
 * the value is handed straight to argbcpy(output, input).
 * NOTE(review): presumably the "Rev" source order already matches the
 * destination order - confirm against argbcpy's definition. */
static void _readVertexData4ubRevARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) {
argbcpy(output, input);
}
/* Converts four float components at `in` to bytes (scaled by 255 and
 * clamped to [0, 255]), keeping the source component order: component i
 * lands in output[i]. */
static void _readVertexData4fRevARGB(const GLubyte* __restrict__ in, GLubyte* __restrict__ output) {
    const float* src = (const float*) in;

    for(int i = 0; i < 4; ++i) {
        output[i] = (GLubyte) clamp(src[i] * 255.0f, 0, 255);
    }
}
/* Ignores `input` and writes the fixed vector (0, 0, -1) as three floats
 * at `out`. */
static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) {
    float* normal = (float*) out;

    _GL_UNUSED(input);

    normal[0] = 0.0f;
    normal[1] = 0.0f;
    normal[2] = -1.0f;
}
/* Ignores `input` and sets all 32 bits of the colour at `output`,
 * i.e. every 8-bit channel becomes 255. */
static void _fillWhiteARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) {
    uint32_t* colour = (uint32_t*) output;

    _GL_UNUSED(input);
    *colour = ~0;
}
/* Ignores `input` and zeroes the two 32-bit words at `out` (a pair of
 * floats set to 0.0). */
static void _fillZero2f(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) {
    /* Two explicit 32-bit stores are used instead of memset, which does
     * 8 byte writes and is slower here. */
    uint32_t* words = (uint32_t*) out;

    _GL_UNUSED(input);

    words[0] = 0;
    words[1] = 0;
}
/* Placeholder for 3-component 16-bit colours: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readVertexData3usARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Placeholder for 3-component 32-bit colours: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readVertexData3uiARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Placeholder for 4-component 16-bit colours: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readVertexData4usARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Placeholder for 4-component 32-bit colours: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readVertexData4uiARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Placeholder for the "Rev" 16-bit colour layout: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readVertexData4usRevARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Placeholder for the "Rev" 32-bit colour layout: not implemented. Trips
 * gl_assert and leaves `output` untouched. */
static void _readVertexData4uiRevARGB(const GLubyte* input, GLubyte* output) {
    _GL_UNUSED(output);
    _GL_UNUSED(input);

    gl_assert(0 && "Not Implemented");
}
/* Picks the colour reader for the current diffuse attribute.
 * - Attribute disabled: a filler that writes opaque white.
 * - Otherwise chosen by ATTRIB_LIST.colour.type, then by size: 3 selects
 *   the RGB reader, 4 the RGBA reader, and any other size the "Rev" reader.
 * Signed/unsigned variants share a reader; GL_DOUBLE and unknown types use
 * the float readers. */
static ReadAttributeFunc calcReadDiffuseFunc() {
    if((ATTRIB_LIST.enabled & DIFFUSE_ENABLED_FLAG) != DIFFUSE_ENABLED_FLAG) {
        /* No colour array bound, so every vertex is white */
        return _fillWhiteARGB;
    }

    switch(ATTRIB_LIST.colour.type) {
        case GL_BYTE:
        case GL_UNSIGNED_BYTE:
            if(ATTRIB_LIST.colour.size == 3) {
                return _readVertexData3ubARGB;
            }
            if(ATTRIB_LIST.colour.size == 4) {
                return _readVertexData4ubARGB;
            }
            return _readVertexData4ubRevARGB;
        case GL_SHORT:
        case GL_UNSIGNED_SHORT:
            if(ATTRIB_LIST.colour.size == 3) {
                return _readVertexData3usARGB;
            }
            if(ATTRIB_LIST.colour.size == 4) {
                return _readVertexData4usARGB;
            }
            return _readVertexData4usRevARGB;
        case GL_INT:
        case GL_UNSIGNED_INT:
            if(ATTRIB_LIST.colour.size == 3) {
                return _readVertexData3uiARGB;
            }
            if(ATTRIB_LIST.colour.size == 4) {
                return _readVertexData4uiARGB;
            }
            return _readVertexData4uiRevARGB;
        case GL_DOUBLE:
        case GL_FLOAT:
        default:
            if(ATTRIB_LIST.colour.size == 3) {
                return _readVertexData3fARGB;
            }
            if(ATTRIB_LIST.colour.size == 4) {
                return _readVertexData4fARGB;
            }
            return _readVertexData4fRevARGB;
    }
}
/* Picks the position reader matching ATTRIB_LIST.vertex.type and size.
 * A size of 3 selects the three-component reader; any other size selects
 * the two-component reader. Signed and unsigned variants of a type share
 * one reader, and GL_DOUBLE / unknown types use the float readers. */
static ReadAttributeFunc calcReadPositionFunc() {
    switch(ATTRIB_LIST.vertex.type) {
        case GL_BYTE:
        case GL_UNSIGNED_BYTE:
            if(ATTRIB_LIST.vertex.size == 3) {
                return _readVertexData3ub3f;
            }
            return _readVertexData2ub3f;
        case GL_SHORT:
        case GL_UNSIGNED_SHORT:
            if(ATTRIB_LIST.vertex.size == 3) {
                return _readVertexData3us3f;
            }
            return _readVertexData2us3f;
        case GL_INT:
        case GL_UNSIGNED_INT:
            if(ATTRIB_LIST.vertex.size == 3) {
                return _readVertexData3ui3f;
            }
            return _readVertexData2ui3f;
        case GL_DOUBLE:
        case GL_FLOAT:
        default:
            if(ATTRIB_LIST.vertex.size == 3) {
                return _readVertexData3f3f;
            }
            return _readVertexData2f3f;
    }
}
/* Picks the reader for the primary texture coordinate array based on
 * ATTRIB_LIST.uv.type, or a zero-filler when the UV attribute is disabled.
 * Signed/unsigned variants share a reader; GL_DOUBLE and unknown types use
 * the float reader. */
static ReadAttributeFunc calcReadUVFunc() {
    const int enabled = (ATTRIB_LIST.enabled & UV_ENABLED_FLAG) == UV_ENABLED_FLAG;

    if(!enabled) {
        return _fillZero2f;
    }

    switch(ATTRIB_LIST.uv.type) {
        case GL_BYTE:
        case GL_UNSIGNED_BYTE:
            return _readVertexData2ub2f;
        case GL_SHORT:
        case GL_UNSIGNED_SHORT:
            return _readVertexData2us2f;
        case GL_INT:
        case GL_UNSIGNED_INT:
            return _readVertexData2ui2f;
        case GL_DOUBLE:
        case GL_FLOAT:
        default:
            return _readVertexData2f2f;
    }
}
/* Picks the reader for the ST texture coordinate array based on
 * ATTRIB_LIST.st.type, or a zero-filler when the ST attribute is disabled.
 * Signed/unsigned variants share a reader; GL_DOUBLE and unknown types use
 * the float reader. */
static ReadAttributeFunc calcReadSTFunc() {
    const int enabled = (ATTRIB_LIST.enabled & ST_ENABLED_FLAG) == ST_ENABLED_FLAG;

    if(!enabled) {
        return _fillZero2f;
    }

    switch(ATTRIB_LIST.st.type) {
        case GL_BYTE:
        case GL_UNSIGNED_BYTE:
            return _readVertexData2ub2f;
        case GL_SHORT:
        case GL_UNSIGNED_SHORT:
            return _readVertexData2us2f;
        case GL_INT:
        case GL_UNSIGNED_INT:
            return _readVertexData2ui2f;
        case GL_DOUBLE:
        case GL_FLOAT:
        default:
            return _readVertexData2f2f;
    }
}
static ReadAttributeFunc calcReadNormalFunc() {
if((ATTRIB_LIST.enabled & NORMAL_ENABLED_FLAG) != NORMAL_ENABLED_FLAG) {
return _fillWithNegZVE;
@ -361,26 +415,27 @@ static ReadAttributeFunc calcReadNormalFunc() {
default:
case GL_DOUBLE:
case GL_FLOAT:
return _readVertexData3f3f;
return _readNormal3f3f;
break;
case GL_BYTE:
case GL_UNSIGNED_BYTE:
return _readVertexData3ub3f;
return _readNormal3ub3f;
break;
case GL_SHORT:
case GL_UNSIGNED_SHORT:
return _readVertexData3us3f;
return _readNormal3us3f;
break;
case GL_INT:
case GL_UNSIGNED_INT:
return _readVertexData3ui3f;
return _readNormal3ui3f;
break;
case GL_UNSIGNED_INT_2_10_10_10_REV:
return _readVertexData1i3f;
return _readNormal1i3f;
break;
}
}
void APIENTRY glEnableClientState(GLenum cap) {
TRACE();

View File

@ -289,13 +289,10 @@ static void _readPositionData(const GLuint first, const GLuint count, Vertex* it
const GLsizei vstride = ATTRIB_LIST.vertex.stride;
const GLubyte* vptr = ((GLubyte*) ATTRIB_LIST.vertex.ptr + (first * vstride));
float pos[3], w = 1.0f;
ITERATE(count) {
PREFETCH(vptr + vstride);
func(vptr, (GLubyte*) pos);
func(vptr, (GLubyte*) it);
it->flags = GPU_CMD_VERTEX;
TransformVertex(pos, &w, it->xyz, &it->w);
vptr += vstride;
++it;
@ -409,8 +406,7 @@ static void generateElements(
st = (GLubyte*) ATTRIB_LIST.st.ptr + (idx * ststride);
nxyz = (GLubyte*) ATTRIB_LIST.normal.ptr + (idx * nstride);
pos_func(xyz, (GLubyte*) pos);
TransformVertex((const float*) pos, &w, output->xyz, &output->w);
pos_func(xyz, (GLubyte*) output);
uv_func(uv, (GLubyte*) output->uv);
diffuse_func(bgra, output->bgra);
st_func(st, (GLubyte*) ve->st);
@ -458,8 +454,6 @@ static void generateElementsFastPath(
VertexExtra* ve = aligned_vector_at(target->extras, 0);
Vertex* it = start;
const float w = 1.0f;
if(!pos) {
return;
}
@ -470,7 +464,7 @@ static void generateElementsFastPath(
it->flags = GPU_CMD_VERTEX;
pos = (GLubyte*) ATTRIB_LIST.vertex.ptr + (idx * vstride);
TransformVertex((const float*) pos, &w, it->xyz, &it->w);
TransformVertex(((float*) pos)[0], ((float*) pos)[1], ((float*) pos)[2], 1.0f, it->xyz, &it->w);
if(uv) {
uv = (GLubyte*) ATTRIB_LIST.uv.ptr + (idx * uvstride);
@ -619,9 +613,13 @@ static void transform(SubmissionTarget* target) {
TRACE();
/* Perform modelview transform, storing W */
Vertex* vertex = _glSubmissionTargetStart(target);
Vertex* it = _glSubmissionTargetStart(target);
int count = target->count;
TransformVertices(vertex, target->count);
for(int i = 0; i < count; ++i, ++it) {
TransformVertex(it->xyz[0], it->xyz[1], it->xyz[2], it->w,
it->xyz, &it->w);
}
}
static void mat_transform_normal3(const float* xyz, const float* xyzOut, const uint32_t count, const uint32_t inStride, const uint32_t outStride) {

View File

@ -5,7 +5,6 @@
MAKE_FUNC(POLYMODE)
{
static const float w = 1.0f;
if(!(ATTRIB_LIST.enabled & VERTEX_ENABLED_FLAG)) {
/* If we don't have vertices, do nothing */
return;
@ -75,7 +74,7 @@ MAKE_FUNC(POLYMODE)
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
PREFETCH(ptr + stride);
TransformVertex((const float*) ptr, &w, it->xyz, &it->w);
TransformVertex(((float*) ptr)[0], ((float*) ptr)[1], ((float*) ptr)[2], 1.0f, it->xyz, &it->w);
PROCESS_VERTEX_FLAGS(it, min + i);
ptr += stride;
}

View File

@ -140,15 +140,14 @@ inline void TransformVec4(float* x) {
}
GL_FORCE_INLINE void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) {
register float __x __asm__("fr12") = (xyz[0]);
register float __y __asm__("fr13") = (xyz[1]);
register float __z __asm__("fr14") = (xyz[2]);
register float __w __asm__("fr15") = (*w);
GL_FORCE_INLINE void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow) {
register float __x __asm__("fr4") = x;
register float __y __asm__("fr5") = y;
register float __z __asm__("fr6") = z;
register float __w __asm__("fr7") = w;
__asm__ __volatile__(
"fldi1 fr15\n"
"ftrv xmtrx,fv12\n"
"ftrv xmtrx,fv4\n"
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
: "0" (__x), "1" (__y), "2" (__z), "3" (__w)
);
@ -159,28 +158,6 @@ GL_FORCE_INLINE void TransformVertex(const float* xyz, const float* w, float* ox
*ow = __w;
}
/* Transforms `count` consecutive vertices in place, starting at `vertices`.
 * For each vertex the xyz and w members are loaded into fr12..fr15, the
 * inline asm runs `fldi1 fr15` followed by `ftrv xmtrx,fv12`, and the four
 * resulting registers are written back to xyz and w. */
static inline void TransformVertices(Vertex* vertices, const int count) {
Vertex* it = vertices;
for(int i = 0; i < count; ++i, ++it) {
/* Pin the inputs to fr12..fr15, the registers named by fv12 in the asm. */
register float __x __asm__("fr12") = (it->xyz[0]);
register float __y __asm__("fr13") = (it->xyz[1]);
register float __z __asm__("fr14") = (it->xyz[2]);
register float __w __asm__("fr15") = (it->w);
/* NOTE(review): per the SH-4 ISA, fldi1 loads 1.0 into fr15, so the
 * it->w value loaded above is presumably overwritten before ftrv
 * runs - confirm. The matching constraints ("0".."3") tie each input
 * to the same register as its output. */
__asm__ __volatile__(
"fldi1 fr15\n"
"ftrv xmtrx,fv12\n"
: "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
: "0" (__x), "1" (__y), "2" (__z), "3" (__w)
);
/* Store the transformed vector back over the source vertex. */
it->xyz[0] = __x;
it->xyz[1] = __y;
it->xyz[2] = __z;
it->w = __w;
}
}
void InitGPU(_Bool autosort, _Bool fsaa);
static inline size_t GPUMemoryAvailable() {

View File

@ -619,34 +619,17 @@ void TransformVec4(float* v) {
FASTCPY(v, ret, sizeof(float) * 4);
}
void TransformVertices(Vertex* vertices, const int count) {
float ret[4];
for(int i = 0; i < count; ++i, ++vertices) {
ret[0] = vertices->xyz[0];
ret[1] = vertices->xyz[1];
ret[2] = vertices->xyz[2];
ret[3] = 1.0f;
void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow) {
float vec[4], ret[4];
vec[0] = x;
vec[1] = y;
vec[2] = z;
vec[3] = w;
TransformVec4(ret);
vertices->xyz[0] = ret[0];
vertices->xyz[1] = ret[1];
vertices->xyz[2] = ret[2];
vertices->w = ret[3];
}
}
void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) {
float ret[4];
ret[0] = xyz[0];
ret[1] = xyz[1];
ret[2] = xyz[2];
ret[3] = *w;
TransformVec4(ret);
TransformVec4NoMod(vec, ret);
oxyz[0] = ret[0];
oxyz[1] = ret[1];
oxyz[2] = ret[2];
*ow = ret[3];
*ow = ret[3];
}

View File

@ -49,8 +49,7 @@ static inline void TransformNormalNoMod(const float* xIn, float* xOut) {
(void) xOut;
}
void TransformVertices(Vertex* vertices, const int count);
void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow);
void TransformVertex(float x, float y, float z, float w, float* oxyz, float* ow);
void InitGPU(_Bool autosort, _Bool fsaa);