From fc947c49f85e41a4aec7a4d9d1e30a8212d04896 Mon Sep 17 00:00:00 2001 From: Luke Benstead Date: Tue, 25 May 2021 20:40:43 +0100 Subject: [PATCH] Performance improvements --- GL/draw.c | 120 +++++++++++++++++---------------------------- GL/flush.c | 23 ++++++--- GL/immediate.c | 56 ++++++++++----------- GL/platforms/sh4.h | 19 ++++--- 4 files changed, 99 insertions(+), 119 deletions(-) diff --git a/GL/draw.c b/GL/draw.c index 8048fa3..c602cd7 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -8,11 +8,11 @@ #include "private.h" #include "platform.h" -static AttribPointer VERTEX_POINTER; -static AttribPointer UV_POINTER; -static AttribPointer ST_POINTER; -static AttribPointer NORMAL_POINTER; -static AttribPointer DIFFUSE_POINTER; +AttribPointer VERTEX_POINTER; +AttribPointer UV_POINTER; +AttribPointer ST_POINTER; +AttribPointer NORMAL_POINTER; +AttribPointer DIFFUSE_POINTER; static GLuint ENABLED_VERTEX_ATTRIBUTES = 0; static GLubyte ACTIVE_CLIENT_TEXTURE = 0; @@ -632,7 +632,7 @@ ReadNormalFunc calcReadNormalFunc() { } static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) { - const GLsizei vstride = (VERTEX_POINTER.stride) ? VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type); + const GLsizei vstride = VERTEX_POINTER.stride; const GLubyte* vptr = ((GLubyte*) VERTEX_POINTER.ptr + (first * vstride)); GLubyte* out = (GLubyte*) output[0].xyz; @@ -654,7 +654,7 @@ static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GL } static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count, const Vertex* output) { - const GLsizei uvstride = (UV_POINTER.stride) ? 
UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type); + const GLsizei uvstride = UV_POINTER.stride; const GLubyte* uvptr = ((GLubyte*) UV_POINTER.ptr + (first * uvstride)); GLubyte* out = (GLubyte*) output[0].uv; @@ -669,7 +669,7 @@ static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count, } static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) { - const GLsizei ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); + const GLsizei ststride = ST_POINTER.stride; const GLubyte* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride)); GLubyte* out = (GLubyte*) extra[0].st; @@ -684,7 +684,7 @@ static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count, } static void _readNormalData(ReadNormalFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) { - const GLsizei nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); + const GLsizei nstride = NORMAL_POINTER.stride; const GLubyte* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride)); GLubyte* out = (GLubyte*) extra[0].nxyz; @@ -718,8 +718,7 @@ GL_FORCE_INLINE GLuint diffusePointerSize() { } static void _readDiffuseData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) { - const GLuint size = diffusePointerSize(); - const GLuint cstride = (DIFFUSE_POINTER.stride) ? DIFFUSE_POINTER.stride : size * byte_size(DIFFUSE_POINTER.type); + const GLuint cstride = DIFFUSE_POINTER.stride; const GLubyte* cptr = ((GLubyte*) DIFFUSE_POINTER.ptr) + (first * cstride); GLubyte* out = (GLubyte*) output[0].bgra; @@ -758,20 +757,12 @@ static void generateElements( const ReadDiffuseFunc diffuse_func = calcReadDiffuseFunc(); const ReadNormalFunc normal_func = calcReadNormalFunc(); - const GLuint vstride = (VERTEX_POINTER.stride) ? 
- VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type); + const GLsizei vstride = VERTEX_POINTER.stride; - const GLuint uvstride = (UV_POINTER.stride) ? - UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type); - - const GLuint ststride = (ST_POINTER.stride) ? - ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); - - const GLuint dstride = (DIFFUSE_POINTER.stride) ? - DIFFUSE_POINTER.stride : diffusePointerSize() * byte_size(DIFFUSE_POINTER.type); - - const GLuint nstride = (NORMAL_POINTER.stride) ? - NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); + const GLuint uvstride = UV_POINTER.stride; + const GLuint ststride = ST_POINTER.stride; + const GLuint dstride = DIFFUSE_POINTER.stride; + const GLuint nstride = NORMAL_POINTER.stride; for(; i < first + count; ++i) { idx = IndexFunc(indices + (i * istride)); @@ -812,20 +803,11 @@ static void generateElementsFastPath( Vertex* start = _glSubmissionTargetStart(target); - const GLuint vstride = (VERTEX_POINTER.stride) ? - VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type); - - const GLuint uvstride = (UV_POINTER.stride) ? - UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type); - - const GLuint ststride = (ST_POINTER.stride) ? - ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); - - const GLuint dstride = (DIFFUSE_POINTER.stride) ? - DIFFUSE_POINTER.stride : diffusePointerSize() * byte_size(DIFFUSE_POINTER.type); - - const GLuint nstride = (NORMAL_POINTER.stride) ? 
- NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); + const GLuint vstride = VERTEX_POINTER.stride; + const GLuint uvstride = UV_POINTER.stride; + const GLuint ststride = ST_POINTER.stride; + const GLuint dstride = DIFFUSE_POINTER.stride; + const GLuint nstride = NORMAL_POINTER.stride; const GLsizei istride = byte_size(type); const IndexParseFunc IndexFunc = _calcParseIndexFunc(type); @@ -842,17 +824,17 @@ static void generateElementsFastPath( const float w = 1.0f; + if(!pos) { + return; + } + for(GLuint i = first; i < first + count; ++i) { GLuint idx = IndexFunc(indices + (i * istride)); it->flags = GPU_CMD_VERTEX; - if(pos) { - pos = (GLubyte*) VERTEX_POINTER.ptr + (idx * vstride); - TransformVertex((const float*) pos, &w, it->xyz, &it->w); - } else { - *((Float3*) it->xyz) = F3ZERO; - } + pos = (GLubyte*) VERTEX_POINTER.ptr + (idx * vstride); + TransformVertex((const float*) pos, &w, it->xyz, &it->w); if(uv) { uv = (GLubyte*) UV_POINTER.ptr + (idx * uvstride); @@ -892,21 +874,11 @@ static void generateElementsFastPath( static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first, const GLuint count) { Vertex* start = _glSubmissionTargetStart(target); - const GLuint vstride = (VERTEX_POINTER.stride) ? - VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type); - - const GLuint uvstride = (UV_POINTER.stride) ? - UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type); - - const GLuint ststride = (ST_POINTER.stride) ? - ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); - - const GLuint dstride = (DIFFUSE_POINTER.stride) ? - DIFFUSE_POINTER.stride : diffusePointerSize() * byte_size(DIFFUSE_POINTER.type); - - const GLuint nstride = (NORMAL_POINTER.stride) ? 
- NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); - + const GLuint vstride = VERTEX_POINTER.stride; + const GLuint uvstride = UV_POINTER.stride; + const GLuint ststride = ST_POINTER.stride; + const GLuint dstride = DIFFUSE_POINTER.stride; + const GLuint nstride = NORMAL_POINTER.stride; /* Copy the pos, uv and color directly in one go */ const GLubyte* pos = (ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG) ? VERTEX_POINTER.ptr + (first * vstride) : NULL; @@ -922,16 +894,16 @@ static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first uint32_t i = count; + if(!pos) { + /* If we don't have vertices, do nothing */ + return; + } + while(i--) { it->flags = GPU_CMD_VERTEX; - if(pos) { - TransformVertex((const float*) pos, &w, it->xyz, &it->w); - pos += vstride; - } else { - *((Float3*) it->xyz) = F3ZERO; - } - + TransformVertex((const float*) pos, &w, it->xyz, &it->w); + pos += vstride; if(uv) { MEMCPY4(it->uv, uv, sizeof(float) * 2); @@ -1468,12 +1440,12 @@ void APIENTRY glTexCoordPointer(GLint size, GLenum type, GLsizei stride, cons AttribPointer* tointer = (ACTIVE_CLIENT_TEXTURE == 0) ? &UV_POINTER : &ST_POINTER; tointer->ptr = pointer; - tointer->stride = stride; + tointer->stride = (stride) ? stride : size * byte_size(type); tointer->type = type; tointer->size = size; } -void APIENTRY glVertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid * pointer) { +void APIENTRY glVertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid * pointer) { TRACE(); if(size < 2 || size > 4) { @@ -1483,7 +1455,7 @@ void APIENTRY glVertexPointer(GLint size, GLenum type, GLsizei stride, const } VERTEX_POINTER.ptr = pointer; - VERTEX_POINTER.stride = stride; + VERTEX_POINTER.stride = (stride) ? 
stride : (size * byte_size(type)); /* use the incoming type: VERTEX_POINTER.type is only updated by the next statement */ VERTEX_POINTER.type = type; VERTEX_POINTER.size = size; } @@ -1497,10 +1469,11 @@ void APIENTRY glColorPointer(GLint size, GLenum type, GLsizei stride, const G return; } + DIFFUSE_POINTER.ptr = pointer; - DIFFUSE_POINTER.stride = stride; DIFFUSE_POINTER.type = type; DIFFUSE_POINTER.size = size; + DIFFUSE_POINTER.stride = (stride) ? stride : ((size == GL_BGRA) ? 4 : size) * byte_size(type); /* test the incoming size, not the stored one; GL_BGRA counts as 4 components */ } void APIENTRY glNormalPointer(GLenum type, GLsizei stride, const GLvoid * pointer) { @@ -1522,8 +1495,7 @@ void APIENTRY glNormalPointer(GLenum type, GLsizei stride, const GLvoid * poin } NORMAL_POINTER.ptr = pointer; - NORMAL_POINTER.stride = stride; - NORMAL_POINTER.type = type; NORMAL_POINTER.size = (type == GL_UNSIGNED_INT_2_10_10_10_REV) ? 1 : 3; - + NORMAL_POINTER.stride = (stride) ? stride : NORMAL_POINTER.size * byte_size(type); + NORMAL_POINTER.type = type; } diff --git a/GL/flush.c b/GL/flush.c index 6aab22f..eec9d76 100644 --- a/GL/flush.c +++ b/GL/flush.c @@ -101,29 +101,38 @@ GL_FORCE_INLINE void glPerspectiveDivideStandard(void* src, uint32_t n) { /* Perform perspective divide on each vertex */ Vertex* vertex = (Vertex*) src; + PREFETCH(vertex + 1); const float h = GetVideoMode()->height; while(n--) { - PREFETCH(vertex + 1); + PREFETCH(vertex + 2); if(likely(glIsVertex(vertex->flags))) { const float f = MATH_Fast_Invert(vertex->w); /* Convert to NDC and apply viewport */ - vertex->xyz[0] = MATH_fmac( + vertex->xyz[0] = __builtin_fmaf( VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth ); - vertex->xyz[1] = h - MATH_fmac( + vertex->xyz[1] = h - __builtin_fmaf( VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight ); /* FIXME: Apply depth range */ - vertex->xyz[2] = MAX( - 1.0f - MATH_fmac(vertex->xyz[2] * f, 0.5f, 0.5f), - PVR_MIN_Z - ); + + /* After multiplying by 'f', the Z coordinate is between + * -1 and 1. 
We then need to shift it into a value > 0.00001f + * where the larger value becomes smaller and vice-versa (because + * the PVR works backwards). + * + * If we multiply the lowest value (-1) by -1 it becomes 1, and if + * we multiply the highest value (1) by -1 it becomes -1; we then need + * to add 1 to get it in the range 0 - 2. Then we add a little offset + * and this approach means we can just use FMAC. + * */ + vertex->xyz[2] = __builtin_fmaf((vertex->xyz[2] * f), -1.0f, 1.00001f); } ++vertex; diff --git a/GL/immediate.c b/GL/immediate.c index 7cb4057..62bd7a2 100644 --- a/GL/immediate.c +++ b/GL/immediate.c @@ -30,6 +30,12 @@ static AttribPointer UV_ATTRIB; static AttribPointer ST_ATTRIB; static AttribPointer NORMAL_ATTRIB; +extern AttribPointer VERTEX_POINTER; +extern AttribPointer UV_POINTER; +extern AttribPointer ST_POINTER; +extern AttribPointer NORMAL_POINTER; +extern AttribPointer DIFFUSE_POINTER; + /* We store the list of attributes that have been "enabled" by a call to glColor, glNormal, glTexCoord etc. 
otherwise we already have defaults that can be applied faster */ @@ -114,10 +120,10 @@ void APIENTRY glBegin(GLenum mode) { void APIENTRY glColor4f(GLfloat r, GLfloat g, GLfloat b, GLfloat a) { ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG; - COLOR[A8IDX] = (GLubyte)(a * 255); - COLOR[R8IDX] = (GLubyte)(r * 255); - COLOR[G8IDX] = (GLubyte)(g * 255); - COLOR[B8IDX] = (GLubyte)(b * 255); + COLOR[A8IDX] = (GLubyte)(a * 255.0f); + COLOR[R8IDX] = (GLubyte)(r * 255.0f); + COLOR[G8IDX] = (GLubyte)(g * 255.0f); + COLOR[B8IDX] = (GLubyte)(b * 255.0f); } void APIENTRY glColor4ub(GLubyte r, GLubyte g, GLubyte b, GLubyte a) { @@ -268,27 +274,21 @@ void APIENTRY glEnd() { GLuint* attrs = _glGetEnabledAttributes(); - AttribPointer* vattr = _glGetVertexAttribPointer(); - AttribPointer* dattr = _glGetDiffuseAttribPointer(); - AttribPointer* nattr = _glGetNormalAttribPointer(); - AttribPointer* uattr = _glGetUVAttribPointer(); - AttribPointer* sattr = _glGetSTAttribPointer(); - /* Stash existing values */ - AttribPointer vptr = *vattr; - AttribPointer dptr = *dattr; - AttribPointer nptr = *nattr; - AttribPointer uvptr = *uattr; - AttribPointer stptr = *sattr; + AttribPointer vptr = VERTEX_POINTER; + AttribPointer dptr = DIFFUSE_POINTER; + AttribPointer nptr = NORMAL_POINTER; + AttribPointer uvptr = UV_POINTER; + AttribPointer stptr = ST_POINTER; GLuint prevAttrs = *attrs; /* Switch to our immediate mode arrays */ - *vattr = VERTEX_ATTRIB; - *dattr = DIFFUSE_ATTRIB; - *nattr = NORMAL_ATTRIB; - *uattr = UV_ATTRIB; - *sattr = ST_ATTRIB; + VERTEX_POINTER = VERTEX_ATTRIB; + DIFFUSE_POINTER = DIFFUSE_ATTRIB; + NORMAL_POINTER = NORMAL_ATTRIB; + UV_POINTER = UV_ATTRIB; + ST_POINTER = ST_ATTRIB; *attrs = ENABLED_VERTEX_ATTRIBUTES; @@ -303,11 +303,11 @@ void APIENTRY glEnd() { glDrawArrays(ACTIVE_POLYGON_MODE, 0, VERTICES.size); /* Restore everything */ - *vattr = vptr; - *dattr = dptr; - *nattr = nptr; - *uattr = uvptr; - *sattr = stptr; + VERTEX_POINTER = vptr; + DIFFUSE_POINTER = 
dptr; + NORMAL_POINTER = nptr; + UV_POINTER = uvptr; + ST_POINTER = stptr; *attrs = prevAttrs; @@ -315,12 +315,6 @@ void APIENTRY glEnd() { aligned_vector_clear(&VERTICES); aligned_vector_clear(&ST_COORDS); aligned_vector_clear(&NORMALS); - - *vattr = vptr; - *dattr = dptr; - *nattr = nptr; - *uattr = uvptr; - *sattr = stptr; } void APIENTRY glRectf(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2) { diff --git a/GL/platforms/sh4.h b/GL/platforms/sh4.h index f2a3295..9e518f1 100644 --- a/GL/platforms/sh4.h +++ b/GL/platforms/sh4.h @@ -16,6 +16,11 @@ #define PERF_WARNING(msg) (void) 0 #endif +#ifndef GL_FORCE_INLINE +#define GL_NO_INSTRUMENT inline __attribute__((no_instrument_function)) +#define GL_INLINE_DEBUG GL_NO_INSTRUMENT __attribute__((always_inline)) +#define GL_FORCE_INLINE static GL_INLINE_DEBUG +#endif #define PREFETCH(addr) __asm__("pref @%0" : : "r"((addr))) @@ -39,29 +44,29 @@ #define VEC3_LENGTH(x, y, z, l) vec3f_length((x), (y), (z), (l)) #define VEC3_DOT(x1, y1, z1, x2, y2, z2, d) vec3f_dot((x1), (y1), (z1), (x2), (y2), (z2), (d)) -static inline void UploadMatrix4x4(const Matrix4x4* mat) { +GL_FORCE_INLINE void UploadMatrix4x4(const Matrix4x4* mat) { mat_load((matrix_t*) mat); } -static inline void DownloadMatrix4x4(Matrix4x4* mat) { +GL_FORCE_INLINE void DownloadMatrix4x4(Matrix4x4* mat) { mat_store((matrix_t*) mat); } -static inline void MultiplyMatrix4x4(const Matrix4x4* mat) { +GL_FORCE_INLINE void MultiplyMatrix4x4(const Matrix4x4* mat) { mat_apply((matrix_t*) mat); } -static inline void TransformVec3(float* x) { +GL_FORCE_INLINE void TransformVec3(float* x) { mat_trans_single4(x[0], x[1], x[2], x[3]); } /* Transform a 3-element vector using the stored matrix (w == 1) */ -static inline void TransformVec3NoMod(const float* xIn, float* xOut) { +GL_FORCE_INLINE void TransformVec3NoMod(const float* xIn, float* xOut) { mat_trans_single3_nodiv_nomod(xIn[0], xIn[1], xIn[2], xOut[0], xOut[1], xOut[2]); } /* Transform a 3-element normal using the stored 
matrix (w == 0)*/ -static inline void TransformNormalNoMod(const float* in, float* out) { +GL_FORCE_INLINE void TransformNormalNoMod(const float* in, float* out) { mat_trans_normal3_nomod(in[0], in[1], in[2], out[0], out[1], out[2]); } @@ -70,7 +75,7 @@ inline void TransformVec4(float* x) { } -static inline void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) { +GL_FORCE_INLINE void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) { register float __x __asm__("fr12") = (xyz[0]); register float __y __asm__("fr13") = (xyz[1]); register float __z __asm__("fr14") = (xyz[2]);