Performance improvements

This commit is contained in:
Luke Benstead 2021-05-25 20:40:43 +01:00
parent f736332eb8
commit fc947c49f8
4 changed files with 99 additions and 119 deletions

120
GL/draw.c
View File

@ -8,11 +8,11 @@
#include "private.h"
#include "platform.h"
static AttribPointer VERTEX_POINTER;
static AttribPointer UV_POINTER;
static AttribPointer ST_POINTER;
static AttribPointer NORMAL_POINTER;
static AttribPointer DIFFUSE_POINTER;
AttribPointer VERTEX_POINTER;
AttribPointer UV_POINTER;
AttribPointer ST_POINTER;
AttribPointer NORMAL_POINTER;
AttribPointer DIFFUSE_POINTER;
static GLuint ENABLED_VERTEX_ATTRIBUTES = 0;
static GLubyte ACTIVE_CLIENT_TEXTURE = 0;
@ -632,7 +632,7 @@ ReadNormalFunc calcReadNormalFunc() {
}
static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) {
const GLsizei vstride = (VERTEX_POINTER.stride) ? VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type);
const GLsizei vstride = VERTEX_POINTER.stride;
const GLubyte* vptr = ((GLubyte*) VERTEX_POINTER.ptr + (first * vstride));
GLubyte* out = (GLubyte*) output[0].xyz;
@ -654,7 +654,7 @@ static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GL
}
static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count, const Vertex* output) {
const GLsizei uvstride = (UV_POINTER.stride) ? UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type);
const GLsizei uvstride = UV_POINTER.stride;
const GLubyte* uvptr = ((GLubyte*) UV_POINTER.ptr + (first * uvstride));
GLubyte* out = (GLubyte*) output[0].uv;
@ -669,7 +669,7 @@ static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count,
}
static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) {
const GLsizei ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
const GLsizei ststride = ST_POINTER.stride;
const GLubyte* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride));
GLubyte* out = (GLubyte*) extra[0].st;
@ -684,7 +684,7 @@ static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count,
}
static void _readNormalData(ReadNormalFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) {
const GLsizei nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
const GLsizei nstride = NORMAL_POINTER.stride;
const GLubyte* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride));
GLubyte* out = (GLubyte*) extra[0].nxyz;
@ -718,8 +718,7 @@ GL_FORCE_INLINE GLuint diffusePointerSize() {
}
static void _readDiffuseData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) {
const GLuint size = diffusePointerSize();
const GLuint cstride = (DIFFUSE_POINTER.stride) ? DIFFUSE_POINTER.stride : size * byte_size(DIFFUSE_POINTER.type);
const GLuint cstride = DIFFUSE_POINTER.stride;
const GLubyte* cptr = ((GLubyte*) DIFFUSE_POINTER.ptr) + (first * cstride);
GLubyte* out = (GLubyte*) output[0].bgra;
@ -758,20 +757,12 @@ static void generateElements(
const ReadDiffuseFunc diffuse_func = calcReadDiffuseFunc();
const ReadNormalFunc normal_func = calcReadNormalFunc();
const GLuint vstride = (VERTEX_POINTER.stride) ?
VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type);
const GLsizei vstride = VERTEX_POINTER.stride;
const GLuint uvstride = (UV_POINTER.stride) ?
UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type);
const GLuint ststride = (ST_POINTER.stride) ?
ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
const GLuint dstride = (DIFFUSE_POINTER.stride) ?
DIFFUSE_POINTER.stride : diffusePointerSize() * byte_size(DIFFUSE_POINTER.type);
const GLuint nstride = (NORMAL_POINTER.stride) ?
NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
const GLuint uvstride = UV_POINTER.stride;
const GLuint ststride = ST_POINTER.stride;
const GLuint dstride = DIFFUSE_POINTER.stride;
const GLuint nstride = NORMAL_POINTER.stride;
for(; i < first + count; ++i) {
idx = IndexFunc(indices + (i * istride));
@ -812,20 +803,11 @@ static void generateElementsFastPath(
Vertex* start = _glSubmissionTargetStart(target);
const GLuint vstride = (VERTEX_POINTER.stride) ?
VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type);
const GLuint uvstride = (UV_POINTER.stride) ?
UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type);
const GLuint ststride = (ST_POINTER.stride) ?
ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
const GLuint dstride = (DIFFUSE_POINTER.stride) ?
DIFFUSE_POINTER.stride : diffusePointerSize() * byte_size(DIFFUSE_POINTER.type);
const GLuint nstride = (NORMAL_POINTER.stride) ?
NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
const GLuint vstride = VERTEX_POINTER.stride;
const GLuint uvstride = UV_POINTER.stride;
const GLuint ststride = ST_POINTER.stride;
const GLuint dstride = DIFFUSE_POINTER.stride;
const GLuint nstride = NORMAL_POINTER.stride;
const GLsizei istride = byte_size(type);
const IndexParseFunc IndexFunc = _calcParseIndexFunc(type);
@ -842,17 +824,17 @@ static void generateElementsFastPath(
const float w = 1.0f;
if(!pos) {
return;
}
for(GLuint i = first; i < first + count; ++i) {
GLuint idx = IndexFunc(indices + (i * istride));
it->flags = GPU_CMD_VERTEX;
if(pos) {
pos = (GLubyte*) VERTEX_POINTER.ptr + (idx * vstride);
TransformVertex((const float*) pos, &w, it->xyz, &it->w);
} else {
*((Float3*) it->xyz) = F3ZERO;
}
pos = (GLubyte*) VERTEX_POINTER.ptr + (idx * vstride);
TransformVertex((const float*) pos, &w, it->xyz, &it->w);
if(uv) {
uv = (GLubyte*) UV_POINTER.ptr + (idx * uvstride);
@ -892,21 +874,11 @@ static void generateElementsFastPath(
static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first, const GLuint count) {
Vertex* start = _glSubmissionTargetStart(target);
const GLuint vstride = (VERTEX_POINTER.stride) ?
VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type);
const GLuint uvstride = (UV_POINTER.stride) ?
UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type);
const GLuint ststride = (ST_POINTER.stride) ?
ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
const GLuint dstride = (DIFFUSE_POINTER.stride) ?
DIFFUSE_POINTER.stride : diffusePointerSize() * byte_size(DIFFUSE_POINTER.type);
const GLuint nstride = (NORMAL_POINTER.stride) ?
NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
const GLuint vstride = VERTEX_POINTER.stride;
const GLuint uvstride = UV_POINTER.stride;
const GLuint ststride = ST_POINTER.stride;
const GLuint dstride = DIFFUSE_POINTER.stride;
const GLuint nstride = NORMAL_POINTER.stride;
/* Copy the pos, uv and color directly in one go */
const GLubyte* pos = (ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG) ? VERTEX_POINTER.ptr + (first * vstride) : NULL;
@ -922,16 +894,16 @@ static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first
uint32_t i = count;
if(!pos) {
/* If we don't have vertices, do nothing */
return;
}
while(i--) {
it->flags = GPU_CMD_VERTEX;
if(pos) {
TransformVertex((const float*) pos, &w, it->xyz, &it->w);
pos += vstride;
} else {
*((Float3*) it->xyz) = F3ZERO;
}
TransformVertex((const float*) pos, &w, it->xyz, &it->w);
pos += vstride;
if(uv) {
MEMCPY4(it->uv, uv, sizeof(float) * 2);
@ -1468,12 +1440,12 @@ void APIENTRY glTexCoordPointer(GLint size, GLenum type, GLsizei stride, cons
AttribPointer* tointer = (ACTIVE_CLIENT_TEXTURE == 0) ? &UV_POINTER : &ST_POINTER;
tointer->ptr = pointer;
tointer->stride = stride;
tointer->stride = (stride) ? stride : size * byte_size(type);
tointer->type = type;
tointer->size = size;
}
void APIENTRY glVertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid * pointer) {
void APIENTRY glVertexPointer(GLint size, GLenum type, GLsizei stride, const GLvoid * pointer) {
TRACE();
if(size < 2 || size > 4) {
@ -1483,7 +1455,7 @@ void APIENTRY glVertexPointer(GLint size, GLenum type, GLsizei stride, const
}
VERTEX_POINTER.ptr = pointer;
VERTEX_POINTER.stride = stride;
/* A stride of 0 means the data is tightly packed, so derive the stride from
 * the element layout. Use the incoming `type` argument here: at this point
 * VERTEX_POINTER.type has not been updated yet (it is assigned on the next
 * line) and still holds the type from the previous call. */
VERTEX_POINTER.stride = (stride) ? stride : (size * byte_size(type));
VERTEX_POINTER.type = type;
VERTEX_POINTER.size = size;
}
@ -1497,10 +1469,11 @@ void APIENTRY glColorPointer(GLint size, GLenum type, GLsizei stride, const G
return;
}
DIFFUSE_POINTER.ptr = pointer;
DIFFUSE_POINTER.stride = stride;
DIFFUSE_POINTER.type = type;
DIFFUSE_POINTER.size = size;
DIFFUSE_POINTER.size = (DIFFUSE_POINTER.size == GL_BGRA) ? 4 : size;
DIFFUSE_POINTER.stride = (stride) ? stride : DIFFUSE_POINTER.size * byte_size(type);
}
void APIENTRY glNormalPointer(GLenum type, GLsizei stride, const GLvoid * pointer) {
@ -1522,8 +1495,7 @@ void APIENTRY glNormalPointer(GLenum type, GLsizei stride, const GLvoid * poin
}
NORMAL_POINTER.ptr = pointer;
NORMAL_POINTER.stride = stride;
NORMAL_POINTER.type = type;
NORMAL_POINTER.size = (type == GL_UNSIGNED_INT_2_10_10_10_REV) ? 1 : 3;
NORMAL_POINTER.stride = (stride) ? stride : NORMAL_POINTER.size * byte_size(type);
NORMAL_POINTER.type = type;
}

View File

@ -101,29 +101,38 @@ GL_FORCE_INLINE void glPerspectiveDivideStandard(void* src, uint32_t n) {
/* Perform perspective divide on each vertex */
Vertex* vertex = (Vertex*) src;
PREFETCH(vertex + 1);
const float h = GetVideoMode()->height;
while(n--) {
PREFETCH(vertex + 1);
PREFETCH(vertex + 2);
if(likely(glIsVertex(vertex->flags))) {
const float f = MATH_Fast_Invert(vertex->w);
/* Convert to NDC and apply viewport */
vertex->xyz[0] = MATH_fmac(
vertex->xyz[0] = __builtin_fmaf(
VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth
);
vertex->xyz[1] = h - MATH_fmac(
vertex->xyz[1] = h - __builtin_fmaf(
VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight
);
/* FIXME: Apply depth range */
vertex->xyz[2] = MAX(
1.0f - MATH_fmac(vertex->xyz[2] * f, 0.5f, 0.5f),
PVR_MIN_Z
);
/* After multiplying by 'f', the Z coordinate is between
* -1 and 1. We then need to shift it into a value > 0.00001f
* where the larger value becomes smaller and vice-versa (because
* the PVR works backwards).
*
* If we multiply the lowest value (-1) by -1 it becomes 1, and if
* we multiply the highest value (1) by -1 it becomes -1; we then need
* to add 1 to get it into the range 0 - 2. Finally we add a little offset
* (the extra 0.00001f), and this approach means we can just use FMAC.
* */
vertex->xyz[2] = __builtin_fmaf((vertex->xyz[2] * f), -1.0f, 1.00001f);
}
++vertex;

View File

@ -30,6 +30,12 @@ static AttribPointer UV_ATTRIB;
static AttribPointer ST_ATTRIB;
static AttribPointer NORMAL_ATTRIB;
extern AttribPointer VERTEX_POINTER;
extern AttribPointer UV_POINTER;
extern AttribPointer ST_POINTER;
extern AttribPointer NORMAL_POINTER;
extern AttribPointer DIFFUSE_POINTER;
/* We store the list of attributes that have been "enabled" by a call to
glColor, glNormal, glTexCoord etc. otherwise we already have defaults that
can be applied faster */
@ -114,10 +120,10 @@ void APIENTRY glBegin(GLenum mode) {
void APIENTRY glColor4f(GLfloat r, GLfloat g, GLfloat b, GLfloat a) {
ENABLED_VERTEX_ATTRIBUTES |= DIFFUSE_ENABLED_FLAG;
COLOR[A8IDX] = (GLubyte)(a * 255);
COLOR[R8IDX] = (GLubyte)(r * 255);
COLOR[G8IDX] = (GLubyte)(g * 255);
COLOR[B8IDX] = (GLubyte)(b * 255);
COLOR[A8IDX] = (GLubyte)(a * 255.0f);
COLOR[R8IDX] = (GLubyte)(r * 255.0f);
COLOR[G8IDX] = (GLubyte)(g * 255.0f);
COLOR[B8IDX] = (GLubyte)(b * 255.0f);
}
void APIENTRY glColor4ub(GLubyte r, GLubyte g, GLubyte b, GLubyte a) {
@ -268,27 +274,21 @@ void APIENTRY glEnd() {
GLuint* attrs = _glGetEnabledAttributes();
AttribPointer* vattr = _glGetVertexAttribPointer();
AttribPointer* dattr = _glGetDiffuseAttribPointer();
AttribPointer* nattr = _glGetNormalAttribPointer();
AttribPointer* uattr = _glGetUVAttribPointer();
AttribPointer* sattr = _glGetSTAttribPointer();
/* Stash existing values */
AttribPointer vptr = *vattr;
AttribPointer dptr = *dattr;
AttribPointer nptr = *nattr;
AttribPointer uvptr = *uattr;
AttribPointer stptr = *sattr;
AttribPointer vptr = VERTEX_POINTER;
AttribPointer dptr = DIFFUSE_POINTER;
AttribPointer nptr = NORMAL_POINTER;
AttribPointer uvptr = UV_POINTER;
AttribPointer stptr = ST_POINTER;
GLuint prevAttrs = *attrs;
/* Switch to our immediate mode arrays */
*vattr = VERTEX_ATTRIB;
*dattr = DIFFUSE_ATTRIB;
*nattr = NORMAL_ATTRIB;
*uattr = UV_ATTRIB;
*sattr = ST_ATTRIB;
VERTEX_POINTER = VERTEX_ATTRIB;
DIFFUSE_POINTER = DIFFUSE_ATTRIB;
NORMAL_POINTER = NORMAL_ATTRIB;
UV_POINTER = UV_ATTRIB;
ST_POINTER = ST_ATTRIB;
*attrs = ENABLED_VERTEX_ATTRIBUTES;
@ -303,11 +303,11 @@ void APIENTRY glEnd() {
glDrawArrays(ACTIVE_POLYGON_MODE, 0, VERTICES.size);
/* Restore everything */
*vattr = vptr;
*dattr = dptr;
*nattr = nptr;
*uattr = uvptr;
*sattr = stptr;
VERTEX_POINTER = vptr;
DIFFUSE_POINTER = dptr;
NORMAL_POINTER = nptr;
UV_POINTER = uvptr;
ST_POINTER = stptr;
*attrs = prevAttrs;
@ -315,12 +315,6 @@ void APIENTRY glEnd() {
aligned_vector_clear(&VERTICES);
aligned_vector_clear(&ST_COORDS);
aligned_vector_clear(&NORMALS);
*vattr = vptr;
*dattr = dptr;
*nattr = nptr;
*uattr = uvptr;
*sattr = stptr;
}
void APIENTRY glRectf(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2) {

View File

@ -16,6 +16,11 @@
#define PERF_WARNING(msg) (void) 0
#endif
#ifndef GL_FORCE_INLINE
#define GL_NO_INSTRUMENT inline __attribute__((no_instrument_function))
#define GL_INLINE_DEBUG GL_NO_INSTRUMENT __attribute__((always_inline))
#define GL_FORCE_INLINE static GL_INLINE_DEBUG
#endif
#define PREFETCH(addr) __asm__("pref @%0" : : "r"((addr)))
@ -39,29 +44,29 @@
#define VEC3_LENGTH(x, y, z, l) vec3f_length((x), (y), (z), (l))
#define VEC3_DOT(x1, y1, z1, x2, y2, z2, d) vec3f_dot((x1), (y1), (z1), (x2), (y2), (z2), (d))
static inline void UploadMatrix4x4(const Matrix4x4* mat) {
GL_FORCE_INLINE void UploadMatrix4x4(const Matrix4x4* mat) {
mat_load((matrix_t*) mat);
}
static inline void DownloadMatrix4x4(Matrix4x4* mat) {
GL_FORCE_INLINE void DownloadMatrix4x4(Matrix4x4* mat) {
mat_store((matrix_t*) mat);
}
static inline void MultiplyMatrix4x4(const Matrix4x4* mat) {
GL_FORCE_INLINE void MultiplyMatrix4x4(const Matrix4x4* mat) {
mat_apply((matrix_t*) mat);
}
static inline void TransformVec3(float* x) {
GL_FORCE_INLINE void TransformVec3(float* x) {
mat_trans_single4(x[0], x[1], x[2], x[3]);
}
/* Transform a 3-element vector using the stored matrix (w == 1) */
static inline void TransformVec3NoMod(const float* xIn, float* xOut) {
GL_FORCE_INLINE void TransformVec3NoMod(const float* xIn, float* xOut) {
mat_trans_single3_nodiv_nomod(xIn[0], xIn[1], xIn[2], xOut[0], xOut[1], xOut[2]);
}
/* Transform a 3-element normal using the stored matrix (w == 0)*/
static inline void TransformNormalNoMod(const float* in, float* out) {
GL_FORCE_INLINE void TransformNormalNoMod(const float* in, float* out) {
mat_trans_normal3_nomod(in[0], in[1], in[2], out[0], out[1], out[2]);
}
@ -70,7 +75,7 @@ inline void TransformVec4(float* x) {
}
static inline void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) {
GL_FORCE_INLINE void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) {
register float __x __asm__("fr12") = (xyz[0]);
register float __y __asm__("fr13") = (xyz[1]);
register float __z __asm__("fr14") = (xyz[2]);