From 5a9b7bb37e98cdb2d9af9fb2cc2e44079b3a7e23 Mon Sep 17 00:00:00 2001 From: Luke Benstead Date: Tue, 20 Apr 2021 16:49:00 +0100 Subject: [PATCH 1/6] Force inline aligned vector functions --- CMakeLists.txt | 2 +- GL/draw.c | 45 ++++++++++++++++++------------------- containers/aligned_vector.h | 10 +++++++-- 3 files changed, 31 insertions(+), 26 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d39a310..8578f94 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,7 +25,7 @@ if(NOT PLATFORM_DREAMCAST) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32") endif() -set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast --fast-math") +set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 --fast-math") set( SOURCES diff --git a/GL/draw.c b/GL/draw.c index ab3ade0..1f0914e 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -109,7 +109,7 @@ typedef void (*FloatParseFunc)(GLfloat* out, const GLubyte* in); typedef void (*ByteParseFunc)(GLubyte* out, const GLubyte* in); typedef void (*PolyBuildFunc)(Vertex* first, Vertex* previous, Vertex* vertex, Vertex* next, const GLsizei i); -static void _readVertexData3f3f(const GLubyte* in, GLubyte* out) { +static void _readVertexData3f3f(const GLubyte* __restrict__ in, GLubyte* __restrict__ out) { vec3cpy(out, in); } @@ -265,7 +265,7 @@ static void _readVertexData4ubRevARGB(const GLubyte* __restrict__ input, GLubyte argbcpy(output, input); } -static void _readVertexData4fRevARGB(const GLubyte* in, GLubyte* output) { +static void _readVertexData4fRevARGB(const GLubyte* __restrict__ in, GLubyte* __restrict__ output) { const float* input = (const float*) in; output[0] = (GLubyte) clamp(input[0] * 255.0f, 0, 255); @@ -286,12 +286,12 @@ static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restri *((V*) out) = NegZ; } -static void _fillWhiteARGB(const GLubyte* input, GLubyte* output) { +static void _fillWhiteARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) { _GL_UNUSED(input); *((uint32_t*) output) = ~0; } -static void _fillZero2f(const GLubyte* input, GLubyte* out) { +static void _fillZero2f(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) { _GL_UNUSED(input); memset(out, sizeof(float) * 2, 0); } @@ -616,21 +616,28 @@ ReadNormalFunc calcReadNormalFunc() { } } -GL_FORCE_INLINE void _readPositionData(const GLuint first, const GLuint count, const Vertex* output) { +void _readPositionData(const GLuint first, const GLuint count, const Vertex* output) { const GLsizei vstride = (VERTEX_POINTER.stride) ? VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type); const void* vptr = ((GLubyte*) VERTEX_POINTER.ptr + (first * vstride)); ReadDiffuseFunc func = calcReadPositionFunc(); GLubyte* out = (GLubyte*) output[0].xyz; + uint32_t* flags; ITERATE(count) { func(vptr, out); vptr += vstride; + + /* Set the flags which are 4 bytes before the position. Doing it here saves + * an additional loop */ + flags = (uint32_t*) out - 1; + *flags = GPU_CMD_VERTEX; + out += sizeof(Vertex); } } -GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, const Vertex* output) { +void _readUVData(const GLuint first, const GLuint count, const Vertex* output) { const GLsizei uvstride = (UV_POINTER.stride) ? UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type); const void* uvptr = ((GLubyte*) UV_POINTER.ptr + (first * uvstride)); @@ -644,7 +651,7 @@ GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, const V } } -GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, const VertexExtra* extra) { +void _readSTData(const GLuint first, const GLuint count, const VertexExtra* extra) { const GLsizei ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); const void* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride)); @@ -658,7 +665,7 @@ GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, const V } } -GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, const VertexExtra* extra) { +void _readNormalData(const GLuint first, const GLuint count, const VertexExtra* extra) { const GLsizei nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); const void* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride)); @@ -689,7 +696,7 @@ GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, con } } -GL_FORCE_INLINE void _readDiffuseData(const GLuint first, const GLuint count, const Vertex* output) { +void _readDiffuseData(const GLuint first, const GLuint count, const Vertex* output) { const GLuint size = (DIFFUSE_POINTER.size == GL_BGRA) ? 4 : DIFFUSE_POINTER.size; const GLuint cstride = (DIFFUSE_POINTER.stride) ? DIFFUSE_POINTER.stride : size * byte_size(DIFFUSE_POINTER.type); const GLubyte* cptr = ((GLubyte*) DIFFUSE_POINTER.ptr) + (first * cstride); @@ -767,7 +774,7 @@ static void generateElements( static const uint32_t FAST_PATH_BYTE_SIZE = (sizeof(GLfloat) * 3) + (sizeof(GLfloat) * 2) + (sizeof(GLubyte) * 4); -static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first, const GLuint count, const GLenum type) { +static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first, const GLuint count) { Vertex* start = _glSubmissionTargetStart(target); /* Copy the pos, uv and color directly in one go */ const GLubyte* pos = VERTEX_POINTER.ptr; @@ -785,21 +792,13 @@ static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first _readSTData(first, count, ve); } -static void generateArrays(SubmissionTarget* target, const GLsizei first, const GLuint count, const GLenum type) { +static void generateArrays(SubmissionTarget* target, const GLsizei first, const GLuint count) { Vertex* start = _glSubmissionTargetStart(target); + VertexExtra* ve = aligned_vector_at(target->extras, 0); + _readPositionData(first, count, start); _readDiffuseData(first, count, start); _readUVData(first, count, start); - - Vertex* it = _glSubmissionTargetStart(target); - - ITERATE(count) { - it->flags = GPU_CMD_VERTEX; - ++it; - } - - VertexExtra* ve = aligned_vector_at(target->extras, 0); - _readNormalData(first, count, ve); _readSTData(first, count, ve); } @@ -812,9 +811,9 @@ static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei if(indices) { generateElements(target, first, count, indices, type); } else if(FAST_PATH_ENABLED) { - generateArraysFastPath(target, first, count, type); + generateArraysFastPath(target, first, count); } else { - generateArrays(target, first, count, type); + generateArrays(target, first, count); } Vertex* it = _glSubmissionTargetStart(target); diff --git a/containers/aligned_vector.h b/containers/aligned_vector.h index 53128b8..c99bbf8 100644 --- a/containers/aligned_vector.h +++ b/containers/aligned_vector.h @@ -16,16 +16,22 @@ typedef struct { #define ALIGNED_VECTOR_CHUNK_SIZE 256u +#define AV_NO_INSTRUMENT inline __attribute__((no_instrument_function)) +#define AV_INLINE_DEBUG AV_NO_INSTRUMENT __attribute__((always_inline)) +#define AV_FORCE_INLINE static AV_INLINE_DEBUG + void aligned_vector_init(AlignedVector* vector, unsigned int element_size); void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count); void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count); void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count); -static inline void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) { + +AV_FORCE_INLINE void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) { assert(index < vector->size); return &vector->data[index * vector->element_size]; } void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count); -static inline void aligned_vector_clear(AlignedVector* vector){ + +AV_FORCE_INLINE void aligned_vector_clear(AlignedVector* vector){ vector->size = 0; } void aligned_vector_shrink_to_fit(AlignedVector* vector); From e1e3eaf51b36bbd8ecaeef4beff1d55613a64a4f Mon Sep 17 00:00:00 2001 From: Luke Benstead Date: Tue, 20 Apr 2021 19:27:16 +0100 Subject: [PATCH 2/6] Testing with batched arrays --- GL/draw.c | 50 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/GL/draw.c b/GL/draw.c index 1f0914e..873c7f3 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -616,11 +616,10 @@ ReadNormalFunc calcReadNormalFunc() { } } -void _readPositionData(const GLuint first, const GLuint count, const Vertex* output) { +static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) { const GLsizei vstride = (VERTEX_POINTER.stride) ? VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type); const void* vptr = ((GLubyte*) VERTEX_POINTER.ptr + (first * vstride)); - ReadDiffuseFunc func = calcReadPositionFunc(); GLubyte* out = (GLubyte*) output[0].xyz; uint32_t* flags; @@ -637,11 +636,10 @@ void _readPositionData(const GLuint first, const GLuint count, const Vertex* out } } -void _readUVData(const GLuint first, const GLuint count, const Vertex* output) { +static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count, const Vertex* output) { const GLsizei uvstride = (UV_POINTER.stride) ? UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type); const void* uvptr = ((GLubyte*) UV_POINTER.ptr + (first * uvstride)); - ReadUVFunc func = calcReadUVFunc(); GLubyte* out = (GLubyte*) output[0].uv; ITERATE(count) { @@ -651,11 +649,10 @@ void _readUVData(const GLuint first, const GLuint count, const Vertex* output) { } } -void _readSTData(const GLuint first, const GLuint count, const VertexExtra* extra) { +static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) { const GLsizei ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); const void* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride)); - ReadUVFunc func = calcReadSTFunc(); GLubyte* out = (GLubyte*) extra[0].st; ITERATE(count) { @@ -665,11 +662,10 @@ void _readSTData(const GLuint first, const GLuint count, const VertexExtra* extr } } -void _readNormalData(const GLuint first, const GLuint count, const VertexExtra* extra) { +static void _readNormalData(ReadNormalFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) { const GLsizei nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); const void* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride)); - ReadNormalFunc func = calcReadNormalFunc(); GLubyte* out = (GLubyte*) extra[0].nxyz; ITERATE(count) { @@ -696,12 +692,11 @@ void _readNormalData(const GLuint first, const GLuint count, const VertexExtra* } } -void _readDiffuseData(const GLuint first, const GLuint count, const Vertex* output) { +static void _readDiffuseData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) { const GLuint size = (DIFFUSE_POINTER.size == GL_BGRA) ? 4 : DIFFUSE_POINTER.size; const GLuint cstride = (DIFFUSE_POINTER.stride) ? DIFFUSE_POINTER.stride : size * byte_size(DIFFUSE_POINTER.type); const GLubyte* cptr = ((GLubyte*) DIFFUSE_POINTER.ptr) + (first * cstride); - ReadDiffuseFunc func = calcReadDiffuseFunc(); GLubyte* out = (GLubyte*) output[0].bgra; ITERATE(count) { @@ -788,19 +783,40 @@ static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first VertexExtra* ve = aligned_vector_at(target->extras, 0); - _readNormalData(first, count, ve); - _readSTData(first, count, ve); + ReadNormalFunc nfunc = calcReadNormalFunc(); + ReadUVFunc stfunc = calcReadSTFunc(); + + _readNormalData(nfunc, first, count, ve); + _readSTData(stfunc, first, count, ve); } +#define BATCH_SIZE 32 + static void generateArrays(SubmissionTarget* target, const GLsizei first, const GLuint count) { Vertex* start = _glSubmissionTargetStart(target); VertexExtra* ve = aligned_vector_at(target->extras, 0); - _readPositionData(first, count, start); - _readDiffuseData(first, count, start); - _readUVData(first, count, start); - _readNormalData(first, count, ve); - _readSTData(first, count, ve); + GLsizei s = first; + GLuint e = s + BATCH_SIZE; + + ReadPositionFunc pfunc = calcReadPositionFunc(); + ReadDiffuseFunc dfunc = calcReadDiffuseFunc(); + ReadUVFunc uvfunc = calcReadUVFunc(); + ReadNormalFunc nfunc = calcReadNormalFunc(); + ReadUVFunc stfunc = calcReadSTFunc(); + + do { + _readPositionData(pfunc, s, BATCH_SIZE, start); + _readDiffuseData(dfunc, s, BATCH_SIZE, start); + _readUVData(uvfunc, s, BATCH_SIZE, start); + _readNormalData(nfunc, s, BATCH_SIZE, ve); + _readSTData(stfunc, s, BATCH_SIZE, ve); + + s = e; + e += BATCH_SIZE; + start += BATCH_SIZE; + ve += BATCH_SIZE; + } while (s < count); } static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei first, const GLuint count, From bfca6fd8b6b459300624669d6dd0621d2effdb94 Mon Sep 17 00:00:00 2001 From: Luke Benstead Date: Tue, 20 Apr 2021 19:52:49 +0100 Subject: [PATCH 3/6] Revert batching --- GL/draw.c | 30 +++++++++--------------------- 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/GL/draw.c b/GL/draw.c index 873c7f3..780197d 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -618,7 +618,7 @@ ReadNormalFunc calcReadNormalFunc() { static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) { const GLsizei vstride = (VERTEX_POINTER.stride) ? VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type); - const void* vptr = ((GLubyte*) VERTEX_POINTER.ptr + (first * vstride)); + const GLubyte* vptr = ((GLubyte*) VERTEX_POINTER.ptr + (first * vstride)); GLubyte* out = (GLubyte*) output[0].xyz; uint32_t* flags; @@ -638,7 +638,7 @@ static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GL static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count, const Vertex* output) { const GLsizei uvstride = (UV_POINTER.stride) ? UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type); - const void* uvptr = ((GLubyte*) UV_POINTER.ptr + (first * uvstride)); + const GLubyte* uvptr = ((GLubyte*) UV_POINTER.ptr + (first * uvstride)); GLubyte* out = (GLubyte*) output[0].uv; @@ -651,7 +651,7 @@ static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count, static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) { const GLsizei ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); - const void* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride)); + const GLubyte* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride)); GLubyte* out = (GLubyte*) extra[0].st; @@ -664,7 +664,7 @@ static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count, static void _readNormalData(ReadNormalFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) { const GLsizei nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); - const void* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride)); + const GLubyte* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride)); GLubyte* out = (GLubyte*) extra[0].nxyz; @@ -790,33 +790,21 @@ static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first _readSTData(stfunc, first, count, ve); } -#define BATCH_SIZE 32 - static void generateArrays(SubmissionTarget* target, const GLsizei first, const GLuint count) { Vertex* start = _glSubmissionTargetStart(target); VertexExtra* ve = aligned_vector_at(target->extras, 0); - GLsizei s = first; - GLuint e = s + BATCH_SIZE; - ReadPositionFunc pfunc = calcReadPositionFunc(); ReadDiffuseFunc dfunc = calcReadDiffuseFunc(); ReadUVFunc uvfunc = calcReadUVFunc(); ReadNormalFunc nfunc = calcReadNormalFunc(); ReadUVFunc stfunc = calcReadSTFunc(); - do { - _readPositionData(pfunc, s, BATCH_SIZE, start); - _readDiffuseData(dfunc, s, BATCH_SIZE, start); - _readUVData(uvfunc, s, BATCH_SIZE, start); - _readNormalData(nfunc, s, BATCH_SIZE, ve); - _readSTData(stfunc, s, BATCH_SIZE, ve); - - s = e; - e += BATCH_SIZE; - start += BATCH_SIZE; - ve += BATCH_SIZE; - } while (s < count); + _readPositionData(pfunc, first, count, start); + _readDiffuseData(dfunc, first, count, start); + _readUVData(uvfunc, first, count, start); + _readNormalData(nfunc, first, count, ve); + _readSTData(stfunc, first, count, ve); } static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei first, const GLuint count, From 2547459ef3f0be5bed1874d45d867c4af7d820a4 Mon Sep 17 00:00:00 2001 From: Luke Benstead Date: Wed, 21 Apr 2021 09:11:51 +0100 Subject: [PATCH 4/6] Refactor fast path --- GL/draw.c | 124 ++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 84 insertions(+), 40 deletions(-) diff --git a/GL/draw.c b/GL/draw.c index 780197d..7350324 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -53,39 +53,49 @@ void _glInitAttributePointers() { } GL_FORCE_INLINE GLboolean _glIsVertexDataFastPathCompatible() { - /* - * We provide a "fast path" if vertex data is provided in - * exactly the right format that matches what the PVR can handle. - * This function returns true if all the requirements are met. + /* The fast path is enabled when all enabled elements of the vertex + * match the output format. This means: + * + * xyz == 3f + * uv == 2f + * rgba == argb4444 + * st == 2f + * normal == 3f + * + * When this happens we do inline straight copies of the enabled data + * and transforms for positions and normals happen while copying. */ - /* - * At least these attributes need to be enabled, because we're not going to do any checking - * in the loop - */ - if((ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG) != VERTEX_ENABLED_FLAG) return GL_FALSE; - if((ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) != UV_ENABLED_FLAG) return GL_FALSE; - if((ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) != DIFFUSE_ENABLED_FLAG) return GL_FALSE; + if((ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG)) { + if(VERTEX_POINTER.size != 3 || VERTEX_POINTER.type != GL_FLOAT) { + return GL_FALSE; + } + } - // All 3 attribute types must have a stride of 32 - if(VERTEX_POINTER.stride != 32) return GL_FALSE; - if(UV_POINTER.stride != 32) return GL_FALSE; - if(DIFFUSE_POINTER.stride != 32) return GL_FALSE; + if((ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG)) { + if(UV_POINTER.size != 2 || UV_POINTER.type != GL_FLOAT) { + return GL_FALSE; + } + } - // UV must follow vertex, diffuse must follow UV - if((UV_POINTER.ptr - VERTEX_POINTER.ptr) != sizeof(GLfloat) * 3) return GL_FALSE; - if((DIFFUSE_POINTER.ptr - UV_POINTER.ptr) != sizeof(GLfloat) * 2) return GL_FALSE; + if((ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG)) { + /* FIXME: Shouldn't this be a reversed format? */ + if(DIFFUSE_POINTER.size != GL_BGRA || DIFFUSE_POINTER.type != GL_UNSIGNED_BYTE) { + return GL_FALSE; + } + } - if(VERTEX_POINTER.type != GL_FLOAT) return GL_FALSE; - if(VERTEX_POINTER.size != 3) return GL_FALSE; + if((ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG)) { + if(ST_POINTER.size != 2 || ST_POINTER.type != GL_FLOAT) { + return GL_FALSE; + } + } - if(UV_POINTER.type != GL_FLOAT) return GL_FALSE; - if(UV_POINTER.size != 2) return GL_FALSE; - - if(DIFFUSE_POINTER.type != GL_UNSIGNED_BYTE) return GL_FALSE; - - /* BGRA is the required color order */ - if(DIFFUSE_POINTER.size != GL_BGRA) return GL_FALSE; + if((ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG)) { + if(NORMAL_POINTER.size != 3 || NORMAL_POINTER.type != GL_FLOAT) { + return GL_FALSE; + } + } return GL_TRUE; } @@ -767,27 +777,61 @@ static void generateElements( } } -static const uint32_t FAST_PATH_BYTE_SIZE = (sizeof(GLfloat) * 3) + (sizeof(GLfloat) * 2) + (sizeof(GLubyte) * 4); - static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first, const GLuint count) { Vertex* start = _glSubmissionTargetStart(target); + + const GLuint vstride = (VERTEX_POINTER.stride) ? + VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type); + + const GLuint uvstride = (UV_POINTER.stride) ? + UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type); + + const GLuint ststride = (ST_POINTER.stride) ? + ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); + + const GLuint dstride = (DIFFUSE_POINTER.stride) ? + DIFFUSE_POINTER.stride : DIFFUSE_POINTER.size * byte_size(DIFFUSE_POINTER.type); + + const GLuint nstride = (NORMAL_POINTER.stride) ? + NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); + + /* Copy the pos, uv and color directly in one go */ - const GLubyte* pos = VERTEX_POINTER.ptr; + const GLubyte* pos = VERTEX_POINTER.ptr + (first * vstride); + const GLubyte* uv = UV_POINTER.ptr + (first * uvstride); + const GLubyte* col = DIFFUSE_POINTER.ptr + (first * dstride); + const GLubyte* st = ST_POINTER.ptr + (first * ststride); + const GLubyte* n = NORMAL_POINTER.ptr + (first * nstride); + + typedef struct { + float x, y, z; + } V3; + + typedef struct { + float u, v; + } V2; + + VertexExtra* ve = aligned_vector_at(target->extras, 0); Vertex* it = start; ITERATE(count) { it->flags = GPU_CMD_VERTEX; - MEMCPY4(it->xyz, pos, FAST_PATH_BYTE_SIZE); + + *((V3*) it->xyz) = *((V3*) pos); + *((V2*) it->uv) = *((V2*) uv); + *((uint32_t*) it->bgra) = *((uint32_t*) col); + + *((V2*) ve->st) = *((V2*) st); + *((V3*) ve->nxyz) = *((V3*) n); + it++; - pos += VERTEX_POINTER.stride; + ve++; + + pos += vstride; + uv += uvstride; + col += dstride; + st += ststride; + n += nstride; } - - VertexExtra* ve = aligned_vector_at(target->extras, 0); - - ReadNormalFunc nfunc = calcReadNormalFunc(); - ReadUVFunc stfunc = calcReadSTFunc(); - - _readNormalData(nfunc, first, count, ve); - _readSTData(stfunc, first, count, ve); } static void generateArrays(SubmissionTarget* target, const GLsizei first, const GLuint count) { From 26c9a454e48cb1f8feb98b2c64f622185a31254c Mon Sep 17 00:00:00 2001 From: Luke Benstead Date: Wed, 21 Apr 2021 15:34:28 +0100 Subject: [PATCH 5/6] More performance work --- GL/draw.c | 68 +++++++++++++++++++++++++++------------------- GL/platforms/sh4.h | 19 +++++++++++++ 2 files changed, 59 insertions(+), 28 deletions(-) diff --git a/GL/draw.c b/GL/draw.c index 7350324..5491e39 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -797,40 +797,49 @@ static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first /* Copy the pos, uv and color directly in one go */ - const GLubyte* pos = VERTEX_POINTER.ptr + (first * vstride); - const GLubyte* uv = UV_POINTER.ptr + (first * uvstride); - const GLubyte* col = DIFFUSE_POINTER.ptr + (first * dstride); - const GLubyte* st = ST_POINTER.ptr + (first * ststride); - const GLubyte* n = NORMAL_POINTER.ptr + (first * nstride); - - typedef struct { - float x, y, z; - } V3; - - typedef struct { - float u, v; - } V2; + const GLubyte* pos = (ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG) ? VERTEX_POINTER.ptr + (first * vstride) : NULL; + const GLubyte* uv = (ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) ? UV_POINTER.ptr + (first * uvstride) : NULL; + const GLubyte* col = (ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) ? DIFFUSE_POINTER.ptr + (first * dstride) : NULL; + const GLubyte* st = (ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) ? ST_POINTER.ptr + (first * ststride) : NULL; + const GLubyte* n = (ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) ? NORMAL_POINTER.ptr + (first * nstride) : NULL; VertexExtra* ve = aligned_vector_at(target->extras, 0); Vertex* it = start; - ITERATE(count) { + + const float w = 1.0f; + + uint32_t i = count; + + while(i--) { it->flags = GPU_CMD_VERTEX; - *((V3*) it->xyz) = *((V3*) pos); - *((V2*) it->uv) = *((V2*) uv); - *((uint32_t*) it->bgra) = *((uint32_t*) col); + if(pos) { + TransformVertex((const float*) pos, &w, it->xyz, &it->w); + pos += vstride; + } - *((V2*) ve->st) = *((V2*) st); - *((V3*) ve->nxyz) = *((V3*) n); + if(uv) { + MEMCPY4(it->uv, uv, sizeof(float) * 2); + uv += uvstride; + } + + if(col) { + MEMCPY4(it->bgra, col, sizeof(uint32_t)); + col += dstride; + } + + if(st) { + MEMCPY4(ve->st, st, sizeof(float) * 2); + st += ststride; + } + + if(n) { + MEMCPY4(ve->nxyz, n, sizeof(float) * 3); + n += nstride; + } it++; ve++; - - pos += vstride; - uv += uvstride; - col += dstride; - st += ststride; - n += nstride; } } @@ -890,8 +899,6 @@ static void transform(SubmissionTarget* target) { /* Perform modelview transform, storing W */ Vertex* vertex = _glSubmissionTargetStart(target); - _glApplyRenderMatrix(); /* Apply the Render Matrix Stack */ - TransformVertices(vertex, target->count); } @@ -1095,13 +1102,18 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL /* Make room for the vertices and header */ aligned_vector_extend(&target->output->vector, target->count + 1); + + _glApplyRenderMatrix(); /* Apply the Render Matrix Stack */ + generate(target, mode, first, count, (GLubyte*) indices, type); if(doLighting){ light(target); } - transform(target); + if(!FAST_PATH_ENABLED) { + transform(target); + } if(_glIsClippingEnabled()) { #if DEBUG_CLIPPING diff --git a/GL/platforms/sh4.h b/GL/platforms/sh4.h index d84b25e..b1b6823 100644 --- a/GL/platforms/sh4.h +++ b/GL/platforms/sh4.h @@ -69,6 +69,25 @@ inline void TransformVec4(float* x) { } +static inline void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) { + register float __x __asm__("fr12") = (xyz[0]); + register float __y __asm__("fr13") = (xyz[1]); + register float __z __asm__("fr14") = (xyz[2]); + register float __w __asm__("fr15") = (*w); + + __asm__ __volatile__( + "fldi1 fr15\n" + "ftrv xmtrx,fv12\n" + : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w) + : "0" (__x), "1" (__y), "2" (__z), "3" (__w) + ); + + oxyz[0] = __x; + oxyz[1] = __y; + oxyz[2] = __z; + *ow = __w; +} + static inline void TransformVertices(Vertex* vertices, const int count) { Vertex* it = vertices; for(int i = 0; i < count; ++i, ++it) { From f7424ea5bd6913d6fd93ae4134e3880f984ea042 Mon Sep 17 00:00:00 2001 From: Luke Benstead Date: Wed, 21 Apr 2021 17:21:03 +0100 Subject: [PATCH 6/6] Implement TransformVertex for PC --- GL/platforms/software.c | 15 +++++++++++++++ GL/platforms/software.h | 1 + 2 files changed, 16 insertions(+) diff --git a/GL/platforms/software.c b/GL/platforms/software.c index 7e904a0..40ccf8f 100644 --- a/GL/platforms/software.c +++ b/GL/platforms/software.c @@ -328,3 +328,18 @@ void TransformVertices(Vertex* vertices, const int count) { vertices->w = ret[3]; } } + +void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) { + float ret[4]; + ret[0] = xyz[0]; + ret[1] = xyz[1]; + ret[2] = xyz[2]; + ret[3] = *w; + + TransformVec4(ret); + + oxyz[0] = ret[0]; + oxyz[1] = ret[1]; + oxyz[2] = ret[2]; + *ow = ret[3]; +} diff --git a/GL/platforms/software.h b/GL/platforms/software.h index 47fa9a6..e3a3a03 100644 --- a/GL/platforms/software.h +++ b/GL/platforms/software.h @@ -50,6 +50,7 @@ static inline void TransformNormalNoMod(const float* xIn, float* xOut) { } void TransformVertices(Vertex* vertices, const int count); +void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow); void InitGPU(_Bool autosort, _Bool fsaa);