Add a basic profiler and optimise some code

2018-08-16 17:51:15 +01:00 · 2018-08-16 17:51:15 +01:00 · ba66608a96
commit ba66608a96
parent 7d1b57fb12
8 changed files with 536 additions and 172 deletions
--- a/GL/clip.c
+++ b/GL/clip.c
@ -7,6 +7,7 @@
 #define PVR_PACK_COLOR(a, r, g, b) {}
 #endif

+#include "profiler.h"
 #include "clip.h"
 #include "../containers/aligned_vector.h"

@ -21,6 +22,7 @@ void enableClipping(unsigned char v) {
    ZCLIP_ENABLED = v;
 }

+void clipLineToNearZ(const ClipVertex* v1, const ClipVertex* v2, ClipVertex* vout, float* t) __attribute__((optimize("fast-math")));
 void clipLineToNearZ(const ClipVertex* v1, const ClipVertex* v2, ClipVertex* vout, float* t) {
    const float NEAR_PLANE = 0.2; // FIXME: this needs to be read from the projection matrix.. somehow

@ -38,13 +40,13 @@ static inline void interpolateFloat(const float v1, const float v2, const float
    *out = (v * t) + v1;
 }

-static void interpolateVec2(const float* v1, const float* v2, const float t, float* out) {
+static inline void interpolateVec2(const float* v1, const float* v2, const float t, float* out) {
    /* FIXME: SH4 has an asm instruction for this */
    interpolateFloat(v1[0], v2[0], t, &out[0]);
    interpolateFloat(v1[1], v2[1], t, &out[1]);
 }

-static void interpolateVec3(const float* v1, const float* v2, const float t, float* out) {
+static inline void interpolateVec3(const float* v1, const float* v2, const float t, float* out) {
    /* FIXME: SH4 has an asm instruction for this */

    interpolateFloat(v1[0], v2[0], t, &out[0]);
@ -52,7 +54,7 @@ static void interpolateVec3(const float* v1, const float* v2, const float t, flo
    interpolateFloat(v1[2], v2[2], t, &out[2]);
 }

-static void interpolateVec4(const float* v1, const float* v2, const float t, float* out) {
+static inline void interpolateVec4(const float* v1, const float* v2, const float t, float* out) {
    /* FIXME: SH4 has an asm instruction for this */
    interpolateFloat(v1[0], v2[0], t, &out[0]);
    interpolateFloat(v1[1], v2[1], t, &out[1]);
@ -81,29 +83,31 @@ void clipTriangleStrip(AlignedVector* vertices, AlignedVector* outBuffer) {
    uint32_t i;
    uint32_t stripCount = 2; /* The number of vertices in the source strip so far */

+    ClipVertex* thisVertex = aligned_vector_at(vertices, 1);
+
    for(i = 2; i < vertices->size; ++i) {
+        ++thisVertex;
+
        if(stripCount < 2) {
            stripCount++;
            continue;
        }

-        ClipVertex* thisVertex = aligned_vector_at(vertices, i);
-
-        ClipVertex* sourceTriangle[3] = {
-            aligned_vector_at(vertices, i - 2),
-            aligned_vector_at(vertices, i - 1),
+        const ClipVertex* sourceTriangle[3] = {
+            thisVertex - 2,
+            thisVertex - 1,
            thisVertex
        };

        /* If we're on an odd vertex, we need to swap the order of the first two vertices, as that's what
         * triangle strips do */
-        uint8_t swap = stripCount > 2 && (stripCount % 2 != 0);
-        ClipVertex* v1 = swap ? sourceTriangle[1] : sourceTriangle[0];
-        ClipVertex* v2 = swap ? sourceTriangle[0] : sourceTriangle[1];
-        ClipVertex* v3 = sourceTriangle[2];
+        uint32_t swap = stripCount > 2 && (stripCount % 2 != 0);
+        const ClipVertex* v1 = swap ? sourceTriangle[1] : sourceTriangle[0];
+        const ClipVertex* v2 = swap ? sourceTriangle[0] : sourceTriangle[1];
+        const ClipVertex* v3 = sourceTriangle[2];

-        uint8_t visible = ((v1->w > 0) ? 4 : 0) | ((v2->w > 0) ? 2 : 0) | ((v3->w > 0) ? 1 : 0);
-        uint8_t startOfStrip = (i == 2) || (outBuffer->size > 2 && ((ClipVertex*) aligned_vector_back(outBuffer))->flags == VERTEX_CMD_EOL);
+        uint32_t visible = ((v1->w > 0) ? 4 : 0) | ((v2->w > 0) ? 2 : 0) | ((v3->w > 0) ? 1 : 0);
+        uint32_t startOfStrip = (i == 2) || (outBuffer->size > 2 && ((ClipVertex*) aligned_vector_back(outBuffer))->flags == VERTEX_CMD_EOL);

        /* All visible, we're fine! */
        if(visible == 0b111) {
--- a/GL/draw.c
+++ b/GL/draw.c
@ -6,6 +6,7 @@
 #include "../include/gl.h"
 #include "../include/glext.h"
 #include "private.h"
+#include "profiler.h"

 typedef struct {
    const void* ptr;
@ -59,7 +60,7 @@ void initAttributePointers() {
    NORMAL_POINTER.size = 3;
 }

-static GLuint byte_size(GLenum type) {
+static inline GLuint byte_size(GLenum type) {
    switch(type) {
    case GL_BYTE: return sizeof(GLbyte);
    case GL_UNSIGNED_BYTE: return sizeof(GLubyte);
@ -73,73 +74,121 @@ static GLuint byte_size(GLenum type) {
    }
 }

-static void _parseColour(float* out, const GLubyte* in, GLint size, GLenum type) {
-    const float ONE_OVER_255 = 1.0f / 255.0f;
+typedef void (*FloatParseFunc)(GLfloat* out, const GLubyte* in);
+typedef void (*PolyBuildFunc)(ClipVertex* first, ClipVertex* previous, ClipVertex* vertex, ClipVertex* next, const GLsizei i);

-    switch(type) {
-    case GL_BYTE: {
-    case GL_UNSIGNED_BYTE:
+/* Converts the three consecutive GLshort values at `in` to GLfloat and stores
+ * them in out[0..2]. */
+static inline void _parseVec3FromShort3(GLfloat* out, const GLubyte* in) {
+    const GLshort* src = (const GLshort*) in;
+    int c;
+
+    for(c = 0; c < 3; ++c) {
+        out[c] = (GLfloat) src[c];
+    }
+}
+
+/* Widens the three GLint values found at `in` to GLfloat, writing out[0],
+ * out[1] and out[2] in that order. */
+static inline void _parseVec3FromInt3(GLfloat* out, const GLubyte* in) {
+    const GLint* src = (const GLint*) in;
+    const GLint* const end = src + 3;
+
+    while(src != end) {
+        *out++ = (GLfloat) *src++;
+    }
+}
+
+/* Copies three GLfloat components from `in` to out[0..2] unchanged. */
+static inline void _parseVec3FromFloat3(GLfloat* out, const GLubyte* in) {
+    const GLfloat* src = (const GLfloat*) in;
+    int c;
+
+    for(c = 0; c < 3; ++c) {
+        out[c] = src[c];
+    }
+}
+
+/* Copies two GLfloat components from `in` to out[0..1] unchanged. */
+static inline void _parseVec2FromFloat2(GLfloat* out, const GLubyte* in) {
+    const GLfloat* src = (const GLfloat*) in;
+    int c;
+
+    for(c = 0; c < 2; ++c) {
+        out[c] = src[c];
+    }
+}
+
+/* Expands a two-component GLfloat input to three components: out[0..1] are
+ * copied from `in`, out[2] is set to zero. */
+static inline void _parseVec3FromFloat2(GLfloat* out, const GLubyte* in) {
+    const GLfloat* src = (const GLfloat*) in;
+    int c;
+
+    for(c = 0; c < 2; ++c) {
+        out[c] = src[c];
+    }
+
+    out[2] = 0.0f;
+}
+
+/* Expands a three-component GLfloat input to four components: out[0..2] are
+ * copied from `in`, out[3] is set to one. */
+static inline void _parseVec4FromFloat3(GLfloat* out, const GLubyte* in) {
+    const GLfloat* src = (const GLfloat*) in;
+    int c;
+
+    for(c = 0; c < 3; ++c) {
+        out[c] = src[c];
+    }
+
+    out[3] = 1.0f;
+}
+
+/* Copies four GLfloat components from `in` to out[0..3] unchanged. */
+static inline void _parseVec4FromFloat4(GLfloat* out, const GLubyte* in) {
+    const GLfloat* src = (const GLfloat*) in;
+    int c;
+
+    for(c = 0; c < 4; ++c) {
+        out[c] = src[c];
+    }
+}
+
+static inline void _parseColourFromUByte4(GLfloat* out, const GLubyte* in) {
+    const float ONE_OVER_255 = 1.0f / 255.0f;
    out[0] = ((GLfloat) in[0]) * ONE_OVER_255;
    out[1] = ((GLfloat) in[1]) * ONE_OVER_255;
    out[2] = ((GLfloat) in[2]) * ONE_OVER_255;
    out[3] = ((GLfloat) in[3]) * ONE_OVER_255;
-    } break;
-    case GL_SHORT:
-    case GL_UNSIGNED_SHORT:
-        /* FIXME!!!! */
-    break;
-    case GL_INT:
-    case GL_UNSIGNED_INT:
-        /* FIXME!!!! */
-    break;
-    case GL_FLOAT:
-    case GL_DOUBLE:
-    default: {
-        out[0] = ((GLfloat*) in)[0];
-        out[1] = ((GLfloat*) in)[1];
-        out[2] = ((GLfloat*) in)[2];
-        out[3] = ((GLfloat*) in)[3];
-    } break;
-    }
 }

-static void _parseFloats(GLfloat* out, const GLubyte* in, GLint size, GLenum type) {
-    GLubyte i;
-
-    switch(type) {
-    case GL_SHORT: {
-        GLshort* inp = (GLshort*) in;
-        for(i = 0; i < size; ++i) {
-            out[i] = (GLfloat) inp[i];
-        }
-    } break;
-    case GL_INT: {
-        GLint* inp = (GLint*) in;
-        for(i = 0; i < size; ++i) {
-            out[i] = (GLfloat) inp[i];
-        }
-    } break;
-    case GL_FLOAT:
-    case GL_DOUBLE:  /* Double == Float */
-        default: {
-            const GLfloat* ptr = (const GLfloat*) in;
-            for(i = 0; i < size; ++i) out[i] = ptr[i];
-        }
-    }
+static inline void _constVec2Zero(GLfloat* out, const GLubyte* in) {
+    out[0] = 0.0f;
+    out[1] = 0.0f;
 }

-static void _parseIndex(GLuint* out, const GLubyte* in, GLenum type) {
+/* Writes the constant vector (0, 0, -1) to out[0..2]; `in` is not read. */
+static inline void _constVec3NegZ(GLfloat* out, const GLubyte* in) {
+    out[0] = out[1] = 0.0f;
+    out[2] = -1.0f;
+}
+
+/* Fills out[0..3] with 1.0f; `in` is not read. */
+static inline void _constVec4One(GLfloat* out, const GLubyte* in) {
+    int c;
+
+    for(c = 0; c < 4; ++c) {
+        out[c] = 1.0f;
+    }
+}
+
+typedef GLuint (*IndexParseFunc)(const GLubyte* in);
+
+/* Returns the single byte at `in` widened to GLuint. */
+static inline GLuint _parseUByteIndex(const GLubyte* in) {
+    const GLubyte value = in[0];
+    return (GLuint) value;
+}
+
+/* Returns the GLuint stored at `in`. */
+static inline GLuint _parseUIntIndex(const GLubyte* in) {
+    const GLuint* src = (const GLuint*) in;
+    return src[0];
+}
+
+/* Returns the 16-bit index stored at `in`, widened to GLuint.
+ * The data is GL_UNSIGNED_SHORT, so it has to be read as unsigned: reading it
+ * through a GLshort pointer sign-extends any index above 32767 into a huge
+ * GLuint and sends the vertex fetch far outside the client arrays. */
+static inline GLuint _parseUShortIndex(const GLubyte* in) {
+    return (GLuint) *((const GLushort*) in);
+}
+
+
+static inline IndexParseFunc _calcParseIndexFunc(GLenum type) {
    switch(type) {
    case GL_UNSIGNED_BYTE:
-        *out = (GLuint) *in;
+        return &_parseUByteIndex;
    break;
    case GL_UNSIGNED_INT:
-        *out = *((GLuint*) in);
+        return &_parseUIntIndex;
    break;
    case GL_UNSIGNED_SHORT:
    default:
-        *out = *((GLshort*) in);
+        break;
    }
+
+    return &_parseUShortIndex;
 }


@ -187,89 +236,160 @@ static inline void transformNormalToEyeSpace(GLfloat* normal) {
    mat_trans_normal3(normal[0], normal[1], normal[2]);
 }

-static void swapVertex(ClipVertex* v1, ClipVertex* v2) {
-    ClipVertex tmp = *v1;
+static inline void swapVertex(ClipVertex* v1, ClipVertex* v2) {
+    static ClipVertex tmp;
+
+    tmp = *v1;
    *v1 = *v2;
    *v2 = tmp;
 }

-static void generate(AlignedVector* output, const GLenum mode, const GLsizei first, const GLsizei count,
-        const GLubyte* indices, const GLenum type,
-        const GLubyte* vptr, const GLubyte vstride, const GLubyte* cptr, const GLubyte cstride,
-        const GLubyte* uvptr, const GLubyte uvstride, const GLubyte* stptr, const GLubyte ststride,
-        const GLubyte* nptr, const GLubyte nstride) {
-    /* Read from the client buffers and generate an array of ClipVertices */
-
-    GLsizei max = first + count;
-
-    GLsizei spaceNeeded = (mode == GL_POLYGON || mode == GL_TRIANGLE_FAN) ? ((count - 2) * 3) : count;
-
-    /* Make sure we have room for the output */
-    aligned_vector_resize(output, spaceNeeded);
-
-    ClipVertex* vertex = (ClipVertex*) output->data;
-
-    GLsizei j;
-    GLsizei i = 0;
-    for(j = first; j < max; ++i, ++j, ++vertex) {
-        vertex->flags = PVR_CMD_VERTEX;
-
-        GLuint idx = j;
-        if(indices) {
-            _parseIndex(&idx, &indices[byte_size(type) * j], type);
+static inline FloatParseFunc _calcVertexParseFunc() {
+    switch(VERTEX_POINTER.type) {
+    case GL_SHORT: {
+        if(VERTEX_POINTER.size == 3) {
+            return &_parseVec3FromShort3;
+        }
+    } break;
+    case GL_INT: {
+        if(VERTEX_POINTER.size == 3) {
+            return &_parseVec3FromInt3;
+        }
+    } break;
+    case GL_FLOAT: {
+        if(VERTEX_POINTER.size == 3) {
+            return &_parseVec3FromFloat3;
+        } else if(VERTEX_POINTER.size == 2) {
+            return &_parseVec3FromFloat2;
+        }
+    } break;
+    default:
+        break;
    }

-        _parseFloats(vertex->xyz, vptr + (idx * vstride), VERTEX_POINTER.size, VERTEX_POINTER.type);
-
-        if(ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) {
-            _parseColour(vertex->diffuse, cptr + (idx * cstride), DIFFUSE_POINTER.size, DIFFUSE_POINTER.type);
-        } else {
-            /* Default to white if colours are disabled */
-            vertex->diffuse[0] = vertex->diffuse[1] = vertex->diffuse[2] = vertex->diffuse[3] = 1.0f;
+    return NULL;
 }

-        if(ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) {
-            _parseFloats(vertex->uv, uvptr + (idx * uvstride), UV_POINTER.size, UV_POINTER.type);
-        } else {
-            vertex->uv[0] = vertex->uv[1] = 0.0f;
+static inline FloatParseFunc _calcDiffuseParseFunc() {
+    if((ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) != DIFFUSE_ENABLED_FLAG) {
+        return &_constVec4One;
    }

-        if(ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) {
-            _parseFloats(vertex->st, stptr + (idx * ststride), ST_POINTER.size, ST_POINTER.type);
-        } else {
-            vertex->st[0] = vertex->st[1] = 0.0f;
+    switch(DIFFUSE_POINTER.type) {
+    case GL_BYTE:
+    case GL_UNSIGNED_BYTE: {
+        if(DIFFUSE_POINTER.size == 4) {
+            return &_parseColourFromUByte4;
+        }
+    } break;
+    case GL_INT: {
+        if(DIFFUSE_POINTER.size == 3) {
+            return &_parseVec3FromInt3;
+        }
+    } break;
+    case GL_FLOAT: {
+        if(DIFFUSE_POINTER.size == 3) {
+            return &_parseVec4FromFloat3;
+        } else if(DIFFUSE_POINTER.size == 4) {
+            return &_parseVec4FromFloat4;
+        }
+    } break;
+    default:
+        break;
    }

-        if(ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) {
-            _parseFloats(vertex->nxyz, nptr + (idx * nstride), NORMAL_POINTER.size, NORMAL_POINTER.type);
-        } else {
-            vertex->nxyz[0] = 0.0f;
-            vertex->nxyz[1] = 0.0f;
-            vertex->nxyz[2] = -1.0f;
+    return &_constVec4One;
 }

-        switch(mode) {
-        case GL_TRIANGLES: {
+static inline FloatParseFunc _calcUVParseFunc() {
+    if((ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) != UV_ENABLED_FLAG) {
+        return &_constVec2Zero;
+    }
+
+    switch(UV_POINTER.type) {
+    case GL_FLOAT: {
+        if(UV_POINTER.size == 2) {
+            return &_parseVec2FromFloat2;
+        }
+    } break;
+    default:
+        break;
+    }
+
+    return &_constVec2Zero;
+}
+
+static inline FloatParseFunc _calcSTParseFunc() {
+    if((ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) != ST_ENABLED_FLAG) {
+        return &_constVec2Zero;
+    }
+
+    switch(ST_POINTER.type) {
+    case GL_FLOAT: {
+        if(ST_POINTER.size == 2) {
+            return &_parseVec2FromFloat2;
+        }
+    } break;
+    default:
+        break;
+    }
+
+    return &_constVec2Zero;
+}
+
+static inline FloatParseFunc _calcNormalParseFunc() {
+    if((ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) != NORMAL_ENABLED_FLAG) {
+        return &_constVec3NegZ;
+    }
+
+    switch(NORMAL_POINTER.type) {
+    case GL_SHORT: {
+        if(NORMAL_POINTER.size == 3) {
+            return &_parseVec3FromShort3;
+        }
+    } break;
+    case GL_INT: {
+        if(NORMAL_POINTER.size == 3) {
+            return &_parseVec3FromInt3;
+        }
+    } break;
+    case GL_FLOAT: {
+        if(NORMAL_POINTER.size == 3) {
+            return &_parseVec3FromFloat3;
+        } else if(NORMAL_POINTER.size == 2) {
+            return &_parseVec3FromFloat2;
+        }
+    } break;
+    default:
+        break;
+    }
+
+    return &_constVec3NegZ;
+}
+
+
+static void _buildTriangle(ClipVertex* first, ClipVertex* previous, ClipVertex* vertex, ClipVertex* next, const GLsizei i) {
    if(((i + 1) % 3) == 0) {
        vertex->flags = PVR_CMD_VERTEX_EOL;
    }
-        } break;
-        case GL_QUADS: {
-            if(((i + 1) % 4) == 0) {
-                ClipVertex* previous = vertex - 1;
+}
+
+/* Returns `input` unchanged when it is below `ceil`, otherwise input % ceil;
+ * the division is skipped for values that are already below `ceil`. */
+static inline GLsizei fast_mod(const GLsizei input, const GLsizei ceil) {
+    if(input < ceil) {
+        return input;
+    }
+
+    return input % ceil;
+}
+
+static void _buildQuad(ClipVertex* first, ClipVertex* previous, ClipVertex* vertex, ClipVertex* next, const GLsizei i) {
+    if((i + 1) % 4 == 0) {
        previous->flags = PVR_CMD_VERTEX_EOL;
        swapVertex(previous, vertex);
    }
-        } break;
-        case GL_POLYGON:
-        case GL_TRIANGLE_FAN: {
-            ClipVertex* previous = vertex - 1;
+}
+
+static void _buildTriangleFan(ClipVertex* first, ClipVertex* previous, ClipVertex* vertex, ClipVertex* next, const GLsizei i) {
    if(i == 2) {
        swapVertex(previous, vertex);
        vertex->flags = PVR_CMD_VERTEX_EOL;
    } else if(i > 2) {
-                ClipVertex* first = (ClipVertex*) output->data;
-                ClipVertex* previous = vertex - 1;
        ClipVertex* next = vertex + 1;

        *next = *first;
@ -281,16 +401,99 @@ static void generate(AlignedVector* output, const GLenum mode, const GLsizei fir

        vertex->flags = PVR_CMD_VERTEX_EOL;
    }
-        } break;
-        case GL_TRIANGLE_STRIP:
-        default: {
-            if(j == (max - 1)) {
+}
+
+static void _buildStrip(ClipVertex* first, ClipVertex* previous, ClipVertex* vertex, ClipVertex* next, const GLsizei i) {
+    if(!next) {
        /* If the mode was triangle strip, then the last vertex is the last vertex */
        vertex->flags = PVR_CMD_VERTEX_EOL;
    }
 }

+static inline PolyBuildFunc _calcBuildFunc(const GLenum type) {
+    switch(type) {
+    case GL_TRIANGLES:
+        return &_buildTriangle;
+    break;
+    case GL_QUADS:
+        return &_buildQuad;
+    break;
+    case GL_TRIANGLE_FAN:
+    case GL_POLYGON:
+        return &_buildTriangleFan;
+    break;
+    default:
+        break;
    }
+
+    return &_buildStrip;
+}
+
+typedef struct { /* client array bases and strides handed to generate(); it reads element idx at ptr + idx * stride */
+    const GLubyte* vptr; /* position data, parsed into vertex->xyz */
+    const GLuint vstride; /* byte step between consecutive positions */
+    const GLubyte* cptr; /* colour data, parsed into vertex->diffuse */
+    const GLuint cstride; /* byte step between consecutive colours */
+    const GLubyte* uvptr; /* texture coordinate data, parsed into vertex->uv */
+    const GLuint uvstride; /* byte step between consecutive uv entries */
+    const GLubyte* stptr; /* second coordinate set, parsed into vertex->st */
+    const GLuint ststride; /* byte step between consecutive st entries */
+    const GLubyte* nptr; /* normal data, parsed into vertex->nxyz */
+    const GLuint nstride; /* byte step between consecutive normals */
+} GenerateParams;
+
+static void generate(AlignedVector* output, const GLenum mode, const GLsizei first, const GLsizei count,
+        const GLubyte* indices, const GLenum type, const GenerateParams* pointers) {
+    /* Read from the client buffers and generate an array of ClipVertices */
+
+    const GLsizei max = first + count;
+    const GLsizei spaceNeeded = (mode == GL_POLYGON || mode == GL_TRIANGLE_FAN) ? ((count - 2) * 3) : count;
+
+    /* Make sure we have room for the output */
+    ClipVertex* vertex = aligned_vector_resize(output, spaceNeeded);
+
+    const FloatParseFunc vertexFunc = _calcVertexParseFunc();
+    const FloatParseFunc diffuseFunc = _calcDiffuseParseFunc();
+    const FloatParseFunc uvFunc = _calcUVParseFunc();
+    const FloatParseFunc stFunc = _calcSTParseFunc();
+    const FloatParseFunc normalFunc = _calcNormalParseFunc();
+
+    const PolyBuildFunc buildFunc = _calcBuildFunc(mode);
+    const IndexParseFunc indexFunc = _calcParseIndexFunc(type);
+
+    const GLsizei type_byte_size = byte_size(type);
+
+    ClipVertex* previous = NULL;
+    ClipVertex* firstV = vertex;
+    ClipVertex* next = NULL;
+
+    GLsizei i;
+
+    for(i = first; i < max; ++i, ++vertex) {
+        vertex->flags = PVR_CMD_VERTEX;
+
+        const GLuint idx = (indices) ?
+            indexFunc(&indices[type_byte_size * i]) : i;
+
+        const GLubyte* vin = pointers->vptr + (idx * pointers->vstride);
+        const GLubyte* din = pointers->cptr + (idx * pointers->cstride);
+        const GLubyte* uin = pointers->uvptr + (idx * pointers->uvstride);
+        const GLubyte* sin = pointers->stptr + (idx * pointers->ststride);
+        const GLubyte* nin = pointers->nptr + (idx * pointers->nstride);
+
+        vertexFunc(vertex->xyz, vin);
+        diffuseFunc(vertex->diffuse, din);
+        uvFunc(vertex->uv, uin);
+        stFunc(vertex->st, sin);
+        normalFunc(vertex->nxyz, nin);
+    }
+
+    vertex = firstV;
+
+    for(i = 0; i < count; ++i, ++vertex) {
+        next = (i < count - 1) ? vertex + 1 : NULL;
+        previous = (i > 0) ? vertex - 1 : NULL;
+        buildFunc(firstV, previous, vertex, next, i);
    }
 }

@ -334,7 +537,7 @@ static void clip(AlignedVector* vertices) {
    }

    /* Make sure we allocate roughly enough space */
-    aligned_vector_reserve(CLIP_BUFFER, vertices->size);
+    aligned_vector_reserve(CLIP_BUFFER, vertices->size * 1.5);

    /* Start from empty */
    aligned_vector_resize(CLIP_BUFFER, 0);
@ -537,29 +740,38 @@ static void submitVertices(GLenum mode, GLsizei first, GLsizei count, GLenum typ
        aligned_vector_resize(buffer, 0);
    }

-    GLubyte vstride = (VERTEX_POINTER.stride) ? VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type);
+    const GLuint vstride = (VERTEX_POINTER.stride) ? VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type);
    const GLubyte* vptr = VERTEX_POINTER.ptr;

-    GLubyte cstride = (DIFFUSE_POINTER.stride) ? DIFFUSE_POINTER.stride : DIFFUSE_POINTER.size * byte_size(DIFFUSE_POINTER.type);
+    const GLuint cstride = (DIFFUSE_POINTER.stride) ? DIFFUSE_POINTER.stride : DIFFUSE_POINTER.size * byte_size(DIFFUSE_POINTER.type);
    const GLubyte* cptr = DIFFUSE_POINTER.ptr;

-    GLubyte uvstride = (UV_POINTER.stride) ? UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type);
+    const GLuint uvstride = (UV_POINTER.stride) ? UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type);
    const GLubyte* uvptr = UV_POINTER.ptr;

-    GLubyte ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
+    const GLuint ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
    const GLubyte* stptr = ST_POINTER.ptr;

-    GLubyte nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
+    const GLuint nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
    const GLubyte* nptr = NORMAL_POINTER.ptr;

-    generate(
-        buffer, mode, first, count, (GLubyte*) indices, type,
-        vptr, vstride, cptr, cstride,
-        uvptr, uvstride, stptr, ststride,
-        nptr, nstride
-    );
+    GenerateParams params = {
+        .vptr = vptr,
+        .vstride = vstride,
+        .cptr = cptr,
+        .cstride = cstride,
+        .uvptr = uvptr,
+        .uvstride = uvstride,
+        .stptr = stptr,
+        .ststride = ststride,
+        .nptr = nptr,
+        .nstride = nstride
+    };
+
+    generate(buffer, mode, first, count, (GLubyte*) indices, type, &params);

    light(buffer);
+
    transform(buffer);

    if(isClippingEnabled()) {
@ -567,6 +779,7 @@ static void submitVertices(GLenum mode, GLsizei first, GLsizei count, GLenum typ
    }

    divide(buffer);
+
    push(buffer, activePolyList(), 0);

    /*
--- a/GL/flush.c
+++ b/GL/flush.c
@ -4,6 +4,7 @@

 #include "../containers/aligned_vector.h"
 #include "private.h"
+#include "profiler.h"

 #define TA_SQ_ADDR (unsigned int *)(void *) \
    (0xe0000000 | (((unsigned long)0x10000000) & 0x03ffffe0))
--- a/GL/profiler.c
+++ b/GL/profiler.c
@ -0,0 +1,123 @@
+#include <kos.h>
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "profiler.h"
+#include "../containers/aligned_vector.h"
+
+#define MAX_PATH 256
+
+typedef struct { /* accumulated timing for one named path */
+    char name[MAX_PATH]; /* lookup key; compared with strcmp() in profiler_get_or_create_result() */
+
+    uint64_t total_time_us; /* sum of the elapsed times added by profiler_checkpoint() */
+    uint64_t total_calls; /* incremented once per profiler_checkpoint() on this path */
+} ProfilerResult;
+
+typedef struct { /* global profiler state; allocated on the first profiler_push() */
+    AlignedVector stack; /* active Profiler entries, innermost last */
+    AlignedVector results; /* one ProfilerResult per distinct path seen so far */
+    uint64_t start_time_in_us; /* NOTE(review): not read or written anywhere in this file */
+} RootProfiler;
+
+
+static RootProfiler* root = NULL;
+
+/* Returns the result entry stored under `name`, appending a zeroed entry to
+ * root->results when no entry has that name yet. Assumes `root` has already
+ * been set up by profiler_push(). Names that do not fit in MAX_PATH - 1
+ * characters are truncated; lookups compare only that many characters so a
+ * truncated entry is found again instead of being duplicated. */
+static ProfilerResult* profiler_get_or_create_result(const char* name) {
+    /* unsigned int rather than uint16_t: a 16-bit counter wraps at 65536 and
+     * would never reach a larger results.size */
+    unsigned int i = 0;
+    for(; i < root->results.size; ++i) {
+        ProfilerResult* result = aligned_vector_at(&root->results, i);
+        if(strncmp(result->name, name, sizeof(result->name) - 1) == 0) {
+            return result;
+        }
+    }
+
+    ProfilerResult newResult;
+    /* Bounded copy: strcpy() would write past name[MAX_PATH] for long names */
+    snprintf(newResult.name, sizeof(newResult.name), "%s", name);
+    newResult.total_calls = 0;
+    newResult.total_time_us = 0;
+    aligned_vector_push_back(&root->results, &newResult, 1);
+    return aligned_vector_back(&root->results);
+}
+
+static uint64_t current_time_in_us() { /* single place where the profiler reads the clock */
+    return timer_us_gettime64(); /* presumably microseconds, going by the names -- confirm against KOS */
+}
+
+/* Appends the dotted names of every profiler on the stack to `path`, followed
+ * by ":<suffix>" when `suffix` is non-empty (e.g. "outer.inner:step").
+ * `path` must hold a NUL-terminated string in a buffer of MAX_PATH bytes, as
+ * the one caller in this file provides; output that does not fit is truncated
+ * rather than written past the end of the buffer. */
+static void profiler_generate_path(const char* suffix, char* path) {
+    size_t used = strlen(path);
+    if(used >= MAX_PATH) {
+        return;
+    }
+
+    unsigned int i = 0;
+    for(; i < root->stack.size; ++i) {
+        const Profiler* prof = aligned_vector_at(&root->stack, i);
+        const char* separator = (i != root->stack.size - 1) ? "." : "";
+
+        /* The precision caps the read at sizeof(prof->name), so a name that
+         * fills the whole array without a terminator is still safe */
+        int written = snprintf(
+            path + used, MAX_PATH - used, "%.*s%s",
+            (int) sizeof(prof->name), prof->name, separator
+        );
+
+        if(written < 0 || (size_t) written >= MAX_PATH - used) {
+            /* Buffer is full (or formatting failed): keep what fits */
+            return;
+        }
+
+        used += (size_t) written;
+    }
+
+    if(suffix[0] != '\0') {
+        snprintf(path + used, MAX_PATH - used, ":%s", suffix);
+    }
+}
+
+
+/* Pushes a new profiler named `name` onto the profiler stack and starts its
+ * clock. The global state is created on the first call.
+ * Returns a pointer to the stacked entry, or NULL when the global state could
+ * not be allocated. Names longer than sizeof(Profiler.name) - 1 characters
+ * are truncated. */
+Profiler* profiler_push(const char* name) {
+    if(!root) {
+        root = (RootProfiler*) malloc(sizeof(RootProfiler));
+        if(!root) {
+            /* Out of memory: leave profiling disabled instead of crashing */
+            return NULL;
+        }
+
+        aligned_vector_init(
+            &root->stack,
+            sizeof(Profiler)
+        );
+
+        aligned_vector_init(
+            &root->results,
+            sizeof(ProfilerResult)
+        );
+
+        aligned_vector_reserve(&root->stack, 32);
+        aligned_vector_reserve(&root->results, 64);
+    }
+
+    Profiler profiler;
+    /* strncpy() does not terminate when the source fills the buffer, so copy
+     * one byte less and terminate explicitly */
+    strncpy(profiler.name, name, sizeof(profiler.name) - 1);
+    profiler.name[sizeof(profiler.name) - 1] = '\0';
+    profiler.start_time_in_us = current_time_in_us();
+
+    aligned_vector_push_back(&root->stack, &profiler, 1);
+    return aligned_vector_back(&root->stack);
+}
+
+/* Adds the time elapsed since the innermost profiler was pushed (or last
+ * checkpointed) to the result stored under "<stack path>:<name>", counts one
+ * call for it, and restarts that profiler's clock.
+ * Does nothing when no profiler is currently pushed. */
+void profiler_checkpoint(const char* name) {
+    /* Without a pushed profiler there is no start time to measure from */
+    if(!root || root->stack.size == 0) {
+        return;
+    }
+
+    Profiler* prof = aligned_vector_back(&root->stack);
+
+    char path[MAX_PATH];
+    path[0] = '\0';
+
+    profiler_generate_path(name, path);
+
+    uint64_t now = current_time_in_us();
+    uint64_t diff = now - prof->start_time_in_us;
+    prof->start_time_in_us = now;
+
+    ProfilerResult* result = profiler_get_or_create_result(path);
+    result->total_calls++;
+    result->total_time_us += diff;
+}
+
+/* Removes the innermost profiler from the stack. An unmatched pop (nothing
+ * pushed) is ignored: size - 1 on an empty stack would wrap around and
+ * resize the stack to a huge element count. */
+void profiler_pop() {
+    if(!root || root->stack.size == 0) {
+        return;
+    }
+
+    aligned_vector_resize(&root->stack, root->stack.size - 1);
+}
+
+/* Prints one line per recorded path to stderr: average time per call, total
+ * time (both in milliseconds, converted from the stored microseconds) and the
+ * call count. Only the header is printed when nothing was ever profiled. */
+void profiler_print_stats() {
+    fprintf(stderr, "%-60s%-20s%-20s%-20s\n", "Path", "Average", "Total", "Calls");
+
+    if(!root) {
+        return;
+    }
+
+    unsigned int i = 0;
+    for(; i < root->results.size; ++i) {
+        ProfilerResult* result = aligned_vector_at(&root->results, i);
+        float ms = ((float) result->total_time_us) / 1000.0f;
+        float avg = ms / (float) result->total_calls;
+
+        /* total_calls is 64-bit; passing it to %d is undefined behaviour */
+        fprintf(stderr, "%-60s%-20f%-20f%llu\n", result->name, avg, ms, (unsigned long long) result->total_calls);
+    }
+}
--- a/GL/profiler.h
+++ b/GL/profiler.h
@ -0,0 +1,15 @@
+#pragma once
+
+#include <stdint.h>
+
+typedef struct { /* one entry on the profiler stack */
+    char name[64]; /* label passed to profiler_push() */
+    uint64_t start_time_in_us; /* set by profiler_push(), reset at every profiler_checkpoint() */
+} Profiler;
+
+
+Profiler* profiler_push(const char* name);
+void profiler_checkpoint(const char* name);
+void profiler_pop();
+
+void profiler_print_stats();
--- a/2
+++ b/2
@ -7,7 +7,7 @@

 TARGET = libGLdc.a
 OBJS = GL/draw.o GL/flush.o GL/framebuffer.o GL/immediate.o GL/lighting.o GL/state.o GL/texture.o GL/glu.o
-OBJS += GL/matrix.o GL/fog.o GL/error.o GL/clip.o containers/stack.o containers/named_array.o containers/aligned_vector.o
+OBJS += GL/matrix.o GL/fog.o GL/error.o GL/clip.o containers/stack.o containers/named_array.o containers/aligned_vector.o GL/profiler.o

 SUBDIRS =

--- a/containers/aligned_vector.c
+++ b/containers/aligned_vector.c
@ -50,11 +50,13 @@ void aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned
    memcpy(dest, objs, vector->element_size * count);
 }

-void aligned_vector_resize(AlignedVector* vector, const unsigned int element_count) {
+void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count) {
+    unsigned int previousCount = vector->size;
+
    /* Don't change memory when resizing downwards, just change the size */
    if(element_count <= vector->size) {
        vector->size = element_count;
-        return;
+        return NULL;
    }

    if(vector->capacity < element_count) {
@ -64,6 +66,12 @@ void aligned_vector_resize(AlignedVector* vector, const unsigned int element_cou
    }

    vector->size = element_count;
+
+    if(previousCount < vector->size) {
+        return aligned_vector_at(vector, previousCount);
+    } else {
+        return NULL;
+    }
 }

 void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
--- a/containers/aligned_vector.h
+++ b/containers/aligned_vector.h
@ -17,7 +17,7 @@ typedef struct {
 void aligned_vector_init(AlignedVector* vector, unsigned int element_size);
 void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
 void aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count);
-void aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
+void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
 void* aligned_vector_at(const AlignedVector* vector, const unsigned int index);
 void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count);
 void aligned_vector_clear(AlignedVector* vector);