From 18759422ea2d126b3938e4a3cfc8f850befbadf8 Mon Sep 17 00:00:00 2001
From: Luke Benstead <kazade@gmail.com>
Date: Mon, 8 Oct 2018 22:03:50 +0100
Subject: [PATCH 1/5] Optimise non-indexed rendering paths

---
 GL/draw.c                           | 348 +++++++++++++++++++++-------
 samples/Makefile                    |   1 +
 samples/trimark/Makefile            |  29 +++
 samples/trimark/main.c              | 174 ++++++++++++++
 samples/trimark/romdisk/PLACEHOLDER |   0
 5 files changed, 466 insertions(+), 86 deletions(-)
 create mode 100644 samples/trimark/Makefile
 create mode 100644 samples/trimark/main.c
 create mode 100644 samples/trimark/romdisk/PLACEHOLDER

diff --git a/GL/draw.c b/GL/draw.c
index 6ba523b..50f301d 100644
--- a/GL/draw.c
+++ b/GL/draw.c
@@ -460,6 +460,178 @@ static inline PolyBuildFunc _calcBuildFunc(const GLenum type) {
     return &_buildStrip;
 }
 
+static inline void genArraysCommon(
+    ClipVertex* output,
+    GLsizei count,
+    const GLubyte* vptr, GLuint vstride,
+    const GLubyte* cptr, GLuint cstride,
+    const GLubyte* uvptr, GLuint uvstride,
+    const GLubyte* stptr, GLuint ststride,
+    const GLubyte* nptr, GLuint nstride,
+    GLboolean doTexture, GLboolean doMultitexture, GLboolean doLighting
+) {
+    const FloatParseFunc vertexFunc = _calcVertexParseFunc();
+    const ByteParseFunc diffuseFunc = _calcDiffuseParseFunc();
+    const FloatParseFunc uvFunc = _calcUVParseFunc();
+    const FloatParseFunc stFunc = _calcSTParseFunc();
+    const FloatParseFunc normalFunc = _calcNormalParseFunc();
+
+    GLsizei i = count;
+
+    ClipVertex* vertex = output;
+
+    while(i--) {
+        vertex->flags = PVR_CMD_VERTEX;
+        vertexFunc(vertex->xyz, vptr);
+        vptr += vstride;
+        vertex++;
+    }
+
+    i = count;
+    vertex = output;
+    while(i--) {
+        diffuseFunc(vertex->bgra, cptr);
+        cptr += cstride;
+        vertex++;
+    }
+
+    if(doTexture) {
+        i = count;
+        vertex = output;
+        while(i--) {
+            uvFunc(vertex->uv, uvptr);
+            uvptr += uvstride;
+            vertex++;
+        }
+    }
+
+    if(doMultitexture) {
+        i = count;
+        vertex = output;
+        while(i--) {
+            stFunc(vertex->st, stptr);
+            stptr += ststride;
+            ++vertex;
+        }
+    }
+
+    if(doLighting) {
+        i = count;
+        vertex = output;
+        while(i--) {
+            normalFunc(vertex->nxyz, nptr);
+            nptr += nstride;
+            ++vertex;
+        }
+    }
+}
+
+
+static inline void genArraysTriangles(
+    ClipVertex* output,
+    GLsizei count,
+    const GLubyte* vptr, GLuint vstride,
+    const GLubyte* cptr, GLuint cstride,
+    const GLubyte* uvptr, GLuint uvstride,
+    const GLubyte* stptr, GLuint ststride,
+    const GLubyte* nptr, GLuint nstride,
+    GLboolean doTexture, GLboolean doMultitexture, GLboolean doLighting) {
+
+    genArraysCommon(
+        output, count,
+        vptr, vstride, cptr, cstride, uvptr, uvstride, stptr, ststride, nptr, nstride,
+        doTexture, doMultitexture, doLighting
+    );
+
+    GLsizei i = count;
+    ClipVertex* vertex = output;
+    for(i = 2; i < count; i += 3) {
+        vertex[i].flags = PVR_CMD_VERTEX_EOL;
+    }
+}
+
+static void genArraysQuads(
+    ClipVertex* output,
+    GLsizei count,
+    const GLubyte* vptr, GLuint vstride,
+    const GLubyte* cptr, GLuint cstride,
+    const GLubyte* uvptr, GLuint uvstride,
+    const GLubyte* stptr, GLuint ststride,
+    const GLubyte* nptr, GLuint nstride,
+    GLboolean doTexture, GLboolean doMultitexture, GLboolean doLighting) {
+
+    genArraysCommon(
+        output, count,
+        vptr, vstride, cptr, cstride, uvptr, uvstride, stptr, ststride, nptr, nstride,
+        doTexture, doMultitexture, doLighting
+    );
+
+    GLsizei i = count;
+    ClipVertex* vertex = output;
+
+    for(i = 3; i < count; i += 4) {
+        swapVertex(&vertex[i], &vertex[i - 1]);
+        vertex[i].flags = PVR_CMD_VERTEX_EOL;
+    }
+}
+
+static void genArraysTriangleStrip(
+    ClipVertex* output,
+    GLsizei count,
+    const GLubyte* vptr, GLuint vstride,
+    const GLubyte* cptr, GLuint cstride,
+    const GLubyte* uvptr, GLuint uvstride,
+    const GLubyte* stptr, GLuint ststride,
+    const GLubyte* nptr, GLuint nstride,
+    GLboolean doTexture, GLboolean doMultitexture, GLboolean doLighting) {
+
+    genArraysCommon(
+        output, count,
+        vptr, vstride, cptr, cstride, uvptr, uvstride, stptr, ststride, nptr, nstride,
+        doTexture, doMultitexture, doLighting
+    );
+
+    output[count - 1].flags = PVR_CMD_VERTEX_EOL;
+}
+
+static void genArraysTriangleFan(
+    ClipVertex* output,
+    GLsizei count,
+    const GLubyte* vptr, GLuint vstride,
+    const GLubyte* cptr, GLuint cstride,
+    const GLubyte* uvptr, GLuint uvstride,
+    const GLubyte* stptr, GLuint ststride,
+    const GLubyte* nptr, GLuint nstride,
+    GLboolean doTexture, GLboolean doMultitexture, GLboolean doLighting) {
+
+    genArraysCommon(
+        output, count,
+        vptr, vstride, cptr, cstride, uvptr, uvstride, stptr, ststride, nptr, nstride,
+        doTexture, doMultitexture, doLighting
+    );
+
+    swapVertex(&output[1], &output[2]);
+    output[2].flags = PVR_CMD_VERTEX_EOL;
+
+    GLsizei i = 3;
+    ClipVertex* first = &output[0];
+
+    for(; i < count - 1; ++i) {
+        ClipVertex* next = &output[i + 1];
+        ClipVertex* previous = &output[i - 1];
+        ClipVertex* vertex = &output[i];
+
+        *next = *first;
+
+        swapVertex(next, vertex);
+
+        vertex = next + 1;
+        *vertex = *previous;
+
+        vertex->flags = PVR_CMD_VERTEX_EOL;
+    }
+}
+
 static void generate(ClipVertex* output, const GLenum mode, const GLsizei first, const GLsizei count,
         const GLubyte* indices, const GLenum type, const GLboolean doTexture, const GLboolean doMultitexture, const GLboolean doLighting) {
     /* Read from the client buffers and generate an array of ClipVertices */
@@ -470,6 +642,65 @@ static void generate(ClipVertex* output, const GLenum mode, const GLsizei first,
     const GLuint ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
     const GLuint nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
 
+    if(!indices) {
+        const GLubyte* vptr = VERTEX_POINTER.ptr + (first * vstride);
+        const GLubyte* cptr = DIFFUSE_POINTER.ptr + (first * cstride);
+        const GLubyte* uvptr = UV_POINTER.ptr + (first * uvstride);
+        const GLubyte* stptr = ST_POINTER.ptr + (first * ststride);
+        const GLubyte* nptr = NORMAL_POINTER.ptr + (first * nstride);
+
+        // Drawing arrays
+        switch(mode) {
+        case GL_TRIANGLES:
+            genArraysTriangles(
+                output,
+                count,
+                vptr, vstride,
+                cptr, cstride,
+                uvptr, uvstride,
+                stptr, ststride,
+                nptr, nstride,
+                doTexture, doMultitexture, doLighting
+            );
+        case GL_QUADS:
+            genArraysQuads(
+                output,
+                count,
+                vptr, vstride,
+                cptr, cstride,
+                uvptr, uvstride,
+                stptr, ststride,
+                nptr, nstride,
+                doTexture, doMultitexture, doLighting
+            );
+        case GL_TRIANGLE_FAN:
+            genArraysTriangleFan(
+                output,
+                count,
+                vptr, vstride,
+                cptr, cstride,
+                uvptr, uvstride,
+                stptr, ststride,
+                nptr, nstride,
+                doTexture, doMultitexture, doLighting
+            );
+        case GL_TRIANGLE_STRIP:
+        default:
+            genArraysTriangleStrip(
+                output,
+                count,
+                vptr, vstride,
+                cptr, cstride,
+                uvptr, uvstride,
+                stptr, ststride,
+                nptr, nstride,
+                doTexture, doMultitexture, doLighting
+            );
+        }
+        return;
+    }
+
+
     const GLsizei max = first + count;
 
     ClipVertex* vertex = output;
@@ -494,102 +725,47 @@ static void generate(ClipVertex* output, const GLenum mode, const GLsizei first,
     GLsizei i, j = 0;
     GLuint idx;
 
-    if(!indices) {
-        GLubyte* vptr = VERTEX_POINTER.ptr + (first * vstride);
-        GLubyte* cptr = DIFFUSE_POINTER.ptr + (first * cstride);
-        GLubyte* uvptr = UV_POINTER.ptr + (first * uvstride);
-        GLubyte* stptr = ST_POINTER.ptr + (first * ststride);
-        GLubyte* nptr = NORMAL_POINTER.ptr + (first * nstride);
-
-        for(j = 0; j < count; ++j, ++vertex) {
-            if(mode == GL_QUADS) {
-                /* Performance optimisation to prevent copying to a temporary */
-                GLsizei mod = (j + 1) % 4;
-                if(mod == 0) {
-                    target = vertex - 1;
-                    target->flags = PVR_CMD_VERTEX;
-                } else if(mod == 3) {
-                    target = vertex + 1;
-                    target->flags = PVR_CMD_VERTEX_EOL;
-                } else {
-                    target = vertex;
-                    target->flags = PVR_CMD_VERTEX;
-                }
+    for(i = first; i < max; ++i, ++j, ++vertex) {
+        if(mode == GL_QUADS) {
+            /* Performance optimisation to prevent copying to a temporary */
+            GLsizei mod = (j + 1) % 4;
+            if(mod == 0) {
+                target = vertex - 1;
+                target->flags = PVR_CMD_VERTEX;
+            } else if(mod == 3) {
+                target = vertex + 1;
+                target->flags = PVR_CMD_VERTEX_EOL;
             } else {
                 target = vertex;
                 target->flags = PVR_CMD_VERTEX;
             }
-
-            vertexFunc(target->xyz, vptr);
-            diffuseFunc(target->bgra, cptr);
-            vptr += vstride;
-            cptr += cstride;
-
-            if(doTexture) {
-                uvFunc(target->uv, uvptr);
-                uvptr += uvstride;
-            }
-
-            if(doMultitexture) {
-                stFunc(target->st, stptr);
-                stptr += ststride;
-            }
-
-            if(doLighting) {
-                normalFunc(target->nxyz, nptr);
-                nptr += nstride;
-            }
-
-            if(mode != GL_QUADS) {
-                next = (j < count - 1) ? vertex + 1 : NULL;
-                previous = (j > 0) ? vertex - 1 : NULL;
-                buildFunc(firstV, previous, vertex, next, j);
-            }
+        } else {
+            target = vertex;
+            target->flags = PVR_CMD_VERTEX;
         }
 
-    } else {
-        for(i = first; i < max; ++i, ++j, ++vertex) {
-            if(mode == GL_QUADS) {
-                /* Performance optimisation to prevent copying to a temporary */
-                GLsizei mod = (j + 1) % 4;
-                if(mod == 0) {
-                    target = vertex - 1;
-                    target->flags = PVR_CMD_VERTEX;
-                } else if(mod == 3) {
-                    target = vertex + 1;
-                    target->flags = PVR_CMD_VERTEX_EOL;
-                } else {
-                    target = vertex;
-                    target->flags = PVR_CMD_VERTEX;
-                }
-            } else {
-                target = vertex;
-                target->flags = PVR_CMD_VERTEX;
-            }
+        idx = (indices) ?
+            indexFunc(&indices[type_byte_size * i]) : i;
 
-            idx = (indices) ?
-                indexFunc(&indices[type_byte_size * i]) : i;
+        vertexFunc(target->xyz, VERTEX_POINTER.ptr + (idx * vstride));
+        diffuseFunc(target->bgra, DIFFUSE_POINTER.ptr + (idx * cstride));
 
-            vertexFunc(target->xyz, VERTEX_POINTER.ptr + (idx * vstride));
-            diffuseFunc(target->bgra, DIFFUSE_POINTER.ptr + (idx * cstride));
+        if(doTexture) {
+            uvFunc(target->uv, UV_POINTER.ptr + (idx * uvstride));
+        }
 
-            if(doTexture) {
-                uvFunc(target->uv, UV_POINTER.ptr + (idx * uvstride));
-            }
+        if(doMultitexture) {
+            stFunc(target->st, ST_POINTER.ptr + (idx * ststride));
+        }
 
-            if(doMultitexture) {
-                stFunc(target->st, ST_POINTER.ptr + (idx * ststride));
-            }
+        if(doLighting) {
+            normalFunc(target->nxyz, NORMAL_POINTER.ptr + (idx * nstride));
+        }
 
-            if(doLighting) {
-                normalFunc(target->nxyz, NORMAL_POINTER.ptr + (idx * nstride));
-            }
-
-            if(mode != GL_QUADS) {
-                next = (j < count - 1) ? vertex + 1 : NULL;
-                previous = (j > 0) ? vertex - 1 : NULL;
-                buildFunc(firstV, previous, vertex, next, j);
-            }
+        if(mode != GL_QUADS) {
+            next = (j < count - 1) ? vertex + 1 : NULL;
+            previous = (j > 0) ? vertex - 1 : NULL;
+            buildFunc(firstV, previous, vertex, next, j);
         }
     }
 }
@@ -827,7 +1003,7 @@ static void submitVertices(GLenum mode, GLsizei first, GLsizei count, GLenum typ
 
         /* Clipping may have realloc'd so reset the start pointer */
         start = ((ClipVertex*) activeList->vector.data) + startOffset;
-        header = start - 1;  /* Update the header pointer */
+        header = (PVRHeader*) (start - 1);  /* Update the header pointer */
 
 #if DEBUG_CLIPPING
         fprintf(stderr, "--------\n");
diff --git a/samples/Makefile b/samples/Makefile
index eabf377..8d94f2f 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -19,4 +19,5 @@ all:
 	$(KOS_MAKE) -C zclip_trianglestrip all
 	$(KOS_MAKE) -C terrain all
 	$(KOS_MAKE) -C quadmark all
+	$(KOS_MAKE) -C trimark all
 	$(KOS_MAKE) -C multitexture_arrays all
diff --git a/samples/trimark/Makefile b/samples/trimark/Makefile
new file mode 100644
index 0000000..8b18757
--- /dev/null
+++ b/samples/trimark/Makefile
@@ -0,0 +1,29 @@
+TARGET = trimark.elf
+OBJS = main.o
+
+all: rm-elf $(TARGET)
+
+include $(KOS_BASE)/Makefile.rules
+
+clean:
+	-rm -f $(TARGET) $(OBJS) romdisk.*
+
+rm-elf:
+	-rm -f $(TARGET) romdisk.*
+
+$(TARGET): $(OBJS) romdisk.o
+	$(KOS_CC) $(KOS_CFLAGS) $(KOS_LDFLAGS) -o $(TARGET) $(KOS_START) \
+		$(OBJS) romdisk.o $(OBJEXTRA) -lm -lkosutils $(KOS_LIBS)
+
+romdisk.img:
+	$(KOS_GENROMFS) -f romdisk.img -d romdisk -v
+
+romdisk.o: romdisk.img
+	$(KOS_BASE)/utils/bin2o/bin2o romdisk.img romdisk romdisk.o
+
+run: $(TARGET)
+	$(KOS_LOADER) $(TARGET)
+
+dist:
+	rm -f $(OBJS) romdisk.o romdisk.img
+	$(KOS_STRIP) $(TARGET)
diff --git a/samples/trimark/main.c b/samples/trimark/main.c
new file mode 100644
index 0000000..3f4a766
--- /dev/null
+++ b/samples/trimark/main.c
@@ -0,0 +1,174 @@
+/*
+   KallistiGL 2.0.0
+
+   trimark.c
+   (c)2018 Luke Benstead
+   (c)2014 Josh Pearson
+   (c)2002 Dan Potter, Paul Boese
+*/
+
+#include <kos.h>
+
+#include <GL/gl.h>
+
+#include <stdlib.h>
+#include <time.h>
+
+enum { PHASE_HALVE, PHASE_INCR, PHASE_DECR, PHASE_FINAL };
+
+int polycnt;
+int phase = PHASE_HALVE;
+float avgfps = -1;
+
+void running_stats() {
+    pvr_stats_t stats;
+    pvr_get_stats(&stats);
+
+    if(avgfps == -1)
+        avgfps = stats.frame_rate;
+    else
+        avgfps = (avgfps + stats.frame_rate) / 2.0f;
+}
+
+void stats() {
+    pvr_stats_t stats;
+
+    pvr_get_stats(&stats);
+    dbglog(DBG_DEBUG, "3D Stats: %d VBLs, frame rate ~%f fps\n",
+           stats.vbl_count, stats.frame_rate);
+}
+
+
+int check_start() {
+    maple_device_t *cont;
+    cont_state_t *state;
+
+    cont = maple_enum_type(0, MAPLE_FUNC_CONTROLLER);
+
+    if(cont) {
+        state = (cont_state_t *)maple_dev_status(cont);
+
+        if(state)
+            return state->buttons & CONT_START;
+    }
+
+    return 0;
+}
+
+pvr_poly_hdr_t hdr;
+
+void setup() {
+    glKosInit();
+    glMatrixMode(GL_MODELVIEW);
+    glLoadIdentity();
+    glOrtho(0, 640, 0, 480, -100, 100);
+    glMatrixMode(GL_PROJECTION);
+    glLoadIdentity();
+}
+
+void do_frame() {
+    int x, y, z;
+    int size;
+    int i;
+    float col;
+
+    glBegin(GL_TRIANGLES);
+
+    for(i = 0; i < polycnt; i++) {
+        x = rand() % 640;
+        y = rand() % 480;
+        z = rand() % 100 + 1;
+        size = rand() % 50 + 1;
+        col = (rand() % 255) * 0.00391f;
+
+        glColor3f(col, col, col);
+        glVertex3f(x - size, y - size, z);
+        glVertex3f(x + size, y - size, z);
+        glVertex3f(x + size, y + size, z);
+    }
+
+    glEnd();
+
+    glKosSwapBuffers();
+}
+
+time_t start;
+void switch_tests(int ppf) {
+    printf("Beginning new test: %d polys per frame (%d per second at 60fps)\n",
+           ppf * 2, ppf * 2 * 60);
+    avgfps = -1;
+    polycnt = ppf;
+}
+
+void check_switch() {
+    time_t now;
+
+    now = time(NULL);
+
+    if(now >= (start + 5)) {
+        start = time(NULL);
+        printf("  Average Frame Rate: ~%f fps (%d pps)\n", avgfps, (int)(polycnt * avgfps * 2));
+
+        switch(phase) {
+            case PHASE_HALVE:
+
+                if(avgfps < 55) {
+                    switch_tests(polycnt / 1.2f);
+                }
+                else {
+                    printf("  Entering PHASE_INCR\n");
+                    phase = PHASE_INCR;
+                }
+
+                break;
+            case PHASE_INCR:
+
+                if(avgfps >= 55) {
+                    switch_tests(polycnt + 15);
+                }
+                else {
+                    printf("  Entering PHASE_DECR\n");
+                    phase = PHASE_DECR;
+                }
+
+                break;
+            case PHASE_DECR:
+
+                if(avgfps < 55) {
+                    switch_tests(polycnt - 30);
+                }
+                else {
+                    printf("  Entering PHASE_FINAL\n");
+                    phase = PHASE_FINAL;
+                }
+
+                break;
+            case PHASE_FINAL:
+                break;
+        }
+    }
+}
+
+int main(int argc, char **argv) {
+    setup();
+
+    /* Start off with something obscene */
+    switch_tests(220000 / 60);
+    start = time(NULL);
+
+    for(;;) {
+        if(check_start())
+            break;
+
+        printf(" \r");
+        do_frame();
+        running_stats();
+        check_switch();
+    }
+
+    stats();
+
+    return 0;
+}
+
+
diff --git a/samples/trimark/romdisk/PLACEHOLDER b/samples/trimark/romdisk/PLACEHOLDER
new file mode 100644
index 0000000..e69de29

From df44c0ea73a683e1bb03bd59903b4286cffb5d2d Mon Sep 17 00:00:00 2001
From: Luke Benstead <kazade@gmail.com>
Date: Tue, 9 Oct 2018 09:27:53 +0100
Subject: [PATCH 2/5] More performance work and fix bugs introduced in last
 commit

---
 GL/draw.c                            | 320 ++++++++++++++++++++-------
 samples/Makefile                     |   1 +
 samples/nehe02de/Makefile            |  29 +++
 samples/nehe02de/main.c              |  94 ++++++++
 samples/nehe02de/romdisk/PLACEHOLDER |   0
 5 files changed, 363 insertions(+), 81 deletions(-)
 create mode 100644 samples/nehe02de/Makefile
 create mode 100644 samples/nehe02de/main.c
 create mode 100644 samples/nehe02de/romdisk/PLACEHOLDER

diff --git a/GL/draw.c b/GL/draw.c
index 50f301d..f2fbcb3 100644
--- a/GL/draw.c
+++ b/GL/draw.c
@@ -460,6 +460,180 @@ static inline PolyBuildFunc _calcBuildFunc(const GLenum type) {
     return &_buildStrip;
 }
 
+static inline void genElementsCommon(
+    ClipVertex* output,
+    const GLubyte* iptr, GLuint istride, GLenum type,
+    GLsizei count,
+    const GLubyte* vptr, GLuint vstride,
+    const GLubyte* cptr, GLuint cstride,
+    const GLubyte* uvptr, GLuint uvstride,
+    const GLubyte* stptr, GLuint ststride,
+    const GLubyte* nptr, GLuint nstride,
+    GLboolean doTexture, GLboolean doMultitexture, GLboolean doLighting
+) {
+    const FloatParseFunc vertexFunc = _calcVertexParseFunc();
+    const ByteParseFunc diffuseFunc = _calcDiffuseParseFunc();
+    const FloatParseFunc uvFunc = _calcUVParseFunc();
+    const FloatParseFunc stFunc = _calcSTParseFunc();
+    const FloatParseFunc normalFunc = _calcNormalParseFunc();
+
+    const IndexParseFunc indexFunc = _calcParseIndexFunc(type);
+
+    GLsizei i = 0;
+    const GLubyte* idx = iptr;
+    ClipVertex* vertex = output;
+
+    for(; i < count; ++i, idx += istride, ++vertex) {
+        GLuint j = indexFunc(idx);
+        vertex->flags = PVR_CMD_VERTEX;
+        vertexFunc(vertex->xyz, vptr + (j * vstride));
+    }
+
+    idx = iptr;
+    vertex = output;
+    for(i = 0; i < count; ++i, idx += istride, ++vertex) {
+        GLuint j = indexFunc(idx);
+        diffuseFunc(vertex->bgra, cptr + (j * cstride));
+    }
+
+    if(doTexture) {
+        idx = iptr;
+        vertex = output;
+        for(i = 0; i < count; ++i, idx += istride, ++vertex) {
+            GLuint j = indexFunc(idx);
+            uvFunc(vertex->uv, uvptr + (j * uvstride));
+        }
+    }
+
+    if(doMultitexture) {
+        idx = iptr;
+        vertex = output;
+        for(i = 0; i < count; ++i, idx += istride, ++vertex) {
+            GLuint j = indexFunc(idx);
+            stFunc(vertex->st, stptr + (j * ststride));
+        }
+    }
+
+    if(doLighting) {
+        idx = iptr;
+        vertex = output;
+        for(i = 0; i < count; ++i, idx += istride, ++vertex) {
+            GLuint j = indexFunc(idx);
+            normalFunc(vertex->nxyz, nptr + (j * nstride));
+        }
+    }
+}
+
+static inline void genElementsTriangles(
+    ClipVertex* output,
+    GLsizei count,
+    const GLubyte* iptr, GLuint istride, GLenum type,
+    const GLubyte* vptr, GLuint vstride,
+    const GLubyte* cptr, GLuint cstride,
+    const GLubyte* uvptr, GLuint uvstride,
+    const GLubyte* stptr, GLuint ststride,
+    const GLubyte* nptr, GLuint nstride,
+    GLboolean doTexture, GLboolean doMultitexture, GLboolean doLighting) {
+
+    genElementsCommon(
+        output,
+        iptr, istride, type, count,
+        vptr, vstride, cptr, cstride, uvptr, uvstride, stptr, ststride, nptr, nstride,
+        doTexture, doMultitexture, doLighting
+    );
+
+    GLsizei i = 2;
+    for(; i < count; i += 3) {
+        output[i].flags = PVR_CMD_VERTEX_EOL;
+    }
+}
+
+static inline void genElementsQuads(
+    ClipVertex* output,
+    GLsizei count,
+    const GLubyte* iptr, GLuint istride, GLenum type,
+    const GLubyte* vptr, GLuint vstride,
+    const GLubyte* cptr, GLuint cstride,
+    const GLubyte* uvptr, GLuint uvstride,
+    const GLubyte* stptr, GLuint ststride,
+    const GLubyte* nptr, GLuint nstride,
+    GLboolean doTexture, GLboolean doMultitexture, GLboolean doLighting) {
+
+    genElementsCommon(
+        output,
+        iptr, istride, type, count,
+        vptr, vstride, cptr, cstride, uvptr, uvstride, stptr, ststride, nptr, nstride,
+        doTexture, doMultitexture, doLighting
+    );
+
+    GLsizei i = 3;
+    for(; i < count; i += 4) {
+        swapVertex(&output[i], &output[i - 1]);
+        output[i].flags = PVR_CMD_VERTEX_EOL;
+    }
+}
+
+static inline void genElementsTriangleFan(
+    ClipVertex* output,
+    GLsizei count,
+    const GLubyte* iptr, GLuint istride, GLenum type,
+    const GLubyte* vptr, GLuint vstride,
+    const GLubyte* cptr, GLuint cstride,
+    const GLubyte* uvptr, GLuint uvstride,
+    const GLubyte* stptr, GLuint ststride,
+    const GLubyte* nptr, GLuint nstride,
+    GLboolean doTexture, GLboolean doMultitexture, GLboolean doLighting) {
+
+    genElementsCommon(
+        output,
+        iptr, istride, type, count,
+        vptr, vstride, cptr, cstride, uvptr, uvstride, stptr, ststride, nptr, nstride,
+        doTexture, doMultitexture, doLighting
+    );
+
+    swapVertex(&output[1], &output[2]);
+    output[2].flags = PVR_CMD_VERTEX_EOL;
+
+    GLsizei i = 3;
+    ClipVertex* first = &output[0];
+
+    for(; i < count - 1; ++i) {
+        ClipVertex* next = &output[i + 1];
+        ClipVertex* previous = &output[i - 1];
+        ClipVertex* vertex = &output[i];
+
+        *next = *first;
+
+        swapVertex(next, vertex);
+
+        vertex = next + 1;
+        *vertex = *previous;
+
+        vertex->flags = PVR_CMD_VERTEX_EOL;
+    }
+}
+
+static inline void genElementsTriangleStrip(
+    ClipVertex* output,
+    GLsizei count,
+    const GLubyte* iptr, GLuint istride, GLenum type,
+    const GLubyte* vptr, GLuint vstride,
+    const GLubyte* cptr, GLuint cstride,
+    const GLubyte* uvptr, GLuint uvstride,
+    const GLubyte* stptr, GLuint ststride,
+    const GLubyte* nptr, GLuint nstride,
+    GLboolean doTexture, GLboolean doMultitexture, GLboolean doLighting) {
+
+    genElementsCommon(
+        output,
+        iptr, istride, type, count,
+        vptr, vstride, cptr, cstride, uvptr, uvstride, stptr, ststride, nptr, nstride,
+        doTexture, doMultitexture, doLighting
+    );
+
+    output[count - 1].flags = PVR_CMD_VERTEX_EOL;
+}
+
 static inline void genArraysCommon(
     ClipVertex* output,
     GLsizei count,
@@ -566,12 +740,14 @@ static void genArraysQuads(
         doTexture, doMultitexture, doLighting
     );
 
-    GLsizei i = count;
-    ClipVertex* vertex = output;
+    GLsizei i = 3;
 
-    for(i = 3; i < count; i += 4) {
-        swapVertex(&vertex[i], &vertex[i - 1]);
-        vertex[i].flags = PVR_CMD_VERTEX_EOL;
+    for(; i < count; i += 4) {
+        ClipVertex* this = output + i;
+        ClipVertex* previous = output + (i - 1);
+
+        swapVertex(previous, this);
+        this->flags = PVR_CMD_VERTEX_EOL;
     }
 }
 
@@ -642,13 +818,14 @@ static void generate(ClipVertex* output, const GLenum mode, const GLsizei first,
     const GLuint ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
     const GLuint nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
 
-    if(!indices) {
-        const GLubyte* vptr = VERTEX_POINTER.ptr + (first * vstride);
-        const GLubyte* cptr = DIFFUSE_POINTER.ptr + (first * cstride);
-        const GLubyte* uvptr = UV_POINTER.ptr + (first * uvstride);
-        const GLubyte* stptr = ST_POINTER.ptr + (first * ststride);
-        const GLubyte* nptr = NORMAL_POINTER.ptr + (first * nstride);
+    const GLubyte* vptr = VERTEX_POINTER.ptr + (first * vstride);
+    const GLubyte* cptr = DIFFUSE_POINTER.ptr + (first * cstride);
+    const GLubyte* uvptr = UV_POINTER.ptr + (first * uvstride);
+    const GLubyte* stptr = ST_POINTER.ptr + (first * ststride);
+    const GLubyte* nptr = NORMAL_POINTER.ptr + (first * nstride);
+    const GLsizei istride = byte_size(type);
 
+    if(!indices) {
         // Drawing arrays
         switch(mode) {
         case GL_TRIANGLES:
@@ -662,6 +839,7 @@ static void generate(ClipVertex* output, const GLenum mode, const GLsizei first,
                 nptr, nstride,
                 doTexture, doMultitexture, doLighting
             );
+            break;
         case GL_QUADS:
             genArraysQuads(
                 output,
@@ -673,6 +851,7 @@ static void generate(ClipVertex* output, const GLenum mode, const GLsizei first,
                 nptr, nstride,
                 doTexture, doMultitexture, doLighting
             );
+            break;
         case GL_TRIANGLE_FAN:
             genArraysTriangleFan(
                 output,
@@ -684,6 +863,7 @@ static void generate(ClipVertex* output, const GLenum mode, const GLsizei first,
                 nptr, nstride,
                 doTexture, doMultitexture, doLighting
             );
+            break;
         case GL_TRIANGLE_STRIP:
         default:
             genArraysTriangleStrip(
@@ -697,76 +877,54 @@ static void generate(ClipVertex* output, const GLenum mode, const GLsizei first,
                 doTexture, doMultitexture, doLighting
             );
         }
-        return;
-    }
-
-
-    const GLsizei max = first + count;
-
-    ClipVertex* vertex = output;
-
-    const FloatParseFunc vertexFunc = _calcVertexParseFunc();
-    const ByteParseFunc diffuseFunc = _calcDiffuseParseFunc();
-    const FloatParseFunc uvFunc = _calcUVParseFunc();
-    const FloatParseFunc stFunc = _calcSTParseFunc();
-    const FloatParseFunc normalFunc = _calcNormalParseFunc();
-
-    const PolyBuildFunc buildFunc = _calcBuildFunc(mode);
-    const IndexParseFunc indexFunc = _calcParseIndexFunc(type);
-
-    const GLsizei type_byte_size = byte_size(type);
-
-    ClipVertex* previous = NULL;
-    ClipVertex* firstV = vertex;
-    ClipVertex* next = NULL;
-
-    ClipVertex* target = NULL;
-
-    GLsizei i, j = 0;
-    GLuint idx;
-
-    for(i = first; i < max; ++i, ++j, ++vertex) {
-        if(mode == GL_QUADS) {
-            /* Performance optimisation to prevent copying to a temporary */
-            GLsizei mod = (j + 1) % 4;
-            if(mod == 0) {
-                target = vertex - 1;
-                target->flags = PVR_CMD_VERTEX;
-            } else if(mod == 3) {
-                target = vertex + 1;
-                target->flags = PVR_CMD_VERTEX_EOL;
-            } else {
-                target = vertex;
-                target->flags = PVR_CMD_VERTEX;
-            }
-        } else {
-            target = vertex;
-            target->flags = PVR_CMD_VERTEX;
-        }
-
-        idx = (indices) ?
-            indexFunc(&indices[type_byte_size * i]) : i;
-
-        vertexFunc(target->xyz, VERTEX_POINTER.ptr + (idx * vstride));
-        diffuseFunc(target->bgra, DIFFUSE_POINTER.ptr + (idx * cstride));
-
-        if(doTexture) {
-            uvFunc(target->uv, UV_POINTER.ptr + (idx * uvstride));
-        }
-
-        if(doMultitexture) {
-            stFunc(target->st, ST_POINTER.ptr + (idx * ststride));
-        }
-
-        if(doLighting) {
-            normalFunc(target->nxyz, NORMAL_POINTER.ptr + (idx * nstride));
-        }
-
-        if(mode != GL_QUADS) {
-            next = (j < count - 1) ? vertex + 1 : NULL;
-            previous = (j > 0) ? vertex - 1 : NULL;
-            buildFunc(firstV, previous, vertex, next, j);
-        }
+    } else if(mode == GL_TRIANGLES) {
+        genElementsTriangles(
+            output,
+            count,
+            indices, istride, type,
+            vptr, vstride,
+            cptr, cstride,
+            uvptr, uvstride,
+            stptr, ststride,
+            nptr, nstride,
+            doTexture, doMultitexture, doLighting
+        );
+    } else if(mode == GL_QUADS) {
+        genElementsQuads(
+            output,
+            count,
+            indices, istride, type,
+            vptr, vstride,
+            cptr, cstride,
+            uvptr, uvstride,
+            stptr, ststride,
+            nptr, nstride,
+            doTexture, doMultitexture, doLighting
+        );
+    } else if(mode == GL_TRIANGLE_FAN) {
+        genElementsTriangleFan(
+            output,
+            count,
+            indices, istride, type,
+            vptr, vstride,
+            cptr, cstride,
+            uvptr, uvstride,
+            stptr, ststride,
+            nptr, nstride,
+            doTexture, doMultitexture, doLighting
+        );
+    } else {
+        genElementsTriangleStrip(
+            output,
+            count,
+            indices, istride, type,
+            vptr, vstride,
+            cptr, cstride,
+            uvptr, uvstride,
+            stptr, ststride,
+            nptr, nstride,
+            doTexture, doMultitexture, doLighting
+        );
     }
 }
 
diff --git a/samples/Makefile b/samples/Makefile
index 8d94f2f..de30006 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -9,6 +9,7 @@ export OBJEXTRA := $(LIB_DIR)/libGLdc.a
 all:
 	$(KOS_MAKE) -C nehe02 all
 	$(KOS_MAKE) -C nehe02va all
+	$(KOS_MAKE) -C nehe02de all
 	$(KOS_MAKE) -C nehe03 all
 	$(KOS_MAKE) -C nehe06 all
 	$(KOS_MAKE) -C nehe06_vq all
diff --git a/samples/nehe02de/Makefile b/samples/nehe02de/Makefile
new file mode 100644
index 0000000..bd09525
--- /dev/null
+++ b/samples/nehe02de/Makefile
@@ -0,0 +1,29 @@
+TARGET = nehe02de.elf
+OBJS = main.o
+
+all: rm-elf $(TARGET)
+
+include $(KOS_BASE)/Makefile.rules
+
+clean:
+	-rm -f $(TARGET) $(OBJS) romdisk.*
+
+rm-elf:
+	-rm -f $(TARGET) romdisk.*
+
+$(TARGET): $(OBJS) romdisk.o
+	$(KOS_CC) $(KOS_CFLAGS) $(KOS_LDFLAGS) -o $(TARGET) $(KOS_START) \
+		$(OBJS) romdisk.o $(OBJEXTRA) -lm -lkosutils $(KOS_LIBS)
+
+romdisk.img:
+	$(KOS_GENROMFS) -f romdisk.img -d romdisk -v
+
+romdisk.o: romdisk.img
+	$(KOS_BASE)/utils/bin2o/bin2o romdisk.img romdisk romdisk.o
+
+run: $(TARGET)
+	$(KOS_LOADER) $(TARGET)
+
+dist:
+	rm -f $(OBJS) romdisk.o romdisk.img
+	$(KOS_STRIP) $(TARGET)
diff --git a/samples/nehe02de/main.c b/samples/nehe02de/main.c
new file mode 100644
index 0000000..9855e54
--- /dev/null
+++ b/samples/nehe02de/main.c
@@ -0,0 +1,94 @@
+#include "gl.h"
+#include "glu.h"
+#include "glkos.h"
+
+/* One-time GL state setup: clear values, depth testing, shading, vertex arrays and the projection. */
+void InitGL(int Width, int Height)  // main() calls this once, right after glKosInit()
+{
+    glClearColor(0.0f, 0.0f, 0.0f, 0.0f);  // Colour that glClear() fills the colour buffer with (black)
+    glClearDepth(1.0);                     // Depth value that glClear() resets the depth buffer to
+    glDepthFunc(GL_LESS);                  // A fragment passes when it is nearer than the stored depth
+    glEnable(GL_DEPTH_TEST);               // Turn depth testing on
+    glShadeModel(GL_SMOOTH);               // Interpolate colours across each primitive
+
+    glEnableClientState(GL_VERTEX_ARRAY);  // Positions are supplied through glVertexPointer()
+
+    glMatrixMode(GL_PROJECTION);
+    glLoadIdentity();                      // Start from an identity projection matrix
+
+    gluPerspective(45.0f, (GLfloat) Width / (GLfloat) Height, 0.1f, 100.0f);  // Aspect ratio comes from the surface size
+
+    glMatrixMode(GL_MODELVIEW);
+}
+
+/* Fits the viewport and projection to a Width x Height surface (a resize shouldn't happen at runtime, we're fullscreen). */
+void ReSizeGLScene(int Width, int Height)
+{
+    if (Height == 0) {  // Keep the aspect ratio below from dividing by zero
+        Height = 1;
+    }
+    glViewport(0, 0, Width, Height);  // Viewport spans the full Width x Height area
+
+    glMatrixMode(GL_PROJECTION);
+    glLoadIdentity();
+
+    gluPerspective(45.0f, (GLfloat) Width / (GLfloat) Height, 0.1f, 100.0f);
+    glMatrixMode(GL_MODELVIEW);
+}
+
+
+/* Draws one frame: an indexed triangle on the left and an indexed quad on the right, then swaps buffers. */
+void DrawGLScene(void)
+{
+    // The geometry never changes, so it lives in static storage instead of being rebuilt on the stack every frame.
+    static const GLfloat triangle [] = {
+        0.0f, 1.0f, 0.0f,
+        1.0f, -1.0f, 0.0f,
+        -1.0f, -1.0f, 0.0f
+    };
+
+    static const GLfloat square [] = {
+        -1.0f, 1.0f, 0.0f,
+        1.0f, 1.0f, 0.0f,
+        1.0f, -1.0f, 0.0f,
+        -1.0f, -1.0f, 0.0f
+    };
+
+    static const GLuint triangleIdx [] = {
+        0, 1, 2
+    };
+
+    static const GLuint squareIdx [] = {
+        0, 1, 2, 3
+    };
+
+    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);  // Clear the colour and depth buffers
+    glLoadIdentity();                                    // Reset the modelview matrix
+
+    glTranslatef(-1.5f, 0.0f, -6.0f);  // Move left 1.5 units and 6 units into the screen
+
+    glVertexPointer(3, GL_FLOAT, 0, triangle);
+    glDrawElements(GL_TRIANGLES, 3, GL_UNSIGNED_INT, triangleIdx);
+
+    glTranslatef(3.0f, 0.0f, 0.0f);  // Move right 3 units, relative to the triangle
+
+    glVertexPointer(3, GL_FLOAT, 0, square);
+    glDrawElements(GL_QUADS, 4, GL_UNSIGNED_INT, squareIdx);
+
+    // Swap buffers to display the frame, since we're double buffered.
+    glKosSwapBuffers();
+}
+
+int main(int argc, char **argv)
+{
+    glKosInit();  // Initialise GLdc (glkos.h) before the GL calls below
+
+    InitGL(640, 480);         // Fixed 640x480 surface
+    ReSizeGLScene(640, 480);
+
+    for (;;) {  // Render frames forever; the loop has no exit
+        DrawGLScene();
+    }
+
+    return 0;  // Not reached
+}
diff --git a/samples/nehe02de/romdisk/PLACEHOLDER b/samples/nehe02de/romdisk/PLACEHOLDER
new file mode 100644
index 0000000..e69de29

From 0718ab0697fe6ece21e33f820cdcd90c24542784 Mon Sep 17 00:00:00 2001
From: Hayden Kowalchuk <mrneo240@users.noreply.github.com>
Date: Tue, 16 Oct 2018 01:00:34 -0400
Subject: [PATCH 3/5] Fix compilation on Windows targets

---
 containers/aligned_vector.c | 6 +++++-
 containers/stack.c          | 6 +++++-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/containers/aligned_vector.c b/containers/aligned_vector.c
index 60638f6..4f44c92 100644
--- a/containers/aligned_vector.c
+++ b/containers/aligned_vector.c
@@ -3,8 +3,12 @@
 #include <math.h>
 
 #ifndef __APPLE__
-#include <malloc.h>
+#if defined(__WIN32__)
+/* Linux + Kos define this, OSX does not, so just use malloc there */
+#define memalign(x, size) malloc((size))
 #else
+#include <malloc.h>
+#endif 
 /* Linux + Kos define this, OSX does not, so just use malloc there */
 #define memalign(x, size) malloc((size))
 #endif
diff --git a/containers/stack.c b/containers/stack.c
index ba82b22..8370e24 100644
--- a/containers/stack.c
+++ b/containers/stack.c
@@ -1,8 +1,12 @@
 #include <string.h>
 
 #ifndef __APPLE__
-#include <malloc.h>
+#if defined(__WIN32__)
+/* Linux + Kos define this, OSX does not, so just use malloc there */
+#define memalign(x, size) malloc((size))
 #else
+#include <malloc.h>
+#endif 
 /* Linux + Kos define this, OSX does not, so just use malloc there */
 #define memalign(x, size) malloc((size))
 #endif

From b9f8fe4a4d594557cae69607cde4377a7744920b Mon Sep 17 00:00:00 2001
From: Luke Benstead <kazade@gmail.com>
Date: Thu, 18 Oct 2018 09:54:34 +0100
Subject: [PATCH 4/5] Clean up the memalign switching a bit

---
 containers/aligned_vector.c | 12 +++++-------
 containers/stack.c          | 13 ++++++-------
 2 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/containers/aligned_vector.c b/containers/aligned_vector.c
index 4f44c92..67f2804 100644
--- a/containers/aligned_vector.c
+++ b/containers/aligned_vector.c
@@ -2,15 +2,13 @@
 #include <string.h>
 #include <math.h>
 
-#ifndef __APPLE__
-#if defined(__WIN32__)
+#if defined(__APPLE__) || defined(__WIN32__)
 /* Linux + Kos define this, OSX does not, so just use malloc there */
-#define memalign(x, size) malloc((size))
+static inline void* memalign(size_t alignment, size_t size) {
+    return malloc(size); /* alignment is ignored: callers only get malloc()'s default alignment */
+}
 #else
-#include <malloc.h>
-#endif 
-/* Linux + Kos define this, OSX does not, so just use malloc there */
-#define memalign(x, size) malloc((size))
+    #include <malloc.h>
 #endif
 
 #include "aligned_vector.h"
diff --git a/containers/stack.c b/containers/stack.c
index 8370e24..80aa0bc 100644
--- a/containers/stack.c
+++ b/containers/stack.c
@@ -1,14 +1,13 @@
 #include <string.h>
+#include <stdlib.h>
 
-#ifndef __APPLE__
-#if defined(__WIN32__)
+#if defined(__APPLE__) || defined(__WIN32__)
 /* Linux + Kos define this, OSX does not, so just use malloc there */
-#define memalign(x, size) malloc((size))
+static inline void* memalign(size_t alignment, size_t size) {
+    return malloc(size); /* alignment is ignored: callers only get malloc()'s default alignment */
+}
 #else
-#include <malloc.h>
-#endif 
-/* Linux + Kos define this, OSX does not, so just use malloc there */
-#define memalign(x, size) malloc((size))
+    #include <malloc.h>
 #endif
 
 #include "stack.h"

From 8efd678da0ce106cf6eedff2acc2f77cb97e118c Mon Sep 17 00:00:00 2001
From: Luke Benstead <kazade@gmail.com>
Date: Sun, 21 Oct 2018 10:11:45 +0100
Subject: [PATCH 5/5] Update README

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index f2ca075..5a6bb59 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
 
 # GLdc
 
+**Development of GLdc has moved to [GitLab](https://gitlab.com/simulant/GLdc)**
+
 This is a partial implementation of OpenGL 1.2 for the SEGA Dreamcast for use
 with the KallistiOS SDK.