Merge branch 'clipping-rewrite-for-the-last-time-ffs' into 'master'

Restructure clipping to be much MUCH faster in the visible case. See merge request simulant/GLdc!105
2023-04-26 20:00:17 +00:00 · 2023-04-26 20:00:17 +00:00 · 9e1b1bc40a
commit 9e1b1bc40a
parent 34448939a4 0f65eab86a
8 changed files with 970 additions and 315 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -17,6 +17,7 @@ string(TOUPPER ${BACKEND} BACKEND_UPPER)
 add_definitions(-DBACKEND_${BACKEND_UPPER})

 set(CMAKE_C_STANDARD 99)
+set(CMAKE_CXX_STANDARD 11)

 include_directories(include)

@ -178,6 +179,7 @@ gen_sample(scissor samples/scissor/main.c)
 gen_sample(polymark samples/polymark/main.c)
 gen_sample(cubes samples/cubes/main.cpp)

+gen_sample(zclip_test tests/zclip/main.cpp)

 if(PLATFORM_DREAMCAST)
    gen_sample(trimark samples/trimark/main.c)
--- a/GL/platforms/sh4.c
+++ b/GL/platforms/sh4.c
@ -9,11 +9,7 @@
 #define likely(x)      __builtin_expect(!!(x), 1)
 #define unlikely(x)    __builtin_expect(!!(x), 0)

-#define SQ_BASE_ADDRESS (uint32_t *)(void *) \
-    (0xe0000000 | (((uint32_t)0x10000000) & 0x03ffffe0))
-
-
-static volatile uint32_t* PVR_LMMODE0 = (uint32_t*) 0xA05F6884;
+#define SQ_BASE_ADDRESS (void*) 0xe0000000


 GL_FORCE_INLINE bool glIsVertex(const float flags) {
@ -54,31 +50,28 @@ GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
    const float f = _glFastInvert(vertex->w);

    /* Convert to NDC and apply viewport */
-    vertex->xyz[0] = __builtin_fmaf(
-        VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth
-    );
-
-    vertex->xyz[1] = h - __builtin_fmaf(
-        VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight
-    );
+    vertex->xyz[0] = ((vertex->xyz[0] * f) * 320) + 320;
+    vertex->xyz[1] = ((vertex->xyz[1] * f) * -240) + 240;
+    vertex->xyz[2] = f;

    /* Orthographic projections need to use invZ otherwise we lose
    the depth information. As w == 1, and clip-space range is -w to +w
    we add 1.0 to the Z to bring it into range. We add a little extra to
    avoid a divide by zero.
    */
-
-    vertex->xyz[2] = (vertex->w == 1.0f) ? _glFastInvert(1.0001f + vertex->xyz[2]) : f;
+    if(vertex->w == 1.0f) {
+        vertex->xyz[2] = _glFastInvert(1.0001f + vertex->xyz[2]);
+    }
 }

-GL_FORCE_INLINE void _glSubmitHeaderOrVertex(uint32_t* d, const Vertex* v) {
+GL_FORCE_INLINE void _glSubmitHeaderOrVertex(volatile uint32_t* d, const Vertex* v) {
 #ifndef NDEBUG
    gl_assert(!isnan(v->xyz[2]));
    gl_assert(!isnan(v->w));
 #endif

 #if CLIP_DEBUG
-    printf("Submitting: %x (%x)\n", v, v->flags);
+    fprintf(stderr, "Submitting: %x (%x)\n", v, v->flags);
 #endif

    uint32_t *s = (uint32_t*) v;
@ -94,336 +87,323 @@ GL_FORCE_INLINE void _glSubmitHeaderOrVertex(uint32_t* d, const Vertex* v) {
    d += 8;
 }

-static struct __attribute__((aligned(32))) {
-    Vertex* v;
-    int visible;
-} triangle[3];
-
-static int tri_count = 0;
-static int strip_count = 0;
-
-static inline void interpolateColour(const uint32_t* a, const uint32_t* b, const float t, uint32_t* out) {
-    const static uint32_t MASK1 = 0x00FF00FF;
-    const static uint32_t MASK2 = 0xFF00FF00;
-
-    const uint32_t f2 = 256 * t;
-    const uint32_t f1 = 256 - f2;
-
-    *out = (((((*a & MASK1) * f1) + ((*b & MASK1) * f2)) >> 8) & MASK1) |
-            (((((*a & MASK2) * f1) + ((*b & MASK2) * f2)) >> 8) & MASK2);
-}
-
-static inline void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) {
-    /* Clipping time! */
+static inline void _glClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout) {
+    const static float o = 1.0f / 255.0f;
    const float d0 = v1->w + v1->xyz[2];
    const float d1 = v2->w + v2->xyz[2];
-    const float sign = ((2.0f * (d1 < d0)) - 1.0f);
-    const float epsilon = -0.00001f * sign;
-    const float n = (d0 - d1);
-    const float r = (1.f / sqrtf(n * n)) * sign;
-    float t = fmaf(r, d0, epsilon);
+    const float t = (fabs(d0) * (1.0f / sqrtf((d1 - d0) * (d1 - d0)))) + 0.000001f;
+    const float invt = 1.0f - t;

-    vout->xyz[0] = fmaf(v2->xyz[0] - v1->xyz[0], t, v1->xyz[0]);
-    vout->xyz[1] = fmaf(v2->xyz[1] - v1->xyz[1], t, v1->xyz[1]);
-    vout->xyz[2] = fmaf(v2->xyz[2] - v1->xyz[2], t, v1->xyz[2]);
-    vout->w = fmaf(v2->w - v1->w, t, v1->w);
+    vout->xyz[0] = invt * v1->xyz[0] + t * v2->xyz[0];
+    vout->xyz[1] = invt * v1->xyz[1] + t * v2->xyz[1];
+    vout->xyz[2] = invt * v1->xyz[2] + t * v2->xyz[2];

-    vout->uv[0] = fmaf(v2->uv[0] - v1->uv[0], t, v1->uv[0]);
-    vout->uv[1] = fmaf(v2->uv[1] - v1->uv[1], t, v1->uv[1]);
+    vout->uv[0] = invt * v1->uv[0] + t * v2->uv[0];
+    vout->uv[1] = invt * v1->uv[1] + t * v2->uv[1];

-    interpolateColour((uint32_t*) v1->bgra, (uint32_t*) v2->bgra, t, (uint32_t*) vout->bgra);
-}
+    vout->w = invt * v1->w + t * v2->w;

-GL_FORCE_INLINE void ClearTriangle() {
-    tri_count = 0;
-}
+    const float m = 255 * t;
+    const float n = 255 - m;

-static inline void ShiftTriangle() {
-    if(!tri_count) {
-        return;
-    }
-
-    tri_count--;
-    triangle[0] = triangle[1];
-    triangle[1] = triangle[2];
-
-#ifndef NDEBUG
-    triangle[2].v = NULL;
-    triangle[2].visible = false;
-#endif
-}
-
-
-static inline void ShiftRotateTriangle() {
-    if(!tri_count) {
-        return;
-    }
-
-    if(triangle[0].v < triangle[1].v) {
-        triangle[0] = triangle[2];
-    } else {
-        triangle[1] = triangle[2];
-    }
-
-    tri_count--;
+    vout->bgra[0] = (v1->bgra[0] * n + v2->bgra[0] * m) * o;
+    vout->bgra[1] = (v1->bgra[1] * n + v2->bgra[1] * m) * o;
+    vout->bgra[2] = (v1->bgra[2] * n + v2->bgra[2] * m) * o;
+    vout->bgra[3] = (v1->bgra[3] * n + v2->bgra[3] * m) * o;
 }

 #define SPAN_SORT_CFG 0x005F8030
+static volatile uint32_t* PVR_LMMODE0 = (uint32_t*) 0xA05F6884;
+static volatile uint32_t *PVR_LMMODE1 = (uint32_t*) 0xA05F6888;
+static volatile uint32_t *QACR = (uint32_t*) 0xFF000038;

 void SceneListSubmit(void* src, int n) {
+    /* You need at least a header, and 3 vertices to render anything */
+    if(n < 4) {
+        return;
+    }
+
    const float h = GetVideoMode()->height;

    PVR_SET(SPAN_SORT_CFG, 0x0);

    //Set PVR DMA registers
-    volatile int *pvrdmacfg = (int*)0xA05F6888;
-    pvrdmacfg[0] = 1;
-    pvrdmacfg[1] = 0;
+    *PVR_LMMODE0 = 0;
+    *PVR_LMMODE1 = 0;

    //Set QACR registers
-    volatile int *qacr = (int*)0xFF000038;
-    qacr[1] = qacr[0] = 0x11;
+    QACR[1] = QACR[0] = 0x11;

-    uint32_t *d = SQ_BASE_ADDRESS;
+    volatile uint32_t *sq = SQ_BASE_ADDRESS;

-    Vertex __attribute__((aligned(32))) tmp;
-
-    /* Perform perspective divide on each vertex */
-    Vertex* vertex = (Vertex*) src;
-
-    if(!_glNearZClippingEnabled()) {
-        /* Prep store queues */
-
-        while(n--) {
-            if(glIsVertex(vertex->flags)) {
-                _glPerspectiveDivideVertex(vertex, h);
-            }
-
-            _glSubmitHeaderOrVertex(d, vertex);
-            ++vertex;
-        }
-
-        return;
-    }
-
-    tri_count = 0;
-    strip_count = 0;
+    uint32_t clipping_disabled_mask = (_glNearZClippingEnabled()) ? 0 : 0x7;

 #if CLIP_DEBUG
-    printf("----\n");
-#endif
+    for(int i = 0; i < n; ++i) {
+        fprintf(stderr, "{%f, %f, %f, %f}, // %x (%x)\n", vertex[i].xyz[0], vertex[i].xyz[1], vertex[i].xyz[2], vertex[i].w, vertex[i].flags, &vertex[i]);
+    }

-    for(int i = 0; i < n; ++i, ++vertex) {
-        PREFETCH(vertex + 1);
-        PREFETCH(vertex + 2);
-        /* Wait until we fill the triangle */
-        if(tri_count < 3) {
-            if(glIsVertex(vertex->flags)) {
-                ++strip_count;
-                triangle[tri_count].v = vertex;
-                triangle[tri_count].visible = vertex->xyz[2] >= -vertex->w;
-                if(++tri_count < 3) {
+    fprintf(stderr, "----\n");
+#endif
+    uint8_t counter = 0;
+
+    Vertex* v2 = (Vertex*) src;
+    while(n--) {
+        __builtin_prefetch(v2 + 1);
+
+        switch(v2->flags) {
+            case GPU_CMD_VERTEX_EOL:
+            case GPU_CMD_VERTEX:
+                if(++counter < 3) {
+                    v2++;
                    continue;
                }
-            } else {
-                /* We hit a header */
-                tri_count = 0;
-                strip_count = 0;
-                _glSubmitHeaderOrVertex(d, vertex);
+            break;
+            default:
+                _glSubmitHeaderOrVertex(sq, v2++);
+                counter = 0;
                continue;
-            }
        }

-#if CLIP_DEBUG
-        printf("SC: %d\n", strip_count);
-#endif
+        Vertex* const v0 = v2 - 2;
+        Vertex* const v1 = v2 - 1;

-        /* If we got here, then triangle contains 3 vertices */
-        int visible_mask = triangle[0].visible | (triangle[1].visible << 1) | (triangle[2].visible << 2);
-
-        /* Clipping time!
-
-            There are 6 distinct possibilities when clipping a triangle. 3 of them result
-            in another triangle, 3 of them result in a quadrilateral.
-
-            Assuming you iterate the edges of the triangle in order, and create a new *visible*
-            vertex when you cross the plane, and discard vertices behind the plane, then the only
-            difference between the two cases is that the final two vertices that need submitting have
-            to be reversed.
-
-            Unfortunately we have to copy vertices here, because if we persp-divide a vertex it may
-            be used in a subsequent triangle in the strip and would end up being double divided.
-        */
-
-#define SUBMIT_QUEUED() \
-    if(strip_count > 3) { \
-        tmp = *(vertex - 2); \
-        /* If we had triangles ahead of this one, submit and finalize */ \
-        _glPerspectiveDivideVertex(&tmp, h); \
-        _glSubmitHeaderOrVertex(d, &tmp); \
-        tmp = *(vertex - 1); \
-        tmp.flags = GPU_CMD_VERTEX_EOL; \
-        _glPerspectiveDivideVertex(&tmp, h); \
-        _glSubmitHeaderOrVertex(d, &tmp); \
-    }
-
-        bool is_last_in_strip = glIsLastVertex(vertex->flags);
+        const uint8_t visible_mask = (
+            (v0->xyz[2] > -v0->w) << 0 |
+            (v1->xyz[2] > -v1->w) << 1 |
+            (v2->xyz[2] > -v2->w) << 2 |
+            ((v2->flags == GPU_CMD_VERTEX_EOL) << 3) |
+            clipping_disabled_mask  // This forces everything to be marked visible if clipping is disabled
+        );

        switch(visible_mask) {
-            case 1: {
-                SUBMIT_QUEUED();
-                /* 0, 0a, 2a */
-                tmp = *triangle[0].v;
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                _glClipEdge(triangle[0].v, triangle[1].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                _glClipEdge(triangle[2].v, triangle[0].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX_EOL;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-            } break;
-            case 2: {
-                SUBMIT_QUEUED();
-                /* 0a, 1, 1a */
-                _glClipEdge(triangle[0].v, triangle[1].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                tmp = *triangle[1].v;
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                _glClipEdge(triangle[1].v, triangle[2].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX_EOL;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-            } break;
-            case 3: {
-                SUBMIT_QUEUED();
-                /* 0, 1, 2a, 1a */
-                tmp = *triangle[0].v;
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                tmp = *triangle[1].v;
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                _glClipEdge(triangle[2].v, triangle[0].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                _glClipEdge(triangle[1].v, triangle[2].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX_EOL;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-            } break;
-            case 4: {
-                SUBMIT_QUEUED();
-                /* 1a, 2, 2a */
-                _glClipEdge(triangle[1].v, triangle[2].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                tmp = *triangle[2].v;
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                _glClipEdge(triangle[2].v, triangle[0].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX_EOL;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-            } break;
-            case 5: {
-                SUBMIT_QUEUED();
-                /* 0, 0a, 2, 1a */
-                tmp = *triangle[0].v;
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                _glClipEdge(triangle[0].v, triangle[1].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                tmp = *triangle[2].v;
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                _glClipEdge(triangle[1].v, triangle[2].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX_EOL;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-            } break;
-            case 6: {
-                SUBMIT_QUEUED();
-                /* 0a, 1, 2a, 2 */
-                _glClipEdge(triangle[0].v, triangle[1].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                tmp = *triangle[1].v;
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                _glClipEdge(triangle[2].v, triangle[0].v, &tmp);
-                tmp.flags = GPU_CMD_VERTEX;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-
-                tmp = *triangle[2].v;
-                tmp.flags = GPU_CMD_VERTEX_EOL;
-                _glPerspectiveDivideVertex(&tmp, h);
-                _glSubmitHeaderOrVertex(d, &tmp);
-            } break;
-            case 7: {
-                /* All the vertices are visible! We divide and submit v0, then shift */
-                _glPerspectiveDivideVertex(vertex - 2, h);
-                _glSubmitHeaderOrVertex(d, vertex - 2);
-
-                if(is_last_in_strip) {
-                    _glPerspectiveDivideVertex(vertex - 1, h);
-                    _glSubmitHeaderOrVertex(d, vertex - 1);
-                    _glPerspectiveDivideVertex(vertex, h);
-                    _glSubmitHeaderOrVertex(d, vertex);
-                    tri_count = 0;
-                    strip_count = 0;
-                }
-
-                ShiftRotateTriangle();
-                continue;
-            } break;
            case 0:
-            default:
            break;
+            case 15: /* All visible, but final vertex in strip */
+            {
+                _glPerspectiveDivideVertex(v0, h);
+                _glSubmitHeaderOrVertex(sq, v0);
+
+                Vertex __attribute__((aligned(32))) a = *v1;
+                _glPerspectiveDivideVertex(&a, h);
+                _glSubmitHeaderOrVertex(sq, &a);
+
+                a = *v2;
+                _glPerspectiveDivideVertex(&a, h);
+                _glSubmitHeaderOrVertex(sq, &a);
+            }
+            break;
+            case 7:
+                /* All visible, push the first vertex and move on */
+                _glPerspectiveDivideVertex(v0, h);
+                _glSubmitHeaderOrVertex(sq, v0);
+            break;
+            case 9:
+                /* First vertex was visible, last in strip */
+                {
+                    Vertex __attribute__((aligned(32))) a, b;
+
+                    _glClipEdge(v0, v1, &a);
+                    a.flags = GPU_CMD_VERTEX;
+
+                    _glClipEdge(v2, v0, &b);
+                    b.flags = GPU_CMD_VERTEX_EOL;
+
+                    _glPerspectiveDivideVertex(v0, h);
+                    _glSubmitHeaderOrVertex(sq, v0);
+
+                    _glPerspectiveDivideVertex(&a, h);
+                    _glSubmitHeaderOrVertex(sq, &a);
+
+                    _glPerspectiveDivideVertex(&b, h);
+                    _glSubmitHeaderOrVertex(sq, &b);
+                }
+            break;
+            case 1:
+                /* First vertex was visible, but not last in strip */
+                {
+                    Vertex __attribute__((aligned(32))) a, b;
+
+                    _glClipEdge(v0, v1, &a);
+                    a.flags = GPU_CMD_VERTEX;
+
+                    _glClipEdge(v2, v0, &b);
+                    b.flags = GPU_CMD_VERTEX;
+
+                    _glPerspectiveDivideVertex(v0, h);
+                    _glSubmitHeaderOrVertex(sq, v0);
+
+                    _glPerspectiveDivideVertex(&a, h);
+                    _glSubmitHeaderOrVertex(sq, &a);
+
+                    _glPerspectiveDivideVertex(&b, h);
+                    _glSubmitHeaderOrVertex(sq, &b);
+                    _glSubmitHeaderOrVertex(sq, &b);
+                }
+            break;
+            case 10:
+            case 2:
+                /* Second vertex was visible. In this case we need to create a triangle and produce
+                two new vertices: 1-2, and 2-3. */
+                {
+                    Vertex __attribute__((aligned(32))) a;
+                    Vertex __attribute__((aligned(32))) c = *v1;
+                    _glClipEdge(v0, &c, &a);
+                    a.flags = GPU_CMD_VERTEX;
+
+                    _glPerspectiveDivideVertex(&a, h);
+                    _glSubmitHeaderOrVertex(sq, &a);
+
+                    _glClipEdge(&c, v2, &a);
+                    a.flags = v2->flags;
+
+                    _glPerspectiveDivideVertex(&c, h);
+                    _glSubmitHeaderOrVertex(sq, &c);
+
+                    _glPerspectiveDivideVertex(&a, h);
+                    _glSubmitHeaderOrVertex(sq, &a);
+                }
+            break;
+            case 11:
+            case 3:  /* First and second vertex were visible */
+                {
+                    Vertex __attribute__((aligned(32))) a, b;
+                    Vertex __attribute__((aligned(32))) c = *v1;
+
+                    _glClipEdge(v2, v0, &b);
+                    b.flags = GPU_CMD_VERTEX;
+
+                    _glPerspectiveDivideVertex(v0, h);
+                    _glSubmitHeaderOrVertex(sq, v0);
+
+                    _glClipEdge(v1, v2, &a);
+                    a.flags = v2->flags;
+
+                    _glPerspectiveDivideVertex(&c, h);
+                    _glSubmitHeaderOrVertex(sq, &c);
+
+                    _glPerspectiveDivideVertex(&b, h);
+                    _glSubmitHeaderOrVertex(sq, &b);
+
+                    _glPerspectiveDivideVertex(&a, h);
+                    _glSubmitHeaderOrVertex(sq, &c);
+                    _glSubmitHeaderOrVertex(sq, &a);
+                }
+            break;
+            case 12:
+            case 4:
+                /* Third vertex was visible. */
+                {
+                    Vertex __attribute__((aligned(32))) a, b;
+                    Vertex __attribute__((aligned(32))) c = *v2;
+
+                    _glClipEdge(v2, v0, &a);
+                    a.flags = GPU_CMD_VERTEX;
+
+                    _glClipEdge(v1, v2, &b);
+                    b.flags = GPU_CMD_VERTEX;
+
+                    _glPerspectiveDivideVertex(&a, h);
+                    _glSubmitHeaderOrVertex(sq, &a);
+
+                    _glPerspectiveDivideVertex(&b, h);
+                    _glSubmitHeaderOrVertex(sq, &a);
+                    _glSubmitHeaderOrVertex(sq, &b);
+
+                    _glPerspectiveDivideVertex(&c, h);
+                    _glSubmitHeaderOrVertex(sq, &c);
+                }
+            break;
+            case 13:
+                {
+                    Vertex __attribute__((aligned(32))) a, b;
+                    Vertex __attribute__((aligned(32))) c = *v2;
+                    c.flags = GPU_CMD_VERTEX;
+
+                    _glClipEdge(v0, v1, &a);
+                    a.flags = GPU_CMD_VERTEX;
+
+                    _glClipEdge(v1, v2, &b);
+                    b.flags = GPU_CMD_VERTEX;
+
+                    _glPerspectiveDivideVertex(v0, h);
+                    _glSubmitHeaderOrVertex(sq, v0);
+
+                    _glPerspectiveDivideVertex(&a, h);
+                    _glSubmitHeaderOrVertex(sq, &a);
+
+                    _glPerspectiveDivideVertex(&c, h);
+                    _glSubmitHeaderOrVertex(sq, &c);
+                    _glPerspectiveDivideVertex(&b, h);
+                    _glSubmitHeaderOrVertex(sq, &b);
+
+                    c.flags = GPU_CMD_VERTEX_EOL;
+                    _glSubmitHeaderOrVertex(sq, &c);
+                }
+            break;
+            case 5:  /* First and third vertex were visible */
+                {
+                    Vertex __attribute__((aligned(32))) a, b;
+                    Vertex __attribute__((aligned(32))) c = *v2;
+                    c.flags = GPU_CMD_VERTEX;
+
+                    _glClipEdge(v0, v1, &a);
+                    a.flags = GPU_CMD_VERTEX;
+
+                    _glClipEdge(v1, v2, &b);
+                    b.flags = GPU_CMD_VERTEX;
+
+                    _glPerspectiveDivideVertex(v0, h);
+                    _glSubmitHeaderOrVertex(sq, v0);
+
+                    _glPerspectiveDivideVertex(&a, h);
+                    _glSubmitHeaderOrVertex(sq, &a);
+
+                    _glPerspectiveDivideVertex(&c, h);
+                    _glSubmitHeaderOrVertex(sq, &c);
+                    _glPerspectiveDivideVertex(&b, h);
+                    _glSubmitHeaderOrVertex(sq, &b);
+                    _glSubmitHeaderOrVertex(sq, &c);
+                }
+            break;
+            case 14:
+            case 6:  /* Second and third vertex were visible */
+                {
+                    Vertex __attribute__((aligned(32))) a, b;
+                    Vertex __attribute__((aligned(32))) c = *v1;
+
+                    _glClipEdge(v0, v1, &a);
+                    a.flags = GPU_CMD_VERTEX;
+
+                    _glClipEdge(v2, v0, &b);
+                    b.flags = GPU_CMD_VERTEX;
+
+                    _glPerspectiveDivideVertex(&a, h);
+                    _glSubmitHeaderOrVertex(sq, &a);
+
+                    _glPerspectiveDivideVertex(&c, h);
+                    _glSubmitHeaderOrVertex(sq, &c);
+
+                    _glPerspectiveDivideVertex(&b, h);
+                    _glSubmitHeaderOrVertex(sq, &b);
+                    _glSubmitHeaderOrVertex(sq, &c);
+
+                    c = *v2;
+                    _glPerspectiveDivideVertex(&c, h);
+                    _glSubmitHeaderOrVertex(sq, &c);
+                }
+            break;
+            default:
+                break;
        }

-        /* If this was the last in the strip, we don't need to
-        submit anything else, we just wipe the tri_count */
-        if(is_last_in_strip) {
-            tri_count = 0;
-            strip_count = 0;
+        if(v2->flags == GPU_CMD_VERTEX_EOL) {
+            counter = 0;
        } else {
-            ShiftRotateTriangle();
-            strip_count = 2;
+            --counter;
        }
+
+        v2++;
    }
 }

--- a/GL/private.h
+++ b/GL/private.h
@ -233,11 +233,41 @@ GL_FORCE_INLINE float clamp(float d, float min, float max) {
    return (d < min) ? min : (d > max) ? max : d;
 }

+GL_FORCE_INLINE void memcpy_vertex(Vertex *dest, const Vertex *src) {
+#ifdef __DREAMCAST__
+    _Complex float double_scratch;
+
+    asm volatile (
+        "fschg\n\t"
+        "clrs\n\t"
+        ".align 2\n\t"
+        "fmov.d @%[in]+, %[scratch]\n\t"
+        "fmov.d %[scratch], @%[out]\n\t"
+        "fmov.d @%[in]+, %[scratch]\n\t"
+        "add #8, %[out]\n\t"
+        "fmov.d %[scratch], @%[out]\n\t"
+        "fmov.d @%[in]+, %[scratch]\n\t"
+        "add #8, %[out]\n\t"
+        "fmov.d %[scratch], @%[out]\n\t"
+        "fmov.d @%[in], %[scratch]\n\t"
+        "add #8, %[out]\n\t"
+        "fmov.d %[scratch], @%[out]\n\t"
+        "fschg\n"
+        : [in] "+&r" ((uint32_t) src), [scratch] "=&d" (double_scratch), [out] "+&r" ((uint32_t) dest)
+        :
+        : "t", "memory" // clobbers
+    );
+#else
+    *dest = *src;
+#endif
+}
+
 #define swapVertex(a, b)   \
 do {                 \
-    Vertex c = *a;   \
-    *a = *b;         \
-    *b = c;          \
+    Vertex __attribute__((aligned(32))) c;   \
+    memcpy_vertex(&c, a); \
+    memcpy_vertex(a, b); \
+    memcpy_vertex(b, &c); \
 } while(0)

 /* ClipVertex doesn't have room for these, so we need to parse them
--- a/containers/aligned_vector.h
+++ b/containers/aligned_vector.h
@ -12,6 +12,7 @@ extern "C" {
 #if defined(__APPLE__) || defined(__WIN32__)
 /* Linux + Kos define this, OSX does not, so just use malloc there */
 static inline void* memalign(size_t alignment, size_t size) {
+    (void) alignment;
    return malloc(size);
 }
 #else
--- a/samples/loadbmp.c
+++ b/samples/loadbmp.c
@ -23,7 +23,11 @@ int ImageLoad(char *filename, Image *image) {
    }

    // seek through the bmp header, up to the width/height:
-    fseek(file, 18, SEEK_CUR);
+    fseek(file, 10, SEEK_CUR);
+
+    uint32_t offset;
+    fread(&offset, 4, 1, file);
+    fseek(file, 4, SEEK_CUR);

    // read the width
    if ((i = fread(&sizeX, 4, 1, file)) != 1) {
@ -65,7 +69,7 @@ int ImageLoad(char *filename, Image *image) {
    }

    // seek past the rest of the bitmap header.
-    fseek(file, 24, SEEK_CUR);
+    fseek(file, offset, SEEK_SET);

    // read the data.
    image->data = (char *) malloc(size);
--- a/samples/nehe10/romdisk/world.txt
+++ b/samples/nehe10/romdisk/world.txt
@ -157,4 +157,4 @@ NUMPOLLIES 36
 2.0  0.0   -0.5 0.0 0.0
 3.0  1.0  -0.5 1.0 1.0
 2.0  1.0 -0.5 0.0 1.0
-2.0  0.0   -0.5 0.0 0.0
+2.0  0.0   -0.5 0.0 0.0
--- a/samples/zclip_triangle/main.c
+++ b/samples/zclip_triangle/main.c
@ -86,12 +86,13 @@ void DrawGLScene()
    rotation = (rotation > 360.0f) ? rotation - 360.0f : rotation;

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);		// Clear The Screen And The Depth Buffer
+    glClearColor(0.5f, 0.5f, 0.5f, 0.5f);
    glLoadIdentity();				// Reset The View

    glDisable(GL_CULL_FACE);

    glPushMatrix();
-        glTranslatef(0.0f, -1.0f, movement);
+        glTranslatef(0.0f, -1.0f, -movement);
        glRotatef(rotation, 0.0f, 1.0f, 0.0f);

        glBegin(GL_TRIANGLES);
--- a/tests/zclip/main.cpp
+++ b/tests/zclip/main.cpp
@ -0,0 +1,637 @@
+
+#include <cstdint>
+#include <vector>
+#include <cstdio>
+#include <cmath>
+#include <stdexcept>
+#include <cassert>
+
+#define SQ_BASE_ADDRESS 0  /* host stub: SceneListSubmit only passes this pointer along, nothing dereferences it here */
+#define SPAN_SORT_CFG 0  /* host stub for the register id handed to PVR_SET */
+#define PVR_SET(x, y) do { (void)(x); (void)(y); } while(0)  /* no-op stub; wrapped so it stays a single statement under if/else */
+
+struct Vertex  {  /* Record processed by SceneListSubmit: either a polygon header or a strip vertex */
+    uint32_t flags;  /* GPU_CMD_* value; SceneListSubmit switches on it to tell headers from vertices */
+    float xyz[3];  /* position; z is tested against -w to decide visibility */
+    float uv[2];  /* interpolated linearly when an edge is clipped */
+    float w;  /* homogeneous w, consumed by the perspective divide */
+    uint8_t bgra[4];  /* colour bytes; blended as one packed 32-bit word by interpolateColour */
+};
+
+struct {  /* Viewport constants read by _glPerspectiveDivideVertex */
+    float hwidth;  /* scale applied to x after the divide */
+    float x_plus_hwidth;  /* offset added to the scaled x */
+    float hheight;  /* scale applied to y after the divide */
+    float y_plus_hheight;  /* offset added to the scaled y */
+} VIEWPORT = {320, 320, 240, 240};  /* presumably a 640x480 viewport at the origin - TODO confirm */
+
+
+struct VideoMode {  /* Minimal stand-in holding only the field SceneListSubmit reads */
+    float height;  /* passed as h to _glPerspectiveDivideVertex, where y becomes h - (scaled y) */
+};
+
+static VideoMode* GetVideoMode() {  /* Returns a pointer to the single function-local static mode */
+    static VideoMode mode = {320.0f};  /* NOTE(review): VIEWPORT.hheight is 240, which suggests 480 was intended here - confirm */
+    return &mode;
+}
+
+enum GPUCommand {  /* Values compared against Vertex::flags */
+    GPU_CMD_POLYHDR = 0x80840000,  /* polygon header; SceneListSubmit forwards it without clipping */
+    GPU_CMD_VERTEX = 0xe0000000,  /* vertex that is not the last of its strip */
+    GPU_CMD_VERTEX_EOL = 0xf0000000,  /* last vertex of a strip */
+    GPU_CMD_USERCLIP = 0x20000000,  /* not referenced elsewhere in this file */
+    GPU_CMD_MODIFIER = 0x80000000,  /* not referenced elsewhere in this file */
+    GPU_CMD_SPRITE = 0xA0000000  /* not referenced elsewhere in this file */
+};
+
+static std::vector<Vertex> sent;  /* copy of everything given to _glSubmitHeaderOrVertex, in order; each test clears it and then inspects it */
+
+static inline void interpolateColour(const uint32_t* a, const uint32_t* b, const float t, uint32_t* out) {  /* *out = per-channel blend of *a towards *b by t; t is expected to lie in [0, 1] */
+    const static uint32_t MASK1 = 0x00FF00FF;  /* bytes 0 and 2 of the packed colour */
+    const static uint32_t MASK2 = 0xFF00FF00;  /* bytes 1 and 3 of the packed colour */
+
+    const uint32_t f2 = 256 * t;  /* weight of *b, as a fraction of 256 */
+    const uint32_t f1 = 256 - f2;  /* weight of *a */
+
+    *out = (((((*a & MASK1) * f1) + ((*b & MASK1) * f2)) >> 8) & MASK1) |  /* bytes 0 and 2: weight in place, then shift the result down */
+            (((((*a >> 8) & MASK1) * f1) + (((*b >> 8) & MASK1) * f2)) & MASK2);  /* bytes 1 and 3: shift down BEFORE weighting, otherwise byte 3 overflows 32 bits and is lost */
+}
+
+static inline void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) {  /* Fills vout with the point where edge v1->v2 meets the plane z == -w */
+    /* d0/d1 are z + w at each end of the edge. vout->flags is NOT written here; every caller assigns it afterwards */
+    const float d0 = v1->w + v1->xyz[2];
+    const float d1 = v2->w + v2->xyz[2];
+    const float sign = ((2.0f * (d1 < d0)) - 1.0f);  /* +1 when d1 < d0, otherwise -1 */
+    const float epsilon = -0.00001f * sign;  /* tiny bias applied to t; its direction follows sign */
+    const float n = (d0 - d1);
+    const float r = (1.f / sqrtf(n * n)) * sign;  /* sign / |n|, i.e. 1 / n for n != 0; n == 0 divides by zero */
+    float t = fmaf(r, d0, epsilon);  /* d0 / (d0 - d1) plus the bias: blend factor from v1 (0) towards v2 (1) */
+
+    vout->xyz[0] = fmaf(v2->xyz[0] - v1->xyz[0], t, v1->xyz[0]);
+    vout->xyz[1] = fmaf(v2->xyz[1] - v1->xyz[1], t, v1->xyz[1]);
+    vout->xyz[2] = fmaf(v2->xyz[2] - v1->xyz[2], t, v1->xyz[2]);
+    vout->w = fmaf(v2->w - v1->w, t, v1->w);
+
+    vout->uv[0] = fmaf(v2->uv[0] - v1->uv[0], t, v1->uv[0]);
+    vout->uv[1] = fmaf(v2->uv[1] - v1->uv[1], t, v1->uv[1]);
+
+    interpolateColour((uint32_t*) v1->bgra, (uint32_t*) v2->bgra, t, (uint32_t*) vout->bgra);  /* the four colour bytes are reinterpreted as one 32-bit word */
+}
+
+bool glIsVertex(const uint32_t flags) {  /* True when flags is exactly GPU_CMD_VERTEX or GPU_CMD_VERTEX_EOL */
+    return flags == GPU_CMD_VERTEX_EOL || flags == GPU_CMD_VERTEX;
+}
+
+bool glIsLastVertex(const uint32_t flags) {  /* True when flags is exactly GPU_CMD_VERTEX_EOL */
+    return flags == GPU_CMD_VERTEX_EOL;
+}
+
+void _glSubmitHeaderOrVertex(volatile uint32_t*, Vertex* vtx) {  /* Test double: the destination pointer is ignored and a copy of *vtx is appended to sent */
+    sent.push_back(*vtx);
+}
+
+float _glFastInvert(float x) {  /* Returns 1 / |x|: the sign of x is discarded, and x == 0 divides by zero */
+    return (1.f / __builtin_sqrtf(x * x));
+}
+
+void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {  /* Rewrites vertex->xyz in place from clip space to screen space; h is the height used to flip y */
+    const float f = _glFastInvert(vertex->w);  /* 1 / |w| */
+
+    /* Convert to NDC and apply viewport */
+    vertex->xyz[0] = __builtin_fmaf(
+        VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth
+    );
+
+    vertex->xyz[1] = h - __builtin_fmaf(
+        VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight
+    );
+
+    /* Orthographic projections need to use invZ otherwise we lose
+    the depth information. As w == 1, and clip-space range is -w to +w
+    we add 1.0 to the Z to bring it into range. We add a little extra to
+    avoid a divide by zero.
+    */
+
+    vertex->xyz[2] = (vertex->w == 1.0f) ? _glFastInvert(1.0001f + vertex->xyz[2]) : f;  /* w exactly 1 is treated as the orthographic case; otherwise depth is 1 / |w| */
+}
+
+
+void memcpy_vertex(Vertex* dst, Vertex* src) {  /* Copies *src into *dst by plain struct assignment */
+    *dst = *src;
+}
+
+/* Z-clipping is so difficult to get right that this sample tests all the clipping cases and makes sure that things work as expected */
+
+#ifdef __DREAMCAST__
+static volatile int *pvrdmacfg = (int*)0xA05F6888;  /* fixed address; SceneListSubmit writes elements [0] and [1] ("PVR DMA registers") */
+static volatile int *qacr = (int*)0xFF000038;  /* fixed address; SceneListSubmit writes elements [0] and [1] ("QACR registers") */
+#else
+static int pvrdmacfg[2];  /* other builds: ordinary arrays, so the same writes land in normal memory */
+static int qacr[2];
+#endif
+
+void SceneListSubmit(void* src, int n) {  /* Walks the n Vertex records at src, clips each strip triangle against the plane z == -w and submits the result */
+    /* You need at least a header, and 3 vertices to render anything */
+    if(n < 4) {
+        return;
+    }
+
+    const float h = GetVideoMode()->height;
+
+    PVR_SET(SPAN_SORT_CFG, 0x0);
+
+    //Set PVR DMA registers
+    pvrdmacfg[0] = 1;
+    pvrdmacfg[1] = 1;
+
+    //Set QACR registers
+    qacr[1] = qacr[0] = 0x11;
+
+    volatile uint32_t *d = SQ_BASE_ADDRESS;  /* destination handed to every submit call */
+
+    int8_t queue_head = 0;
+    int8_t queue_tail = 0;
+
+    /* At most 3 vertices are ever waiting in the queue (clipped vertices are built in
+     * scratch space, never queued), but we keep one extra slot so the ring buffer doesn't
+     * trip over itself (e.g. if tail == head we can guarantee it's empty, not full) */
+    Vertex __attribute__((aligned(32))) queue[4];
+    const int queue_capacity = sizeof(queue) / sizeof(Vertex);
+
+    Vertex* vertex = (Vertex*) src;
+    uint32_t visible_mask = 0;  /* 3-bit window: bit 0 = oldest queued vertex, bit 2 = newest; a set bit means visible */
+
+#if CLIP_DEBUG
+    for(int i = 0; i < n; ++i) {
+        fprintf(stderr, "{%f, %f, %f, %f}, // %x (%p)\n", vertex[i].xyz[0], vertex[i].xyz[1], vertex[i].xyz[2], vertex[i].w, vertex[i].flags, (void*) &vertex[i]);
+    }
+
+    fprintf(stderr, "----\n");
+#endif
+    while(n--) {
+        bool last_vertex = false;
+        memcpy_vertex(queue + queue_tail, vertex);  /* stage the record in the tail slot; tail only advances for vertices */
+        ++vertex;
+        switch(queue[queue_tail].flags) {
+            case GPU_CMD_POLYHDR:
+                _glSubmitHeaderOrVertex(d, &queue[queue_tail]);  /* headers pass straight through */
+            break;
+            case GPU_CMD_VERTEX_EOL:
+                last_vertex = true;  /* fall through: an end-of-strip vertex is queued like any other */
+            case GPU_CMD_VERTEX:
+                visible_mask = (visible_mask >> 1) | (queue[queue_tail].xyz[2] >= -queue[queue_tail].w) << 2;
+                assert(visible_mask < 8);  /* only three bits are ever populated */
+                queue_tail = (queue_tail + 1) % queue_capacity;  /* fall through */
+            default:
+            break;
+        }
+
+        int counter = (queue_tail - queue_head + queue_capacity) % queue_capacity;  /* number of queued vertices */
+        if(counter < 3) {
+            continue;
+        }
+
+#if CLIP_DEBUG
+        fprintf(stderr, "%d\n", visible_mask);
+#endif
+        Vertex __attribute__((aligned(32))) a, b;  // Scratch vertices
+        switch(visible_mask) {
+            case 0:
+            break;
+            case 7:
+                /* All visible, push the first vertex and move on */
+                _glPerspectiveDivideVertex(&queue[queue_head], h);
+                _glSubmitHeaderOrVertex(d, &queue[queue_head]);
+
+                if(last_vertex) {
+                    /* If this was the last vertex in the strip, we need to flush the queue and then
+                       restart it again */
+
+                    int v1 = (queue_head + 1) % queue_capacity;
+                    int v2 = (queue_head + 2) % queue_capacity;
+
+                    _glPerspectiveDivideVertex(&queue[v1], h);
+                    _glSubmitHeaderOrVertex(d, &queue[v1]);
+
+                    _glPerspectiveDivideVertex(&queue[v2], h);
+                    _glSubmitHeaderOrVertex(d, &queue[v2]);
+                }
+            break;
+            case 1:
+                /* First vertex was visible */
+                {
+                        Vertex* v0 = &queue[queue_head];  /* divided in place: v0 leaves the queue after this triangle */
+                        Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
+                        Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
+
+                        _glClipEdge(v0, v1, &a);
+                        _glClipEdge(v2, v0, &b);
+                        a.flags = GPU_CMD_VERTEX;
+
+                        /* If v2 was the last in the strip, then b should be. If it wasn't
+                        we'll create a degenerate triangle by adding b twice in a row so that the
+                        strip processing will continue correctly after crossing the plane so it can
+                        cross back*/
+                        b.flags = v2->flags;
+
+                        _glPerspectiveDivideVertex(v0, h);
+                        _glPerspectiveDivideVertex(&a, h);
+                        _glPerspectiveDivideVertex(&b, h);
+
+                        _glSubmitHeaderOrVertex(d, v0);
+                        _glSubmitHeaderOrVertex(d, &a);
+                        _glSubmitHeaderOrVertex(d, &b);
+                        _glSubmitHeaderOrVertex(d, &b);
+                }
+            break;
+            case 2:
+                /* Second vertex was visible. In this case we need to create a triangle and produce
+                two new vertices: 1-2, and 2-3. */
+                {
+                        const Vertex* v0 = &queue[queue_head];
+                        Vertex __attribute__((aligned(32))) v1 = queue[(queue_head + 1) % queue_capacity];  /* copy: the queued v1 must stay undivided for the next triangle */
+                        const Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
+
+                        _glClipEdge(v0, &v1, &a);
+                        _glClipEdge(&v1, v2, &b);
+                        a.flags = GPU_CMD_VERTEX;
+                        b.flags = v2->flags;
+
+                        _glPerspectiveDivideVertex(&v1, h);  /* the visible vertex, not the clipped-away v0 */
+                        _glPerspectiveDivideVertex(&a, h);
+                        _glPerspectiveDivideVertex(&b, h);
+
+                        _glSubmitHeaderOrVertex(d, &a);
+                        _glSubmitHeaderOrVertex(d, &v1);
+                        _glSubmitHeaderOrVertex(d, &b);
+                }
+            break;
+            case 3:  /* First and second vertex were visible */
+                    {
+                        Vertex* v0 = &queue[queue_head];
+                        Vertex __attribute__((aligned(32))) v1 = queue[(queue_head + 1) % queue_capacity];  /* copy: the queued v1 is reused by the next triangle */
+                        Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
+
+                        _glClipEdge(&v1, v2, &a);
+                        _glClipEdge(v2, v0, &b);
+
+                        a.flags = v2->flags;  /* a is submitted last, so it inherits any end-of-strip flag */
+                        b.flags = GPU_CMD_VERTEX;
+
+                        _glPerspectiveDivideVertex(v0, h);
+                        _glPerspectiveDivideVertex(&v1, h);
+                        _glPerspectiveDivideVertex(&a, h);
+                        _glPerspectiveDivideVertex(&b, h);
+
+                        _glSubmitHeaderOrVertex(d, v0);
+                        _glSubmitHeaderOrVertex(d, &v1);
+                        _glSubmitHeaderOrVertex(d, &b);
+                        _glSubmitHeaderOrVertex(d, &v1);
+                        _glSubmitHeaderOrVertex(d, &a);
+                }
+            break;
+            case 4:
+                /* Third vertex was visible. */
+                {
+                        Vertex* v0 = &queue[queue_head];
+                        Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
+                        Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity];  /* copy: the queued v2 is reused by later triangles */
+
+                        _glClipEdge(&v2, v0, &a);
+                        _glClipEdge(v1, &v2, &b);
+                        a.flags = GPU_CMD_VERTEX;
+                        b.flags = GPU_CMD_VERTEX;
+
+                        _glPerspectiveDivideVertex(&v2, h);
+                        _glPerspectiveDivideVertex(&a, h);
+                        _glPerspectiveDivideVertex(&b, h);
+
+                        _glSubmitHeaderOrVertex(d, &a);
+                        _glSubmitHeaderOrVertex(d, &a);
+                        _glSubmitHeaderOrVertex(d, &b);
+                        _glSubmitHeaderOrVertex(d, &v2);
+                }
+            break;
+            case 5:  /* First and third vertex were visible */
+                {
+                        Vertex* v0 = &queue[queue_head];
+                        Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
+                        Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity];  /* copy: the queued v2 is reused by later triangles */
+
+                        _glClipEdge(v0, v1, &a);
+                        _glClipEdge(v1, &v2, &b);
+                        a.flags = GPU_CMD_VERTEX;
+                        b.flags = GPU_CMD_VERTEX;
+
+                        _glPerspectiveDivideVertex(v0, h);
+                        _glPerspectiveDivideVertex(&v2, h);
+                        _glPerspectiveDivideVertex(&a, h);
+                        _glPerspectiveDivideVertex(&b, h);
+
+                        _glSubmitHeaderOrVertex(d, v0);
+                        _glSubmitHeaderOrVertex(d, &a);
+                        uint32_t v2_flags = v2.flags;
+                        v2.flags = GPU_CMD_VERTEX;  /* first copy of v2 is mid-strip, so it must not carry an end-of-strip flag */
+                        _glSubmitHeaderOrVertex(d, &v2);
+                        v2.flags = v2_flags;  /* second copy gets the original flag back */
+                        _glSubmitHeaderOrVertex(d, &b);
+                        _glSubmitHeaderOrVertex(d, &v2);
+                }
+            break;
+            case 6:  /* Second and third vertex were visible */
+                {
+                        Vertex* v0 = &queue[queue_head];
+                        Vertex __attribute__((aligned(32))) v1 = queue[(queue_head + 1) % queue_capacity];  /* copies: both queued vertices are reused by later triangles */
+                        Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity];
+
+                        _glClipEdge(v0, &v1, &a);
+                        _glClipEdge(&v2, v0, &b);
+
+                        a.flags = GPU_CMD_VERTEX;
+                        b.flags = GPU_CMD_VERTEX;
+
+                        _glPerspectiveDivideVertex(&v1, h);
+                        _glPerspectiveDivideVertex(&v2, h);
+                        _glPerspectiveDivideVertex(&a, h);
+                        _glPerspectiveDivideVertex(&b, h);
+
+                        _glSubmitHeaderOrVertex(d, &a);
+                        _glSubmitHeaderOrVertex(d, &v1);
+                        _glSubmitHeaderOrVertex(d, &b);
+                        _glSubmitHeaderOrVertex(d, &v1);
+                        _glSubmitHeaderOrVertex(d, &v2);
+                }
+            break;
+            default:
+                break;
+        }
+
+        if(last_vertex) {
+            visible_mask = queue_head = queue_tail = 0;  /* strip finished: empty the queue for the next one */
+        } else {
+            queue_head = (queue_head + 1) % queue_capacity;  /* drop the oldest vertex, keep the other two */
+        }
+    }
+}
+
+
+struct VertexTmpl {  /* Position-only description of a vertex; make_vertices expands it into a full Vertex */
+    VertexTmpl(float x, float y, float z, float w):
+        x(x), y(y), z(z), w(w) {}
+
+    float x, y, z, w;  /* copied into Vertex::xyz[0..2] and Vertex::w */
+};
+
+std::vector<Vertex> make_vertices(const std::vector<VertexTmpl>& verts) {  /* Returns a GPU_CMD_POLYHDR record followed by one Vertex per template, the last flagged GPU_CMD_VERTEX_EOL */
+    std::vector<Vertex> result;
+    Vertex r = Vertex();  /* value-initialised: the header and every bgra field would otherwise hold indeterminate bytes that the clipper later blends */
+
+    r.flags = GPU_CMD_POLYHDR;
+    result.push_back(r);
+
+    for(auto& v: verts) {
+        r.flags = GPU_CMD_VERTEX;
+        r.xyz[0] = v.x;
+        r.xyz[1] = v.y;
+        r.xyz[2] = v.z;
+        r.uv[0] = 0.0f;
+        r.uv[1] = 0.0f;
+        r.w = v.w;
+
+        result.push_back(r);
+    }
+
+    result.back().flags = GPU_CMD_VERTEX_EOL;  /* verts is expected to be non-empty; with no vertices this would re-flag the header */
+    return result;
+}
+
+template<typename T, typename U>
+void check_equal(const T& lhs, const U& rhs) {  /* Test assertion: throws std::runtime_error when lhs != rhs */
+    if(lhs != rhs) {  /* T and U may differ, e.g. a size_t compared with an int literal */
+        throw std::runtime_error("Assertion failed");
+    }
+}
+
+template<>
+void check_equal(const Vertex& lhs, const Vertex& rhs) {  /* Vertex overload: compares position only; flags, uv and bgra are ignored */
+    if(lhs.xyz[0] != rhs.xyz[0] ||
+       lhs.xyz[1] != rhs.xyz[1] ||
+       lhs.xyz[2] != rhs.xyz[2] ||
+       lhs.w != rhs.w) {  /* exact float comparison: intended for vertices that are copies of each other */
+        throw std::runtime_error("Assertion failed");
+    }
+}
+
+
+bool test_clip_case_001() {  /* Exercises visible_mask == 1; always returns true, a failed check throws instead */
+    /* The first vertex is visible only */
+    sent.clear();
+
+    auto data = make_vertices({
+        {0.000000, -2.414213, 3.080808, 5.000000},
+        {-4.526650, -2.414213, -7.121212, -5.000000},
+        {4.526650, -2.414213, -7.121212, -5.000000}
+    });
+
+    SceneListSubmit(&data[0], data.size());
+
+    check_equal(sent.size(), 5);  /* header, v0, one clipped vertex, then the other clipped vertex twice */
+    check_equal(sent[0].flags, GPU_CMD_POLYHDR);
+    check_equal(sent[1].flags, GPU_CMD_VERTEX);
+    check_equal(sent[2].flags, GPU_CMD_VERTEX);
+
+    // Because we're sending a single triangle, we end up sending a
+    // degenerate final vert. But if we were sending more than one triangle
+    // this would be GPU_CMD_VERTEX twice
+    check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
+    check_equal(sent[4].flags, GPU_CMD_VERTEX_EOL);
+    check_equal(sent[3], sent[4]);
+    return true;
+}
+
+bool test_clip_case_010() {  /* Exercises visible_mask == 2; always returns true, a failed check throws instead */
+    /* The second vertex is visible only */
+    sent.clear();
+
+    auto data = make_vertices({
+        {-4.526650, -2.414213, -7.121212, -5.000000},
+        {0.000000, -2.414213, 3.080808, 5.000000},
+        {4.526650, -2.414213, -7.121212, -5.000000}
+    });
+
+    SceneListSubmit(&data[0], data.size());
+
+    check_equal(sent.size(), 4);  /* header plus a single triangle; only the flags are checked, not positions */
+    check_equal(sent[0].flags, GPU_CMD_POLYHDR);
+    check_equal(sent[1].flags, GPU_CMD_VERTEX);
+    check_equal(sent[2].flags, GPU_CMD_VERTEX);
+    check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
+    return true;
+}
+
+bool test_clip_case_100() {  /* Exercises visible_mask == 4; always returns true, a failed check throws instead */
+    /* The third vertex is visible only */
+    sent.clear();
+
+    auto data = make_vertices({
+        {-4.526650, -2.414213, -7.121212, -5.000000},
+        {4.526650, -2.414213, -7.121212, -5.000000},
+        {0.000000, -2.414213, 3.080808, 5.000000}
+    });
+
+    SceneListSubmit(&data[0], data.size());
+
+    check_equal(sent.size(), 5);  /* header, first clipped vertex twice, second clipped vertex, then v2 */
+    check_equal(sent[0].flags, GPU_CMD_POLYHDR);
+    check_equal(sent[1].flags, GPU_CMD_VERTEX);
+    check_equal(sent[2].flags, GPU_CMD_VERTEX);
+
+    // Here the degenerate is at the START of the output: the vertex clipped
+    // on the v2-v0 edge is submitted twice (sent[1] and sent[2]), so the strip
+    // ends with a plain vertex followed by the visible, final vertex
+    check_equal(sent[3].flags, GPU_CMD_VERTEX);
+    check_equal(sent[4].flags, GPU_CMD_VERTEX_EOL);
+    check_equal(sent[1], sent[2]);
+    return true;
+}
+
+bool test_clip_case_110() {  /* Exercises visible_mask == 6; always returns true, a failed check throws instead */
+    /* 2nd and 3rd visible */
+    sent.clear();
+
+    auto data = make_vertices({
+        {0.0, -2.414213, -7.121212, -5.000000},
+        {-4.526650, -2.414213, 3.080808, 5.000000},
+        {4.526650, -2.414213, 3.080808, 5.000000}
+    });
+
+    SceneListSubmit(&data[0], data.size());
+
+    check_equal(sent.size(), 6);  /* header plus five strip vertices (the clipped quad) */
+    check_equal(sent[0].flags, GPU_CMD_POLYHDR);
+    check_equal(sent[1].flags, GPU_CMD_VERTEX);
+    check_equal(sent[2].flags, GPU_CMD_VERTEX);
+    check_equal(sent[3].flags, GPU_CMD_VERTEX);
+    check_equal(sent[4].flags, GPU_CMD_VERTEX);
+    check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
+    check_equal(sent[2], sent[4]);  /* the second input vertex is submitted twice */
+    return true;
+}
+
+bool test_clip_case_011() {  /* Exercises visible_mask == 3; always returns true, a failed check throws instead */
+    /* 1st and 2nd visible */
+    sent.clear();
+
+    auto data = make_vertices({
+        {-4.526650, -2.414213, 3.080808, 5.000000},
+        {4.526650, -2.414213, 3.080808, 5.000000},
+        {0.0, -2.414213, -7.121212, -5.000000}
+    });
+
+    SceneListSubmit(&data[0], data.size());
+
+    check_equal(sent.size(), 6);  /* header plus five strip vertices (the clipped quad) */
+    check_equal(sent[0].flags, GPU_CMD_POLYHDR);
+    check_equal(sent[1].flags, GPU_CMD_VERTEX);
+    check_equal(sent[2].flags, GPU_CMD_VERTEX);
+    check_equal(sent[3].flags, GPU_CMD_VERTEX);
+    check_equal(sent[4].flags, GPU_CMD_VERTEX);
+    check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
+    check_equal(sent[2], sent[4]);  /* the second input vertex is submitted twice */
+    return true;
+}
+
+bool test_clip_case_101() {  /* Exercises visible_mask == 5; always returns true, a failed check throws instead */
+    /* 1st and 3rd visible */
+    sent.clear();
+
+    auto data = make_vertices({
+        {-4.526650, -2.414213, 3.080808, 5.000000},
+        {0.0, -2.414213, -7.121212, -5.000000},
+        {4.526650, -2.414213, 3.080808, 5.000000},
+    });
+
+    SceneListSubmit(&data[0], data.size());
+
+    check_equal(sent.size(), 6);  /* header plus five strip vertices (the clipped quad) */
+    check_equal(sent[0].flags, GPU_CMD_POLYHDR);
+    check_equal(sent[1].flags, GPU_CMD_VERTEX);
+    check_equal(sent[2].flags, GPU_CMD_VERTEX);
+    check_equal(sent[3].flags, GPU_CMD_VERTEX);
+    check_equal(sent[4].flags, GPU_CMD_VERTEX);
+    check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
+    check_equal(sent[3], sent[5]);  /* the third input vertex is submitted twice: once mid-strip, once as the end */
+    return true;
+}
+
+bool test_clip_case_111() {  /* Exercises visible_mask == 7; always returns true, a failed check throws instead */
+    /* All three vertices visible */
+    sent.clear();
+
+    auto data = make_vertices({
+        {-4.526650, -2.414213, 3.080808, 5.000000},
+        {0.0, -2.414213, -7.121212, 8.000000},
+        {4.526650, -2.414213, 3.080808, 5.000000},
+    });
+
+    SceneListSubmit(&data[0], data.size());
+
+    check_equal(sent.size(), 4);  /* header plus the three input vertices, nothing added by clipping */
+    check_equal(sent[0].flags, GPU_CMD_POLYHDR);
+    check_equal(sent[1].flags, GPU_CMD_VERTEX);
+    check_equal(sent[2].flags, GPU_CMD_VERTEX);
+    check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
+    return true;
+}
+
+
+bool test_start_behind() {  /* Smoke test: nothing in sent is checked, it only has to run through SceneListSubmit */
+    /* Triangle behind the plane, but the strip continues in front */
+    sent.clear();
+
+    auto data = make_vertices({
+      {-3.021717, -2.414213, -10.155344, -9.935254},
+      {5.915236, -2.414213, -9.354721, -9.136231},
+      {-5.915236, -2.414213, -0.264096, -0.063767},
+      {3.021717, -2.414213, 0.536527, 0.735255},
+      {-7.361995, -2.414213, 4.681529, 4.871976},
+      {1.574958, -2.414213, 5.482152, 5.670999},
+    });
+
+    SceneListSubmit(&data[0], data.size());
+
+    return true;
+}
+
+bool test_longer_strip() {  /* Smoke test on a six-vertex strip: nothing in sent is checked afterwards */
+    sent.clear();
+
+    auto data = make_vertices({
+        {-4.384623, -2.414213, -5.699644, -5.488456},
+        {4.667572, -2.414213, -5.621354, -5.410322},
+        {-4.667572, -2.414213, 4.319152, 4.510323},
+        {4.384623, -2.414213, 4.397442, 4.588456},
+        {-4.809045, -2.414213, 9.328549, 9.509711},
+        {4.243149, -2.414213, 9.406840, 9.587846},
+    });
+
+    SceneListSubmit(&data[0], data.size());
+
+    return true;
+}
+
+int main(int argc, char* argv[]) {  /* Runs every test in order; argc and argv are unused */
+    // test_clip_case_000() is not defined in this file, so the nothing-visible case is not run
+    test_clip_case_001();  /* return values are ignored: a failing check throws and nothing here catches it */
+    test_clip_case_010();
+    test_clip_case_100();
+    test_clip_case_110();
+    test_clip_case_011();
+    test_clip_case_101();
+    test_clip_case_111();
+
+    test_start_behind();  /* these two make no checks of their own */
+    test_longer_strip();
+
+    return 0;
+}