Compare commits

...

82 Commits

Author SHA1 Message Date
mrq
2601afb5f3 experimental support for parsing from a float16 mesh (or a float32-quantized-as-ushort mesh) 2023-10-26 13:22:47 -05:00
Luke Benstead
0efe4c6cef Add missing defines 2023-10-19 22:26:13 +01:00
Luke Benstead
744dfb32f7 Merge branch 'fix-glshort-uv-read' into 'master'
Convert GL_SHORT to proper float on conversion

See merge request simulant/GLdc!109
2023-09-26 18:51:30 +00:00
Spencer Elliott
79172452f2 Convert GL_SHORT to proper float on conversion 2023-09-26 18:51:29 +00:00
Luke Benstead
420e2d75f2 Merge branch 'fix-glulookat-alignment' into 'master'
Fixed alignment for matrix passed into UploadMatrix4x4 in gluLookAt

See merge request simulant/GLdc!108
2023-09-20 15:47:38 +00:00
Spencer Elliott
202f546848 Fixed alignment for matrix passed into UploadMatrix4x4 in gluLookAt 2023-09-20 10:18:55 -05:00
Luke Benstead
d054dde785 Fix paletted texture glitch 2023-09-12 21:11:05 +01:00
Luke Benstead
00b4468928 Remove unused function 2023-09-11 20:42:09 +01:00
Luke Benstead
f0d799d14f Merge branch 'texture-refactor' into 'master'
Drastically refactor glTexImage2D

See merge request simulant/GLdc!107
2023-09-11 19:39:03 +00:00
Luke Benstead
1bf8554926 More erquirements 2023-09-11 20:34:18 +01:00
Luke Benstead
9bc6da9fba Add some requirements 2023-09-11 19:55:26 +01:00
Luke Benstead
a1536cba44 Add dependency 2023-09-11 17:31:08 +01:00
Luke Benstead
3eee140add Fix stage 2023-09-11 17:29:59 +01:00
Luke Benstead
43d64a4957 Fix twiddling issues 2023-09-11 17:27:04 +01:00
Luke Benstead
951ece6d19 Add test job to CI 2023-09-11 17:25:47 +01:00
Luke Benstead
61e5a7a2a6 More twiddling work 2023-09-10 19:41:25 +01:00
Luke Benstead
3308a57e59 Implement defragmenting the memory 2023-09-08 17:49:46 +01:00
Luke Benstead
db9e1cd424 Fall back to unaligned if there's no more aligned spaced 2023-09-08 09:13:33 +01:00
Luke Benstead
6eb079228e Fix infinite loop 2023-09-06 21:01:37 +01:00
Luke Benstead
7ce01ad93f Fix up paletted textures 2023-09-06 08:01:01 +01:00
Luke Benstead
12bd6f474f Fix issues with the allocator 2023-09-06 07:59:40 +01:00
Luke Benstead
e5a4f4f716 Continue fixing up paletted texture issues 2023-09-03 21:12:11 +01:00
Luke Benstead
4d39e19ed5 Start repairing paletted textures 2023-09-02 21:10:42 +01:00
Luke Benstead
49a0e103cb Fix up CI 2023-09-01 20:34:29 +01:00
Luke Benstead
9cedc81850 Fix broken merge 2023-09-01 20:29:24 +01:00
Luke Benstead
0e31aa3d27 Tweak 2023-09-01 20:25:27 +01:00
Luke Benstead
5e7b33797d Perf improvements and fixes 2023-09-01 20:25:27 +01:00
Luke Benstead
b19b9d498a Clean up the allocator code 2023-09-01 20:25:21 +01:00
Luke Benstead
36de063756 Allow configuring automatic texture twiddling in glKosInitEx
Defaults to enabled.
2023-09-01 20:23:55 +01:00
Luke Benstead
246cb997da Add optional dcprof to prof_texture_upload 2023-09-01 20:23:55 +01:00
Luke Benstead
cfbaea4a46 Don't calculate things twice 2023-09-01 08:52:01 +01:00
Luke Benstead
4b47f6878f Remove yalloc 2023-09-01 08:34:48 +01:00
Luke Benstead
3248499d5a Switch to the new allocator 2023-08-31 21:21:14 +01:00
Luke Benstead
fd9a9d1c25 Merge commit 'f49a98ab543b1be0049e07456fb23022435ba450' into texture-refactor 2023-08-31 20:49:42 +01:00
Luke Benstead
f49a98ab54 Fix allocate and free 2023-08-31 20:49:34 +01:00
Luke Benstead
f278777c0e WIP: Start implementing new allocator 2023-08-31 08:47:00 +01:00
Luke Benstead
34173d926c Drastically refactor glTexImage2D 2023-08-31 08:47:00 +01:00
Luke Benstead
77531ca347 Drastically refactor glTexImage2D 2023-08-26 20:34:11 +01:00
Luke Benstead
a05e1b01fa Make glGenerateMipmap the function, and EXT the alias 2023-07-26 20:33:12 +01:00
Luke Benstead
3dcbbdbde6 Add logging 2023-06-09 20:35:00 +01:00
Luke Benstead
92ee4f616d Set mode to PAL@50 if it's a European console without VGA 2023-06-06 21:05:52 +01:00
Luke Benstead
e7574bca1d Fix issues with GL_QUADS 2023-05-31 18:27:17 +01:00
Luke Benstead
026bdeff09 Fix infuriating memory corruption bug 2023-05-20 07:47:39 +01:00
Luke Benstead
f6713bc778 Speed up the software renderer 2023-05-20 07:45:45 +01:00
Luke Benstead
5865d57384 Wait for the store queues to finish when we've uploaded everything 2023-05-20 07:45:16 +01:00
Luke Benstead
1e3896e699 Clean up 2023-05-20 07:44:55 +01:00
Luke Benstead
bd47f333d6 Add more assertions 2023-05-20 07:43:57 +01:00
Luke Benstead
e57b503355 Fix memory errors 2023-05-18 16:44:11 +01:00
Luke Benstead
d81472ef57 Liberally assert stuff 2023-05-17 20:39:58 +01:00
Luke Benstead
462eb40d7a Fix bugs in texture deletion 2023-05-17 20:39:49 +01:00
Luke Benstead
c4c0bf4239 Fix an off-by-one error 2023-05-17 20:39:27 +01:00
Luke Benstead
9037d157d5 Clean up 2023-05-17 20:38:21 +01:00
Luke Benstead
52a0215ed8 Make sure we initialize texture 0. We don't actually use it yet
(binding zero disables texturing) but I believe the spec says that
texture 0 is the "default texture" and is an actual texture object.
2023-05-17 20:36:59 +01:00
Luke Benstead
a5891056db Many bug fixes and optimisations 2023-05-16 13:31:44 +01:00
Luke Benstead
9cffe14ad6 Clean up aligned vector 2023-05-12 20:51:36 +01:00
Luke Benstead
e683b8becb Optimisations 2023-05-11 20:00:13 +01:00
Luke Benstead
cba2fb7ceb Fix a memory corruption issue 2023-05-11 15:22:46 +01:00
Luke Benstead
c754c5c338 Ensure RelWithDebInfo builds use release flags + debugging 2023-05-11 15:22:27 +01:00
Luke Benstead
452cda5a3b Fix backface culling 2023-04-28 19:49:01 +01:00
Luke Benstead
9e1b1bc40a Merge branch 'clipping-rewrite-for-the-last-time-ffs' into 'master'
Restructure clipping to be much MUCH faster in the visible case

See merge request simulant/GLdc!105
2023-04-26 20:00:17 +00:00
Luke Benstead
0f65eab86a Much faster clipping 2023-04-26 20:50:43 +01:00
Luke Benstead
1a678d2c8d Undo some bad changes 2023-04-23 21:00:01 +01:00
Luke Benstead
0923b5c601 Further optimisations 2023-04-23 20:16:15 +01:00
Luke Benstead
2ec7055547 Optimisations 2023-04-23 07:44:09 +01:00
Luke Benstead
9cc52a01fe Better clipping 2023-04-22 20:47:45 +01:00
Luke Benstead
095ebf2790 Fix final bug 2023-04-22 11:37:42 +01:00
Luke Benstead
baa275b41b Fix a bunch of issues with clipping (almost working) 2023-04-21 20:38:21 +01:00
Luke Benstead
72c375f87c Fix some things 2023-04-21 11:39:37 +01:00
Luke Benstead
e54494e995 More clipping work 2023-04-20 20:45:59 +01:00
Luke Benstead
c5ce81a38d WIP: Restructure clipping to be much MUCH faster in the visible case
This currently only works with triangles, anything more and it crashes
due to me not queuing subsequent vertices in the strip correctly
2023-04-19 20:57:44 +01:00
Luke Benstead
34448939a4 Merge branch 'update-cubes-cmakelists' into 'master'
Update cubes sample + cmakelists

See merge request simulant/GLdc!104
2023-04-17 18:24:56 +00:00
Dave
b6249e9ca4 Update README 2023-04-17 19:35:06 +02:00
Dave
1a181f702c Update CMakeLists with CXX flags (Debug and Release) 2023-04-17 19:34:56 +02:00
Dave
3b53691e4b Change float colors to GLubyte in cubes sample 2023-04-17 19:33:59 +02:00
Luke Benstead
25d215dad3 Add some compiler flags for lolz 2023-04-11 20:46:44 +01:00
Luke Benstead
307d371c55 Various store queue shinanigans 2023-04-11 20:46:31 +01:00
Luke Benstead
4ad58bea89 Optimise glLoadMatrixf 2023-04-11 20:46:12 +01:00
Luke Benstead
8e60b18f29 Merge branch 'add-new-cubes-sample' into 'master'
Add new cubes sample

See merge request simulant/GLdc!103
2023-04-08 20:32:18 +00:00
David Reichelt
190b4ecfb7 Add include for printf 2023-04-08 20:07:21 +00:00
David Reichelt
a4b778063a Make sure ZNEAR_CLIPPING_ENABLED is defined in software renderer 2023-04-08 20:00:33 +00:00
Dave Reichelt
df9a12bbd6 Added new cubes sample
- A button toggles between glDrawElements and glDrawArrays
- B button toggles glBlend
- Start quits the sample
- Every 10 seconds a log with stats will be sent to the terminal
2023-04-08 21:42:03 +02:00
Luke Benstead
6ee9a823c1 Don't update lights unnecessarily 2023-03-23 20:01:41 +00:00
54 changed files with 13013 additions and 2845 deletions

View File

@ -1,5 +1,6 @@
stages:
- build
- test
build:sh4-gcc:
stage: build
@ -17,11 +18,28 @@ build:sh4-gcc:
build:x86-gcc:
stage: build
image: fedora:34
image: fedora:38
before_script:
- sudo dnf install -y cmake gcc gcc-c++ SDL2-devel glibc-devel pkgconf-pkg-config glibc-devel.i686 SDL2-devel.i686
- sudo dnf install -y cmake gcc gcc-c++ SDL2.i686 SDL2-devel.x86_64 glibc-devel glibc-devel.i686 SDL2-devel.i686 pkgconf-pkg-config.i686 pkgconf-pkg-config.x86_64
script:
- mkdir builddir
- cd builddir
- cmake -DCMAKE_BUILD_TYPE=Release ..
- make
artifacts:
paths:
- builddir/tests/gldc_tests
test:x86-gcc:
stage: test
image: fedora:38
dependencies:
- build:x86-gcc
before_script:
- sudo dnf install -y cmake gcc gcc-c++ SDL2.i686 SDL2-devel glibc-devel pkgconf-pkg-config glibc-devel.i686 SDL2-devel.i686 pkgconf-pkg-config.i686
script:
- cd builddir/tests/
- SDL_VIDEODRIVER=dummy ./gldc_tests --junit-xml=report.xml
artifacts:
reports:
junit: builddir/tests/report.xml

View File

@ -1,6 +1,8 @@
cmake_minimum_required(VERSION 3.0)
cmake_minimum_required(VERSION 3.9)
project(GLdc)
set(CMAKE_VERBOSE_MAKEFILE ON)
# set the default backend
if(PLATFORM_DREAMCAST)
set(BACKEND "kospvr" CACHE STRING "Backend to use")
@ -8,6 +10,9 @@ else()
set(BACKEND "software" CACHE STRING "Backend to use")
endif()
include(CheckIPOSupported)
check_ipo_supported(RESULT FLTO_SUPPORTED OUTPUT FLTO_ERROR)
# List of possible backends
set_property(CACHE BACKEND PROPERTY STRINGS kospvr software)
@ -17,6 +22,7 @@ string(TOUPPER ${BACKEND} BACKEND_UPPER)
add_definitions(-DBACKEND_${BACKEND_UPPER})
set(CMAKE_C_STANDARD 99)
set(CMAKE_CXX_STANDARD 11)
include_directories(include)
@ -29,15 +35,33 @@ else()
check_c_compiler_flag("-mfsca" COMPILER_HAS_FSCA)
if(COMPILER_HAS_FSRRA)
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -mfsrra")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfsrra")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -mfsrra")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -mfsrra")
endif()
if(COMPILER_HAS_FSCA)
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -mfsca")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfsca")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -mfsca")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -mfsca")
endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -ffp-contract=fast -ffast-math")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ffast-math")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -ffp-contract=fast -ffast-math")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -ffast-math")
endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -fexpensive-optimizations")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -fexpensive-optimizations -fomit-frame-pointer -finline-functions")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++14 -O3 -g0 -s -fomit-frame-pointer -fstrict-aliasing")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 -fexpensive-optimizations -fomit-frame-pointer -finline-functions")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -std=c++14 -O3 -fomit-frame-pointer -fstrict-aliasing")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 -g -Wall -Wextra")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g -Wall -Wextra")
set(
SOURCES
@ -56,7 +80,7 @@ set(
GL/state.c
GL/texture.c
GL/util.c
GL/yalloc/yalloc.c
GL/alloc/alloc.c
${CMAKE_CURRENT_BINARY_DIR}/version.c
)
@ -87,6 +111,10 @@ endif()
add_library(GLdc STATIC ${SOURCES})
if(FLTO_SUPPORTED)
set_property(TARGET GLdc PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()
if(NOT PLATFORM_DREAMCAST)
set_target_properties(GLdc PROPERTIES
COMPILE_OPTIONS "-m32"
@ -110,6 +138,13 @@ function(gen_sample sample)
add_executable(${sample} ${SAMPLE_SRCS})
if(FLTO_SUPPORTED)
# FIXME: Cubes + LTO causes an ICE
if(NOT ${sample} MATCHES "cubes")
set_property(TARGET ${sample} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()
endif()
if(PLATFORM_DREAMCAST)
if(EXISTS "${CMAKE_SOURCE_DIR}/samples/${sample}/romdisk")
message("Generating romdisk for sample: ${sample}")
@ -140,6 +175,8 @@ function(gen_sample sample)
endif()
endfunction()
add_subdirectory(tests)
gen_sample(blend_test samples/blend_test/main.c)
gen_sample(depth_funcs samples/depth_funcs/main.c)
gen_sample(depth_funcs_alpha_testing samples/depth_funcs_alpha_testing/main.c samples/depth_funcs_alpha_testing/gl_png.c)
@ -170,11 +207,14 @@ gen_sample(zclip_triangle samples/zclip_triangle/main.c)
gen_sample(zclip_trianglestrip samples/zclip_trianglestrip/main.c)
gen_sample(scissor samples/scissor/main.c)
gen_sample(polymark samples/polymark/main.c)
gen_sample(cubes samples/cubes/main.cpp)
gen_sample(zclip_test tests/zclip/main.cpp)
if(PLATFORM_DREAMCAST)
gen_sample(trimark samples/trimark/main.c)
gen_sample(quadmark samples/quadmark/main.c samples/profiler.c)
gen_sample(prof_texture_upload samples/prof_texture_upload/main.c samples/profiler.c)
else()
gen_sample(quadmark samples/quadmark/main.c)
gen_sample(prof_texture_upload samples/prof_texture_upload/main.c)
endif()

534
GL/alloc/alloc.c Normal file
View File

@ -0,0 +1,534 @@
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include "alloc.h"
/* This allocator is designed so that ideally all allocations larger
* than 2k, fall on a 2k boundary. Smaller allocations will
* never cross a 2k boundary.
*
* House keeping is stored in RAM to avoid reading back from the
* VRAM to check for usage. Headers can't be easily stored in the
* blocks anyway as they have to be 2k aligned (so you'd need to
* store them in reverse or something)
*
* Defragmenting the pool will move larger allocations first, then
* smaller ones, recursively until you tell it to stop, or until things
* stop moving.
*
* The maximum pool size is 8M, made up of:
*
* - 4096 blocks of 2k
* - each with 8 sub-blocks of 256 bytes
*
* Why?
*
* The PVR performs better if textures don't cross 2K memory
* addresses, so we try to avoid that. Obviously we can't
* if the allocation is > 2k, but in that case we can at least
* align with 2k and the VQ codebook (which is usually 2k) will
* be in its own page.
*
* The smallest PVR texture allowed is 8x8 at 16 bit (so 128 bytes)
* but we're unlikely to use too many of those, so having a min sub-block
* size of 256 should be OK (a 16x16 image is 512, so two sub-blocks).
*
* We could go down to 128 bytes if wastage is an issue, but then we have
* to store double the number of usage markers.
*
* FIXME:
*
* - Only operates on one pool (ignores what you pass)
*/
#include <assert.h>
#include <stdio.h>
#define EIGHT_MEG (8 * 1024 * 1024)
#define TWO_KILOBYTES (2 * 1024)
#define BLOCK_COUNT (EIGHT_MEG / TWO_KILOBYTES)
#define ALLOC_DEBUG 0
#if ALLOC_DEBUG
#define DBG_MSG(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
#else
#define DBG_MSG(fmt, ...) do {} while (0)
#endif
/* Round `n` up to the next multiple of `multiple`.
 *
 * Returns `n` unchanged when it is already an exact multiple.
 * `multiple` must be non-zero (asserted). */
static inline intptr_t round_up(intptr_t n, int multiple)
{
    /* BUG FIX: the original asserted AFTER computing `n % multiple`,
     * so a zero multiple divided by zero before the check fired. */
    assert(multiple);

    if((n % multiple) == 0) {
        return n;
    }

    return ((n + multiple - 1) / multiple) * multiple;
}
/* Singly-linked list node tracking one live allocation. Kept in
 * regular RAM so usage can be checked without reading back from the
 * pool memory (see the design notes at the top of the file). */
struct AllocEntry {
    void* pointer;            /* Address handed out to the caller */
    size_t size;              /* Requested size in bytes (not rounded up) */
    struct AllocEntry* next;  /* Next entry; list is kept ordered by size, descending */
};
typedef struct {
    /* This is a usage bitmask for each block. A block
     * is divided into 8 x 256 byte subblocks. If a block
     * is entirely used, its value will be 255, if
     * it's entirely free then it will be 0.
     * A set bit therefore marks a USED 256-byte subblock. */
    uint8_t block_usage[BLOCK_COUNT];
    uint8_t* pool;          // Pointer to the memory pool
    size_t pool_size;       // Size of the memory pool
    uint8_t* base_address;  // First 2k aligned address in the pool
    size_t block_count;     // Number of 2k blocks in the pool

    /* It's frustrating that we need to do this dynamically
     * but we need to know the size allocated when we free()...
     * we could store it statically but it would take too much memory
     * (an array of block_index -> block size). The drawback (aside the
     * memory usage) would be that we won't be able to order by size,
     * so defragging will take much more time. */
    struct AllocEntry* allocations;  /* Live allocations, size-descending */
} PoolHeader;

/* The single global pool instance. The `pool` parameter on the public
 * API is currently ignored (see the FIXME in the file header). */
static PoolHeader pool_header = {
    {0}, NULL, 0, NULL, 0, NULL
};
/* Return the first 2k-aligned address inside the pool (where blocks
 * start). `pool` is ignored: only one global pool is supported. */
void* alloc_base_address(void* pool) {
    (void) pool;
    return pool_header.base_address;
}
/* Return the number of usable 2k blocks in the pool. `pool` is
 * ignored: only one global pool is supported. */
size_t alloc_block_count(void* pool) {
    (void) pool;
    return pool_header.block_count;
}
/* Translate a position in the usage bitmap into a pool address.
 *
 * `block_usage_iterator` points at the usage byte where the scan
 * stopped and `bit_offset` is the bit (subblock) within that byte at
 * which the found run of free subblocks ENDS. The run's first
 * subblock is therefore (end + 1) - required_subblocks from the base.
 * Each subblock is 256 bytes; each usage byte covers 8 subblocks.
 *
 * Optionally writes the starting subblock index to
 * *start_subblock_out when it is non-NULL. */
static inline void* calc_address(
    uint8_t* block_usage_iterator,
    int bit_offset,
    size_t required_subblocks,
    size_t* start_subblock_out
) {
    uintptr_t offset = (block_usage_iterator - pool_header.block_usage) * 8;
    offset += (bit_offset + 1);   /* one past the run's last subblock */
    offset -= required_subblocks; /* back up to the run's first subblock */

    if(start_subblock_out) {
        *start_subblock_out = offset;
    }

    return pool_header.base_address + (offset * 256);
}
void* alloc_next_available_ex(void* pool, size_t required_size, size_t* start_subblock, size_t* required_subblocks);

/* Convenience wrapper around alloc_next_available_ex() that discards
 * the subblock bookkeeping outputs. Returns the address where a
 * `required_size` allocation would be placed, or NULL if none fits. */
void* alloc_next_available(void* pool, size_t required_size) {
    return alloc_next_available_ex(pool, required_size, NULL, NULL);
}
/* Core free-space search.
 *
 * Scans the usage bitmap for a run of enough free 256-byte subblocks
 * to hold `required_size` bytes. Placement rules: allocations of
 * >= 2048 bytes must start on a 2048 boundary; smaller allocations
 * must not straddle one. A run that fits but violates placement is
 * remembered as a "poor option" and returned only if nothing better
 * is found by the end of the scan.
 *
 * Optional outputs: *start_subblock_out receives the run's first
 * subblock index, *required_subblocks_out the run length in
 * subblocks. Returns the run's address, or NULL when no run at all is
 * large enough. `pool` is ignored (single global pool). */
void* alloc_next_available_ex(void* pool, size_t required_size, size_t* start_subblock_out, size_t* required_subblocks_out) {
    (void) pool;

    uint8_t* it = pool_header.block_usage;

    /* Round the byte size up to whole 256-byte subblocks */
    uint32_t required_subblocks = (required_size / 256);
    if(required_size % 256) required_subblocks += 1;

    /* Anything >= 2048 bytes must be aligned to a 2048 boundary */
    bool requires_alignment = required_size >= 2048;

    if(required_subblocks_out) {
        *required_subblocks_out = required_subblocks;
    }

    /* This is a fallback option. If while we're searching we find a possible slot
     * but it's not aligned, or it's straddling a 2k boundary, then we store
     * it here and if we reach the end of the search and find nothing better
     * we use this instead */
    uint8_t* poor_option = NULL;
    size_t poor_start_subblock = 0;

    uint32_t found_subblocks = 0;       /* current run honouring placement rules */
    uint32_t found_poor_subblocks = 0;  /* current run ignoring placement rules */

    for(size_t j = 0; j < pool_header.block_count; ++j, ++it) {
        /* We just need to find enough consecutive blocks */
        if(found_subblocks < required_subblocks) {
            uint8_t t = *it;

            /* Optimisation only. Skip over full blocks */
            if(t == 255) {
                found_subblocks = 0;
                found_poor_subblocks = 0;
            } else {
                /* Now let's see how many consecutive blocks we can find */
                for(int i = 0; i < 8; ++i) {
                    /* MSB-first: bit 7 is the block's first subblock */
                    if((t & 0x80) == 0) {
                        /* A small allocation's run would cross into a new
                         * 2k block here — it must restart at this block */
                        bool block_overflow = (
                            required_size < 2048 && found_subblocks > 0 && i == 0
                        );

                        bool reset_subblocks = (
                            (requires_alignment && found_subblocks == 0 && i != 0) ||
                            block_overflow
                        );

                        if(reset_subblocks) {
                            // Ignore this subblock, because we want the first subblock to be aligned
                            // at a 2048 boundary and this one isn't (i != 0)
                            found_subblocks = 0;
                        } else {
                            found_subblocks++;
                        }

                        /* If we reset the subblocks due to an overflow, we still
                         * want to count this free subblock in our count */
                        if(block_overflow) {
                            found_subblocks++;
                        }

                        found_poor_subblocks++;

                        if(found_subblocks >= required_subblocks) {
                            /* We found space! Now calculate the address */
                            return calc_address(it, i, required_subblocks, start_subblock_out);
                        }

                        /* First rule-breaking run found becomes the fallback */
                        if(!poor_option && (found_poor_subblocks >= required_subblocks)) {
                            poor_option = calc_address(it, i, required_subblocks, &poor_start_subblock);
                        }
                    } else {
                        /* Used subblock: both runs are broken */
                        found_subblocks = 0;
                        found_poor_subblocks = 0;
                    }

                    t <<= 1;
                }
            }
        }
    }

    if(poor_option) {
        if(start_subblock_out) {
            *start_subblock_out = poor_start_subblock;
        }
        return poor_option;
    } else {
        return NULL;
    }
}
/* Initialise the (single, global) allocator over `pool` of `size`
 * bytes. Returns 0 on success, -1 if the allocator is already
 * initialised, `pool` is NULL, or `size` exceeds the 8M maximum. */
int alloc_init(void* pool, size_t size) {
    if(pool_header.pool) {
        return -1;  /* Already initialised */
    }

    /* Robustness: reject a NULL pool up front rather than doing
     * pointer arithmetic on it below. */
    if(!pool) {
        return -1;
    }

    if(size > EIGHT_MEG) {  // FIXME: >= ?
        return -1;
    }

    uint8_t* p = (uint8_t*) pool;

    /* All subblocks start out free */
    memset(pool_header.block_usage, 0, BLOCK_COUNT);
    pool_header.pool = pool;
    pool_header.pool_size = size;

    /* Blocks begin at the first 2k-aligned address inside the pool */
    intptr_t base_address = (intptr_t) pool_header.pool;
    base_address = round_up(base_address, 2048);
    pool_header.base_address = (uint8_t*) base_address;

    /* Only whole 2k blocks after the alignment gap are usable */
    pool_header.block_count = ((p + size) - pool_header.base_address) / 2048;

    pool_header.allocations = NULL;

    assert(((uintptr_t) pool_header.base_address) % 2048 == 0);

    return 0;
}
void alloc_shutdown(void* pool) {
(void) pool;
if(!pool_header.pool) {
return;
}
struct AllocEntry* it = pool_header.allocations;
while(it) {
struct AllocEntry* next = it->next;
free(it);
it = next;
}
memset(&pool_header, 0, sizeof(pool_header));
pool_header.pool = NULL;
}
/* Number of 256-byte subblocks needed to hold `size` bytes
 * (i.e. `size` rounded up to a whole subblock count). */
static inline uint32_t size_to_subblock_count(size_t size) {
    uint32_t whole = size / 256;
    return (size % 256) ? whole + 1 : whole;
}
/* Map a pool address back to its global 256-byte subblock index.
 * `p` must lie within the pool, at or after base_address. */
static inline uint32_t subblock_from_pointer(void* p) {
    uint8_t* ptr = (uint8_t*) p;
    return (ptr - pool_header.base_address) / 256;
}
/* Split a global subblock index into its owning 2k block index (*b)
 * and the bit position 0-7 (*off) within that block's usage byte. */
static inline void block_and_offset_from_subblock(size_t sb, size_t* b, uint8_t* off) {
    *b = sb >> 3;                 /* 8 subblocks per block */
    *off = (uint8_t) (sb & 7u);   /* position within the usage byte */
}
/* Allocate `size` bytes from the pool.
 *
 * Finds the next suitable run of free 256-byte subblocks via
 * alloc_next_available_ex(), marks those subblocks used in the
 * bitmap, and records the allocation in the size-descending
 * allocation list (so the defragmenter visits big blocks first).
 * Returns the allocated address, or NULL if no space was found. */
void* alloc_malloc(void* pool, size_t size) {
    DBG_MSG("Allocating: %d\n", size);

    size_t start_subblock, required_subblocks;
    void* ret = alloc_next_available_ex(pool, size, &start_subblock, &required_subblocks);

    if(ret) {
        size_t block;
        uint8_t offset;
        block_and_offset_from_subblock(start_subblock, &block, &offset);

        uint8_t mask = 0;

        DBG_MSG("Alloc: size: %d, rs: %d, sb: %d, b: %d, off: %d\n", size, required_subblocks, start_subblock, start_subblock / 8, start_subblock % 8);

        /* Toggle any bits for the first block.
         * offset + c stays <= 8 here: small runs never straddle a 2k
         * boundary and large runs start 2k-aligned (offset == 0), per
         * the placement rules in alloc_next_available_ex(). */
        int c = (required_subblocks < 8) ? required_subblocks : 8;
        for(int i = 0; i < c; ++i) {
            mask |= (1 << (7 - (offset + i)));
            required_subblocks--;
        }

        if(mask) {
            pool_header.block_usage[block++] |= mask;
        }

        /* Fill any full blocks in the middle of the allocation */
        while(required_subblocks > 8) {
            pool_header.block_usage[block++] = 255;
            required_subblocks -= 8;
        }

        /* Fill out any trailing subblocks */
        mask = 0;
        for(size_t i = 0; i < required_subblocks; ++i) {
            mask |= (1 << (7 - i));
        }

        if(mask) {
            pool_header.block_usage[block++] |= mask;
        }

        /* Insert allocations in the list by size descending so that when we
         * defrag we can move the larger blocks before the smaller ones without
         * much effort */
        struct AllocEntry* new_entry = (struct AllocEntry*) malloc(sizeof(struct AllocEntry));
        new_entry->pointer = ret;
        new_entry->size = size;
        new_entry->next = NULL;

        struct AllocEntry* it = pool_header.allocations;
        struct AllocEntry* last = NULL;

        if(!it) {
            /* Empty list: this entry becomes the head */
            pool_header.allocations = new_entry;
        } else {
            while(it) {
                if(it->size < size) {
                    /* First strictly-smaller entry: insert before it */
                    if(last) {
                        last->next = new_entry;
                    } else {
                        pool_header.allocations = new_entry;
                    }

                    new_entry->next = it;
                    break;
                } else if(!it->next) {
                    /* Reached the tail without finding smaller: append */
                    it->next = new_entry;
                    new_entry->next = NULL;
                    break;
                }

                last = it;
                it = it->next;
            }
        }
    }

    DBG_MSG("Alloc done\n");

    return ret;
}
/* Clear the usage bits covering one allocation. Mirror image of the
 * bit-setting logic in alloc_malloc(). Shared by alloc_free() and
 * the defragmenter (which must not touch the allocation list, only
 * the bitmap — hence this helper). */
static void alloc_release_blocks(struct AllocEntry* it) {
    size_t used_subblocks = size_to_subblock_count(it->size);
    size_t subblock = subblock_from_pointer(it->pointer);
    size_t block;
    uint8_t offset;
    block_and_offset_from_subblock(subblock, &block, &offset);

    uint8_t mask = 0;

    DBG_MSG("Free: size: %d, us: %d, sb: %d, off: %d\n", it->size, used_subblocks, block, offset);

    /* Wipe out any leading subblocks */
    int c = (used_subblocks < 8) ? used_subblocks : 8;
    for(int i = 0; i < c; ++i) {
        mask |= (1 << (7 - (offset + i)));
        used_subblocks--;
    }

    if(mask) {
        pool_header.block_usage[block++] &= ~mask;
    }

    /* Clear any full blocks in the middle of the allocation */
    while(used_subblocks > 8) {
        pool_header.block_usage[block++] = 0;
        used_subblocks -= 8;
    }

    /* Wipe out any trailing subblocks */
    mask = 0;
    for(size_t i = 0; i < used_subblocks; ++i) {
        mask |= (1 << (7 - i));
    }

    if(mask) {
        pool_header.block_usage[block++] &= ~mask;
    }
}
/* Release the allocation at `p`: clear its usage bits and unlink its
 * record from the allocation list. Unknown pointers are silently
 * ignored (the loop simply finds no match). `pool` is ignored. */
void alloc_free(void* pool, void* p) {
    (void) pool;

    struct AllocEntry* it = pool_header.allocations;
    struct AllocEntry* last = NULL;
    while(it) {
        if(it->pointer == p) {
            alloc_release_blocks(it);

            if(last) {
                last->next = it->next;
            } else {
                assert(it == pool_header.allocations);
                pool_header.allocations = it->next;
            }

            /* BUG FIX: the original DBG_MSG here referenced
             * `used_subblocks`, `block` and `offset`, which are locals
             * of alloc_release_blocks() and not in scope — a compile
             * error whenever ALLOC_DEBUG is enabled. Log what this
             * function actually knows, before freeing `it`. */
            DBG_MSG("Freed: size: %d\n", (int) it->size);

            free(it);
            break;
        }

        last = it;
        it = it->next;
    }

    DBG_MSG("Free done\n");
}
/* Defragment the pool by copying allocations to lower addresses.
 *
 * Runs up to `max_iterations` passes over the allocation list (which
 * alloc_malloc keeps largest-first, so big blocks move before small
 * ones) and stops early once a pass moves nothing. `callback` is
 * invoked as (old_address, new_address, user_data) after each move so
 * callers can retarget their pointers.
 *
 * NOTE(review): alloc_malloc() is used to claim the destination, which
 * also inserts a NEW AllocEntry for it, while `it` is retargeted to
 * the same address — the moved allocation appears to end up tracked
 * twice in the list. Also `potential_dest` from alloc_malloc() is not
 * checked for NULL before the memcpy. Both should be confirmed. */
void alloc_run_defrag(void* pool, defrag_address_move callback, int max_iterations, void* user_data) {

    for(int i = 0; i < max_iterations; ++i) {
        bool move_occurred = false;

        struct AllocEntry* it = pool_header.allocations;

        if(!it) {
            return;  /* Nothing allocated: nothing to compact */
        }

        while(it) {
            /* Where would this allocation land if requested now? */
            void* potential_dest = alloc_next_available(pool, it->size);
            if(potential_dest < it->pointer) {
                /* A lower slot exists: claim it and copy the data over */
                potential_dest = alloc_malloc(pool, it->size);
                memcpy(potential_dest, it->pointer, it->size);

                /* Mark this block as now free, but don't fiddle with the
                 * allocation list */
                alloc_release_blocks(it);

                callback(it->pointer, potential_dest, user_data);

                it->pointer = potential_dest;
                move_occurred = true;
            }

            it = it->next;
        }

        if(!move_occurred) {
            return;  /* Stable: further passes would change nothing */
        }
    }
}
/* Population count of a byte: the number of set bits, i.e. the
 * number of USED 256-byte subblocks in one usage-bitmap entry. */
static inline uint8_t count_ones(uint8_t byte) {
    uint8_t count = 0;

    /* Kernighan's trick: each step clears the lowest set bit, so the
     * loop runs once per set bit. */
    while(byte) {
        byte &= (uint8_t)(byte - 1);
        ++count;
    }

    return count;
}
/* Return the total number of free bytes in the pool.
 * `pool` is ignored: only one global pool is supported. */
size_t alloc_count_free(void* pool) {
    (void) pool;

    uint8_t* it = pool_header.block_usage;
    uint8_t* end = it + pool_header.block_count;

    size_t total_free = 0;

    while(it < end) {
        /* BUG FIX: a set bit marks a USED 256-byte subblock (see the
         * block_usage comment and alloc_malloc), so free subblocks are
         * the ZERO bits. The original summed count_ones(*it) * 256,
         * which counted used bytes, not free ones. */
        total_free += (8 - count_ones(*it)) * 256;
        ++it;
    }

    return total_free;
}
/* Return the size in bytes of the largest contiguous run of free
 * subblocks in the pool. `pool` is ignored (single global pool).
 *
 * NOTE(review): the in-byte accounting looks suspect — on hitting a
 * used bit it adds (7 - i) * 256 regardless of whether those earlier
 * bits in the byte were actually free, never counts free bits at the
 * tail of a partially-used byte, and only resets `current_block` when
 * a new maximum is found. Verify against a bitmap with mixed bytes
 * before relying on exact values. */
size_t alloc_count_continuous(void* pool) {
    (void) pool;

    size_t largest_block = 0;

    uint8_t* it = pool_header.block_usage;
    uint8_t* end = it + pool_header.block_count;
    size_t current_block = 0;
    while(it < end) {
        uint8_t t = *it++;
        if(!t) {
            /* Whole 2k block free: extend the current run */
            current_block += 2048;
        } else {
            /* MSB-first walk over this block's 8 subblock bits */
            for(int i = 7; i >= 0; --i) {
                bool bitset = (t & (1 << i));
                if(bitset) {
                    current_block += (7 - i) * 256;
                    if(largest_block < current_block) {
                        largest_block = current_block;
                        current_block = 0;
                    }
                }
            }
        }
    }

    return largest_block;
}

29
GL/alloc/alloc.h Normal file
View File

@ -0,0 +1,29 @@
#pragma once

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/* VRAM pool allocator: 2k blocks of 8 x 256-byte subblocks. See the
 * design notes at the top of GL/alloc/alloc.c. Every function takes a
 * `pool` parameter, but currently only one global pool is supported
 * and the parameter is ignored. */

/* Initialise the allocator over `pool` of `size` bytes.
 * Returns 0 on success, -1 on failure. */
int alloc_init(void* pool, size_t size);

/* Release all allocator bookkeeping (does not free `pool` itself). */
void alloc_shutdown(void* pool);

/* Allocate / release pool memory, malloc/free style. */
void *alloc_malloc(void* pool, size_t size);
void alloc_free(void* pool, void* p);

/* Callback invoked as (old_address, new_address, user_data) for each
 * allocation moved during defragmentation. */
typedef void (defrag_address_move)(void*, void*, void*);

/* Compact the pool, running at most `max_iterations` passes. */
void alloc_run_defrag(void* pool, defrag_address_move callback, int max_iterations, void* user_data);

/* Introspection: total free bytes, and largest contiguous free run. */
size_t alloc_count_free(void* pool);
size_t alloc_count_continuous(void* pool);

/* Address the next allocation of `required_size` would use, the first
 * 2k-aligned address in the pool, and the usable 2k block count. */
void* alloc_next_available(void* pool, size_t required_size);
void* alloc_base_address(void* pool);
size_t alloc_block_count(void* pool);

#ifdef __cplusplus
}
#endif

163
GL/draw.c
View File

@ -3,10 +3,36 @@
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <limits.h>
#include "private.h"
#include "platform.h"
/* Convert a 32-bit float to IEEE 754 half-float (binary16) bits.
 *
 * Pure integer bit manipulation on the float's representation: the
 * sign is moved to bit 15 and the combined exponent+mantissa field is
 * rebias-shifted from binary32 to binary16. The three selects handle,
 * in order: values too small for a half (flushed to signed zero),
 * overflow (mapped to infinity, 0x7c00), and NaN (mapped to a quiet
 * NaN, 0x7e00). */
GLushort _quantize( GLfloat v ) {
    union { GLfloat f; GLuint ui; } u = {v};
    GLuint ui = u.ui;

    int s = (ui >> 16) & 0x8000;   /* sign bit, already in half position */
    int em = ui & 0x7fffffff;      /* exponent + mantissa */

    /* Rebias the exponent (127 -> 15) and shift the mantissa from 23
     * to 10 bits; the (1 << 12) term rounds to nearest. */
    int h = (em - (112 << 23) + (1 << 12)) >> 13;
    h = (em < (113 << 23)) ? 0 : h;        /* underflow -> +/-0 */
    h = (em >= (143 << 23)) ? 0x7c00 : h;  /* overflow -> infinity */
    h = (em > (255 << 23)) ? 0x7e00 : h;   /* NaN in -> NaN out */

    return (GLushort)(s | h);
}
/* Convert IEEE 754 half-float (binary16) bits back to a 32-bit float.
 *
 * Inverse of _quantize(): rebias the exponent (15 -> 127) and widen
 * the mantissa from 10 to 23 bits. Half denormals (em < 1 << 10) are
 * flushed to zero; half Inf/NaN (em >= 31 << 10) are pushed up into
 * the float Inf/NaN exponent range. */
GLfloat _dequantize( GLushort h ) {
    GLuint s = (GLuint) (h & 0x8000) << 16;     /* sign -> float position */
    int em = h & 0x7fff;                        /* exponent + mantissa */

    int r = (em + (112 << 10)) << 13;           /* rebias and widen */
    r = (em < (1 << 10)) ? 0 : r;               /* denormals flush to zero */
    r += (em >= (31 << 10)) ? (112 << 23) : 0;  /* Inf/NaN -> float Inf/NaN */

    union { GLfloat f; GLuint ui; } u;
    u.ui = s | r;
    return u.f;
}
AttribPointerList ATTRIB_POINTERS;
GLuint ENABLED_VERTEX_ATTRIBUTES = 0;
@ -62,6 +88,7 @@ GL_FORCE_INLINE GLsizei byte_size(GLenum type) {
case GL_INT: return sizeof(GLint);
case GL_UNSIGNED_INT: return sizeof(GLuint);
case GL_DOUBLE: return sizeof(GLdouble);
case GL_HALF_FLOAT: return sizeof(GLhalf);
case GL_UNSIGNED_INT_2_10_10_10_REV: return sizeof(GLuint);
case GL_FLOAT:
default: return sizeof(GLfloat);
@ -78,7 +105,7 @@ static void _readVertexData3f3f(const GLubyte* __restrict__ in, GLubyte* __restr
// 10:10:10:2REV format
static void _readVertexData1i3f(const GLubyte* in, GLubyte* out) {
const static float MULTIPLIER = 1.0f / 1023.0f;
static const float MULTIPLIER = 1.0f / 1023.0f;
GLfloat* output = (GLfloat*) out;
@ -108,6 +135,15 @@ static void _readVertexData3us3f(const GLubyte* in, GLubyte* out) {
output[2] = input[2];
}
/* Read three GLushorts holding quantized (half-float bit pattern)
 * values and expand each into a 32-bit float via _dequantize(). */
static void _readVertexData3usq3f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    float* dst = (float*) out;

    for(int i = 0; i < 3; ++i) {
        dst[i] = _dequantize(src[i]);
    }
}
static void _readVertexData3ui3f(const GLubyte* in, GLubyte* out) {
const GLuint* input = (const GLuint*) in;
float* output = (float*) out;
@ -126,6 +162,15 @@ static void _readVertexData3ub3f(const GLubyte* input, GLubyte* out) {
output[2] = input[2] * ONE_OVER_TWO_FIVE_FIVE;
}
/* Read three GLhalf values into three 32-bit floats.
 *
 * NOTE(review): this assigns the GLhalf values directly. If GLhalf is
 * an integer type carrying half-float bits, this converts the raw bit
 * pattern numerically rather than decoding it — unlike the sibling
 * *usq* readers, which go through _dequantize(). Confirm GLhalf's
 * definition; this may need _dequantize(input[i]) instead. */
static void _readVertexData3f16_3f(const GLubyte* in, GLubyte* out) {
    const GLhalf* input = (const GLhalf*) in;
    float* output = (float*) out;

    output[0] = input[0];
    output[1] = input[1];
    output[2] = input[2];
}
static void _readVertexData2f2f(const GLubyte* in, GLubyte* out) {
vec2cpy(out, in);
}
@ -159,8 +204,25 @@ static void _readVertexData2us2f(const GLubyte* in, GLubyte* out) {
const GLushort* input = (const GLushort*) in;
float* output = (float*) out;
output[0] = input[0];
output[1] = input[1];
output[0] = (float)input[0] / SHRT_MAX;
output[1] = (float)input[1] / SHRT_MAX;
}
/* Read two quantized (half-float bit pattern) GLushorts into a
 * 3-component float output; the z component is zeroed. */
static void _readVertexData2usq3f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    float* dst = (float*) out;

    dst[2] = 0.0f;
    for(int i = 0; i < 2; ++i) {
        dst[i] = _dequantize(src[i]);
    }
}
/* Read two quantized (half-float bit pattern) GLushorts and expand
 * each into a 32-bit float via _dequantize(). */
static void _readVertexData2usq2f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    float* dst = (float*) out;

    for(int i = 0; i < 2; ++i) {
        dst[i] = _dequantize(src[i]);
    }
}
static void _readVertexData2ui2f(const GLubyte* in, GLubyte* out) {
@ -178,6 +240,14 @@ static void _readVertexData2ub2f(const GLubyte* input, GLubyte* out) {
output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE;
}
/* Read two GLhalf values into two 32-bit floats.
 *
 * NOTE(review): direct assignment — if GLhalf is an integer type
 * carrying half-float bits, this converts the raw bit pattern
 * numerically instead of decoding it (compare the *usq* readers that
 * use _dequantize()). Confirm GLhalf's definition. */
static void _readVertexData2f16_2f(const GLubyte* in, GLubyte* out) {
    const GLhalf* input = (const GLhalf*) in;
    float* output = (float*) out;

    output[0] = input[0];
    output[1] = input[1];
}
static void _readVertexData2ui3f(const GLubyte* in, GLubyte* out) {
const GLuint* input = (const GLuint*) in;
float* output = (float*) out;
@ -187,6 +257,15 @@ static void _readVertexData2ui3f(const GLubyte* in, GLubyte* out) {
output[2] = 0.0f;
}
/* Read two GLhalf values into a 3-component float output, zeroing z.
 *
 * NOTE(review): direct assignment — if GLhalf is an integer type
 * carrying half-float bits, this converts the raw bit pattern
 * numerically instead of decoding it (compare the *usq* readers that
 * use _dequantize()). Confirm GLhalf's definition. */
static void _readVertexData2f16_3f(const GLubyte* in, GLubyte* out) {
    const GLhalf* input = (const GLhalf*) in;
    float* output = (float*) out;

    output[0] = input[0];
    output[1] = input[1];
    output[2] = 0.0f;
}
static void _readVertexData4ubARGB(const GLubyte* input, GLubyte* output) {
output[R8IDX] = input[0];
output[G8IDX] = input[1];
@ -239,7 +318,7 @@ static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restri
float x, y, z;
} V;
const static V NegZ = {0.0f, 0.0f, -1.0f};
static const V NegZ = {0.0f, 0.0f, -1.0f};
*((V*) out) = NegZ;
}
@ -391,12 +470,12 @@ GL_FORCE_INLINE void transformNormalToEyeSpace(GLfloat* normal) {
}
GL_FORCE_INLINE PolyHeader *_glSubmissionTargetHeader(SubmissionTarget* target) {
gl_assert(target->header_offset < target->output->vector.size);
gl_assert(target->header_offset < aligned_vector_size(&target->output->vector));
return aligned_vector_at(&target->output->vector, target->header_offset);
}
GL_INLINE_DEBUG Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
gl_assert(target->start_offset < target->output->vector.size);
gl_assert(target->start_offset < aligned_vector_size(&target->output->vector));
return aligned_vector_at(&target->output->vector, target->start_offset);
}
@ -492,14 +571,17 @@ ReadPositionFunc calcReadPositionFunc() {
case GL_FLOAT:
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3f3f:
_readVertexData2f3f;
case GL_HALF_FLOAT:
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3f16_3f:
_readVertexData2f16_3f;
case GL_BYTE:
case GL_UNSIGNED_BYTE:
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3ub3f:
_readVertexData2ub3f;
case GL_SHORT:
case GL_UNSIGNED_SHORT:
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3us3f:
_readVertexData2us3f;
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3usq3f:
_readVertexData2usq3f;
case GL_INT:
case GL_UNSIGNED_INT:
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3ui3f:
@ -517,12 +599,14 @@ ReadUVFunc calcReadUVFunc() {
case GL_DOUBLE:
case GL_FLOAT:
return _readVertexData2f2f;
case GL_HALF_FLOAT:
return _readVertexData2f16_2f;
case GL_BYTE:
case GL_UNSIGNED_BYTE:
return _readVertexData2ub2f;
case GL_SHORT:
case GL_UNSIGNED_SHORT:
return _readVertexData2us2f;
return _readVertexData2usq2f;
case GL_INT:
case GL_UNSIGNED_INT:
return _readVertexData2ui2f;
@ -539,12 +623,14 @@ ReadUVFunc calcReadSTFunc() {
case GL_DOUBLE:
case GL_FLOAT:
return _readVertexData2f2f;
case GL_HALF_FLOAT:
return _readVertexData2f16_2f;
case GL_BYTE:
case GL_UNSIGNED_BYTE:
return _readVertexData2ub2f;
case GL_SHORT:
case GL_UNSIGNED_SHORT:
return _readVertexData2us2f;
return _readVertexData2usq2f;
case GL_INT:
case GL_UNSIGNED_INT:
return _readVertexData2ui2f;
@ -561,6 +647,8 @@ ReadNormalFunc calcReadNormalFunc() {
case GL_DOUBLE:
case GL_FLOAT:
return _readVertexData3f3f;
case GL_HALF_FLOAT:
return _readVertexData3f16_3f;
break;
case GL_BYTE:
case GL_UNSIGNED_BYTE:
@ -568,7 +656,7 @@ ReadNormalFunc calcReadNormalFunc() {
break;
case GL_SHORT:
case GL_UNSIGNED_SHORT:
return _readVertexData3us3f;
return _readVertexData3usq3f;
break;
case GL_INT:
case GL_UNSIGNED_INT:
@ -585,7 +673,6 @@ static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GL
const GLubyte* vptr = ((GLubyte*) ATTRIB_POINTERS.vertex.ptr + (first * vstride));
float pos[3];
float w = 0.0f;
ITERATE(count) {
PREFETCH(vptr + vstride);
@ -726,9 +813,7 @@ typedef struct {
} Float2;
static const Float3 F3Z = {0.0f, 0.0f, 1.0f};
static const Float3 F3ZERO = {0.0f, 0.0f, 0.0f};
static const Float2 F2ZERO = {0.0f, 0.0f};
static const uint32_t U4ONE = ~0;
static void generateElementsFastPath(
SubmissionTarget* target, const GLsizei first, const GLuint count,
@ -815,17 +900,15 @@ static void generateElementsFastPath(
#define POLYMODE QUADS
#define PROCESS_VERTEX_FLAGS(it, i) { \
if((i + 1) % 4 == 0) { \
Vertex* prev = ((it) - 1); \
Vertex t = (*prev); \
*(prev) = *((it)); \
*((it)) = t; \
prev->flags = GPU_CMD_VERTEX; \
it->flags = GPU_CMD_VERTEX; \
if(((i + 1) % 4) == 0) { \
Vertex t = *it; \
*it = *(it - 1); \
*(it - 1) = t; \
it->flags = GPU_CMD_VERTEX_EOL; \
} else { \
it->flags = GPU_CMD_VERTEX; \
} \
}
#include "draw_fastpath.inc"
#undef PROCESS_VERTEX_FLAGS
#undef POLYMODE
@ -912,24 +995,6 @@ static void transform(SubmissionTarget* target) {
TransformVertices(vertex, target->count);
}
static void mat_transform3(const float* xyz, const float* xyzOut, const uint32_t count, const uint32_t inStride, const uint32_t outStride) {
const uint8_t* dataIn = (const uint8_t*) xyz;
uint8_t* dataOut = (uint8_t*) xyzOut;
ITERATE(count) {
const float* in = (const float*) dataIn;
float* out = (float*) dataOut;
TransformVec3NoMod(
in,
out
);
dataIn += inStride;
dataOut += outStride;
}
}
static void mat_transform_normal3(const float* xyz, const float* xyzOut, const uint32_t count, const uint32_t inStride, const uint32_t outStride) {
const uint8_t* dataIn = (const uint8_t*) xyz;
uint8_t* dataOut = (uint8_t*) xyzOut;
@ -1170,6 +1235,7 @@ void _glInitSubmissionTarget() {
GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GLenum type, const GLvoid* indices) {
SubmissionTarget* const target = &SUBMISSION_TARGET;
AlignedVector* const extras = target->extras;
@ -1210,17 +1276,22 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
return;
}
GLboolean header_required = (target->output->vector.size == 0) || _glGPUStateIsDirty();
// We don't handle this any further, so just make sure we never pass it down */
gl_assert(mode != GL_POLYGON);
target->output = _glActivePolyList();
target->count = (mode == GL_TRIANGLE_FAN) ? ((count - 2) * 3) : count;
target->header_offset = target->output->vector.size;
target->start_offset = target->header_offset + (header_required);
gl_assert(target->output);
gl_assert(extras);
uint32_t vector_size = aligned_vector_size(&target->output->vector);
GLboolean header_required = (vector_size == 0) || _glGPUStateIsDirty();
target->count = (mode == GL_TRIANGLE_FAN) ? ((count - 2) * 3) : count;
target->header_offset = vector_size;
target->start_offset = target->header_offset + (header_required ? 1 : 0);
gl_assert(target->start_offset >= target->header_offset);
gl_assert(target->count);
/* Make sure we have enough room for all the "extra" data */

View File

@ -5,75 +5,123 @@
MAKE_FUNC(POLYMODE)
{
const Vertex* const start = _glSubmissionTargetStart(target);
const VertexExtra* const ve_start = aligned_vector_at(target->extras, 0);
const GLuint vstride = ATTRIB_POINTERS.vertex.stride;
GLuint uvstride = ATTRIB_POINTERS.uv.stride;
GLuint ststride = ATTRIB_POINTERS.st.stride;
GLuint dstride = ATTRIB_POINTERS.colour.stride;
GLuint nstride = ATTRIB_POINTERS.normal.stride;
const GLubyte* pos = (ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG) ? ATTRIB_POINTERS.vertex.ptr + (first * vstride) : NULL;
const GLubyte* uv = (ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) ? ATTRIB_POINTERS.uv.ptr + (first * uvstride) : NULL;
const GLubyte* col = (ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) ? ATTRIB_POINTERS.colour.ptr + (first * dstride) : NULL;
const GLubyte* st = (ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) ? ATTRIB_POINTERS.st.ptr + (first * ststride) : NULL;
const GLubyte* n = (ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) ? ATTRIB_POINTERS.normal.ptr + (first * nstride) : NULL;
const float w = 1.0f;
if(!pos) {
static const float w = 1.0f;
if(!(ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG)) {
/* If we don't have vertices, do nothing */
return;
}
if(!col) {
col = (GLubyte*) &U4ONE;
dstride = 0;
}
/* This is the best value we have. PROCESS_VERTEX_FLAGS needs to operate on quads and tris and so
this need to be divisible by 4 and 3. Even though we should be able to go much higher than this
and still be cache-local, trial and error says otherwise... */
if(!uv) {
uv = (GLubyte*) &F2ZERO;
uvstride = 0;
}
#define BATCH_SIZE 60
if(!st) {
st = (GLubyte*) &F2ZERO;
ststride = 0;
}
GLuint min = 0;
GLuint stride;
const GLubyte* ptr;
Vertex* it;
VertexExtra* ve;
if(!n) {
n = (GLubyte*) &F3Z;
nstride = 0;
}
VertexExtra* ve = (VertexExtra*) ve_start;
Vertex* it = (Vertex*) start;
for(min = 0; min < count; min += BATCH_SIZE) {
const Vertex* start = ((Vertex*) _glSubmissionTargetStart(target)) + min;
const int_fast32_t loop = ((min + BATCH_SIZE) > count) ? count - min : BATCH_SIZE;
const int offset = (first + min);
for(int_fast32_t i = 0; i < count; ++i) {
TransformVertex((const float*) pos, &w, it->xyz, &it->w);
pos += vstride;
PREFETCH(pos);
stride = ATTRIB_POINTERS.uv.stride;
ptr = (ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) ? ATTRIB_POINTERS.uv.ptr + ((first + min) * stride) : NULL;
it = (Vertex*) start;
*((Float2*) it->uv) = *((Float2*) uv);
uv += uvstride;
PREFETCH(uv);
if(ptr) {
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
PREFETCH(ptr + stride);
it->uv[0] = ((float*) ptr)[0];
it->uv[1] = ((float*) ptr)[1];
ptr += stride;
}
} else {
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
it->uv[0] = 0;
it->uv[1] = 0;
}
}
*((uint32_t*) it->bgra) = *((uint32_t*) col);
col += dstride;
PREFETCH(col);
stride = ATTRIB_POINTERS.colour.stride;
ptr = (ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) ? ATTRIB_POINTERS.colour.ptr + (offset * stride) : NULL;
it = (Vertex*) start;
*((Float2*) ve->st) = *((Float2*) st);
st += ststride;
PREFETCH(st);
if(ptr) {
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
PREFETCH(ptr + stride);
it->bgra[0] = ptr[0];
it->bgra[1] = ptr[1];
it->bgra[2] = ptr[2];
it->bgra[3] = ptr[3];
ptr += stride;
}
} else {
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
*((uint32_t*) it->bgra) = ~0;
}
}
*((Float3*) ve->nxyz) = *((Float3*) n);
n += nstride;
PREFETCH(n);
stride = ATTRIB_POINTERS.vertex.stride;
ptr = ATTRIB_POINTERS.vertex.ptr + (offset * stride);
it = (Vertex*) start;
PROCESS_VERTEX_FLAGS(it, i);
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
PREFETCH(ptr + stride);
TransformVertex((const float*) ptr, &w, it->xyz, &it->w);
PROCESS_VERTEX_FLAGS(it, min + i);
ptr += stride;
}
++it;
++ve;
start = aligned_vector_at(target->extras, min);
stride = ATTRIB_POINTERS.st.stride;
ptr = (ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) ? ATTRIB_POINTERS.st.ptr + (offset * stride) : NULL;
ve = (VertexExtra*) start;
if(ptr) {
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++ve) {
PREFETCH(ptr + stride);
ve->st[0] = ((float*) ptr)[0];
ve->st[1] = ((float*) ptr)[1];
ptr += stride;
}
} else {
for(int_fast32_t i = 0; i < loop; ++i, ++ve) {
ve->st[0] = 0;
ve->st[1] = 0;
}
}
stride = ATTRIB_POINTERS.normal.stride;
ptr = (ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) ? ATTRIB_POINTERS.normal.ptr + (offset * stride) : NULL;
ve = (VertexExtra*) start;
if(ptr) {
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++ve) {
PREFETCH(ptr + stride);
ve->nxyz[0] = ((float*) ptr)[0];
ve->nxyz[1] = ((float*) ptr)[1];
ve->nxyz[2] = ((float*) ptr)[2];
ptr += stride;
}
} else {
for(int_fast32_t i = 0; i < loop; ++i, ++ve) {
ve->nxyz[0] = 0;
ve->nxyz[1] = 0;
ve->nxyz[2] = 0;
}
}
}
}

View File

@ -46,10 +46,22 @@ void APIENTRY glKosInitConfig(GLdcConfig* config) {
config->initial_pt_capacity = 512 * 3;
config->initial_tr_capacity = 1024 * 3;
config->initial_immediate_capacity = 1024 * 3;
config->internal_palette_format = GL_RGBA8;
// RGBA4444 is the fastest general format - 8888 will cause a perf issue
config->internal_palette_format = GL_RGBA4;
config->texture_twiddle = GL_TRUE;
}
static bool _initialized = false;
void APIENTRY glKosInitEx(GLdcConfig* config) {
if(_initialized) {
return;
}
_initialized = true;
TRACE();
printf("\nWelcome to GLdc! Git revision: %s\n\n", GLDC_VERSION);
@ -70,6 +82,10 @@ void APIENTRY glKosInitEx(GLdcConfig* config) {
_glInitTextures();
if(config->texture_twiddle) {
glEnable(GL_TEXTURE_TWIDDLE_KOS);
}
OP_LIST.list_type = GPU_LIST_OP_POLY;
PT_LIST.list_type = GPU_LIST_PT_POLY;
TR_LIST.list_type = GPU_LIST_TR_POLY;
@ -83,6 +99,12 @@ void APIENTRY glKosInitEx(GLdcConfig* config) {
aligned_vector_reserve(&TR_LIST.vector, config->initial_tr_capacity);
}
void APIENTRY glKosShutdown() {
aligned_vector_clear(&OP_LIST.vector);
aligned_vector_clear(&PT_LIST.vector);
aligned_vector_clear(&TR_LIST.vector);
}
void APIENTRY glKosInit() {
GLdcConfig config;
glKosInitConfig(&config);
@ -93,21 +115,21 @@ void APIENTRY glKosSwapBuffers() {
TRACE();
SceneBegin();
if(OP_LIST.vector.size > 2) {
if(aligned_vector_header(&OP_LIST.vector)->size > 2) {
SceneListBegin(GPU_LIST_OP_POLY);
SceneListSubmit(OP_LIST.vector.data, OP_LIST.vector.size);
SceneListSubmit((Vertex*) aligned_vector_front(&OP_LIST.vector), aligned_vector_size(&OP_LIST.vector));
SceneListFinish();
}
if(PT_LIST.vector.size > 2) {
if(aligned_vector_header(&PT_LIST.vector)->size > 2) {
SceneListBegin(GPU_LIST_PT_POLY);
SceneListSubmit(PT_LIST.vector.data, PT_LIST.vector.size);
SceneListSubmit((Vertex*) aligned_vector_front(&PT_LIST.vector), aligned_vector_size(&PT_LIST.vector));
SceneListFinish();
}
if(TR_LIST.vector.size > 2) {
if(aligned_vector_header(&TR_LIST.vector)->size > 2) {
SceneListBegin(GPU_LIST_TR_POLY);
SceneListSubmit(TR_LIST.vector.data, TR_LIST.vector.size);
SceneListSubmit((Vertex*) aligned_vector_front(&TR_LIST.vector), aligned_vector_size(&TR_LIST.vector));
SceneListFinish();
}
SceneFinish();
@ -117,4 +139,4 @@ void APIENTRY glKosSwapBuffers() {
aligned_vector_clear(&TR_LIST.vector);
_glApplyScissor(true);
}
}

View File

@ -254,7 +254,7 @@ GLboolean _glGenerateMipmapTwiddled(const GLuint pvrFormat, const GLubyte* prevD
return GL_TRUE;
}
void APIENTRY glGenerateMipmapEXT(GLenum target) {
void APIENTRY glGenerateMipmap(GLenum target) {
if(target != GL_TEXTURE_2D) {
_glKosThrowError(GL_INVALID_OPERATION, __func__);
return;
@ -334,7 +334,7 @@ GLAPI GLvoid APIENTRY gluBuild2DMipmaps(GLenum target, GLint internalFormat,
unsigned byte data, and finally the data itself. */
glTexImage2D(GL_TEXTURE_2D, 0, 3, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, data);
glGenerateMipmapEXT(GL_TEXTURE_2D);
glGenerateMipmap(GL_TEXTURE_2D);
}
GLenum APIENTRY glCheckFramebufferStatusEXT(GLenum target) {

View File

@ -17,10 +17,10 @@ extern inline GLuint _glRecalcFastPath();
GLboolean IMMEDIATE_MODE_ACTIVE = GL_FALSE;
static GLenum ACTIVE_POLYGON_MODE = GL_TRIANGLES;
static GLfloat NORMAL[3] = {0.0f, 0.0f, 1.0f};
static GLubyte COLOR[4] = {255, 255, 255, 255}; /* ARGB order for speed */
static GLfloat UV_COORD[2] = {0.0f, 0.0f};
static GLfloat ST_COORD[2] = {0.0f, 0.0f};
static GLfloat __attribute__((aligned(32))) NORMAL[3] = {0.0f, 0.0f, 1.0f};
static GLubyte __attribute__((aligned(32))) COLOR[4] = {255, 255, 255, 255}; /* ARGB order for speed */
static GLfloat __attribute__((aligned(32))) UV_COORD[2] = {0.0f, 0.0f};
static GLfloat __attribute__((aligned(32))) ST_COORD[2] = {0.0f, 0.0f};
static AlignedVector VERTICES;
static AttribPointerList IM_ATTRIBS;
@ -30,7 +30,7 @@ static AttribPointerList IM_ATTRIBS;
can be applied faster */
static GLuint IM_ENABLED_VERTEX_ATTRIBUTES = 0;
typedef struct {
typedef struct __attribute__((aligned(32))) {
GLfloat x;
GLfloat y;
GLfloat z;
@ -50,7 +50,7 @@ void _glInitImmediateMode(GLuint initial_size) {
aligned_vector_init(&VERTICES, sizeof(IMVertex));
aligned_vector_reserve(&VERTICES, initial_size);
IM_ATTRIBS.vertex.ptr = VERTICES.data;
IM_ATTRIBS.vertex.ptr = aligned_vector_front(&VERTICES);
IM_ATTRIBS.vertex.size = 3;
IM_ATTRIBS.vertex.type = GL_FLOAT;
IM_ATTRIBS.vertex.stride = sizeof(IMVertex);
@ -161,31 +161,27 @@ void APIENTRY glColor3fv(const GLfloat* v) {
void APIENTRY glVertex3f(GLfloat x, GLfloat y, GLfloat z) {
IM_ENABLED_VERTEX_ATTRIBUTES |= VERTEX_ENABLED_FLAG;
unsigned int cap = VERTICES.capacity;
IMVertex* vert = aligned_vector_extend(&VERTICES, 1);
if(cap != VERTICES.capacity) {
/* Resizing could've invalidated the pointers */
IM_ATTRIBS.vertex.ptr = VERTICES.data;
IM_ATTRIBS.uv.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 3);
IM_ATTRIBS.st.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 5);
IM_ATTRIBS.colour.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 7);
IM_ATTRIBS.normal.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 7) + sizeof(uint32_t);
}
/* Resizing could've invalidated the pointers */
IM_ATTRIBS.vertex.ptr = VERTICES.data;
IM_ATTRIBS.uv.ptr = IM_ATTRIBS.vertex.ptr + 12;
IM_ATTRIBS.st.ptr = IM_ATTRIBS.uv.ptr + 8;
IM_ATTRIBS.colour.ptr = IM_ATTRIBS.st.ptr + 8;
IM_ATTRIBS.normal.ptr = IM_ATTRIBS.colour.ptr + 4;
vert->x = x;
vert->y = y;
vert->z = z;
vert->u = UV_COORD[0];
vert->v = UV_COORD[1];
vert->s = ST_COORD[0];
vert->t = ST_COORD[1];
*((uint32_t*) vert->bgra) = *((uint32_t*) COLOR);
vert->nx = NORMAL[0];
vert->ny = NORMAL[1];
vert->nz = NORMAL[2];
uint32_t* dest = (uint32_t*) &vert->x;
*(dest++) = *((uint32_t*) &x);
*(dest++) = *((uint32_t*) &y);
*(dest++) = *((uint32_t*) &z);
*(dest++) = *((uint32_t*) &UV_COORD[0]);
*(dest++) = *((uint32_t*) &UV_COORD[1]);
*(dest++) = *((uint32_t*) &ST_COORD[0]);
*(dest++) = *((uint32_t*) &ST_COORD[1]);
*(dest++) = *((uint32_t*) COLOR);
*(dest++) = *((uint32_t*) &NORMAL[0]);
*(dest++) = *((uint32_t*) &NORMAL[1]);
*(dest++) = *((uint32_t*) &NORMAL[2]);
}
void APIENTRY glVertex3fv(const GLfloat* v) {
@ -281,7 +277,7 @@ void APIENTRY glEnd() {
FAST_PATH_ENABLED = GL_TRUE;
#endif
glDrawArrays(ACTIVE_POLYGON_MODE, 0, VERTICES.size);
glDrawArrays(ACTIVE_POLYGON_MODE, 0, aligned_vector_header(&VERTICES)->size);
ATTRIB_POINTERS = stashed_attrib_pointers;

View File

@ -124,8 +124,10 @@ void APIENTRY glLightModeli(GLenum pname, const GLint param) {
void APIENTRY glLightModelfv(GLenum pname, const GLfloat *params) {
switch(pname) {
case GL_LIGHT_MODEL_AMBIENT: {
_glSetLightModelSceneAmbient(params);
_glPrecalcLightingValues(SCENE_AMBIENT_MASK);
if(memcmp(_glGetLightModelSceneAmbient(), params, sizeof(float) * 4) != 0) {
_glSetLightModelSceneAmbient(params);
_glPrecalcLightingValues(SCENE_AMBIENT_MASK);
}
} break;
case GL_LIGHT_MODEL_LOCAL_VIEWER:
_glSetLightModelViewerInEyeCoordinates((*params) ? GL_TRUE : GL_FALSE);
@ -164,18 +166,28 @@ void APIENTRY glLightfv(GLenum light, GLenum pname, const GLfloat *params) {
LightSource* l = _glLightAt(idx);
GLboolean rebuild = GL_FALSE;
switch(pname) {
case GL_AMBIENT:
memcpy(l->ambient, params, sizeof(GLfloat) * 4);
rebuild = memcmp(l->ambient, params, sizeof(GLfloat) * 4) != 0;
if(rebuild) {
memcpy(l->ambient, params, sizeof(GLfloat) * 4);
}
break;
case GL_DIFFUSE:
memcpy(l->diffuse, params, sizeof(GLfloat) * 4);
rebuild = memcmp(l->diffuse, params, sizeof(GLfloat) * 4) != 0;
if(rebuild) {
memcpy(l->diffuse, params, sizeof(GLfloat) * 4);
}
break;
case GL_SPECULAR:
memcpy(l->specular, params, sizeof(GLfloat) * 4);
rebuild = memcmp(l->specular, params, sizeof(GLfloat) * 4) != 0;
if(rebuild) {
memcpy(l->specular, params, sizeof(GLfloat) * 4);
}
break;
case GL_POSITION: {
_glMatrixLoadModelView();
memcpy(l->position, params, sizeof(GLfloat) * 4);
l->isDirectional = params[3] == 0.0f;
@ -183,6 +195,7 @@ void APIENTRY glLightfv(GLenum light, GLenum pname, const GLfloat *params) {
if(l->isDirectional) {
//FIXME: Do we need to rotate directional lights?
} else {
_glMatrixLoadModelView();
TransformVec3(l->position);
}
}
@ -204,7 +217,10 @@ void APIENTRY glLightfv(GLenum light, GLenum pname, const GLfloat *params) {
return;
}
_glPrecalcLightingValues(mask);
if(rebuild) {
_glPrecalcLightingValues(mask);
}
}
void APIENTRY glLightf(GLenum light, GLenum pname, GLfloat param) {
@ -258,25 +274,47 @@ void APIENTRY glMaterialfv(GLenum face, GLenum pname, const GLfloat *params) {
Material* material = _glActiveMaterial();
GLboolean rebuild = GL_FALSE;
switch(pname) {
case GL_SHININESS:
glMaterialf(face, pname, *params);
rebuild = GL_TRUE;
break;
case GL_AMBIENT:
vec4cpy(material->ambient, params);
break;
case GL_AMBIENT: {
if(memcmp(material->ambient, params, sizeof(float) * 4) != 0) {
vec4cpy(material->ambient, params);
rebuild = GL_TRUE;
}
} break;
case GL_DIFFUSE:
vec4cpy(material->diffuse, params);
if(memcmp(material->diffuse, params, sizeof(float) * 4) != 0) {
vec4cpy(material->diffuse, params);
rebuild = GL_TRUE;
}
break;
case GL_SPECULAR:
vec4cpy(material->specular, params);
if(memcmp(material->specular, params, sizeof(float) * 4) != 0) {
vec4cpy(material->specular, params);
rebuild = GL_TRUE;
}
break;
case GL_EMISSION:
vec4cpy(material->emissive, params);
if(memcmp(material->emissive, params, sizeof(float) * 4) != 0) {
vec4cpy(material->emissive, params);
rebuild = GL_TRUE;
}
break;
case GL_AMBIENT_AND_DIFFUSE: {
vec4cpy(material->ambient, params);
vec4cpy(material->diffuse, params);
rebuild = (
memcmp(material->ambient, params, sizeof(float) * 4) != 0 ||
memcmp(material->diffuse, params, sizeof(float) * 4) != 0
);
if(rebuild) {
vec4cpy(material->ambient, params);
vec4cpy(material->diffuse, params);
}
} break;
case GL_COLOR_INDEXES:
default: {
@ -285,13 +323,15 @@ void APIENTRY glMaterialfv(GLenum face, GLenum pname, const GLfloat *params) {
}
}
GLuint updateMask = (pname == GL_AMBIENT) ? AMBIENT_MASK:
(pname == GL_DIFFUSE) ? DIFFUSE_MASK:
(pname == GL_SPECULAR) ? SPECULAR_MASK:
(pname == GL_EMISSION) ? EMISSION_MASK:
(pname == GL_AMBIENT_AND_DIFFUSE) ? AMBIENT_MASK | DIFFUSE_MASK : 0;
if(rebuild) {
GLuint updateMask = (pname == GL_AMBIENT) ? AMBIENT_MASK:
(pname == GL_DIFFUSE) ? DIFFUSE_MASK:
(pname == GL_SPECULAR) ? SPECULAR_MASK:
(pname == GL_EMISSION) ? EMISSION_MASK:
(pname == GL_AMBIENT_AND_DIFFUSE) ? AMBIENT_MASK | DIFFUSE_MASK : 0;
_glPrecalcLightingValues(updateMask);
_glPrecalcLightingValues(updateMask);
}
}
void APIENTRY glColorMaterial(GLenum face, GLenum mode) {

View File

@ -13,8 +13,8 @@
GLfloat DEPTH_RANGE_MULTIPLIER_L = (1 - 0) / 2;
GLfloat DEPTH_RANGE_MULTIPLIER_H = (0 + 1) / 2;
static Stack MATRIX_STACKS[3]; // modelview, projection, texture
static Matrix4x4 NORMAL_MATRIX __attribute__((aligned(32)));
static Stack __attribute__((aligned(32))) MATRIX_STACKS[4]; // modelview, projection, texture
static Matrix4x4 __attribute__((aligned(32))) NORMAL_MATRIX;
Viewport VIEWPORT = {
0, 0, 640, 480, 320.0f, 240.0f, 320.0f, 240.0f
@ -23,7 +23,7 @@ Viewport VIEWPORT = {
static GLenum MATRIX_MODE = GL_MODELVIEW;
static GLubyte MATRIX_IDX = 0;
static const Matrix4x4 IDENTITY = {
static const Matrix4x4 __attribute__((aligned(32))) IDENTITY = {
1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f,
@ -106,7 +106,11 @@ void APIENTRY glMatrixMode(GLenum mode) {
}
void APIENTRY glPushMatrix() {
stack_push(MATRIX_STACKS + MATRIX_IDX, stack_top(MATRIX_STACKS + MATRIX_IDX));
void* top = stack_top(MATRIX_STACKS + MATRIX_IDX);
assert(top);
void* ret = stack_push(MATRIX_STACKS + MATRIX_IDX, top);
(void) ret;
assert(ret);
}
void APIENTRY glPopMatrix() {
@ -127,10 +131,16 @@ void APIENTRY glTranslatef(GLfloat x, GLfloat y, GLfloat z) {
0.0f, 0.0f, 1.0f, 0.0f,
x, y, z, 1.0f
};
void* top = stack_top(MATRIX_STACKS + MATRIX_IDX);
assert(top);
UploadMatrix4x4(stack_top(MATRIX_STACKS + MATRIX_IDX));
UploadMatrix4x4(top);
MultiplyMatrix4x4(&trn);
DownloadMatrix4x4(stack_top(MATRIX_STACKS + MATRIX_IDX));
top = stack_top(MATRIX_STACKS + MATRIX_IDX);
assert(top);
DownloadMatrix4x4(top);
if(MATRIX_MODE == GL_MODELVIEW) {
recalculateNormalMatrix();
@ -200,28 +210,9 @@ void APIENTRY glRotatef(GLfloat angle, GLfloat x, GLfloat y, GLfloat z) {
/* Load an arbitrary matrix */
void APIENTRY glLoadMatrixf(const GLfloat *m) {
static Matrix4x4 TEMP;
TEMP[M0] = m[0];
TEMP[M1] = m[1];
TEMP[M2] = m[2];
TEMP[M3] = m[3];
TEMP[M4] = m[4];
TEMP[M5] = m[5];
TEMP[M6] = m[6];
TEMP[M7] = m[7];
TEMP[M8] = m[8];
TEMP[M9] = m[9];
TEMP[M10] = m[10];
TEMP[M11] = m[11];
TEMP[M12] = m[12];
TEMP[M13] = m[13];
TEMP[M14] = m[14];
TEMP[M15] = m[15];
static Matrix4x4 __attribute__((aligned(32))) TEMP;
memcpy(TEMP, m, sizeof(float) * 16);
stack_replace(MATRIX_STACKS + MATRIX_IDX, TEMP);
if(MATRIX_MODE == GL_MODELVIEW) {
@ -289,18 +280,10 @@ void APIENTRY glFrustum(GLfloat left, GLfloat right,
/* Multiply the current matrix by an arbitrary matrix */
void glMultMatrixf(const GLfloat *m) {
Matrix4x4 TEMP __attribute__((aligned(32)));
const Matrix4x4 *pMatrix;
if (((GLint)m)&0xf){ /* Unaligned matrix */
pMatrix = &TEMP;
MEMCPY4(TEMP, m, sizeof(Matrix4x4));
}
else{
pMatrix = (const Matrix4x4*) m;
}
MEMCPY4(TEMP, m, sizeof(Matrix4x4));
UploadMatrix4x4(stack_top(MATRIX_STACKS + MATRIX_IDX));
MultiplyMatrix4x4(pMatrix);
MultiplyMatrix4x4(&TEMP);
DownloadMatrix4x4(stack_top(MATRIX_STACKS + MATRIX_IDX));
if(MATRIX_MODE == GL_MODELVIEW) {
@ -426,7 +409,7 @@ GL_FORCE_INLINE void vec3f_normalize_sh4(float *v){
void gluLookAt(GLfloat eyex, GLfloat eyey, GLfloat eyez, GLfloat centerx,
GLfloat centery, GLfloat centerz, GLfloat upx, GLfloat upy,
GLfloat upz) {
GLfloat m [16];
GLfloat m [16] __attribute__((aligned(32)));
GLfloat f [3];
GLfloat u [3];
GLfloat s [3];

View File

@ -5,6 +5,7 @@
#include <stdbool.h>
#include "gl_assert.h"
#include "types.h"
#define MEMSET(dst, v, size) memset((dst), (v), (size))
@ -260,7 +261,7 @@ typedef float Matrix4x4[16];
void SceneBegin();
void SceneListBegin(GPUList list);
void SceneListSubmit(void* src, int n);
void SceneListSubmit(Vertex* v2, int n);
void SceneListFinish();
void SceneFinish();

View File

@ -9,9 +9,7 @@
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#define SQ_BASE_ADDRESS 0xe0000000
static volatile uint32_t* PVR_LMMODE0 = (uint32_t*) 0xA05F6884;
#define SQ_BASE_ADDRESS (void*) 0xe0000000
GL_FORCE_INLINE bool glIsVertex(const float flags) {
@ -33,14 +31,23 @@ void InitGPU(_Bool autosort, _Bool fsaa) {
};
pvr_init(&params);
/* If we're PAL and we're NOT VGA, then use 50hz by default. This is the safest
thing to do. If someone wants to force 60hz then they can call vid_set_mode later and hopefully
that'll work... */
int cable = vid_check_cable();
int region = flashrom_get_region();
if(region == FLASHROM_REGION_EUROPE && cable != CT_VGA) {
printf("PAL region without VGA - enabling 50hz");
vid_set_mode(DM_640x480_PAL_IL, PM_RGB565);
}
}
void SceneBegin() {
pvr_wait_ready();
pvr_scene_begin();
QACR0 = 0x11; /* Enable the direct texture path by setting the higher two bits */
QACR1 = 0x11;
}
void SceneListBegin(GPUList list) {
@ -52,380 +59,399 @@ GL_FORCE_INLINE float _glFastInvert(float x) {
}
GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
TRACE();
const float f = _glFastInvert(vertex->w);
/* Convert to NDC and apply viewport */
vertex->xyz[0] = __builtin_fmaf(
VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth
);
vertex->xyz[1] = h - __builtin_fmaf(
VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight
);
vertex->xyz[0] = (vertex->xyz[0] * f * 320) + 320;
vertex->xyz[1] = (vertex->xyz[1] * f * -240) + 240;
/* Orthographic projections need to use invZ otherwise we lose
the depth information. As w == 1, and clip-space range is -w to +w
we add 1.0 to the Z to bring it into range. We add a little extra to
avoid a divide by zero.
*/
vertex->xyz[2] = (vertex->w == 1.0f) ? _glFastInvert(1.0001f + vertex->xyz[2]) : f;
if(vertex->w == 1.0f) {
vertex->xyz[2] = _glFastInvert(1.0001f + vertex->xyz[2]);
} else {
vertex->xyz[2] = f;
}
}
GL_FORCE_INLINE void _glSubmitHeaderOrVertex(uint32_t* d, const Vertex* v) {
#ifndef NDEBUG
gl_assert(!isnan(v->xyz[2]));
gl_assert(!isnan(v->w));
#endif
#if CLIP_DEBUG
printf("Submitting: %x (%x)\n", v, v->flags);
#endif
volatile uint32_t *sq = SQ_BASE_ADDRESS;
uint32_t *s = (uint32_t*) v;
__asm__("pref @%0" : : "r"(s + 8)); /* prefetch 32 bytes for next loop */
d[0] = *(s++);
d[1] = *(s++);
d[2] = *(s++);
d[3] = *(s++);
d[4] = *(s++);
d[5] = *(s++);
d[6] = *(s++);
d[7] = *(s++);
__asm__("pref @%0" : : "r"(d));
d += 8;
static inline void _glFlushBuffer() {
TRACE();
/* Wait for both store queues to complete */
sq = (uint32_t*) 0xe0000000;
sq[0] = sq[8] = 0;
}
static struct __attribute__((aligned(32))) {
Vertex* v;
int visible;
} triangle[3];
static inline void _glPushHeaderOrVertex(Vertex* v) {
TRACE();
static int tri_count = 0;
static int strip_count = 0;
GL_FORCE_INLINE void interpolateColour(const uint32_t* a, const uint32_t* b, const float t, uint32_t* out) {
const static uint32_t MASK1 = 0x00FF00FF;
const static uint32_t MASK2 = 0xFF00FF00;
const uint32_t f2 = 256 * t;
const uint32_t f1 = 256 - f2;
*out = (((((*a & MASK1) * f1) + ((*b & MASK1) * f2)) >> 8) & MASK1) |
(((((*a & MASK2) * f1) + ((*b & MASK2) * f2)) >> 8) & MASK2);
uint32_t* s = (uint32_t*) v;
sq[0] = *(s++);
sq[1] = *(s++);
sq[2] = *(s++);
sq[3] = *(s++);
sq[4] = *(s++);
sq[5] = *(s++);
sq[6] = *(s++);
sq[7] = *(s++);
__asm__("pref @%0" : : "r"(sq));
sq += 8;
}
static inline void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) {
/* Clipping time! */
static inline void _glClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout) {
const static float o = 0.003921569f; // 1 / 255
const float d0 = v1->w + v1->xyz[2];
const float d1 = v2->w + v2->xyz[2];
const float sign = ((2.0f * (d1 < d0)) - 1.0f);
const float epsilon = -0.00001f * sign;
const float n = (d0 - d1);
const float r = (1.f / sqrtf(n * n)) * sign;
float t = fmaf(r, d0, epsilon);
const float t = (fabs(d0) * (1.0f / sqrtf((d1 - d0) * (d1 - d0)))) + 0.000001f;
const float invt = 1.0f - t;
vout->xyz[0] = fmaf(v2->xyz[0] - v1->xyz[0], t, v1->xyz[0]);
vout->xyz[1] = fmaf(v2->xyz[1] - v1->xyz[1], t, v1->xyz[1]);
vout->xyz[2] = fmaf(v2->xyz[2] - v1->xyz[2], t, v1->xyz[2]);
vout->w = fmaf(v2->w - v1->w, t, v1->w);
vout->xyz[0] = invt * v1->xyz[0] + t * v2->xyz[0];
vout->xyz[1] = invt * v1->xyz[1] + t * v2->xyz[1];
vout->xyz[2] = invt * v1->xyz[2] + t * v2->xyz[2];
vout->uv[0] = fmaf(v2->uv[0] - v1->uv[0], t, v1->uv[0]);
vout->uv[1] = fmaf(v2->uv[1] - v1->uv[1], t, v1->uv[1]);
vout->uv[0] = invt * v1->uv[0] + t * v2->uv[0];
vout->uv[1] = invt * v1->uv[1] + t * v2->uv[1];
interpolateColour((uint32_t*) v1->bgra, (uint32_t*) v2->bgra, t, (uint32_t*) vout->bgra);
}
vout->w = invt * v1->w + t * v2->w;
GL_FORCE_INLINE void ClearTriangle() {
tri_count = 0;
}
const float m = 255 * t;
const float n = 255 - m;
GL_FORCE_INLINE void ShiftTriangle() {
if(!tri_count) {
return;
}
tri_count--;
triangle[0] = triangle[1];
triangle[1] = triangle[2];
#ifndef NDEBUG
triangle[2].v = NULL;
triangle[2].visible = false;
#endif
}
GL_FORCE_INLINE void ShiftRotateTriangle() {
if(!tri_count) {
return;
}
if(triangle[0].v < triangle[1].v) {
triangle[0] = triangle[2];
} else {
triangle[1] = triangle[2];
}
tri_count--;
vout->bgra[0] = (v1->bgra[0] * n + v2->bgra[0] * m) * o;
vout->bgra[1] = (v1->bgra[1] * n + v2->bgra[1] * m) * o;
vout->bgra[2] = (v1->bgra[2] * n + v2->bgra[2] * m) * o;
vout->bgra[3] = (v1->bgra[3] * n + v2->bgra[3] * m) * o;
}
#define SPAN_SORT_CFG 0x005F8030
static volatile uint32_t* PVR_LMMODE0 = (uint32_t*) 0xA05F6884;
static volatile uint32_t *PVR_LMMODE1 = (uint32_t*) 0xA05F6888;
static volatile uint32_t *QACR = (uint32_t*) 0xFF000038;
void SceneListSubmit(Vertex* v2, int n) {
TRACE();
/* You need at least a header, and 3 vertices to render anything */
if(n < 4) {
return;
}
void SceneListSubmit(void* src, int n) {
const float h = GetVideoMode()->height;
PVR_SET(SPAN_SORT_CFG, 0x0);
uint32_t *d = (uint32_t*) SQ_BASE_ADDRESS;
*PVR_LMMODE0 = 0x0; /* Enable 64bit mode */
//Set PVR DMA registers
*PVR_LMMODE0 = 0;
*PVR_LMMODE1 = 0;
Vertex __attribute__((aligned(32))) tmp;
/* Perform perspective divide on each vertex */
Vertex* vertex = (Vertex*) src;
if(!_glNearZClippingEnabled()) {
/* Prep store queues */
for(int i = 0; i < n; ++i, ++vertex) {
PREFETCH(vertex + 1);
if(glIsVertex(vertex->flags)) {
_glPerspectiveDivideVertex(vertex, h);
}
_glSubmitHeaderOrVertex(d, vertex);
}
/* Wait for both store queues to complete */
d = (uint32_t *) SQ_BASE_ADDRESS;
d[0] = d[8] = 0;
return;
}
tri_count = 0;
strip_count = 0;
//Set QACR registers
QACR[1] = QACR[0] = 0x11;
#if CLIP_DEBUG
printf("----\n");
Vertex* vertex = (Vertex*) src;
for(int i = 0; i < n; ++i) {
fprintf(stderr, "{%f, %f, %f, %f}, // %x (%x)\n", vertex[i].xyz[0], vertex[i].xyz[1], vertex[i].xyz[2], vertex[i].w, vertex[i].flags, &vertex[i]);
}
fprintf(stderr, "----\n");
#endif
uint8_t visible_mask = 0;
uint8_t counter = 0;
for(int i = 0; i < n; ++i, ++vertex) {
PREFETCH(vertex + 12);
sq = SQ_BASE_ADDRESS;
/* Wait until we fill the triangle */
if(tri_count < 3) {
if(glIsVertex(vertex->flags)) {
++strip_count;
triangle[tri_count].v = vertex;
triangle[tri_count].visible = vertex->xyz[2] >= -vertex->w;
if(++tri_count < 3) {
continue;
}
} else {
/* We hit a header */
tri_count = 0;
strip_count = 0;
_glSubmitHeaderOrVertex(d, vertex);
for(int i = 0; i < n; ++i, ++v2) {
PREFETCH(v2 + 1);
switch(v2->flags) {
case GPU_CMD_VERTEX_EOL:
if(counter < 2) {
continue;
}
}
counter = 0;
break;
case GPU_CMD_VERTEX:
++counter;
if(counter < 3) {
continue;
}
break;
default:
_glPushHeaderOrVertex(v2);
counter = 0;
continue;
};
#if CLIP_DEBUG
printf("SC: %d\n", strip_count);
#endif
Vertex* const v0 = v2 - 2;
Vertex* const v1 = v2 - 1;
/* If we got here, then triangle contains 3 vertices */
int visible_mask = triangle[0].visible | (triangle[1].visible << 1) | (triangle[2].visible << 2);
/* Clipping time!
There are 6 distinct possibilities when clipping a triangle. 3 of them result
in another triangle, 3 of them result in a quadrilateral.
Assuming you iterate the edges of the triangle in order, and create a new *visible*
vertex when you cross the plane, and discard vertices behind the plane, then the only
difference between the two cases is that the final two vertices that need submitting have
to be reversed.
Unfortunately we have to copy vertices here, because if we persp-divide a vertex it may
be used in a subsequent triangle in the strip and would end up being double divided.
*/
#define SUBMIT_QUEUED() \
if(strip_count > 3) { \
tmp = *(vertex - 2); \
/* If we had triangles ahead of this one, submit and finalize */ \
_glPerspectiveDivideVertex(&tmp, h); \
_glSubmitHeaderOrVertex(d, &tmp); \
tmp = *(vertex - 1); \
tmp.flags = GPU_CMD_VERTEX_EOL; \
_glPerspectiveDivideVertex(&tmp, h); \
_glSubmitHeaderOrVertex(d, &tmp); \
}
bool is_last_in_strip = glIsLastVertex(vertex->flags);
visible_mask = (
(v0->xyz[2] > -v0->w) << 0 |
(v1->xyz[2] > -v1->w) << 1 |
(v2->xyz[2] > -v2->w) << 2 |
(counter == 0) << 3
);
switch(visible_mask) {
case 1: {
SUBMIT_QUEUED();
/* 0, 0a, 2a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
case 15: /* All visible, but final vertex in strip */
{
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glPerspectiveDivideVertex(v1, h);
_glPushHeaderOrVertex(v1);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 2: {
SUBMIT_QUEUED();
/* 0a, 1, 1a */
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glPerspectiveDivideVertex(v2, h);
_glPushHeaderOrVertex(v2);
}
break;
case 7:
/* All visible, push the first vertex and move on */
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
break;
case 9:
/* First vertex was visible, last in strip */
{
Vertex __attribute__((aligned(32))) scratch[2];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 3: {
SUBMIT_QUEUED();
/* 0, 1, 2a, 1a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX_EOL;
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 4: {
SUBMIT_QUEUED();
/* 1a, 2, 2a */
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
}
break;
case 1:
/* First vertex was visible, but not last in strip */
{
Vertex __attribute__((aligned(32))) scratch[2];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 5: {
SUBMIT_QUEUED();
/* 0, 0a, 2, 1a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 6: {
SUBMIT_QUEUED();
/* 0a, 1, 2a, 2 */
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(b);
}
break;
case 10:
case 2:
/* Second vertex was visible. In self case we need to create a triangle and produce
two new vertices: 1-2, and 2-3. */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
memcpy_vertex(c, v1);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 7: {
/* All the vertices are visible! We divide and submit v0, then shift */
_glPerspectiveDivideVertex(vertex - 2, h);
_glSubmitHeaderOrVertex(d, vertex - 2);
_glClipEdge(v1, v2, b);
b->flags = v2->flags;
if(is_last_in_strip) {
_glPerspectiveDivideVertex(vertex - 1, h);
_glSubmitHeaderOrVertex(d, vertex - 1);
_glPerspectiveDivideVertex(vertex, h);
_glSubmitHeaderOrVertex(d, vertex);
tri_count = 0;
strip_count = 0;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
}
break;
case 11:
case 3: /* First and second vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glClipEdge(v1, v2, a);
a->flags = v2->flags;
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(c);
_glPushHeaderOrVertex(a);
}
break;
case 12:
case 4:
/* Third vertex was visible. */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
_glClipEdge(v2, v0, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
if(counter % 2 == 1) {
_glPushHeaderOrVertex(a);
}
ShiftRotateTriangle();
continue;
} break;
case 0:
default:
break;
}
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
/* If this was the last in the strip, we don't need to
submit anything else, we just wipe the tri_count */
if(is_last_in_strip) {
tri_count = 0;
strip_count = 0;
} else {
ShiftRotateTriangle();
strip_count = 2;
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
}
break;
case 13:
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
c->flags = GPU_CMD_VERTEX;
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
c->flags = GPU_CMD_VERTEX_EOL;
_glPushHeaderOrVertex(c);
}
break;
case 5: /* First and third vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
c->flags = GPU_CMD_VERTEX;
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(c);
}
break;
case 14:
case 6: /* Second and third vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[4];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
Vertex* d = &scratch[3];
memcpy_vertex(c, v1);
memcpy_vertex(d, v2);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(d, h);
_glPushHeaderOrVertex(d);
}
break;
case 8:
default:
break;
}
}
/* Wait for both store queues to complete */
d = (uint32_t *)0xe0000000;
d[0] = d[8] = 0;
_glFlushBuffer();
}
void SceneListFinish() {

View File

@ -24,7 +24,7 @@
#define GL_FORCE_INLINE static GL_INLINE_DEBUG
#endif
#define PREFETCH(addr) __asm__("pref @%0" : : "r"((addr)))
#define PREFETCH(addr) __builtin_prefetch((addr))
GL_FORCE_INLINE void* memcpy_fast(void *dest, const void *src, size_t len) {
if(!len) {

View File

@ -10,8 +10,9 @@
#include "software/parameter_equation.h"
#define CLIP_DEBUG 0
#define ZNEAR_CLIPPING_ENABLED 1
static size_t AVAILABLE_VRAM = 16 * 1024 * 1024;
static size_t AVAILABLE_VRAM = 8 * 1024 * 1024;
static Matrix4x4 MATRIX;
static SDL_Window* WINDOW = NULL;
@ -29,83 +30,13 @@ static VideoMode vid_mode = {
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))
static void DrawTriangle(Vertex* v0, Vertex* v1, Vertex* v2) {
// Compute triangle bounding box.
int minX = MIN(MIN(v0->xyz[0], v1->xyz[0]), v2->xyz[0]);
int maxX = MAX(MAX(v0->xyz[0], v1->xyz[0]), v2->xyz[0]);
int minY = MIN(MIN(v0->xyz[1], v1->xyz[1]), v2->xyz[1]);
int maxY = MAX(MAX(v0->xyz[1], v1->xyz[1]), v2->xyz[1]);
// Clip to scissor rect.
minX = MAX(minX, 0);
maxX = MIN(maxX, vid_mode.width);
minY = MAX(minY, 0);
maxY = MIN(maxY, vid_mode.height);
// Compute edge equations.
EdgeEquation e0, e1, e2;
EdgeEquationInit(&e0, &v0->xyz[0], &v1->xyz[0]);
EdgeEquationInit(&e1, &v1->xyz[0], &v2->xyz[0]);
EdgeEquationInit(&e2, &v2->xyz[0], &v0->xyz[0]);
float area = 0.5 * (e0.c + e1.c + e2.c);
/* This is very ugly. I don't understand the math properly
* so I just swap the vertex order if something is back-facing
* and we want to render it. Patches welcome! */
#define REVERSE_WINDING() \
Vertex* tv = v0; \
v0 = v1; \
v1 = tv; \
EdgeEquationInit(&e0, &v0->xyz[0], &v1->xyz[0]); \
EdgeEquationInit(&e1, &v1->xyz[0], &v2->xyz[0]); \
EdgeEquationInit(&e2, &v2->xyz[0], &v0->xyz[0]); \
area = 0.5f * (e0.c + e1.c + e2.c) \
// Check if triangle is backfacing.
if(CULL_MODE == GPU_CULLING_CCW) {
if(area < 0) {
return;
}
} else if(CULL_MODE == GPU_CULLING_CW) {
if(area < 0) {
// We only draw front-facing polygons, so swap
// the back to front and draw
REVERSE_WINDING();
} else {
// Front facing, so bail
return;
}
} else if(area < 0) {
/* We're not culling, but this is backfacing, so swap vertices and edges */
REVERSE_WINDING();
}
ParameterEquation r, g, b;
ParameterEquationInit(&r, v0->bgra[2], v1->bgra[2], v2->bgra[2], &e0, &e1, &e2, area);
ParameterEquationInit(&g, v0->bgra[1], v1->bgra[1], v2->bgra[1], &e0, &e1, &e2, area);
ParameterEquationInit(&b, v0->bgra[0], v1->bgra[0], v2->bgra[0], &e0, &e1, &e2, area);
// Add 0.5 to sample at pixel centers.
for (float x = minX + 0.5f, xm = maxX + 0.5f; x <= xm; x += 1.0f)
for (float y = minY + 0.5f, ym = maxY + 0.5f; y <= ym; y += 1.0f)
{
if (EdgeEquationTestPoint(&e0, x, y) && EdgeEquationTestPoint(&e1, x, y) && EdgeEquationTestPoint(&e2, x, y)) {
int rint = ParameterEquationEvaluate(&r, x, y);
int gint = ParameterEquationEvaluate(&g, x, y);
int bint = ParameterEquationEvaluate(&b, x, y);
SDL_SetRenderDrawColor(RENDERER, rint, gint, bint, 255);
SDL_RenderDrawPoint(RENDERER, x, y);
}
}
}
AlignedVector vbuffer;
void InitGPU(_Bool autosort, _Bool fsaa) {
// 32-bit SDL has trouble with the wayland driver for some reason
setenv("SDL_VIDEODRIVER", "x11", 1);
SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS);
WINDOW = SDL_CreateWindow(
@ -119,6 +50,8 @@ void InitGPU(_Bool autosort, _Bool fsaa) {
RENDERER = SDL_CreateRenderer(
WINDOW, -1, SDL_RENDERER_ACCELERATED
);
aligned_vector_init(&vbuffer, sizeof(SDL_Vertex));
}
void SceneBegin() {
@ -161,7 +94,7 @@ GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
}
}
GL_FORCE_INLINE void _glSubmitHeaderOrVertex(const Vertex* v) {
GL_FORCE_INLINE void _glPushHeaderOrVertex(const Vertex* v) {
#ifndef NDEBUG
if(glIsVertex(v->flags)) {
gl_assert(!isnan(v->xyz[2]));
@ -176,335 +109,329 @@ GL_FORCE_INLINE void _glSubmitHeaderOrVertex(const Vertex* v) {
BUFFER[vertex_counter++] = *v;
}
static struct {
Vertex* v;
int visible;
} triangle[3];
static inline void _glFlushBuffer() {}
static int tri_count = 0;
static int strip_count = 0;
GL_FORCE_INLINE void interpolateColour(const uint8_t* v1, const uint8_t* v2, const float t, uint8_t* out) {
const int MASK1 = 0x00FF00FF;
const int MASK2 = 0xFF00FF00;
const int f2 = 256 * t;
const int f1 = 256 - f2;
const uint32_t a = *(uint32_t*) v1;
const uint32_t b = *(uint32_t*) v2;
*((uint32_t*) out) = (((((a & MASK1) * f1) + ((b & MASK1) * f2)) >> 8) & MASK1) |
(((((a & MASK2) * f1) + ((b & MASK2) * f2)) >> 8) & MASK2);
}
GL_FORCE_INLINE void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) {
/* Clipping time! */
const static float o = 0.003921569f; // 1 / 255
const float d0 = v1->w + v1->xyz[2];
const float d1 = v2->w + v2->xyz[2];
const float t = (fabs(d0) * (1.0f / sqrtf((d1 - d0) * (d1 - d0)))) + 0.000001f;
const float invt = 1.0f - t;
const float epsilon = (d0 < d1) ? -0.00001f : 0.00001f;
vout->xyz[0] = invt * v1->xyz[0] + t * v2->xyz[0];
vout->xyz[1] = invt * v1->xyz[1] + t * v2->xyz[1];
vout->xyz[2] = invt * v1->xyz[2] + t * v2->xyz[2];
float t = (d0 / (d0 - d1)) + epsilon;
vout->uv[0] = invt * v1->uv[0] + t * v2->uv[0];
vout->uv[1] = invt * v1->uv[1] + t * v2->uv[1];
t = (t > 1.0f) ? 1.0f : t;
t = (t < 0.0f) ? 0.0f : t;
vout->w = invt * v1->w + t * v2->w;
vout->xyz[0] = __builtin_fmaf(v2->xyz[0] - v1->xyz[0], t, v1->xyz[0]);
vout->xyz[1] = __builtin_fmaf(v2->xyz[1] - v1->xyz[1], t, v1->xyz[1]);
vout->xyz[2] = __builtin_fmaf(v2->xyz[2] - v1->xyz[2], t, v1->xyz[2]);
vout->w = __builtin_fmaf(v2->w - v1->w, t, v1->w);
const float m = 255 * t;
const float n = 255 - m;
vout->uv[0] = __builtin_fmaf(v2->uv[0] - v1->uv[0], t, v1->uv[0]);
vout->uv[1] = __builtin_fmaf(v2->uv[1] - v1->uv[1], t, v1->uv[1]);
interpolateColour(v1->bgra, v2->bgra, t, vout->bgra);
vout->bgra[0] = (v1->bgra[0] * n + v2->bgra[0] * m) * o;
vout->bgra[1] = (v1->bgra[1] * n + v2->bgra[1] * m) * o;
vout->bgra[2] = (v1->bgra[2] * n + v2->bgra[2] * m) * o;
vout->bgra[3] = (v1->bgra[3] * n + v2->bgra[3] * m) * o;
}
GL_FORCE_INLINE void ClearTriangle() {
tri_count = 0;
}
GL_FORCE_INLINE void ShiftTriangle() {
if(!tri_count) {
void SceneListSubmit(Vertex* v2, int n) {
/* You need at least a header, and 3 vertices to render anything */
if(n < 4) {
return;
}
tri_count--;
triangle[0] = triangle[1];
triangle[1] = triangle[2];
#ifndef NDEBUG
triangle[2].v = NULL;
triangle[2].visible = false;
#endif
}
GL_FORCE_INLINE void ShiftRotateTriangle() {
if(!tri_count) {
return;
}
if(triangle[0].v < triangle[1].v) {
triangle[0] = triangle[2];
} else {
triangle[1] = triangle[2];
}
tri_count--;
}
void SceneListSubmit(void* src, int n) {
/* Perform perspective divide on each vertex */
Vertex* vertex = (Vertex*) src;
const float h = GetVideoMode()->height;
/* If Z-clipping is disabled, just fire everything over to the buffer */
if(!ZNEAR_CLIPPING_ENABLED) {
for(int i = 0; i < n; ++i, ++vertex) {
PREFETCH(vertex + 1);
if(glIsVertex(vertex->flags)) {
_glPerspectiveDivideVertex(vertex, h);
}
_glSubmitHeaderOrVertex(vertex);
}
uint8_t visible_mask = 0;
uint8_t counter = 0;
return;
}
tri_count = 0;
strip_count = 0;
#if CLIP_DEBUG
printf("----\n");
#endif
for(int i = 0; i < n; ++i, ++vertex) {
PREFETCH(vertex + 1);
bool is_last_in_strip = glIsLastVertex(vertex->flags);
/* Wait until we fill the triangle */
if(tri_count < 3) {
if(glIsVertex(vertex->flags)) {
triangle[tri_count].v = vertex;
triangle[tri_count].visible = vertex->xyz[2] >= -vertex->w;
tri_count++;
strip_count++;
} else {
/* We hit a header */
tri_count = 0;
strip_count = 0;
_glSubmitHeaderOrVertex(vertex);
}
if(tri_count < 3) {
for(int i = 0; i < n; ++i, ++v2) {
PREFETCH(v2 + 1);
switch(v2->flags) {
case GPU_CMD_VERTEX_EOL:
if(counter < 2) {
continue;
}
counter = 0;
break;
case GPU_CMD_VERTEX:
++counter;
if(counter < 3) {
continue;
}
break;
default:
_glPushHeaderOrVertex(v2);
counter = 0;
continue;
}
};
Vertex* const v0 = v2 - 2;
Vertex* const v1 = v2 - 1;
visible_mask = (
(v0->xyz[2] > -v0->w) << 0 |
(v1->xyz[2] > -v1->w) << 1 |
(v2->xyz[2] > -v2->w) << 2 |
(counter == 0) << 3
);
switch(visible_mask) {
case 15: /* All visible, but final vertex in strip */
{
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(v1, h);
_glPushHeaderOrVertex(v1);
_glPerspectiveDivideVertex(v2, h);
_glPushHeaderOrVertex(v2);
}
break;
case 7:
/* All visible, push the first vertex and move on */
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
break;
case 9:
/* First vertex was visible, last in strip */
{
Vertex __attribute__((aligned(32))) scratch[2];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
#if CLIP_DEBUG
printf("SC: %d\n", strip_count);
#endif
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
/* If we got here, then triangle contains 3 vertices */
int visible_mask = triangle[0].visible | (triangle[1].visible << 1) | (triangle[2].visible << 2);
if(visible_mask == 7) {
#if CLIP_DEBUG
printf("Visible\n");
#endif
/* All the vertices are visible! We divide and submit v0, then shift */
_glPerspectiveDivideVertex(vertex - 2, h);
_glSubmitHeaderOrVertex(vertex - 2);
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX_EOL;
if(is_last_in_strip) {
_glPerspectiveDivideVertex(vertex - 1, h);
_glSubmitHeaderOrVertex(vertex - 1);
_glPerspectiveDivideVertex(vertex, h);
_glSubmitHeaderOrVertex(vertex);
tri_count = 0;
strip_count = 0;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
}
break;
case 1:
/* First vertex was visible, but not last in strip */
{
Vertex __attribute__((aligned(32))) scratch[2];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(b);
}
break;
case 10:
case 2:
/* Second vertex was visible. In self case we need to create a triangle and produce
two new vertices: 1-2, and 2-3. */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = v2->flags;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
}
break;
case 11:
case 3: /* First and second vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glClipEdge(v1, v2, a);
a->flags = v2->flags;
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(c);
_glPushHeaderOrVertex(a);
}
break;
case 12:
case 4:
/* Third vertex was visible. */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
_glClipEdge(v2, v0, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
if(counter % 2 == 1) {
_glPushHeaderOrVertex(a);
}
ShiftRotateTriangle();
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
} else if(visible_mask) {
/* Clipping time!
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
}
break;
case 13:
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
There are 6 distinct possibilities when clipping a triangle. 3 of them result
in another triangle, 3 of them result in a quadrilateral.
memcpy_vertex(c, v2);
c->flags = GPU_CMD_VERTEX;
Assuming you iterate the edges of the triangle in order, and create a new *visible*
vertex when you cross the plane, and discard vertices behind the plane, then the only
difference between the two cases is that the final two vertices that need submitting have
to be reversed.
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
Unfortunately we have to copy vertices here, because if we persp-divide a vertex it may
be used in a subsequent triangle in the strip and would end up being double divided.
*/
#if CLIP_DEBUG
printf("Clip: %d, SC: %d\n", visible_mask, strip_count);
printf("%d, %d, %d\n", triangle[0].v - (Vertex*) src - 1, triangle[1].v - (Vertex*) src - 1, triangle[2].v - (Vertex*) src - 1);
#endif
Vertex tmp;
if(strip_count > 3) {
#if CLIP_DEBUG
printf("Flush\n");
#endif
tmp = *(vertex - 2);
/* If we had triangles ahead of this one, submit and finalize */
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
tmp = *(vertex - 1);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
}
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
switch(visible_mask) {
case 1: {
/* 0, 0a, 2a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
case 2: {
/* 0a, 1, 1a */
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
c->flags = GPU_CMD_VERTEX_EOL;
_glPushHeaderOrVertex(c);
}
break;
case 5: /* First and third vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
memcpy_vertex(c, v2);
c->flags = GPU_CMD_VERTEX;
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
case 3: {
/* 0, 1, 2a, 1a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
case 4: {
/* 1a, 2, 2a */
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(c);
}
break;
case 14:
case 6: /* Second and third vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[4];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
Vertex* d = &scratch[3];
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
case 5: {
/* 0, 0a, 2, 1a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
memcpy_vertex(c, v1);
memcpy_vertex(d, v2);
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
case 6: {
/* 0a, 1, 2a, 2 */
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(c);
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
default:
break;
}
/* If this was the last in the strip, we don't need to
submit anything else, we just wipe the tri_count */
if(is_last_in_strip) {
tri_count = 0;
strip_count = 0;
} else {
ShiftRotateTriangle();
strip_count = 2;
}
} else {
/* Invisible? Move to the next in the strip */
if(is_last_in_strip) {
tri_count = 0;
strip_count = 0;
}
strip_count = 2;
ShiftRotateTriangle();
_glPerspectiveDivideVertex(d, h);
_glPushHeaderOrVertex(d);
}
break;
case 8:
default:
break;
}
}
_glFlushBuffer();
}
void SceneListFinish() {
@ -536,18 +463,41 @@ void SceneListFinish() {
Vertex* v0 = (Vertex*) (flags - step - step);
Vertex* v1 = (Vertex*) (flags - step);
Vertex* v2 = (Vertex*) (flags);
(vidx % 2 == 0) ? DrawTriangle(v0, v1, v2) : DrawTriangle(v1, v0, v2);
SDL_Vertex sv0 = {
{v0->xyz[0], v0->xyz[1]},
{v0->bgra[2], v0->bgra[1], v0->bgra[0], v0->bgra[3]},
{v0->uv[0], v0->uv[1]}
};
SDL_Vertex sv1 = {
{v1->xyz[0], v1->xyz[1]},
{v1->bgra[2], v1->bgra[1], v1->bgra[0], v1->bgra[3]},
{v1->uv[0], v1->uv[1]}
};
SDL_Vertex sv2 = {
{v2->xyz[0], v2->xyz[1]},
{v2->bgra[2], v2->bgra[1], v2->bgra[0], v2->bgra[3]},
{v2->uv[0], v2->uv[1]}
};
aligned_vector_push_back(&vbuffer, &sv0, 1);
aligned_vector_push_back(&vbuffer, &sv1, 1);
aligned_vector_push_back(&vbuffer, &sv2, 1);
}
if((*flags) == GPU_CMD_VERTEX_EOL) {
vidx = 0;
}
}
SDL_SetRenderDrawColor(RENDERER, 255, 255, 255, 255);
SDL_RenderGeometry(RENDERER, NULL, aligned_vector_front(&vbuffer), aligned_vector_size(&vbuffer), NULL, 0);
}
void SceneFinish() {
SDL_RenderPresent(RENDERER);
return;
/* Only sensible place to hook the quit signal */
SDL_Event e;
while (SDL_PollEvent(&e)) {

View File

@ -48,7 +48,8 @@ void TransformVec3NoMod(const float* v, float* ret);
/* Transform a 3-element normal using the stored matrix (w == 0)*/
static inline void TransformNormalNoMod(const float* xIn, float* xOut) {
(void) xIn;
(void) xOut;
}
void TransformVertices(Vertex* vertices, const int count);

View File

@ -164,7 +164,10 @@ typedef struct {
GLboolean isCompressed;
GLboolean isPaletted;
//50
} TextureObject;
GLenum internalFormat;
//54
GLubyte padding[10]; // Pad to 64-bytes
} __attribute__((aligned(32))) TextureObject;
typedef struct {
GLfloat emissive[4];
@ -233,11 +236,41 @@ GL_FORCE_INLINE float clamp(float d, float min, float max) {
return (d < min) ? min : (d > max) ? max : d;
}
GL_FORCE_INLINE void memcpy_vertex(Vertex *dest, const Vertex *src) {
#ifdef __DREAMCAST__
_Complex float double_scratch;
asm volatile (
"fschg\n\t"
"clrs\n\t"
".align 2\n\t"
"fmov.d @%[in]+, %[scratch]\n\t"
"fmov.d %[scratch], @%[out]\n\t"
"fmov.d @%[in]+, %[scratch]\n\t"
"add #8, %[out]\n\t"
"fmov.d %[scratch], @%[out]\n\t"
"fmov.d @%[in]+, %[scratch]\n\t"
"add #8, %[out]\n\t"
"fmov.d %[scratch], @%[out]\n\t"
"fmov.d @%[in], %[scratch]\n\t"
"add #8, %[out]\n\t"
"fmov.d %[scratch], @%[out]\n\t"
"fschg\n"
: [in] "+&r" ((uint32_t) src), [scratch] "=&d" (double_scratch), [out] "+&r" ((uint32_t) dest)
:
: "t", "memory" // clobbers
);
#else
*dest = *src;
#endif
}
#define swapVertex(a, b) \
do { \
Vertex c = *a; \
*a = *b; \
*b = c; \
Vertex __attribute__((aligned(32))) c; \
memcpy_vertex(&c, a); \
memcpy_vertex(a, b); \
memcpy_vertex(b, &c); \
} while(0)
/* ClipVertex doesn't have room for these, so we need to parse them
@ -345,6 +378,9 @@ extern GLubyte ACTIVE_TEXTURE;
extern GLboolean TEXTURES_ENABLED[];
GLubyte _glGetActiveTexture();
GLint _glGetTextureInternalFormat();
GLboolean _glGetTextureTwiddle();
void _glSetTextureTwiddle(GLboolean v);
GLuint _glGetActiveClientTexture();
TexturePalette* _glGetSharedPalette(GLshort bank);
@ -520,6 +556,7 @@ void _glSetLightModelColorControl(GLint v);
GLuint _glEnabledLightCount();
void _glRecalcEnabledLights();
GLfloat* _glLightModelSceneAmbient();
GLfloat* _glGetLightModelSceneAmbient();
LightSource* _glLightAt(GLuint i);
GLboolean _glNearZClippingEnabled();

View File

@ -180,6 +180,10 @@ void _glSetLightModelSceneAmbient(const GLfloat* v) {
vec4cpy(GPUState.scene_ambient, v);
}
GLfloat* _glGetLightModelSceneAmbient() {
return GPUState.scene_ambient;
}
void _glSetLightModelColorControl(GLint v) {
GPUState.color_control = v;
}
@ -251,7 +255,8 @@ void _glUpdatePVRTextureContext(PolyContext *context, GLshort textureUnit) {
context->txr2.enable = GPU_TEXTURE_DISABLE;
context->txr2.alpha = GPU_TXRALPHA_DISABLE;
if(!TEXTURES_ENABLED[textureUnit] || !tx1) {
if(!TEXTURES_ENABLED[textureUnit] || !tx1 || !tx1->data) {
context->txr.base = NULL;
return;
}
@ -399,8 +404,8 @@ GLAPI void APIENTRY glEnable(GLenum cap) {
}
break;
case GL_CULL_FACE: {
if(GPUState.cull_face != GL_TRUE) {
GPUState.cull_face = GL_TRUE;
if(GPUState.culling_enabled != GL_TRUE) {
GPUState.culling_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
@ -489,7 +494,11 @@ GLAPI void APIENTRY glEnable(GLenum cap) {
GPUState.is_dirty = GL_TRUE;
}
break;
case GL_TEXTURE_TWIDDLE_KOS:
_glSetTextureTwiddle(GL_TRUE);
break;
default:
_glKosThrowError(GL_INVALID_VALUE, __func__);
break;
}
}
@ -503,8 +512,8 @@ GLAPI void APIENTRY glDisable(GLenum cap) {
}
break;
case GL_CULL_FACE: {
if(GPUState.cull_face != GL_FALSE) {
GPUState.cull_face = GL_FALSE;
if(GPUState.culling_enabled != GL_FALSE) {
GPUState.culling_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
@ -591,7 +600,11 @@ GLAPI void APIENTRY glDisable(GLenum cap) {
GPUState.is_dirty = GL_TRUE;
}
break;
case GL_TEXTURE_TWIDDLE_KOS:
_glSetTextureTwiddle(GL_FALSE);
break;
default:
_glKosThrowError(GL_INVALID_VALUE, __func__);
break;
}
}
@ -972,6 +985,10 @@ void APIENTRY glGetIntegerv(GLenum pname, GLint *params) {
case GL_FREE_CONTIGUOUS_TEXTURE_MEMORY_KOS:
*params = _glFreeContiguousTextureMemory();
break;
case GL_TEXTURE_INTERNAL_FORMAT_KOS:
*params = _glGetTextureInternalFormat();
break;
default:
_glKosThrowError(GL_INVALID_ENUM, __func__);
break;

File diff suppressed because it is too large Load Diff

View File

@ -1,21 +0,0 @@
MIT License
Copyright (c) [year] [fullname]
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,158 +0,0 @@
# Summary
yalloc is a memory efficient allocator which is intended for embedded
applications that only have a low amount of RAM and want to maximize its
utilization. Properties of the allocator:
- pools can be up to 128k
- user data is 32bit aligned
- 4 bytes overhead per allocation
- supports defragmentation
- uses a free list for first fit allocation strategy (most recently freed
blocks are used first)
- extensively tested (see section below)
- MIT license
# Defragmentation
This feature was the initial motivation for this implementation. Especially
when dealing with highly memory constrained environments fragmenting memory
pools can be annoying. For this reason this implementation supports
defragmentation which moves all allocated blocks into a contiguous range at the
beginning of the pool, leaving a maximized free range at the end.
As there is no garbage collector or other runtime system involved that updates
the references, the application must do so. This is done in three steps:
1. yalloc_defrag_start() is called. This calculates the new
post-defragmentation-addresses for all allocations, but otherwise leaves
the allocations untouched.
2. yalloc_defrag_address() is called by the application for every pointer that
points to an allocation. It returns the post-defragmentation-address for
the allocation. The application must update all its relevant pointers this
way. Care must be taken to not dereference those moved pointers yet. If the
application works with hierarchical data then this can easily be done by
updating the pointers bottom up (first the leaves, then their parents).
3. yalloc_defrag_commit() is called to finally perform the defragmentation.
All allocated blocks are moved to their post-defragmentation-address and
the application can continue using the pool the normal way.
It is up to the application when (and if) it performs defragmentation. One
strategy would be to delay it until an allocation failure. Another approach
would be to perform the defragmentation regularly when there is nothing else to
do.
# Configurable Defines
INTERNAL_VALIDATE
If this is not defined on the compiler commandline it will be defined as 0 if
NDEBUG is defined and otherwise as 1. If you want to disable internal
validation when NDEBUG is not defined then define INERNAL_VALIDATE as 0 on the
compiler commandline.
If it is nonzero the heap will be validated via a bunch of assert() calls at
the end of every function that modifies the heap. This has roughly O(N*M)
overhead where N is the number of allocated blocks and M the number of free
blocks in a heap. For applications with enough live allocations this will get
significant.
YALLOC_VALGRIND
If this is defined in yalloc.c and NVALGRIND is not defined then
valgrind/memcheck.h is included and the allocator functions tell valgrind
about the pool, the allocations and makes the block headers inaccessible outside
of yalloc-functions. This allows valgrind to detect a lot of the accidents that
can happen when dealing with dynamic memory. This also adds some overhead for every
yalloc-call because most of them will "unprotect" the internal structure on
entry and "protect" it again (marking it as inaccessible for valgrind) before
returning.
# Tests
The tests rely on internal validation of the pool (see INTERNAL_VALIDATE) to
check that no assumptions about the internal structure of the pool are
violated. They additionally check for correctness of observations that can be
made by using the public functions of the allocator (like checking if user data
stays unmodified). There are a few different scripts that run tests:
- run_coverage.sh runs a bunch of testfunctions that are carefully crafted to
cover all code paths. Coverage data is generated by clang and a summary is
shown at the end of the test.
- run_valgrind.sh tests if the valgrind integration is working as expected,
runs the functions from the coverage test and some randomly generated
testcases under valgrind.
- run_libfuzzer.sh uses libfuzzer from clang to generate interesting testcases
and runs them in multiple jobs in parallel for 10 seconds. It also generates
coverage data at the end (it always got 100% coverage in my testruns).
All tests exit with 0 and print "All fine!" at the end if there were no
errors. Coverage deficits are not counted as error, so you have to look at the
summary (they should show 100% coverage!).
# Implementation Details
The Headers and the user data are 32bit aligned. Headers have two 16bit fields
where the high 15 bits represent offsets (relative to the pools address) to the
previous/next block. The macros HDR_PTR() and HDR_OFFSET() are used to
translate an offset to an address and back. The 32bit alignment is exploited to
allow pools of up to 128k with that 15 significant bits.
A pool is always occupied by non-overlapping blocks that link to their
previous/next block in address order via the prev/next field of Header.
Free blocks are always joined: No two free blocks will ever be neighbors.
Free blocks have an additional header of the same structure. This additional
header is used to build a list of free blocks (independent of their address
order).
yalloc_free() will insert the freed block to the front of the free list.
yalloc_alloc() searches that list front to back and takes the first block that
is big enough to satisfy the allocation.
There is always a Header at the front and at the end of the pool. The Header at
the end is degenerate: It is marked as "used" but has no next block (which is
usually used to determine the size of a block).
The prev-field of the very first block in the pool has special meaning: It
points to the first free block in the pool. Or, if the pool is currently
defragmenting (after yalloc_defrag_start() and before yalloc_defrag_commit()),
points to the last header of the pool. This state can be recognized by checking
if it points to an empty block (normal pool state) or a used block
(defragmentation in progress). This logic can be seen in
yalloc_defrag_in_progress().
The lowest bit of next/prev have special meaning:
- low bit of prev is set for free blocks
- low bit of next is set for blocks with 32bit padding after the user data.
This is needed when a block is allocated from a free block that leaves only
4 free bytes after the user data... which is not enough to insert a
free-header (which needs 8 bytes). The padding will be reclaimed when
that block is freed or when the pool is defragmented. The predicate
isPadded() can be used to test if a block is padded. Free blocks are never
padded.
The predicate isNil() can be used to test if an offset points nowhere (it tests
if all 15 high bits of an offset are 1). The constant NIL has all but the
lowest bit set. It is used to set offsets to point to nowhere, and in some
places it is used to mask out the actual address bits of an offset. This should
be kept in mind when modifying the code and updating prev/next: Think carefully
if you have to preserve the low bit when updating an offset!
Defragmentation is done in two phases: First the user calls
yalloc_defrag_start(). This will put the pool in a special state where no
alloc/free-calls are allowed. In this state the prev-fields of the used blocks
have a special meaning: They store the offset that the block will have after
defragmentation finished. This information is used by yalloc_defrag_address()
which can be called by the application to query the new addresses for its
allocations. After the application has updated all its pointers it must call
yalloc_defrag_commit() which moves all used blocks in contiguous space at the
beginning of the pool, leaving one maximized free block at the end.

View File

@ -1,803 +0,0 @@
#include "yalloc.h"
#include "yalloc_internals.h"
#include <assert.h>
#include <string.h>
#define ALIGN(num, align) (((num) + ((align) - 1)) & ~((align) - 1))
#if defined(YALLOC_VALGRIND) && !defined(NVALGRIND)
# define USE_VALGRIND 1
#else
# define USE_VALGRIND 0
#endif
#if USE_VALGRIND
# include <valgrind/memcheck.h>
#else
# define VALGRIND_MAKE_MEM_UNDEFINED(p, s) ((void)0)
# define VALGRIND_MAKE_MEM_DEFINED(p, s) ((void)0)
# define VALGRIND_MAKE_MEM_NOACCESS(p, s) ((void)0)
# define VALGRIND_CREATE_MEMPOOL(pool, rz, z) ((void)0)
# define VALGRIND_MEMPOOL_ALLOC(pool, p, s) ((void)0)
# define VALGRIND_MEMPOOL_FREE(pool, p) ((void)0)
# define VALGRIND_MEMPOOL_CHANGE(pool, a, b, s) ((void)0)
#endif
#define MARK_NEW_FREE_HDR(p) VALGRIND_MAKE_MEM_UNDEFINED(p, sizeof(Header) * 2)
#define MARK_NEW_HDR(p) VALGRIND_MAKE_MEM_UNDEFINED(p, sizeof(Header))
#define PROTECT_HDR(p) VALGRIND_MAKE_MEM_NOACCESS(p, sizeof(Header))
#define PROTECT_FREE_HDR(p) VALGRIND_MAKE_MEM_NOACCESS(p, sizeof(Header) * 2)
#define UNPROTECT_HDR(p) VALGRIND_MAKE_MEM_DEFINED(p, sizeof(Header))
#define UNPROTECT_FREE_HDR(p) VALGRIND_MAKE_MEM_DEFINED(p, sizeof(Header) * 2)
#if USE_VALGRIND
/* Walks every block in address order and marks its header (plus the second,
   free-list header of free blocks) as readable for valgrind. Called on entry
   to the public functions so the allocator may touch its own metadata. */
static void _unprotect_pool(void * pool)
{
  Header * cur = (Header*)pool;
  for (;;)
  {
    UNPROTECT_HDR(cur);
    if (isFree(cur))
      UNPROTECT_HDR(cur + 1); // free blocks carry a second (free-list) header
    if (isNil(cur->next))
      break;
    cur = HDR_PTR(cur->next);
  }
}

/* Inverse of _unprotect_pool(): marks all block headers (and the whole body
   of free blocks) as inaccessible again so stray user accesses are reported
   by valgrind. */
static void _protect_pool(void * pool)
{
  Header * cur = (Header*)pool;
  while (cur)
  {
    Header * next = isNil(cur->next) ? NULL : HDR_PTR(cur->next);
    if (isFree(cur))
      VALGRIND_MAKE_MEM_NOACCESS(cur, (char*)next - (char*)cur);
    else
      PROTECT_HDR(cur);
    cur = next;
  }
}

#define assert_is_pool(pool) assert(VALGRIND_MEMPOOL_EXISTS(pool));

#else
/* Valgrind integration disabled: protection helpers become no-ops. */
static void _unprotect_pool(void * pool){(void)pool;}
static void _protect_pool(void * pool){(void)pool;}
#define assert_is_pool(pool) ((void)0)
#endif
// internal version that does not unprotect/protect the pool
static int _yalloc_defrag_in_progress(void * pool)
{
  /* Defragmentation is flagged by the pool's free-list pointer referencing
     the used dummy block at the end, i.e. a block whose free-bit is clear. */
  Header * first = (Header*)pool;
  return isNil(first->prev) ? 0 : !(HDR_PTR(first->prev)->prev & 1);
}
/* Public wrapper: unprotect the metadata, query the state, re-protect. */
int yalloc_defrag_in_progress(void * pool)
{
  _unprotect_pool(pool);
  const int defragging = _yalloc_defrag_in_progress(pool);
  _protect_pool(pool);
  return defragging;
}
#if YALLOC_INTERNAL_VALIDATE
/* Counts how often blk occurs in the pool's free list.
   Fix: accumulate in a size_t to match the declared return type — the old
   "int n" was implicitly converted on return and inconsistent with the
   sibling _count_addr_list_occurences(). */
static size_t _count_free_list_occurences(Header * pool, Header * blk)
{
  size_t n = 0;
  if (!isNil(pool->prev))
  { // free list is non-empty; pool->prev points at its first entry
    Header * cur = HDR_PTR(pool->prev);
    for (;;)
    {
      if (cur == blk)
        ++n;
      if (isNil(cur[1].next))
        break;
      cur = HDR_PTR(cur[1].next);
    }
  }
  return n;
}
/* Counts how often blk occurs in the address-ordered block list. */
static size_t _count_addr_list_occurences(Header * pool, Header * blk)
{
  size_t hits = 0;
  for (Header * walk = pool; ; walk = HDR_PTR(walk->next))
  {
    if (walk == blk)
      ++hits;
    if (isNil(walk->next))
      return hits;
  }
}
/* Asserts that p is a live user pointer of this pool: its header must occur
   exactly once in the address list and must not be marked free. */
static void _validate_user_ptr(void * pool, void * p)
{
  Header * const hdr = (Header*)p - 1;
  size_t occurrences = _count_addr_list_occurences((Header*)pool, hdr);
  assert(occurrences == 1 && !isFree(hdr));
}
/**
Validates if all the invariants of a pool are intact.

This is very expensive when there are enough blocks in the heap (quadratic complexity!).
*/
static void _yalloc_validate(void * pool_)
{
  Header * pool = (Header*)pool_;
  Header * cur = pool;
  assert(!isNil(pool->next)); // there must always be at least two blocks: a free/used one and the final block at the end

  if (_yalloc_defrag_in_progress(pool))
  { // during defragmentation the prev-fields of used blocks hold their
    // post-defragmentation offsets; check they form a contiguous packing
    Header * prevUsed = NULL;
    while (!isNil(cur->next))
    {
      if (!isFree(cur))
      { // it is a used block
        Header * newAddr = cur == pool ? pool : HDR_PTR(cur->prev);
        assert(newAddr <= cur);   // blocks only ever move towards the pool start
        assert(newAddr >= pool);

        if (prevUsed)
        { // this block must land directly after the previous used block
          Header * prevNewAddr = prevUsed == pool ? pool : HDR_PTR(prevUsed->prev);
          size_t prevBruttoSize = (char*)HDR_PTR(prevUsed->next) - (char*)prevUsed;
          if (isPadded(prevUsed))
            prevBruttoSize -= 4; // remove padding
          assert((char*)newAddr == (char*)prevNewAddr + prevBruttoSize);
        }
        else
        { // the first used block always moves to the pool start
          assert(newAddr == pool);
        }

        prevUsed = cur;
      }

      cur = HDR_PTR(cur->next);
    }

    assert(cur == HDR_PTR(pool->prev)); // the free-list should point to the last block
    assert(!isFree(cur)); // the last block must not be free
  }
  else
  { // normal state: check address list, free list and their cross-references
    Header * prev = NULL;

    // iterate blocks in address order
    for (;;)
    {
      if (prev)
      { // prev-link must point back to the previous block in address order
        Header * x = HDR_PTR(cur->prev);
        assert(x == prev);
      }

      int n = _count_free_list_occurences(pool, cur);
      if (isFree(cur))
      { // it is a free block
        assert(n == 1); // every free block appears exactly once in the free list
        assert(!isPadded(cur)); // free blocks must have a zero padding-bit

        if (prev)
        {
          assert(!isFree(prev)); // free blocks must not be direct neighbours
        }
      }
      else
      { // used blocks must never appear in the free list
        assert(n == 0);
      }

      if (isNil(cur->next))
        break;

      Header * next = HDR_PTR(cur->next);
      assert((char*)next >= (char*)cur + sizeof(Header) * 2); // minimum block size
      prev = cur;
      cur = next;
    }

    assert(isNil(cur->next)); // loop must end at the terminating dummy block

    if (!isNil(pool->prev))
    {
      // iterate free-list
      Header * f = HDR_PTR(pool->prev);
      assert(isNil(f[1].prev)); // the first free block has no predecessor

      for (;;)
      {
        assert(isFree(f)); // must be free
        int n = _count_addr_list_occurences(pool, f);
        assert(n == 1); // every free-list entry is a real block of the pool

        if (isNil(f[1].next))
          break;
        f = HDR_PTR(f[1].next);
      }
    }
  }
}
#else
/* Internal validation disabled (YALLOC_INTERNAL_VALIDATE == 0): both checks
   compile to no-ops so release builds pay no cost. */
static void _yalloc_validate(void * pool){(void)pool;}
static void _validate_user_ptr(void * pool, void * p){(void)pool; (void)p;}
#endif
/**
 Initializes a pool inside the given buffer.

 The pool is set up as one free block spanning the whole (rounded-down)
 buffer, terminated by a degenerate "used" header that only marks the end.

 @param pool Start of the buffer (must satisfy the alignment documented in yalloc.h).
 @param size Size of the buffer in bytes; rounded down to a multiple of sizeof(Header).
 @return 0 on success, -1 if the size is unsupported (too large or too small).
*/
int yalloc_init(void * pool, size_t size)
{
  if (size > MAX_POOL_SIZE)
    return -1;

  // TODO: Error when pool is not properly aligned
  // TODO: Error when size is not a multiple of the alignment?

  // Round down to a multiple of the header size in O(1) — the old version
  // decremented in a loop, one iteration per surplus byte.
  size -= size % sizeof(Header);
  if(size < sizeof(Header) * 3)
    return -1; // too small for first header + free-list header + end marker

  VALGRIND_CREATE_MEMPOOL(pool, 0, 0);

  Header * first = (Header*)pool;
  Header * last = (Header*)((char*)pool + size) - 1;

  MARK_NEW_FREE_HDR(first);
  MARK_NEW_HDR(first);

  first->prev = HDR_OFFSET(first) | 1; // free-list head points at this block; low bit = "is free"
  first->next = HDR_OFFSET(last);
  first[1].prev = NIL; // the single free block has no free-list neighbours
  first[1].next = NIL;

  last->prev = HDR_OFFSET(first);
  last->next = NIL; // degenerate end marker: "used", but no next block

  _unprotect_pool(pool);
  _yalloc_validate(pool);
  _protect_pool(pool);
  return 0;
}
/* Releases the pool's buffer for other use. Without valgrind this is a no-op;
   with valgrind it destroys the mempool and marks the whole buffer undefined. */
void yalloc_deinit(void * pool)
{
#if USE_VALGRIND
  VALGRIND_DESTROY_MEMPOOL(pool);

  // walk to the last header so we know the pool's total extent
  Header * last = (Header*)pool;
  UNPROTECT_HDR(last);
  while (!isNil(last->next))
  {
    Header * next = HDR_PTR(last->next);
    UNPROTECT_HDR(next);
    last = next;
  }

  VALGRIND_MAKE_MEM_UNDEFINED(pool, (char*)(last + 1) - (char*)pool);
#else
  (void)pool;
#endif
}
/* Allocates size bytes from the pool using first-fit over the free list.
   Returns the address right after the block header, or NULL if no free block
   is big enough (or size is 0). Must not be called while defragmenting. */
void * yalloc_alloc(void * pool, size_t size)
{
  assert_is_pool(pool);
  _unprotect_pool(pool);
  assert(!_yalloc_defrag_in_progress(pool));
  _yalloc_validate(pool);
  if (!size)
  {
    _protect_pool(pool);
    return NULL;
  }

  Header * root = (Header*)pool;
  if (isNil(root->prev))
  {
    _protect_pool(pool);
    return NULL; /* free list is empty, no chance to allocate anything */
  }

  /* round up to alignment */
  size = ALIGN(size, 32);

  size_t bruttoSize = size + sizeof(Header);
  Header * prev = NULL;
  Header * cur = HDR_PTR(root->prev); // first fit: walk the free list front to back
  for (;;)
  {
    size_t curSize = (char*)HDR_PTR(cur->next) - (char*)cur; /* size of the block, including its header */

    if (curSize >= bruttoSize) // it is big enough
    {
      // take action for unused space in the free block
      if (curSize >= bruttoSize + sizeof(Header) * 2)
      { // the leftover space is big enough to make it a free block
        // Build a free block from the unused space and insert it into the list of free blocks after the current free block
        Header * tail = (Header*)((char*)cur + bruttoSize);
        MARK_NEW_FREE_HDR(tail);

        // update address-order-list
        tail->next = cur->next;
        tail->prev = HDR_OFFSET(cur) | 1;
        HDR_PTR(cur->next)->prev = HDR_OFFSET(tail); // NOTE: We know the next block is used because free blocks are never neighbours. So we don't have to care about the lower bit which would be set for the prev of a free block.
        cur->next = HDR_OFFSET(tail);

        // update list of free blocks
        tail[1].next = cur[1].next;
        // NOTE: tail[1].prev is updated in the common path below (assignment to "HDR_PTR(cur[1].next)[1].prev")

        if (!isNil(cur[1].next))
          HDR_PTR(cur[1].next)[1].prev = HDR_OFFSET(tail);
        cur[1].next = HDR_OFFSET(tail);
      }
      else if (curSize > bruttoSize)
      { // there will be unused space, but not enough to insert a free header
        internal_assert(curSize - bruttoSize == sizeof(Header)); // unused space must be enough to build a free-block or it should be exactly the size of a Header
        cur->next |= 1; // set marker for "has unused trailing space"
      }
      else
      { // exact fit, nothing left over
        internal_assert(curSize == bruttoSize);
      }

      cur->prev &= NIL; // clear marker for "is a free block"

      // remove from linked list of free blocks
      if (prev)
        prev[1].next = cur[1].next;
      else
      { // cur was the first free block: repoint the pool's free-list head,
        // preserving the pool header's own free-bit
        uint32_t freeBit = isFree(root);
        root->prev = (cur[1].next & NIL) | freeBit;
      }

      if (!isNil(cur[1].next))
        HDR_PTR(cur[1].next)[1].prev = prev ? HDR_OFFSET(prev) : NIL;

      _yalloc_validate(pool);
      VALGRIND_MEMPOOL_ALLOC(pool, cur + 1, size);
      _protect_pool(pool);
      return cur + 1; // return address after the header
    }

    if (isNil(cur[1].next))
      break; // reached the end of the free list without a fit

    prev = cur;
    cur = HDR_PTR(cur[1].next);
  }

  _yalloc_validate(pool);
  _protect_pool(pool);
  return NULL;
}
// Removes a block from the free-list and moves the pools first-free-bock pointer to its successor if it pointed to that block.
static void unlink_from_free_list(Header * pool, Header * blk)
{
  if (!isNil(blk[1].prev))
  { // a free-list predecessor exists: bypass blk in the forward direction
    HDR_PTR(blk[1].prev)[1].next = blk[1].next;
  }
  else
  { // blk is the head of the free list: repoint the pool's first-free
    // pointer to blk's successor, preserving the pool header's free-bit
    uint32_t freeBit = isFree(pool);
    pool->prev = (blk[1].next & NIL) | freeBit;
  }

  // bypass blk in the backward direction, if a successor exists
  if (!isNil(blk[1].next))
    HDR_PTR(blk[1].next)[1].prev = blk[1].prev;
}
/* Returns the usable payload size of allocation p (the requested size rounded
   up to the allocator's alignment). */
size_t yalloc_block_size(void * pool, void * p)
{
  Header * const hdr = (Header*)p - 1;
  UNPROTECT_HDR(hdr);
  size_t payloadSize = (char*)HDR_PTR(hdr->next) - (char*)p;
  if (isPadded(hdr))
    payloadSize -= sizeof(Header); // trailing padding is not usable payload
  PROTECT_HDR(hdr);
  return payloadSize;
}
/* Returns allocation p to the pool, coalescing it with free neighbours and
   reclaiming any trailing padding of the preceding used block. The resulting
   free block is pushed onto the front of the free list. */
void yalloc_free(void * pool_, void * p)
{
  assert_is_pool(pool_);
  assert(!yalloc_defrag_in_progress(pool_));
  if (!p)
    return; // free(NULL) is a no-op, like the C standard's free()

  _unprotect_pool(pool_);

  Header * pool = (Header*)pool_;
  Header * cur = (Header*)p - 1;

  // get pointers to previous/next block in address order
  Header * prev = cur == pool || isNil(cur->prev) ? NULL : HDR_PTR(cur->prev);
  Header * next = isNil(cur->next) ? NULL : HDR_PTR(cur->next);

  int prevFree = prev && isFree(prev);
  int nextFree = next && isFree(next);

#if USE_VALGRIND
  {
    unsigned errs = VALGRIND_COUNT_ERRORS;
    VALGRIND_MEMPOOL_FREE(pool, p);
    if (VALGRIND_COUNT_ERRORS > errs)
    { // early exit if the free was invalid (so we get a valgrind error and don't mess up the pool, which is helpful for testing if invalid frees are detected by valgrind)
      _protect_pool(pool_);
      return;
    }
  }
#endif

  _validate_user_ptr(pool_, p);

  if (prevFree && nextFree)
  { // the freed block has two free neighbors
    unlink_from_free_list(pool, prev);
    unlink_from_free_list(pool, next);

    // join prev, cur and next
    prev->next = next->next;
    HDR_PTR(next->next)->prev = cur->prev;

    // prev is now the block we want to push onto the free-list
    cur = prev;
  }
  else if (prevFree)
  {
    unlink_from_free_list(pool, prev);

    // join prev and cur
    prev->next = cur->next;
    HDR_PTR(cur->next)->prev = cur->prev;

    // prev is now the block we want to push onto the free-list
    cur = prev;
  }
  else if (nextFree)
  {
    unlink_from_free_list(pool, next);

    // join cur and next
    cur->next = next->next;
    HDR_PTR(next->next)->prev = next->prev & NIL; // mask out next's free-bit
  }

  // if there is a previous block and that block has padding then we want to grow the new free block into that padding
  if (cur != pool && !isNil(cur->prev))
  { // there is a previous block
    Header * left = HDR_PTR(cur->prev);
    if (isPadded(left))
    { // the previous block has padding: grow the current block backwards so the padding becomes part of the free block
      Header * grown = cur - 1;
      MARK_NEW_HDR(grown);
      grown->next = cur->next;
      grown->prev = cur->prev;
      left->next = HDR_OFFSET(grown);
      if (!isNil(cur->next))
        HDR_PTR(cur->next)->prev = HDR_OFFSET(grown);

      cur = grown;
    }
  }

  cur->prev |= 1; // it becomes a free block
  cur->next &= NIL; // reset padding-bit

  UNPROTECT_HDR(cur + 1);
  cur[1].prev = NIL; // it will be the first free block in the free list, so it has no prevFree

  if (!isNil(pool->prev))
  { // the free-list was already non-empty
    HDR_PTR(pool->prev)[1].prev = HDR_OFFSET(cur); // make the first entry in the free list point back to the new free block (it will become the first one)
    cur[1].next = pool->prev; // the next free block is the first of the old free-list
  }
  else
    cur[1].next = NIL; // free-list was empty, so there is no successor

  VALGRIND_MAKE_MEM_NOACCESS(cur + 2, (char*)HDR_PTR(cur->next) - (char*)(cur + 2));

  // now the freed block is the first in the free-list
  // update the offset to the first element of the free list
  uint32_t freeBit = isFree(pool); // remember the free-bit of the offset
  pool->prev = HDR_OFFSET(cur) | freeBit; // update the offset and restore the free-bit

  _yalloc_validate(pool);
  _protect_pool(pool);
}
/* Returns the total free space of the pool: all free blocks plus any padding
   of used blocks, minus one header (the cost of the allocation that would
   consume it). See yalloc.h for the exact contract. */
size_t yalloc_count_free(void * pool_)
{
  assert_is_pool(pool_);
  _unprotect_pool(pool_);
  assert(!_yalloc_defrag_in_progress(pool_));
  Header * pool = (Header*)pool_;
  size_t bruttoFree = 0;

  _yalloc_validate(pool);

  for (Header * cur = pool; ; cur = HDR_PTR(cur->next))
  {
    if (isFree(cur))
      bruttoFree += (char*)HDR_PTR(cur->next) - (char*)cur; // whole block, header included
    else if (isPadded(cur))
      bruttoFree += sizeof(Header); // padding of a used block is reclaimable

    if (isNil(cur->next))
      break;
  }

  _protect_pool(pool);

  if (bruttoFree < sizeof(Header))
  {
    internal_assert(!bruttoFree); // free space should always be a multiple of sizeof(Header)
    return 0;
  }

  return bruttoFree - sizeof(Header);
}
/* Returns the size of the largest allocation that would currently succeed,
   i.e. the biggest continuous free range minus one header. */
size_t yalloc_count_continuous(void * pool_)
{
  assert_is_pool(pool_);
  _unprotect_pool(pool_);
  assert(!_yalloc_defrag_in_progress(pool_));
  Header * pool = (Header*)pool_;
  size_t largestFree = 0;

  _yalloc_validate(pool);

  for (Header * cur = pool; ; cur = HDR_PTR(cur->next))
  {
    if (isFree(cur))
    { // track the maximum free span (header included)
      size_t span = (uintptr_t)HDR_PTR(cur->next) - (uintptr_t)cur;
      if (span > largestFree)
        largestFree = span;
    }
    if (isNil(cur->next))
      break;
  }

  _protect_pool(pool);

  if (largestFree < sizeof(Header))
  {
    internal_assert(!largestFree); // free space should always be a multiple of sizeof(Header)
    return 0;
  }

  return largestFree - sizeof(Header);
}
/* Returns the lowest-addressed allocation of the pool, or NULL if the pool
   holds no used blocks. */
void * yalloc_first_used(void * pool)
{
  assert_is_pool(pool);
  _unprotect_pool(pool);

  // scan in address order; the terminating dummy block is never reported
  for (Header * blk = (Header*)pool; !isNil(blk->next); blk = HDR_PTR(blk->next))
  {
    if (!isFree(blk))
    {
      _protect_pool(pool);
      return blk + 1; // user data sits right after the header
    }
  }

  _protect_pool(pool);
  return NULL;
}
/* Given an allocation p, returns the next allocation in address order, or
   NULL if p is the last one. */
void * yalloc_next_used(void * pool, void * p)
{
  assert_is_pool(pool);
  _unprotect_pool(pool);
  _validate_user_ptr(pool, p);

  Header * hdr = (Header*)p - 1;
  assert(!isNil(hdr->next)); // the last block should never end up as input to this function (because it is not user-visible)

  // continue the address-order scan right after p's block
  for (Header * blk = HDR_PTR(hdr->next); !isNil(blk->next); blk = HDR_PTR(blk->next))
  {
    if (!isFree(blk))
    {
      _protect_pool(pool);
      return blk + 1;
    }
  }

  _protect_pool(pool);
  return NULL;
}
/* Computes the post-defragmentation offset of every used block (stored in its
   prev field) and puts the pool into the "defragmentation in progress" state.
   No data is moved until yalloc_defrag_commit(). */
void yalloc_defrag_start(void * pool_)
{
  assert_is_pool(pool_);
  _unprotect_pool(pool_);
  assert(!_yalloc_defrag_in_progress(pool_));
  Header * pool = (Header*)pool_;

  // iterate over all blocks in address order and store the post-defragment address of used blocks in their "prev" field
  size_t end = 0; // offset for the next used block
  Header * blk = (Header*)pool;
  for (; !isNil(blk->next); blk = HDR_PTR(blk->next))
  {
    if (!isFree(blk))
    { // it is a used block
      blk->prev = end >> 1; // encode offset like HDR_OFFSET (inverse of HDR_PTR; checked below)
      internal_assert((char*)HDR_PTR(blk->prev) == (char*)pool + end);

      size_t bruttoSize = (char*)HDR_PTR(blk->next) - (char*)blk;
      if (isPadded(blk))
      { // the block is padded; padding is dropped during defragmentation
        bruttoSize -= sizeof(Header);
      }

      end += bruttoSize;
      internal_assert(end % sizeof(Header) == 0);
    }
  }

  // blk is now the last block (the dummy "used" block at the end of the pool)
  internal_assert(isNil(blk->next));
  internal_assert(!isFree(blk));

  // mark the pool as "defragmentation in progress"
  uint32_t freeBit = isFree(pool);
  pool->prev = (HDR_OFFSET(blk) & NIL) | freeBit;

  _yalloc_validate(pool);
  internal_assert(yalloc_defrag_in_progress(pool));
  _protect_pool(pool);
}
/* Returns the address allocation p will have after yalloc_defrag_commit().
   Must only be called while the pool is in the "defragmenting" state.

   Fix: the early-return path for the first allocation skipped
   _protect_pool(), leaving the pool metadata unprotected under valgrind
   after _unprotect_pool() had been called. Both paths now re-protect. */
void * yalloc_defrag_address(void * pool_, void * p)
{
  assert_is_pool(pool_);
  assert(yalloc_defrag_in_progress(pool_));
  if (!p)
    return NULL;

  Header * pool = (Header*)pool_;

  _unprotect_pool(pool);
  _validate_user_ptr(pool_, p);

  void * defragP;
  if (pool + 1 == p)
  { // the first allocation never moves; its prev-field is the pool's
    // "defragmentation in progress" marker, not a target offset
    defragP = pool + 1;
  }
  else
  {
    Header * blk = (Header*)p - 1;
    defragP = HDR_PTR(blk->prev) + 1;
  }

  _protect_pool(pool);
  return defragP;
}
/* Moves every used block to the target offset computed by
   yalloc_defrag_start(), leaving one maximized free block (or none) at the
   end of the pool, and returns the pool to its normal state. */
void yalloc_defrag_commit(void * pool_)
{
  assert_is_pool(pool_);
  _unprotect_pool(pool_);
  assert(_yalloc_defrag_in_progress(pool_));
  Header * pool = (Header*)pool_;

  // iterate over all blocks in address order and move them
  size_t end = 0; // offset for the next used block
  Header * blk = pool;
  Header * lastUsed = NULL;
  while (!isNil(blk->next))
  {
    if (!isFree(blk))
    { // it is a used block
      size_t bruttoSize = (char*)HDR_PTR(blk->next) - (char*)blk;
      if (isPadded(blk))
      { // the block is padded; the padding is dropped by the move
        bruttoSize -= sizeof(Header);
      }

      Header * next = HDR_PTR(blk->next); // remember before blk is overwritten

      blk->prev = lastUsed ? HDR_OFFSET(lastUsed) : NIL;
      blk->next = (end + bruttoSize) >> 1; // new next-offset, encoded like HDR_OFFSET

      lastUsed = (Header*)((char*)pool + end);
      VALGRIND_MAKE_MEM_UNDEFINED(lastUsed, (char*)blk - (char*)lastUsed);
      memmove(lastUsed, blk, bruttoSize); // source and target may overlap
      VALGRIND_MEMPOOL_CHANGE(pool, blk + 1, lastUsed + 1, bruttoSize - sizeof(Header));

      end += bruttoSize;
      blk = next;
    }
    else
      blk = HDR_PTR(blk->next);
  }

  // blk is now the last block (the dummy "used" block at the end of the pool)
  internal_assert(isNil(blk->next));
  internal_assert(!isFree(blk));

  if (lastUsed)
  {
    Header * gap = HDR_PTR(lastUsed->next);
    if (gap == blk)
    { // there is no gap
      pool->prev = NIL; // the free list is empty
      blk->prev = HDR_OFFSET(lastUsed);
    }
    else if (blk - gap > 1)
    { // the gap is big enough for a free Header
      // set a free list that contains the gap as only element
      gap->prev = HDR_OFFSET(lastUsed) | 1;
      gap->next = HDR_OFFSET(blk);
      gap[1].prev = NIL;
      gap[1].next = NIL;
      pool->prev = blk->prev = HDR_OFFSET(gap);
    }
    else
    { // there is a gap, but it is too small to be used as free-list-node, so just make it padding of the last used block
      lastUsed->next = HDR_OFFSET(blk) | 1;
      pool->prev = NIL;
      blk->prev = HDR_OFFSET(lastUsed);
    }
  }
  else
  { // the pool is empty
    pool->prev = 1;
  }

  internal_assert(!_yalloc_defrag_in_progress(pool));
  _yalloc_validate(pool);
  _protect_pool(pool);
}

View File

@ -1,176 +0,0 @@
/**
@file
API of the yalloc allocator.
*/
#ifndef YALLOC_H
#define YALLOC_H
#include <stddef.h>
/**
Maximum supported pool size. yalloc_init() will fail for larger pools.
*/
#define MAX_POOL_SIZE ((2 << 24) - 4)
/**
Creates a pool inside a given buffer.
Pools must be deinitialized with yalloc_deinit() when they are no longer needed.
@param pool The starting address of the pool. It must have at least 16bit
alignment (internal structure uses 16bit integers). Allocations are placed at
32bit boundaries starting from this address, so if the user data should be
32bit aligned then this address has to be 32bit aligned. Typically an address
of static memory, or an array on the stack is used if the pool is only used
temporarily.
@param size Size of the pool.
@return 0 on success, nonzero if the size is not supported.
*/
int yalloc_init(void * pool, size_t size);
/**
Deinitializes the buffer that is used by the pool and makes it available for other use.
The content of the buffer is undefined after this.
@param pool The starting address of an initialized pool.
*/
void yalloc_deinit(void * pool);
/**
Allocates a block of memory from a pool.
This function mimics malloc().
The pool must not be in the "defragmenting" state when this function is called.
@param pool The starting address of an initialized pool.
@param size Number of bytes to allocate.
@return Allocated buffer or \c NULL if there was no free range that could serve
the allocation. See @ref yalloc_defrag_start() for a way to remove
fragmentation which may cause allocations to fail even when there is enough
space in total.
*/
void * yalloc_alloc(void * pool, size_t size);
/**
Returns an allocation to a pool.
This function mimics free().
The pool must not be in the "defragmenting" state when this function is called.
@param pool The starting address of the initialized pool the allocation comes from.
@param p An address that was returned from yalloc_alloc() of the same pool.
*/
void yalloc_free(void * pool, void * p);
/**
Returns the maximum size of a successful allocation (assuming a completely unfragmented heap).
After defragmentation the first allocation with the returned size is guaranteed to succeed.
@param pool The starting address of an initialized pool.
@return Number of bytes that can be allocated (assuming the pool is defragmented).
*/
size_t yalloc_count_free(void * pool);
/**
Returns the maximum continuous free area.
@param pool The starting address of an initialized pool.
@return Number of free bytes that exist continuously.
*/
size_t yalloc_count_continuous(void * pool_);
/**
Queries the usable size of an allocated block.
@param pool The starting address of the initialized pool the allocation comes from.
@param p An address that was returned from yalloc_alloc() of the same pool.
@return Size of the memory block. This is the size passed to @ref yalloc_alloc() rounded up to 4.
*/
size_t yalloc_block_size(void * pool, void * p);
/**
Finds the first (in address order) allocation of a pool.
@param pool The starting address of an initialized pool.
@return Address of the allocation the lowest address inside the pool (this is
what @ref yalloc_alloc() returned), or \c NULL if there is no used block.
*/
void * yalloc_first_used(void * pool);
/**
Given a pointer to an allocation finds the next (in address order) used block of a pool.
@param pool The starting address of the initialized pool the allocation comes from.
@param p Pointer to an allocation in that pool, typically comes from a previous
call to @ref yalloc_first_used()
*/
void * yalloc_next_used(void * pool, void * p);
/**
Starts defragmentation for a pool.
Allocations will stay where they are. But the pool is put in the "defragmenting"
state (see @ref yalloc_defrag_in_progress()).
The pool must not be in the "defragmenting" state when this function is called.
The pool is put into the "defragmenting" state by this function.
@param pool The starting address of an initialized pool.
*/
void yalloc_defrag_start(void * pool);
/**
Returns the address that an allocation will have after @ref yalloc_defrag_commit() is called.
The pool must be in the "defragmenting" state when this function is called.
@param pool The starting address of the initialized pool the allocation comes from.
@param p Pointer to an allocation in that pool.
@return The address the allocation will have after @ref yalloc_defrag_commit() is called.
*/
void * yalloc_defrag_address(void * pool, void * p);
/**
Finishes the defragmentation.
The content of all allocations in the pool will be moved to the address that
was reported by @ref yalloc_defrag_address(). The pool will then have only one
free block. This means that an <tt>yalloc_alloc(pool, yalloc_count_free(pool))</tt>
will succeed.
The pool must be in the "defragmenting" state when this function is called. The
pool is put back to normal state by this function.
@param pool The starting address of an initialized pool.
*/
void yalloc_defrag_commit(void * pool);
/**
Tells if the pool is in the "defragmenting" state (after a @ref yalloc_defrag_start() and before a @ref yalloc_defrag_commit()).
@param pool The starting address of an initialized pool.
@return Nonzero if the pool is currently in the "defragmenting" state.
*/
int yalloc_defrag_in_progress(void * pool);
/**
Helper function that dumps the state of the pool to stdout.
This function is only available if built with <tt>yalloc_dump.c</tt>. This
function only exists for debugging purposes and can be ignored by normal users
that are not interested in the internal structure of the implementation.
@param pool The starting address of an initialized pool.
@param name A string that is used as "Title" for the output.
*/
void yalloc_dump(void * pool, char * name);
#endif // YALLOC_H

View File

@ -1,39 +0,0 @@
#include "yalloc_internals.h"
#include <stdio.h>
/* Print one prev/next offset field of a Header in human-readable form.
 *
 * pool:   base address of the pool (offsets are reported relative to it)
 * name:   label to print in front of the value
 * offset: raw prev/next field taken from a Header
 *
 * Offsets are 32-bit (Header.prev/next are uint32_t and NIL is 0xFFFFFFFEu),
 * so the parameter must be uint32_t: taking it as uint16_t truncated the
 * value, meaning isNil() — which compares against 0xFFFFFFFF — could never
 * match, and HDR_PTR() computed the wrong address for offsets >= 0x10000. */
static void printOffset(void * pool, char * name, uint32_t offset)
{
    if (isNil(offset))
        printf(" %s: nil\n", name);
    else
        printf(" %s: %td\n", name, (char*)HDR_PTR(offset) - (char*)pool);
}
/* Walk the pool's block list and dump every header to stdout.
 * Debugging aid only; `name` is printed as a title for the dump. */
void yalloc_dump(void * pool, char * name)
{
    printf("---- %s ----\n", name);

    Header * blk = (Header*)pool;
    int last_block = 0;
    while (!last_block)
    {
        printf(isFree(blk) ? "%td: free @%p\n" : "%td: used @%p\n", (char*)blk - (char*)pool, blk);

        /* The very first header reuses its prev slot as the "first free" link. */
        printOffset(pool, blk == pool ? "first free" : "prev", blk->prev);
        printOffset(pool, "next", blk->next);

        if (isFree(blk))
        {
            /* Free blocks carry an extra header's worth of free-list links. */
            printOffset(pool, "prevFree", blk[1].prev);
            printOffset(pool, "nextFree", blk[1].next);
        }
        else
            printf(" payload includes padding: %i\n", isPadded(blk));

        if (isNil(blk->next))
            last_block = 1;
        else
        {
            printf(" %td bytes payload\n", (char*)HDR_PTR(blk->next) - (char*)blk - sizeof(Header));
            blk = HDR_PTR(blk->next);
        }
    }

    fflush(stdout);
}

View File

@ -1,63 +0,0 @@
#ifndef YALLOC_INTERNALS_H
#define YALLOC_INTERNALS_H
#include <stdint.h>
/* Block header that precedes every allocation in the pool.
 * prev/next are encoded offsets (see HDR_OFFSET below), not pointers. */
typedef struct
{
    uint32_t prev; // offset of previous block; low bit set if free
    uint32_t next; // offset of next block; for used blocks: low bit set if unused header at the end
    /* We need user data to be 32-byte aligned, so the header needs
     * to be 32 bytes in size (as user data follows the header) */
    uint8_t padding[32 - (sizeof(uint32_t) * 2)];
} Header;

// NOTE: Data is 32-bit aligned and prev/next store (byte offset >> 1), with the
// lowest bit reserved as a flag. HDR_PTR() masks off the flag and shifts left by
// 1 to recover the byte offset, so the 31 significant bits of an offset can
// address the whole pool. NIL (all ones except the flag bit) means "no block".
#define NIL 0xFFFFFFFEu

// return Header-address for a prev/next
#define HDR_PTR(offset) ((Header*)((char*)pool + (((offset) & NIL)<<1)))

// return a prev/next for a Header-address
#define HDR_OFFSET(blockPtr) ((uint32_t)(((char*)blockPtr - (char*)pool) >> 1))

/* Internal validation defaults to on for debug builds, off for NDEBUG builds. */
#ifndef YALLOC_INTERNAL_VALIDATE
# ifdef NDEBUG
# define YALLOC_INTERNAL_VALIDATE 0
# else
# define YALLOC_INTERNAL_VALIDATE 1
#endif
#endif

/*
internal_assert() is used in some places to check internal expectations.
Activate this if you modify the code to detect problems as early as possible.
In other cases this should be deactivated.
*/
#if 0
#define internal_assert assert
#else
#define internal_assert(condition)((void) 0)
#endif
/* Detects offsets that point nowhere: an offset is NIL regardless of the
 * state of its low flag bit, so the flag is forced on before comparing. */
static inline int isNil(uint32_t offset)
{
    const uint32_t with_flag_set = offset | 1;
    return with_flag_set == 0xFFFFFFFF;
}
/* A block is free when the low (flag) bit of its prev offset is set. */
static inline int isFree(Header * hdr)
{
    return hdr->prev & 1;
}
/* For used blocks: the low (flag) bit of next marks an unused header
 * at the end of the payload (padding). */
static inline int isPadded(Header * hdr)
{
    return hdr->next & 1;
}
#endif // YALLOC_INTERNALS_H

View File

@ -32,7 +32,7 @@ GLdc uses CMake for its build system, it currently ships with two "backends":
- kospvr - This is the hardware-accelerated Dreamcast backend
- software - This is a stub software rasterizer used for testing and debugging
To compile for Dreamcast, you'll want to do something like the following:
To compile a Dreamcast debug build, you'll want to do something like the following:
```
mkdir dcbuild
@ -41,6 +41,11 @@ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" .
make
```
For a release build, replace the cmake line with the following:
```
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release ..
```
You will need KallistiOS compiled and configured (e.g. the KOS_BASE environment
variable must be set)

View File

@ -12,36 +12,45 @@
#include "aligned_vector.h"
extern inline void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
extern inline void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count);
extern inline void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
extern inline void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count);
extern inline void* aligned_vector_resize(AlignedVector* vector, const uint32_t element_count);
extern inline void* aligned_vector_extend(AlignedVector* vector, const uint32_t additional_count);
extern inline void* aligned_vector_reserve(AlignedVector* vector, uint32_t element_count);
extern inline void* aligned_vector_push_back(AlignedVector* vector, const void* objs, uint32_t count);
void aligned_vector_init(AlignedVector* vector, unsigned int element_size) {
vector->size = vector->capacity = 0;
vector->element_size = element_size;
void aligned_vector_init(AlignedVector* vector, uint32_t element_size) {
/* Now initialize the header*/
AlignedVectorHeader* const hdr = &vector->hdr;
hdr->size = 0;
hdr->capacity = ALIGNED_VECTOR_CHUNK_SIZE;
hdr->element_size = element_size;
vector->data = NULL;
/* Reserve some initial capacity */
aligned_vector_reserve(vector, ALIGNED_VECTOR_CHUNK_SIZE);
/* Reserve some initial capacity. This will do the allocation but not set up the header */
void* ptr = aligned_vector_reserve(vector, ALIGNED_VECTOR_CHUNK_SIZE);
assert(ptr);
(void) ptr;
}
void aligned_vector_shrink_to_fit(AlignedVector* vector) {
if(vector->size == 0) {
AlignedVectorHeader* const hdr = &vector->hdr;
if(hdr->size == 0) {
uint32_t element_size = hdr->element_size;
free(vector->data);
/* Reallocate the header */
vector->data = NULL;
vector->capacity = 0;
hdr->size = hdr->capacity = 0;
hdr->element_size = element_size;
} else {
unsigned int new_byte_size = vector->size * vector->element_size;
unsigned char* original_data = vector->data;
uint32_t new_byte_size = (hdr->size * hdr->element_size);
uint8_t* original_data = vector->data;
vector->data = (unsigned char*) memalign(0x20, new_byte_size);
if(original_data) {
FASTCPY(vector->data, original_data, new_byte_size);
free(original_data);
}
vector->capacity = vector->size;
hdr->capacity = hdr->size;
}
}

View File

@ -4,6 +4,7 @@
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
@ -12,6 +13,7 @@ extern "C" {
#if defined(__APPLE__) || defined(__WIN32__)
/* Linux + Kos define this, OSX does not, so just use malloc there */
static inline void* memalign(size_t alignment, size_t size) {
(void) alignment;
return malloc(size);
}
#else
@ -65,10 +67,14 @@ AV_FORCE_INLINE void *AV_MEMCPY4(void *dest, const void *src, size_t len)
#endif
typedef struct {
uint8_t* __attribute__((aligned(32))) data;
uint32_t size;
uint32_t capacity;
uint32_t element_size;
} __attribute__((aligned(32))) AlignedVectorHeader;
typedef struct {
AlignedVectorHeader hdr;
uint8_t* data;
} AlignedVector;
#define ALIGNED_VECTOR_CHUNK_SIZE 256u
@ -78,90 +84,137 @@ typedef struct {
((((v) + ALIGNED_VECTOR_CHUNK_SIZE - 1) / ALIGNED_VECTOR_CHUNK_SIZE) * ALIGNED_VECTOR_CHUNK_SIZE)
void aligned_vector_init(AlignedVector* vector, unsigned int element_size);
void aligned_vector_init(AlignedVector* vector, uint32_t element_size);
AV_FORCE_INLINE void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) {
if(element_count <= vector->capacity) {
return NULL;
AV_FORCE_INLINE void* aligned_vector_at(const AlignedVector* vector, const uint32_t index) {
const AlignedVectorHeader* hdr = &vector->hdr;
assert(index < hdr->size);
return vector->data + (index * hdr->element_size);
}
AV_FORCE_INLINE void* aligned_vector_reserve(AlignedVector* vector, uint32_t element_count) {
AlignedVectorHeader* hdr = &vector->hdr;
if(element_count < hdr->capacity) {
return aligned_vector_at(vector, element_count);
}
unsigned int original_byte_size = vector->size * vector->element_size;
uint32_t original_byte_size = (hdr->size * hdr->element_size);
/* We overallocate so that we don't make small allocations during push backs */
element_count = ROUND_TO_CHUNK_SIZE(element_count);
unsigned int new_byte_size = element_count * vector->element_size;
unsigned char* original_data = vector->data;
uint32_t new_byte_size = (element_count * hdr->element_size);
uint8_t* original_data = vector->data;
vector->data = (unsigned char*) memalign(0x20, new_byte_size);
vector->data = (uint8_t*) memalign(0x20, new_byte_size);
assert(vector->data);
if(original_data) {
AV_MEMCPY4(vector->data, original_data, original_byte_size);
free(original_data);
}
vector->capacity = element_count;
AV_MEMCPY4(vector->data, original_data, original_byte_size);
free(original_data);
hdr->capacity = element_count;
return vector->data + original_byte_size;
}
AV_FORCE_INLINE void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
assert(index < vector->size);
return &vector->data[index * vector->element_size];
AV_FORCE_INLINE AlignedVectorHeader* aligned_vector_header(const AlignedVector* vector) {
return (AlignedVectorHeader*) &vector->hdr;
}
AV_FORCE_INLINE void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count) {
AV_FORCE_INLINE uint32_t aligned_vector_size(const AlignedVector* vector) {
const AlignedVectorHeader* hdr = &vector->hdr;
return hdr->size;
}
AV_FORCE_INLINE uint32_t aligned_vector_capacity(const AlignedVector* vector) {
const AlignedVectorHeader* hdr = &vector->hdr;
return hdr->capacity;
}
AV_FORCE_INLINE void* aligned_vector_front(const AlignedVector* vector) {
return vector->data;
}
#define av_assert(x) \
do {\
if(!(x)) {\
fprintf(stderr, "Assertion failed at %s:%d\n", __FILE__, __LINE__);\
exit(1);\
}\
} while(0); \
/* Resizes the array and returns a pointer to the first new element (if upsizing) or NULL (if downsizing) */
AV_FORCE_INLINE void* aligned_vector_resize(AlignedVector* vector, const uint32_t element_count) {
void* ret = NULL;
unsigned int previousCount = vector->size;
if(vector->capacity < element_count) {
AlignedVectorHeader* hdr = &vector->hdr;
uint32_t previous_count = hdr->size;
if(hdr->capacity <= element_count) {
/* If we didn't have capacity, increase capacity (slow) */
vector->size = element_count;
ret = aligned_vector_reserve(vector, element_count);
} else if(previousCount < element_count) {
aligned_vector_reserve(vector, element_count);
hdr->size = element_count;
ret = aligned_vector_at(vector, previous_count);
av_assert(hdr->size == element_count);
av_assert(hdr->size <= hdr->capacity);
} else if(previous_count < element_count) {
/* So we grew, but had the capacity, just get a pointer to
* where we were */
vector->size = element_count;
ret = aligned_vector_at(vector, previousCount);
} else {
vector->size = element_count;
hdr->size = element_count;
av_assert(hdr->size < hdr->capacity);
ret = aligned_vector_at(vector, previous_count);
} else if(hdr->size != element_count) {
hdr->size = element_count;
av_assert(hdr->size < hdr->capacity);
}
return ret;
}
AV_FORCE_INLINE void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count) {
AV_FORCE_INLINE void* aligned_vector_push_back(AlignedVector* vector, const void* objs, uint32_t count) {
/* Resize enough room */
AlignedVectorHeader* hdr = &vector->hdr;
assert(count);
assert(vector->element_size);
assert(hdr->element_size);
unsigned int initial_size = vector->size;
aligned_vector_resize(vector, vector->size + count);
#ifndef NDEBUG
uint32_t element_size = hdr->element_size;
uint32_t initial_size = hdr->size;
#endif
assert(vector->size == initial_size + count);
unsigned char* dest = vector->data + (vector->element_size * initial_size);
uint8_t* dest = (uint8_t*) aligned_vector_resize(vector, hdr->size + count);
assert(dest);
/* Copy the objects in */
AV_MEMCPY4(dest, objs, vector->element_size * count);
AV_MEMCPY4(dest, objs, hdr->element_size * count);
assert(hdr->element_size == element_size);
assert(hdr->size == initial_size + count);
return dest;
}
AV_FORCE_INLINE void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) {
return aligned_vector_resize(vector, vector->size + additional_count);
AV_FORCE_INLINE void* aligned_vector_extend(AlignedVector* vector, const uint32_t additional_count) {
AlignedVectorHeader* hdr = &vector->hdr;
void* ret = aligned_vector_resize(vector, hdr->size + additional_count);
assert(ret); // Should always return something
return ret;
}
AV_FORCE_INLINE void aligned_vector_clear(AlignedVector* vector){
vector->size = 0;
AlignedVectorHeader* hdr = &vector->hdr;
hdr->size = 0;
}
void aligned_vector_shrink_to_fit(AlignedVector* vector);
void aligned_vector_cleanup(AlignedVector* vector);
static inline void* aligned_vector_back(AlignedVector* vector){
return aligned_vector_at(vector, vector->size - 1);
AV_FORCE_INLINE void* aligned_vector_back(AlignedVector* vector){
AlignedVectorHeader* hdr = &vector->hdr;
return aligned_vector_at(vector, hdr->size ? hdr->size - 1 : 0);
}
#ifdef __cplusplus

View File

@ -68,7 +68,6 @@ void* named_array_reserve(NamedArray* array, unsigned int id) {
void named_array_release(NamedArray* array, unsigned int new_id) {
unsigned int i = new_id / 8;
unsigned int j = new_id % 8;
array->used_markers[i] &= (unsigned char) ~(1 << j);
}

View File

@ -19,6 +19,10 @@ __BEGIN_DECLS
#include <math.h>
#if __STDCPP_FLOAT16_T__
#include <stdfloat>
#endif
/* Primitive Types taken from GL for compatability */
/* Not all types are implemented in Open GL DC V.1.0 */
#define GL_POINTS 0x0000
@ -305,12 +309,13 @@ __BEGIN_DECLS
#define GL_UNSIGNED_INT 0x1405
#define GL_FLOAT 0x1406
#define GL_DOUBLE 0x140A
#define GL_HALF_FLOAT 0x140B
#define GL_2_BYTES 0x1407
#define GL_3_BYTES 0x1408
#define GL_4_BYTES 0x1409
/* ErrorCode */
#define GL_NO_ERROR 0
#define GL_NO_ERROR ((GLenum) 0)
#define GL_INVALID_ENUM 0x0500
#define GL_INVALID_VALUE 0x0501
#define GL_INVALID_OPERATION 0x0502
@ -359,7 +364,7 @@ __BEGIN_DECLS
#define GL_UNSIGNED_SHORT_5_6_5_REV 0x8364
#define GL_UNSIGNED_SHORT_4_4_4_4_REV 0x8365
#define GL_UNSIGNED_SHORT_1_5_5_5_REV 0x8366
#define GL_UNSIGNED_INT_8_8_8_8_REV 0x8367
#define GL_UNSIGNED_INT_2_10_10_10_REV 0x8368
#define GL_COLOR_INDEX 0x1900
@ -371,6 +376,32 @@ __BEGIN_DECLS
#define GL_RGBA 0x1908
#define GL_LUMINANCE 0x1909
#define GL_LUMINANCE_ALPHA 0x190A
#define GL_R3_G3_B2 0x2A10
#define GL_ALPHA4 0x803B
#define GL_ALPHA8 0x803C
#define GL_ALPHA12 0x803D
#define GL_ALPHA16 0x803E
#define GL_LUMINANCE4 0x803F
#define GL_LUMINANCE8 0x8040
#define GL_LUMINANCE12 0x8041
#define GL_LUMINANCE16 0x8042
#define GL_LUMINANCE4_ALPHA4 0x8043
#define GL_LUMINANCE6_ALPHA2 0x8044
#define GL_LUMINANCE8_ALPHA8 0x8045
#define GL_LUMINANCE12_ALPHA4 0x8046
#define GL_LUMINANCE12_ALPHA12 0x8047
#define GL_LUMINANCE16_ALPHA16 0x8048
#define GL_INTENSITY4 0x804A
#define GL_INTENSITY8 0x804B
#define GL_INTENSITY12 0x804C
#define GL_INTENSITY16 0x804D
#define GL_BGR 0x80E0
#define GL_BGRA 0x80E1
#define GL_INTENSITY 0x8049
#define GL_RGB4 0x804F
@ -387,6 +418,14 @@ __BEGIN_DECLS
#define GL_RGBA12 0x805A
#define GL_RGBA16 0x805B
#define GL_R8 0x8229
#define GL_RG8 0x822B
#define GL_RG 0x8227
#define GL_R16 0x822A
#define GL_RG16 0x822C
#define GL_COMPRESSED_RED 0x8225
#define GL_COMPRESSED_RG 0x8226
/* Polygons */
#define GL_POINT 0x1B00
#define GL_LINE 0x1B01
@ -427,6 +466,12 @@ __BEGIN_DECLS
#define GL_FALSE 0
#define GL_TRUE 1
#if __STDCPP_FLOAT16_T__
#define GLhalf std::float16_t
#else
#define GLhalf unsigned short
#endif
/* Stubs for portability */
#define GL_LINE_SMOOTH 0x0B20
#define GL_ALPHA_TEST 0x0BC0

View File

@ -130,7 +130,7 @@ GLAPI void APIENTRY glGenFramebuffersEXT(GLsizei n, GLuint* framebuffers);
GLAPI void APIENTRY glDeleteFramebuffersEXT(GLsizei n, const GLuint* framebuffers);
GLAPI void APIENTRY glBindFramebufferEXT(GLenum target, GLuint framebuffer);
GLAPI void APIENTRY glFramebufferTexture2DEXT(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
GLAPI void APIENTRY glGenerateMipmapEXT(GLenum target);
GLAPI void APIENTRY glGenerateMipmap(GLenum target);
GLAPI GLenum APIENTRY glCheckFramebufferStatusEXT(GLenum target);
GLAPI GLboolean APIENTRY glIsFramebufferEXT(GLuint framebuffer);
@ -203,7 +203,7 @@ GLAPI void APIENTRY glCompressedTexImage2DARB(GLenum target,
#define glClientActiveTexture glClientActiveTextureARB
#define glMultiTexCoord2f glMultiTexCoord2fARB
#define glGenerateMipmap glGenerateMipmapEXT
#define glGenerateMipmapEXT glGenerateMipmap
#define glCompressedTexImage2D glCompressedTexImage2DARB
#ifndef GL_VERSION_1_4

View File

@ -35,8 +35,6 @@ extern const char* GLDC_VERSION;
#define GL_NEARZ_CLIPPING_KOS 0xEEFA
#define GL_UNSIGNED_BYTE_TWID_KOS 0xEEFB
/* Initialize the GL pipeline. GL will initialize the PVR. */
GLAPI void APIENTRY glKosInit();
@ -57,6 +55,13 @@ typedef struct {
GLuint initial_pt_capacity;
GLuint initial_immediate_capacity;
/* Default: True
*
* Whether glTexImage should automatically twiddle textures
* if the internal format is a generic format (e.g. GL_RGB).
* this is the same as calling glEnable(GL_TEXTURE_TWIDDLE_KOS)
* on boot */
GLboolean texture_twiddle;
} GLdcConfig;
@ -87,7 +92,7 @@ GLAPI void APIENTRY glKosInitConfig(GLdcConfig* config);
*/
GLAPI void APIENTRY glKosInitEx(GLdcConfig* config);
GLAPI void APIENTRY glKosSwapBuffers();
GLAPI void APIENTRY glKosShutdown();
/*
* CUSTOM EXTENSION multiple_shared_palette_KOS
@ -186,12 +191,28 @@ GLAPI void APIENTRY glKosSwapBuffers();
/* Memory allocation extension (GL_KOS_texture_memory_management) */
GLAPI GLvoid APIENTRY glDefragmentTextureMemory_KOS(void);
/* glGet extensions */
#define GL_FREE_TEXTURE_MEMORY_KOS 0xEF3D
#define GL_USED_TEXTURE_MEMORY_KOS 0xEF3E
#define GL_FREE_CONTIGUOUS_TEXTURE_MEMORY_KOS 0xEF3F
//for palette internal format (glfcConfig)
#define GL_RGB565_KOS 0xEF40
#define GL_ARGB4444_KOS 0xEF41
#define GL_ARGB1555_KOS 0xEF42
#define GL_RGB565_TWID_KOS 0xEF43
#define GL_ARGB4444_TWID_KOS 0xEF44
#define GL_ARGB1555_TWID_KOS 0xEF45
#define GL_COLOR_INDEX8_TWID_KOS 0xEF46
#define GL_COLOR_INDEX4_TWID_KOS 0xEF47
#define GL_RGB_TWID_KOS 0xEF48
#define GL_RGBA_TWID_KOS 0xEF49
/* glGet extensions */
#define GL_TEXTURE_INTERNAL_FORMAT_KOS 0xEF50
/* If enabled, will twiddle texture uploads where possible */
#define GL_TEXTURE_TWIDDLE_KOS 0xEF51
__END_DECLS

446
samples/cubes/main.cpp Normal file
View File

@ -0,0 +1,446 @@
#include <cstdio>
#include <stdbool.h>
#include <stdlib.h>
#include <time.h>
#ifdef __DREAMCAST__
#include <kos.h>
float avgfps = -1;
#endif
#include "GL/gl.h"
#include "GL/glkos.h"
#include "GL/glu.h"
#include "GL/glext.h"
#define PI 3.14159265358979323846264338327950288f
#define RAD_TO_DEG 57.295779513082320876798154814105f
#define MAX_CUBES 350
float timeElapsed = 0.0f;
const float dt = 1.0f / 60.0f;
float angle = 0;
const float invAngle360 = 1.0f / 360.0f;
const float cameraDistance = 3.0f;
bool isDrawingArrays = false;
bool isBlendingEnabled = true;
bool isRunning = true;
typedef struct
{
GLubyte r;
GLubyte g;
GLubyte b;
GLubyte a;
} Color;
Color colors[] =
{
{255, 0, 0, 128},
{0, 255, 0, 128},
{0, 0, 255, 128},
{255, 255, 0, 128},
{255, 0, 255, 128},
{0, 255, 255, 128}
};
Color faceColors[24];
float cubeVertices[] =
{
// Front face
-1.0f, -1.0f, +1.0f, // vertex 0
+1.0f, -1.0f, +1.0f, // vertex 1
+1.0f, +1.0f, +1.0f, // vertex 2
-1.0f, +1.0f, +1.0f, // vertex 3
// Back face
-1.0f, -1.0f, -1.0f, // vertex 4
+1.0f, -1.0f, -1.0f, // vertex 5
+1.0f, +1.0f, -1.0f, // vertex 6
-1.0f, +1.0f, -1.0f, // vertex 7
// Top face
-1.0f, +1.0f, +1.0f, // vertex 8
+1.0f, +1.0f, +1.0f, // vertex 9
+1.0f, +1.0f, -1.0f, // vertex 10
-1.0f, +1.0f, -1.0f, // vertex 11
// Bottom face
-1.0f, -1.0f, +1.0f, // vertex 12
+1.0f, -1.0f, +1.0f, // vertex 13
+1.0f, -1.0f, -1.0f, // vertex 14
-1.0f, -1.0f, -1.0f, // vertex 15
// Right face
+1.0f, -1.0f, +1.0f, // vertex 16
+1.0f, -1.0f, -1.0f, // vertex 17
+1.0f, +1.0f, -1.0f, // vertex 18
+1.0f, +1.0f, +1.0f, // vertex 19
// Left face
-1.0f, -1.0f, +1.0f, // vertex 20
-1.0f, -1.0f, -1.0f, // vertex 21
-1.0f, +1.0f, -1.0f, // vertex 22
-1.0f, +1.0f, +1.0f // vertex 23
};
// Set up indices array
unsigned int cubeIndices[] =
{
// Front face
0, 1, 2, 3,
// Back face
4, 5, 6, 7,
// Top face
8, 9, 10, 11,
// Bottom face
12, 13, 14, 15,
// Right face
16, 17, 18, 19,
// Left face
20, 21, 22, 23
};
typedef struct
{
float r;
float x, y, z;
float vx, vy, vz;
} Cube;
Cube cubes[MAX_CUBES];
int numCubes = 0;
// Create a 4x4 identity matrix
float cubeTransformationMatrix[16] = { 1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f,
0.0f, 0.0f, 0.0f, 1.0f };
/* Print a message to the platform log: the KOS kernel debug log on
 * Dreamcast, plain stdout everywhere else. */
void debugLog(const char* msg) {
#ifdef __DREAMCAST__
    dbglog(DBG_KDEBUG, "%s\n", msg);
#else
    printf("%s\n", msg);
#endif
}
/* Sample the current PVR frame rate and fold it into the running average
 * `avgfps` (simple smoothing: average of previous value and new sample;
 * -1 marks "no sample yet"). No-op off Dreamcast. */
void runningStats() {
#ifdef __DREAMCAST__
    pvr_stats_t stats;
    pvr_get_stats(&stats);

    if (avgfps != -1)
        avgfps = (avgfps + stats.frame_rate) * 0.5f;
    else
        avgfps = stats.frame_rate;
#endif
}
/* Log the running average frame rate accumulated by runningStats().
 * No-op off Dreamcast. */
void avgStats() {
#ifdef __DREAMCAST__
    dbglog(DBG_DEBUG, "Average frame rate: ~%f fps\n", avgfps);
#endif
}
/* Log instantaneous PVR statistics (vblank count, current frame rate)
 * followed by the running average. No-op off Dreamcast. */
void stats() {
#ifdef __DREAMCAST__
    pvr_stats_t stats;
    pvr_get_stats(&stats);
    dbglog(DBG_DEBUG, "3D Stats: %d VBLs, current frame rate ~%f fps\n", stats.vbl_count, stats.frame_rate);
    avgStats();
#endif
}
void addCube(float r, float x, float y, float z, float vx, float vy, float vz)
{
if (numCubes < MAX_CUBES) {
cubes[numCubes].r = r;
cubes[numCubes].x = x;
cubes[numCubes].y = y;
cubes[numCubes].z = z;
cubes[numCubes].vx = vx;
cubes[numCubes].vy = vy;
cubes[numCubes].vz = vz;
numCubes++;
}
}
/* Convenience wrapper: add a stationary cube whose radius is half the
 * given scale factor. */
void addCubeQuick(float x, float y, float z, float scale_factor)
{
    const float radius = 0.5f * scale_factor;
    addCube(radius, x, y, z, 0, 0, 0);
}
void updateCubes(float dt)
{
for (size_t i = 0; i < numCubes; i++)
{
Cube* cube = &cubes[i];
cube->x += cube->vx * dt;
cube->y += cube->vy * dt;
cube->z += cube->vz * dt;
if (cube->x < -3 || cube->x > +3) { cube->vx *= -1; }
if (cube->y < -3 || cube->y > +3) { cube->vy *= -1; }
if (cube->z < -3 || cube->z > +3) { cube->vz *= -1; }
}
}
/* Draw one unit cube from the global vertex/color arrays. Draws with
 * glDrawArrays or indexed glDrawElements depending on the global
 * isDrawingArrays toggle. Client states are enabled only for the
 * duration of the draw. */
void renderUnitCube()
{
    glEnableClientState(GL_VERTEX_ARRAY);
    glEnableClientState(GL_COLOR_ARRAY);

    glVertexPointer(3, GL_FLOAT, 0, cubeVertices);
    glColorPointer(4, GL_UNSIGNED_BYTE, 0, faceColors);

    if (isDrawingArrays) {
        glDrawArrays(GL_QUADS, 0, 24);
    }
    else {
        glDrawElements(GL_QUADS, 24, GL_UNSIGNED_INT, cubeIndices);
    }

    glDisableClientState(GL_COLOR_ARRAY);
    glDisableClientState(GL_VERTEX_ARRAY);
}
/* Draw every cube in the pool, rotated by `angle` around (1,1,1).
 * Each cube's scale grows with its index (0.05 .. 0.40) so the scene
 * contains a spread of sizes. Matrix state is saved/restored per cube. */
void renderCubes(float angle)
{
    for (size_t i = 0; i < numCubes; i++) {
        const float scale_factor = 0.05f + (i / (float)numCubes) * 0.35f;
        Cube* cube = &cubes[i];
        glPushMatrix(); // Save previous camera state
        glMatrixMode(GL_MODELVIEW);
        glTranslatef(cube->x, cube->y, cube->z);
        glRotatef(angle, 1, 1, 1); // Rotate camera / object
        glScalef(scale_factor, scale_factor, scale_factor); // Apply scale factor
        renderUnitCube();
        glPopMatrix(); // Restore previous camera state
    }
}
/* Uniform pseudo-random float in [Min, Max], driven by rand(). */
float rnd(float Min, float Max)
{
    // Same operation order as (Max - Min) * rand / RAND_MAX + Min so the
    // float rounding is unchanged.
    const float scaled = (Max - Min) * (float)rand();
    return scaled / (float)RAND_MAX + Min;
}
/* One-time setup: initialize GLdc, configure depth testing and blending,
 * set the viewport/projection, and build the per-face color array. */
void initialize()
{
    debugLog("Initialize video output");
    glKosInit();

    glClearDepth(1.0);
    glDepthFunc(GL_LEQUAL);
    glDepthMask(GL_TRUE);
    glEnable(GL_DEPTH_TEST);
    glShadeModel(GL_SMOOTH);

    // Honour the initial blending toggle (can be flipped at runtime).
    if (isBlendingEnabled)
    {
        glEnable(GL_BLEND);
    }
    else
    {
        glDisable(GL_BLEND);
    }

    glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
    glDisable(GL_CULL_FACE);

    glViewport(0, 0, 640, 480);
    glClearColor(0.0f, 0.0f, 0.3f, 1.0f);

    glMatrixMode(GL_PROJECTION);
    glLoadIdentity();

    // Set up colors (each face has a different color): the four vertices
    // of face i all share colors[i].
    for (int i = 0; i < 6; i++)
    {
        faceColors[i * 4] = colors[i];
        faceColors[i * 4 + 1] = colors[i];
        faceColors[i * 4 + 2] = colors[i];
        faceColors[i * 4 + 3] = colors[i];
    }
}
/* Accumulate simulated time; every time more than 10 seconds have
 * elapsed, dump the frame statistics and restart the window. */
void updateTimer()
{
    timeElapsed += dt;

    if (timeElapsed <= 10.0f)
        return;

    stats();
    timeElapsed = 0.0f;
}
/* Per-frame update: advance the rotation angle, clear the buffers, set up
 * the camera, then step and draw all cubes. */
void updateLogic()
{
    updateTimer();

    // Wrap `angle` back into one revolution, then advance it (50 deg/s).
    const int fullRot = (int)(angle * invAngle360);
    angle -= fullRot * 360.0f;
    angle += 50.0f * dt;

    // Oscillating zoom offset driven by elapsed time.
    const float zoomVal = __builtin_sinf(timeElapsed) * 5.0f;

    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glMatrixMode(GL_MODELVIEW);
    glLoadIdentity();

    // Set up the camera position and orientation
    float cameraPos[] = { 0.0f, 0.0f, cameraDistance };
    float cameraTarget[] = { 0.0f, 0.0f, 0.0f };
    float cameraUp[] = { 0.0f, 1.0f, 0.0f };

    // Move the camera
    gluLookAt(cameraPos[0], cameraPos[1], cameraPos[2],
              cameraTarget[0], cameraTarget[1], cameraTarget[2],
              cameraUp[0], cameraUp[1], cameraUp[2]);

    glTranslatef(0.0f, 0.0f, -cameraDistance + zoomVal);

    // Apply cube transformation (identity matrix)
    // NOTE(review): this glLoadIdentity() discards the gluLookAt/glTranslatef
    // camera setup above — confirm whether that is intentional.
    glLoadIdentity();

    updateCubes(dt);
    renderCubes(angle);

    // Reset ModelView matrix to remove camera transformation
    float matrix[16];
    glGetFloatv(GL_MODELVIEW_MATRIX, matrix);
    matrix[12] = 0.0f;
    matrix[13] = 0.0f;
    matrix[14] = 0.0f;
    glMatrixMode(GL_MODELVIEW);
    glLoadMatrixf(matrix);
}
/* Poll the first Dreamcast controller and handle edge-triggered presses:
 *   START - stop the main loop
 *   A     - toggle glDrawArrays vs glDrawElements (clear color shows mode)
 *   B     - toggle alpha blending
 * No-op on non-Dreamcast builds. */
void updateInput()
{
#ifdef __DREAMCAST__
    /* NOTE(review): prevButtons is 8-bit; assumes the tracked CONT_* bits
     * fit in the low byte of state->buttons — confirm against kos headers. */
    static uint8_t prevButtons = 0;
    maple_device_t* cont;
    cont_state_t* state;

    cont = maple_enum_type(0, MAPLE_FUNC_CONTROLLER);

    if (cont)
    {
        state = (cont_state_t*)maple_dev_status(cont);

        /* maple_dev_status() can return NULL; the old code still read
         * state->buttons unconditionally at the end, dereferencing NULL.
         * Bail out early instead and keep the previous button state. */
        if (!state)
            return;

        if ((state->buttons & CONT_START) && !(prevButtons & CONT_START))
        {
            isRunning = false;
        }

        if ((state->buttons & CONT_A) && !(prevButtons & CONT_A))
        {
            isDrawingArrays = !isDrawingArrays;
            /* Change the clear color so the active draw path is visible. */
            if (isDrawingArrays)
            {
                glClearColor(0.3f, 0.0f, 0.3f, 1.0f);
            }
            else
            {
                glClearColor(0.0f, 0.0f, 0.3f, 1.0f);
            }
        }

        if ((state->buttons & CONT_B) && !(prevButtons & CONT_B))
        {
            isBlendingEnabled = !isBlendingEnabled;
            if (isBlendingEnabled)
            {
                glEnable(GL_BLEND);
            }
            else
            {
                glDisable(GL_BLEND);
            }
        }

        prevButtons = state->buttons;
    }
#endif
}
/* Present the finished frame (GLdc buffer swap on Dreamcast; no-op
 * elsewhere). */
void swapBuffers()
{
#ifdef __DREAMCAST__
    glKosSwapBuffers();
#endif
}
/* Entry point: set up GL and the projection, seed the scene with random
 * cubes, then run the update/input/present loop until START is pressed. */
int main(int argc, char* argv[])
{
    initialize();

    // Setup camera frustum.
    const float aspectRatio = 640.0f / 480.0f;
    const float fieldOfView = 60;
    const float nearPlane = 0.1f;
    const float farPlane = 1000.0f;
    gluPerspective(fieldOfView, aspectRatio, nearPlane, farPlane);

    // Populate the scene with randomly placed, randomly moving cubes.
    // Each rnd() call is a separate statement so the rand() draw order
    // stays fixed.
    for (int i = 0; i < MAX_CUBES; ++i)
    {
        const float radius = rnd(0.1f, 0.5f);
        const float px = rnd(-3.0f, 3.0f);
        const float py = rnd(-3.0f, 3.0f);
        const float pz = rnd(-3.0f, 3.0f);
        const float sx = rnd(-2.0f, 2.0f);
        const float sy = rnd(-2.0f, 2.0f);
        const float sz = rnd(-2.0f, 2.0f);
        addCube(radius, px, py, pz, sx, sy, sz);
    }

    while (isRunning)
    {
        updateLogic();
        updateInput();
        swapBuffers();
        runningStats();
    }

    avgStats();
    return 0;
}

View File

@ -145,7 +145,7 @@ int check_start() {
void DrawCube(float x, float z) {
static float pos = 0.0f;
const static float radius = 30.0f;
static const float radius = 30.0f;
pos += 0.001f;

View File

@ -23,7 +23,11 @@ int ImageLoad(char *filename, Image *image) {
}
// seek through the bmp header, up to the width/height:
fseek(file, 18, SEEK_CUR);
fseek(file, 10, SEEK_CUR);
uint32_t offset;
fread(&offset, 4, 1, file);
fseek(file, 4, SEEK_CUR);
// read the width
if ((i = fread(&sizeX, 4, 1, file)) != 1) {
@ -65,7 +69,7 @@ int ImageLoad(char *filename, Image *image) {
}
// seek past the rest of the bitmap header.
fseek(file, 24, SEEK_CUR);
fseek(file, offset, SEEK_SET);
// read the data.
image->data = (char *) malloc(size);

View File

@ -9,7 +9,7 @@
/* A general OpenGL initialization function. Sets all of the initial parameters. */
void InitGL(int Width, int Height) // We call this right after our OpenGL window is created.
{
glClearColor(0.0f, 0.0f, 0.0f, 0.0f); // This Will Clear The Background Color To Black
glClearColor(0.0f, 0.0f, 1.0f, 0.0f); // This Will Clear The Background Color To Black
glClearDepth(1.0); // Enables Clearing Of The Depth Buffer
glDepthFunc(GL_LEQUAL); // The Type Of Depth Test To Do
glEnable(GL_DEPTH_TEST); // Enables Depth Testing
@ -20,7 +20,7 @@ void InitGL(int Width, int Height) // We call this right after our OpenG
gluPerspective(45.0f,(GLfloat)Width/(GLfloat)Height,0.1f,100.0f); // Calculate The Aspect Ratio Of The Window
glMatrixMode(GL_MODELVIEW);
glMatrixMode(GL_MODELVIEW);
}
/* The function called when our window is resized (which shouldn't happen, because we're fullscreen) */

View File

@ -53,10 +53,10 @@ void LoadGLTextures() {
// 2d texture, level of detail 0 (normal), 3 components (red, green, blue), x size from image, y size from image,
// border 0 (normal), rgb color data, unsigned byte data, and finally the data itself.
glTexImage2D(GL_TEXTURE_2D, 0, 3, image1->sizeX, image1->sizeY, 0, GL_RGB, GL_UNSIGNED_BYTE, image1->data);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, image1->sizeX, image1->sizeY, 0, GL_RGB, GL_UNSIGNED_BYTE, image1->data);
free(image1);
};
}
/* A general OpenGL initialization function. Sets all of the initial parameters. */
void InitGL(int Width, int Height) // We call this right after our OpenGL window is created.
@ -74,7 +74,7 @@ void InitGL(int Width, int Height) // We call this right after our OpenG
gluPerspective(45.0f,(GLfloat)Width/(GLfloat)Height,0.1f,100.0f); // Calculate The Aspect Ratio Of The Window
glMatrixMode(GL_MODELVIEW);
glMatrixMode(GL_MODELVIEW);
}
/* The function called when our window is resized (which shouldn't happen, because we're fullscreen) */

Binary file not shown.

Before

Width:  |  Height:  |  Size: 192 KiB

After

Width:  |  Height:  |  Size: 96 KiB

View File

@ -59,10 +59,10 @@ int ImageLoad(char *filename, Image *image) {
fread(&header, sizeof(header), 1, file);
GLboolean twiddled = (header.type & (1 << 25)) < 1;
GLboolean compressed = (header.type & (1 << 29)) > 0;
GLboolean mipmapped = (header.type & (1 << 30)) > 0;
GLboolean strided = (header.type & (1 << 24)) > 0;
GLboolean twiddled = (header.type & (1 << 26)) < 1;
GLboolean compressed = (header.type & (1 << 30)) > 0;
GLboolean mipmapped = (header.type & (1 << 31)) > 0;
GLboolean strided = (header.type & (1 << 25)) > 0;
GLuint format = (header.type >> 27) & 0b111;
image->data = (char *) malloc (header.size);

View File

@ -10,6 +10,8 @@
#ifdef __DREAMCAST__
#include <kos.h>
#else
#include <SDL.h>
#endif
#include <stdio.h>
@ -17,7 +19,9 @@
#include <GL/glu.h>
#include <GL/glkos.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include "../loadbmp.h"
@ -84,7 +88,16 @@ void SetupWorld()
int numtriangles;
FILE *filein;
char oneline[255];
#ifdef __DREAMCAST__
filein = fopen("/rd/world.txt", "rt"); // File To Load World Data From
#else
filein = fopen("../samples/nehe10/romdisk/world.txt", "rt");
#endif
if(!filein) {
fprintf(stderr, "Failed to load world file\n");
exit(1);
}
readstr(filein,oneline);
sscanf(oneline, "NUMPOLLIES %d\n", &numtriangles);
@ -228,6 +241,13 @@ void DrawGLScene(void) {
}
int ReadController(void) {
bool start = false;
bool up = false;
bool down = false;
bool left = false;
bool right = false;
#ifdef __DREAMCAST__
maple_device_t *cont;
cont_state_t *state;
@ -241,10 +261,27 @@ int ReadController(void) {
return 0;
}
if(state->buttons & CONT_START)
return 0;
start = (state->buttons & CONT_START);
up = (state->buttons & CONT_DPAD_UP);
down = (state->buttons & CONT_DPAD_DOWN);
left = (state->buttons & CONT_DPAD_LEFT);
right = (state->buttons & CONT_DPAD_RIGHT);
if(state->buttons & CONT_DPAD_UP) {
#else
int num_keys = 0;
uint8_t* state = SDL_GetKeyboardState(&num_keys);
start = state[SDL_SCANCODE_RETURN];
up = state[SDL_SCANCODE_UP];
down = state[SDL_SCANCODE_DOWN];
left = state[SDL_SCANCODE_LEFT];
right = state[SDL_SCANCODE_RIGHT];
#endif
if(start) {
return 0;
}
if(up) {
xpos -= (float)sin(heading*piover180) * 0.05f;
zpos -= (float)cos(heading*piover180) * 0.05f;
if (walkbiasangle >= 359.0f)
@ -258,8 +295,7 @@ int ReadController(void) {
walkbias = (float)sin(walkbiasangle * piover180)/20.0f;
}
if(state->buttons & CONT_DPAD_DOWN) {
if(down) {
xpos += (float)sin(heading*piover180) * 0.05f;
zpos += (float)cos(heading*piover180) * 0.05f;
if (walkbiasangle <= 1.0f)
@ -273,18 +309,17 @@ int ReadController(void) {
walkbias = (float)sin(walkbiasangle * piover180)/20.0f;
}
if(state->buttons & CONT_DPAD_LEFT) {
if(left) {
heading += 1.0f;
yrot = heading;
}
if(state->buttons & CONT_DPAD_RIGHT) {
if(right) {
heading -= 1.0f;
yrot = heading;
}
#endif
/* Switch to the blended polygon list if needed */
if(blend) {

View File

@ -157,4 +157,4 @@ NUMPOLLIES 36
2.0 0.0 -0.5 0.0 0.0
3.0 1.0 -0.5 1.0 1.0
2.0 1.0 -0.5 0.0 1.0
2.0 0.0 -0.5 0.0 0.0
2.0 0.0 -0.5 0.0 0.0

View File

@ -132,7 +132,7 @@ void LoadGLTextures() {
// 2d texture, level of detail 0 (normal), 3 components (red, green, blue), x size from image, y size from image,
// border 0 (normal), rgb color data, unsigned byte data, and finally the data itself.
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX8_EXT, image1->width, image1->height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE_TWID_KOS, image1->data);
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX8_EXT, image1->width, image1->height, 0, GL_COLOR_INDEX8_TWID_KOS, GL_UNSIGNED_BYTE, image1->data);
glGenerateMipmapEXT(GL_TEXTURE_2D);
free(image1);

View File

@ -254,6 +254,8 @@ int BMP_Infos(FILE *pFile, uint32_t *width, uint32_t *height)
*width = (uint32_t)BmpInfoHeader.Width;
*height = (uint32_t)BmpInfoHeader.Height;
fseek(pFile, BmpInfoHeader.Size + 14, SEEK_SET);
return 1;
}
@ -270,6 +272,7 @@ int BMP_GetPalette(FILE *pFile)
bitCount = BmpInfoHeader.ClrImportant * sizeof(RGB_QUAD);
if (fread(BmpRgbQuad, 1, bitCount, pFile) != bitCount){
fprintf(stderr, "Failed to read palette: %d\n", bitCount);
return 0;
}
@ -281,6 +284,8 @@ int BMP_GetPalette(FILE *pFile)
}
return 1;
}
fprintf(stderr, "BitCount: %d\n", BmpInfoHeader.BitCount);
return 0;
}
@ -346,7 +351,7 @@ int LoadPalettedBMP(const char* filename, Image* image)
}
if (!BMP_GetPalette(fp)) {
printf("Only 16c BMP are supported for this sample");
printf("Only 16c BMP are supported for this sample\n");
return 0;
}
@ -429,7 +434,7 @@ void LoadGLTextures() {
#ifndef USE_16C_PALETTE
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX8_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image1.data);
#else
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image1.data);
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX4_EXT, GL_UNSIGNED_BYTE, image1.data);
#endif
glBindTexture(GL_TEXTURE_2D, textures[1]); // 2d texture (x and y size)
@ -444,7 +449,7 @@ void LoadGLTextures() {
#ifndef USE_16C_PALETTE
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX8_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image1.data);
#else
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image1.data);
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX4_EXT, GL_UNSIGNED_BYTE, image1.data);
#endif
glBindTexture(GL_TEXTURE_2D, textures[2]);
@ -463,7 +468,7 @@ void LoadGLTextures() {
#ifndef USE_16C_PALETTE
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX8_EXT, image2.width, image2.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image2.data);
#else
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image2.width, image2.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image2.data);
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image2.width, image2.height, 0, GL_COLOR_INDEX4_EXT, GL_UNSIGNED_BYTE, image2.data);
#endif
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 32 KiB

After

Width:  |  Height:  |  Size: 16 KiB

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,64 @@
#include <stddef.h>
#include <time.h>
#include <stdio.h>
#ifdef __DREAMCAST__
#include <kos.h>
#include "../profiler.h"
#endif
#include <GL/gl.h>
#include <GL/glkos.h>
#include "image.h"
#define PROFILE 0
/* Benchmark: repeatedly upload the same RGB image with glTexImage2D for
 * five seconds and report the call count and seconds-per-call.
 * width/height/header_data are supplied by the generated "image.h". */
int main(int argc, char* argv[]) {
    (void) argc;
    (void) argv;

    fprintf(stdout, "Initializing\n");
    glKosInit();

    /* One clear + swap so the driver is fully warmed up before timing */
    glClearColor(0.5f, 0.0f, 0.5f, 1.0f);
    glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
    glKosSwapBuffers();

    GLuint texture_id = 0;
    glGenTextures(1, &texture_id);
    glBindTexture(GL_TEXTURE_2D, texture_id);

    time_t start = time(NULL);
    time_t end = start;
    int counter = 0;  /* number of glTexImage2D calls completed */

    fprintf(stderr, "Starting test run...\n");

#ifdef __DREAMCAST__
#if PROFILE
    profiler_init("/pc/gmon.out");
    profiler_start();
#endif
#endif

    /* Hammer glTexImage2D until five wall-clock seconds have elapsed.
     * time() has 1-second resolution, which is fine at this scale. */
    while((end - start) < 5) {
        glTexImage2D(
            GL_TEXTURE_2D, 0, GL_RGB, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, header_data
        );
        ++counter;
        end = time(NULL);
    }

#ifdef __DREAMCAST__
#if PROFILE
    profiler_stop();
    profiler_clean_up();
#endif
#endif

    /* "%.4f per call" is seconds per call: elapsed time / call count */
    fprintf(stderr, "Called glTexImage2D %d times (%.4f per call)\n", counter, (float)(end - start) / (float)(counter));
    return 0;
}

View File

@ -68,21 +68,16 @@ int check_start() {
}
void setup() {
//PVR needs to warm up for a frame, or results will be low
glKosInit();
GLdcConfig cfg;
glKosInitConfig(&cfg);
cfg.initial_immediate_capacity = 14000;
glKosInitEx(&cfg);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glOrtho(0, 640, 0, 480, -100, 100);
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
glDisable(GL_NEARZ_CLIPPING_KOS);
#ifdef __DREAMCAST__
pvr_wait_ready();
pvr_scene_begin();
pvr_scene_finish();
#endif
}
void do_frame() {
@ -116,6 +111,8 @@ time_t begin;
void switch_tests(int ppf) {
printf("Beginning new test: %d polys per frame (%d per second at 60fps)\n",
ppf * 2, ppf * 2 * 60);
fflush(stdout);
avgfps = -1;
polycnt = ppf;
}
@ -128,7 +125,6 @@ void check_switch() {
if(now >= (begin + 5)) {
begin = time(NULL);
printf(" Average Frame Rate: ~%f fps (%d pps)\n", avgfps, (int)(polycnt * avgfps * 2));
switch(phase) {
case PHASE_HALVE:
@ -169,19 +165,24 @@ void check_switch() {
case PHASE_FINAL:
break;
}
fflush(stdout);
}
}
#define PROFILE 0
int main(int argc, char **argv) {
#ifndef NDEBUG
#ifdef __DREAMCAST__
#if PROFILE
profiler_init("/pc/gmon.out");
profiler_start();
#endif
#endif
setup();
#if PROFILE
profiler_start();
#endif
/* Start off with something obscene */
switch_tests(200000 / 60);
begin = time(NULL);
@ -200,11 +201,9 @@ int main(int argc, char **argv) {
stats();
#ifdef __DREAMCAST__
#ifndef NDEBUG
#if PROFILE
profiler_stop();
profiler_clean_up();
#endif
#endif
return 0;

View File

@ -28,6 +28,8 @@ void InitGL(int Width, int Height) // We call this right after our OpenG
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glEnable(GL_CULL_FACE);
}
/* The function called when our window is resized (which shouldn't happen, because we're fullscreen) */
@ -86,12 +88,13 @@ void DrawGLScene()
rotation = (rotation > 360.0f) ? rotation - 360.0f : rotation;
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Clear The Screen And The Depth Buffer
glClearColor(0.5f, 0.5f, 0.5f, 0.5f);
glLoadIdentity(); // Reset The View
glDisable(GL_CULL_FACE);
glPushMatrix();
glTranslatef(0.0f, -1.0f, movement);
glTranslatef(0.0f, -1.0f, -movement);
glRotatef(rotation, 0.0f, 1.0f, 0.0f);
glBegin(GL_TRIANGLES);

26
tests/CMakeLists.txt Normal file
View File

@ -0,0 +1,26 @@
FILE(GLOB GL_TESTS ${CMAKE_CURRENT_SOURCE_DIR}/test_*.h)
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR})
SET(TEST_GENERATOR_BIN ${CMAKE_SOURCE_DIR}/tools/test_generator.py)
SET(TEST_MAIN_FILENAME ${CMAKE_CURRENT_BINARY_DIR}/main.cpp)
ADD_CUSTOM_COMMAND(
OUTPUT ${TEST_MAIN_FILENAME}
COMMAND ${TEST_GENERATOR_BIN} --output ${TEST_MAIN_FILENAME} ${TEST_FILES} ${GL_TESTS}
DEPENDS ${TEST_FILES} ${GL_TESTS} ${TEST_GENERATOR_BIN}
)
add_executable(gldc_tests ${TEST_FILES} ${TEST_SOURCES} ${TEST_MAIN_FILENAME})
target_link_libraries(gldc_tests GLdc)
if(NOT PLATFORM_DREAMCAST)
set_target_properties(
gldc_tests
PROPERTIES
COMPILE_OPTIONS "-m32"
LINK_OPTIONS "-m32"
)
endif()

189
tests/test_allocator.h Normal file
View File

@ -0,0 +1,189 @@
#include "tools/test.h"
#include <cstdint>
#include <cassert>
#include <malloc.h>
#include <utility>
#include <GL/gl.h>
#include <GL/glkos.h>
#include "GL/alloc/alloc.h"
/* Round `n` up to the nearest multiple of `multiple` (must be non-zero). */
static inline int round_up(int n, int multiple)
{
    assert(multiple);
    const int quotient = (n + multiple - 1) / multiple;
    return quotient * multiple;
}
#define POOL_SIZE (16 * 2048)
/* Unit tests for the block allocator in GL/alloc/alloc.h, which manages a
 * pool of texture memory in 2048-byte-aligned blocks. */
class AllocatorTests : public test::TestCase {
public:
    uint8_t* pool = NULL;  // 2048-aligned backing buffer of POOL_SIZE bytes
    std::vector<std::pair<void*, void*>> defrag_moves;  // (src, dst) pairs recorded by on_defrag

    void set_up() {
        pool = (uint8_t*) memalign(2048, POOL_SIZE);
        assert(((intptr_t) pool) % 2048 == 0);
    }

    void tear_down() {
        alloc_shutdown(pool);
        free(pool);
    }

    /* Callback handed to alloc_run_defrag; records every block move. */
    static void on_defrag(void* src, void* dst, void* user_data) {
        AllocatorTests* self = (AllocatorTests*) user_data;
        self->defrag_moves.push_back(std::make_pair(src, dst));
    }

    /* Freeing a middle allocation and defragging should slide the last
     * allocation down into the freed hole. */
    void test_defrag() {
        alloc_init(pool, POOL_SIZE);

        alloc_malloc(pool, 256);
        void* a2 = alloc_malloc(pool, 256);
        void* a3 = alloc_malloc(pool, 256);

        alloc_free(pool, a2);
        alloc_run_defrag(pool, &AllocatorTests::on_defrag, 5, this);

        assert_equal(defrag_moves.size(), 1u); // Moved a3 -> a2
        assert_equal(defrag_moves[0].first, a3);
        assert_equal(defrag_moves[0].second, a2);
        /* The slot a3 used to occupy is now the next free space */
        assert_equal(alloc_malloc(pool, 256), a3);
    }

    void test_poor_alloc_aligned() {
        /* If we try to allocate and there are no suitable aligned
         * slots available, we fallback to any available unaligned slots */
        alloc_init(pool, POOL_SIZE);

        // Leave only space for an unaligned block
        alloc_malloc(pool, (15 * 2048) - 256);

        // Should work, we have space (just) but it's not aligned
        void* a1 = alloc_malloc(pool, 2048 + 256);
        assert_is_not_null(a1);
        assert_equal(a1, pool + ((15 * 2048) - 256));
    }

    void test_poor_alloc_straddling() {
        /*
         * If we try to allocate a small block, it should not
         * cross a 2048 boundary unless there is no other option */
        alloc_init(pool, POOL_SIZE);
        alloc_malloc(pool, (15 * 2048) - 256);
        void* a1 = alloc_malloc(pool, 512);
        assert_true((uintptr_t(a1) % 2048) == 0); // Should've aligned to the last 2048 block

        /* Allocate the rest of the last block, this leaves a 256 block in the
         * penultimate block */
        alloc_malloc(pool, 1536);
        alloc_free(pool, a1);

        /* No choice but to straddle the boundary */
        a1 = alloc_malloc(pool, 768);
    }

    /* After init, the usable region starts at the first 2048 boundary
     * inside the pool, and the block count covers the remaining space. */
    void test_alloc_init() {
        alloc_init(pool, POOL_SIZE);

        void* expected_base_address = (void*) round_up((uintptr_t) pool, 2048);
        assert_equal(alloc_next_available(pool, 16), expected_base_address);
        assert_equal(alloc_base_address(pool), expected_base_address);

        size_t expected_blocks = (
            uintptr_t(pool + POOL_SIZE) -
            uintptr_t(expected_base_address)
        ) / 2048;

        assert_equal(alloc_block_count(pool), expected_blocks);
    }

    /* Regression: a sequence of allocs/frees that previously misbehaved
     * (see "Fix infinite loop" history). Passing == not crashing/looping. */
    void test_complex_case() {
        uint8_t* large_pool = (uint8_t*) malloc(8 * 1024 * 1024);

        alloc_init(large_pool, 8 * 1024 * 1024);
        alloc_malloc(large_pool, 262144);
        alloc_malloc(large_pool, 262144);
        void* a1 = alloc_malloc(large_pool, 524288);
        alloc_free(large_pool, a1);
        alloc_malloc(large_pool, 699056);
        alloc_malloc(large_pool, 128);
        alloc_shutdown(large_pool);

        free(large_pool);
    }

    /* Second regression sequence: repeated alloc/free of same-size blocks
     * interleaved with odd-sized allocations. */
    void test_complex_case2() {
        uint8_t* large_pool = (uint8_t*) malloc(8 * 1024 * 1024);
        alloc_init(large_pool, 8 * 1024 * 1024);

        void* a1 = alloc_malloc(large_pool, 131072);
        alloc_free(large_pool, a1);

        alloc_malloc(large_pool, 174768);
        void* a2 = alloc_malloc(large_pool, 131072);
        alloc_free(large_pool, a2);

        alloc_malloc(large_pool, 174768);
        void* a3 = alloc_malloc(large_pool, 128);
        alloc_free(large_pool, a3);

        alloc_shutdown(large_pool);
        free(large_pool);
    }

    /* Exercises the sub-block (< 2048 byte) placement rules. */
    void test_alloc_malloc() {
        alloc_init(pool, POOL_SIZE);

        uint8_t* base_address = (uint8_t*) alloc_base_address(pool);
        void* a1 = alloc_malloc(pool, 1024);

        /* First alloc should always be the base address */
        assert_equal(a1, base_address);

        /* An allocation of <= 2048 (well 1024) will not necessarily be at
         * a 2k boundary */
        void* expected_next_available = base_address + uintptr_t(1024);
        assert_equal(alloc_next_available(pool, 1024), expected_next_available);

        /* Requesting 2k though will force to a 2k boundary */
        expected_next_available = base_address + uintptr_t(2048);
        assert_equal(alloc_next_available(pool, 2048), expected_next_available);

        /* Now alloc 2048 bytes, this should be on the 2k boundary */
        void* a2 = alloc_malloc(pool, 2048);
        assert_equal(a2, expected_next_available);

        /* If we try to allocate 1k, this should go in the second half of the
         * first block */
        expected_next_available = base_address + uintptr_t(1024);
        void* a3 = alloc_malloc(pool, 1024);
        assert_equal(a3, expected_next_available);

        alloc_free(pool, a1);

        /* Next allocation would go in the just freed block */
        expected_next_available = base_address;
        assert_equal(alloc_next_available(pool, 64), expected_next_available);

        /* Now allocate 14 more 2048 size blocks, the following one should
         * return NULL */
        for(int i = 0; i < 14; ++i) {
            alloc_malloc(pool, 2048);
        }

        assert_is_null(alloc_malloc(pool, 2048));

        /* But we should still have room in the second block for this */
        assert_is_not_null(alloc_malloc(pool, 64));
    }
};

77
tests/test_glteximage2d.h Normal file
View File

@ -0,0 +1,77 @@
#include "tools/test.h"
#include <stdint.h>
#include <GL/gl.h>
#include <GL/glkos.h>
/* Tests that glTexImage2D converts client-side formats to the expected
 * GLdc internal formats, both with and without texture twiddling. */
class TexImage2DTests : public test::TestCase {
public:
    uint8_t image_data[8 * 8 * 4] = {0};  // 8x8 RGBA source texture

    void set_up() {
        GLdcConfig config;
        glKosInitConfig(&config);
        /* Twiddling off by default; individual tests enable it explicitly */
        config.texture_twiddle = false;
        glKosInitEx(&config);

        /* Init image data so each texel RGBA value matches the
         * position in the array */
        for(int i = 0; i < 8 * 8 * 4; i += 4) {
            image_data[i + 0] = i;
            image_data[i + 1] = i;
            image_data[i + 2] = i;
            image_data[i + 3] = i;
        }
    }

    void tear_down() {
        glKosShutdown();
    }

    /* GL_RGB uploads should land as (untwiddled) RGB565. */
    void test_rgb_to_rgb565() {
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, 8, 8, 0, GL_RGB, GL_UNSIGNED_BYTE, image_data);
        assert_equal(glGetError(), GL_NO_ERROR);

        GLint internalFormat;
        glGetIntegerv(GL_TEXTURE_INTERNAL_FORMAT_KOS, &internalFormat);

        assert_equal(internalFormat, GL_RGB565_KOS);
    }

    /* With GL_TEXTURE_TWIDDLE_KOS enabled, RGB becomes twiddled RGB565. */
    void test_rgb_to_rgb565_twiddle() {
        glEnable(GL_TEXTURE_TWIDDLE_KOS);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, 8, 8, 0, GL_RGB, GL_UNSIGNED_BYTE, image_data);
        glDisable(GL_TEXTURE_TWIDDLE_KOS);

        assert_equal(glGetError(), GL_NO_ERROR);

        GLint internalFormat;
        glGetIntegerv(GL_TEXTURE_INTERNAL_FORMAT_KOS, &internalFormat);

        assert_equal(internalFormat, GL_RGB565_TWID_KOS);
    }

    /* GL_RGBA uploads should land as (untwiddled) ARGB4444. */
    void test_rgba_to_argb4444() {
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 8, 8, 0, GL_RGBA, GL_UNSIGNED_BYTE, image_data);
        assert_equal(glGetError(), GL_NO_ERROR);

        GLint internalFormat;
        glGetIntegerv(GL_TEXTURE_INTERNAL_FORMAT_KOS, &internalFormat);

        assert_equal(internalFormat, GL_ARGB4444_KOS);
    }

    /* With twiddling enabled, RGBA becomes twiddled ARGB4444. */
    void test_rgba_to_argb4444_twiddle() {
        glEnable(GL_TEXTURE_TWIDDLE_KOS);
        glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 8, 8, 0, GL_RGBA, GL_UNSIGNED_BYTE, image_data);
        glDisable(GL_TEXTURE_TWIDDLE_KOS);

        assert_equal(glGetError(), GL_NO_ERROR);

        GLint internalFormat;
        glGetIntegerv(GL_TEXTURE_INTERNAL_FORMAT_KOS, &internalFormat);

        assert_equal(internalFormat, GL_ARGB4444_TWID_KOS);
    }
};

637
tests/zclip/main.cpp Normal file
View File

@ -0,0 +1,637 @@
#include <cstdint>
#include <vector>
#include <cstdio>
#include <cmath>
#include <stdexcept>
#include <cassert>
#define SQ_BASE_ADDRESS 0
#define SPAN_SORT_CFG 0
#define PVR_SET(x, y) (void)(x); (void)(y)
/* One display-list element as submitted to the GPU: a command word followed
 * by position, texture coordinates, clip-space w, and packed colour. */
struct Vertex {
    uint32_t flags;   // GPU_CMD_* command word
    float xyz[3];     // position (clip space on input, screen space after divide)
    float uv[2];      // texture coordinates
    float w;          // clip-space w component
    uint8_t bgra[4];  // packed BGRA colour
};
/* Viewport transform constants: half-extents and centre offsets used by
 * _glPerspectiveDivideVertex to map NDC to screen coordinates.
 * Values correspond to a 640x480 render target. */
struct {
    float hwidth;          // half the viewport width
    float x_plus_hwidth;   // viewport x offset + half width
    float hheight;         // half the viewport height
    float y_plus_hheight;  // viewport y offset + half height
} VIEWPORT = {320, 320, 240, 240};
/* Minimal stand-in for the driver's video-mode query. */
struct VideoMode {
    float height;  // display height in pixels, used to flip the Y axis
};

static VideoMode* GetVideoMode() {
    /* NOTE(review): height is 320 here while VIEWPORT implies a 480-high
     * display — confirm the mismatch is intentional for these tests. */
    static VideoMode mode = {320.0f};
    return &mode;
}
/* PVR display-list command words stored in Vertex::flags. */
enum GPUCommand {
    GPU_CMD_POLYHDR = 0x80840000,     // polygon header
    GPU_CMD_VERTEX = 0xe0000000,      // strip vertex
    GPU_CMD_VERTEX_EOL = 0xf0000000,  // final vertex of a strip
    GPU_CMD_USERCLIP = 0x20000000,
    GPU_CMD_MODIFIER = 0x80000000,
    GPU_CMD_SPRITE = 0xA0000000
};
static std::vector<Vertex> sent;
/* Linearly blend two packed BGRA colours: *out = a * (1 - t) + b * t.
 * Processes two byte lanes at a time: MASK1 isolates bytes 0 and 2, MASK2
 * bytes 1 and 3, giving each channel headroom for the 8.8 fixed-point
 * multiply before the >> 8 renormalisation. f1 + f2 == 256, so the blend
 * is weight-preserving; t is expected in [0, 1].
 * NOTE(review): the (*a & MASK2) * f1 product can exceed 32 bits when the
 * top byte is large — confirm the wraparound is acceptable here. */
static inline void interpolateColour(const uint32_t* a, const uint32_t* b, const float t, uint32_t* out) {
    const static uint32_t MASK1 = 0x00FF00FF;
    const static uint32_t MASK2 = 0xFF00FF00;

    const uint32_t f2 = 256 * t;
    const uint32_t f1 = 256 - f2;

    *out = (((((*a & MASK1) * f1) + ((*b & MASK1) * f2)) >> 8) & MASK1) |
           (((((*a & MASK2) * f1) + ((*b & MASK2) * f2)) >> 8) & MASK2);
}
/* Intersect the edge v1->v2 with the near plane (z = -w) and write the
 * interpolated vertex (position, w, uv, colour) to vout.
 *
 * d0/d1 are the signed distances of the endpoints from the plane. The
 * interpolation factor is t = d0 / (d0 - d1): 1.f / sqrtf(n * n) computes
 * 1/|n| (the fast reciprocal-sqrt path on SH4) and `sign` restores the
 * direction. `epsilon` nudges t slightly towards the inside so the new
 * vertex never lands exactly on the plane. */
static inline void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) {
    /* Clipping time! */

    const float d0 = v1->w + v1->xyz[2];
    const float d1 = v2->w + v2->xyz[2];

    const float sign = ((2.0f * (d1 < d0)) - 1.0f);
    const float epsilon = -0.00001f * sign;
    const float n = (d0 - d1);
    const float r = (1.f / sqrtf(n * n)) * sign;
    float t = fmaf(r, d0, epsilon);

    vout->xyz[0] = fmaf(v2->xyz[0] - v1->xyz[0], t, v1->xyz[0]);
    vout->xyz[1] = fmaf(v2->xyz[1] - v1->xyz[1], t, v1->xyz[1]);
    vout->xyz[2] = fmaf(v2->xyz[2] - v1->xyz[2], t, v1->xyz[2]);

    vout->w = fmaf(v2->w - v1->w, t, v1->w);

    vout->uv[0] = fmaf(v2->uv[0] - v1->uv[0], t, v1->uv[0]);
    vout->uv[1] = fmaf(v2->uv[1] - v1->uv[1], t, v1->uv[1]);

    interpolateColour((uint32_t*) v1->bgra, (uint32_t*) v2->bgra, t, (uint32_t*) vout->bgra);
}
/* True for either vertex command: mid-strip or end-of-strip. */
bool glIsVertex(const uint32_t flags) {
    switch(flags) {
        case GPU_CMD_VERTEX:
        case GPU_CMD_VERTEX_EOL:
            return true;
        default:
            return false;
    }
}
/* Only the end-of-list vertex command terminates a strip. */
bool glIsLastVertex(const uint32_t flags) {
    const bool is_eol = (flags == GPU_CMD_VERTEX_EOL);
    return is_eol;
}
/* Test stub for the store-queue submission path: instead of writing to the
 * hardware, append the submitted header/vertex to the global `sent` vector
 * so the tests can inspect the emitted stream. The destination pointer is
 * ignored. */
void _glSubmitHeaderOrVertex(volatile uint32_t*, Vertex* vtx) {
    sent.push_back(*vtx);
}
/* Return 1 / |x|. sqrtf(x * x) == |x|, written this way so the SH4 build
 * can use the fast reciprocal-square-root instruction rather than a divide.
 * NOTE(review): x * x overflows to +inf for |x| > ~1.8e19, making the
 * result 0 — confirm callers never pass such magnitudes. */
float _glFastInvert(float x) {
    return (1.f / __builtin_sqrtf(x * x));
}
/* Apply the perspective divide to a clip-space vertex and map it to screen
 * coordinates using the global VIEWPORT. h is the display height in pixels,
 * used to flip Y (screen Y grows downwards). xyz[2] is replaced by a
 * depth value suitable for the hardware (see comment below). */
void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
    const float f = _glFastInvert(vertex->w);

    /* Convert to NDC and apply viewport */
    vertex->xyz[0] = __builtin_fmaf(
        VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth
    );
    vertex->xyz[1] = h - __builtin_fmaf(
        VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight
    );

    /* Orthographic projections need to use invZ otherwise we lose
       the depth information. As w == 1, and clip-space range is -w to +w
       we add 1.0 to the Z to bring it into range. We add a little extra to
       avoid a divide by zero.
    */
    vertex->xyz[2] = (vertex->w == 1.0f) ? _glFastInvert(1.0001f + vertex->xyz[2]) : f;
}
/* Copy one Vertex; plain assignment lets the compiler emit the best copy. */
void memcpy_vertex(Vertex* dst, Vertex* src) {
    dst[0] = src[0];
}
/* Z-clipping is so difficult to get right that this sample tests all the cases of clipping and makes sure that things work as expected */
#ifdef __DREAMCAST__
static volatile int *pvrdmacfg = (int*)0xA05F6888;
static volatile int *qacr = (int*)0xFF000038;
#else
static int pvrdmacfg[2];
static int qacr[2];
#endif
/* Submit a triangle-strip display list to the PVR, clipping each triangle
 * against the near plane (z >= -w) as it streams through.
 *
 * src points at a GPU_CMD_POLYHDR element followed by vertices, the final
 * one flagged GPU_CMD_VERTEX_EOL; n is the total element count. Vertices
 * flow through a small ring buffer so every strip triangle is examined
 * with all three corners available; visible_mask holds one visibility bit
 * per queued vertex (bit 0 = oldest) and selects one of the 8 clip cases. */
void SceneListSubmit(void* src, int n) {
    /* You need at least a header, and 3 vertices to render anything */
    if(n < 4) {
        return;
    }

    const float h = GetVideoMode()->height;

    PVR_SET(SPAN_SORT_CFG, 0x0);

    //Set PVR DMA registers
    pvrdmacfg[0] = 1;
    pvrdmacfg[1] = 1;

    //Set QACR registers
    qacr[1] = qacr[0] = 0x11;

    volatile uint32_t *d = SQ_BASE_ADDRESS;

    int8_t queue_head = 0;
    int8_t queue_tail = 0;

    /* Ring buffer of pending strip vertices; head == tail means empty.
     * NOTE(review): the original comment claimed clipping can put up to 5
     * vertices in flight, yet the buffer holds 4 — confirm that 4 slots
     * are sufficient for every clip path. */
    Vertex __attribute__((aligned(32))) queue[4];
    const int queue_capacity = sizeof(queue) / sizeof(Vertex);

    Vertex* vertex = (Vertex*) src;
    uint32_t visible_mask = 0;

#if CLIP_DEBUG
    for(int i = 0; i < n; ++i) {
        fprintf(stderr, "{%f, %f, %f, %f}, // %x (%x)\n", vertex[i].xyz[0], vertex[i].xyz[1], vertex[i].xyz[2], vertex[i].w, vertex[i].flags, &vertex[i]);
    }

    fprintf(stderr, "----\n");
#endif

    while(n--) {
        bool last_vertex = false;
        memcpy_vertex(queue + queue_tail, vertex);
        ++vertex;
        switch(queue[queue_tail].flags) {
            case GPU_CMD_POLYHDR:
                /* Headers bypass clipping and go straight out */
                _glSubmitHeaderOrVertex(d, &queue[queue_tail]);
                break;
            case GPU_CMD_VERTEX_EOL:
                last_vertex = true; // fallthru
            case GPU_CMD_VERTEX:
                /* Shift in this vertex's in-front-of-plane bit as bit 2 */
                visible_mask = (visible_mask >> 1) | (queue[queue_tail].xyz[2] >= -queue[queue_tail].w) << 2;
                assert(visible_mask < 15);
                queue_tail = (queue_tail + 1) % queue_capacity;
            default:
                break;
        }

        int counter = (queue_tail - queue_head + queue_capacity) % queue_capacity;
        if(counter < 3) {
            /* Not enough queued vertices to form a triangle yet */
            continue;
        }

#if CLIP_DEBUG
        fprintf(stderr, "%d\n", visible_mask);
#endif
        /* Scratch vertices produced by edge clipping */
        Vertex __attribute__((aligned(32))) a, b;
        switch(visible_mask) {
            case 0:
                /* All three corners behind the plane: emit nothing */
                break;
            case 7:
                /* All visible, push the first vertex and move on */
                _glPerspectiveDivideVertex(&queue[queue_head], h);
                _glSubmitHeaderOrVertex(d, &queue[queue_head]);

                if(last_vertex) {
                    /* If this was the last vertex in the strip, we need to flush the queue and then
                       restart it again */

                    int v1 = (queue_head + 1) % queue_capacity;
                    int v2 = (queue_head + 2) % queue_capacity;
                    _glPerspectiveDivideVertex(&queue[v1], h);
                    _glSubmitHeaderOrVertex(d, &queue[v1]);
                    _glPerspectiveDivideVertex(&queue[v2], h);
                    _glSubmitHeaderOrVertex(d, &queue[v2]);
                }
                break;
            case 1:
                /* First vertex was visible */
                {
                    Vertex* v0 = &queue[queue_head];
                    Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
                    Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(v0, v1, &a);
                    _glClipEdge(v2, v0, &b);
                    a.flags = GPU_CMD_VERTEX;

                    /* If v2 was the last in the strip, then b should be. If it wasn't
                       we'll create a degenerate triangle by adding b twice in a row so that the
                       strip processing will continue correctly after crossing the plane so it can
                       cross back */
                    b.flags = v2->flags;

                    _glPerspectiveDivideVertex(v0, h);
                    _glPerspectiveDivideVertex(&a, h);
                    _glPerspectiveDivideVertex(&b, h);

                    _glSubmitHeaderOrVertex(d, v0);
                    _glSubmitHeaderOrVertex(d, &a);
                    _glSubmitHeaderOrVertex(d, &b);
                    _glSubmitHeaderOrVertex(d, &b);
                }
                break;
            case 2:
                /* Second vertex was visible. In this case we need to create a triangle and produce
                   two new vertices: 1-2, and 2-3.
                   NOTE(review): this path divides and submits v0 (the corner
                   behind the plane) rather than the visible v1 — confirm
                   whether that is intentional. */
                {
                    Vertex* v0 = &queue[queue_head];
                    const Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
                    const Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(v0, v1, &a);
                    _glClipEdge(v1, v2, &b);
                    a.flags = GPU_CMD_VERTEX;
                    b.flags = v2->flags;

                    _glPerspectiveDivideVertex(v0, h);
                    _glPerspectiveDivideVertex(&a, h);
                    _glPerspectiveDivideVertex(&b, h);

                    _glSubmitHeaderOrVertex(d, &a);
                    _glSubmitHeaderOrVertex(d, v0);
                    _glSubmitHeaderOrVertex(d, &b);
                }
                break;
            case 3: /* First and second vertex were visible */
                {
                    Vertex* v0 = &queue[queue_head];
                    Vertex __attribute__((aligned(32))) v1 = queue[(queue_head + 1) % queue_capacity];
                    Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(&v1, v2, &a);
                    _glClipEdge(v2, v0, &b);
                    a.flags = v2->flags;
                    b.flags = GPU_CMD_VERTEX;

                    _glPerspectiveDivideVertex(v0, h);
                    _glPerspectiveDivideVertex(&v1, h);
                    _glPerspectiveDivideVertex(&a, h);
                    _glPerspectiveDivideVertex(&b, h);

                    _glSubmitHeaderOrVertex(d, v0);
                    _glSubmitHeaderOrVertex(d, &v1);
                    _glSubmitHeaderOrVertex(d, &b);
                    _glSubmitHeaderOrVertex(d, &v1);
                    _glSubmitHeaderOrVertex(d, &a);
                }
                break;
            case 4:
                /* Third vertex was visible. */
                {
                    Vertex* v0 = &queue[queue_head];
                    Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
                    Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(&v2, v0, &a);
                    _glClipEdge(v1, &v2, &b);
                    a.flags = GPU_CMD_VERTEX;
                    b.flags = GPU_CMD_VERTEX;

                    _glPerspectiveDivideVertex(&v2, h);
                    _glPerspectiveDivideVertex(&a, h);
                    _glPerspectiveDivideVertex(&b, h);

                    /* `a` is duplicated to keep strip winding consistent */
                    _glSubmitHeaderOrVertex(d, &a);
                    _glSubmitHeaderOrVertex(d, &a);
                    _glSubmitHeaderOrVertex(d, &b);
                    _glSubmitHeaderOrVertex(d, &v2);
                }
                break;
            case 5: /* First and third vertex were visible */
                {
                    Vertex* v0 = &queue[queue_head];
                    Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
                    Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(v0, v1, &a);
                    _glClipEdge(v1, &v2, &b);
                    a.flags = GPU_CMD_VERTEX;
                    b.flags = GPU_CMD_VERTEX;

                    _glPerspectiveDivideVertex(v0, h);
                    _glPerspectiveDivideVertex(&v2, h);
                    _glPerspectiveDivideVertex(&a, h);
                    _glPerspectiveDivideVertex(&b, h);

                    _glSubmitHeaderOrVertex(d, v0);
                    _glSubmitHeaderOrVertex(d, &a);

                    /* v2 appears mid-strip first (as a plain vertex) and then
                       again with its original flags to close the triangle */
                    uint32_t v2_flags = v2.flags;
                    v2.flags = GPU_CMD_VERTEX;

                    _glSubmitHeaderOrVertex(d, &v2);

                    v2.flags = v2_flags;

                    _glSubmitHeaderOrVertex(d, &b);
                    _glSubmitHeaderOrVertex(d, &v2);
                }
                break;
            case 6: /* Second and third vertex were visible */
                {
                    Vertex* v0 = &queue[queue_head];
                    Vertex __attribute__((aligned(32))) v1 = queue[(queue_head + 1) % queue_capacity];
                    Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(v0, &v1, &a);
                    _glClipEdge(&v2, v0, &b);
                    a.flags = GPU_CMD_VERTEX;
                    b.flags = GPU_CMD_VERTEX;

                    _glPerspectiveDivideVertex(&v1, h);
                    _glPerspectiveDivideVertex(&v2, h);
                    _glPerspectiveDivideVertex(&a, h);
                    _glPerspectiveDivideVertex(&b, h);

                    _glSubmitHeaderOrVertex(d, &a);
                    _glSubmitHeaderOrVertex(d, &v1);
                    _glSubmitHeaderOrVertex(d, &b);
                    _glSubmitHeaderOrVertex(d, &v1);
                    _glSubmitHeaderOrVertex(d, &v2);
                }
                break;
            default:
                break;
        }

        if(last_vertex) {
            /* End of strip: reset the queue and visibility state */
            visible_mask = queue_head = queue_tail = 0;
        } else {
            queue_head = (queue_head + 1) % queue_capacity;
        }
    }
}
/* Minimal clip-space vertex description used to build test input strips. */
struct VertexTmpl {
    VertexTmpl(float x, float y, float z, float w):
        x(x), y(y), z(z), w(w) {}

    float x, y, z, w;
};
std::vector<Vertex> make_vertices(const std::vector<VertexTmpl>& verts) {
std::vector<Vertex> result;
Vertex r;
r.flags = GPU_CMD_POLYHDR;
result.push_back(r);
for(auto& v: verts) {
r.flags = GPU_CMD_VERTEX;
r.xyz[0] = v.x;
r.xyz[1] = v.y;
r.xyz[2] = v.z;
r.uv[0] = 0.0f;
r.uv[1] = 0.0f;
r.w = v.w;
result.push_back(r);
}
result.back().flags = GPU_CMD_VERTEX_EOL;
return result;
}
/* Fail the running test by throwing when the two values compare unequal
 * (via operator!=). */
template<typename T, typename U>
void check_equal(const T& lhs, const U& rhs) {
    if(!(lhs != rhs)) {
        return;
    }
    throw std::runtime_error("Assertion failed");
}
/* Vertices compare equal when position and w match exactly; uv, colour and
 * flags are deliberately ignored. */
template<>
void check_equal(const Vertex& lhs, const Vertex& rhs) {
    const bool same_position =
        lhs.xyz[0] == rhs.xyz[0] &&
        lhs.xyz[1] == rhs.xyz[1] &&
        lhs.xyz[2] == rhs.xyz[2];

    if(!same_position || lhs.w != rhs.w) {
        throw std::runtime_error("Assertion failed");
    }
}
bool test_clip_case_001() {
/* The first vertex is visible only */
sent.clear();
auto data = make_vertices({
{0.000000, -2.414213, 3.080808, 5.000000},
{-4.526650, -2.414213, -7.121212, -5.000000},
{4.526650, -2.414213, -7.121212, -5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 5);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
// Because we're sending a single triangle, we end up sending a
// degenerate final vert. But if we were sending more than one triangle
// this would be GPU_CMD_VERTEX twice
check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[4].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[3], sent[4]);
return true;
}
bool test_clip_case_010() {
/* The third vertex is visible only */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, -7.121212, -5.000000},
{0.000000, -2.414213, 3.080808, 5.000000},
{4.526650, -2.414213, -7.121212, -5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 4);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
return true;
}
bool test_clip_case_100() {
/* The third vertex is visible only */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, -7.121212, -5.000000},
{4.526650, -2.414213, -7.121212, -5.000000},
{0.000000, -2.414213, 3.080808, 5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 5);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
// Because we're sending a single triangle, we end up sending a
// degenerate final vert. But if we were sending more than one triangle
// this would be GPU_CMD_VERTEX twice
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[1], sent[2]);
return true;
}
bool test_clip_case_110() {
/* 2nd and 3rd visible */
sent.clear();
auto data = make_vertices({
{0.0, -2.414213, -7.121212, -5.000000},
{-4.526650, -2.414213, 3.080808, 5.000000},
{4.526650, -2.414213, 3.080808, 5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 6);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX);
check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[2], sent[4]);
return true;
}
bool test_clip_case_011() {
/* 1st and 2nd visible */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, 3.080808, 5.000000},
{4.526650, -2.414213, 3.080808, 5.000000},
{0.0, -2.414213, -7.121212, -5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 6);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX);
check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[2], sent[4]);
return true;
}
bool test_clip_case_101() {
/* 1st and 3rd visible */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, 3.080808, 5.000000},
{0.0, -2.414213, -7.121212, -5.000000},
{4.526650, -2.414213, 3.080808, 5.000000},
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 6);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX);
check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[3], sent[5]);
return true;
}
bool test_clip_case_111() {
/* 1st and 3rd visible */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, 3.080808, 5.000000},
{0.0, -2.414213, -7.121212, 8.000000},
{4.526650, -2.414213, 3.080808, 5.000000},
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 4);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
return true;
}
bool test_start_behind() {
/* Triangle behind the plane, but the strip continues in front */
sent.clear();
auto data = make_vertices({
{-3.021717, -2.414213, -10.155344, -9.935254},
{5.915236, -2.414213, -9.354721, -9.136231},
{-5.915236, -2.414213, -0.264096, -0.063767},
{3.021717, -2.414213, 0.536527, 0.735255},
{-7.361995, -2.414213, 4.681529, 4.871976},
{1.574958, -2.414213, 5.482152, 5.670999},
});
SceneListSubmit(&data[0], data.size());
return true;
}
bool test_longer_strip() {
sent.clear();
auto data = make_vertices({
{-4.384623, -2.414213, -5.699644, -5.488456},
{4.667572, -2.414213, -5.621354, -5.410322},
{-4.667572, -2.414213, 4.319152, 4.510323},
{4.384623, -2.414213, 4.397442, 4.588456},
{-4.809045, -2.414213, 9.328549, 9.509711},
{4.243149, -2.414213, 9.406840, 9.587846},
});
SceneListSubmit(&data[0], data.size());
return true;
}
/* Entry point: run every z-clipping regression case in order. Each case
 * throws std::runtime_error on failure, so reaching the end means all
 * tests passed. (A case 000 — all vertices hidden — is not implemented.) */
int main(int argc, char* argv[]) {
    typedef bool (*TestFn)();

    const TestFn tests[] = {
        test_clip_case_001,
        test_clip_case_010,
        test_clip_case_100,
        test_clip_case_110,
        test_clip_case_011,
        test_clip_case_101,
        test_clip_case_111,
        test_start_behind,
        test_longer_strip,
    };

    for(const TestFn& test : tests) {
        test();
    }

    return 0;
}

451
tools/test.h Normal file
View File

@ -0,0 +1,451 @@
/* * Copyright (c) 2011-2017 Luke Benstead https://simulant-engine.appspot.com
*
* This file is part of Simulant.
*
* Simulant is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Simulant is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Simulant. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <vector>
#include <functional>
#include <stdexcept>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <fstream>
#include <memory>
/* Assertion entry points for use inside TestCase methods. Each macro
 * forwards to the matching _assert_* member function below, passing
 * __FILE__/__LINE__ so failures report the call site. */
#define assert_equal(expected, actual) _assert_equal((expected), (actual), __FILE__, __LINE__)
#define assert_not_equal(expected, actual) _assert_not_equal((expected), (actual), __FILE__, __LINE__)
#define assert_false(actual) _assert_false((actual), __FILE__, __LINE__)
#define assert_true(actual) _assert_true((actual), __FILE__, __LINE__)
#define assert_close(expected, actual, difference) _assert_close((expected), (actual), (difference), __FILE__, __LINE__)
#define assert_is_null(actual) _assert_is_null((actual), __FILE__, __LINE__)
#define assert_is_not_null(actual) _assert_is_not_null((actual), __FILE__, __LINE__)
#define assert_raises(exception, func) _assert_raises<exception>((func), __FILE__, __LINE__)
/* Note: the arguments are deliberately passed as (actual, expected), i.e.
 * swapped, into _assert_items_equal's (lhs, rhs) parameters. */
#define assert_items_equal(expected, actual) _assert_items_equal((actual), (expected), __FILE__, __LINE__)
#define not_implemented() _not_implemented(__FILE__, __LINE__)
namespace test {
/* Minimal "{0}", "{1}", ... placeholder formatter used to build failure
 * messages. Each call to format() substitutes one value per argument,
 * replacing only the first occurrence of the matching placeholder. */
class StringFormatter {
public:
    StringFormatter(const std::string& templ):
        templ_(templ) { }

    /* Tracks which "{N}" placeholder the variadic expansion is up to. */
    struct Counter {
        Counter(uint32_t c): c(c) {}
        uint32_t c;
    };

    template<typename T>
    std::string format(T value) {
        return _do_format(0, stringify(value));
    }

    template<typename T>
    std::string format(Counter count, T value) {
        return _do_format(count.c, stringify(value));
    }

    template<typename T, typename... Args>
    std::string format(T value, const Args&... args) {
        return StringFormatter(_do_format(0, stringify(value))).format(Counter(1), args...);
    }

    template<typename T, typename... Args>
    std::string format(Counter count, T value, const Args&... args) {
        return StringFormatter(_do_format(count.c, stringify(value))).format(Counter(count.c + 1), args...);
    }

    /* Replace the first occurrence of "{counter}" in the template with
     * `value`; returns the template unchanged if the placeholder is absent. */
    std::string _do_format(uint32_t counter, const std::string& value) {
        std::stringstream num; // Can't use to_string on all platforms
        num << counter;
        const std::string placeholder = "{" + num.str() + "}";

        std::string result = templ_;
        const size_t pos = result.find(placeholder);
        if(pos != std::string::npos) {
            result.replace(pos, placeholder.length(), value);
        }
        return result;
    }

private:
    /* Render any streamable value to its string form. */
    template<typename T>
    static std::string stringify(const T& value) {
        std::stringstream ss;
        ss << value;
        return ss.str();
    }

    std::string templ_;
};
/* Splits a string on newline characters, discarding empty segments. */
class StringSplitter {
public:
    StringSplitter(const std::string& str):
        str_(str) {
    }

    /* Return the non-empty '\n'-delimited pieces of the stored string. */
    std::vector<std::string> split() {
        std::vector<std::string> parts;
        std::string current;

        for(std::size_t i = 0; i < str_.size(); ++i) {
            const char ch = str_[i];
            if(ch != '\n') {
                current.push_back(ch);
                continue;
            }
            if(!current.empty()) {
                parts.push_back(current);
                current.clear();
            }
        }

        /* Trailing segment without a final newline */
        if(!current.empty()) {
            parts.push_back(current);
        }
        return parts;
    }

private:
    std::string str_;
};
typedef StringFormatter _Format; /* Shorthand used throughout the assert helpers to build messages */
/* Thrown by the TestCase::_assert_* helpers on failure. Carries the source
 * location captured by the assert_* macros so the runner can print the
 * failing file/line and excerpt the offending source line. */
class AssertionError : public std::logic_error {
public:
    /* Failure without a known source location (file empty, line -1). */
    AssertionError(const std::string& what):
        std::logic_error(what),
        file(""),
        line(-1) {
    }

    /* Failure tagged with the (file, line) pair of the failing assertion. */
    AssertionError(const std::pair<std::string, int> file_and_line, const std::string& what):
        std::logic_error(what),
        file(file_and_line.first),
        line(file_and_line.second) {
    }

    ~AssertionError() noexcept (true) {
    }

    /* Source location of the failed assertion ("" / -1 when unknown). */
    std::string file;
    int line;
};
/* Thrown by the not_implemented() macro; reported by the runner as SKIPPED. */
class NotImplementedError: public std::logic_error {
public:
    NotImplementedError(const std::string& file, int line):
        std::logic_error(make_message(file, line)) {}

private:
    /* Builds "Not implemented at <file>:<line>". */
    static std::string make_message(const std::string& file, int line) {
        std::stringstream ss;
        ss << "Not implemented at " << file << ":" << line;
        return ss.str();
    }
};
/* Thrown by TestCase::skip_if() to mark a test as skipped rather than
 * failed; the runner catches it and reports SKIPPED. */
class SkippedTestError: public std::logic_error {
public:
    SkippedTestError(const std::string& reason):
        std::logic_error(reason) {
    }
};
/* Base class for user test cases. Provides set_up()/tear_down() lifecycle
 * hooks and the _assert_* helpers that the assert_* macros above expand to.
 * Every helper throws test::AssertionError carrying the caller's file/line. */
class TestCase {
public:
    virtual ~TestCase() {}

    /* Called by the runner before/after each test method; override as needed. */
    virtual void set_up() {}
    virtual void tear_down() {}

    /* Abort the current test as SKIPPED when `flag` is true. */
    void skip_if(const bool& flag, const std::string& reason) {
        if(flag) { throw test::SkippedTestError(reason); }
    }

    /* Fails unless expected and actual compare equal (via operator!=). */
    template<typename T, typename U>
    void _assert_equal(T expected, U actual, std::string file, int line) {
        if(expected != actual) {
            auto file_and_line = std::make_pair(file, line);
            throw test::AssertionError(file_and_line, test::_Format("{0} does not match {1}").format(actual, expected));
        }
    }

    /* Fails when lhs equals rhs. Note: rhs is explicitly cast to T first. */
    template<typename T, typename U>
    void _assert_not_equal(T lhs, U rhs, std::string file, int line) {
        if(lhs == (T) rhs) {
            auto file_and_line = std::make_pair(file, line);
            throw test::AssertionError(file_and_line, test::_Format("{0} should not match {1}").format(lhs, rhs));
        }
    }

    /* Fails unless bool(actual) is true. */
    template<typename T>
    void _assert_true(T actual, std::string file, int line) {
        if(!bool(actual)) {
            auto file_and_line = std::make_pair(file, line);
            throw test::AssertionError(file_and_line, test::_Format("{0} is not true").format(bool(actual) ? "true" : "false"));
        }
    }

    /* Fails unless bool(actual) is false. */
    template<typename T>
    void _assert_false(T actual, std::string file, int line) {
        if(bool(actual)) {
            auto file_and_line = std::make_pair(file, line);
            throw test::AssertionError(file_and_line, test::_Format("{0} is not false").format(bool(actual) ? "true" : "false"));
        }
    }

    /* Fails unless actual lies within [expected - difference, expected + difference]. */
    template<typename T, typename U, typename V>
    void _assert_close(T expected, U actual, V difference, std::string file, int line) {
        if(actual < expected - difference ||
           actual > expected + difference) {
            auto file_and_line = std::make_pair(file, line);
            throw test::AssertionError(file_and_line, test::_Format("{0} is not close enough to {1}").format(actual, expected));
        }
    }

    /* Fails unless the pointer is NULL. */
    template<typename T>
    void _assert_is_null(T* thing, std::string file, int line) {
        if(thing != nullptr) {
            auto file_and_line = std::make_pair(file, line);
            throw test::AssertionError(file_and_line, "Pointer was not NULL");
        }
    }

    /* Fails when the pointer is NULL. */
    template<typename T>
    void _assert_is_not_null(T* thing, std::string file, int line) {
        if(thing == nullptr) {
            auto file_and_line = std::make_pair(file, line);
            throw test::AssertionError(file_and_line, "Pointer was unexpectedly NULL");
        }
    }

    /* Fails unless func() throws an exception of type T; the thrown
     * exception itself is swallowed.
     * NOTE(review): the failure AssertionError is thrown from inside the
     * try block, so it would itself be caught if T is AssertionError (or a
     * base of it) — confirm this is never used that way. */
    template<typename T, typename Func>
    void _assert_raises(Func func, std::string file, int line) {
        try {
            func();
            auto file_and_line = std::make_pair(file, line);
            throw test::AssertionError(file_and_line, test::_Format("Expected exception ({0}) was not thrown").format(typeid(T).name()));
        } catch(T& e) {}
    }

    /* Fails unless both containers have the same size and every element of
     * lhs appears somewhere in rhs (order-insensitive membership check). */
    template<typename T, typename U>
    void _assert_items_equal(const T& lhs, const U& rhs, std::string file, int line) {
        auto file_and_line = std::make_pair(file, line);

        if(lhs.size() != rhs.size()) {
            throw test::AssertionError(file_and_line, "Containers are not the same length");
        }

        for(auto item: lhs) {
            if(std::find(rhs.begin(), rhs.end(), item) == rhs.end()) {
                throw test::AssertionError(file_and_line, test::_Format("Container does not contain {0}").format(item));
            }
        }
    }

    /* Marks the test as not implemented (reported as SKIPPED by the runner). */
    void _not_implemented(std::string file, int line) {
        throw test::NotImplementedError(file, line);
    }
};
/* Collects registered test cases and runs them, printing colourised results
 * to stdout and optionally writing a junit-style XML report. */
class TestRunner {
public:
    /* Instantiate test-case class T and queue one runnable entry per member
     * pointer in `methods`; `names` must hold the matching "Class::method"
     * strings in the same order (tests_ and names_ stay in lockstep). */
    template<typename T, typename U>
    void register_case(std::vector<U> methods, std::vector<std::string> names) {
        std::shared_ptr<TestCase> instance = std::make_shared<T>();
        instances_.push_back(instance); //Hold on to it

        for(std::string name: names) {
            names_.push_back(name);
        }

        for(U& method: methods) {
            std::function<void()> func = std::bind(method, dynamic_cast<T*>(instance.get()));
            /* Wrap each test so tear_down() always runs, even when the test
             * body throws. */
            tests_.push_back([=]() {
                instance->set_up();
                try {
                    func();
                } catch(...) {
                    instance->tear_down();
                    throw;
                }
                instance->tear_down();
            });
        }
    }

    /* Run every registered test whose name starts with `test_case` (all
     * tests when empty), print a summary, and write junit XML to
     * `junit_output` when non-empty. Returns failed + crashed (0 == pass). */
    int32_t run(const std::string& test_case, const std::string& junit_output="") {
        int failed = 0;
        int skipped = 0;
        int ran = 0;
        int crashed = 0;

        auto new_tests = tests_;
        auto new_names = names_;

        if(!test_case.empty()) {
            /* Filter by prefix match against the "Class::method" names */
            new_tests.clear();
            new_names.clear();
            for(uint32_t i = 0; i < names_.size(); ++i) {
                if(names_[i].find(test_case) == 0) {
                    new_tests.push_back(tests_[i]);
                    new_names.push_back(names_[i]);
                }
            }
        }

        std::cout << std::endl << "Running " << new_tests.size() << " tests" << std::endl << std::endl;

        std::vector<std::string> junit_lines;
        junit_lines.push_back("<testsuites>\n");

        std::string klass = "";
        for(std::function<void ()> test: new_tests) {
            std::string name = new_names[ran];
            /* Everything before the first ':' is the class name */
            std::string this_klass(name.begin(), name.begin() + name.find_first_of(":"));
            bool close_klass = ran == (int) new_tests.size() - 1;

            /* Open a new <testsuite> whenever the class changes */
            if(this_klass != klass) {
                if(!klass.empty()) {
                    junit_lines.push_back(" </testsuite>\n");
                }
                klass = this_klass;
                junit_lines.push_back(" <testsuite name=\"" + this_klass + "\">\n");
            }

            try {
                junit_lines.push_back(" <testcase name=\"" + new_names[ran] + "\">\n");

                /* Pad so the OK/FAILED column lines up at column 76 */
                std::string output = " " + new_names[ran];
                for(int i = output.length(); i < 76; ++i) {
                    output += " ";
                }
                std::cout << output;

                test();
                std::cout << "\033[32m" << " OK " << "\033[0m" << std::endl;
                junit_lines.push_back(" </testcase>\n");
            } catch(test::NotImplementedError& e) {
                std::cout << "\033[34m" << " SKIPPED" << "\033[0m" << std::endl;
                ++skipped;
                junit_lines.push_back(" </testcase>\n");
            } catch(test::SkippedTestError& e) {
                std::cout << "\033[34m" << " SKIPPED" << "\033[0m" << std::endl;
                ++skipped;
                junit_lines.push_back(" </testcase>\n");
            } catch(test::AssertionError& e) {
                std::cout << "\033[33m" << " FAILED " << "\033[0m" << std::endl;
                std::cout << " " << e.what() << std::endl;
                if(!e.file.empty()) {
                    std::cout << " " << e.file << ":" << e.line << std::endl;
                    /* Echo the failing source line when the file is readable */
                    std::ifstream ifs(e.file);
                    if(ifs.good()) {
                        std::string buffer;
                        std::vector<std::string> lines;
                        while(std::getline(ifs, buffer)) {
                            lines.push_back(buffer);
                        }
                        int line_count = lines.size();
                        if(line_count && e.line <= line_count) {
                            std::cout << lines.at(e.line - 1) << std::endl << std::endl;
                        }
                    }
                }
                ++failed;
                junit_lines.push_back(" <failure message=\"" + std::string(e.what()) + "\"/>\n");
                junit_lines.push_back(" </testcase>\n");
            } catch(std::exception& e) {
                std::cout << "\033[31m" << " EXCEPT " << std::endl;
                std::cout << " " << e.what() << "\033[0m" << std::endl;
                ++crashed;
                junit_lines.push_back(" <failure message=\"" + std::string(e.what()) + "\"/>\n");
                junit_lines.push_back(" </testcase>\n");
            }
            std::cout << "\033[0m";
            ++ran;

            if(close_klass) {
                junit_lines.push_back(" </testsuite>\n");
            }
        }

        junit_lines.push_back("</testsuites>\n");

        if(!junit_output.empty()) {
            FILE* f = fopen(junit_output.c_str(), "wt");
            if(f) {
                for(auto& line: junit_lines) {
                    fwrite(line.c_str(), sizeof(char), line.length(), f);
                }
                /* FIX: fclose() must only run when fopen() succeeded.
                 * Previously it was outside this if-block, so a failed
                 * fopen() passed NULL to fclose() — undefined behaviour. */
                fclose(f);
            }
        }

        std::cout << "-----------------------" << std::endl;
        if(!failed && !crashed && !skipped) {
            std::cout << "All tests passed" << std::endl << std::endl;
        } else {
            if(skipped) {
                std::cout << skipped << " tests skipped";
            }
            if(failed) {
                if(skipped) {
                    std::cout << ", ";
                }
                std::cout << failed << " tests failed";
            }
            if(crashed) {
                if(failed) {
                    std::cout << ", ";
                }
                std::cout << crashed << " tests crashed";
            }
            std::cout << std::endl << std::endl;
        }

        return failed + crashed;
    }

private:
    std::vector<std::shared_ptr<TestCase>> instances_;
    std::vector<std::function<void()> > tests_;
    std::vector<std::string> names_;
};
} // test

212
tools/test_generator.py Executable file
View File

@ -0,0 +1,212 @@
#!/usr/bin/env python3
import argparse
import re
import sys
# Command-line interface: --output is the generated C++ source file,
# positional arguments are the C++ test files to scan.
parser = argparse.ArgumentParser(description="Generate C++ unit tests")
parser.add_argument("--output", type=str, nargs=1, help="The output source file for the generated test main()", required=True)
parser.add_argument("test_files", type=str, nargs="+", help="The list of C++ files containing your tests")
parser.add_argument("--verbose", help="Verbose logging", action="store_true", default=False)

# Matches a C++ class declaration header such as
# "class Foo : public Bar, private Baz"; group 1 is the class name and the
# optional base-class list follows the ':'.
CLASS_REGEX = r"\s*class\s+(\w+)\s*([\:|,]\s*(?:public|private|protected)\s+[\w|::]+\s*)*"

# Matches a zero-argument test method: "void test_xxx()" / "void test_xxx(void)".
TEST_FUNC_REGEX = r"void\s+(?P<func_name>test_\S[^\(]+)\(\s*(void)?\s*\)"

# One generated #include per source file that contains at least one test case.
INCLUDE_TEMPLATE = "#include \"%(file_path)s\""

# Registers one test-case class with the runner (filled per class in main()).
REGISTER_TEMPLATE = """
runner->register_case<%(class_name)s>(
std::vector<void (%(class_name)s::*)()>({%(members)s}),
{%(names)s}
);"""

# Skeleton of the generated C++ entry point. %(includes)s and
# %(registrations)s are substituted in main(); parse_args() implements a
# tiny --key=value / --key value / positional argument parser.
MAIN_TEMPLATE = """
#include <functional>
#include <memory>
#include <map>
#include "tools/test.h"
%(includes)s
std::map<std::string, std::string> parse_args(int argc, char* argv[]) {
std::map<std::string, std::string> ret;
for(int i = 1; i < argc; ++i) {
std::string arg = argv[i];
auto eq = arg.find('=');
if(eq != std::string::npos && arg[0] == '-' && arg[1] == '-') {
auto key = std::string(arg.begin(), arg.begin() + eq);
auto value = std::string(arg.begin() + eq + 1, arg.end());
ret[key] = value;
} else if(arg[0] == '-' && arg[1] == '-') {
auto key = arg;
if(i < (argc - 1)) {
auto value = argv[++i];
ret[key] = value;
} else {
ret[key] = "";
}
} else {
ret[arg] = ""; // Positional, not key=value
}
}
return ret;
}
int main(int argc, char* argv[]) {
auto runner = std::make_shared<test::TestRunner>();
auto args = parse_args(argc, argv);
std::string junit_xml;
auto junit_xml_it = args.find("--junit-xml");
if(junit_xml_it != args.end()) {
junit_xml = junit_xml_it->second;
std::cout << " Outputting junit XML to: " << junit_xml << std::endl;
args.erase(junit_xml_it);
}
std::string test_case;
if(args.size()) {
test_case = args.begin()->first;
}
%(registrations)s
return runner->run(test_case, junit_xml);
}
"""

# Set from --verbose in main(); read by log_verbose().
VERBOSE = False
def log_verbose(message):
    """Print *message* to stdout, but only when verbose mode is enabled."""
    if not VERBOSE:
        return
    print(message)
def find_tests(files):
    """Scan C++ source files for TestCase subclasses and their test methods.

    Returns a list of (path, class_name, parent_names, test_func_names)
    tuples, one per class that (transitively) derives from TestCase or
    SimulantTestCase.
    """
    subclasses = []

    # First pass, find all class definitions
    for path in files:
        with open(path, "rt") as f:
            # Flatten the file onto one line so the regexes can match
            # declarations wrapped across several lines.
            source_file_data = f.read().replace("\r\n", "").replace("\n", "")

        while True:
            match = re.search(CLASS_REGEX, source_file_data)
            if not match:
                break

            class_name = match.group().split(":")[0].replace("class", "").strip()

            try:
                parents = match.group().split(":", 1)[1]
            except IndexError:
                # No base-class list: cannot be a TestCase subclass, so the
                # class is not recorded at all.
                pass
            else:
                # Strip access specifiers, leaving bare base-class names.
                parents = [ x.strip() for x in parents.split(",") ]
                parents = [
                    x.replace("public", "").replace("private", "").replace("protected", "").strip()
                    for x in parents
                ]

                subclasses.append((path, class_name, parents, []))
                log_verbose("Found: %s" % str(subclasses[-1]))

            start = match.end()

            # Find the next opening brace
            while source_file_data[start] in (' ', '\t'):
                start += 1
            start -= 1
            end = start

            if source_file_data[start+1] == '{':
                # Capture the class body by counting braces until the
                # opening one is balanced again.
                class_data = []
                brace_counter = 1
                for i in range(start+2, len(source_file_data)):
                    class_data.append(source_file_data[i])
                    if class_data[-1] == '{': brace_counter += 1
                    if class_data[-1] == '}': brace_counter -= 1
                    if not brace_counter:
                        end = i
                        break
                class_data = "".join(class_data)

                # Collect every "void test_*()" method inside the body.
                # NOTE(review): methods found in a class with no base-class
                # list are appended to the most recently recorded class —
                # confirm this is intended.
                while True:
                    match = re.search(TEST_FUNC_REGEX, class_data)
                    if not match:
                        break
                    subclasses[-1][-1].append(match.group('func_name'))
                    class_data = class_data[match.end():]

            # Continue scanning after this class.
            source_file_data = source_file_data[end:]

    # Now, simplify the list by finding all potential superclasses, and then keeping any classes
    # that subclass them.
    test_case_subclasses = []

    i = 0
    while i < len(subclasses):
        subclass_names = [x.rsplit("::")[-1] for x in subclasses[i][2]]

        # If this subclasses TestCase, or it subclasses any of the already found testcase subclasses
        # then add it to the list
        if "TestCase" in subclass_names or "SimulantTestCase" in subclass_names or any(x[1] in subclasses[i][2] for x in test_case_subclasses):
            if subclasses[i] not in test_case_subclasses:
                test_case_subclasses.append(subclasses[i])
                i = 0  # Go back to the start, as we may have just found another parent class
                continue
        i += 1

    log_verbose("\n".join([str(x) for x in test_case_subclasses]))
    return test_case_subclasses
def main():
    """Generate the C++ test-runner main() from the given test files.

    Parses the command line, scans the test sources with find_tests(),
    renders the registration boilerplate into MAIN_TEMPLATE and writes the
    result to --output. Returns 0 (used as the process exit status).
    """
    global VERBOSE

    args = parser.parse_args()
    VERBOSE = args.verbose

    testcases = find_tests(args.test_files)

    # One #include per unique source file that contains at least one test case.
    includes = "\n".join([ INCLUDE_TEMPLATE % { 'file_path' : x } for x in set([y[0] for y in testcases]) ])

    # Loop-invariant: hoisted out of the per-class loop below.
    BIND_TEMPLATE = "&%(class_name)s::%(func)s"

    registrations = []
    for path, class_name, superclasses, funcs in testcases:
        members = ", ".join([ BIND_TEMPLATE % { 'class_name' : class_name, 'func' : x } for x in funcs ])
        names = ", ".join([ '"%s::%s"' % (class_name, x) for x in funcs ])
        registrations.append(REGISTER_TEMPLATE % { 'class_name' : class_name, 'members' : members, 'names' : names })

    registrations = "\n".join(registrations)

    final = MAIN_TEMPLATE % {
        'registrations' : registrations,
        'includes' : includes
    }

    # FIX: use a context manager so the output file is flushed and closed
    # deterministically (the original left the handle open to be collected).
    with open(args.output[0], "w") as out:
        out.write(final)

    return 0
if __name__ == '__main__':
    # Propagate main()'s return value as the process exit status.
    sys.exit(main())