Compare commits

..

No commits in common. "master" and "super-clip" have entirely different histories.

60 changed files with 3553 additions and 14160 deletions

2
.gitignore vendored
View File

@ -9,5 +9,3 @@ dc-build.sh
build/*
builddir/*
version.[c|h]
pcbuild/*
dcbuild/*

View File

@ -1,6 +1,5 @@
stages:
- build
- test
build:sh4-gcc:
stage: build
@ -18,28 +17,11 @@ build:sh4-gcc:
build:x86-gcc:
stage: build
image: fedora:38
image: fedora:34
before_script:
- sudo dnf install -y cmake gcc gcc-c++ SDL2.i686 SDL2-devel.x86_64 glibc-devel glibc-devel.i686 SDL2-devel.i686 pkgconf-pkg-config.i686 pkgconf-pkg-config.x86_64
- sudo dnf install -y cmake gcc gcc-c++ SDL2-devel glibc-devel pkgconf-pkg-config glibc-devel.i686 SDL2-devel.i686
script:
- mkdir builddir
- cd builddir
- cmake -DCMAKE_BUILD_TYPE=Release ..
- make
artifacts:
paths:
- builddir/tests/gldc_tests
test:x86-gcc:
stage: test
image: fedora:38
dependencies:
- build:x86-gcc
before_script:
- sudo dnf install -y cmake gcc gcc-c++ SDL2.i686 SDL2-devel glibc-devel pkgconf-pkg-config glibc-devel.i686 SDL2-devel.i686 pkgconf-pkg-config.i686
script:
- cd builddir/tests/
- SDL_VIDEODRIVER=dummy ./gldc_tests --junit-xml=report.xml
artifacts:
reports:
junit: builddir/tests/report.xml

View File

@ -1,8 +1,6 @@
cmake_minimum_required(VERSION 3.9)
cmake_minimum_required(VERSION 3.0)
project(GLdc)
set(CMAKE_VERBOSE_MAKEFILE ON)
# set the default backend
if(PLATFORM_DREAMCAST)
set(BACKEND "kospvr" CACHE STRING "Backend to use")
@ -10,9 +8,6 @@ else()
set(BACKEND "software" CACHE STRING "Backend to use")
endif()
include(CheckIPOSupported)
check_ipo_supported(RESULT FLTO_SUPPORTED OUTPUT FLTO_ERROR)
# List of possible backends
set_property(CACHE BACKEND PROPERTY STRINGS kospvr software)
@ -22,46 +17,16 @@ string(TOUPPER ${BACKEND} BACKEND_UPPER)
add_definitions(-DBACKEND_${BACKEND_UPPER})
set(CMAKE_C_STANDARD 99)
set(CMAKE_CXX_STANDARD 11)
include_directories(include)
if(NOT PLATFORM_DREAMCAST)
set(FIND_LIBRARY_USE_LIB32_PATHS true)
set(FIND_LIBRARY_USE_LIB64_PATHS false)
else()
include(CheckCCompilerFlag)
check_c_compiler_flag("-mfsrra" COMPILER_HAS_FSRRA)
check_c_compiler_flag("-mfsca" COMPILER_HAS_FSCA)
if(COMPILER_HAS_FSRRA)
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -mfsrra")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfsrra")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -mfsrra")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -mfsrra")
endif()
if(COMPILER_HAS_FSCA)
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -mfsca")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfsca")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -mfsca")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -mfsca")
endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -ffp-contract=fast -ffast-math")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ffast-math")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -ffp-contract=fast -ffast-math")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -ffast-math")
endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -fexpensive-optimizations -fomit-frame-pointer -finline-functions")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++14 -O3 -g0 -s -fomit-frame-pointer -fstrict-aliasing")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 -fexpensive-optimizations -fomit-frame-pointer -finline-functions")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -std=c++14 -O3 -fomit-frame-pointer -fstrict-aliasing")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 --fast-math -fexpensive-optimizations -funroll-all-loops")
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O0 -g -Wall -Wextra")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g -Wall -Wextra")
set(
SOURCES
@ -80,7 +45,7 @@ set(
GL/state.c
GL/texture.c
GL/util.c
GL/alloc/alloc.c
GL/yalloc/yalloc.c
${CMAKE_CURRENT_BINARY_DIR}/version.c
)
@ -111,10 +76,6 @@ endif()
add_library(GLdc STATIC ${SOURCES})
if(FLTO_SUPPORTED)
set_property(TARGET GLdc PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()
if(NOT PLATFORM_DREAMCAST)
set_target_properties(GLdc PROPERTIES
COMPILE_OPTIONS "-m32"
@ -138,13 +99,6 @@ function(gen_sample sample)
add_executable(${sample} ${SAMPLE_SRCS})
if(FLTO_SUPPORTED)
# FIXME: Cubes + LTO causes an ICE
if(NOT ${sample} MATCHES "cubes")
set_property(TARGET ${sample} PROPERTY INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()
endif()
if(PLATFORM_DREAMCAST)
if(EXISTS "${CMAKE_SOURCE_DIR}/samples/${sample}/romdisk")
message("Generating romdisk for sample: ${sample}")
@ -175,8 +129,6 @@ function(gen_sample sample)
endif()
endfunction()
add_subdirectory(tests)
gen_sample(blend_test samples/blend_test/main.c)
gen_sample(depth_funcs samples/depth_funcs/main.c)
gen_sample(depth_funcs_alpha_testing samples/depth_funcs_alpha_testing/main.c samples/depth_funcs_alpha_testing/gl_png.c)
@ -207,14 +159,11 @@ gen_sample(zclip_triangle samples/zclip_triangle/main.c)
gen_sample(zclip_trianglestrip samples/zclip_trianglestrip/main.c)
gen_sample(scissor samples/scissor/main.c)
gen_sample(polymark samples/polymark/main.c)
gen_sample(cubes samples/cubes/main.cpp)
gen_sample(zclip_test tests/zclip/main.cpp)
if(PLATFORM_DREAMCAST)
gen_sample(trimark samples/trimark/main.c)
gen_sample(quadmark samples/quadmark/main.c samples/profiler.c)
gen_sample(prof_texture_upload samples/prof_texture_upload/main.c samples/profiler.c)
else()
gen_sample(quadmark samples/quadmark/main.c)
gen_sample(prof_texture_upload samples/prof_texture_upload/main.c)
endif()

View File

@ -1,534 +0,0 @@
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include "alloc.h"
/* This allocator is designed so that ideally all allocations larger
* than 2k, fall on a 2k boundary. Smaller allocations will
* never cross a 2k boundary.
*
* House keeping is stored in RAM to avoid reading back from the
* VRAM to check for usage. Headers can't be easily stored in the
* blocks anyway as they have to be 2k aligned (so you'd need to
* store them in reverse or something)
*
* Defragmenting the pool will move larger allocations first, then
* smaller ones, recursively until you tell it to stop, or until things
* stop moving.
*
* The maximum pool size is 8M, made up of:
*
* - 4096 blocks of 2k
* - each with 8 sub-blocks of 256 bytes
*
* Why?
*
* The PVR performs better if textures don't cross 2K memory
* addresses, so we try to avoid that. Obviously we can't
* if the allocation is > 2k, but in that case we can at least
* align with 2k and the VQ codebook (which is usually 2k) will
* be in its own page.
*
* The smallest PVR texture allowed is 8x8 at 16 bit (so 128 bytes)
* but we're unlikely to use too many of those, so having a min sub-block
* size of 256 should be OK (a 16x16 image is 512, so two sub-blocks).
*
* We could go down to 128 bytes if wastage is an issue, but then we have
* to store double the number of usage markers.
*
* FIXME:
*
* - Only operates on one pool (ignores what you pass)
*/
#include <assert.h>
#include <stdio.h>
#define EIGHT_MEG (8 * 1024 * 1024)
#define TWO_KILOBYTES (2 * 1024)
#define BLOCK_COUNT (EIGHT_MEG / TWO_KILOBYTES)
#define ALLOC_DEBUG 0
#if ALLOC_DEBUG
#define DBG_MSG(fmt, ...) fprintf(stderr, fmt, ##__VA_ARGS__)
#else
#define DBG_MSG(fmt, ...) do {} while (0)
#endif
/* Round n up to the nearest multiple of `multiple`.
 * Values already on a boundary are returned unchanged.
 *
 * FIX: the divisor guard used to sit *after* the `n % multiple` check, so a
 * zero `multiple` hit undefined behaviour (division by zero) before the
 * assert could fire. Validate first. */
static inline intptr_t round_up(intptr_t n, int multiple)
{
    assert(multiple);

    if((n % multiple) == 0) {
        return n;
    }

    /* Add (multiple - 1) then truncate back down to the boundary. */
    return ((n + multiple - 1) / multiple) * multiple;
}
/* Singly-linked list node recording one live allocation.
 * The list is kept ordered by size, descending (see alloc_malloc), so the
 * defragmenter can move large allocations before small ones. Nodes live in
 * regular RAM (malloc), never inside the managed pool. */
struct AllocEntry {
    void* pointer;           /* Address handed back to the caller */
    size_t size;             /* Requested size in bytes (not rounded up) */
    struct AllocEntry* next; /* Next entry, or NULL at the tail */
};
/* All allocator house-keeping. Kept in regular RAM so usage can be checked
 * without ever reading back from (slow) pool memory. */
typedef struct {
    /* This is a usage bitmask for each block. A block
     * is divided into 8 x 256 byte subblocks. If a block
     * is entirely used, it's value will be 255, if
     * it's entirely free then it will be 0.
     */
    uint8_t block_usage[BLOCK_COUNT];
    uint8_t* pool;          // Pointer to the memory pool
    size_t pool_size;       // Size of the memory pool
    uint8_t* base_address;  // First 2k aligned address in the pool
    size_t block_count;     // Number of 2k blocks in the pool

    /* It's frustrating that we need to do this dynamically
     * but we need to know the size allocated when we free()...
     * we could store it statically but it would take 64k if we had
     * an array of block_index -> block size where there would be 2 ** 32
     * entries of 16 bit block sizes. The drawback (aside the memory usage)
     * would be that we won't be able to order by size, so defragging will
     * take much more time.*/
    struct AllocEntry* allocations;
} PoolHeader;
/* The single global pool instance. The `pool` argument on the public API is
 * currently accepted but ignored (see the FIXME at the top of this file). */
static PoolHeader pool_header = {
    {0}, NULL, 0, NULL, 0, NULL
};
/* Return the first 2048-aligned address inside the initialised pool.
 * `pool` is ignored (single global pool). */
void* alloc_base_address(void* pool) {
    (void) pool;
    return pool_header.base_address;
}
/* Return the number of usable 2048-byte blocks in the pool.
 * `pool` is ignored (single global pool). */
size_t alloc_block_count(void* pool) {
    (void) pool;
    return pool_header.block_count;
}
/* Convert a position in the usage bitmap into a pool address.
 *
 * `block_usage_iterator` points at the bitmap byte containing the *last*
 * free subblock of a found run, `bit_offset` is that subblock's bit index
 * within the byte (0 = MSB = first subblock of the 2k block). The start of
 * the run is recovered by stepping back `required_subblocks - 1` subblocks.
 * The starting subblock index is optionally reported via
 * `start_subblock_out`. */
static inline void* calc_address(
    uint8_t* block_usage_iterator,
    int bit_offset,
    size_t required_subblocks,
    size_t* start_subblock_out
) {
    /* Global subblock index of this bitmap byte's first subblock. */
    uintptr_t offset = (block_usage_iterator - pool_header.block_usage) * 8;
    offset += (bit_offset + 1);   /* one past the run's final subblock */
    offset -= required_subblocks; /* back up to the run's first subblock */

    if(start_subblock_out) {
        *start_subblock_out = offset;
    }

    /* Each subblock is 256 bytes. */
    return pool_header.base_address + (offset * 256);
}
/* Forward declaration: the _ex variant also reports where the space was found. */
void* alloc_next_available_ex(void* pool, size_t required_size, size_t* start_subblock, size_t* required_subblocks);

/* Find (without claiming) the next available run of space that can hold
 * `required_size` bytes. Returns NULL when nothing fits. */
void* alloc_next_available(void* pool, size_t required_size) {
    return alloc_next_available_ex(pool, required_size, NULL, NULL);
}
/* Scan the usage bitmap for a run of free 256-byte subblocks large enough
 * for `required_size` bytes.
 *
 * Placement rules (see the header comment at the top of this file):
 *  - allocations >= 2048 bytes must start on a 2048 boundary;
 *  - smaller allocations must not straddle a 2048 boundary.
 * A run that fits but violates these rules is remembered as a fallback
 * ("poor option") and returned only if no well-placed run exists.
 *
 * Outputs (both optional): `start_subblock_out` receives the run's first
 * subblock index; `required_subblocks_out` receives the rounded-up subblock
 * count. Returns the candidate address, or NULL if the pool has no run big
 * enough. The bitmap is NOT modified. `pool` is ignored. */
void* alloc_next_available_ex(void* pool, size_t required_size, size_t* start_subblock_out, size_t* required_subblocks_out) {
    (void) pool;

    uint8_t* it = pool_header.block_usage;

    /* Round the byte size up to whole 256-byte subblocks. */
    uint32_t required_subblocks = (required_size / 256);
    if(required_size % 256) required_subblocks += 1;

    /* Anything gte to 2048 must be aligned to a 2048 boundary */
    bool requires_alignment = required_size >= 2048;

    if(required_subblocks_out) {
        *required_subblocks_out = required_subblocks;
    }

    /* This is a fallback option. If while we're searching we find a possible slot
     * but it's not aligned, or it's straddling a 2k boundary, then we store
     * it here and if we reach the end of the search and find nothing better
     * we use this instead */
    uint8_t* poor_option = NULL;
    size_t poor_start_subblock = 0;

    uint32_t found_subblocks = 0;
    uint32_t found_poor_subblocks = 0;

    for(size_t j = 0; j < pool_header.block_count; ++j, ++it) {
        /* We just need to find enough consecutive blocks */
        if(found_subblocks < required_subblocks) {
            uint8_t t = *it;

            /* Optimisation only. Skip over full blocks */
            if(t == 255) {
                found_subblocks = 0;
                found_poor_subblocks = 0;
            } else {
                /* Now let's see how many consecutive blocks we can find */
                for(int i = 0; i < 8; ++i) {
                    /* Bit 7 is the block's first subblock; a clear bit means free. */
                    if((t & 0x80) == 0) {
                        /* A small allocation's run would cross into a new 2k
                         * block here: it must restart at this boundary. */
                        bool block_overflow = (
                            required_size < 2048 && found_subblocks > 0 && i == 0
                        );
                        bool reset_subblocks = (
                            (requires_alignment && found_subblocks == 0 && i != 0) ||
                            block_overflow
                        );

                        if(reset_subblocks) {
                            // Ignore this subblock, because we want the first subblock to be aligned
                            // at a 2048 boundary and this one isn't (i != 0)
                            found_subblocks = 0;
                        } else {
                            found_subblocks++;
                        }

                        /* If we reset the subblocks due to an overflow, we still
                         * want to count this free subblock in our count */
                        if(block_overflow) {
                            found_subblocks++;
                        }

                        /* The "poor" count ignores alignment rules entirely. */
                        found_poor_subblocks++;

                        if(found_subblocks >= required_subblocks) {
                            /* We found space! Now calculate the address */
                            return calc_address(it, i, required_subblocks, start_subblock_out);
                        }

                        if(!poor_option && (found_poor_subblocks >= required_subblocks)) {
                            poor_option = calc_address(it, i, required_subblocks, &poor_start_subblock);
                        }
                    } else {
                        /* Used subblock breaks both runs. */
                        found_subblocks = 0;
                        found_poor_subblocks = 0;
                    }

                    /* Shift the next subblock's flag into bit 7. */
                    t <<= 1;
                }
            }
        }
    }

    if(poor_option) {
        if(start_subblock_out) {
            *start_subblock_out = poor_start_subblock;
        }
        return poor_option;
    } else {
        return NULL;
    }
}
/* Initialise the (single, global) allocator over `pool`.
 *
 * Returns 0 on success, -1 if `pool` is NULL, if the allocator is already
 * initialised, or if `size` exceeds the 8M maximum.
 *
 * FIX: reject a NULL pool up front — previously a NULL pool was accepted
 * and left the header pointing at rounded-up garbage. */
int alloc_init(void* pool, size_t size) {
    if(!pool) {
        return -1;
    }

    if(pool_header.pool) {
        return -1;
    }

    /* `>` (not `>=`, resolving the old FIXME) is correct: block_count is
     * derived below from the usable region at/after the first 2048-aligned
     * address, so even a full 8M pool yields at most BLOCK_COUNT blocks. */
    if(size > EIGHT_MEG) {
        return -1;
    }

    uint8_t* p = (uint8_t*) pool;

    memset(pool_header.block_usage, 0, BLOCK_COUNT);
    pool_header.pool = pool;
    pool_header.pool_size = size;

    /* Usable blocks start at the first 2048-aligned address. */
    intptr_t base_address = (intptr_t) pool_header.pool;
    base_address = round_up(base_address, 2048);

    pool_header.base_address = (uint8_t*) base_address;
    pool_header.block_count = ((p + size) - pool_header.base_address) / 2048;
    pool_header.allocations = NULL;

    assert(((uintptr_t) pool_header.base_address) % 2048 == 0);

    return 0;
}
/* Tear down the allocator, releasing all house-keeping nodes. Safe to call
 * when not initialised. The pool memory itself belongs to the caller and is
 * untouched. `pool` is ignored (single global pool). */
void alloc_shutdown(void* pool) {
    (void) pool;

    if(!pool_header.pool) {
        return;
    }

    /* Free the allocation-tracking list (held in regular RAM). */
    struct AllocEntry* it = pool_header.allocations;
    while(it) {
        struct AllocEntry* next = it->next;
        free(it);
        it = next;
    }

    /* Zeroing the whole header also clears `pool`, marking the allocator as
     * uninitialised — the explicit `pool_header.pool = NULL;` that used to
     * follow was dead code and has been removed. */
    memset(&pool_header, 0, sizeof(pool_header));
}
/* Number of 256-byte subblocks needed to hold `size` bytes (ceiling division). */
static inline uint32_t size_to_subblock_count(size_t size) {
    const uint32_t whole = size / 256;
    const int has_remainder = (size % 256) != 0;
    return has_remainder ? (whole + 1) : whole;
}
/* Map a pool address back to its global 256-byte subblock index.
 * `p` must be a pointer previously derived from base_address (i.e. at or
 * above it) or the subtraction underflows. */
static inline uint32_t subblock_from_pointer(void* p) {
    uint8_t* ptr = (uint8_t*) p;
    return (ptr - pool_header.base_address) / 256;
}
/* Split a global subblock index into its 2048-byte block index (*b) and the
 * subblock's position 0-7 within that block (*off). */
static inline void block_and_offset_from_subblock(size_t sb, size_t* b, uint8_t* off) {
    /* 8 subblocks per block, so this is just a divide/modulo by 8,
     * expressed as shifts since 8 is a power of two. */
    *off = (uint8_t)(sb & 7u);
    *b = sb >> 3;
}
/* Allocate `size` bytes from the pool.
 *
 * Finds space via alloc_next_available_ex, marks the covered subblocks used
 * in the bitmap (partial leading block, full middle blocks, partial trailing
 * block), then records the allocation in a list kept sorted by size
 * descending so the defragmenter can move big allocations first.
 * Returns the pool address, or NULL when nothing fits.
 *
 * NOTE(review): the malloc() of the tracking node is not NULL-checked. */
void* alloc_malloc(void* pool, size_t size) {
    DBG_MSG("Allocating: %d\n", size);

    size_t start_subblock, required_subblocks;
    void* ret = alloc_next_available_ex(pool, size, &start_subblock, &required_subblocks);

    if(ret) {
        size_t block;
        uint8_t offset;

        block_and_offset_from_subblock(start_subblock, &block, &offset);

        uint8_t mask = 0;

        DBG_MSG("Alloc: size: %d, rs: %d, sb: %d, b: %d, off: %d\n", size, required_subblocks, start_subblock, start_subblock / 8, start_subblock % 8);

        /* Toggle any bits for the first block */
        /* Placement rules guarantee offset + c <= 8 here: runs >= 2048 bytes
         * start at offset 0, smaller runs never straddle a 2k boundary. */
        int c = (required_subblocks < 8) ? required_subblocks : 8;
        for(int i = 0; i < c; ++i) {
            mask |= (1 << (7 - (offset + i)));  /* bit 7 = first subblock */
            required_subblocks--;
        }

        if(mask) {
            pool_header.block_usage[block++] |= mask;
        }

        /* Fill any full blocks in the middle of the allocation */
        while(required_subblocks > 8) {
            pool_header.block_usage[block++] = 255;
            required_subblocks -= 8;
        }

        /* Fill out any trailing subblocks */
        mask = 0;
        for(size_t i = 0; i < required_subblocks; ++i) {
            mask |= (1 << (7 - i));
        }

        if(mask) {
            pool_header.block_usage[block++] |= mask;
        }

        /* Insert allocations in the list by size descending so that when we
         * defrag we can move the larger blocks before the smaller ones without
         * much effort */
        struct AllocEntry* new_entry = (struct AllocEntry*) malloc(sizeof(struct AllocEntry));
        new_entry->pointer = ret;
        new_entry->size = size;
        new_entry->next = NULL;

        struct AllocEntry* it = pool_header.allocations;
        struct AllocEntry* last = NULL;

        if(!it) {
            /* First allocation ever. */
            pool_header.allocations = new_entry;
        } else {
            while(it) {
                if(it->size < size) {
                    /* Insert before the first strictly-smaller entry. */
                    if(last) {
                        last->next = new_entry;
                    } else {
                        pool_header.allocations = new_entry;
                    }

                    new_entry->next = it;
                    break;
                } else if(!it->next) {
                    /* Smallest so far: append at the tail. */
                    it->next = new_entry;
                    new_entry->next = NULL;
                    break;
                }

                last = it;
                it = it->next;
            }
        }
    }

    DBG_MSG("Alloc done\n");

    return ret;
}
/* Clear the usage bits covering entry `it`'s allocation.
 * Mirrors the marking logic in alloc_malloc exactly: a partial leading
 * block, any number of full middle blocks, then a partial trailing block.
 * Does NOT touch the allocation list — callers unlink/retarget the entry
 * themselves (see alloc_free and alloc_run_defrag). */
static void alloc_release_blocks(struct AllocEntry* it) {
    size_t used_subblocks = size_to_subblock_count(it->size);
    size_t subblock = subblock_from_pointer(it->pointer);
    size_t block;
    uint8_t offset;
    block_and_offset_from_subblock(subblock, &block, &offset);

    uint8_t mask = 0;

    DBG_MSG("Free: size: %d, us: %d, sb: %d, off: %d\n", it->size, used_subblocks, block, offset);

    /* Wipe out any leading subblocks */
    int c = (used_subblocks < 8) ? used_subblocks : 8;
    for(int i = 0; i < c; ++i) {
        mask |= (1 << (7 - (offset + i)));  /* bit 7 = first subblock */
        used_subblocks--;
    }

    if(mask) {
        pool_header.block_usage[block++] &= ~mask;
    }

    /* Clear any full blocks in the middle of the allocation */
    while(used_subblocks > 8) {
        pool_header.block_usage[block++] = 0;
        used_subblocks -= 8;
    }

    /* Wipe out any trailing subblocks */
    mask = 0;
    for(size_t i = 0; i < used_subblocks; ++i) {
        mask |= (1 << (7 - i));
    }

    if(mask) {
        pool_header.block_usage[block++] &= ~mask;
    }
}
/* Release the allocation previously returned for `p`: clears its usage bits
 * and unlinks its tracking node. Pointers not found in the allocation list
 * are silently ignored. `pool` is ignored (single global pool).
 *
 * FIX: the debug trace used to reference `used_subblocks`/`block`/`offset`,
 * which are locals of alloc_release_blocks — ALLOC_DEBUG=1 builds failed to
 * compile. Log only what is in scope here. */
void alloc_free(void* pool, void* p) {
    (void) pool;

    struct AllocEntry* it = pool_header.allocations;
    struct AllocEntry* last = NULL;
    while(it) {
        if(it->pointer == p) {
            alloc_release_blocks(it);

            /* Unlink from the singly-linked allocation list. */
            if(last) {
                last->next = it->next;
            } else {
                assert(it == pool_header.allocations);
                pool_header.allocations = it->next;
            }

            DBG_MSG("Freed: size: %d\n", (int) it->size);

            free(it);
            break;
        }

        last = it;
        it = it->next;
    }

    DBG_MSG("Free done\n");
}
/* Compact the pool: repeatedly move each allocation to the lowest available
 * address, for up to `max_iterations` passes or until a pass moves nothing.
 * `callback(old_addr, new_addr, user_data)` is invoked for every move so the
 * caller can fix up its pointers.
 *
 * NOTE(review): alloc_malloc() inserts a *new* AllocEntry for the
 * destination while the existing entry is kept and re-pointed below — this
 * looks like it leaves two list entries describing the same address; verify
 * free() behaviour after a defrag.
 * NOTE(review): the alloc_malloc() result is not NULL-checked before the
 * memcpy. */
void alloc_run_defrag(void* pool, defrag_address_move callback, int max_iterations, void* user_data) {
    for(int i = 0; i < max_iterations; ++i) {
        bool move_occurred = false;

        struct AllocEntry* it = pool_header.allocations;

        if(!it) {
            /* Nothing allocated: nothing to compact. */
            return;
        }

        while(it) {
            void* potential_dest = alloc_next_available(pool, it->size);
            /* Only worth moving if the candidate is earlier in the pool. */
            if(potential_dest < it->pointer) {
                potential_dest = alloc_malloc(pool, it->size);
                memcpy(potential_dest, it->pointer, it->size);

                /* Mark this block as now free, but don't fiddle with the
                 * allocation list */
                alloc_release_blocks(it);

                callback(it->pointer, potential_dest, user_data);

                it->pointer = potential_dest;
                move_occurred = true;
            }

            it = it->next;
        }

        if(!move_occurred) {
            /* Stable: no pass will move anything further. */
            return;
        }
    }
}
/* Population count of a byte via a 4-bit nibble lookup table. */
static inline uint8_t count_ones(uint8_t byte) {
    static const uint8_t NIBBLE_LOOKUP[16] = {
        0, 1, 1, 2, 1, 2, 2, 3,
        1, 2, 2, 3, 2, 3, 3, 4
    };

    const uint8_t low_nibble = byte & 0x0F;
    const uint8_t high_nibble = byte >> 4;

    return NIBBLE_LOOKUP[low_nibble] + NIBBLE_LOOKUP[high_nibble];
}
/* Return the total number of free bytes in the pool.
 *
 * FIX: a set bit in block_usage marks a *used* subblock (PoolHeader doc:
 * fully-used block == 255; the free-test in alloc_next_available_ex is
 * `(t & 0x80) == 0`), yet this function summed count_ones(*it) — i.e. it
 * reported *used* bytes. Count the clear bits instead. */
size_t alloc_count_free(void* pool) {
    (void) pool;

    uint8_t* it = pool_header.block_usage;
    uint8_t* end = it + pool_header.block_count;

    size_t total_free = 0;

    while(it < end) {
        /* 8 subblocks of 256 bytes per block; clear bits are free. */
        total_free += (8 - count_ones(*it)) * 256;
        ++it;
    }

    return total_free;
}
/* Return the size in bytes of the largest contiguous free region found in
 * the usage bitmap.
 *
 * NOTE(review): the partially-used branch looks suspect — `(7 - i) * 256`
 * adds the subblocks *preceding* bit i whether or not they were free, free
 * subblocks after the last set bit of a byte never carry into the next
 * byte's run, and `current_block` is reset only when a new maximum is
 * recorded. Verify against a known layout before trusting the result. */
size_t alloc_count_continuous(void* pool) {
    (void) pool;

    size_t largest_block = 0;

    uint8_t* it = pool_header.block_usage;
    uint8_t* end = it + pool_header.block_count;
    size_t current_block = 0;
    while(it < end) {
        uint8_t t = *it++;
        if(!t) {
            /* Whole 2k block free: extend the current run. */
            current_block += 2048;
        } else {
            /* Walk subblocks in order (bit 7 = first subblock). */
            for(int i = 7; i >= 0; --i) {
                bool bitset = (t & (1 << i));
                if(bitset) {
                    current_block += (7 - i) * 256;
                    if(largest_block < current_block) {
                        largest_block = current_block;
                        current_block = 0;
                    }
                }
            }
        }
    }
    return largest_block;
}

View File

@ -1,29 +0,0 @@
#pragma once

/* Public interface of the 2k-aligned pool allocator (see alloc.c).
 * NOTE: the implementation currently manages a single global pool; the
 * `pool` parameter is accepted but ignored by the implementation. */

#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Initialise the allocator over `pool` (8M maximum). Returns 0 on success,
 * -1 on failure. */
int alloc_init(void* pool, size_t size);
/* Release all internal book-keeping; the pool memory itself is untouched. */
void alloc_shutdown(void* pool);

/* Allocate / free memory inside the pool. */
void *alloc_malloc(void* pool, size_t size);
void alloc_free(void* pool, void* p);

/* Callback invoked once per moved allocation: (old_addr, new_addr, user_data). */
typedef void (defrag_address_move)(void*, void*, void*);
/* Compact the pool, running at most `max_iterations` passes. */
void alloc_run_defrag(void* pool, defrag_address_move callback, int max_iterations, void* user_data);

/* Introspection helpers. */
size_t alloc_count_free(void* pool);
size_t alloc_count_continuous(void* pool);
void* alloc_next_available(void* pool, size_t required_size);
void* alloc_base_address(void* pool);
size_t alloc_block_count(void* pool);

#ifdef __cplusplus
}
#endif

592
GL/draw.c
View File

@ -3,48 +3,20 @@
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <limits.h>
#include <assert.h>
#include "private.h"
#include "platform.h"
/* Convert a 32-bit float to an IEEE-754 half-float (binary16) bit pattern.
 * Values too small for a normal half flush to zero, overflow maps to
 * +/-infinity (0x7c00) and NaN maps to 0x7e00, via the branchless clamps. */
GLushort _quantize( GLfloat v ) {
    /* Type-pun the float to its raw bits. */
    union { GLfloat f; GLuint ui; } u = {v};
    GLuint ui = u.ui;

    int s = (ui >> 16) & 0x8000;  /* sign bit, moved to the half position */
    int em = ui & 0x7fffffff;     /* exponent + mantissa, sign stripped */

    /* Re-bias the exponent (127 -> 15) and shift the mantissa down,
     * rounding via the (1 << 12) addend. */
    int h = (em - (112 << 23) + (1 << 12)) >> 13;
    h = (em < (113 << 23)) ? 0 : h;       /* too small: flush to zero */
    h = (em >= (143 << 23)) ? 0x7c00 : h; /* overflow: +/- infinity */
    h = (em > (255 << 23)) ? 0x7e00 : h;  /* NaN */

    return (GLushort)(s | h);
}
/* Convert an IEEE-754 half-float (binary16) bit pattern back to a 32-bit
 * float. Half denormals flush to zero; inf/NaN are widened to the float
 * exponent range. */
GLfloat _dequantize( GLushort h ) {
    GLuint s = (GLuint) (h & 0x8000) << 16;  /* sign bit, float position */
    int em = h & 0x7fff;                     /* exponent + mantissa */

    /* Re-bias the exponent (15 -> 127) and shift the mantissa up. */
    int r = (em + (112 << 10)) << 13;
    r = (em < (1 << 10)) ? 0 : r;              /* denormal/zero: flush to 0 */
    r += (em >= (31 << 10)) ? (112 << 23) : 0; /* inf/NaN: exponent to 255 */

    /* Type-pun the assembled bits back to a float. */
    union { GLfloat f; GLuint ui; } u;
    u.ui = s | r;
    return u.f;
}
AttribPointerList ATTRIB_POINTERS;
GLuint ENABLED_VERTEX_ATTRIBUTES = 0;
GLuint FAST_PATH_ENABLED = GL_FALSE;
static GLubyte ACTIVE_CLIENT_TEXTURE = 0;
static const float ONE_OVER_TWO_FIVE_FIVE = 1.0f / 255.0f;
extern inline GLuint _glRecalcFastPath();
extern GLboolean AUTOSORT_ENABLED;
#define ITERATE(count) \
GLuint i = count; \
while(i--)
@ -88,7 +60,6 @@ GL_FORCE_INLINE GLsizei byte_size(GLenum type) {
case GL_INT: return sizeof(GLint);
case GL_UNSIGNED_INT: return sizeof(GLuint);
case GL_DOUBLE: return sizeof(GLdouble);
case GL_HALF_FLOAT: return sizeof(GLhalf);
case GL_UNSIGNED_INT_2_10_10_10_REV: return sizeof(GLuint);
case GL_FLOAT:
default: return sizeof(GLfloat);
@ -105,7 +76,7 @@ static void _readVertexData3f3f(const GLubyte* __restrict__ in, GLubyte* __restr
// 10:10:10:2REV format
static void _readVertexData1i3f(const GLubyte* in, GLubyte* out) {
static const float MULTIPLIER = 1.0f / 1023.0f;
const static float MULTIPLIER = 1.0f / 1023.0f;
GLfloat* output = (GLfloat*) out;
@ -135,15 +106,6 @@ static void _readVertexData3us3f(const GLubyte* in, GLubyte* out) {
output[2] = input[2];
}
/* Expand three quantized (half-float bit pattern) ushorts from `in` into a
 * 3-component float vector at `out`. */
static void _readVertexData3usq3f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    float* dst = (float*) out;

    for(int i = 0; i < 3; ++i) {
        dst[i] = _dequantize(src[i]);
    }
}
static void _readVertexData3ui3f(const GLubyte* in, GLubyte* out) {
const GLuint* input = (const GLuint*) in;
float* output = (float*) out;
@ -155,6 +117,8 @@ static void _readVertexData3ui3f(const GLubyte* in, GLubyte* out) {
static void _readVertexData3ub3f(const GLubyte* input, GLubyte* out) {
const float ONE_OVER_TWO_FIVE_FIVE = 1.0f / 255.0f;
float* output = (float*) out;
output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE;
@ -162,15 +126,6 @@ static void _readVertexData3ub3f(const GLubyte* input, GLubyte* out) {
output[2] = input[2] * ONE_OVER_TWO_FIVE_FIVE;
}
/* Read three GLhalf components from `in` into floats at `out`.
 * NOTE(review): this assigns the GLhalf values directly; if GLhalf is an
 * integer bit-pattern type (cf. _readVertexData3usq3f, which decodes via
 * _dequantize) this copies raw bits as numbers instead of decoding them —
 * confirm GLhalf's definition before relying on this path. */
static void _readVertexData3f16_3f(const GLubyte* in, GLubyte* out) {
    const GLhalf* input = (const GLhalf*) in;
    float* output = (float*) out;
    output[0] = input[0];
    output[1] = input[1];
    output[2] = input[2];
}
/* Copy a 2-component float attribute straight through (vec2cpy is a
 * project helper; presumably a plain two-float copy — confirm in private.h). */
static void _readVertexData2f2f(const GLubyte* in, GLubyte* out) {
    vec2cpy(out, in);
}
@ -184,6 +139,8 @@ static void _readVertexData2f3f(const GLubyte* in, GLubyte* out) {
}
static void _readVertexData2ub3f(const GLubyte* input, GLubyte* out) {
const float ONE_OVER_TWO_FIVE_FIVE = 1.0f / 255.0f;
float* output = (float*) out;
output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE;
@ -204,25 +161,8 @@ static void _readVertexData2us2f(const GLubyte* in, GLubyte* out) {
const GLushort* input = (const GLushort*) in;
float* output = (float*) out;
output[0] = (float)input[0] / SHRT_MAX;
output[1] = (float)input[1] / SHRT_MAX;
}
/* Expand two quantized (half-float bit pattern) ushorts from `in` into a
 * 3-component float vector at `out`, zero-filling the third component. */
static void _readVertexData2usq3f(const GLubyte* in, GLubyte* out) {
    const GLushort* src = (const GLushort*) in;
    float* dst = (float*) out;

    dst[0] = _dequantize(src[0]);
    dst[1] = _dequantize(src[1]);
    dst[2] = 0.0f;
}
static void _readVertexData2usq2f(const GLubyte* in, GLubyte* out) {
const GLushort* input = (const GLushort*) in;
float* output = (float*) out;
output[0] = _dequantize(input[0]);
output[1] = _dequantize(input[1]);
output[0] = input[0];
output[1] = input[1];
}
static void _readVertexData2ui2f(const GLubyte* in, GLubyte* out) {
@ -234,20 +174,13 @@ static void _readVertexData2ui2f(const GLubyte* in, GLubyte* out) {
}
static void _readVertexData2ub2f(const GLubyte* input, GLubyte* out) {
const float ONE_OVER_TWO_FIVE_FIVE = 1.0f / 255.0f;
float* output = (float*) out;
output[0] = input[0] * ONE_OVER_TWO_FIVE_FIVE;
output[1] = input[1] * ONE_OVER_TWO_FIVE_FIVE;
}
/* Read two GLhalf components from `in` into floats at `out`.
 * NOTE(review): assigns GLhalf values directly without _dequantize — see
 * the matching note on _readVertexData3f16_3f; confirm GLhalf's type. */
static void _readVertexData2f16_2f(const GLubyte* in, GLubyte* out) {
    const GLhalf* input = (const GLhalf*) in;
    float* output = (float*) out;
    output[0] = input[0];
    output[1] = input[1];
}
static void _readVertexData2ui3f(const GLubyte* in, GLubyte* out) {
const GLuint* input = (const GLuint*) in;
float* output = (float*) out;
@ -257,15 +190,6 @@ static void _readVertexData2ui3f(const GLubyte* in, GLubyte* out) {
output[2] = 0.0f;
}
/* Read two GLhalf components from `in` into a 3-component float vector at
 * `out`, zero-filling the third component.
 * NOTE(review): assigns GLhalf values directly without _dequantize — see
 * the matching note on _readVertexData3f16_3f; confirm GLhalf's type. */
static void _readVertexData2f16_3f(const GLubyte* in, GLubyte* out) {
    const GLhalf* input = (const GLhalf*) in;
    float* output = (float*) out;
    output[0] = input[0];
    output[1] = input[1];
    output[2] = 0.0f;
}
static void _readVertexData4ubARGB(const GLubyte* input, GLubyte* output) {
output[R8IDX] = input[0];
output[G8IDX] = input[1];
@ -318,7 +242,7 @@ static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restri
float x, y, z;
} V;
static const V NegZ = {0.0f, 0.0f, -1.0f};
const static V NegZ = {0.0f, 0.0f, -1.0f};
*((V*) out) = NegZ;
}
@ -336,37 +260,37 @@ static void _fillZero2f(const GLubyte* __restrict__ input, GLubyte* __restrict__
static void _readVertexData3usARGB(const GLubyte* input, GLubyte* output) {
_GL_UNUSED(input);
_GL_UNUSED(output);
gl_assert(0 && "Not Implemented");
assert(0 && "Not Implemented");
}
static void _readVertexData3uiARGB(const GLubyte* input, GLubyte* output) {
_GL_UNUSED(input);
_GL_UNUSED(output);
gl_assert(0 && "Not Implemented");
assert(0 && "Not Implemented");
}
static void _readVertexData4usARGB(const GLubyte* input, GLubyte* output) {
_GL_UNUSED(input);
_GL_UNUSED(output);
gl_assert(0 && "Not Implemented");
assert(0 && "Not Implemented");
}
static void _readVertexData4uiARGB(const GLubyte* input, GLubyte* output) {
_GL_UNUSED(input);
_GL_UNUSED(output);
gl_assert(0 && "Not Implemented");
assert(0 && "Not Implemented");
}
static void _readVertexData4usRevARGB(const GLubyte* input, GLubyte* output) {
_GL_UNUSED(input);
_GL_UNUSED(output);
gl_assert(0 && "Not Implemented");
assert(0 && "Not Implemented");
}
static void _readVertexData4uiRevARGB(const GLubyte* input, GLubyte* output) {
_GL_UNUSED(input);
_GL_UNUSED(output);
gl_assert(0 && "Not Implemented");
assert(0 && "Not Implemented");
}
GLuint* _glGetEnabledAttributes() {
@ -470,12 +394,12 @@ GL_FORCE_INLINE void transformNormalToEyeSpace(GLfloat* normal) {
}
GL_FORCE_INLINE PolyHeader *_glSubmissionTargetHeader(SubmissionTarget* target) {
gl_assert(target->header_offset < aligned_vector_size(&target->output->vector));
assert(target->header_offset < target->output->vector.size);
return aligned_vector_at(&target->output->vector, target->header_offset);
}
GL_INLINE_DEBUG Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
gl_assert(target->start_offset < aligned_vector_size(&target->output->vector));
assert(target->start_offset < target->output->vector.size);
return aligned_vector_at(&target->output->vector, target->start_offset);
}
@ -514,7 +438,7 @@ GL_FORCE_INLINE void genTriangleStrip(Vertex* output, GLuint count) {
}
static void genTriangleFan(Vertex* output, GLuint count) {
gl_assert(count <= 255);
assert(count <= 255);
Vertex* dst = output + (((count - 2) * 3) - 1);
Vertex* src = output + (count - 1);
@ -571,17 +495,14 @@ ReadPositionFunc calcReadPositionFunc() {
case GL_FLOAT:
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3f3f:
_readVertexData2f3f;
case GL_HALF_FLOAT:
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3f16_3f:
_readVertexData2f16_3f;
case GL_BYTE:
case GL_UNSIGNED_BYTE:
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3ub3f:
_readVertexData2ub3f;
case GL_SHORT:
case GL_UNSIGNED_SHORT:
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3usq3f:
_readVertexData2usq3f;
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3us3f:
_readVertexData2us3f;
case GL_INT:
case GL_UNSIGNED_INT:
return (ATTRIB_POINTERS.vertex.size == 3) ? _readVertexData3ui3f:
@ -599,14 +520,12 @@ ReadUVFunc calcReadUVFunc() {
case GL_DOUBLE:
case GL_FLOAT:
return _readVertexData2f2f;
case GL_HALF_FLOAT:
return _readVertexData2f16_2f;
case GL_BYTE:
case GL_UNSIGNED_BYTE:
return _readVertexData2ub2f;
case GL_SHORT:
case GL_UNSIGNED_SHORT:
return _readVertexData2usq2f;
return _readVertexData2us2f;
case GL_INT:
case GL_UNSIGNED_INT:
return _readVertexData2ui2f;
@ -623,14 +542,12 @@ ReadUVFunc calcReadSTFunc() {
case GL_DOUBLE:
case GL_FLOAT:
return _readVertexData2f2f;
case GL_HALF_FLOAT:
return _readVertexData2f16_2f;
case GL_BYTE:
case GL_UNSIGNED_BYTE:
return _readVertexData2ub2f;
case GL_SHORT:
case GL_UNSIGNED_SHORT:
return _readVertexData2usq2f;
return _readVertexData2us2f;
case GL_INT:
case GL_UNSIGNED_INT:
return _readVertexData2ui2f;
@ -647,8 +564,6 @@ ReadNormalFunc calcReadNormalFunc() {
case GL_DOUBLE:
case GL_FLOAT:
return _readVertexData3f3f;
case GL_HALF_FLOAT:
return _readVertexData3f16_3f;
break;
case GL_BYTE:
case GL_UNSIGNED_BYTE:
@ -656,7 +571,7 @@ ReadNormalFunc calcReadNormalFunc() {
break;
case GL_SHORT:
case GL_UNSIGNED_SHORT:
return _readVertexData3usq3f;
return _readVertexData3us3f;
break;
case GL_INT:
case GL_UNSIGNED_INT:
@ -668,57 +583,74 @@ ReadNormalFunc calcReadNormalFunc() {
}
}
static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GLuint count, Vertex* it) {
static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) {
const GLsizei vstride = ATTRIB_POINTERS.vertex.stride;
const GLubyte* vptr = ((GLubyte*) ATTRIB_POINTERS.vertex.ptr + (first * vstride));
float pos[3];
GLubyte* out = (GLubyte*) output[0].xyz;
uint32_t* flags;
ITERATE(count) {
PREFETCH(vptr + vstride);
func(vptr, (GLubyte*) pos);
it->flags = GPU_CMD_VERTEX;
func(vptr, out);
vptr += vstride;
++it;
/* Set the flags which are 4 bytes before the position. Doing it here saves
* an additional loop */
flags = (uint32_t*) out - 1;
*flags = GPU_CMD_VERTEX;
out += sizeof(Vertex);
}
}
static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count, Vertex* it) {
static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count, const Vertex* output) {
const GLsizei uvstride = ATTRIB_POINTERS.uv.stride;
const GLubyte* uvptr = ((GLubyte*) ATTRIB_POINTERS.uv.ptr + (first * uvstride));
GLubyte* out = (GLubyte*) output[0].uv;
ITERATE(count) {
PREFETCH(uvptr + uvstride);
func(uvptr, (GLubyte*) it->uv);
func(uvptr, out);
uvptr += uvstride;
++it;
out += sizeof(Vertex);
}
}
static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count, VertexExtra* it) {
static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) {
const GLsizei ststride = ATTRIB_POINTERS.st.stride;
const GLubyte* stptr = ((GLubyte*) ATTRIB_POINTERS.st.ptr + (first * ststride));
GLubyte* out = (GLubyte*) extra[0].st;
ITERATE(count) {
PREFETCH(stptr + ststride);
func(stptr, (GLubyte*) it->st);
func(stptr, out);
stptr += ststride;
++it;
out += sizeof(VertexExtra);
}
}
static void _readNormalData(ReadNormalFunc func, const GLuint first, const GLuint count, VertexExtra* it) {
static void _readNormalData(ReadNormalFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) {
const GLsizei nstride = ATTRIB_POINTERS.normal.stride;
const GLubyte* nptr = ((GLubyte*) ATTRIB_POINTERS.normal.ptr + (first * nstride));
ITERATE(count) {
func(nptr, (GLubyte*) it->nxyz);
nptr += nstride;
GLubyte* out = (GLubyte*) extra[0].nxyz;
if(_glIsNormalizeEnabled()) {
GLfloat* n = (GLfloat*) it->nxyz;
ITERATE(count) {
func(nptr, out);
nptr += nstride;
out += sizeof(VertexExtra);
}
if(_glIsNormalizeEnabled()) {
GLubyte* ptr = (GLubyte*) extra->nxyz;
ITERATE(count) {
GLfloat* n = (GLfloat*) ptr;
float temp = n[0] * n[0];
temp = MATH_fmac(n[1], n[1], temp);
temp = MATH_fmac(n[2], n[2], temp);
@ -727,9 +659,9 @@ static void _readNormalData(ReadNormalFunc func, const GLuint first, const GLuin
n[0] *= ilength;
n[1] *= ilength;
n[2] *= ilength;
}
++it;
ptr += sizeof(VertexExtra);
}
}
}
@ -737,15 +669,18 @@ GL_FORCE_INLINE GLuint diffusePointerSize() {
return (ATTRIB_POINTERS.colour.size == GL_BGRA) ? 4 : ATTRIB_POINTERS.colour.size;
}
static void _readDiffuseData(ReadDiffuseFunc func, const GLuint first, const GLuint count, Vertex* it) {
static void _readDiffuseData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) {
const GLuint cstride = ATTRIB_POINTERS.colour.stride;
const GLubyte* cptr = ((GLubyte*) ATTRIB_POINTERS.colour.ptr) + (first * cstride);
GLubyte* out = (GLubyte*) output[0].bgra;
ITERATE(count) {
PREFETCH(cptr + cstride);
func(cptr, it->bgra);
func(cptr, out);
cptr += cstride;
++it;
out += sizeof(Vertex);
}
}
@ -813,7 +748,9 @@ typedef struct {
} Float2;
static const Float3 F3Z = {0.0f, 0.0f, 1.0f};
static const Float3 F3ZERO = {0.0f, 0.0f, 0.0f};
static const Float2 F2ZERO = {0.0f, 0.0f};
static const uint32_t U4ONE = ~0;
static void generateElementsFastPath(
SubmissionTarget* target, const GLsizei first, const GLuint count,
@ -900,15 +837,17 @@ static void generateElementsFastPath(
#define POLYMODE QUADS
#define PROCESS_VERTEX_FLAGS(it, i) { \
it->flags = GPU_CMD_VERTEX; \
if(((i + 1) % 4) == 0) { \
Vertex t = *it; \
*it = *(it - 1); \
*(it - 1) = t; \
if((i + 1) % 4 == 0) { \
Vertex* prev = ((it) - 1); \
Vertex t = (*prev); \
*(prev) = *((it)); \
*((it)) = t; \
prev->flags = GPU_CMD_VERTEX; \
it->flags = GPU_CMD_VERTEX_EOL; \
} else { \
it->flags = GPU_CMD_VERTEX; \
} \
}
#include "draw_fastpath.inc"
#undef PROCESS_VERTEX_FLAGS
#undef POLYMODE
@ -925,11 +864,11 @@ static void generateArrays(SubmissionTarget* target, const GLsizei first, const
Vertex* start = _glSubmissionTargetStart(target);
VertexExtra* ve = aligned_vector_at(target->extras, 0);
const ReadPositionFunc pfunc = calcReadPositionFunc();
const ReadDiffuseFunc dfunc = calcReadDiffuseFunc();
const ReadUVFunc uvfunc = calcReadUVFunc();
const ReadNormalFunc nfunc = calcReadNormalFunc();
const ReadUVFunc stfunc = calcReadSTFunc();
ReadPositionFunc pfunc = calcReadPositionFunc();
ReadDiffuseFunc dfunc = calcReadDiffuseFunc();
ReadUVFunc uvfunc = calcReadUVFunc();
ReadNormalFunc nfunc = calcReadNormalFunc();
ReadUVFunc stfunc = calcReadSTFunc();
_readPositionData(pfunc, first, count, start);
_readDiffuseData(dfunc, first, count, start);
@ -947,15 +886,14 @@ static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei
if(indices) {
generateElementsFastPath(target, first, count, indices, type);
} else {
switch(mode) {
case GL_QUADS:
generateArraysFastPath_QUADS(target, first, count);
return; // Don't need to do any more processing
case GL_TRIANGLES:
generateArraysFastPath_TRIS(target, first, count);
return; // Don't need to do any more processing
default:
generateArraysFastPath_ALL(target, first, count);
if(mode == GL_QUADS) {
generateArraysFastPath_QUADS(target, first, count);
return; // Don't need to do any more processing
} else if(mode == GL_TRIANGLES) {
generateArraysFastPath_TRIS(target, first, count);
return; // Don't need to do any more processing
} else {
generateArraysFastPath_ALL(target, first, count);
}
}
} else {
@ -982,7 +920,7 @@ static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei
genTriangleStrip(it, count);
break;
default:
gl_assert(0 && "Not Implemented");
assert(0 && "Not Implemented");
}
}
@ -995,6 +933,24 @@ static void transform(SubmissionTarget* target) {
TransformVertices(vertex, target->count);
}
static void mat_transform3(const float* xyz, const float* xyzOut, const uint32_t count, const uint32_t inStride, const uint32_t outStride) {
const uint8_t* dataIn = (const uint8_t*) xyz;
uint8_t* dataOut = (uint8_t*) xyzOut;
ITERATE(count) {
const float* in = (const float*) dataIn;
float* out = (float*) dataOut;
TransformVec3NoMod(
in,
out
);
dataIn += inStride;
dataOut += outStride;
}
}
static void mat_transform_normal3(const float* xyz, const float* xyzOut, const uint32_t count, const uint32_t inStride, const uint32_t outStride) {
const uint8_t* dataIn = (const uint8_t*) xyz;
uint8_t* dataOut = (uint8_t*) xyzOut;
@ -1062,143 +1018,40 @@ GL_FORCE_INLINE void divide(SubmissionTarget* target) {
}
}
GL_FORCE_INLINE int _calc_pvr_face_culling() {
if(!_glIsCullingEnabled()) {
return GPU_CULLING_SMALL;
} else {
if(_glGetCullFace() == GL_BACK) {
return (_glGetFrontFace() == GL_CW) ? GPU_CULLING_CCW : GPU_CULLING_CW;
} else {
return (_glGetFrontFace() == GL_CCW) ? GPU_CULLING_CCW : GPU_CULLING_CW;
}
}
}
GL_FORCE_INLINE int _calc_pvr_depth_test() {
if(!_glIsDepthTestEnabled()) {
return GPU_DEPTHCMP_ALWAYS;
}
switch(_glGetDepthFunc()) {
case GL_NEVER:
return GPU_DEPTHCMP_NEVER;
case GL_LESS:
return GPU_DEPTHCMP_GREATER;
case GL_EQUAL:
return GPU_DEPTHCMP_EQUAL;
case GL_LEQUAL:
return GPU_DEPTHCMP_GEQUAL;
case GL_GREATER:
return GPU_DEPTHCMP_LESS;
case GL_NOTEQUAL:
return GPU_DEPTHCMP_NOTEQUAL;
case GL_GEQUAL:
return GPU_DEPTHCMP_LEQUAL;
break;
case GL_ALWAYS:
default:
return GPU_DEPTHCMP_ALWAYS;
}
}
GL_FORCE_INLINE int _calcPVRBlendFactor(GLenum factor) {
switch(factor) {
case GL_ZERO:
return GPU_BLEND_ZERO;
case GL_SRC_ALPHA:
return GPU_BLEND_SRCALPHA;
case GL_DST_COLOR:
return GPU_BLEND_DESTCOLOR;
case GL_DST_ALPHA:
return GPU_BLEND_DESTALPHA;
case GL_ONE_MINUS_DST_COLOR:
return GPU_BLEND_INVDESTCOLOR;
case GL_ONE_MINUS_SRC_ALPHA:
return GPU_BLEND_INVSRCALPHA;
case GL_ONE_MINUS_DST_ALPHA:
return GPU_BLEND_INVDESTALPHA;
case GL_ONE:
return GPU_BLEND_ONE;
default:
fprintf(stderr, "Invalid blend mode: %u\n", (unsigned int) factor);
return GPU_BLEND_ONE;
}
}
GL_FORCE_INLINE void _updatePVRBlend(PolyContext* context) {
if(_glIsBlendingEnabled() || _glIsAlphaTestEnabled()) {
context->gen.alpha = GPU_ALPHA_ENABLE;
} else {
context->gen.alpha = GPU_ALPHA_DISABLE;
}
context->blend.src = _calcPVRBlendFactor(_glGetBlendSourceFactor());
context->blend.dst = _calcPVRBlendFactor(_glGetBlendDestFactor());
}
GL_FORCE_INLINE void apply_poly_header(PolyHeader* header, GLboolean multiTextureHeader, PolyList* activePolyList, GLshort textureUnit) {
GL_FORCE_INLINE void push(PolyHeader* header, GLboolean multiTextureHeader, PolyList* activePolyList, GLshort textureUnit) {
TRACE();
// Compile the header
PolyContext ctx;
memset(&ctx, 0, sizeof(PolyContext));
PolyContext cxt = *_glGetPVRContext();
cxt.list_type = activePolyList->list_type;
ctx.list_type = activePolyList->list_type;
ctx.fmt.color = GPU_CLRFMT_ARGBPACKED;
ctx.fmt.uv = GPU_UVFMT_32BIT;
ctx.gen.color_clamp = GPU_CLRCLAMP_DISABLE;
ctx.gen.culling = _calc_pvr_face_culling();
ctx.depth.comparison = _calc_pvr_depth_test();
ctx.depth.write = _glIsDepthWriteEnabled() ? GPU_DEPTHWRITE_ENABLE : GPU_DEPTHWRITE_DISABLE;
ctx.gen.shading = (_glGetShadeModel() == GL_SMOOTH) ? GPU_SHADE_GOURAUD : GPU_SHADE_FLAT;
if(_glIsScissorTestEnabled()) {
ctx.gen.clip_mode = GPU_USERCLIP_INSIDE;
} else {
ctx.gen.clip_mode = GPU_USERCLIP_DISABLE;
}
if(_glIsFogEnabled()) {
ctx.gen.fog_type = GPU_FOG_TABLE;
} else {
ctx.gen.fog_type = GPU_FOG_DISABLE;
}
_updatePVRBlend(&ctx);
if(ctx.list_type == GPU_LIST_OP_POLY) {
if(cxt.list_type == GPU_LIST_OP_POLY) {
/* Opaque polys are always one/zero */
ctx.blend.src = GPU_BLEND_ONE;
ctx.blend.dst = GPU_BLEND_ZERO;
} else if(ctx.list_type == GPU_LIST_PT_POLY) {
cxt.blend.src = GPU_BLEND_ONE;
cxt.blend.dst = GPU_BLEND_ZERO;
} else if(cxt.list_type == GPU_LIST_PT_POLY) {
/* Punch-through polys require fixed blending and depth modes */
ctx.blend.src = GPU_BLEND_SRCALPHA;
ctx.blend.dst = GPU_BLEND_INVSRCALPHA;
ctx.depth.comparison = GPU_DEPTHCMP_LEQUAL;
} else if(ctx.list_type == GPU_LIST_TR_POLY && AUTOSORT_ENABLED) {
cxt.blend.src = GPU_BLEND_SRCALPHA;
cxt.blend.dst = GPU_BLEND_INVSRCALPHA;
cxt.depth.comparison = GPU_DEPTHCMP_LEQUAL;
} else if(cxt.list_type == GPU_LIST_TR_POLY && AUTOSORT_ENABLED) {
/* Autosort mode requires this mode for transparent polys */
ctx.depth.comparison = GPU_DEPTHCMP_GEQUAL;
cxt.depth.comparison = GPU_DEPTHCMP_GEQUAL;
}
_glUpdatePVRTextureContext(&ctx, textureUnit);
_glUpdatePVRTextureContext(&cxt, textureUnit);
if(multiTextureHeader) {
gl_assert(ctx.list_type == GPU_LIST_TR_POLY);
assert(cxt.list_type == GPU_LIST_TR_POLY);
ctx.gen.alpha = GPU_ALPHA_ENABLE;
ctx.txr.alpha = GPU_TXRALPHA_ENABLE;
ctx.blend.src = GPU_BLEND_ZERO;
ctx.blend.dst = GPU_BLEND_DESTCOLOR;
ctx.depth.comparison = GPU_DEPTHCMP_EQUAL;
cxt.gen.alpha = GPU_ALPHA_ENABLE;
cxt.txr.alpha = GPU_TXRALPHA_ENABLE;
cxt.blend.src = GPU_BLEND_ZERO;
cxt.blend.dst = GPU_BLEND_DESTCOLOR;
cxt.depth.comparison = GPU_DEPTHCMP_EQUAL;
}
CompilePolyHeader(header, &ctx);
/* Force bits 18 and 19 on to switch to 6 triangle strips */
header->cmd |= 0xC0000;
CompilePolyHeader(header, &cxt);
/* Post-process the vertex list */
/*
@ -1216,29 +1069,7 @@ GL_FORCE_INLINE void apply_poly_header(PolyHeader* header, GLboolean multiTextur
#define DEBUG_CLIPPING 0
static AlignedVector VERTEX_EXTRAS;
static SubmissionTarget SUBMISSION_TARGET;
void _glInitSubmissionTarget() {
SubmissionTarget* target = &SUBMISSION_TARGET;
target->extras = NULL;
target->count = 0;
target->output = NULL;
target->header_offset = target->start_offset = 0;
aligned_vector_init(&VERTEX_EXTRAS, sizeof(VertexExtra));
target->extras = &VERTEX_EXTRAS;
}
GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GLenum type, const GLvoid* indices) {
SubmissionTarget* const target = &SUBMISSION_TARGET;
AlignedVector* const extras = target->extras;
TRACE();
/* Do nothing if vertices aren't enabled */
@ -1251,59 +1082,55 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
return;
}
/* Polygons are treated as triangle fans, the only time this would be a
* problem is if we supported glPolygonMode(..., GL_LINE) but we don't.
* We optimise the triangle and quad cases.
*/
if(mode == GL_POLYGON) {
switch(count) {
case 2:
mode = GL_LINES;
break;
case 3:
mode = GL_TRIANGLES;
break;
case 4:
mode = GL_QUADS;
break;
default:
mode = GL_TRIANGLE_FAN;
}
}
if(mode == GL_LINE_STRIP || mode == GL_LINES) {
fprintf(stderr, "Line drawing is currently unsupported\n");
return;
}
static SubmissionTarget* target = NULL;
static AlignedVector extras;
/* Initialization of the target and extras */
if(!target) {
target = (SubmissionTarget*) malloc(sizeof(SubmissionTarget));
target->extras = NULL;
target->count = 0;
target->output = NULL;
target->header_offset = target->start_offset = 0;
aligned_vector_init(&extras, sizeof(VertexExtra));
target->extras = &extras;
}
/* Polygons are treated as triangle fans, the only time this would be a
* problem is if we supported glPolygonMode(..., GL_LINE) but we don't.
* We optimise the triangle and quad cases.
*/
if(mode == GL_POLYGON) {
if(count == 3) {
mode = GL_TRIANGLES;
} else if(count == 4) {
mode = GL_QUADS;
} else {
mode = GL_TRIANGLE_FAN;
}
}
// We don't handle this any further, so just make sure we never pass it down */
gl_assert(mode != GL_POLYGON);
assert(mode != GL_POLYGON);
target->output = _glActivePolyList();
gl_assert(target->output);
gl_assert(extras);
uint32_t vector_size = aligned_vector_size(&target->output->vector);
GLboolean header_required = (vector_size == 0) || _glGPUStateIsDirty();
target->count = (mode == GL_TRIANGLE_FAN) ? ((count - 2) * 3) : count;
target->header_offset = vector_size;
target->start_offset = target->header_offset + (header_required ? 1 : 0);
target->header_offset = target->output->vector.size;
target->start_offset = target->header_offset + 1;
gl_assert(target->start_offset >= target->header_offset);
gl_assert(target->count);
assert(target->count);
/* Make sure we have enough room for all the "extra" data */
aligned_vector_resize(extras, target->count);
aligned_vector_resize(&extras, target->count);
/* Make room for the vertices and header */
aligned_vector_extend(&target->output->vector, target->count + (header_required));
if(header_required) {
apply_poly_header(_glSubmissionTargetHeader(target), GL_FALSE, target->output, 0);
_glGPUStateMarkClean();
}
aligned_vector_extend(&target->output->vector, target->count + 1);
/* If we're lighting, then we need to do some work in
* eye-space, so we only transform vertices by the modelview
@ -1312,7 +1139,7 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
* If we're not doing lighting though we can optimise by taking
* vertices straight to clip-space */
if(_glIsLightingEnabled()) {
if(LIGHTING_ENABLED) {
_glMatrixLoadModelView();
} else {
_glMatrixLoadModelViewProjection();
@ -1327,7 +1154,7 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
transform(target);
}
if(_glIsLightingEnabled()){
if(LIGHTING_ENABLED){
light(target);
/* OK eye-space work done, now move into clip space */
@ -1335,48 +1162,51 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
transform(target);
}
// /*
// Now, if multitexturing is enabled, we want to send exactly the same vertices again, except:
// - We want to enable blending, and send them to the TR list
// - We want to set the depth func to GL_EQUAL
// - We want to set the second texture ID
// - We want to set the uv coordinates to the passed st ones
// */
push(_glSubmissionTargetHeader(target), GL_FALSE, target->output, 0);
// if(!TEXTURES_ENABLED[1]) {
// /* Multitexture actively disabled */
// return;
// }
/*
Now, if multitexturing is enabled, we want to send exactly the same vertices again, except:
- We want to enable blending, and send them to the TR list
- We want to set the depth func to GL_EQUAL
- We want to set the second texture ID
- We want to set the uv coordinates to the passed st ones
*/
// TextureObject* texture1 = _glGetTexture1();
if(!TEXTURES_ENABLED[1]) {
/* Multitexture actively disabled */
return;
}
// /* Multitexture implicitly disabled */
// if(!texture1 || ((ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) != ST_ENABLED_FLAG)) {
// /* Multitexture actively disabled */
// return;
// }
TextureObject* texture1 = _glGetTexture1();
// /* Push back a copy of the list to the transparent poly list, including the header
// (hence the + 1)
// */
// Vertex* vertex = aligned_vector_push_back(
// &_glTransparentPolyList()->vector, (Vertex*) _glSubmissionTargetHeader(target), target->count + 1
// );
/* Multitexture implicitly disabled */
if(!texture1 || ((ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) != ST_ENABLED_FLAG)) {
/* Multitexture actively disabled */
return;
}
// gl_assert(vertex);
/* Push back a copy of the list to the transparent poly list, including the header
(hence the + 1)
*/
Vertex* vertex = aligned_vector_push_back(
&_glTransparentPolyList()->vector, (Vertex*) _glSubmissionTargetHeader(target), target->count + 1
);
// PolyHeader* mtHeader = (PolyHeader*) vertex++;
// /* Send the buffer again to the transparent list */
// apply_poly_header(mtHeader, GL_TRUE, _glTransparentPolyList(), 1);
assert(vertex);
// /* Replace the UV coordinates with the ST ones */
// VertexExtra* ve = aligned_vector_at(target->extras, 0);
// ITERATE(target->count) {
// vertex->uv[0] = ve->st[0];
// vertex->uv[1] = ve->st[1];
// ++vertex;
// ++ve;
// }
PolyHeader* mtHeader = (PolyHeader*) vertex++;
/* Replace the UV coordinates with the ST ones */
VertexExtra* ve = aligned_vector_at(target->extras, 0);
ITERATE(target->count) {
vertex->uv[0] = ve->st[0];
vertex->uv[1] = ve->st[1];
++vertex;
++ve;
}
/* Send the buffer again to the transparent list */
push(mtHeader, GL_TRUE, _glTransparentPolyList(), 1);
}
void APIENTRY glDrawElements(GLenum mode, GLsizei count, GLenum type, const GLvoid* indices) {
@ -1480,8 +1310,6 @@ void APIENTRY glTexCoordPointer(GLint size, GLenum type, GLsizei stride, const G
return;
}
stride = (stride) ? stride : size * byte_size(type);
AttribPointer* tointer = (ACTIVE_CLIENT_TEXTURE == 0) ? &ATTRIB_POINTERS.uv : &ATTRIB_POINTERS.st;
if(_glComparePointers(tointer, size, type, stride, pointer)) {
@ -1490,7 +1318,7 @@ void APIENTRY glTexCoordPointer(GLint size, GLenum type, GLsizei stride, const G
}
tointer->ptr = pointer;
tointer->stride = stride;
tointer->stride = (stride) ? stride : size * byte_size(type);
tointer->type = type;
tointer->size = size;
@ -1505,15 +1333,13 @@ void APIENTRY glVertexPointer(GLint size, GLenum type, GLsizei stride, const G
return;
}
stride = (stride) ? stride : (size * byte_size(ATTRIB_POINTERS.vertex.type));
if(_glComparePointers(&ATTRIB_POINTERS.vertex, size, type, stride, pointer)) {
// No Change
return;
}
ATTRIB_POINTERS.vertex.ptr = pointer;
ATTRIB_POINTERS.vertex.stride = stride;
ATTRIB_POINTERS.vertex.stride = (stride) ? stride : (size * byte_size(ATTRIB_POINTERS.vertex.type));
ATTRIB_POINTERS.vertex.type = type;
ATTRIB_POINTERS.vertex.size = size;
@ -1528,8 +1354,6 @@ void APIENTRY glColorPointer(GLint size, GLenum type, GLsizei stride, const G
return;
}
stride = (stride) ? stride : ((size == GL_BGRA) ? 4 : size) * byte_size(type);
if(_glComparePointers(&ATTRIB_POINTERS.colour, size, type, stride, pointer)) {
// No Change
return;
@ -1538,7 +1362,7 @@ void APIENTRY glColorPointer(GLint size, GLenum type, GLsizei stride, const G
ATTRIB_POINTERS.colour.ptr = pointer;
ATTRIB_POINTERS.colour.type = type;
ATTRIB_POINTERS.colour.size = size;
ATTRIB_POINTERS.colour.stride = stride;
ATTRIB_POINTERS.colour.stride = (stride) ? stride : ((size == GL_BGRA) ? 4 : size) * byte_size(type);
_glRecalcFastPath();
}
@ -1561,8 +1385,6 @@ void APIENTRY glNormalPointer(GLenum type, GLsizei stride, const GLvoid * poin
return;
}
stride = (stride) ? stride : ATTRIB_POINTERS.normal.size * byte_size(type);
if(_glComparePointers(&ATTRIB_POINTERS.normal, 3, type, stride, pointer)) {
// No Change
return;
@ -1570,7 +1392,7 @@ void APIENTRY glNormalPointer(GLenum type, GLsizei stride, const GLvoid * poin
ATTRIB_POINTERS.normal.ptr = pointer;
ATTRIB_POINTERS.normal.size = (type == GL_UNSIGNED_INT_2_10_10_10_REV) ? 1 : 3;
ATTRIB_POINTERS.normal.stride = stride;
ATTRIB_POINTERS.normal.stride = (stride) ? stride : ATTRIB_POINTERS.normal.size * byte_size(type);
ATTRIB_POINTERS.normal.type = type;
_glRecalcFastPath();

View File

@ -5,123 +5,75 @@
MAKE_FUNC(POLYMODE)
{
static const float w = 1.0f;
if(!(ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG)) {
const Vertex* const start = _glSubmissionTargetStart(target);
const VertexExtra* const ve_start = aligned_vector_at(target->extras, 0);
const GLuint vstride = ATTRIB_POINTERS.vertex.stride;
GLuint uvstride = ATTRIB_POINTERS.uv.stride;
GLuint ststride = ATTRIB_POINTERS.st.stride;
GLuint dstride = ATTRIB_POINTERS.colour.stride;
GLuint nstride = ATTRIB_POINTERS.normal.stride;
const GLubyte* pos = (ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG) ? ATTRIB_POINTERS.vertex.ptr + (first * vstride) : NULL;
const GLubyte* uv = (ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) ? ATTRIB_POINTERS.uv.ptr + (first * uvstride) : NULL;
const GLubyte* col = (ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) ? ATTRIB_POINTERS.colour.ptr + (first * dstride) : NULL;
const GLubyte* st = (ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) ? ATTRIB_POINTERS.st.ptr + (first * ststride) : NULL;
const GLubyte* n = (ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) ? ATTRIB_POINTERS.normal.ptr + (first * nstride) : NULL;
const float w = 1.0f;
if(!pos) {
/* If we don't have vertices, do nothing */
return;
}
/* This is the best value we have. PROCESS_VERTEX_FLAGS needs to operate on quads and tris and so
this need to be divisible by 4 and 3. Even though we should be able to go much higher than this
and still be cache-local, trial and error says otherwise... */
if(!col) {
col = (GLubyte*) &U4ONE;
dstride = 0;
}
#define BATCH_SIZE 60
if(!uv) {
uv = (GLubyte*) &F2ZERO;
uvstride = 0;
}
GLuint min = 0;
GLuint stride;
const GLubyte* ptr;
Vertex* it;
VertexExtra* ve;
if(!st) {
st = (GLubyte*) &F2ZERO;
ststride = 0;
}
if(!n) {
n = (GLubyte*) &F3Z;
nstride = 0;
}
for(min = 0; min < count; min += BATCH_SIZE) {
const Vertex* start = ((Vertex*) _glSubmissionTargetStart(target)) + min;
const int_fast32_t loop = ((min + BATCH_SIZE) > count) ? count - min : BATCH_SIZE;
const int offset = (first + min);
VertexExtra* ve = (VertexExtra*) ve_start;
Vertex* it = (Vertex*) start;
stride = ATTRIB_POINTERS.uv.stride;
ptr = (ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) ? ATTRIB_POINTERS.uv.ptr + ((first + min) * stride) : NULL;
it = (Vertex*) start;
for(int_fast32_t i = 0; i < count; ++i) {
TransformVertex((const float*) pos, &w, it->xyz, &it->w);
pos += vstride;
PREFETCH(pos);
if(ptr) {
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
PREFETCH(ptr + stride);
it->uv[0] = ((float*) ptr)[0];
it->uv[1] = ((float*) ptr)[1];
ptr += stride;
}
} else {
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
it->uv[0] = 0;
it->uv[1] = 0;
}
}
*((Float2*) it->uv) = *((Float2*) uv);
uv += uvstride;
PREFETCH(uv);
stride = ATTRIB_POINTERS.colour.stride;
ptr = (ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) ? ATTRIB_POINTERS.colour.ptr + (offset * stride) : NULL;
it = (Vertex*) start;
*((uint32_t*) it->bgra) = *((uint32_t*) col);
col += dstride;
PREFETCH(col);
if(ptr) {
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
PREFETCH(ptr + stride);
it->bgra[0] = ptr[0];
it->bgra[1] = ptr[1];
it->bgra[2] = ptr[2];
it->bgra[3] = ptr[3];
ptr += stride;
}
} else {
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
*((uint32_t*) it->bgra) = ~0;
}
}
*((Float2*) ve->st) = *((Float2*) st);
st += ststride;
PREFETCH(st);
stride = ATTRIB_POINTERS.vertex.stride;
ptr = ATTRIB_POINTERS.vertex.ptr + (offset * stride);
it = (Vertex*) start;
*((Float3*) ve->nxyz) = *((Float3*) n);
n += nstride;
PREFETCH(n);
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++it) {
PREFETCH(ptr + stride);
TransformVertex((const float*) ptr, &w, it->xyz, &it->w);
PROCESS_VERTEX_FLAGS(it, min + i);
ptr += stride;
}
PROCESS_VERTEX_FLAGS(it, i);
start = aligned_vector_at(target->extras, min);
stride = ATTRIB_POINTERS.st.stride;
ptr = (ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) ? ATTRIB_POINTERS.st.ptr + (offset * stride) : NULL;
ve = (VertexExtra*) start;
if(ptr) {
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++ve) {
PREFETCH(ptr + stride);
ve->st[0] = ((float*) ptr)[0];
ve->st[1] = ((float*) ptr)[1];
ptr += stride;
}
} else {
for(int_fast32_t i = 0; i < loop; ++i, ++ve) {
ve->st[0] = 0;
ve->st[1] = 0;
}
}
stride = ATTRIB_POINTERS.normal.stride;
ptr = (ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) ? ATTRIB_POINTERS.normal.ptr + (offset * stride) : NULL;
ve = (VertexExtra*) start;
if(ptr) {
PREFETCH(ptr);
for(int_fast32_t i = 0; i < loop; ++i, ++ve) {
PREFETCH(ptr + stride);
ve->nxyz[0] = ((float*) ptr)[0];
ve->nxyz[1] = ((float*) ptr)[1];
ve->nxyz[2] = ((float*) ptr)[2];
ptr += stride;
}
} else {
for(int_fast32_t i = 0; i < loop; ++i, ++ve) {
ve->nxyz[0] = 0;
ve->nxyz[1] = 0;
ve->nxyz[2] = 0;
}
}
++it;
++ve;
}
}

View File

@ -46,22 +46,10 @@ void APIENTRY glKosInitConfig(GLdcConfig* config) {
config->initial_pt_capacity = 512 * 3;
config->initial_tr_capacity = 1024 * 3;
config->initial_immediate_capacity = 1024 * 3;
// RGBA4444 is the fastest general format - 8888 will cause a perf issue
config->internal_palette_format = GL_RGBA4;
config->texture_twiddle = GL_TRUE;
config->internal_palette_format = GL_RGBA8;
}
static bool _initialized = false;
void APIENTRY glKosInitEx(GLdcConfig* config) {
if(_initialized) {
return;
}
_initialized = true;
TRACE();
printf("\nWelcome to GLdc! Git revision: %s\n\n", GLDC_VERSION);
@ -70,7 +58,6 @@ void APIENTRY glKosInitEx(GLdcConfig* config) {
AUTOSORT_ENABLED = config->autosort_enabled;
_glInitSubmissionTarget();
_glInitMatrices();
_glInitAttributePointers();
_glInitContext();
@ -82,10 +69,6 @@ void APIENTRY glKosInitEx(GLdcConfig* config) {
_glInitTextures();
if(config->texture_twiddle) {
glEnable(GL_TEXTURE_TWIDDLE_KOS);
}
OP_LIST.list_type = GPU_LIST_OP_POLY;
PT_LIST.list_type = GPU_LIST_PT_POLY;
TR_LIST.list_type = GPU_LIST_TR_POLY;
@ -99,12 +82,6 @@ void APIENTRY glKosInitEx(GLdcConfig* config) {
aligned_vector_reserve(&TR_LIST.vector, config->initial_tr_capacity);
}
void APIENTRY glKosShutdown() {
aligned_vector_clear(&OP_LIST.vector);
aligned_vector_clear(&PT_LIST.vector);
aligned_vector_clear(&TR_LIST.vector);
}
void APIENTRY glKosInit() {
GLdcConfig config;
glKosInitConfig(&config);
@ -115,23 +92,17 @@ void APIENTRY glKosSwapBuffers() {
TRACE();
SceneBegin();
if(aligned_vector_header(&OP_LIST.vector)->size > 2) {
SceneListBegin(GPU_LIST_OP_POLY);
SceneListSubmit((Vertex*) aligned_vector_front(&OP_LIST.vector), aligned_vector_size(&OP_LIST.vector));
SceneListFinish();
}
SceneListBegin(GPU_LIST_OP_POLY);
SceneListSubmit(OP_LIST.vector.data, OP_LIST.vector.size);
SceneListFinish();
if(aligned_vector_header(&PT_LIST.vector)->size > 2) {
SceneListBegin(GPU_LIST_PT_POLY);
SceneListSubmit((Vertex*) aligned_vector_front(&PT_LIST.vector), aligned_vector_size(&PT_LIST.vector));
SceneListFinish();
}
SceneListBegin(GPU_LIST_PT_POLY);
SceneListSubmit(PT_LIST.vector.data, PT_LIST.vector.size);
SceneListFinish();
if(aligned_vector_header(&TR_LIST.vector)->size > 2) {
SceneListBegin(GPU_LIST_TR_POLY);
SceneListSubmit((Vertex*) aligned_vector_front(&TR_LIST.vector), aligned_vector_size(&TR_LIST.vector));
SceneListFinish();
}
SceneListBegin(GPU_LIST_TR_POLY);
SceneListSubmit(TR_LIST.vector.data, TR_LIST.vector.size);
SceneListFinish();
SceneFinish();
aligned_vector_clear(&OP_LIST.vector);
@ -139,4 +110,4 @@ void APIENTRY glKosSwapBuffers() {
aligned_vector_clear(&TR_LIST.vector);
_glApplyScissor(true);
}
}

View File

@ -1,4 +1,5 @@
#include <stdio.h>
#include <assert.h>
#include "private.h"
@ -196,7 +197,7 @@ static GL_NO_INSTRUMENT GLboolean _glCalculateAverageTexel(GLuint pvrFormat, con
*d1 = PACK_ARGB4444(a, r, g, b);
} else {
gl_assert(format == ARGB1555);
assert(format == ARGB1555);
GLushort* s1 = (GLushort*) src1;
GLushort* s2 = (GLushort*) src2;
@ -245,8 +246,8 @@ GLboolean _glGenerateMipmapTwiddled(const GLuint pvrFormat, const GLubyte* prevD
const GLubyte* s4 = s3 + stride;
GLubyte* t = &thisData[j * stride];
gl_assert(s4 < prevData + (lastHeight * lastWidth * stride));
gl_assert(t < thisData + (thisHeight * thisWidth * stride));
assert(s4 < prevData + (lastHeight * lastWidth * stride));
assert(t < thisData + (thisHeight * thisWidth * stride));
_glCalculateAverageTexel(pvrFormat, s1, s2, s3, s4, t);
}
@ -254,7 +255,7 @@ GLboolean _glGenerateMipmapTwiddled(const GLuint pvrFormat, const GLubyte* prevD
return GL_TRUE;
}
void APIENTRY glGenerateMipmap(GLenum target) {
void APIENTRY glGenerateMipmapEXT(GLenum target) {
if(target != GL_TEXTURE_2D) {
_glKosThrowError(GL_INVALID_OPERATION, __func__);
return;
@ -322,7 +323,7 @@ void APIENTRY glGenerateMipmap(GLenum target) {
prevHeight = thisHeight;
}
gl_assert(_glIsMipmapComplete(tex));
assert(_glIsMipmapComplete(tex));
}
/* generate mipmaps for any image provided by the user and then pass them to OpenGL */
@ -334,7 +335,7 @@ GLAPI GLvoid APIENTRY gluBuild2DMipmaps(GLenum target, GLint internalFormat,
unsigned byte data, and finally the data itself. */
glTexImage2D(GL_TEXTURE_2D, 0, 3, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, data);
glGenerateMipmap(GL_TEXTURE_2D);
glGenerateMipmapEXT(GL_TEXTURE_2D);
}
GLenum APIENTRY glCheckFramebufferStatusEXT(GLenum target) {

View File

@ -1,20 +0,0 @@
#ifndef NDEBUG
/* We're debugging, use normal assert */
#include <assert.h>
#define gl_assert assert
#else
/* Release mode, use our custom assert */
#include <stdio.h>
#include <stdlib.h>
#define gl_assert(x) \
do {\
if(!(x)) {\
fprintf(stderr, "Assertion failed at %s:%d\n", __FILE__, __LINE__);\
exit(1);\
}\
} while(0); \
#endif

View File

@ -17,10 +17,10 @@ extern inline GLuint _glRecalcFastPath();
GLboolean IMMEDIATE_MODE_ACTIVE = GL_FALSE;
static GLenum ACTIVE_POLYGON_MODE = GL_TRIANGLES;
static GLfloat __attribute__((aligned(32))) NORMAL[3] = {0.0f, 0.0f, 1.0f};
static GLubyte __attribute__((aligned(32))) COLOR[4] = {255, 255, 255, 255}; /* ARGB order for speed */
static GLfloat __attribute__((aligned(32))) UV_COORD[2] = {0.0f, 0.0f};
static GLfloat __attribute__((aligned(32))) ST_COORD[2] = {0.0f, 0.0f};
static GLfloat NORMAL[3] = {0.0f, 0.0f, 1.0f};
static GLubyte COLOR[4] = {255, 255, 255, 255}; /* ARGB order for speed */
static GLfloat UV_COORD[2] = {0.0f, 0.0f};
static GLfloat ST_COORD[2] = {0.0f, 0.0f};
static AlignedVector VERTICES;
static AttribPointerList IM_ATTRIBS;
@ -30,7 +30,7 @@ static AttribPointerList IM_ATTRIBS;
can be applied faster */
static GLuint IM_ENABLED_VERTEX_ATTRIBUTES = 0;
typedef struct __attribute__((aligned(32))) {
typedef struct {
GLfloat x;
GLfloat y;
GLfloat z;
@ -50,7 +50,7 @@ void _glInitImmediateMode(GLuint initial_size) {
aligned_vector_init(&VERTICES, sizeof(IMVertex));
aligned_vector_reserve(&VERTICES, initial_size);
IM_ATTRIBS.vertex.ptr = aligned_vector_front(&VERTICES);
IM_ATTRIBS.vertex.ptr = VERTICES.data;
IM_ATTRIBS.vertex.size = 3;
IM_ATTRIBS.vertex.type = GL_FLOAT;
IM_ATTRIBS.vertex.stride = sizeof(IMVertex);
@ -161,27 +161,31 @@ void APIENTRY glColor3fv(const GLfloat* v) {
void APIENTRY glVertex3f(GLfloat x, GLfloat y, GLfloat z) {
IM_ENABLED_VERTEX_ATTRIBUTES |= VERTEX_ENABLED_FLAG;
unsigned int cap = VERTICES.capacity;
IMVertex* vert = aligned_vector_extend(&VERTICES, 1);
/* Resizing could've invalidated the pointers */
IM_ATTRIBS.vertex.ptr = VERTICES.data;
IM_ATTRIBS.uv.ptr = IM_ATTRIBS.vertex.ptr + 12;
IM_ATTRIBS.st.ptr = IM_ATTRIBS.uv.ptr + 8;
IM_ATTRIBS.colour.ptr = IM_ATTRIBS.st.ptr + 8;
IM_ATTRIBS.normal.ptr = IM_ATTRIBS.colour.ptr + 4;
if(cap != VERTICES.capacity) {
/* Resizing could've invalidated the pointers */
IM_ATTRIBS.vertex.ptr = VERTICES.data;
IM_ATTRIBS.uv.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 3);
IM_ATTRIBS.st.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 5);
IM_ATTRIBS.colour.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 7);
IM_ATTRIBS.normal.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 7) + sizeof(uint32_t);
}
uint32_t* dest = (uint32_t*) &vert->x;
*(dest++) = *((uint32_t*) &x);
*(dest++) = *((uint32_t*) &y);
*(dest++) = *((uint32_t*) &z);
*(dest++) = *((uint32_t*) &UV_COORD[0]);
*(dest++) = *((uint32_t*) &UV_COORD[1]);
*(dest++) = *((uint32_t*) &ST_COORD[0]);
*(dest++) = *((uint32_t*) &ST_COORD[1]);
*(dest++) = *((uint32_t*) COLOR);
*(dest++) = *((uint32_t*) &NORMAL[0]);
*(dest++) = *((uint32_t*) &NORMAL[1]);
*(dest++) = *((uint32_t*) &NORMAL[2]);
vert->x = x;
vert->y = y;
vert->z = z;
vert->u = UV_COORD[0];
vert->v = UV_COORD[1];
vert->s = ST_COORD[0];
vert->t = ST_COORD[1];
*((uint32_t*) vert->bgra) = *((uint32_t*) COLOR);
vert->nx = NORMAL[0];
vert->ny = NORMAL[1];
vert->nz = NORMAL[2];
}
void APIENTRY glVertex3fv(const GLfloat* v) {
@ -271,13 +275,13 @@ void APIENTRY glEnd() {
#ifndef NDEBUG
// Immediate mode should always activate the fast path
GLuint fastPathEnabled = _glRecalcFastPath();
gl_assert(fastPathEnabled);
assert(fastPathEnabled);
#else
/* If we're not debugging, set to true - we assume we haven't broken it! */
FAST_PATH_ENABLED = GL_TRUE;
#endif
glDrawArrays(ACTIVE_POLYGON_MODE, 0, aligned_vector_header(&VERTICES)->size);
glDrawArrays(ACTIVE_POLYGON_MODE, 0, VERTICES.size);
ATTRIB_POINTERS = stashed_attrib_pointers;

View File

@ -1,3 +1,4 @@
#include <assert.h>
#include <stdio.h>
#include <string.h>
#include <math.h>
@ -12,107 +13,126 @@
* multiplier ends up less than this value */
#define ATTENUATION_THRESHOLD 100.0f
static GLfloat SCENE_AMBIENT [] = {0.2f, 0.2f, 0.2f, 1.0f};
static GLboolean VIEWER_IN_EYE_COORDINATES = GL_TRUE;
static GLenum COLOR_CONTROL = GL_SINGLE_COLOR;
void _glPrecalcLightingValues(GLuint mask) {
static GLenum COLOR_MATERIAL_MODE = GL_AMBIENT_AND_DIFFUSE;
#define AMBIENT_MASK 1
#define DIFFUSE_MASK 2
#define EMISSION_MASK 4
#define SPECULAR_MASK 8
#define SCENE_AMBIENT_MASK 16
static GLenum COLOR_MATERIAL_MASK = AMBIENT_MASK | DIFFUSE_MASK;
static LightSource LIGHTS[MAX_GLDC_LIGHTS];
static GLuint ENABLED_LIGHT_COUNT = 0;
static Material MATERIAL;
GL_FORCE_INLINE void _glPrecalcLightingValues(GLuint mask);
static void recalcEnabledLights() {
GLubyte i;
ENABLED_LIGHT_COUNT = 0;
for(i = 0; i < MAX_GLDC_LIGHTS; ++i) {
if(LIGHTS[i].isEnabled) {
ENABLED_LIGHT_COUNT++;
}
}
}
void _glInitLights() {
static GLfloat ONE [] = {1.0f, 1.0f, 1.0f, 1.0f};
static GLfloat ZERO [] = {0.0f, 0.0f, 0.0f, 1.0f};
static GLfloat PARTIAL [] = {0.2f, 0.2f, 0.2f, 1.0f};
static GLfloat MOSTLY [] = {0.8f, 0.8f, 0.8f, 1.0f};
memcpy(MATERIAL.ambient, PARTIAL, sizeof(GLfloat) * 4);
memcpy(MATERIAL.diffuse, MOSTLY, sizeof(GLfloat) * 4);
memcpy(MATERIAL.specular, ZERO, sizeof(GLfloat) * 4);
memcpy(MATERIAL.emissive, ZERO, sizeof(GLfloat) * 4);
MATERIAL.exponent = 0.0f;
GLubyte i;
for(i = 0; i < MAX_GLDC_LIGHTS; ++i) {
memcpy(LIGHTS[i].ambient, ZERO, sizeof(GLfloat) * 4);
memcpy(LIGHTS[i].diffuse, ONE, sizeof(GLfloat) * 4);
memcpy(LIGHTS[i].specular, ONE, sizeof(GLfloat) * 4);
if(i > 0) {
memcpy(LIGHTS[i].diffuse, ZERO, sizeof(GLfloat) * 4);
memcpy(LIGHTS[i].specular, ZERO, sizeof(GLfloat) * 4);
}
LIGHTS[i].position[0] = LIGHTS[i].position[1] = LIGHTS[i].position[3] = 0.0f;
LIGHTS[i].position[2] = 1.0f;
LIGHTS[i].isDirectional = GL_TRUE;
LIGHTS[i].isEnabled = GL_FALSE;
LIGHTS[i].spot_direction[0] = LIGHTS[i].spot_direction[1] = 0.0f;
LIGHTS[i].spot_direction[2] = -1.0f;
LIGHTS[i].spot_exponent = 0.0f;
LIGHTS[i].spot_cutoff = 180.0f;
LIGHTS[i].constant_attenuation = 1.0f;
LIGHTS[i].linear_attenuation = 0.0f;
LIGHTS[i].quadratic_attenuation = 0.0f;
}
_glPrecalcLightingValues(~0);
recalcEnabledLights();
}
void _glEnableLight(GLubyte light, GLboolean value) {
LIGHTS[light].isEnabled = value;
recalcEnabledLights();
}
GL_FORCE_INLINE void _glPrecalcLightingValues(GLuint mask) {
/* Pre-calculate lighting values */
GLshort i;
Material* material = _glActiveMaterial();
if(mask & AMBIENT_MASK) {
for(i = 0; i < MAX_GLDC_LIGHTS; ++i) {
LightSource* light = _glLightAt(i);
light->ambientMaterial[0] = light->ambient[0] * material->ambient[0];
light->ambientMaterial[1] = light->ambient[1] * material->ambient[1];
light->ambientMaterial[2] = light->ambient[2] * material->ambient[2];
light->ambientMaterial[3] = light->ambient[3] * material->ambient[3];
LIGHTS[i].ambientMaterial[0] = LIGHTS[i].ambient[0] * MATERIAL.ambient[0];
LIGHTS[i].ambientMaterial[1] = LIGHTS[i].ambient[1] * MATERIAL.ambient[1];
LIGHTS[i].ambientMaterial[2] = LIGHTS[i].ambient[2] * MATERIAL.ambient[2];
LIGHTS[i].ambientMaterial[3] = LIGHTS[i].ambient[3] * MATERIAL.ambient[3];
}
}
if(mask & DIFFUSE_MASK) {
for(i = 0; i < MAX_GLDC_LIGHTS; ++i) {
LightSource* light = _glLightAt(i);
light->diffuseMaterial[0] = light->diffuse[0] * material->diffuse[0];
light->diffuseMaterial[1] = light->diffuse[1] * material->diffuse[1];
light->diffuseMaterial[2] = light->diffuse[2] * material->diffuse[2];
light->diffuseMaterial[3] = light->diffuse[3] * material->diffuse[3];
LIGHTS[i].diffuseMaterial[0] = LIGHTS[i].diffuse[0] * MATERIAL.diffuse[0];
LIGHTS[i].diffuseMaterial[1] = LIGHTS[i].diffuse[1] * MATERIAL.diffuse[1];
LIGHTS[i].diffuseMaterial[2] = LIGHTS[i].diffuse[2] * MATERIAL.diffuse[2];
LIGHTS[i].diffuseMaterial[3] = LIGHTS[i].diffuse[3] * MATERIAL.diffuse[3];
}
}
if(mask & SPECULAR_MASK) {
for(i = 0; i < MAX_GLDC_LIGHTS; ++i) {
LightSource* light = _glLightAt(i);
light->specularMaterial[0] = light->specular[0] * material->specular[0];
light->specularMaterial[1] = light->specular[1] * material->specular[1];
light->specularMaterial[2] = light->specular[2] * material->specular[2];
light->specularMaterial[3] = light->specular[3] * material->specular[3];
LIGHTS[i].specularMaterial[0] = LIGHTS[i].specular[0] * MATERIAL.specular[0];
LIGHTS[i].specularMaterial[1] = LIGHTS[i].specular[1] * MATERIAL.specular[1];
LIGHTS[i].specularMaterial[2] = LIGHTS[i].specular[2] * MATERIAL.specular[2];
LIGHTS[i].specularMaterial[3] = LIGHTS[i].specular[3] * MATERIAL.specular[3];
}
}
/* If ambient or emission are updated, we need to update
* the base colour. */
if((mask & AMBIENT_MASK) || (mask & EMISSION_MASK) || (mask & SCENE_AMBIENT_MASK)) {
GLfloat* scene_ambient = _glLightModelSceneAmbient();
material->baseColour[0] = MATH_fmac(scene_ambient[0], material->ambient[0], material->emissive[0]);
material->baseColour[1] = MATH_fmac(scene_ambient[1], material->ambient[1], material->emissive[1]);
material->baseColour[2] = MATH_fmac(scene_ambient[2], material->ambient[2], material->emissive[2]);
material->baseColour[3] = MATH_fmac(scene_ambient[3], material->ambient[3], material->emissive[3]);
MATERIAL.baseColour[0] = MATH_fmac(SCENE_AMBIENT[0], MATERIAL.ambient[0], MATERIAL.emissive[0]);
MATERIAL.baseColour[1] = MATH_fmac(SCENE_AMBIENT[1], MATERIAL.ambient[1], MATERIAL.emissive[1]);
MATERIAL.baseColour[2] = MATH_fmac(SCENE_AMBIENT[2], MATERIAL.ambient[2], MATERIAL.emissive[2]);
MATERIAL.baseColour[3] = MATH_fmac(SCENE_AMBIENT[3], MATERIAL.ambient[3], MATERIAL.emissive[3]);
}
}
void _glInitLights() {
Material* material = _glActiveMaterial();
static GLfloat ONE [] = {1.0f, 1.0f, 1.0f, 1.0f};
static GLfloat ZERO [] = {0.0f, 0.0f, 0.0f, 1.0f};
static GLfloat PARTIAL [] = {0.2f, 0.2f, 0.2f, 1.0f};
static GLfloat MOSTLY [] = {0.8f, 0.8f, 0.8f, 1.0f};
memcpy(material->ambient, PARTIAL, sizeof(GLfloat) * 4);
memcpy(material->diffuse, MOSTLY, sizeof(GLfloat) * 4);
memcpy(material->specular, ZERO, sizeof(GLfloat) * 4);
memcpy(material->emissive, ZERO, sizeof(GLfloat) * 4);
material->exponent = 0.0f;
GLubyte i;
for(i = 0; i < MAX_GLDC_LIGHTS; ++i) {
LightSource* light = _glLightAt(i);
memcpy(light->ambient, ZERO, sizeof(GLfloat) * 4);
memcpy(light->diffuse, ONE, sizeof(GLfloat) * 4);
memcpy(light->specular, ONE, sizeof(GLfloat) * 4);
if(i > 0) {
memcpy(light->diffuse, ZERO, sizeof(GLfloat) * 4);
memcpy(light->specular, ZERO, sizeof(GLfloat) * 4);
}
light->position[0] = light->position[1] = light->position[3] = 0.0f;
light->position[2] = 1.0f;
light->isDirectional = GL_TRUE;
light->isEnabled = GL_FALSE;
light->spot_direction[0] = light->spot_direction[1] = 0.0f;
light->spot_direction[2] = -1.0f;
light->spot_exponent = 0.0f;
light->spot_cutoff = 180.0f;
light->constant_attenuation = 1.0f;
light->linear_attenuation = 0.0f;
light->quadratic_attenuation = 0.0f;
}
_glPrecalcLightingValues(~0);
_glRecalcEnabledLights();
}
void APIENTRY glLightModelf(GLenum pname, const GLfloat param) {
glLightModelfv(pname, &param);
}
@ -124,13 +144,11 @@ void APIENTRY glLightModeli(GLenum pname, const GLint param) {
void APIENTRY glLightModelfv(GLenum pname, const GLfloat *params) {
switch(pname) {
case GL_LIGHT_MODEL_AMBIENT: {
if(memcmp(_glGetLightModelSceneAmbient(), params, sizeof(float) * 4) != 0) {
_glSetLightModelSceneAmbient(params);
_glPrecalcLightingValues(SCENE_AMBIENT_MASK);
}
memcpy(SCENE_AMBIENT, params, sizeof(GLfloat) * 4);
_glPrecalcLightingValues(SCENE_AMBIENT_MASK);
} break;
case GL_LIGHT_MODEL_LOCAL_VIEWER:
_glSetLightModelViewerInEyeCoordinates((*params) ? GL_TRUE : GL_FALSE);
VIEWER_IN_EYE_COORDINATES = (*params) ? GL_TRUE : GL_FALSE;
break;
case GL_LIGHT_MODEL_TWO_SIDE:
/* Not implemented */
@ -142,10 +160,10 @@ void APIENTRY glLightModelfv(GLenum pname, const GLfloat *params) {
void APIENTRY glLightModeliv(GLenum pname, const GLint* params) {
switch(pname) {
case GL_LIGHT_MODEL_COLOR_CONTROL:
_glSetLightModelColorControl(*params);
COLOR_CONTROL = *params;
break;
case GL_LIGHT_MODEL_LOCAL_VIEWER:
_glSetLightModelViewerInEyeCoordinates((*params) ? GL_TRUE : GL_FALSE);
VIEWER_IN_EYE_COORDINATES = (*params) ? GL_TRUE : GL_FALSE;
break;
default:
_glKosThrowError(GL_INVALID_ENUM, __func__);
@ -156,7 +174,6 @@ void APIENTRY glLightfv(GLenum light, GLenum pname, const GLfloat *params) {
GLubyte idx = light & 0xF;
if(idx >= MAX_GLDC_LIGHTS) {
_glKosThrowError(GL_INVALID_VALUE, __func__);
return;
}
@ -164,46 +181,33 @@ void APIENTRY glLightfv(GLenum light, GLenum pname, const GLfloat *params) {
(pname == GL_DIFFUSE) ? DIFFUSE_MASK :
(pname == GL_SPECULAR) ? SPECULAR_MASK : 0;
LightSource* l = _glLightAt(idx);
GLboolean rebuild = GL_FALSE;
switch(pname) {
case GL_AMBIENT:
rebuild = memcmp(l->ambient, params, sizeof(GLfloat) * 4) != 0;
if(rebuild) {
memcpy(l->ambient, params, sizeof(GLfloat) * 4);
}
memcpy(LIGHTS[idx].ambient, params, sizeof(GLfloat) * 4);
break;
case GL_DIFFUSE:
rebuild = memcmp(l->diffuse, params, sizeof(GLfloat) * 4) != 0;
if(rebuild) {
memcpy(l->diffuse, params, sizeof(GLfloat) * 4);
}
memcpy(LIGHTS[idx].diffuse, params, sizeof(GLfloat) * 4);
break;
case GL_SPECULAR:
rebuild = memcmp(l->specular, params, sizeof(GLfloat) * 4) != 0;
if(rebuild) {
memcpy(l->specular, params, sizeof(GLfloat) * 4);
}
memcpy(LIGHTS[idx].specular, params, sizeof(GLfloat) * 4);
break;
case GL_POSITION: {
memcpy(l->position, params, sizeof(GLfloat) * 4);
_glMatrixLoadModelView();
memcpy(LIGHTS[idx].position, params, sizeof(GLfloat) * 4);
l->isDirectional = params[3] == 0.0f;
LIGHTS[idx].isDirectional = params[3] == 0.0f;
if(l->isDirectional) {
if(LIGHTS[idx].isDirectional) {
//FIXME: Do we need to rotate directional lights?
} else {
_glMatrixLoadModelView();
TransformVec3(l->position);
TransformVec3(LIGHTS[idx].position);
}
}
break;
case GL_SPOT_DIRECTION: {
l->spot_direction[0] = params[0];
l->spot_direction[1] = params[1];
l->spot_direction[2] = params[2];
LIGHTS[idx].spot_direction[0] = params[0];
LIGHTS[idx].spot_direction[1] = params[1];
LIGHTS[idx].spot_direction[2] = params[2];
} break;
case GL_CONSTANT_ATTENUATION:
case GL_LINEAR_ATTENUATION:
@ -217,36 +221,31 @@ void APIENTRY glLightfv(GLenum light, GLenum pname, const GLfloat *params) {
return;
}
if(rebuild) {
_glPrecalcLightingValues(mask);
}
_glPrecalcLightingValues(mask);
}
void APIENTRY glLightf(GLenum light, GLenum pname, GLfloat param) {
GLubyte idx = light & 0xF;
if(idx >= MAX_GLDC_LIGHTS) {
_glKosThrowError(GL_INVALID_VALUE, __func__);
return;
}
LightSource* l = _glLightAt(idx);
switch(pname) {
case GL_CONSTANT_ATTENUATION:
l->constant_attenuation = param;
LIGHTS[idx].constant_attenuation = param;
break;
case GL_LINEAR_ATTENUATION:
l->linear_attenuation = param;
LIGHTS[idx].linear_attenuation = param;
break;
case GL_QUADRATIC_ATTENUATION:
l->quadratic_attenuation = param;
LIGHTS[idx].quadratic_attenuation = param;
break;
case GL_SPOT_EXPONENT:
l->spot_exponent = param;
LIGHTS[idx].spot_exponent = param;
break;
case GL_SPOT_CUTOFF:
l->spot_cutoff = param;
LIGHTS[idx].spot_cutoff = param;
break;
default:
_glKosThrowError(GL_INVALID_ENUM, __func__);
@ -259,7 +258,7 @@ void APIENTRY glMaterialf(GLenum face, GLenum pname, const GLfloat param) {
return;
}
_glActiveMaterial()->exponent = _MIN(param, 128); /* 128 is the max according to the GL spec */
MATERIAL.exponent = _MIN(param, 128); /* 128 is the max according to the GL spec */
}
void APIENTRY glMateriali(GLenum face, GLenum pname, const GLint param) {
@ -272,49 +271,25 @@ void APIENTRY glMaterialfv(GLenum face, GLenum pname, const GLfloat *params) {
return;
}
Material* material = _glActiveMaterial();
GLboolean rebuild = GL_FALSE;
switch(pname) {
case GL_SHININESS:
glMaterialf(face, pname, *params);
rebuild = GL_TRUE;
break;
case GL_AMBIENT: {
if(memcmp(material->ambient, params, sizeof(float) * 4) != 0) {
vec4cpy(material->ambient, params);
rebuild = GL_TRUE;
}
} break;
case GL_AMBIENT:
vec4cpy(MATERIAL.ambient, params);
break;
case GL_DIFFUSE:
if(memcmp(material->diffuse, params, sizeof(float) * 4) != 0) {
vec4cpy(material->diffuse, params);
rebuild = GL_TRUE;
}
vec4cpy(MATERIAL.diffuse, params);
break;
case GL_SPECULAR:
if(memcmp(material->specular, params, sizeof(float) * 4) != 0) {
vec4cpy(material->specular, params);
rebuild = GL_TRUE;
}
vec4cpy(MATERIAL.specular, params);
break;
case GL_EMISSION:
if(memcmp(material->emissive, params, sizeof(float) * 4) != 0) {
vec4cpy(material->emissive, params);
rebuild = GL_TRUE;
}
vec4cpy(MATERIAL.emissive, params);
break;
case GL_AMBIENT_AND_DIFFUSE: {
rebuild = (
memcmp(material->ambient, params, sizeof(float) * 4) != 0 ||
memcmp(material->diffuse, params, sizeof(float) * 4) != 0
);
if(rebuild) {
vec4cpy(material->ambient, params);
vec4cpy(material->diffuse, params);
}
vec4cpy(MATERIAL.ambient, params);
vec4cpy(MATERIAL.diffuse, params);
} break;
case GL_COLOR_INDEXES:
default: {
@ -323,15 +298,13 @@ void APIENTRY glMaterialfv(GLenum face, GLenum pname, const GLfloat *params) {
}
}
if(rebuild) {
GLuint updateMask = (pname == GL_AMBIENT) ? AMBIENT_MASK:
(pname == GL_DIFFUSE) ? DIFFUSE_MASK:
(pname == GL_SPECULAR) ? SPECULAR_MASK:
(pname == GL_EMISSION) ? EMISSION_MASK:
(pname == GL_AMBIENT_AND_DIFFUSE) ? AMBIENT_MASK | DIFFUSE_MASK : 0;
GLuint updateMask = (pname == GL_AMBIENT) ? AMBIENT_MASK:
(pname == GL_DIFFUSE) ? DIFFUSE_MASK:
(pname == GL_SPECULAR) ? SPECULAR_MASK:
(pname == GL_EMISSION) ? EMISSION_MASK:
(pname == GL_AMBIENT_AND_DIFFUSE) ? AMBIENT_MASK | DIFFUSE_MASK : 0;
_glPrecalcLightingValues(updateMask);
}
_glPrecalcLightingValues(updateMask);
}
void APIENTRY glColorMaterial(GLenum face, GLenum mode) {
@ -346,13 +319,12 @@ void APIENTRY glColorMaterial(GLenum face, GLenum mode) {
return;
}
GLenum mask = (mode == GL_AMBIENT) ? AMBIENT_MASK:
COLOR_MATERIAL_MASK = (mode == GL_AMBIENT) ? AMBIENT_MASK:
(mode == GL_DIFFUSE) ? DIFFUSE_MASK:
(mode == GL_AMBIENT_AND_DIFFUSE) ? AMBIENT_MASK | DIFFUSE_MASK:
(mode == GL_EMISSION) ? EMISSION_MASK : SPECULAR_MASK;
_glSetColorMaterialMask(mask);
_glSetColorMaterialMode(mode);
COLOR_MATERIAL_MODE = mode;
}
GL_FORCE_INLINE void bgra_to_float(const uint8_t* input, GLfloat* output) {
@ -365,68 +337,44 @@ GL_FORCE_INLINE void bgra_to_float(const uint8_t* input, GLfloat* output) {
}
void _glUpdateColourMaterialA(const GLubyte* argb) {
Material* material = _glActiveMaterial();
float colour[4];
bgra_to_float(argb, colour);
vec4cpy(material->ambient, colour);
GLenum mask = _glColorMaterialMode();
_glPrecalcLightingValues(mask);
vec4cpy(MATERIAL.ambient, colour);
_glPrecalcLightingValues(COLOR_MATERIAL_MASK);
}
void _glUpdateColourMaterialD(const GLubyte* argb) {
Material* material = _glActiveMaterial();
float colour[4];
bgra_to_float(argb, colour);
vec4cpy(material->diffuse, colour);
GLenum mask = _glColorMaterialMode();
_glPrecalcLightingValues(mask);
vec4cpy(MATERIAL.diffuse, colour);
_glPrecalcLightingValues(COLOR_MATERIAL_MASK);
}
void _glUpdateColourMaterialE(const GLubyte* argb) {
Material* material = _glActiveMaterial();
float colour[4];
bgra_to_float(argb, colour);
vec4cpy(material->emissive, colour);
GLenum mask = _glColorMaterialMode();
_glPrecalcLightingValues(mask);
vec4cpy(MATERIAL.emissive, colour);
_glPrecalcLightingValues(COLOR_MATERIAL_MASK);
}
void _glUpdateColourMaterialAD(const GLubyte* argb) {
Material* material = _glActiveMaterial();
float colour[4];
bgra_to_float(argb, colour);
vec4cpy(material->ambient, colour);
vec4cpy(material->diffuse, colour);
GLenum mask = _glColorMaterialMode();
_glPrecalcLightingValues(mask);
vec4cpy(MATERIAL.ambient, colour);
vec4cpy(MATERIAL.diffuse, colour);
_glPrecalcLightingValues(COLOR_MATERIAL_MASK);
}
GL_FORCE_INLINE GLboolean isDiffuseColorMaterial() {
GLenum mode = _glColorMaterialMode();
return (
mode == GL_DIFFUSE ||
mode == GL_AMBIENT_AND_DIFFUSE
);
return (COLOR_MATERIAL_MODE == GL_DIFFUSE || COLOR_MATERIAL_MODE == GL_AMBIENT_AND_DIFFUSE);
}
GL_FORCE_INLINE GLboolean isAmbientColorMaterial() {
GLenum mode = _glColorMaterialMode();
return (
mode == GL_AMBIENT ||
mode == GL_AMBIENT_AND_DIFFUSE
);
return (COLOR_MATERIAL_MODE == GL_AMBIENT || COLOR_MATERIAL_MODE == GL_AMBIENT_AND_DIFFUSE);
}
GL_FORCE_INLINE GLboolean isSpecularColorMaterial() {
GLenum mode = _glColorMaterialMode();
return (mode == GL_SPECULAR);
return (COLOR_MATERIAL_MODE == GL_SPECULAR);
}
/*
@ -445,7 +393,7 @@ GL_FORCE_INLINE float faster_pow2(const float p) {
}
GL_FORCE_INLINE float faster_log2(const float x) {
gl_assert(x >= 0.0f);
assert(x >= 0.0f);
const union { float f; uint32_t i; } vx = { x };
const float y = (float) (vx.i) * 1.1920928955078125e-7f;
@ -461,15 +409,12 @@ GL_FORCE_INLINE void _glLightVertexDirectional(
float* final, uint8_t lid,
float LdotN, float NdotH) {
Material* material = _glActiveMaterial();
LightSource* light = _glLightAt(lid);
float FI = (material->exponent) ?
faster_pow((LdotN != 0.0f) * NdotH, material->exponent) : 1.0f;
float FI = (MATERIAL.exponent) ?
faster_pow((LdotN != 0.0f) * NdotH, MATERIAL.exponent) : 1.0f;
#define _PROCESS_COMPONENT(X) \
final[X] += (LdotN * light->diffuseMaterial[X] + light->ambientMaterial[X]) \
+ (FI * light->specularMaterial[X]); \
final[X] += (LdotN * LIGHTS[lid].diffuseMaterial[X] + LIGHTS[lid].ambientMaterial[X]) \
+ (FI * LIGHTS[lid].specularMaterial[X]); \
_PROCESS_COMPONENT(0);
_PROCESS_COMPONENT(1);
@ -482,15 +427,12 @@ GL_FORCE_INLINE void _glLightVertexPoint(
float* final, uint8_t lid,
float LdotN, float NdotH, float att) {
Material* material = _glActiveMaterial();
LightSource* light = _glLightAt(lid);
float FI = (material->exponent) ?
faster_pow((LdotN != 0.0f) * NdotH, material->exponent) : 1.0f;
float FI = (MATERIAL.exponent) ?
faster_pow((LdotN != 0.0f) * NdotH, MATERIAL.exponent) : 1.0f;
#define _PROCESS_COMPONENT(X) \
final[X] += ((LdotN * light->diffuseMaterial[X] + light->ambientMaterial[X]) \
+ (FI * light->specularMaterial[X])) * att; \
final[X] += ((LdotN * LIGHTS[lid].diffuseMaterial[X] + LIGHTS[lid].ambientMaterial[X]) \
+ (FI * LIGHTS[lid].specularMaterial[X])) * att; \
_PROCESS_COMPONENT(0);
_PROCESS_COMPONENT(1);
@ -503,8 +445,6 @@ void _glPerformLighting(Vertex* vertices, EyeSpaceData* es, const uint32_t count
GLubyte i;
GLuint j;
Material* material = _glActiveMaterial();
Vertex* vertex = vertices;
EyeSpaceData* data = es;
@ -512,8 +452,7 @@ void _glPerformLighting(Vertex* vertices, EyeSpaceData* es, const uint32_t count
void (*updateColourMaterial)(const GLubyte*) = NULL;
if(_glIsColorMaterialEnabled()) {
GLenum mode = _glColorMaterialMode();
switch(mode) {
switch(COLOR_MATERIAL_MODE) {
case GL_AMBIENT:
updateColourMaterial = _glUpdateColourMaterialA;
break;
@ -536,10 +475,10 @@ void _glPerformLighting(Vertex* vertices, EyeSpaceData* es, const uint32_t count
}
/* Copy the base colour across */
vec4cpy(data->finalColour, material->baseColour);
vec4cpy(data->finalColour, MATERIAL.baseColour);
}
if(!_glEnabledLightCount()) {
if(!ENABLED_LIGHT_COUNT) {
return;
}
@ -557,17 +496,15 @@ void _glPerformLighting(Vertex* vertices, EyeSpaceData* es, const uint32_t count
const float Nz = data->n[2];
for(i = 0; i < MAX_GLDC_LIGHTS; ++i) {
LightSource* light = _glLightAt(i);
if(!light->isEnabled) {
if(!LIGHTS[i].isEnabled) {
continue;
}
float Lx = light->position[0] - vertex->xyz[0];
float Ly = light->position[1] - vertex->xyz[1];
float Lz = light->position[2] - vertex->xyz[2];
float Lx = LIGHTS[i].position[0] - vertex->xyz[0];
float Ly = LIGHTS[i].position[1] - vertex->xyz[1];
float Lz = LIGHTS[i].position[2] - vertex->xyz[2];
if(light->isDirectional) {
if(LIGHTS[i].isDirectional) {
float Hx = (Lx + 0);
float Hy = (Ly + 0);
float Hz = (Lz + 1);
@ -596,9 +533,9 @@ void _glPerformLighting(Vertex* vertices, EyeSpaceData* es, const uint32_t count
VEC3_LENGTH(Lx, Ly, Lz, D);
float att = (
light->constant_attenuation + (
light->linear_attenuation * D
) + (light->quadratic_attenuation * D * D)
LIGHTS[i].constant_attenuation + (
LIGHTS[i].linear_attenuation * D
) + (LIGHTS[i].quadratic_attenuation * D * D)
);
/* Anything over the attenuation threshold will

View File

@ -13,8 +13,8 @@
GLfloat DEPTH_RANGE_MULTIPLIER_L = (1 - 0) / 2;
GLfloat DEPTH_RANGE_MULTIPLIER_H = (0 + 1) / 2;
static Stack __attribute__((aligned(32))) MATRIX_STACKS[4]; // modelview, projection, texture
static Matrix4x4 __attribute__((aligned(32))) NORMAL_MATRIX;
static Stack MATRIX_STACKS[3]; // modelview, projection, texture
static Matrix4x4 NORMAL_MATRIX __attribute__((aligned(32)));
Viewport VIEWPORT = {
0, 0, 640, 480, 320.0f, 240.0f, 320.0f, 240.0f
@ -23,7 +23,7 @@ Viewport VIEWPORT = {
static GLenum MATRIX_MODE = GL_MODELVIEW;
static GLubyte MATRIX_IDX = 0;
static const Matrix4x4 __attribute__((aligned(32))) IDENTITY = {
static const Matrix4x4 IDENTITY = {
1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f,
@ -106,11 +106,7 @@ void APIENTRY glMatrixMode(GLenum mode) {
}
void APIENTRY glPushMatrix() {
void* top = stack_top(MATRIX_STACKS + MATRIX_IDX);
assert(top);
void* ret = stack_push(MATRIX_STACKS + MATRIX_IDX, top);
(void) ret;
assert(ret);
stack_push(MATRIX_STACKS + MATRIX_IDX, stack_top(MATRIX_STACKS + MATRIX_IDX));
}
void APIENTRY glPopMatrix() {
@ -131,16 +127,10 @@ void APIENTRY glTranslatef(GLfloat x, GLfloat y, GLfloat z) {
0.0f, 0.0f, 1.0f, 0.0f,
x, y, z, 1.0f
};
void* top = stack_top(MATRIX_STACKS + MATRIX_IDX);
assert(top);
UploadMatrix4x4(top);
UploadMatrix4x4(stack_top(MATRIX_STACKS + MATRIX_IDX));
MultiplyMatrix4x4(&trn);
top = stack_top(MATRIX_STACKS + MATRIX_IDX);
assert(top);
DownloadMatrix4x4(top);
DownloadMatrix4x4(stack_top(MATRIX_STACKS + MATRIX_IDX));
if(MATRIX_MODE == GL_MODELVIEW) {
recalculateNormalMatrix();
@ -210,9 +200,28 @@ void APIENTRY glRotatef(GLfloat angle, GLfloat x, GLfloat y, GLfloat z) {
/* Load an arbitrary matrix */
void APIENTRY glLoadMatrixf(const GLfloat *m) {
static Matrix4x4 __attribute__((aligned(32))) TEMP;
static Matrix4x4 TEMP;
TEMP[M0] = m[0];
TEMP[M1] = m[1];
TEMP[M2] = m[2];
TEMP[M3] = m[3];
TEMP[M4] = m[4];
TEMP[M5] = m[5];
TEMP[M6] = m[6];
TEMP[M7] = m[7];
TEMP[M8] = m[8];
TEMP[M9] = m[9];
TEMP[M10] = m[10];
TEMP[M11] = m[11];
TEMP[M12] = m[12];
TEMP[M13] = m[13];
TEMP[M14] = m[14];
TEMP[M15] = m[15];
memcpy(TEMP, m, sizeof(float) * 16);
stack_replace(MATRIX_STACKS + MATRIX_IDX, TEMP);
if(MATRIX_MODE == GL_MODELVIEW) {
@ -280,10 +289,18 @@ void APIENTRY glFrustum(GLfloat left, GLfloat right,
/* Multiply the current matrix by an arbitrary matrix */
void glMultMatrixf(const GLfloat *m) {
Matrix4x4 TEMP __attribute__((aligned(32)));
MEMCPY4(TEMP, m, sizeof(Matrix4x4));
const Matrix4x4 *pMatrix;
if (((GLint)m)&0xf){ /* Unaligned matrix */
pMatrix = &TEMP;
MEMCPY4(TEMP, m, sizeof(Matrix4x4));
}
else{
pMatrix = (const Matrix4x4*) m;
}
UploadMatrix4x4(stack_top(MATRIX_STACKS + MATRIX_IDX));
MultiplyMatrix4x4(&TEMP);
MultiplyMatrix4x4(pMatrix);
DownloadMatrix4x4(stack_top(MATRIX_STACKS + MATRIX_IDX));
if(MATRIX_MODE == GL_MODELVIEW) {
@ -409,7 +426,7 @@ GL_FORCE_INLINE void vec3f_normalize_sh4(float *v){
void gluLookAt(GLfloat eyex, GLfloat eyey, GLfloat eyez, GLfloat centerx,
GLfloat centery, GLfloat centerz, GLfloat upx, GLfloat upy,
GLfloat upz) {
GLfloat m [16] __attribute__((aligned(32)));
GLfloat m [16];
GLfloat f [3];
GLfloat u [3];
GLfloat s [3];

View File

@ -3,9 +3,7 @@
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>
#include "gl_assert.h"
#include "types.h"
#include <assert.h>
#define MEMSET(dst, v, size) memset((dst), (v), (size))
@ -261,7 +259,7 @@ typedef float Matrix4x4[16];
void SceneBegin();
void SceneListBegin(GPUList list);
void SceneListSubmit(Vertex* v2, int n);
void SceneListSubmit(void* src, int n);
void SceneListFinish();
void SceneFinish();

View File

@ -4,14 +4,16 @@
#define CLIP_DEBUG 0
#define TA_SQ_ADDR (unsigned int *)(void *) \
(0xe0000000 | (((unsigned long)0x10000000) & 0x03ffffe0))
#define QACRTA ((((unsigned int)0x10000000)>>26)<<2)&0x1c
#define PVR_VERTEX_BUF_SIZE 2560 * 256
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#define SQ_BASE_ADDRESS (void*) 0xe0000000
GL_FORCE_INLINE bool glIsVertex(const float flags) {
return flags == GPU_CMD_VERTEX_EOL || flags == GPU_CMD_VERTEX;
}
@ -20,6 +22,7 @@ GL_FORCE_INLINE bool glIsLastVertex(const float flags) {
return flags == GPU_CMD_VERTEX_EOL;
}
void InitGPU(_Bool autosort, _Bool fsaa) {
pvr_init_params_t params = {
/* Enable opaque and translucent polygons with size 32 and 32 */
@ -31,427 +34,412 @@ void InitGPU(_Bool autosort, _Bool fsaa) {
};
pvr_init(&params);
/* If we're PAL and we're NOT VGA, then use 50hz by default. This is the safest
thing to do. If someone wants to force 60hz then they can call vid_set_mode later and hopefully
that'll work... */
int cable = vid_check_cable();
int region = flashrom_get_region();
if(region == FLASHROM_REGION_EUROPE && cable != CT_VGA) {
printf("PAL region without VGA - enabling 50hz");
vid_set_mode(DM_640x480_PAL_IL, PM_RGB565);
}
}
void SceneBegin() {
pvr_wait_ready();
pvr_scene_begin();
QACR0 = QACRTA;
QACR1 = QACRTA;
}
void SceneListBegin(GPUList list) {
pvr_list_begin(list);
}
GL_FORCE_INLINE float _glFastInvert(float x) {
return (1.f / __builtin_sqrtf(x * x));
}
GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
TRACE();
const float f = _glFastInvert(vertex->w);
const float f = MATH_Fast_Invert(vertex->w);
/* Convert to NDC and apply viewport */
vertex->xyz[0] = (vertex->xyz[0] * f * 320) + 320;
vertex->xyz[1] = (vertex->xyz[1] * f * -240) + 240;
vertex->xyz[0] = __builtin_fmaf(
VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth
);
vertex->xyz[1] = h - __builtin_fmaf(
VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight
);
/* Orthographic projections need to use invZ otherwise we lose
the depth information. As w == 1, and clip-space range is -w to +w
we add 1.0 to the Z to bring it into range. We add a little extra to
avoid a divide by zero.
*/
if(vertex->w == 1.0f) {
vertex->xyz[2] = _glFastInvert(1.0001f + vertex->xyz[2]);
if(unlikely(vertex->w == 1.0f)) {
vertex->xyz[2] = MATH_Fast_Invert(1.0001f + vertex->xyz[2]);
} else {
vertex->xyz[2] = f;
}
}
static uint32_t *d; // SQ target
volatile uint32_t *sq = SQ_BASE_ADDRESS;
GL_FORCE_INLINE void _glSubmitHeaderOrVertex(const Vertex* v) {
#ifndef NDEBUG
assert(!isnan(v->xyz[2]));
assert(!isnan(v->w));
#endif
static inline void _glFlushBuffer() {
TRACE();
#if CLIP_DEBUG
printf("Submitting: %x (%x)\n", v, v->flags);
#endif
/* Wait for both store queues to complete */
sq = (uint32_t*) 0xe0000000;
sq[0] = sq[8] = 0;
uint32_t *s = (uint32_t*) v;
__asm__("pref @%0" : : "r"(s + 8)); /* prefetch 32 bytes for next loop */
d[0] = *(s++);
d[1] = *(s++);
d[2] = *(s++);
d[3] = *(s++);
d[4] = *(s++);
d[5] = *(s++);
d[6] = *(s++);
d[7] = *(s++);
__asm__("pref @%0" : : "r"(d));
d += 8;
}
static inline void _glPushHeaderOrVertex(Vertex* v) {
TRACE();
static struct {
Vertex* v;
int visible;
} triangle[3];
uint32_t* s = (uint32_t*) v;
sq[0] = *(s++);
sq[1] = *(s++);
sq[2] = *(s++);
sq[3] = *(s++);
sq[4] = *(s++);
sq[5] = *(s++);
sq[6] = *(s++);
sq[7] = *(s++);
__asm__("pref @%0" : : "r"(sq));
sq += 8;
static int tri_count = 0;
static int strip_count = 0;
GL_FORCE_INLINE void interpolateColour(const uint8_t* v1, const uint8_t* v2, const float t, uint8_t* out) {
const int MASK1 = 0x00FF00FF;
const int MASK2 = 0xFF00FF00;
const int f2 = 256 * t;
const int f1 = 256 - f2;
const uint32_t a = *(uint32_t*) v1;
const uint32_t b = *(uint32_t*) v2;
*((uint32_t*) out) = (((((a & MASK1) * f1) + ((b & MASK1) * f2)) >> 8) & MASK1) |
(((((a & MASK2) * f1) + ((b & MASK2) * f2)) >> 8) & MASK2);
}
static inline void _glClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout) {
const static float o = 0.003921569f; // 1 / 255
GL_FORCE_INLINE void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) {
/* Clipping time! */
const float d0 = v1->w + v1->xyz[2];
const float d1 = v2->w + v2->xyz[2];
const float t = (fabs(d0) * (1.0f / sqrtf((d1 - d0) * (d1 - d0)))) + 0.000001f;
const float invt = 1.0f - t;
vout->xyz[0] = invt * v1->xyz[0] + t * v2->xyz[0];
vout->xyz[1] = invt * v1->xyz[1] + t * v2->xyz[1];
vout->xyz[2] = invt * v1->xyz[2] + t * v2->xyz[2];
const float epsilon = (d0 < d1) ? -0.00001f : 0.00001f;
vout->uv[0] = invt * v1->uv[0] + t * v2->uv[0];
vout->uv[1] = invt * v1->uv[1] + t * v2->uv[1];
float t = MATH_Fast_Divide(d0, (d0 - d1)) + epsilon;
vout->w = invt * v1->w + t * v2->w;
t = (t > 1.0f) ? 1.0f : t;
t = (t < 0.0f) ? 0.0f : t;
const float m = 255 * t;
const float n = 255 - m;
vout->xyz[0] = __builtin_fmaf(v2->xyz[0] - v1->xyz[0], t, v1->xyz[0]);
vout->xyz[1] = __builtin_fmaf(v2->xyz[1] - v1->xyz[1], t, v1->xyz[1]);
vout->xyz[2] = __builtin_fmaf(v2->xyz[2] - v1->xyz[2], t, v1->xyz[2]);
vout->w = __builtin_fmaf(v2->w - v1->w, t, v1->w);
vout->bgra[0] = (v1->bgra[0] * n + v2->bgra[0] * m) * o;
vout->bgra[1] = (v1->bgra[1] * n + v2->bgra[1] * m) * o;
vout->bgra[2] = (v1->bgra[2] * n + v2->bgra[2] * m) * o;
vout->bgra[3] = (v1->bgra[3] * n + v2->bgra[3] * m) * o;
vout->uv[0] = __builtin_fmaf(v2->uv[0] - v1->uv[0], t, v1->uv[0]);
vout->uv[1] = __builtin_fmaf(v2->uv[1] - v1->uv[1], t, v1->uv[1]);
interpolateColour(v1->bgra, v2->bgra, t, vout->bgra);
}
#define SPAN_SORT_CFG 0x005F8030
static volatile uint32_t* PVR_LMMODE0 = (uint32_t*) 0xA05F6884;
static volatile uint32_t *PVR_LMMODE1 = (uint32_t*) 0xA05F6888;
static volatile uint32_t *QACR = (uint32_t*) 0xFF000038;
GL_FORCE_INLINE void ClearTriangle() {
tri_count = 0;
}
void SceneListSubmit(Vertex* v2, int n) {
TRACE();
/* You need at least a header, and 3 vertices to render anything */
if(n < 4) {
GL_FORCE_INLINE void ShiftTriangle() {
if(!tri_count) {
return;
}
tri_count--;
triangle[0] = triangle[1];
triangle[1] = triangle[2];
#ifndef NDEBUG
triangle[2].v = NULL;
triangle[2].visible = false;
#endif
}
GL_FORCE_INLINE void ShiftRotateTriangle() {
if(!tri_count) {
return;
}
if(triangle[0].v < triangle[1].v) {
triangle[0] = triangle[2];
} else {
triangle[1] = triangle[2];
}
tri_count--;
}
void SceneListSubmit(void* src, int n) {
/* Do everything, everywhere, all at once */
/* Prep store queues */
d = (uint32_t*) TA_SQ_ADDR;
/* Perform perspective divide on each vertex */
Vertex* vertex = (Vertex*) src;
const float h = GetVideoMode()->height;
PVR_SET(SPAN_SORT_CFG, 0x0);
if(!ZNEAR_CLIPPING_ENABLED) {
for(int i = 0; i < n; ++i, ++vertex) {
PREFETCH(vertex + 1);
if(glIsVertex(vertex->flags)) {
_glPerspectiveDivideVertex(vertex, h);
}
_glSubmitHeaderOrVertex(vertex);
}
//Set PVR DMA registers
*PVR_LMMODE0 = 0;
*PVR_LMMODE1 = 0;
/* Wait for both store queues to complete */
d = (uint32_t *)0xe0000000;
d[0] = d[8] = 0;
//Set QACR registers
QACR[1] = QACR[0] = 0x11;
return;
}
tri_count = 0;
strip_count = 0;
#if CLIP_DEBUG
Vertex* vertex = (Vertex*) src;
for(int i = 0; i < n; ++i) {
fprintf(stderr, "{%f, %f, %f, %f}, // %x (%x)\n", vertex[i].xyz[0], vertex[i].xyz[1], vertex[i].xyz[2], vertex[i].w, vertex[i].flags, &vertex[i]);
}
fprintf(stderr, "----\n");
printf("----\n");
#endif
uint8_t visible_mask = 0;
uint8_t counter = 0;
sq = SQ_BASE_ADDRESS;
for(int i = 0; i < n; ++i, ++vertex) {
PREFETCH(vertex + 1);
for(int i = 0; i < n; ++i, ++v2) {
PREFETCH(v2 + 1);
switch(v2->flags) {
case GPU_CMD_VERTEX_EOL:
if(counter < 2) {
bool is_last_in_strip = glIsLastVertex(vertex->flags);
/* Wait until we fill the triangle */
if(tri_count < 3) {
if(likely(glIsVertex(vertex->flags))) {
triangle[tri_count].v = vertex;
triangle[tri_count].visible = vertex->xyz[2] >= -vertex->w;
tri_count++;
strip_count++;
} else {
/* We hit a header */
tri_count = 0;
strip_count = 0;
_glSubmitHeaderOrVertex(vertex);
}
if(tri_count < 3) {
continue;
}
counter = 0;
break;
case GPU_CMD_VERTEX:
++counter;
if(counter < 3) {
continue;
}
break;
default:
_glPushHeaderOrVertex(v2);
counter = 0;
continue;
};
Vertex* const v0 = v2 - 2;
Vertex* const v1 = v2 - 1;
visible_mask = (
(v0->xyz[2] > -v0->w) << 0 |
(v1->xyz[2] > -v1->w) << 1 |
(v2->xyz[2] > -v2->w) << 2 |
(counter == 0) << 3
);
switch(visible_mask) {
case 15: /* All visible, but final vertex in strip */
{
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(v1, h);
_glPushHeaderOrVertex(v1);
_glPerspectiveDivideVertex(v2, h);
_glPushHeaderOrVertex(v2);
}
break;
case 7:
/* All visible, push the first vertex and move on */
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
break;
case 9:
/* First vertex was visible, last in strip */
{
Vertex __attribute__((aligned(32))) scratch[2];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
#if CLIP_DEBUG
printf("SC: %d\n", strip_count);
#endif
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX_EOL;
/* If we got here, then triangle contains 3 vertices */
int visible_mask = triangle[0].visible | (triangle[1].visible << 1) | (triangle[2].visible << 2);
if(visible_mask == 7) {
#if CLIP_DEBUG
printf("Visible\n");
#endif
/* All the vertices are visible! We divide and submit v0, then shift */
_glPerspectiveDivideVertex(vertex - 2, h);
_glSubmitHeaderOrVertex(vertex - 2);
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
if(is_last_in_strip) {
_glPerspectiveDivideVertex(vertex - 1, h);
_glSubmitHeaderOrVertex(vertex - 1);
_glPerspectiveDivideVertex(vertex, h);
_glSubmitHeaderOrVertex(vertex);
tri_count = 0;
strip_count = 0;
}
break;
case 1:
/* First vertex was visible, but not last in strip */
{
Vertex __attribute__((aligned(32))) scratch[2];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
ShiftRotateTriangle();
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
} else if(visible_mask) {
/* Clipping time!
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
There are 6 distinct possibilities when clipping a triangle. 3 of them result
in another triangle, 3 of them result in a quadrilateral.
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
Assuming you iterate the edges of the triangle in order, and create a new *visible*
vertex when you cross the plane, and discard vertices behind the plane, then the only
difference between the two cases is that the final two vertices that need submitting have
to be reversed.
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(b);
Unfortunately we have to copy vertices here, because if we persp-divide a vertex it may
be used in a subsequent triangle in the strip and would end up being double divided.
*/
#if CLIP_DEBUG
printf("Clip: %d, SC: %d\n", visible_mask, strip_count);
printf("%d, %d, %d\n", triangle[0].v - (Vertex*) src - 1, triangle[1].v - (Vertex*) src - 1, triangle[2].v - (Vertex*) src - 1);
#endif
Vertex tmp;
if(strip_count > 3) {
#if CLIP_DEBUG
printf("Flush\n");
#endif
tmp = *(vertex - 2);
/* If we had triangles ahead of this one, submit and finalize */
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
tmp = *(vertex - 1);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
}
break;
case 10:
case 2:
/* Second vertex was visible. In self case we need to create a triangle and produce
two new vertices: 1-2, and 2-3. */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
switch(visible_mask) {
case 1: {
/* 0, 0a, 2a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(v1, v2, b);
b->flags = v2->flags;
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
case 2: {
/* 0a, 1, 1a */
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
case 3: {
/* 0, 1, 2a, 1a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
case 4: {
/* 1a, 2, 2a */
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
case 5: {
/* 0, 0a, 2, 1a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
case 6: {
/* 0a, 1, 2a, 2 */
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(&tmp);
} break;
default:
break;
}
break;
case 11:
case 3: /* First and second vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glClipEdge(v1, v2, a);
a->flags = v2->flags;
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(c);
_glPushHeaderOrVertex(a);
}
break;
case 12:
case 4:
/* Third vertex was visible. */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
_glClipEdge(v2, v0, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
if(counter % 2 == 1) {
_glPushHeaderOrVertex(a);
}
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
/* If this was the last in the strip, we don't need to
submit anything else, we just wipe the tri_count */
if(is_last_in_strip) {
tri_count = 0;
strip_count = 0;
} else {
ShiftRotateTriangle();
strip_count = 2;
}
break;
case 13:
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
} else {
/* Invisible? Move to the next in the strip */
memcpy_vertex(c, v2);
c->flags = GPU_CMD_VERTEX;
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
c->flags = GPU_CMD_VERTEX_EOL;
_glPushHeaderOrVertex(c);
}
break;
case 5: /* First and third vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
c->flags = GPU_CMD_VERTEX;
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(c);
}
break;
case 14:
case 6: /* Second and third vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[4];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
Vertex* d = &scratch[3];
memcpy_vertex(c, v1);
memcpy_vertex(d, v2);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(d, h);
_glPushHeaderOrVertex(d);
}
break;
case 8:
default:
break;
if(is_last_in_strip) {
tri_count = 0;
strip_count = 0;
}
strip_count = 2;
ShiftRotateTriangle();
}
}
_glFlushBuffer();
/* Wait for both store queues to complete */
d = (uint32_t *)0xe0000000;
d[0] = d[8] = 0;
}
void SceneListFinish() {

View File

@ -24,50 +24,21 @@
#define GL_FORCE_INLINE static GL_INLINE_DEBUG
#endif
#define PREFETCH(addr) __builtin_prefetch((addr))
/* Copy `len` bytes from `src` to `dest` one byte per iteration using an
 * SH4 dt/bf.s delayed-branch loop, then return `dest` (memcpy-style).
 * A zero length is handled up front so the dt-countdown never underflows.
 * NOTE(review): all pointer arithmetic is done in uint32_t, so this is
 * only correct on 32-bit targets (Dreamcast) — confirm this is never
 * compiled for a 64-bit host build. */
GL_FORCE_INLINE void* memcpy_fast(void *dest, const void *src, size_t len) {
    if(!len) {
        return dest;
    }

    const uint8_t *s = (uint8_t *)src;
    uint8_t *d = (uint8_t *)dest;

    /* The loop stores through @(offset, in) after `in` has already been
     * post-incremented, hence the (s + 1) below. */
    uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because input gets incremented before output is calculated
    // Underflow would be like adding a negative offset

    // Can use 'd' as a scratch reg now
    asm volatile (
        "clrs\n" // Align for parallelism (CO) - SH4a use "stc SR, Rn" instead with a dummy Rn
        ".align 2\n"
        "0:\n\t"
        "dt %[size]\n\t" // (--len) ? 0 -> T : 1 -> T (EX 1)
        "mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2)
        "bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2)
        " mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*) ((char*)s + diff) = scratch, where src + diff = dest (LS 1)
        : [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs
        : [offset] "z" (diff) // inputs
        : "t", "memory" // clobbers
    );

    return dest;
}
#define PREFETCH(addr) __asm__("pref @%0" : : "r"((addr)))
/* We use sq_cpy if the src and size is properly aligned. We control that the
* destination is properly aligned so we assert that. */
#define FASTCPY(dst, src, bytes) \
do { \
if(bytes % 32 == 0 && ((uintptr_t) src % 4) == 0) { \
gl_assert(((uintptr_t) dst) % 32 == 0); \
assert(((uintptr_t) dst) % 32 == 0); \
sq_cpy(dst, src, bytes); \
} else { \
memcpy_fast(dst, src, bytes); \
memcpy(dst, src, bytes); \
} \
} while(0)
#define MEMCPY4(dst, src, bytes) memcpy_fast(dst, src, bytes)
#define MEMCPY4(dst, src, bytes) memcpy4(dst, src, bytes)
#define MEMSET4(dst, v, size) memset4((dst), (v), (size))

View File

@ -10,9 +10,8 @@
#include "software/parameter_equation.h"
#define CLIP_DEBUG 0
#define ZNEAR_CLIPPING_ENABLED 1
static size_t AVAILABLE_VRAM = 8 * 1024 * 1024;
static size_t AVAILABLE_VRAM = 16 * 1024 * 1024;
static Matrix4x4 MATRIX;
static SDL_Window* WINDOW = NULL;
@ -30,13 +29,83 @@ static VideoMode vid_mode = {
#define MIN(x, y) ((x) < (y) ? (x) : (y))
#define MAX(x, y) ((x) > (y) ? (x) : (y))
AlignedVector vbuffer;
/* Rasterize one triangle with SDL. xyz[0]/xyz[1] of each Vertex are used
 * directly as pixel coordinates; per-vertex colour (bgra) is interpolated
 * across the triangle via edge/parameter equations and each covered pixel
 * is drawn with SDL_RenderDrawPoint. Honours the global CULL_MODE; when a
 * back-facing triangle must still be drawn, v0/v1 are swapped and the
 * edge equations rebuilt so the parameter interpolation stays valid. */
static void DrawTriangle(Vertex* v0, Vertex* v1, Vertex* v2) {
    // Compute triangle bounding box.
    int minX = MIN(MIN(v0->xyz[0], v1->xyz[0]), v2->xyz[0]);
    int maxX = MAX(MAX(v0->xyz[0], v1->xyz[0]), v2->xyz[0]);
    int minY = MIN(MIN(v0->xyz[1], v1->xyz[1]), v2->xyz[1]);
    int maxY = MAX(MAX(v0->xyz[1], v1->xyz[1]), v2->xyz[1]);

    // Clip to scissor rect.
    /* NOTE(review): clamps to width/height rather than width-1/height-1,
     * so the sampling loops below can touch one pixel past the last
     * row/column — confirm intended. */
    minX = MAX(minX, 0);
    maxX = MIN(maxX, vid_mode.width);
    minY = MAX(minY, 0);
    maxY = MIN(maxY, vid_mode.height);

    // Compute edge equations.
    EdgeEquation e0, e1, e2;
    EdgeEquationInit(&e0, &v0->xyz[0], &v1->xyz[0]);
    EdgeEquationInit(&e1, &v1->xyz[0], &v2->xyz[0]);
    EdgeEquationInit(&e2, &v2->xyz[0], &v0->xyz[0]);

    /* Signed area: the sign encodes the winding direction. */
    float area = 0.5 * (e0.c + e1.c + e2.c);

    /* This is very ugly. I don't understand the math properly
     * so I just swap the vertex order if something is back-facing
     * and we want to render it. Patches welcome! */
#define REVERSE_WINDING() \
    Vertex* tv = v0; \
    v0 = v1; \
    v1 = tv; \
    EdgeEquationInit(&e0, &v0->xyz[0], &v1->xyz[0]); \
    EdgeEquationInit(&e1, &v1->xyz[0], &v2->xyz[0]); \
    EdgeEquationInit(&e2, &v2->xyz[0], &v0->xyz[0]); \
    area = 0.5f * (e0.c + e1.c + e2.c) \

    // Check if triangle is backfacing.
    if(CULL_MODE == GPU_CULLING_CCW) {
        if(area < 0) {
            return;
        }
    } else if(CULL_MODE == GPU_CULLING_CW) {
        /* NOTE(review): in this branch positive-area triangles are
         * dropped and negative-area ones are rewound and drawn; verify
         * this matches the intended CW-culling convention — the comments
         * below read the opposite way. */
        if(area < 0) {
            // We only draw front-facing polygons, so swap
            // the back to front and draw
            REVERSE_WINDING();
        } else {
            // Front facing, so bail
            return;
        }
    } else if(area < 0) {
        /* We're not culling, but this is backfacing, so swap vertices and edges */
        REVERSE_WINDING();
    }

    /* Colour interpolants: bgra[2]=red, bgra[1]=green, bgra[0]=blue. */
    ParameterEquation r, g, b;
    ParameterEquationInit(&r, v0->bgra[2], v1->bgra[2], v2->bgra[2], &e0, &e1, &e2, area);
    ParameterEquationInit(&g, v0->bgra[1], v1->bgra[1], v2->bgra[1], &e0, &e1, &e2, area);
    ParameterEquationInit(&b, v0->bgra[0], v1->bgra[0], v2->bgra[0], &e0, &e1, &e2, area);

    // Add 0.5 to sample at pixel centers.
    for (float x = minX + 0.5f, xm = maxX + 0.5f; x <= xm; x += 1.0f)
    for (float y = minY + 0.5f, ym = maxY + 0.5f; y <= ym; y += 1.0f)
    {
        /* A pixel is inside the triangle iff it passes all three edge
         * tests. */
        if (EdgeEquationTestPoint(&e0, x, y) && EdgeEquationTestPoint(&e1, x, y) && EdgeEquationTestPoint(&e2, x, y)) {
            int rint = ParameterEquationEvaluate(&r, x, y);
            int gint = ParameterEquationEvaluate(&g, x, y);
            int bint = ParameterEquationEvaluate(&b, x, y);
            SDL_SetRenderDrawColor(RENDERER, rint, gint, bint, 255);
            SDL_RenderDrawPoint(RENDERER, x, y);
        }
    }
}
void InitGPU(_Bool autosort, _Bool fsaa) {
// 32-bit SDL has trouble with the wayland driver for some reason
setenv("SDL_VIDEODRIVER", "x11", 1);
SDL_Init(SDL_INIT_VIDEO | SDL_INIT_EVENTS);
WINDOW = SDL_CreateWindow(
@ -50,8 +119,6 @@ void InitGPU(_Bool autosort, _Bool fsaa) {
RENDERER = SDL_CreateRenderer(
WINDOW, -1, SDL_RENDERER_ACCELERATED
);
aligned_vector_init(&vbuffer, sizeof(SDL_Vertex));
}
void SceneBegin() {
@ -94,11 +161,11 @@ GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
}
}
GL_FORCE_INLINE void _glPushHeaderOrVertex(const Vertex* v) {
GL_FORCE_INLINE void _glSubmitHeaderOrVertex(const Vertex* v) {
#ifndef NDEBUG
if(glIsVertex(v->flags)) {
gl_assert(!isnan(v->xyz[2]));
gl_assert(!isnan(v->w));
assert(!isnan(v->xyz[2]));
assert(!isnan(v->w));
}
#endif
@ -109,329 +176,335 @@ GL_FORCE_INLINE void _glPushHeaderOrVertex(const Vertex* v) {
BUFFER[vertex_counter++] = *v;
}
/* No-op placeholder in the software backend (other backends flush a
 * hardware queue here). */
static inline void _glFlushBuffer() {}
/* Sliding three-vertex window over the triangle strip currently being
 * assembled. Each slot holds the source vertex plus a flag recording
 * whether it lies in front of the near plane (filled in by
 * SceneListSubmit). */
static struct {
    Vertex* v;
    int visible;
} triangle[3];

/* Number of populated slots in triangle[] (0..3). */
static int tri_count = 0;
/* Vertices consumed so far from the current strip; SceneListSubmit uses
 * it (strip_count > 3) to know whether earlier strip vertices were
 * already submitted and need finalizing before clipped output. */
static int strip_count = 0;
GL_FORCE_INLINE void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) {
const static float o = 0.003921569f; // 1 / 255
const float d0 = v1->w + v1->xyz[2];
const float d1 = v2->w + v2->xyz[2];
const float t = (fabs(d0) * (1.0f / sqrtf((d1 - d0) * (d1 - d0)))) + 0.000001f;
const float invt = 1.0f - t;
GL_FORCE_INLINE void interpolateColour(const uint8_t* v1, const uint8_t* v2, const float t, uint8_t* out) {
const int MASK1 = 0x00FF00FF;
const int MASK2 = 0xFF00FF00;
vout->xyz[0] = invt * v1->xyz[0] + t * v2->xyz[0];
vout->xyz[1] = invt * v1->xyz[1] + t * v2->xyz[1];
vout->xyz[2] = invt * v1->xyz[2] + t * v2->xyz[2];
const int f2 = 256 * t;
const int f1 = 256 - f2;
vout->uv[0] = invt * v1->uv[0] + t * v2->uv[0];
vout->uv[1] = invt * v1->uv[1] + t * v2->uv[1];
const uint32_t a = *(uint32_t*) v1;
const uint32_t b = *(uint32_t*) v2;
vout->w = invt * v1->w + t * v2->w;
const float m = 255 * t;
const float n = 255 - m;
vout->bgra[0] = (v1->bgra[0] * n + v2->bgra[0] * m) * o;
vout->bgra[1] = (v1->bgra[1] * n + v2->bgra[1] * m) * o;
vout->bgra[2] = (v1->bgra[2] * n + v2->bgra[2] * m) * o;
vout->bgra[3] = (v1->bgra[3] * n + v2->bgra[3] * m) * o;
*((uint32_t*) out) = (((((a & MASK1) * f1) + ((b & MASK1) * f2)) >> 8) & MASK1) |
(((((a & MASK2) * f1) + ((b & MASK2) * f2)) >> 8) & MASK2);
}
void SceneListSubmit(Vertex* v2, int n) {
/* You need at least a header, and 3 vertices to render anything */
if(n < 4) {
/* Intersect the edge v1 -> v2 with the near plane (w + z == 0) and write
 * the interpolated vertex to vout. Position, w and UVs are lerped with
 * fused multiply-adds; colour is blended by interpolateColour(). The
 * interpolation factor is biased by a tiny epsilon (sign taken from the
 * edge direction) so the result never sits exactly on the plane, and is
 * clamped to [0, 1]. vout->flags is left for the caller to assign. */
GL_FORCE_INLINE void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) {
    /* Signed distances of both endpoints from the near plane. */
    const float dist1 = v1->w + v1->xyz[2];
    const float dist2 = v2->w + v2->xyz[2];

    float t = (dist1 / (dist1 - dist2)) + ((dist1 < dist2) ? -0.00001f : 0.00001f);
    if(t > 1.0f) t = 1.0f;
    if(t < 0.0f) t = 0.0f;

    for(int j = 0; j < 3; ++j) {
        vout->xyz[j] = __builtin_fmaf(v2->xyz[j] - v1->xyz[j], t, v1->xyz[j]);
    }

    vout->w = __builtin_fmaf(v2->w - v1->w, t, v1->w);

    for(int j = 0; j < 2; ++j) {
        vout->uv[j] = __builtin_fmaf(v2->uv[j] - v1->uv[j], t, v1->uv[j]);
    }

    interpolateColour(v1->bgra, v2->bgra, t, vout->bgra);
}
/* Discard any partially-assembled triangle in the strip window. */
GL_FORCE_INLINE void ClearTriangle() {
    tri_count = 0;
}
/* Drop the oldest vertex of the triangle window: shift the remaining two
 * entries down one slot so the next incoming vertex lands in triangle[2].
 * No-op when the window is empty. */
GL_FORCE_INLINE void ShiftTriangle() {
    if(!tri_count) {
        return;
    }

    /* NOTE(review): 'h' is computed but never used in this function —
     * looks like a stray line; confirm against project history. */
    const float h = GetVideoMode()->height;

    tri_count--;
    triangle[0] = triangle[1];
    triangle[1] = triangle[2];

    /* NOTE(review): these two locals are also unused here — verify they
     * belong in this function. */
    uint8_t visible_mask = 0;
    uint8_t counter = 0;

#ifndef NDEBUG
    /* Poison the vacated slot in debug builds so stale reads are obvious. */
    triangle[2].v = NULL;
    triangle[2].visible = false;
#endif
}
for(int i = 0; i < n; ++i, ++v2) {
PREFETCH(v2 + 1);
switch(v2->flags) {
case GPU_CMD_VERTEX_EOL:
if(counter < 2) {
continue;
}
counter = 0;
break;
case GPU_CMD_VERTEX:
++counter;
if(counter < 3) {
continue;
}
break;
default:
_glPushHeaderOrVertex(v2);
counter = 0;
continue;
};
Vertex* const v0 = v2 - 2;
Vertex* const v1 = v2 - 1;
visible_mask = (
(v0->xyz[2] > -v0->w) << 0 |
(v1->xyz[2] > -v1->w) << 1 |
(v2->xyz[2] > -v2->w) << 2 |
(counter == 0) << 3
);
switch(visible_mask) {
case 15: /* All visible, but final vertex in strip */
{
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(v1, h);
_glPushHeaderOrVertex(v1);
_glPerspectiveDivideVertex(v2, h);
_glPushHeaderOrVertex(v2);
}
break;
case 7:
/* All visible, push the first vertex and move on */
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
break;
case 9:
/* First vertex was visible, last in strip */
{
Vertex __attribute__((aligned(32))) scratch[2];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
}
break;
case 1:
/* First vertex was visible, but not last in strip */
{
Vertex __attribute__((aligned(32))) scratch[2];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(b);
}
break;
case 10:
case 2:
/* Second vertex was visible. In self case we need to create a triangle and produce
two new vertices: 1-2, and 2-3. */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = v2->flags;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
}
break;
case 11:
case 3: /* First and second vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v1);
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glClipEdge(v1, v2, a);
a->flags = v2->flags;
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(c);
_glPushHeaderOrVertex(a);
}
break;
case 12:
case 4:
/* Third vertex was visible. */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
_glClipEdge(v2, v0, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
if(counter % 2 == 1) {
_glPushHeaderOrVertex(a);
}
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
}
break;
case 13:
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
c->flags = GPU_CMD_VERTEX;
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
c->flags = GPU_CMD_VERTEX_EOL;
_glPushHeaderOrVertex(c);
}
break;
case 5: /* First and third vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[3];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
memcpy_vertex(c, v2);
c->flags = GPU_CMD_VERTEX;
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v1, v2, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPushHeaderOrVertex(v0);
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(c);
}
break;
case 14:
case 6: /* Second and third vertex were visible */
{
Vertex __attribute__((aligned(32))) scratch[4];
Vertex* a = &scratch[0];
Vertex* b = &scratch[1];
Vertex* c = &scratch[2];
Vertex* d = &scratch[3];
memcpy_vertex(c, v1);
memcpy_vertex(d, v2);
_glClipEdge(v0, v1, a);
a->flags = GPU_CMD_VERTEX;
_glClipEdge(v2, v0, b);
b->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(a, h);
_glPushHeaderOrVertex(a);
_glPerspectiveDivideVertex(c, h);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(b, h);
_glPushHeaderOrVertex(b);
_glPushHeaderOrVertex(c);
_glPerspectiveDivideVertex(d, h);
_glPushHeaderOrVertex(d);
}
break;
case 8:
default:
break;
}
/* Retire one vertex from the triangle window: whichever of triangle[0] /
 * triangle[1] points at the earlier source vertex (compared by address)
 * is replaced by triangle[2], then the population count drops by one.
 * No-op when the window is empty. */
GL_FORCE_INLINE void ShiftRotateTriangle() {
    if(!tri_count) {
        return;
    }

    /* NOTE(review): this flush call (a no-op in this backend) looks out
     * of place in the middle of a window-shift helper — confirm it
     * belongs here. */
    _glFlushBuffer();

    if(triangle[0].v < triangle[1].v) {
        triangle[0] = triangle[2];
    } else {
        triangle[1] = triangle[2];
    }

    tri_count--;
}
/* Feed a display list (GPU headers + triangle-strip vertices) into the
 * software backend's vertex buffer, clipping each strip triangle against
 * the near plane when ZNEAR_CLIPPING_ENABLED is set.
 *
 * src: array of n Vertex records. Records whose flags are not vertex
 *      commands are forwarded unchanged as headers (and reset the strip).
 * n:   number of records at src.
 *
 * Mutates the file-scope strip state: triangle[], tri_count, strip_count.
 * A vertex counts as "visible" when z >= -w (in front of the near plane). */
void SceneListSubmit(void* src, int n) {
    /* Perform perspective divide on each vertex */
    Vertex* vertex = (Vertex*) src;

    /* Viewport height, required by _glPerspectiveDivideVertex. */
    const float h = GetVideoMode()->height;

    /* If Z-clipping is disabled, just fire everything over to the buffer */
    if(!ZNEAR_CLIPPING_ENABLED) {
        for(int i = 0; i < n; ++i, ++vertex) {
            PREFETCH(vertex + 1);
            if(glIsVertex(vertex->flags)) {
                _glPerspectiveDivideVertex(vertex, h);
            }
            _glSubmitHeaderOrVertex(vertex);
        }

        return;
    }

    tri_count = 0;
    strip_count = 0;

#if CLIP_DEBUG
    printf("----\n");
#endif

    for(int i = 0; i < n; ++i, ++vertex) {
        PREFETCH(vertex + 1);
        bool is_last_in_strip = glIsLastVertex(vertex->flags);

        /* Wait until we fill the triangle */
        if(tri_count < 3) {
            if(glIsVertex(vertex->flags)) {
                triangle[tri_count].v = vertex;
                /* "visible" == in front of the near plane. */
                triangle[tri_count].visible = vertex->xyz[2] >= -vertex->w;
                tri_count++;
                strip_count++;
            } else {
                /* We hit a header */
                tri_count = 0;
                strip_count = 0;
                _glSubmitHeaderOrVertex(vertex);
            }

            if(tri_count < 3) {
                continue;
            }
        }

#if CLIP_DEBUG
        printf("SC: %d\n", strip_count);
#endif

        /* If we got here, then triangle contains 3 vertices.
         * Bit N of visible_mask is set when triangle[N] is in front of
         * the near plane. */
        int visible_mask = triangle[0].visible | (triangle[1].visible << 1) | (triangle[2].visible << 2);

        if(visible_mask == 7) {
#if CLIP_DEBUG
            printf("Visible\n");
#endif
            /* All the vertices are visible! We divide and submit v0, then shift */
            _glPerspectiveDivideVertex(vertex - 2, h);
            _glSubmitHeaderOrVertex(vertex - 2);

            if(is_last_in_strip) {
                /* End of strip: the remaining two vertices get flushed
                 * too, and the window resets for the next strip. */
                _glPerspectiveDivideVertex(vertex - 1, h);
                _glSubmitHeaderOrVertex(vertex - 1);
                _glPerspectiveDivideVertex(vertex, h);
                _glSubmitHeaderOrVertex(vertex);
                tri_count = 0;
                strip_count = 0;
            }

            ShiftRotateTriangle();
        } else if(visible_mask) {
            /* Clipping time!

            There are 6 distinct possibilities when clipping a triangle. 3 of them result
            in another triangle, 3 of them result in a quadrilateral.

            Assuming you iterate the edges of the triangle in order, and create a new *visible*
            vertex when you cross the plane, and discard vertices behind the plane, then the only
            difference between the two cases is that the final two vertices that need submitting have
            to be reversed.

            Unfortunately we have to copy vertices here, because if we persp-divide a vertex it may
            be used in a subsequent triangle in the strip and would end up being double divided.
            */
#if CLIP_DEBUG
            printf("Clip: %d, SC: %d\n", visible_mask, strip_count);
            printf("%d, %d, %d\n", triangle[0].v - (Vertex*) src - 1, triangle[1].v - (Vertex*) src - 1, triangle[2].v - (Vertex*) src - 1);
#endif
            Vertex tmp;
            if(strip_count > 3) {
#if CLIP_DEBUG
                printf("Flush\n");
#endif
                tmp = *(vertex - 2);
                /* If we had triangles ahead of this one, submit and finalize */
                _glPerspectiveDivideVertex(&tmp, h);
                _glSubmitHeaderOrVertex(&tmp);

                tmp = *(vertex - 1);
                tmp.flags = GPU_CMD_VERTEX_EOL;
                _glPerspectiveDivideVertex(&tmp, h);
                _glSubmitHeaderOrVertex(&tmp);
            }

            /* In the labels below, "N" is the original vertex triangle[N]
             * and "Na" is a new vertex produced by clipping an edge that
             * touches vertex N (see the _glClipEdge calls in each case).
             * Triangle cases end with GPU_CMD_VERTEX_EOL on the third
             * output vertex; quad cases on the fourth. */
            switch(visible_mask) {
                case 1: {
                    /* 0, 0a, 2a */
                    /* Only triangle[0] visible -> clipped triangle. */
                    tmp = *triangle[0].v;
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    _glClipEdge(triangle[0].v, triangle[1].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    _glClipEdge(triangle[2].v, triangle[0].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX_EOL;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);
                } break;
                case 2: {
                    /* 0a, 1, 1a */
                    /* Only triangle[1] visible -> clipped triangle. */
                    _glClipEdge(triangle[0].v, triangle[1].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    tmp = *triangle[1].v;
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    _glClipEdge(triangle[1].v, triangle[2].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX_EOL;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);
                } break;
                case 3: {
                    /* 0, 1, 2a, 1a */
                    /* triangle[0] and triangle[1] visible -> quad. */
                    tmp = *triangle[0].v;
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    tmp = *triangle[1].v;
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    _glClipEdge(triangle[2].v, triangle[0].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    _glClipEdge(triangle[1].v, triangle[2].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX_EOL;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);
                } break;
                case 4: {
                    /* 1a, 2, 2a */
                    /* Only triangle[2] visible -> clipped triangle. */
                    _glClipEdge(triangle[1].v, triangle[2].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    tmp = *triangle[2].v;
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    _glClipEdge(triangle[2].v, triangle[0].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX_EOL;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);
                } break;
                case 5: {
                    /* 0, 0a, 2, 1a */
                    /* triangle[0] and triangle[2] visible -> quad. */
                    tmp = *triangle[0].v;
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    _glClipEdge(triangle[0].v, triangle[1].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    tmp = *triangle[2].v;
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    _glClipEdge(triangle[1].v, triangle[2].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX_EOL;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);
                } break;
                case 6: {
                    /* 0a, 1, 2a, 2 */
                    /* triangle[1] and triangle[2] visible -> quad. */
                    _glClipEdge(triangle[0].v, triangle[1].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    tmp = *triangle[1].v;
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    _glClipEdge(triangle[2].v, triangle[0].v, &tmp);
                    tmp.flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);

                    tmp = *triangle[2].v;
                    tmp.flags = GPU_CMD_VERTEX_EOL;
                    _glPerspectiveDivideVertex(&tmp, h);
                    _glSubmitHeaderOrVertex(&tmp);
                } break;
                default:
                    break;
            }

            /* If this was the last in the strip, we don't need to
            submit anything else, we just wipe the tri_count */
            if(is_last_in_strip) {
                tri_count = 0;
                strip_count = 0;
            } else {
                ShiftRotateTriangle();
                strip_count = 2;
            }
        } else {
            /* Invisible? Move to the next in the strip */
            /* NOTE(review): strip_count is unconditionally set to 2 below,
             * overwriting the reset to 0 performed when is_last_in_strip —
             * the clipped branch above orders these the other way.
             * Confirm this is intentional. */
            if(is_last_in_strip) {
                tri_count = 0;
                strip_count = 0;
            }

            strip_count = 2;
            ShiftRotateTriangle();
        }
    }
}
void SceneListFinish() {
@ -463,41 +536,18 @@ void SceneListFinish() {
Vertex* v0 = (Vertex*) (flags - step - step);
Vertex* v1 = (Vertex*) (flags - step);
Vertex* v2 = (Vertex*) (flags);
SDL_Vertex sv0 = {
{v0->xyz[0], v0->xyz[1]},
{v0->bgra[2], v0->bgra[1], v0->bgra[0], v0->bgra[3]},
{v0->uv[0], v0->uv[1]}
};
SDL_Vertex sv1 = {
{v1->xyz[0], v1->xyz[1]},
{v1->bgra[2], v1->bgra[1], v1->bgra[0], v1->bgra[3]},
{v1->uv[0], v1->uv[1]}
};
SDL_Vertex sv2 = {
{v2->xyz[0], v2->xyz[1]},
{v2->bgra[2], v2->bgra[1], v2->bgra[0], v2->bgra[3]},
{v2->uv[0], v2->uv[1]}
};
aligned_vector_push_back(&vbuffer, &sv0, 1);
aligned_vector_push_back(&vbuffer, &sv1, 1);
aligned_vector_push_back(&vbuffer, &sv2, 1);
(vidx % 2 == 0) ? DrawTriangle(v0, v1, v2) : DrawTriangle(v1, v0, v2);
}
if((*flags) == GPU_CMD_VERTEX_EOL) {
vidx = 0;
}
}
SDL_SetRenderDrawColor(RENDERER, 255, 255, 255, 255);
SDL_RenderGeometry(RENDERER, NULL, aligned_vector_front(&vbuffer), aligned_vector_size(&vbuffer), NULL, 0);
}
void SceneFinish() {
SDL_RenderPresent(RENDERER);
return;
/* Only sensible place to hook the quit signal */
SDL_Event e;
while (SDL_PollEvent(&e)) {

View File

@ -48,8 +48,7 @@ void TransformVec3NoMod(const float* v, float* ret);
/* Transform a 3-element normal using the stored matrix (w == 0).
 * In this backend the call is a deliberate no-op: neither the input nor
 * the output array is touched. */
static inline void TransformNormalNoMod(const float* xIn, float* xOut) {
    (void) xOut;
    (void) xIn;
}
void TransformVertices(Vertex* vertices, const int count);

View File

@ -4,7 +4,6 @@
#include <stdint.h>
#include <stdio.h>
#include "gl_assert.h"
#include "platform.h"
#include "types.h"
@ -164,10 +163,7 @@ typedef struct {
GLboolean isCompressed;
GLboolean isPaletted;
//50
GLenum internalFormat;
//54
GLubyte padding[10]; // Pad to 64-bytes
} __attribute__((aligned(32))) TextureObject;
} TextureObject;
typedef struct {
GLfloat emissive[4];
@ -236,41 +232,11 @@ GL_FORCE_INLINE float clamp(float d, float min, float max) {
return (d < min) ? min : (d > max) ? max : d;
}
/* Copy a single Vertex. On Dreamcast this is done with four paired
 * (64-bit) FP moves: fschg switches the FPU into double-move mode around
 * the fmov.d load/store sequence, and fschg restores it afterwards.
 * Elsewhere it falls back to a plain struct assignment.
 * NOTE(review): the asm path copies exactly 32 bytes and relies on
 * Vertex being 32 bytes with 8-byte-aligned src/dest for fmov.d —
 * confirm against the Vertex definition. */
GL_FORCE_INLINE void memcpy_vertex(Vertex *dest, const Vertex *src) {
#ifdef __DREAMCAST__
    /* _Complex float occupies a double-precision register pair, giving
     * the asm a 64-bit scratch ("d" constraint). */
    _Complex float double_scratch;

    asm volatile (
        "fschg\n\t"
        "clrs\n\t"
        ".align 2\n\t"
        "fmov.d @%[in]+, %[scratch]\n\t"
        "fmov.d %[scratch], @%[out]\n\t"
        "fmov.d @%[in]+, %[scratch]\n\t"
        "add #8, %[out]\n\t"
        "fmov.d %[scratch], @%[out]\n\t"
        "fmov.d @%[in]+, %[scratch]\n\t"
        "add #8, %[out]\n\t"
        "fmov.d %[scratch], @%[out]\n\t"
        "fmov.d @%[in], %[scratch]\n\t"
        "add #8, %[out]\n\t"
        "fmov.d %[scratch], @%[out]\n\t"
        "fschg\n"
        : [in] "+&r" ((uint32_t) src), [scratch] "=&d" (double_scratch), [out] "+&r" ((uint32_t) dest)
        :
        : "t", "memory" // clobbers
    );
#else
    *dest = *src;
#endif
}
#define swapVertex(a, b) \
do { \
Vertex __attribute__((aligned(32))) c; \
memcpy_vertex(&c, a); \
memcpy_vertex(a, b); \
memcpy_vertex(b, &c); \
Vertex c = *a; \
*a = *b; \
*b = c; \
} while(0)
/* ClipVertex doesn't have room for these, so we need to parse them
@ -284,7 +250,7 @@ typedef struct {
* when a realloc could invalidate pointers. This structure holds all the information
* we need on the target vertex array to allow passing around to the various stages (e.g. generate/clip etc.)
*/
typedef struct __attribute__((aligned(32))) {
typedef struct {
PolyList* output;
uint32_t header_offset; // The offset of the header in the output list
uint32_t start_offset; // The offset into the output list
@ -323,7 +289,6 @@ void _glInitLights();
void _glInitImmediateMode(GLuint initial_size);
void _glInitMatrices();
void _glInitFramebuffers();
void _glInitSubmissionTarget();
void _glMatrixLoadNormal();
void _glMatrixLoadModelView();
@ -339,6 +304,7 @@ Matrix4x4* _glGetModelViewMatrix();
void _glWipeTextureOnFramebuffers(GLuint texture);
PolyContext* _glGetPVRContext();
GLubyte _glInitTextures();
void _glUpdatePVRTextureContext(PolyContext* context, GLshort textureUnit);
@ -378,9 +344,6 @@ extern GLubyte ACTIVE_TEXTURE;
extern GLboolean TEXTURES_ENABLED[];
GLubyte _glGetActiveTexture();
GLint _glGetTextureInternalFormat();
GLboolean _glGetTextureTwiddle();
void _glSetTextureTwiddle(GLboolean v);
GLuint _glGetActiveClientTexture();
TexturePalette* _glGetSharedPalette(GLshort bank);
@ -389,27 +352,26 @@ void _glSetInternalPaletteFormat(GLenum val);
GLboolean _glIsSharedTexturePaletteEnabled();
void _glApplyColorTable(TexturePalette *palette);
GLboolean _glIsBlendingEnabled();
GLboolean _glIsAlphaTestEnabled();
GLboolean _glIsCullingEnabled();
GLboolean _glIsDepthTestEnabled();
GLboolean _glIsDepthWriteEnabled();
GLboolean _glIsScissorTestEnabled();
GLboolean _glIsFogEnabled();
GLenum _glGetDepthFunc();
GLenum _glGetCullFace();
GLenum _glGetFrontFace();
GLenum _glGetBlendSourceFactor();
GLenum _glGetBlendDestFactor();
extern GLboolean BLEND_ENABLED;
extern GLboolean ALPHA_TEST_ENABLED;
extern GLboolean AUTOSORT_ENABLED;
GL_FORCE_INLINE GLboolean _glIsBlendingEnabled() {
return BLEND_ENABLED;
}
GL_FORCE_INLINE GLboolean _glIsAlphaTestEnabled() {
return ALPHA_TEST_ENABLED;
}
extern PolyList OP_LIST;
extern PolyList PT_LIST;
extern PolyList TR_LIST;
GL_FORCE_INLINE PolyList* _glActivePolyList() {
if(_glIsBlendingEnabled()) {
if(BLEND_ENABLED) {
return &TR_LIST;
} else if(_glIsAlphaTestEnabled()) {
} else if(ALPHA_TEST_ENABLED) {
return &PT_LIST;
} else {
return &OP_LIST;
@ -419,9 +381,13 @@ GL_FORCE_INLINE PolyList* _glActivePolyList() {
GLboolean _glIsMipmapComplete(const TextureObject* obj);
GLubyte* _glGetMipmapLocation(const TextureObject* obj, GLuint level);
GLuint _glGetMipmapLevelCount(const TextureObject* obj);
extern GLboolean ZNEAR_CLIPPING_ENABLED;
extern GLboolean LIGHTING_ENABLED;
GLboolean _glIsLightingEnabled();
void _glEnableLight(GLubyte light, GLboolean value);
void _glEnableLight(GLubyte light, unsigned char value);
GLboolean _glIsColorMaterialEnabled();
GLboolean _glIsNormalizeEnabled();
@ -545,35 +511,10 @@ GLuint _glUsedTextureMemory();
GLuint _glFreeContiguousTextureMemory();
void _glApplyScissor(bool force);
void _glSetColorMaterialMask(GLenum mask);
void _glSetColorMaterialMode(GLenum mode);
GLenum _glColorMaterialMode();
Material* _glActiveMaterial();
void _glSetLightModelViewerInEyeCoordinates(GLboolean v);
void _glSetLightModelSceneAmbient(const GLfloat* v);
void _glSetLightModelColorControl(GLint v);
GLuint _glEnabledLightCount();
void _glRecalcEnabledLights();
GLfloat* _glLightModelSceneAmbient();
GLfloat* _glGetLightModelSceneAmbient();
LightSource* _glLightAt(GLuint i);
GLboolean _glNearZClippingEnabled();
GLboolean _glGPUStateIsDirty();
void _glGPUStateMarkClean();
void _glGPUStateMarkDirty();
#define MAX_GLDC_TEXTURE_UNITS 2
#define MAX_GLDC_LIGHTS 8
#define AMBIENT_MASK 1
#define DIFFUSE_MASK 2
#define EMISSION_MASK 4
#define SPECULAR_MASK 8
#define SCENE_AMBIENT_MASK 16
/* This is from KOS pvr_buffers.c */
#define PVR_MIN_Z 0.0001f

View File

@ -4,228 +4,136 @@
#include "private.h"
static PolyContext GL_CONTEXT;
static struct {
GLboolean is_dirty;
PolyContext *_glGetPVRContext() {
return &GL_CONTEXT;
}
/* We can't just use the GL_CONTEXT for this state as the two
* GL states are combined, so we store them separately and then
* calculate the appropriate PVR state from them. */
GLenum depth_func;
GLboolean depth_test_enabled;
GLenum cull_face;
GLenum front_face;
GLboolean culling_enabled;
GLboolean color_material_enabled;
GLboolean znear_clipping_enabled;
GLboolean lighting_enabled;
GLboolean shared_palette_enabled;
GLboolean alpha_test_enabled;
GLboolean polygon_offset_enabled;
GLboolean normalize_enabled;
GLboolean scissor_test_enabled;
GLboolean fog_enabled;
GLboolean depth_mask_enabled;
static GLenum CULL_FACE = GL_BACK;
static GLenum FRONT_FACE = GL_CCW;
static GLboolean CULLING_ENABLED = GL_FALSE;
static GLboolean COLOR_MATERIAL_ENABLED = GL_FALSE;
struct {
GLint x;
GLint y;
GLsizei width;
GLsizei height;
GLboolean applied;
} scissor_rect;
GLboolean ZNEAR_CLIPPING_ENABLED = GL_TRUE;
GLenum blend_sfactor;
GLenum blend_dfactor;
GLboolean blend_enabled;
GLfloat offset_factor;
GLfloat offset_units;
GLboolean LIGHTING_ENABLED = GL_FALSE;
GLfloat scene_ambient[4];
GLboolean viewer_in_eye_coords;
GLenum color_control;
GLenum color_material_mode;
GLenum color_material_mask;
/* Is the shared texture palette enabled? */
static GLboolean SHARED_PALETTE_ENABLED = GL_FALSE;
LightSource lights[MAX_GLDC_LIGHTS];
GLuint enabled_light_count;
Material material;
GLboolean ALPHA_TEST_ENABLED = GL_FALSE;
GLenum shade_model;
} GPUState = {
.is_dirty = GL_TRUE,
.depth_func = GL_LESS,
.depth_test_enabled = GL_FALSE,
.cull_face = GL_BACK,
.front_face = GL_CCW,
.culling_enabled = GL_FALSE,
.color_material_enabled = GL_FALSE,
.znear_clipping_enabled = GL_TRUE,
.lighting_enabled = GL_FALSE,
.shared_palette_enabled = GL_FALSE,
.alpha_test_enabled = GL_FALSE,
.polygon_offset_enabled = GL_FALSE,
.normalize_enabled = GL_FALSE,
.scissor_test_enabled = GL_FALSE,
.fog_enabled = GL_FALSE,
.depth_mask_enabled = GL_FALSE,
.scissor_rect = {0, 0, 640, 480, false},
.blend_sfactor = GL_ONE,
.blend_dfactor = GL_ZERO,
.blend_enabled = GL_FALSE,
.offset_factor = 0.0f,
.offset_units = 0.0f,
.scene_ambient = {0.2f, 0.2f, 0.2f, 1.0f},
.viewer_in_eye_coords = GL_TRUE,
.color_control = GL_SINGLE_COLOR,
.color_material_mode = GL_AMBIENT_AND_DIFFUSE,
.color_material_mask = AMBIENT_MASK | DIFFUSE_MASK,
.lights = {0},
.enabled_light_count = 0,
.material = {0},
.shade_model = GL_SMOOTH
static GLboolean POLYGON_OFFSET_ENABLED = GL_FALSE;
static GLboolean NORMALIZE_ENABLED = GL_FALSE;
static struct {
GLint x;
GLint y;
GLsizei width;
GLsizei height;
GLboolean applied;
} SCISSOR_RECT = {
0, 0, 640, 480, false
};
void _glGPUStateMarkClean() {
GPUState.is_dirty = GL_FALSE;
}
void _glGPUStateMarkDirty() {
GPUState.is_dirty = GL_TRUE;
}
GLboolean _glGPUStateIsDirty() {
return GPUState.is_dirty;
}
Material* _glActiveMaterial() {
return &GPUState.material;
}
LightSource* _glLightAt(GLuint i) {
assert(i < MAX_GLDC_LIGHTS);
return &GPUState.lights[i];
}
void _glEnableLight(GLubyte light, GLboolean value) {
GPUState.lights[light].isEnabled = value;
}
GLboolean _glIsDepthTestEnabled() {
return GPUState.depth_test_enabled;
}
GLenum _glGetDepthFunc() {
return GPUState.depth_func;
}
GLboolean _glIsDepthWriteEnabled() {
return GPUState.depth_mask_enabled;
}
GLenum _glGetShadeModel() {
return GPUState.shade_model;
}
GLuint _glEnabledLightCount() {
return GPUState.enabled_light_count;
}
GLfloat* _glLightModelSceneAmbient() {
return GPUState.scene_ambient;
}
GLboolean _glIsBlendingEnabled() {
return GPUState.blend_enabled;
}
GLboolean _glIsAlphaTestEnabled() {
return GPUState.alpha_test_enabled;
}
GLboolean _glIsCullingEnabled() {
return GPUState.culling_enabled;
}
GLenum _glGetCullFace() {
return GPUState.cull_face;
}
GLenum _glGetFrontFace() {
return GPUState.front_face;
}
GLboolean _glIsFogEnabled() {
return GPUState.fog_enabled;
}
GLboolean _glIsScissorTestEnabled() {
return GPUState.scissor_test_enabled;
}
void _glRecalcEnabledLights() {
GPUState.enabled_light_count = 0;
for(GLubyte i = 0; i < MAX_GLDC_LIGHTS; ++i) {
if(_glLightAt(i)->isEnabled) {
GPUState.enabled_light_count++;
}
}
}
void _glSetLightModelViewerInEyeCoordinates(GLboolean v) {
GPUState.viewer_in_eye_coords = v;
}
void _glSetLightModelSceneAmbient(const GLfloat* v) {
vec4cpy(GPUState.scene_ambient, v);
}
GLfloat* _glGetLightModelSceneAmbient() {
return GPUState.scene_ambient;
}
void _glSetLightModelColorControl(GLint v) {
GPUState.color_control = v;
}
GLenum _glColorMaterialMask() {
return GPUState.color_material_mask;
}
void _glSetColorMaterialMask(GLenum mask) {
GPUState.color_material_mask = mask;
}
void _glSetColorMaterialMode(GLenum mode) {
GPUState.color_material_mode = mode;
}
GLenum _glColorMaterialMode() {
return GPUState.color_material_mode;
}
GLboolean _glIsSharedTexturePaletteEnabled() {
return GPUState.shared_palette_enabled;
}
GLboolean _glNearZClippingEnabled() {
return GPUState.znear_clipping_enabled;
return SHARED_PALETTE_ENABLED;
}
void _glApplyScissor(bool force);
static int _calc_pvr_face_culling() {
if(!CULLING_ENABLED) {
return GPU_CULLING_NONE;
} else {
if(CULL_FACE == GL_BACK) {
return (FRONT_FACE == GL_CW) ? GPU_CULLING_CCW : GPU_CULLING_CW;
} else {
return (FRONT_FACE == GL_CCW) ? GPU_CULLING_CCW : GPU_CULLING_CW;
}
}
}
static GLenum DEPTH_FUNC = GL_LESS;
static GLboolean DEPTH_TEST_ENABLED = GL_FALSE;
static int _calc_pvr_depth_test() {
if(!DEPTH_TEST_ENABLED) {
return GPU_DEPTHCMP_ALWAYS;
}
switch(DEPTH_FUNC) {
case GL_NEVER:
return GPU_DEPTHCMP_NEVER;
case GL_LESS:
return GPU_DEPTHCMP_GREATER;
case GL_EQUAL:
return GPU_DEPTHCMP_EQUAL;
case GL_LEQUAL:
return GPU_DEPTHCMP_GEQUAL;
case GL_GREATER:
return GPU_DEPTHCMP_LESS;
case GL_NOTEQUAL:
return GPU_DEPTHCMP_NOTEQUAL;
case GL_GEQUAL:
return GPU_DEPTHCMP_LEQUAL;
break;
case GL_ALWAYS:
default:
return GPU_DEPTHCMP_ALWAYS;
}
}
static GLenum BLEND_SFACTOR = GL_ONE;
static GLenum BLEND_DFACTOR = GL_ZERO;
GLboolean BLEND_ENABLED = GL_FALSE;
static GLfloat OFFSET_FACTOR = 0.0f;
static GLfloat OFFSET_UNITS = 0.0f;
GLboolean _glIsNormalizeEnabled() {
return GPUState.normalize_enabled;
return NORMALIZE_ENABLED;
}
GLenum _glGetBlendSourceFactor() {
return GPUState.blend_sfactor;
static int _calcPVRBlendFactor(GLenum factor) {
switch(factor) {
case GL_ZERO:
return GPU_BLEND_ZERO;
case GL_SRC_ALPHA:
return GPU_BLEND_SRCALPHA;
case GL_DST_COLOR:
return GPU_BLEND_DESTCOLOR;
case GL_DST_ALPHA:
return GPU_BLEND_DESTALPHA;
case GL_ONE_MINUS_DST_COLOR:
return GPU_BLEND_INVDESTCOLOR;
case GL_ONE_MINUS_SRC_ALPHA:
return GPU_BLEND_INVSRCALPHA;
case GL_ONE_MINUS_DST_ALPHA:
return GPU_BLEND_INVDESTALPHA;
case GL_ONE:
return GPU_BLEND_ONE;
default:
fprintf(stderr, "Invalid blend mode: %u\n", (unsigned int) factor);
return GPU_BLEND_ONE;
}
}
GLenum _glGetBlendDestFactor() {
return GPUState.blend_dfactor;
}
static void _updatePVRBlend(PolyContext* context) {
if(BLEND_ENABLED || ALPHA_TEST_ENABLED) {
context->gen.alpha = GPU_ALPHA_ENABLE;
} else {
context->gen.alpha = GPU_ALPHA_DISABLE;
}
context->blend.src = _calcPVRBlendFactor(BLEND_SFACTOR);
context->blend.dst = _calcPVRBlendFactor(BLEND_DFACTOR);
}
GLboolean _glCheckValidEnum(GLint param, GLint* values, const char* func) {
GLubyte found = 0;
@ -255,12 +163,11 @@ void _glUpdatePVRTextureContext(PolyContext *context, GLshort textureUnit) {
context->txr2.enable = GPU_TEXTURE_DISABLE;
context->txr2.alpha = GPU_TXRALPHA_DISABLE;
if(!TEXTURES_ENABLED[textureUnit] || !tx1 || !tx1->data) {
context->txr.base = NULL;
if(!TEXTURES_ENABLED[textureUnit] || !tx1) {
return;
}
context->txr.alpha = (GPUState.blend_enabled || GPUState.alpha_test_enabled) ? GPU_TXRALPHA_ENABLE : GPU_TXRALPHA_DISABLE;
context->txr.alpha = (BLEND_ENABLED || ALPHA_TEST_ENABLED) ? GPU_TXRALPHA_ENABLE : GPU_TXRALPHA_DISABLE;
GLuint filter = GPU_FILTER_NEAREST;
GLboolean enableMipmaps = GL_FALSE;
@ -355,22 +262,29 @@ void _glUpdatePVRTextureContext(PolyContext *context, GLshort textureUnit) {
}
GLboolean _glIsLightingEnabled() {
return GPUState.lighting_enabled;
return LIGHTING_ENABLED;
}
GLboolean _glIsColorMaterialEnabled() {
return GPUState.color_material_enabled;
return COLOR_MATERIAL_ENABLED;
}
static GLfloat CLEAR_COLOUR[3];
void _glInitContext() {
memset(&GL_CONTEXT, 0, sizeof(PolyContext));
GL_CONTEXT.list_type = GPU_LIST_OP_POLY;
GL_CONTEXT.fmt.color = GPU_CLRFMT_ARGBPACKED;
GL_CONTEXT.fmt.uv = GPU_UVFMT_32BIT;
GL_CONTEXT.gen.color_clamp = GPU_CLRCLAMP_DISABLE;
const VideoMode* mode = GetVideoMode();
GPUState.scissor_rect.x = 0;
GPUState.scissor_rect.y = 0;
GPUState.scissor_rect.width = mode->width;
GPUState.scissor_rect.height = mode->height;
SCISSOR_RECT.x = 0;
SCISSOR_RECT.y = 0;
SCISSOR_RECT.width = mode->width;
SCISSOR_RECT.height = mode->height;
glClearDepth(1.0f);
glDepthFunc(GL_LESS);
@ -398,174 +312,40 @@ void _glInitContext() {
GLAPI void APIENTRY glEnable(GLenum cap) {
switch(cap) {
case GL_TEXTURE_2D:
if(TEXTURES_ENABLED[_glGetActiveTexture()] != GL_TRUE) {
TEXTURES_ENABLED[_glGetActiveTexture()] = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
TEXTURES_ENABLED[_glGetActiveTexture()] = GL_TRUE;
break;
case GL_CULL_FACE: {
if(GPUState.culling_enabled != GL_TRUE) {
GPUState.culling_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
CULLING_ENABLED = GL_TRUE;
GL_CONTEXT.gen.culling = _calc_pvr_face_culling();
} break;
case GL_DEPTH_TEST: {
if(GPUState.depth_test_enabled != GL_TRUE) {
GPUState.depth_test_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
DEPTH_TEST_ENABLED = GL_TRUE;
GL_CONTEXT.depth.comparison = _calc_pvr_depth_test();
} break;
case GL_BLEND: {
if(GPUState.blend_enabled != GL_TRUE) {
GPUState.blend_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
BLEND_ENABLED = GL_TRUE;
_updatePVRBlend(&GL_CONTEXT);
} break;
case GL_SCISSOR_TEST: {
if(GPUState.scissor_test_enabled != GL_TRUE) {
GPUState.scissor_test_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
GL_CONTEXT.gen.clip_mode = GPU_USERCLIP_INSIDE;
_glApplyScissor(false);
} break;
case GL_LIGHTING: {
if(GPUState.lighting_enabled != GL_TRUE) {
GPUState.lighting_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
LIGHTING_ENABLED = GL_TRUE;
} break;
case GL_FOG:
if(GPUState.fog_enabled != GL_TRUE) {
GPUState.fog_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
GL_CONTEXT.gen.fog_type = GPU_FOG_TABLE;
break;
case GL_COLOR_MATERIAL:
if(GPUState.color_material_enabled != GL_TRUE) {
GPUState.color_material_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
COLOR_MATERIAL_ENABLED = GL_TRUE;
break;
case GL_SHARED_TEXTURE_PALETTE_EXT: {
if(GPUState.shared_palette_enabled != GL_TRUE) {
GPUState.shared_palette_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
SHARED_PALETTE_ENABLED = GL_TRUE;
}
break;
case GL_ALPHA_TEST: {
if(GPUState.alpha_test_enabled != GL_TRUE) {
GPUState.alpha_test_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
} break;
case GL_LIGHT0:
case GL_LIGHT1:
case GL_LIGHT2:
case GL_LIGHT3:
case GL_LIGHT4:
case GL_LIGHT5:
case GL_LIGHT6:
case GL_LIGHT7: {
LightSource* ptr = _glLightAt(cap & 0xF);
if(ptr->isEnabled != GL_TRUE) {
ptr->isEnabled = GL_TRUE;
_glRecalcEnabledLights();
}
}
break;
case GL_NEARZ_CLIPPING_KOS:
if(GPUState.znear_clipping_enabled != GL_TRUE) {
GPUState.znear_clipping_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
break;
case GL_POLYGON_OFFSET_POINT:
case GL_POLYGON_OFFSET_LINE:
case GL_POLYGON_OFFSET_FILL:
if(GPUState.polygon_offset_enabled != GL_TRUE) {
GPUState.polygon_offset_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
break;
case GL_NORMALIZE:
if(GPUState.normalize_enabled != GL_TRUE) {
GPUState.normalize_enabled = GL_TRUE;
GPUState.is_dirty = GL_TRUE;
}
break;
case GL_TEXTURE_TWIDDLE_KOS:
_glSetTextureTwiddle(GL_TRUE);
break;
default:
_glKosThrowError(GL_INVALID_VALUE, __func__);
break;
}
}
GLAPI void APIENTRY glDisable(GLenum cap) {
switch(cap) {
case GL_TEXTURE_2D:
if(TEXTURES_ENABLED[_glGetActiveTexture()] != GL_FALSE) {
TEXTURES_ENABLED[_glGetActiveTexture()] = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
break;
case GL_CULL_FACE: {
if(GPUState.culling_enabled != GL_FALSE) {
GPUState.culling_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
} break;
case GL_DEPTH_TEST: {
if(GPUState.depth_test_enabled != GL_FALSE) {
GPUState.depth_test_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
} break;
case GL_BLEND: {
if(GPUState.blend_enabled != GL_FALSE) {
GPUState.blend_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
} break;
case GL_SCISSOR_TEST: {
if(GPUState.scissor_test_enabled != GL_FALSE) {
GPUState.scissor_test_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
} break;
case GL_LIGHTING: {
if(GPUState.lighting_enabled != GL_FALSE) {
GPUState.lighting_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
} break;
case GL_FOG:
if(GPUState.fog_enabled != GL_FALSE) {
GPUState.fog_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
break;
case GL_COLOR_MATERIAL:
if(GPUState.color_material_enabled != GL_FALSE) {
GPUState.color_material_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
break;
case GL_SHARED_TEXTURE_PALETTE_EXT: {
if(GPUState.shared_palette_enabled != GL_FALSE) {
GPUState.shared_palette_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
}
break;
case GL_ALPHA_TEST: {
if(GPUState.alpha_test_enabled != GL_FALSE) {
GPUState.alpha_test_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
ALPHA_TEST_ENABLED = GL_TRUE;
_updatePVRBlend(&GL_CONTEXT);
} break;
case GL_LIGHT0:
case GL_LIGHT1:
@ -575,36 +355,82 @@ GLAPI void APIENTRY glDisable(GLenum cap) {
case GL_LIGHT5:
case GL_LIGHT6:
case GL_LIGHT7:
if(GPUState.lights[cap & 0xF].isEnabled) {
_glEnableLight(cap & 0xF, GL_FALSE);
GPUState.is_dirty = GL_TRUE;
}
_glEnableLight(cap & 0xF, GL_TRUE);
break;
case GL_NEARZ_CLIPPING_KOS:
if(GPUState.znear_clipping_enabled != GL_FALSE) {
GPUState.znear_clipping_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
ZNEAR_CLIPPING_ENABLED = GL_TRUE;
break;
case GL_POLYGON_OFFSET_POINT:
case GL_POLYGON_OFFSET_LINE:
case GL_POLYGON_OFFSET_FILL:
if(GPUState.polygon_offset_enabled != GL_FALSE) {
GPUState.polygon_offset_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
POLYGON_OFFSET_ENABLED = GL_TRUE;
break;
case GL_NORMALIZE:
if(GPUState.normalize_enabled != GL_FALSE) {
GPUState.normalize_enabled = GL_FALSE;
GPUState.is_dirty = GL_TRUE;
}
break;
case GL_TEXTURE_TWIDDLE_KOS:
_glSetTextureTwiddle(GL_FALSE);
NORMALIZE_ENABLED = GL_TRUE;
break;
default:
break;
}
}
GLAPI void APIENTRY glDisable(GLenum cap) {
switch(cap) {
case GL_TEXTURE_2D: {
TEXTURES_ENABLED[_glGetActiveTexture()] = GL_FALSE;
} break;
case GL_CULL_FACE: {
CULLING_ENABLED = GL_FALSE;
GL_CONTEXT.gen.culling = _calc_pvr_face_culling();
} break;
case GL_DEPTH_TEST: {
DEPTH_TEST_ENABLED = GL_FALSE;
GL_CONTEXT.depth.comparison = _calc_pvr_depth_test();
} break;
case GL_BLEND:
BLEND_ENABLED = GL_FALSE;
_updatePVRBlend(&GL_CONTEXT);
break;
case GL_SCISSOR_TEST: {
GL_CONTEXT.gen.clip_mode = GPU_USERCLIP_DISABLE;
} break;
case GL_LIGHTING: {
LIGHTING_ENABLED = GL_FALSE;
} break;
case GL_FOG:
GL_CONTEXT.gen.fog_type = GPU_FOG_DISABLE;
break;
case GL_COLOR_MATERIAL:
COLOR_MATERIAL_ENABLED = GL_FALSE;
break;
case GL_SHARED_TEXTURE_PALETTE_EXT: {
SHARED_PALETTE_ENABLED = GL_FALSE;
}
break;
case GL_ALPHA_TEST: {
ALPHA_TEST_ENABLED = GL_FALSE;
} break;
case GL_LIGHT0:
case GL_LIGHT1:
case GL_LIGHT2:
case GL_LIGHT3:
case GL_LIGHT4:
case GL_LIGHT5:
case GL_LIGHT6:
case GL_LIGHT7:
_glEnableLight(cap & 0xF, GL_FALSE);
break;
case GL_NEARZ_CLIPPING_KOS:
ZNEAR_CLIPPING_ENABLED = GL_FALSE;
break;
case GL_POLYGON_OFFSET_POINT:
case GL_POLYGON_OFFSET_LINE:
case GL_POLYGON_OFFSET_FILL:
POLYGON_OFFSET_ENABLED = GL_FALSE;
break;
case GL_NORMALIZE:
NORMALIZE_ENABLED = GL_FALSE;
break;
default:
_glKosThrowError(GL_INVALID_VALUE, __func__);
break;
}
}
@ -651,17 +477,12 @@ GLAPI void APIENTRY glReadBuffer(GLenum mode) {
}
GLAPI void APIENTRY glDepthMask(GLboolean flag) {
if(GPUState.depth_mask_enabled != flag) {
GPUState.depth_mask_enabled = flag;
GPUState.is_dirty = GL_TRUE;
}
GL_CONTEXT.depth.write = (flag == GL_TRUE) ? GPU_DEPTHWRITE_ENABLE : GPU_DEPTHWRITE_DISABLE;
}
GLAPI void APIENTRY glDepthFunc(GLenum func) {
if(GPUState.depth_func != func) {
GPUState.depth_func = func;
GPUState.is_dirty = GL_TRUE;
}
DEPTH_FUNC = func;
GL_CONTEXT.depth.comparison = _calc_pvr_depth_test();
}
/* Hints */
@ -681,34 +502,29 @@ GLAPI void APIENTRY glPolygonMode(GLenum face, GLenum mode) {
/* Culling */
GLAPI void APIENTRY glFrontFace(GLenum mode) {
if(GPUState.front_face != mode) {
GPUState.front_face = mode;
GPUState.is_dirty = GL_TRUE;
}
FRONT_FACE = mode;
GL_CONTEXT.gen.culling = _calc_pvr_face_culling();
}
GLAPI void APIENTRY glCullFace(GLenum mode) {
if(GPUState.cull_face != mode) {
GPUState.cull_face = mode;
GPUState.is_dirty = GL_TRUE;
}
CULL_FACE = mode;
GL_CONTEXT.gen.culling = _calc_pvr_face_culling();
}
GLenum _glGetShadeModel() {
return (GL_CONTEXT.gen.shading == GPU_SHADE_FLAT) ? GL_FLAT : GL_SMOOTH;
}
/* Shading - Flat or Goraud */
GLAPI void APIENTRY glShadeModel(GLenum mode) {
if(GPUState.shade_model != mode) {
GPUState.shade_model = mode;
GPUState.is_dirty = GL_TRUE;
}
GL_CONTEXT.gen.shading = (mode == GL_SMOOTH) ? GPU_SHADE_GOURAUD : GPU_SHADE_FLAT;
}
/* Blending */
GLAPI void APIENTRY glBlendFunc(GLenum sfactor, GLenum dfactor) {
if(GPUState.blend_dfactor != dfactor || GPUState.blend_sfactor != sfactor) {
GPUState.blend_sfactor = sfactor;
GPUState.blend_dfactor = dfactor;
GPUState.is_dirty = GL_TRUE;
}
BLEND_SFACTOR = sfactor;
BLEND_DFACTOR = dfactor;
_updatePVRBlend(&GL_CONTEXT);
}
@ -731,9 +547,8 @@ void glLineWidth(GLfloat width) {
}
void glPolygonOffset(GLfloat factor, GLfloat units) {
GPUState.offset_factor = factor;
GPUState.offset_units = units;
GPUState.is_dirty = GL_TRUE;
OFFSET_FACTOR = factor;
OFFSET_UNITS = units;
}
void glGetTexParameterfv(GLenum target, GLenum pname, GLfloat *params) {
@ -762,20 +577,18 @@ void glPixelStorei(GLenum pname, GLint param) {
void APIENTRY glScissor(GLint x, GLint y, GLsizei width, GLsizei height) {
if(GPUState.scissor_rect.x == x &&
GPUState.scissor_rect.y == y &&
GPUState.scissor_rect.width == width &&
GPUState.scissor_rect.height == height) {
if(SCISSOR_RECT.x == x &&
SCISSOR_RECT.y == y &&
SCISSOR_RECT.width == width &&
SCISSOR_RECT.height == height) {
return;
}
GPUState.scissor_rect.x = x;
GPUState.scissor_rect.y = y;
GPUState.scissor_rect.width = width;
GPUState.scissor_rect.height = height;
GPUState.scissor_rect.applied = false;
GPUState.is_dirty = GL_TRUE; // FIXME: do we need this?
SCISSOR_RECT.x = x;
SCISSOR_RECT.y = y;
SCISSOR_RECT.width = width;
SCISSOR_RECT.height = height;
SCISSOR_RECT.applied = false;
_glApplyScissor(false);
}
@ -805,12 +618,12 @@ void APIENTRY glScissor(GLint x, GLint y, GLsizei width, GLsizei height) {
*/
void _glApplyScissor(bool force) {
/* Don't do anyting if clipping is disabled */
if(!GPUState.scissor_test_enabled) {
if(GL_CONTEXT.gen.clip_mode == GPU_USERCLIP_DISABLE) {
return;
}
/* Don't apply if we already applied - nothing changed */
if(GPUState.scissor_rect.applied && !force) {
if(SCISSOR_RECT.applied && !force) {
return;
}
@ -820,31 +633,27 @@ void _glApplyScissor(bool force) {
const VideoMode* vid_mode = GetVideoMode();
GLsizei scissor_width = MAX(MIN(GPUState.scissor_rect.width, vid_mode->width), 0);
GLsizei scissor_height = MAX(MIN(GPUState.scissor_rect.height, vid_mode->height), 0);
GLsizei scissor_width = MAX(MIN(SCISSOR_RECT.width, vid_mode->width), 0);
GLsizei scissor_height = MAX(MIN(SCISSOR_RECT.height, vid_mode->height), 0);
/* force the origin to the lower left-hand corner of the screen */
miny = (vid_mode->height - scissor_height) - GPUState.scissor_rect.y;
maxx = (scissor_width + GPUState.scissor_rect.x);
miny = (vid_mode->height - scissor_height) - SCISSOR_RECT.y;
maxx = (scissor_width + SCISSOR_RECT.x);
maxy = (scissor_height + miny);
/* load command structure while mapping screen coords to TA tiles */
c.flags = GPU_CMD_USERCLIP;
c.d1 = c.d2 = c.d3 = 0;
uint16_t vw = vid_mode->width >> 5;
uint16_t vh = vid_mode->height >> 5;
c.sx = CLAMP(GPUState.scissor_rect.x >> 5, 0, vw);
c.sy = CLAMP(miny >> 5, 0, vh);
c.ex = CLAMP((maxx >> 5) - 1, 0, vw);
c.ey = CLAMP((maxy >> 5) - 1, 0, vh);
c.sx = CLAMP(SCISSOR_RECT.x / 32, 0, vid_mode->width / 32);
c.sy = CLAMP(miny / 32, 0, vid_mode->height / 32);
c.ex = CLAMP((maxx / 32) - 1, 0, vid_mode->width / 32);
c.ey = CLAMP((maxy / 32) - 1, 0, vid_mode->height / 32);
aligned_vector_push_back(&_glOpaquePolyList()->vector, &c, 1);
aligned_vector_push_back(&_glPunchThruPolyList()->vector, &c, 1);
aligned_vector_push_back(&_glTransparentPolyList()->vector, &c, 1);
GPUState.scissor_rect.applied = true;
SCISSOR_RECT.applied = true;
}
void glStencilFunc(GLenum func, GLint ref, GLuint mask) {
@ -862,19 +671,19 @@ void glStencilOp(GLenum sfail, GLenum dpfail, GLenum dppass) {
GLboolean APIENTRY glIsEnabled(GLenum cap) {
switch(cap) {
case GL_DEPTH_TEST:
return GPUState.depth_test_enabled;
return DEPTH_TEST_ENABLED;
case GL_SCISSOR_TEST:
return GPUState.scissor_test_enabled;
return GL_CONTEXT.gen.clip_mode == GPU_USERCLIP_INSIDE;
case GL_CULL_FACE:
return GPUState.culling_enabled;
return CULLING_ENABLED;
case GL_LIGHTING:
return GPUState.lighting_enabled;
return LIGHTING_ENABLED;
case GL_BLEND:
return GPUState.blend_enabled;
return BLEND_ENABLED;
case GL_POLYGON_OFFSET_POINT:
case GL_POLYGON_OFFSET_LINE:
case GL_POLYGON_OFFSET_FILL:
return GPUState.polygon_offset_enabled;
return POLYGON_OFFSET_ENABLED;
}
return GL_FALSE;
@ -929,10 +738,10 @@ void APIENTRY glGetFloatv(GLenum pname, GLfloat* params) {
MEMCPY4(params, _glGetModelViewMatrix(), sizeof(float) * 16);
break;
case GL_POLYGON_OFFSET_FACTOR:
*params = GPUState.offset_factor;
*params = OFFSET_FACTOR;
break;
case GL_POLYGON_OFFSET_UNITS:
*params = GPUState.offset_units;
*params = OFFSET_UNITS;
break;
default:
_glKosThrowError(GL_INVALID_ENUM, __func__);
@ -949,13 +758,13 @@ void APIENTRY glGetIntegerv(GLenum pname, GLint *params) {
*params = (_glGetBoundTexture()) ? _glGetBoundTexture()->index : 0;
break;
case GL_DEPTH_FUNC:
*params = GPUState.depth_func;
*params = DEPTH_FUNC;
break;
case GL_BLEND_SRC:
*params = GPUState.blend_sfactor;
*params = BLEND_SFACTOR;
break;
case GL_BLEND_DST:
*params = GPUState.blend_dfactor;
*params = BLEND_DFACTOR;
break;
case GL_MAX_TEXTURE_SIZE:
*params = MAX_TEXTURE_SIZE;
@ -985,10 +794,6 @@ void APIENTRY glGetIntegerv(GLenum pname, GLint *params) {
case GL_FREE_CONTIGUOUS_TEXTURE_MEMORY_KOS:
*params = _glFreeContiguousTextureMemory();
break;
case GL_TEXTURE_INTERNAL_FORMAT_KOS:
*params = _glGetTextureInternalFormat();
break;
default:
_glKosThrowError(GL_INVALID_ENUM, __func__);
break;

File diff suppressed because it is too large Load Diff

View File

@ -13,4 +13,4 @@ typedef struct {
* but we're not using that for now, so having W here makes the code
* simpler */
float w;
} __attribute__ ((aligned (32))) Vertex;
} Vertex;

21
GL/yalloc/LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) [year] [fullname]
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

158
GL/yalloc/README.md Normal file
View File

@ -0,0 +1,158 @@
# Summary
yalloc is a memory efficient allocator which is intended for embedded
applications that only have a low amount of RAM and want to maximize its
utilization. Properties of the allocator:
- pools can be up to 128k
- user data is 32bit aligned
- 4 bytes overhead per allocation
- supports defragmentation
- uses a free list for first fit allocation strategy (most recently freed
blocks are used first)
- extensively tested (see section below)
- MIT license
# Defragmentation
This feature was the initial motivation for this implementation. Especially
when dealing with highly memory constrained environments fragmenting memory
pools can be annoying. For this reason this implementation supports
defragmentation which moves all allocated blocks into a contiguous range at the
beginning of the pool, leaving a maximized free range at the end.
As there is no garbage collector or other runtime system involved that updates
the references, the application must do so. This is done in three steps:
1. yalloc_defrag_start() is called. This calculates the new
post-defragmentation-addresses for all allocations, but otherwise leaves
the allocations untouched.
2. yalloc_defrag_address() is called by the application for every pointer that
points to an allocation. It returns the post-defragmentation-address for
the allocation. The application must update all its relevant pointers this
way. Care must be taken not to dereference the moved pointers yet. If the
application works with hierarchical data then this can easily be done by
updating the pointers bottom up (first the leaves, then their parents).
3. yalloc_defrag_commit() is called to finally perform the defragmentation.
All allocated blocks are moved to their post-defragmentation-address and
the application can continue using the pool the normal way.
It is up to the application when (and if) it performs defragmentation. One
strategy would be to delay it until an allocation failure. Another approach
would be to perform the defragmentation regularly when there is nothing else to
do.
# Configurable Defines
INTERNAL_VALIDATE
If this is not defined on the compiler commandline it will be defined as 0 if
NDEBUG is defined and otherwise as 1. If you want to disable internal
validation when NDEBUG is not defined then define INTERNAL_VALIDATE as 0 on the
compiler commandline.
If it is nonzero the heap will be validated via a bunch of assert() calls at
the end of every function that modifies the heap. This has roughly O(N*M)
overhead where N is the number of allocated blocks and M the number of free
blocks in a heap. For applications with enough live allocations this will get
significant.
YALLOC_VALGRIND
If this is defined in yalloc.c and NVALGRIND is not defined then
valgrind/memcheck.h is included and the allocator functions tell valgrind
about the pool, the allocations and makes the block headers inaccessible outside
of yalloc-functions. This allows valgrind to detect a lot of the accidents that
can happen when dealing with dynamic memory. This also adds some overhead for every
yalloc-call because most of them will "unprotect" the internal structure on
entry and "protect" it again (marking it as inaccessible for valgrind) before
returning.
# Tests
The tests rely on internal validation of the pool (see INTERNAL_VALIDATE) to
check that no assumptions about the internal structure of the pool are
violated. They additionally check for correctness of observations that can be
made by using the public functions of the allocator (like checking if user data
stays unmodified). There are a few different scripts that run tests:
- run_coverage.sh runs a bunch of testfunctions that are carefully crafted to
cover all code paths. Coverage data is generated by clang and a summary is
shown at the end of the test.
- run_valgrind.sh tests if the valgrind integration is working as expected,
runs the functions from the coverage test and some randomly generated
testcases under valgrind.
- run_libfuzzer.sh uses libfuzzer from clang to generate interesting testcases
and runs them in multiple jobs in parallel for 10 seconds. It also generates
coverage data at the end (it always got 100% coverage in my testruns).
All tests exit with 0 and print "All fine!" at the end if there were no
errors. Coverage deficits are not counted as error, so you have to look at the
summary (they should show 100% coverage!).
# Implementation Details
The Headers and the user data are 32bit aligned. Headers have two 16bit fields
where the high 15 bits represent offsets (relative to the pools address) to the
previous/next block. The macros HDR_PTR() and HDR_OFFSET() are used to
translate an offset to an address and back. The 32bit alignment is exploited to
allow pools of up to 128k with that 15 significant bits.
A pool is always occupied by non-overlapping blocks that link to their
previous/next block in address order via the prev/next field of Header.
Free blocks are always joined: No two free blocks will ever be neighbors.
Free blocks have an additional header of the same structure. This additional
header is used to build a list of free blocks (independent of their address
order).
yalloc_free() will insert the freed block to the front of the free list.
yalloc_alloc() searches that list front to back and takes the first block that
is big enough to satisfy the allocation.
There is always a Header at the front and at the end of the pool. The Header at
the end is degenerate: It is marked as "used" but has no next block (which is
usually used to determine the size of a block).
The prev-field of the very first block in the pool has special meaning: It
points to the first free block in the pool. Or, if the pool is currently
defragmenting (after yalloc_defrag_start() and before yalloc_defrag_commit()),
points to the last header of the pool. This state can be recognized by checking
if it points to an empty block (normal pool state) or a used block
(defragmentation in progress). This logic can be seen in
yalloc_defrag_in_progress().
The lowest bit of next/prev have special meaning:
- low bit of prev is set for free blocks
- low bit of next is set for blocks with 32bit padding after the user data.
This is needed when a block is allocated from a free block that leaves only
4 free bytes after the user data... which is not enough to insert a
free-header (which needs 8 bytes). The padding will be reclaimed when
that block is freed or when the pool is defragmented. The predicate
isPadded() can be used to test if a block is padded. Free blocks are never
padded.
The predicate isNil() can be used to test if an offset points nowhere (it tests
if all 15 high bits of an offset are 1). The constant NIL has all but the
lowest bit set. It is used to set offsets to point to nowhere, and in some
places it is used to mask out the actual address bits of an offset. This should
be kept in mind when modifying the code and updating prev/next: Think carefully
if you have to preserve the low bit when updating an offset!
Defragmentation is done in two phases: First the user calls
yalloc_defrag_start(). This will put the pool in a special state where no
alloc/free-calls are allowed. In this state the prev-fields of the used blocks
have a special meaning: They store the offset that the block will have after
defragmentation finished. This information is used by yalloc_defrag_address()
which can be called by the application to query the new addresses for its
allocations. After the application has updated all its pointers it must call
yalloc_defrag_commit() which moves all used blocks in contiguous space at the
beginning of the pool, leaving one maximized free block at the end.

802
GL/yalloc/yalloc.c Normal file
View File

@ -0,0 +1,802 @@
#include "yalloc.h"
#include "yalloc_internals.h"

#include <assert.h>
#include <string.h>

/* Rounds num up to the next multiple of align (align must be a power of two). */
#define ALIGN(num, align) (((num) + ((align) - 1)) & ~((align) - 1))

/* Valgrind integration is opt-in via YALLOC_VALGRIND and can be force-disabled
   with NVALGRIND (mirroring valgrind's own convention). */
#if defined(YALLOC_VALGRIND) && !defined(NVALGRIND)
# define USE_VALGRIND 1
#else
# define USE_VALGRIND 0
#endif

#if USE_VALGRIND
# include <valgrind/memcheck.h>
#else
/* Without valgrind all client requests compile to nothing. */
# define VALGRIND_MAKE_MEM_UNDEFINED(p, s) ((void)0)
# define VALGRIND_MAKE_MEM_DEFINED(p, s) ((void)0)
# define VALGRIND_MAKE_MEM_NOACCESS(p, s) ((void)0)
# define VALGRIND_CREATE_MEMPOOL(pool, rz, z) ((void)0)
# define VALGRIND_MEMPOOL_ALLOC(pool, p, s) ((void)0)
# define VALGRIND_MEMPOOL_FREE(pool, p) ((void)0)
# define VALGRIND_MEMPOOL_CHANGE(pool, a, b, s) ((void)0)
#endif

/* Helpers that mark one header (or the double header of a free block) as
   freshly written / accessible / inaccessible for valgrind. */
#define MARK_NEW_FREE_HDR(p) VALGRIND_MAKE_MEM_UNDEFINED(p, sizeof(Header) * 2)
#define MARK_NEW_HDR(p) VALGRIND_MAKE_MEM_UNDEFINED(p, sizeof(Header))
#define PROTECT_HDR(p) VALGRIND_MAKE_MEM_NOACCESS(p, sizeof(Header))
#define PROTECT_FREE_HDR(p) VALGRIND_MAKE_MEM_NOACCESS(p, sizeof(Header) * 2)
#define UNPROTECT_HDR(p) VALGRIND_MAKE_MEM_DEFINED(p, sizeof(Header))
#define UNPROTECT_FREE_HDR(p) VALGRIND_MAKE_MEM_DEFINED(p, sizeof(Header) * 2)
#if USE_VALGRIND
/* Walks all blocks in address order and marks every header as readable for
   the duration of a yalloc call. Free blocks carry a second header (the
   free-list links), which is unprotected as well. */
static void _unprotect_pool(void * pool)
{
    Header * cur = (Header*)pool;
    for (;;)
    {
        UNPROTECT_HDR(cur);
        if (isFree(cur))
            UNPROTECT_HDR(cur + 1); /* free blocks have a second header with free-list links */
        if (isNil(cur->next))
            break;
        cur = HDR_PTR(cur->next);
    }
}

/* Re-protects the pool before returning to the caller: free blocks become
   inaccessible in full (headers and unused payload), used blocks only have
   their header hidden so valgrind flags out-of-bounds writes. */
static void _protect_pool(void * pool)
{
    Header * cur = (Header*)pool;
    while (cur)
    {
        Header * next = isNil(cur->next) ? NULL : HDR_PTR(cur->next);
        if (isFree(cur))
            VALGRIND_MAKE_MEM_NOACCESS(cur, (char*)next - (char*)cur);
        else
            PROTECT_HDR(cur);
        cur = next;
    }
}

#define assert_is_pool(pool) assert(VALGRIND_MEMPOOL_EXISTS(pool));
#else
/* Without valgrind the protect/unprotect pair is a no-op. */
static void _unprotect_pool(void * pool){(void)pool;}
static void _protect_pool(void * pool){(void)pool;}
#define assert_is_pool(pool) ((void)0)
#endif
/* Internal variant that assumes the pool is already unprotected. */
static int _yalloc_defrag_in_progress(void * pool)
{
    /* While defragmenting, the pool's first "prev" field points at the
       terminating block (a used block). Otherwise it is either nil (empty
       free list) or points at a free block. */
    Header * first = (Header*)pool;
    if (isNil(first->prev))
        return 0; /* empty free list: normal state */
    /* Defragmentation is signalled by the referenced block NOT being free. */
    return (HDR_PTR(first->prev)->prev & 1) == 0;
}
/* Public wrapper: unprotects the pool for the query and protects it again. */
int yalloc_defrag_in_progress(void * pool)
{
    _unprotect_pool(pool);
    int const result = _yalloc_defrag_in_progress(pool);
    _protect_pool(pool);
    return result;
}
#if YALLOC_INTERNAL_VALIDATE
/* Counts how often blk occurs in the pool's free list (validation helper;
   a consistent pool yields 0 for used blocks and 1 for free blocks). */
static size_t _count_free_list_occurences(Header * pool, Header * blk)
{
    int n = 0;
    if (!isNil(pool->prev))
    { /* pool->prev points at the first free block when not defragmenting */
        Header * cur = HDR_PTR(pool->prev);
        for (;;)
        {
            if (cur == blk)
                ++n;
            /* free-list links live in the second header of a free block */
            if (isNil(cur[1].next))
                break;
            cur = HDR_PTR(cur[1].next);
        }
    }
    return n;
}
/* Counts how often blk occurs in the address-ordered block list of the pool
   (validation helper; exactly 1 for any live header). */
static size_t _count_addr_list_occurences(Header * pool, Header * blk)
{
    size_t hits = 0;
    Header * it = pool;
    for (;;)
    {
        hits += (it == blk);
        if (isNil(it->next))
            return hits;
        it = HDR_PTR(it->next);
    }
}
/* Asserts that p is a live allocation of this pool: its header must occur
   exactly once in the address-ordered block list and must not be free. */
static void _validate_user_ptr(void * pool, void * p)
{
    Header * hdr = (Header*)p - 1; /* user data directly follows the header */
    size_t n = _count_addr_list_occurences((Header*)pool, hdr);
    assert(n == 1 && !isFree(hdr));
}
/**
Validates if all the invariants of a pool are intact.
This is very expensive when there are enough blocks in the heap (quadratic complexity!).
*/
static void _yalloc_validate(void * pool_)
{
    Header * pool = (Header*)pool_;
    Header * cur = pool;
    assert(!isNil(pool->next)); // there must always be at least two blocks: a free/used one and the final block at the end
    if (_yalloc_defrag_in_progress(pool))
    {
        /* In the defragmenting state each used block's "prev" stores its
           post-defragmentation offset; check those offsets are contiguous. */
        Header * prevUsed = NULL;
        while (!isNil(cur->next))
        {
            if (!isFree(cur))
            { // it is a used block
                Header * newAddr = cur == pool ? pool : HDR_PTR(cur->prev);
                assert(newAddr <= cur);   /* blocks only move down */
                assert(newAddr >= pool);  /* and never before the pool start */
                if (prevUsed)
                {
                    Header * prevNewAddr = prevUsed == pool ? pool : HDR_PTR(prevUsed->prev);
                    size_t prevBruttoSize = (char*)HDR_PTR(prevUsed->next) - (char*)prevUsed;
                    if (isPadded(prevUsed))
                        prevBruttoSize -= 4; // remove padding — NOTE(review): every other padding computation subtracts sizeof(Header); this literal 4 looks stale, confirm
                    assert((char*)newAddr == (char*)prevNewAddr + prevBruttoSize);
                }
                else
                {
                    assert(newAddr == pool); /* first used block lands at the pool start */
                }
                prevUsed = cur;
            }
            cur = HDR_PTR(cur->next);
        }
        assert(cur == HDR_PTR(pool->prev)); // the free-list should point to the last block
        assert(!isFree(cur)); // the last block must not be free
    }
    else
    {
        Header * prev = NULL;
        // iterate blocks in address order
        for (;;)
        {
            if (prev)
            { /* back-link must point at the previous block in address order */
                Header * x = HDR_PTR(cur->prev);
                assert(x == prev);
            }
            int n = _count_free_list_occurences(pool, cur);
            if (isFree(cur))
            { // it is a free block
                assert(n == 1); /* must occur exactly once in the free list */
                assert(!isPadded(cur)); // free blocks must have a zero padding-bit
                if (prev)
                {
                    assert(!isFree(prev)); // free blocks must not be direct neighbours
                }
            }
            else
            {
                assert(n == 0); /* used blocks must not be in the free list */
            }
            if (isNil(cur->next))
                break;
            Header * next = HDR_PTR(cur->next);
            /* every block is at least two headers long (header + minimal payload/free links) */
            assert((char*)next >= (char*)cur + sizeof(Header) * 2);
            prev = cur;
            cur = next;
        }
        assert(isNil(cur->next));
        if (!isNil(pool->prev))
        {
            // iterate free-list
            Header * f = HDR_PTR(pool->prev);
            assert(isNil(f[1].prev)); /* first free block has no predecessor */
            for (;;)
            {
                assert(isFree(f)); // must be free
                int n = _count_addr_list_occurences(pool, f);
                assert(n == 1); /* every free-list node must be a live block */
                if (isNil(f[1].next))
                    break;
                f = HDR_PTR(f[1].next);
            }
        }
    }
}
#else
/* Internal validation disabled: the checks compile to nothing. */
static void _yalloc_validate(void * pool){(void)pool;}
static void _validate_user_ptr(void * pool, void * p){(void)pool; (void)p;}
#endif
/**
Initializes a pool inside the given buffer.
The usable size is rounded down to a multiple of sizeof(Header); the pool is
set up as one maximal free block followed by the degenerate terminating
header at the very end.
@param pool Start of the buffer (see yalloc.h for alignment requirements).
@param size Size of the buffer in bytes.
@return 0 on success, -1 if the size is too large or too small.
*/
int yalloc_init(void * pool, size_t size)
{
    if (size > MAX_POOL_SIZE)
        return -1;

    // TODO: Error when pool is not properly aligned
    // TODO: Error when size is not a multiple of the alignment?

    /* Round down to a multiple of the header size. Replaces the original
       byte-by-byte decrement loop (O(size)) with the equivalent O(1) form. */
    size -= size % sizeof(Header);

    /* Need room for one free block (two headers) plus the terminating header. */
    if(size < sizeof(Header) * 3)
        return -1;

    VALGRIND_CREATE_MEMPOOL(pool, 0, 0);

    Header * first = (Header*)pool;
    Header * last = (Header*)((char*)pool + size) - 1;

    MARK_NEW_FREE_HDR(first);
    MARK_NEW_HDR(first);

    /* One big free block: the pool's first "prev" points at the first free
       block (itself) and carries the free-bit; the free-list links are nil. */
    first->prev = HDR_OFFSET(first) | 1;
    first->next = HDR_OFFSET(last);
    first[1].prev = NIL;
    first[1].next = NIL;

    /* Degenerate terminating block: marked used, no next. */
    last->prev = HDR_OFFSET(first);
    last->next = NIL;

    _unprotect_pool(pool);
    _yalloc_validate(pool);
    _protect_pool(pool);
    return 0;
}
/* Tears down valgrind bookkeeping for the pool and marks the whole buffer as
   undefined again. Without valgrind the pool needs no cleanup at all, so
   this is a no-op. */
void yalloc_deinit(void * pool)
{
#if USE_VALGRIND
    VALGRIND_DESTROY_MEMPOOL(pool);

    /* Walk to the terminating header to learn the pool's total extent. */
    Header * last = (Header*)pool;
    UNPROTECT_HDR(last);
    while (!isNil(last->next))
    {
        Header * next = HDR_PTR(last->next);
        UNPROTECT_HDR(next);
        last = next;
    }

    VALGRIND_MAKE_MEM_UNDEFINED(pool, (char*)(last + 1) - (char*)pool);
#else
    (void)pool;
#endif
}
/* Allocates size bytes from the pool (first-fit over the free list).
   Returns a pointer just past the block header, or NULL if no free block is
   big enough. Sizes are rounded up to 32-byte alignment. */
void * yalloc_alloc(void * pool, size_t size)
{
    assert_is_pool(pool);
    _unprotect_pool(pool);
    assert(!_yalloc_defrag_in_progress(pool));
    _yalloc_validate(pool);

    if (!size)
    { /* zero-size allocations always fail */
        _protect_pool(pool);
        return NULL;
    }

    Header * root = (Header*)pool;
    if (isNil(root->prev))
    {
        _protect_pool(pool);
        return NULL; /* empty free list: no chance to allocate anything */
    }

    /* round up to alignment */
    size = ALIGN(size, 32);
    size_t bruttoSize = size + sizeof(Header); /* payload plus its header */

    /* First-fit search over the free list. "prev" tracks the previous
       free-list node so the chosen block can be unlinked. */
    Header * prev = NULL;
    Header * cur = HDR_PTR(root->prev);
    for (;;)
    {
        size_t curSize = (char*)HDR_PTR(cur->next) - (char*)cur; /* size of the block, including its header */

        if (curSize >= bruttoSize) // it is big enough
        {
            // take action for unused space in the free block
            if (curSize >= bruttoSize + sizeof(Header) * 2)
            { // the leftover space is big enough to make it a free block
                // Build a free block from the unused space and insert it into the list of free blocks after the current free block
                Header * tail = (Header*)((char*)cur + bruttoSize);
                MARK_NEW_FREE_HDR(tail);

                // update address-order-list
                tail->next = cur->next;
                tail->prev = HDR_OFFSET(cur) | 1;
                HDR_PTR(cur->next)->prev = HDR_OFFSET(tail); // NOTE: We know the next block is used because free blocks are never neighbours. So we don't have to care about the lower bit which would be set for the prev of a free block.
                cur->next = HDR_OFFSET(tail);

                // update list of free blocks
                tail[1].next = cur[1].next;
                // NOTE: tail[1].prev is updated in the common path below (assignment to "HDR_PTR(cur[1].next)[1].prev")
                if (!isNil(cur[1].next))
                    HDR_PTR(cur[1].next)[1].prev = HDR_OFFSET(tail);
                cur[1].next = HDR_OFFSET(tail);
            }
            else if (curSize > bruttoSize)
            { // there will be unused space, but not enough to insert a free header
                internal_assert(curSize - bruttoSize == sizeof(Header)); // unused space must be enough to build a free-block or it should be exactly the size of a Header
                cur->next |= 1; // set marker for "has unused trailing space"
            }
            else
            {
                internal_assert(curSize == bruttoSize); /* exact fit */
            }

            cur->prev &= NIL; // clear marker for "is a free block"

            // remove from linked list of free blocks
            if (prev)
                prev[1].next = cur[1].next;
            else
            { /* cur was the first free block: update the pool's first-free
                 pointer, preserving the pool block's own free-bit */
                uint32_t freeBit = isFree(root);
                root->prev = (cur[1].next & NIL) | freeBit;
            }

            if (!isNil(cur[1].next))
                HDR_PTR(cur[1].next)[1].prev = prev ? HDR_OFFSET(prev) : NIL;

            _yalloc_validate(pool);
            VALGRIND_MEMPOOL_ALLOC(pool, cur + 1, size);
            _protect_pool(pool);
            return cur + 1; // return address after the header
        }

        if (isNil(cur[1].next))
            break; /* reached the end of the free list without a fit */

        prev = cur;
        cur = HDR_PTR(cur[1].next);
    }

    _yalloc_validate(pool);
    _protect_pool(pool);
    return NULL;
}
// Removes a block from the free-list and moves the pools first-free-bock pointer to its successor if it pointed to that block.
static void unlink_from_free_list(Header * pool, Header * blk)
{
    // update the pools pointer to the first block in the free list if necessary
    if (isNil(blk[1].prev))
    { // the block is the first in the free-list
        // make the pools first-free-pointer point to the next in the free list
        uint32_t freeBit = isFree(pool); /* preserve the pool block's own free-bit */
        pool->prev = (blk[1].next & NIL) | freeBit;
    }
    else
        HDR_PTR(blk[1].prev)[1].next = blk[1].next; /* bypass blk in the forward direction */

    if (!isNil(blk[1].next))
        HDR_PTR(blk[1].next)[1].prev = blk[1].prev; /* bypass blk in the backward direction */
}
size_t yalloc_block_size(void * pool, void * p)
{
Header * a = (Header*)p - 1;
UNPROTECT_HDR(a);
Header * b = HDR_PTR(a->next);
size_t payloadSize = (char*)b - (char*)p;
if (isPadded(a))
payloadSize -= sizeof(Header);
PROTECT_HDR(a);
return payloadSize;
}
/* Returns an allocation to the pool. Coalesces with free neighbors, reclaims
   trailing padding of the preceding block, and pushes the resulting free
   block onto the front of the free list. */
void yalloc_free(void * pool_, void * p)
{
    assert_is_pool(pool_);
    assert(!yalloc_defrag_in_progress(pool_));
    if (!p)
        return; /* free(NULL) is a no-op, like libc free() */

    _unprotect_pool(pool_);

    Header * pool = (Header*)pool_;
    Header * cur = (Header*)p - 1;

    // get pointers to previous/next block in address order
    Header * prev = cur == pool || isNil(cur->prev) ? NULL : HDR_PTR(cur->prev);
    Header * next = isNil(cur->next) ? NULL : HDR_PTR(cur->next);

    int prevFree = prev && isFree(prev);
    int nextFree = next && isFree(next);

#if USE_VALGRIND
    {
        unsigned errs = VALGRIND_COUNT_ERRORS;
        VALGRIND_MEMPOOL_FREE(pool, p);
        if (VALGRIND_COUNT_ERRORS > errs)
        { // early exit if the free was invalid (so we get a valgrind error and don't mess up the pool, which is helpful for testing if invalid frees are detected by valgrind)
            _protect_pool(pool_);
            return;
        }
    }
#endif

    _validate_user_ptr(pool_, p);

    if (prevFree && nextFree)
    { // the freed block has two free neighbors
        unlink_from_free_list(pool, prev);
        unlink_from_free_list(pool, next);

        // join prev, cur and next
        prev->next = next->next;
        HDR_PTR(next->next)->prev = cur->prev;

        // prev is now the block we want to push onto the free-list
        cur = prev;
    }
    else if (prevFree)
    {
        unlink_from_free_list(pool, prev);

        // join prev and cur
        prev->next = cur->next;
        HDR_PTR(cur->next)->prev = cur->prev;

        // prev is now the block we want to push onto the free-list
        cur = prev;
    }
    else if (nextFree)
    {
        unlink_from_free_list(pool, next);

        // join cur and next
        cur->next = next->next;
        HDR_PTR(next->next)->prev = next->prev & NIL; /* mask keeps only the offset bits */
    }

    // if there is a previous block and that block has padding then we want to grow the new free block into that padding
    if (cur != pool && !isNil(cur->prev))
    { // there is a previous block
        Header * left = HDR_PTR(cur->prev);
        if (isPadded(left))
        { // the previous block has padding, so extend the current block to consume move the padding to the current free block
            Header * grown = cur - 1; /* padding is exactly one Header in size */
            MARK_NEW_HDR(grown);
            grown->next = cur->next;
            grown->prev = cur->prev;
            left->next = HDR_OFFSET(grown);
            if (!isNil(cur->next))
                HDR_PTR(cur->next)->prev = HDR_OFFSET(grown);

            cur = grown;
        }
    }

    cur->prev |= 1; // it becomes a free block
    cur->next &= NIL; // reset padding-bit
    UNPROTECT_HDR(cur + 1);
    cur[1].prev = NIL; // it will be the first free block in the free list, so it has no prevFree

    if (!isNil(pool->prev))
    { // the free-list was already non-empty
        HDR_PTR(pool->prev)[1].prev = HDR_OFFSET(cur); // make the first entry in the free list point back to the new free block (it will become the first one)
        cur[1].next = pool->prev; // the next free block is the first of the old free-list
    }
    else
        cur[1].next = NIL; // free-list was empty, so there is no successor

    VALGRIND_MAKE_MEM_NOACCESS(cur + 2, (char*)HDR_PTR(cur->next) - (char*)(cur + 2));

    // now the freed block is the first in the free-list
    // update the offset to the first element of the free list
    uint32_t freeBit = isFree(pool); // remember the free-bit of the offset
    pool->prev = HDR_OFFSET(cur) | freeBit; // update the offset and restore the free-bit

    _yalloc_validate(pool);
    _protect_pool(pool);
}
/* Sums all free space in the pool (free blocks plus reclaimable padding of
   used blocks) and returns the largest allocation that could succeed after a
   full defragmentation: total free bytes minus one header. */
size_t yalloc_count_free(void * pool_)
{
    assert_is_pool(pool_);
    _unprotect_pool(pool_);
    assert(!_yalloc_defrag_in_progress(pool_));
    Header * pool = (Header*)pool_;
    size_t bruttoFree = 0;
    Header * cur = pool;

    _yalloc_validate(pool);

    for (;;)
    {
        if (isFree(cur))
        { // it is a free block
            bruttoFree += (char*)HDR_PTR(cur->next) - (char*)cur;
        }
        else
        { // it is a used block
            if (isPadded(cur))
            { // the used block is padded; padding is reclaimed on defragmentation
                bruttoFree += sizeof(Header);
            }
        }

        if (isNil(cur->next))
            break;

        cur = HDR_PTR(cur->next);
    }

    _protect_pool(pool);

    if (bruttoFree < sizeof(Header))
    {
        internal_assert(!bruttoFree); // free space should always be a multiple of sizeof(Header)
        return 0;
    }

    return bruttoFree - sizeof(Header); /* one header is always consumed by an allocation */
}
size_t yalloc_count_continuous(void * pool_)
{
assert_is_pool(pool_);
_unprotect_pool(pool_);
assert(!_yalloc_defrag_in_progress(pool_));
Header * pool = (Header*)pool_;
size_t largestFree = 0;
Header * cur = pool;
_yalloc_validate(pool);
for (;;)
{
if (isFree(cur))
{ // it is a free block
size_t temp = (uintptr_t)HDR_PTR(cur->next) - (uintptr_t)cur;
if(temp > largestFree)
largestFree = temp;
}
if (isNil(cur->next))
break;
cur = HDR_PTR(cur->next);
}
_protect_pool(pool);
if (largestFree < sizeof(Header))
{
internal_assert(!largestFree); // free space should always be a multiple of sizeof(Header)
return 0;
}
return largestFree - sizeof(Header);
}
/* Returns the lowest-addressed live allocation, or NULL if the pool holds
   none. The terminating header is excluded by the loop condition. */
void * yalloc_first_used(void * pool)
{
    assert_is_pool(pool);
    _unprotect_pool(pool);

    for (Header * blk = (Header*)pool; !isNil(blk->next); blk = HDR_PTR(blk->next))
    {
        if (!isFree(blk))
        {
            _protect_pool(pool);
            return blk + 1; /* user data follows the header */
        }
    }

    _protect_pool(pool);
    return NULL;
}
/* Returns the next live allocation after p in address order, or NULL when p
   was the last one. */
void * yalloc_next_used(void * pool, void * p)
{
    assert_is_pool(pool);
    _unprotect_pool(pool);
    _validate_user_ptr(pool, p);

    Header * hdr = (Header*)p - 1;
    /* the terminating block is never user-visible, so it cannot be the input */
    assert(!isNil(hdr->next));

    for (Header * blk = HDR_PTR(hdr->next); !isNil(blk->next); blk = HDR_PTR(blk->next))
    {
        if (!isFree(blk))
        {
            _protect_pool(pool);
            return blk + 1;
        }
    }

    _protect_pool(pool);
    return NULL;
}
/* Puts the pool into the "defragmenting" state: each used block's "prev"
   field is repurposed to hold the (shifted) offset the block will occupy
   after yalloc_defrag_commit(), and the pool's first "prev" is pointed at
   the terminating (used) block to mark the state. */
void yalloc_defrag_start(void * pool_)
{
    assert_is_pool(pool_);
    _unprotect_pool(pool_);
    assert(!_yalloc_defrag_in_progress(pool_));
    Header * pool = (Header*)pool_;

    // iterate over all blocks in address order and store the post-defragment address of used blocks in their "prev" field
    size_t end = 0; // offset for the next used block
    Header * blk = (Header*)pool;
    for (; !isNil(blk->next); blk = HDR_PTR(blk->next))
    {
        if (!isFree(blk))
        { // it is a used block
            blk->prev = end >> 1; /* offsets are stored shifted right by one (see HDR_OFFSET) */
            internal_assert((char*)HDR_PTR(blk->prev) == (char*)pool + end);

            size_t bruttoSize = (char*)HDR_PTR(blk->next) - (char*)blk;

            if (isPadded(blk))
            { // the block is padded; padding is dropped when the block moves
                bruttoSize -= sizeof(Header);
            }

            end += bruttoSize;
            internal_assert(end % sizeof(Header) == 0);
        }
    }

    // blk is now the last block (the dummy "used" block at the end of the pool)
    internal_assert(isNil(blk->next));
    internal_assert(!isFree(blk));

    // mark the pool as "defragementation in progress"
    uint32_t freeBit = isFree(pool); /* preserve the pool block's own free-bit */
    pool->prev = (HDR_OFFSET(blk) & NIL) | freeBit;

    _yalloc_validate(pool);
    internal_assert(yalloc_defrag_in_progress(pool));
    _protect_pool(pool);
}
/**
Returns the address an allocation will have after yalloc_defrag_commit().
Must only be called while the pool is in the "defragmenting" state (the
"prev" field of each used block then stores its future offset).
@param pool_ The pool the allocation comes from.
@param p A live allocation of that pool (may be NULL, which yields NULL).
@return The post-defragmentation address of p.
*/
void * yalloc_defrag_address(void * pool_, void * p)
{
    assert_is_pool(pool_);
    assert(yalloc_defrag_in_progress(pool_));
    if (!p)
        return NULL;

    Header * pool = (Header*)pool_;

    _unprotect_pool(pool);
    _validate_user_ptr(pool_, p);

    if (pool + 1 == p)
    { // "prev" of the first block points to the last used block to mark the pool as "defragmentation in progress",
      // so the first allocation keeps its address and cannot use its prev-field.
        /* BUG FIX: the original returned here without re-protecting the pool,
           leaving it unprotected under valgrind builds. */
        _protect_pool(pool);
        return pool + 1;
    }

    Header * blk = (Header*)p - 1;
    void * defragP = HDR_PTR(blk->prev) + 1; /* future header position + 1 = future user data */

    _protect_pool(pool);
    return defragP;
}
/* Finishes defragmentation: moves every used block to the offset recorded by
   yalloc_defrag_start(), rebuilds the address links, and leaves at most one
   free block (at the end of the pool). Returns the pool to the normal state. */
void yalloc_defrag_commit(void * pool_)
{
    assert_is_pool(pool_);
    _unprotect_pool(pool_);
    assert(_yalloc_defrag_in_progress(pool_));
    Header * pool = (Header*)pool_;

    // iterate over all blocks in address order and move them
    size_t end = 0; // offset for the next used block
    Header * blk = pool;
    Header * lastUsed = NULL;
    while (!isNil(blk->next))
    {
        if (!isFree(blk))
        { // it is a used block
            size_t bruttoSize = (char*)HDR_PTR(blk->next) - (char*)blk;

            if (isPadded(blk))
            { // the block is padded; the padding is dropped by the move
                bruttoSize -= sizeof(Header);
            }

            Header * next = HDR_PTR(blk->next); /* read before the block is overwritten by memmove */
            blk->prev = lastUsed ? HDR_OFFSET(lastUsed) : NIL;
            blk->next = (end + bruttoSize) >> 1; /* future next-offset, stored shifted (see HDR_OFFSET) */

            lastUsed = (Header*)((char*)pool + end);
            VALGRIND_MAKE_MEM_UNDEFINED(lastUsed, (char*)blk - (char*)lastUsed);
            memmove(lastUsed, blk, bruttoSize); /* regions may overlap: memmove, not memcpy */
            VALGRIND_MEMPOOL_CHANGE(pool, blk + 1, lastUsed + 1, bruttoSize - sizeof(Header));

            end += bruttoSize;
            blk = next;
        }
        else
            blk = HDR_PTR(blk->next); /* free blocks are simply skipped (their space is reclaimed) */
    }

    // blk is now the last block (the dummy "used" block at the end of the pool)
    internal_assert(isNil(blk->next));
    internal_assert(!isFree(blk));

    if (lastUsed)
    {
        Header * gap = HDR_PTR(lastUsed->next);
        if (gap == blk)
        { // there is no gap
            pool->prev = NIL; // the free list is empty
            blk->prev = HDR_OFFSET(lastUsed);
        }
        else if (blk - gap > 1)
        { // the gap is big enough for a free Header
            // set a free list that contains the gap as only element
            gap->prev = HDR_OFFSET(lastUsed) | 1;
            gap->next = HDR_OFFSET(blk);
            gap[1].prev = NIL;
            gap[1].next = NIL;
            pool->prev = blk->prev = HDR_OFFSET(gap);
        }
        else
        { // there is a gap, but it is too small to be used as free-list-node, so just make it padding of the last used block
            lastUsed->next = HDR_OFFSET(blk) | 1;
            pool->prev = NIL;
            blk->prev = HDR_OFFSET(lastUsed);
        }
    }
    else
    { // the pool is empty
        pool->prev = 1; /* nil-free-list marker with the free-bit set */
    }

    internal_assert(!_yalloc_defrag_in_progress(pool));
    _yalloc_validate(pool);
    _protect_pool(pool);
}

176
GL/yalloc/yalloc.h Normal file
View File

@ -0,0 +1,176 @@
/**
@file
API of the yalloc allocator.
*/

#ifndef YALLOC_H
#define YALLOC_H

#include <stddef.h>

/**
Maximum supported pool size. yalloc_init() will fail for larger pools.
*/
#define MAX_POOL_SIZE ((2 << 24) - 4)

/**
Creates a pool inside a given buffer.

Pools must be deinitialized with yalloc_deinit() when they are no longer needed.

@param pool The starting address of the pool. It must have at least 32bit
alignment (the internal block headers use 32bit integers). Allocations are
placed at 32bit boundaries starting from this address, so if the user data
should be 32bit aligned then this address has to be 32bit aligned. Typically
an address of static memory, or an array on the stack is used if the pool is
only used temporarily.
@param size Size of the pool.
@return 0 on success, nonzero if the size is not supported.
*/
int yalloc_init(void * pool, size_t size);

/**
Deinitializes the buffer that is used by the pool and makes it available for other use.

The content of the buffer is undefined after this.

@param pool The starting address of an initialized pool.
*/
void yalloc_deinit(void * pool);

/**
Allocates a block of memory from a pool.

This function mimics malloc().

The pool must not be in the "defragmenting" state when this function is called.

@param pool The starting address of an initialized pool.
@param size Number of bytes to allocate.
@return Allocated buffer or \c NULL if there was no free range that could serve
the allocation. See @ref yalloc_defrag_start() for a way to remove
fragmentation which may cause allocations to fail even when there is enough
space in total.
*/
void * yalloc_alloc(void * pool, size_t size);

/**
Returns an allocation to a pool.

This function mimics free().

The pool must not be in the "defragmenting" state when this function is called.

@param pool The starting address of the initialized pool the allocation comes from.
@param p An address that was returned from yalloc_alloc() of the same pool.
*/
void yalloc_free(void * pool, void * p);

/**
Returns the maximum size of a successful allocation (assuming a completely unfragmented heap).

After defragmentation the first allocation with the returned size is guaranteed to succeed.

@param pool The starting address of an initialized pool.
@return Number of bytes that can be allocated (assuming the pool is defragmented).
*/
size_t yalloc_count_free(void * pool);

/**
Returns the maximum continuous free area.

@param pool The starting address of an initialized pool.
@return Number of free bytes that exist continuously.
*/
size_t yalloc_count_continuous(void * pool_);

/**
Queries the usable size of an allocated block.

@param pool The starting address of the initialized pool the allocation comes from.
@param p An address that was returned from yalloc_alloc() of the same pool.
@return Size of the memory block. This is the size passed to @ref
yalloc_alloc() rounded up to the allocator's internal alignment (the
implementation aligns allocation sizes to 32 bytes).
*/
size_t yalloc_block_size(void * pool, void * p);

/**
Finds the first (in address order) allocation of a pool.

@param pool The starting address of an initialized pool.
@return Address of the allocation the lowest address inside the pool (this is
what @ref yalloc_alloc() returned), or \c NULL if there is no used block.
*/
void * yalloc_first_used(void * pool);

/**
Given a pointer to an allocation finds the next (in address order) used block of a pool.

@param pool The starting address of the initialized pool the allocation comes from.
@param p Pointer to an allocation in that pool, typically comes from a previous
call to @ref yalloc_first_used()
*/
void * yalloc_next_used(void * pool, void * p);

/**
Starts defragmentation for a pool.

Allocations will stay where they are. But the pool is put in the "defragmenting"
state (see @ref yalloc_defrag_in_progress()).

The pool must not be in the "defragmenting" state when this function is called.
The pool is put into the "defragmenting" state by this function.

@param pool The starting address of an initialized pool.
*/
void yalloc_defrag_start(void * pool);

/**
Returns the address that an allocation will have after @ref yalloc_defrag_commit() is called.

The pool must be in the "defragmenting" state when this function is called.

@param pool The starting address of the initialized pool the allocation comes from.
@param p Pointer to an allocation in that pool.
@return The address the allocation will have after @ref yalloc_defrag_commit() is called.
*/
void * yalloc_defrag_address(void * pool, void * p);

/**
Finishes the defragmentation.

The content of all allocations in the pool will be moved to the address that
was reported by @ref yalloc_defrag_address(). The pool will then have only one
free block. This means that an <tt>yalloc_alloc(pool, yalloc_count_free(pool))</tt>
will succeed.

The pool must be in the "defragmenting" state when this function is called. The
pool is put back to normal state by this function.

@param pool The starting address of an initialized pool.
*/
void yalloc_defrag_commit(void * pool);

/**
Tells if the pool is in the "defragmenting" state (after a @ref yalloc_defrag_start() and before a @ref yalloc_defrag_commit()).

@param pool The starting address of an initialized pool.
@return Nonzero if the pool is currently in the "defragmenting" state.
*/
int yalloc_defrag_in_progress(void * pool);

/**
Helper function that dumps the state of the pool to stdout.

This function is only available if built with <tt>yalloc_dump.c</tt>. This
function only exists for debugging purposes and can be ignored by normal users
that are not interested in the internal structure of the implementation.

@param pool The starting address of an initialized pool.
@param name A string that is used as "Title" for the output.
*/
void yalloc_dump(void * pool, char * name);

#endif // YALLOC_H

39
GL/yalloc/yalloc_dump.c Normal file
View File

@ -0,0 +1,39 @@
#include "yalloc_internals.h"
#include <stdio.h>
/* Prints one prev/next offset field, either as "nil" or as the byte distance
   from the pool start.
   BUG FIX: the parameter was uint16_t, but Header::prev/next are uint32_t
   (NIL is 0xFFFFFFFE). Passing them here truncated the value, so isNil()
   could never be true and HDR_PTR() computed a wrong address for offsets
   beyond 16 bits. Widened to uint32_t to match the header fields. */
static void printOffset(void * pool, char * name, uint32_t offset)
{
    if (isNil(offset))
        printf("  %s: nil\n", name);
    else
        printf("  %s: %td\n", name, (char*)HDR_PTR(offset) - (char*)pool);
}
/* Debug helper: prints every block of the pool (address order) with its
   links, free/used state, padding flag and payload size. */
void yalloc_dump(void * pool, char * name)
{
    printf("---- %s ----\n", name);
    Header * cur = (Header*)pool;
    for (;;)
    {
        printf(isFree(cur) ? "%td: free @%p\n" : "%td: used @%p\n", (char*)cur - (char*)pool, cur);
        /* the first block's prev doubles as the first-free pointer */
        printOffset(pool, cur == pool ? "first free" : "prev", cur->prev);
        printOffset(pool, "next", cur->next);
        if (isFree(cur))
        { /* free blocks carry a second header with the free-list links */
            printOffset(pool, "prevFree", cur[1].prev);
            printOffset(pool, "nextFree", cur[1].next);
        }
        else
            printf("  payload includes padding: %i\n", isPadded(cur));

        if (isNil(cur->next))
            break; /* reached the terminating header */

        printf("  %td bytes payload\n", (char*)HDR_PTR(cur->next) - (char*)cur - sizeof(Header));

        cur = HDR_PTR(cur->next);
    }

    fflush(stdout);
}

View File

@ -0,0 +1,63 @@
#ifndef YALLOC_INTERNALS_H
#define YALLOC_INTERNALS_H

#include <stdint.h>

typedef struct
{
    uint32_t prev; // low bit set if free
    uint32_t next; // for used blocks: low bit set if unused header at the end
    /* We need user data to be 32-byte aligned, so the header needs
     * to be 32 bytes in size (as user data follows the header) */
    uint8_t padding[32 - (sizeof(uint32_t) * 2)];
} Header;

/* Offsets are stored shifted right by one; the low bit of prev/next is a
   flag (free / padded). HDR_PTR() masks the flag and shifts back.
   NOTE(review): this comment previously described 16bit offsets; the fields
   are 32bit now — confirm the intended addressable range (cf. MAX_POOL_SIZE). */
#define NIL 0xFFFFFFFEu

// return Header-address for a prev/next
#define HDR_PTR(offset) ((Header*)((char*)pool + (((offset) & NIL)<<1)))

// return a prev/next for a Header-address
#define HDR_OFFSET(blockPtr) ((uint32_t)(((char*)blockPtr - (char*)pool) >> 1))

/* Default: internal validation on unless NDEBUG is defined (can be forced
   either way on the compiler command line). */
#ifndef YALLOC_INTERNAL_VALIDATE
# ifdef NDEBUG
#  define YALLOC_INTERNAL_VALIDATE 0
# else
#  define YALLOC_INTERNAL_VALIDATE 1
#endif
#endif

/*
internal_assert() is used in some places to check internal expectations.
Activate this if you modify the code to detect problems as early as possible.
In other cases this should be deactivated.
*/
#if 0
#define internal_assert assert
#else
#define internal_assert(condition)((void) 0)
#endif

// detects offsets that point nowhere
static inline int isNil(uint32_t offset)
{
    return (offset | 1) == 0xFFFFFFFF;
}

// low bit of prev marks a free block
static inline int isFree(Header * hdr)
{
    return hdr->prev & 1;
}

// low bit of next marks trailing padding after the user data
static inline int isPadded(Header * hdr)
{
    return hdr->next & 1;
}

#endif // YALLOC_INTERNALS_H

View File

@ -32,7 +32,7 @@ GLdc uses CMake for its build system, it currently ships with two "backends":
- kospvr - This is the hardware-accelerated Dreamcast backend
- software - This is a stub software rasterizer used for testing and debugging
To compile a Dreamcast debug build, you'll want to do something like the following:
To compile for Dreamcast, you'll want to do something like the following:
```
mkdir dcbuild
@ -41,11 +41,6 @@ cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" .
make
```
For a release build, replace the cmake line with with the following:
```
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/Dreamcast.cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release ..
```
You will need KallistiOS compiled and configured (e.g. the KOS_BASE environment
variable must be set)

View File

@ -12,45 +12,36 @@
#include "aligned_vector.h"
extern inline void* aligned_vector_resize(AlignedVector* vector, const uint32_t element_count);
extern inline void* aligned_vector_extend(AlignedVector* vector, const uint32_t additional_count);
extern inline void* aligned_vector_reserve(AlignedVector* vector, uint32_t element_count);
extern inline void* aligned_vector_push_back(AlignedVector* vector, const void* objs, uint32_t count);
extern inline void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
extern inline void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count);
extern inline void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
extern inline void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count);
void aligned_vector_init(AlignedVector* vector, uint32_t element_size) {
/* Now initialize the header*/
AlignedVectorHeader* const hdr = &vector->hdr;
hdr->size = 0;
hdr->capacity = ALIGNED_VECTOR_CHUNK_SIZE;
hdr->element_size = element_size;
void aligned_vector_init(AlignedVector* vector, unsigned int element_size) {
vector->size = vector->capacity = 0;
vector->element_size = element_size;
vector->data = NULL;
/* Reserve some initial capacity. This will do the allocation but not set up the header */
void* ptr = aligned_vector_reserve(vector, ALIGNED_VECTOR_CHUNK_SIZE);
assert(ptr);
(void) ptr;
/* Reserve some initial capacity */
aligned_vector_reserve(vector, ALIGNED_VECTOR_CHUNK_SIZE);
}
void aligned_vector_shrink_to_fit(AlignedVector* vector) {
AlignedVectorHeader* const hdr = &vector->hdr;
if(hdr->size == 0) {
uint32_t element_size = hdr->element_size;
if(vector->size == 0) {
free(vector->data);
/* Reallocate the header */
vector->data = NULL;
hdr->size = hdr->capacity = 0;
hdr->element_size = element_size;
vector->capacity = 0;
} else {
uint32_t new_byte_size = (hdr->size * hdr->element_size);
uint8_t* original_data = vector->data;
unsigned int new_byte_size = vector->size * vector->element_size;
unsigned char* original_data = vector->data;
vector->data = (unsigned char*) memalign(0x20, new_byte_size);
if(original_data) {
FASTCPY(vector->data, original_data, new_byte_size);
free(original_data);
}
hdr->capacity = hdr->size;
vector->capacity = vector->size;
}
}

View File

@ -3,8 +3,6 @@
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
@ -13,13 +11,28 @@ extern "C" {
#if defined(__APPLE__) || defined(__WIN32__)
/* Linux + Kos define this, OSX does not, so just use malloc there */
static inline void* memalign(size_t alignment, size_t size) {
(void) alignment;
return malloc(size);
}
#else
#include <malloc.h>
#endif
#ifdef __DREAMCAST__
#include <kos/string.h>
#define AV_MEMCPY4 memcpy4
#else
#define AV_MEMCPY4 memcpy
#endif
typedef struct {
unsigned int size;
unsigned int capacity;
unsigned char* data;
unsigned int element_size;
} AlignedVector;
#define ALIGNED_VECTOR_CHUNK_SIZE 256u
#ifdef __cplusplus
#define AV_FORCE_INLINE static inline
#else
@ -28,193 +41,94 @@ static inline void* memalign(size_t alignment, size_t size) {
#define AV_FORCE_INLINE static AV_INLINE_DEBUG
#endif
#ifdef __DREAMCAST__
#include <kos/string.h>
AV_FORCE_INLINE void *AV_MEMCPY4(void *dest, const void *src, size_t len)
{
if(!len)
{
return dest;
}
const uint8_t *s = (uint8_t *)src;
uint8_t *d = (uint8_t *)dest;
uint32_t diff = (uint32_t)d - (uint32_t)(s + 1); // extra offset because input gets incremented before output is calculated
// Underflow would be like adding a negative offset
// Can use 'd' as a scratch reg now
asm volatile (
"clrs\n" // Align for parallelism (CO) - SH4a use "stc SR, Rn" instead with a dummy Rn
".align 2\n"
"0:\n\t"
"dt %[size]\n\t" // (--len) ? 0 -> T : 1 -> T (EX 1)
"mov.b @%[in]+, %[scratch]\n\t" // scratch = *(s++) (LS 1/2)
"bf.s 0b\n\t" // while(s != nexts) aka while(!T) (BR 1/2)
" mov.b %[scratch], @(%[offset], %[in])\n" // *(datatype_of_s*) ((char*)s + diff) = scratch, where src + diff = dest (LS 1)
: [in] "+&r" ((uint32_t)s), [scratch] "=&r" ((uint32_t)d), [size] "+&r" (len) // outputs
: [offset] "z" (diff) // inputs
: "t", "memory" // clobbers
);
return dest;
}
#else
#define AV_MEMCPY4 memcpy
#endif
typedef struct {
uint32_t size;
uint32_t capacity;
uint32_t element_size;
} __attribute__((aligned(32))) AlignedVectorHeader;
typedef struct {
AlignedVectorHeader hdr;
uint8_t* data;
} AlignedVector;
#define ALIGNED_VECTOR_CHUNK_SIZE 256u
#define ROUND_TO_CHUNK_SIZE(v) \
((((v) + ALIGNED_VECTOR_CHUNK_SIZE - 1) / ALIGNED_VECTOR_CHUNK_SIZE) * ALIGNED_VECTOR_CHUNK_SIZE)
void aligned_vector_init(AlignedVector* vector, uint32_t element_size);
void aligned_vector_init(AlignedVector* vector, unsigned int element_size);
AV_FORCE_INLINE void* aligned_vector_at(const AlignedVector* vector, const uint32_t index) {
const AlignedVectorHeader* hdr = &vector->hdr;
assert(index < hdr->size);
return vector->data + (index * hdr->element_size);
}
AV_FORCE_INLINE void* aligned_vector_reserve(AlignedVector* vector, uint32_t element_count) {
AlignedVectorHeader* hdr = &vector->hdr;
if(element_count < hdr->capacity) {
return aligned_vector_at(vector, element_count);
AV_FORCE_INLINE void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) {
if(element_count <= vector->capacity) {
return NULL;
}
uint32_t original_byte_size = (hdr->size * hdr->element_size);
unsigned int original_byte_size = vector->size * vector->element_size;
/* We overallocate so that we don't make small allocations during push backs */
element_count = ROUND_TO_CHUNK_SIZE(element_count);
uint32_t new_byte_size = (element_count * hdr->element_size);
uint8_t* original_data = vector->data;
unsigned int new_byte_size = element_count * vector->element_size;
unsigned char* original_data = vector->data;
vector->data = (uint8_t*) memalign(0x20, new_byte_size);
vector->data = (unsigned char*) memalign(0x20, new_byte_size);
assert(vector->data);
AV_MEMCPY4(vector->data, original_data, original_byte_size);
free(original_data);
if(original_data) {
AV_MEMCPY4(vector->data, original_data, original_byte_size);
free(original_data);
}
vector->capacity = element_count;
hdr->capacity = element_count;
return vector->data + original_byte_size;
}
AV_FORCE_INLINE AlignedVectorHeader* aligned_vector_header(const AlignedVector* vector) {
return (AlignedVectorHeader*) &vector->hdr;
AV_FORCE_INLINE void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
assert(index < vector->size);
return &vector->data[index * vector->element_size];
}
AV_FORCE_INLINE uint32_t aligned_vector_size(const AlignedVector* vector) {
const AlignedVectorHeader* hdr = &vector->hdr;
return hdr->size;
}
AV_FORCE_INLINE uint32_t aligned_vector_capacity(const AlignedVector* vector) {
const AlignedVectorHeader* hdr = &vector->hdr;
return hdr->capacity;
}
AV_FORCE_INLINE void* aligned_vector_front(const AlignedVector* vector) {
return vector->data;
}
#define av_assert(x) \
do {\
if(!(x)) {\
fprintf(stderr, "Assertion failed at %s:%d\n", __FILE__, __LINE__);\
exit(1);\
}\
} while(0); \
/* Resizes the array and returns a pointer to the first new element (if upsizing) or NULL (if downsizing) */
AV_FORCE_INLINE void* aligned_vector_resize(AlignedVector* vector, const uint32_t element_count) {
AV_FORCE_INLINE void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count) {
void* ret = NULL;
AlignedVectorHeader* hdr = &vector->hdr;
uint32_t previous_count = hdr->size;
if(hdr->capacity <= element_count) {
unsigned int previousCount = vector->size;
if(vector->capacity < element_count) {
/* If we didn't have capacity, increase capacity (slow) */
aligned_vector_reserve(vector, element_count);
hdr->size = element_count;
ret = aligned_vector_at(vector, previous_count);
av_assert(hdr->size == element_count);
av_assert(hdr->size <= hdr->capacity);
} else if(previous_count < element_count) {
vector->size = element_count;
ret = aligned_vector_reserve(vector, element_count);
} else if(previousCount < element_count) {
/* So we grew, but had the capacity, just get a pointer to
* where we were */
hdr->size = element_count;
av_assert(hdr->size < hdr->capacity);
ret = aligned_vector_at(vector, previous_count);
} else if(hdr->size != element_count) {
hdr->size = element_count;
av_assert(hdr->size < hdr->capacity);
vector->size = element_count;
ret = aligned_vector_at(vector, previousCount);
} else {
vector->size = element_count;
}
return ret;
}
AV_FORCE_INLINE void* aligned_vector_push_back(AlignedVector* vector, const void* objs, uint32_t count) {
AV_FORCE_INLINE void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count) {
/* Resize enough room */
AlignedVectorHeader* hdr = &vector->hdr;
assert(count);
assert(hdr->element_size);
assert(vector->element_size);
#ifndef NDEBUG
uint32_t element_size = hdr->element_size;
uint32_t initial_size = hdr->size;
#endif
unsigned int initial_size = vector->size;
aligned_vector_resize(vector, vector->size + count);
uint8_t* dest = (uint8_t*) aligned_vector_resize(vector, hdr->size + count);
assert(dest);
assert(vector->size == initial_size + count);
unsigned char* dest = vector->data + (vector->element_size * initial_size);
/* Copy the objects in */
AV_MEMCPY4(dest, objs, hdr->element_size * count);
AV_MEMCPY4(dest, objs, vector->element_size * count);
assert(hdr->element_size == element_size);
assert(hdr->size == initial_size + count);
return dest;
}
AV_FORCE_INLINE void* aligned_vector_extend(AlignedVector* vector, const uint32_t additional_count) {
AlignedVectorHeader* hdr = &vector->hdr;
void* ret = aligned_vector_resize(vector, hdr->size + additional_count);
assert(ret); // Should always return something
return ret;
AV_FORCE_INLINE void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) {
return aligned_vector_resize(vector, vector->size + additional_count);
}
AV_FORCE_INLINE void aligned_vector_clear(AlignedVector* vector){
AlignedVectorHeader* hdr = &vector->hdr;
hdr->size = 0;
vector->size = 0;
}
void aligned_vector_shrink_to_fit(AlignedVector* vector);
void aligned_vector_cleanup(AlignedVector* vector);
AV_FORCE_INLINE void* aligned_vector_back(AlignedVector* vector){
AlignedVectorHeader* hdr = &vector->hdr;
return aligned_vector_at(vector, hdr->size ? hdr->size - 1 : 0);
static inline void* aligned_vector_back(AlignedVector* vector){
return aligned_vector_at(vector, vector->size - 1);
}
#ifdef __cplusplus

View File

@ -68,6 +68,7 @@ void* named_array_reserve(NamedArray* array, unsigned int id) {
void named_array_release(NamedArray* array, unsigned int new_id) {
unsigned int i = new_id / 8;
unsigned int j = new_id % 8;
array->used_markers[i] &= (unsigned char) ~(1 << j);
}

View File

@ -19,10 +19,6 @@ __BEGIN_DECLS
#include <math.h>
#if __STDCPP_FLOAT16_T__
#include <stdfloat>
#endif
/* Primitive Types taken from GL for compatability */
/* Not all types are implemented in Open GL DC V.1.0 */
#define GL_POINTS 0x0000
@ -309,13 +305,12 @@ __BEGIN_DECLS
#define GL_UNSIGNED_INT 0x1405
#define GL_FLOAT 0x1406
#define GL_DOUBLE 0x140A
#define GL_HALF_FLOAT 0x140B
#define GL_2_BYTES 0x1407
#define GL_3_BYTES 0x1408
#define GL_4_BYTES 0x1409
/* ErrorCode */
#define GL_NO_ERROR ((GLenum) 0)
#define GL_NO_ERROR 0
#define GL_INVALID_ENUM 0x0500
#define GL_INVALID_VALUE 0x0501
#define GL_INVALID_OPERATION 0x0502
@ -364,7 +359,7 @@ __BEGIN_DECLS
#define GL_UNSIGNED_SHORT_5_6_5_REV 0x8364
#define GL_UNSIGNED_SHORT_4_4_4_4_REV 0x8365
#define GL_UNSIGNED_SHORT_1_5_5_5_REV 0x8366
#define GL_UNSIGNED_INT_8_8_8_8_REV 0x8367
#define GL_UNSIGNED_INT_2_10_10_10_REV 0x8368
#define GL_COLOR_INDEX 0x1900
@ -376,32 +371,6 @@ __BEGIN_DECLS
#define GL_RGBA 0x1908
#define GL_LUMINANCE 0x1909
#define GL_LUMINANCE_ALPHA 0x190A
#define GL_R3_G3_B2 0x2A10
#define GL_ALPHA4 0x803B
#define GL_ALPHA8 0x803C
#define GL_ALPHA12 0x803D
#define GL_ALPHA16 0x803E
#define GL_LUMINANCE4 0x803F
#define GL_LUMINANCE8 0x8040
#define GL_LUMINANCE12 0x8041
#define GL_LUMINANCE16 0x8042
#define GL_LUMINANCE4_ALPHA4 0x8043
#define GL_LUMINANCE6_ALPHA2 0x8044
#define GL_LUMINANCE8_ALPHA8 0x8045
#define GL_LUMINANCE12_ALPHA4 0x8046
#define GL_LUMINANCE12_ALPHA12 0x8047
#define GL_LUMINANCE16_ALPHA16 0x8048
#define GL_INTENSITY4 0x804A
#define GL_INTENSITY8 0x804B
#define GL_INTENSITY12 0x804C
#define GL_INTENSITY16 0x804D
#define GL_BGR 0x80E0
#define GL_BGRA 0x80E1
#define GL_INTENSITY 0x8049
#define GL_RGB4 0x804F
@ -418,14 +387,6 @@ __BEGIN_DECLS
#define GL_RGBA12 0x805A
#define GL_RGBA16 0x805B
#define GL_R8 0x8229
#define GL_RG8 0x822B
#define GL_RG 0x8227
#define GL_R16 0x822A
#define GL_RG16 0x822C
#define GL_COMPRESSED_RED 0x8225
#define GL_COMPRESSED_RG 0x8226
/* Polygons */
#define GL_POINT 0x1B00
#define GL_LINE 0x1B01
@ -466,12 +427,6 @@ __BEGIN_DECLS
#define GL_FALSE 0
#define GL_TRUE 1
#if __STDCPP_FLOAT16_T__
#define GLhalf std::float16_t
#else
#define GLhalf unsigned short
#endif
/* Stubs for portability */
#define GL_LINE_SMOOTH 0x0B20
#define GL_ALPHA_TEST 0x0BC0
@ -710,7 +665,6 @@ GLAPI void APIENTRY glFrustum(GLfloat left, GLfloat right,
/* Fog Functions - client must enable GL_FOG for this to take effect */
GLAPI void APIENTRY glFogi(GLenum pname, GLint param);
GLAPI void APIENTRY glFogf(GLenum pname, GLfloat param);
GLAPI void APIENTRY glFogiv(GLenum pname, const GLint* params);
GLAPI void APIENTRY glFogfv(GLenum pname, const GLfloat *params);
/* Lighting Functions - client must enable GL_LIGHTING for this to take effect */

View File

@ -130,7 +130,7 @@ GLAPI void APIENTRY glGenFramebuffersEXT(GLsizei n, GLuint* framebuffers);
GLAPI void APIENTRY glDeleteFramebuffersEXT(GLsizei n, const GLuint* framebuffers);
GLAPI void APIENTRY glBindFramebufferEXT(GLenum target, GLuint framebuffer);
GLAPI void APIENTRY glFramebufferTexture2DEXT(GLenum target, GLenum attachment, GLenum textarget, GLuint texture, GLint level);
GLAPI void APIENTRY glGenerateMipmap(GLenum target);
GLAPI void APIENTRY glGenerateMipmapEXT(GLenum target);
GLAPI GLenum APIENTRY glCheckFramebufferStatusEXT(GLenum target);
GLAPI GLboolean APIENTRY glIsFramebufferEXT(GLuint framebuffer);
@ -203,7 +203,7 @@ GLAPI void APIENTRY glCompressedTexImage2DARB(GLenum target,
#define glClientActiveTexture glClientActiveTextureARB
#define glMultiTexCoord2f glMultiTexCoord2fARB
#define glGenerateMipmapEXT glGenerateMipmap
#define glGenerateMipmap glGenerateMipmapEXT
#define glCompressedTexImage2D glCompressedTexImage2DARB
#ifndef GL_VERSION_1_4

View File

@ -35,6 +35,8 @@ extern const char* GLDC_VERSION;
#define GL_NEARZ_CLIPPING_KOS 0xEEFA
#define GL_UNSIGNED_BYTE_TWID_KOS 0xEEFB
/* Initialize the GL pipeline. GL will initialize the PVR. */
GLAPI void APIENTRY glKosInit();
@ -55,13 +57,6 @@ typedef struct {
GLuint initial_pt_capacity;
GLuint initial_immediate_capacity;
/* Default: True
*
* Whether glTexImage should automatically twiddle textures
* if the internal format is a generic format (e.g. GL_RGB).
* this is the same as calling glEnable(GL_TEXTURE_TWIDDLE_KOS)
* on boot */
GLboolean texture_twiddle;
} GLdcConfig;
@ -92,7 +87,7 @@ GLAPI void APIENTRY glKosInitConfig(GLdcConfig* config);
*/
GLAPI void APIENTRY glKosInitEx(GLdcConfig* config);
GLAPI void APIENTRY glKosSwapBuffers();
GLAPI void APIENTRY glKosShutdown();
/*
* CUSTOM EXTENSION multiple_shared_palette_KOS
@ -191,28 +186,12 @@ GLAPI void APIENTRY glKosShutdown();
/* Memory allocation extension (GL_KOS_texture_memory_management) */
GLAPI GLvoid APIENTRY glDefragmentTextureMemory_KOS(void);
/* glGet extensions */
#define GL_FREE_TEXTURE_MEMORY_KOS 0xEF3D
#define GL_USED_TEXTURE_MEMORY_KOS 0xEF3E
#define GL_FREE_CONTIGUOUS_TEXTURE_MEMORY_KOS 0xEF3F
//for palette internal format (glfcConfig)
#define GL_RGB565_KOS 0xEF40
#define GL_ARGB4444_KOS 0xEF41
#define GL_ARGB1555_KOS 0xEF42
#define GL_RGB565_TWID_KOS 0xEF43
#define GL_ARGB4444_TWID_KOS 0xEF44
#define GL_ARGB1555_TWID_KOS 0xEF45
#define GL_COLOR_INDEX8_TWID_KOS 0xEF46
#define GL_COLOR_INDEX4_TWID_KOS 0xEF47
#define GL_RGB_TWID_KOS 0xEF48
#define GL_RGBA_TWID_KOS 0xEF49
/* glGet extensions */
#define GL_TEXTURE_INTERNAL_FORMAT_KOS 0xEF50
/* If enabled, will twiddle texture uploads where possible */
#define GL_TEXTURE_TWIDDLE_KOS 0xEF51
__END_DECLS

View File

@ -1,446 +0,0 @@
#include <cstdio>
#include <stdbool.h>
#include <stdlib.h>
#include <time.h>
#ifdef __DREAMCAST__
#include <kos.h>
float avgfps = -1;
#endif
#include "GL/gl.h"
#include "GL/glkos.h"
#include "GL/glu.h"
#include "GL/glext.h"
#define PI 3.14159265358979323846264338327950288f
#define RAD_TO_DEG 57.295779513082320876798154814105f
#define MAX_CUBES 350
float timeElapsed = 0.0f;
const float dt = 1.0f / 60.0f;
float angle = 0;
const float invAngle360 = 1.0f / 360.0f;
const float cameraDistance = 3.0f;
bool isDrawingArrays = false;
bool isBlendingEnabled = true;
bool isRunning = true;
typedef struct
{
GLubyte r;
GLubyte g;
GLubyte b;
GLubyte a;
} Color;
Color colors[] =
{
{255, 0, 0, 128},
{0, 255, 0, 128},
{0, 0, 255, 128},
{255, 255, 0, 128},
{255, 0, 255, 128},
{0, 255, 255, 128}
};
Color faceColors[24];
float cubeVertices[] =
{
// Front face
-1.0f, -1.0f, +1.0f, // vertex 0
+1.0f, -1.0f, +1.0f, // vertex 1
+1.0f, +1.0f, +1.0f, // vertex 2
-1.0f, +1.0f, +1.0f, // vertex 3
// Back face
-1.0f, -1.0f, -1.0f, // vertex 4
+1.0f, -1.0f, -1.0f, // vertex 5
+1.0f, +1.0f, -1.0f, // vertex 6
-1.0f, +1.0f, -1.0f, // vertex 7
// Top face
-1.0f, +1.0f, +1.0f, // vertex 8
+1.0f, +1.0f, +1.0f, // vertex 9
+1.0f, +1.0f, -1.0f, // vertex 10
-1.0f, +1.0f, -1.0f, // vertex 11
// Bottom face
-1.0f, -1.0f, +1.0f, // vertex 12
+1.0f, -1.0f, +1.0f, // vertex 13
+1.0f, -1.0f, -1.0f, // vertex 14
-1.0f, -1.0f, -1.0f, // vertex 15
// Right face
+1.0f, -1.0f, +1.0f, // vertex 16
+1.0f, -1.0f, -1.0f, // vertex 17
+1.0f, +1.0f, -1.0f, // vertex 18
+1.0f, +1.0f, +1.0f, // vertex 19
// Left face
-1.0f, -1.0f, +1.0f, // vertex 20
-1.0f, -1.0f, -1.0f, // vertex 21
-1.0f, +1.0f, -1.0f, // vertex 22
-1.0f, +1.0f, +1.0f // vertex 23
};
// Set up indices array
unsigned int cubeIndices[] =
{
// Front face
0, 1, 2, 3,
// Back face
4, 5, 6, 7,
// Top face
8, 9, 10, 11,
// Bottom face
12, 13, 14, 15,
// Right face
16, 17, 18, 19,
// Left face
20, 21, 22, 23
};
typedef struct
{
float r;
float x, y, z;
float vx, vy, vz;
} Cube;
Cube cubes[MAX_CUBES];
int numCubes = 0;
// Create a 4x4 identity matrix
float cubeTransformationMatrix[16] = { 1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f,
0.0f, 0.0f, 0.0f, 1.0f };
/* Emit one diagnostic line to the platform console: the KOS kernel
 * debug log on Dreamcast, stdout elsewhere. A newline is always appended. */
void debugLog(const char* msg) {
#ifdef __DREAMCAST__
    dbglog(DBG_KDEBUG, "%s\n", msg);
#else
    puts(msg); /* writes msg + '\n': same bytes as printf("%s\n", msg) */
#endif
}
/* Fold the current PVR frame rate into a running average (avgfps).
 * avgfps == -1 is the "no sample yet" sentinel it is initialised with.
 * No-op off Dreamcast. */
void runningStats() {
#ifdef __DREAMCAST__
pvr_stats_t stats;
pvr_get_stats(&stats);
/* Exponential-style smoothing: average the old average with the new sample. */
if (avgfps != -1)
avgfps = (avgfps + stats.frame_rate) * 0.5f;
else
avgfps = stats.frame_rate;
#endif
}
/* Print the running average frame rate accumulated by runningStats().
 * No-op off Dreamcast. */
void avgStats() {
#ifdef __DREAMCAST__
dbglog(DBG_DEBUG, "Average frame rate: ~%f fps\n", avgfps);
#endif
}
/* Dump instantaneous PVR statistics (VBL count, current fps), then the
 * running average. Called every ~10 seconds from updateTimer().
 * No-op off Dreamcast. */
void stats() {
#ifdef __DREAMCAST__
pvr_stats_t stats;
pvr_get_stats(&stats);
dbglog(DBG_DEBUG, "3D Stats: %d VBLs, current frame rate ~%f fps\n", stats.vbl_count, stats.frame_rate);
avgStats();
#endif
}
void addCube(float r, float x, float y, float z, float vx, float vy, float vz)
{
if (numCubes < MAX_CUBES) {
cubes[numCubes].r = r;
cubes[numCubes].x = x;
cubes[numCubes].y = y;
cubes[numCubes].z = z;
cubes[numCubes].vx = vx;
cubes[numCubes].vy = vy;
cubes[numCubes].vz = vz;
numCubes++;
}
}
/* Convenience wrapper: add a stationary cube whose radius is half the
 * given scale factor. */
void addCubeQuick(float x, float y, float z, float scale_factor)
{
    const float radius = 0.5f * scale_factor;
    addCube(radius, x, y, z, 0.0f, 0.0f, 0.0f);
}
/* Advance every active cube by one Euler step of `dt` seconds and bounce
 * it off the walls of the [-3, +3] box on each axis.
 *
 * Fix: the loop index is now a signed int to match `numCubes`; the
 * original used size_t, causing a signed/unsigned comparison (and an
 * effectively unbounded loop if numCubes were ever negative). */
void updateCubes(float dt)
{
    for (int i = 0; i < numCubes; i++)
    {
        Cube* cube = &cubes[i];

        cube->x += cube->vx * dt;
        cube->y += cube->vy * dt;
        cube->z += cube->vz * dt;

        /* Reflect velocity on exit; position is not clamped, so a cube may
         * sit slightly outside the box for a frame before moving back in. */
        if (cube->x < -3 || cube->x > +3) { cube->vx *= -1; }
        if (cube->y < -3 || cube->y > +3) { cube->vy *= -1; }
        if (cube->z < -3 || cube->z > +3) { cube->vz *= -1; }
    }
}
/* Draw one 2x2x2 cube centered on the origin from the global vertex and
 * per-face color arrays (24 vertices, 6 quads). The `isDrawingArrays`
 * toggle (flipped by the A button) selects glDrawArrays vs an indexed
 * glDrawElements path. */
void renderUnitCube()
{
glEnableClientState(GL_VERTEX_ARRAY);
glEnableClientState(GL_COLOR_ARRAY);
glVertexPointer(3, GL_FLOAT, 0, cubeVertices);
glColorPointer(4, GL_UNSIGNED_BYTE, 0, faceColors);
if (isDrawingArrays) {
glDrawArrays(GL_QUADS, 0, 24);
}
else {
glDrawElements(GL_QUADS, 24, GL_UNSIGNED_INT, cubeIndices);
}
/* Restore client state so other draw paths are unaffected. */
glDisableClientState(GL_COLOR_ARRAY);
glDisableClientState(GL_VERTEX_ARRAY);
}
/* Draw every cube in the pool: translate to its position, spin it by
 * `angle` degrees about the (1,1,1) axis, and scale it so later cubes
 * are larger (scale ramps linearly from 0.05 up to 0.40).
 *
 * Fix: the loop index is a signed int to match `numCubes`; the original
 * size_t index caused a signed/unsigned comparison in both the loop
 * condition and the scale computation. */
void renderCubes(float angle)
{
    for (int i = 0; i < numCubes; i++) {
        const float scale_factor = 0.05f + (i / (float)numCubes) * 0.35f;
        Cube* cube = &cubes[i];

        glPushMatrix(); /* Save previous camera state */
        glMatrixMode(GL_MODELVIEW);
        glTranslatef(cube->x, cube->y, cube->z);
        glRotatef(angle, 1, 1, 1);                          /* Rotate camera / object */
        glScalef(scale_factor, scale_factor, scale_factor); /* Apply scale factor */
        renderUnitCube();
        glPopMatrix(); /* Restore previous camera state */
    }
}
/* Uniform pseudo-random float in [Min, Max], driven by rand().
 * Note: rand() must be seeded by the caller for varying sequences. */
float rnd(float Min, float Max)
{
    const float span = Max - Min;
    const float scaled = span * (float)rand();
    return scaled / (float)RAND_MAX + Min;
}
/* One-time setup: bring up GL/PVR via glKosInit(), configure depth and
 * blend state, set a 640x480 viewport, reset the projection matrix, and
 * fill in the per-face vertex colors for the cube. */
void initialize()
{
debugLog("Initialize video output");
glKosInit();
glClearDepth(1.0);
glDepthFunc(GL_LEQUAL);
glDepthMask(GL_TRUE);
glEnable(GL_DEPTH_TEST);
glShadeModel(GL_SMOOTH);
/* Honour the initial blending toggle (also flipped at runtime by B). */
if (isBlendingEnabled)
{
glEnable(GL_BLEND);
}
else
{
glDisable(GL_BLEND);
}
glBlendFunc(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA);
glDisable(GL_CULL_FACE);
glViewport(0, 0, 640, 480);
glClearColor(0.0f, 0.0f, 0.3f, 1.0f);
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
// Set up colors (each face has a different color)
/* 6 faces x 4 vertices = all 24 entries of faceColors. */
for (int i = 0; i < 6; i++)
{
faceColors[i * 4] = colors[i];
faceColors[i * 4 + 1] = colors[i];
faceColors[i * 4 + 2] = colors[i];
faceColors[i * 4 + 3] = colors[i];
}
}
/* Advance the global clock by one fixed step (dt); once more than ten
 * seconds have accumulated, print stats and restart the window. */
void updateTimer()
{
    timeElapsed += dt;

    if (timeElapsed <= 10.0f)
    {
        return;
    }

    stats();
    timeElapsed = 0.0f;
}
/* One fixed-timestep frame: advance timers, wrap and advance the spin
 * angle, clear the buffers, set up the camera, then update and draw the
 * cubes, finally stripping the translation out of the model-view. */
void updateLogic()
{
updateTimer();
/* Drop whole rotations so `angle` stays near [0, 360) before adding
 * this frame's 50 deg/s increment. */
const int fullRot = (int)(angle * invAngle360);
angle -= fullRot * 360.0f;
angle += 50.0f * dt;
/* Camera zoom oscillates sinusoidally with elapsed time. */
const float zoomVal = __builtin_sinf(timeElapsed) * 5.0f;
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
// Set up the camera position and orientation
float cameraPos[] = { 0.0f, 0.0f, cameraDistance };
float cameraTarget[] = { 0.0f, 0.0f, 0.0f };
float cameraUp[] = { 0.0f, 1.0f, 0.0f };
// Move the camera
gluLookAt(cameraPos[0], cameraPos[1], cameraPos[2],
cameraTarget[0], cameraTarget[1], cameraTarget[2],
cameraUp[0], cameraUp[1], cameraUp[2]);
glTranslatef(0.0f, 0.0f, -cameraDistance + zoomVal);
// Apply cube transformation (identity matrix)
/* NOTE(review): this glLoadIdentity() discards the gluLookAt/translate
 * set up just above, so cubes render in an untransformed model-view.
 * Presumably intentional for this demo — confirm before changing. */
glLoadIdentity();
updateCubes(dt);
renderCubes(angle);
// Reset ModelView matrix to remove camera transformation
float matrix[16];
glGetFloatv(GL_MODELVIEW_MATRIX, matrix);
/* Zero the translation column (elements 12-14), keeping rotation/scale. */
matrix[12] = 0.0f;
matrix[13] = 0.0f;
matrix[14] = 0.0f;
glMatrixMode(GL_MODELVIEW);
glLoadMatrixf(matrix);
}
/* Poll the first Dreamcast controller and edge-detect button presses:
 *   START -> quit the main loop,
 *   A     -> toggle glDrawArrays vs glDrawElements (and the clear color),
 *   B     -> toggle alpha blending.
 * No-op off Dreamcast.
 *
 * Fixes vs original:
 *  - `prevButtons = state->buttons;` was executed even when `state` was
 *    NULL (every other use guarded with `state &&`) — now we bail early.
 *  - prevButtons was a uint8_t, truncating the controller button mask and
 *    breaking edge detection for buttons above bit 7; it is now uint32_t. */
void updateInput()
{
#ifdef __DREAMCAST__
    static uint32_t prevButtons = 0;
    maple_device_t* cont;
    cont_state_t* state;

    cont = maple_enum_type(0, MAPLE_FUNC_CONTROLLER);
    if (!cont)
    {
        return;
    }

    state = (cont_state_t*)maple_dev_status(cont);
    if (!state)
    {
        return;
    }

    if ((state->buttons & CONT_START) && !(prevButtons & CONT_START))
    {
        isRunning = false;
    }

    if ((state->buttons & CONT_A) && !(prevButtons & CONT_A))
    {
        isDrawingArrays = !isDrawingArrays;

        if (isDrawingArrays)
        {
            glClearColor(0.3f, 0.0f, 0.3f, 1.0f);
        }
        else
        {
            glClearColor(0.0f, 0.0f, 0.3f, 1.0f);
        }
    }

    if ((state->buttons & CONT_B) && !(prevButtons & CONT_B))
    {
        isBlendingEnabled = !isBlendingEnabled;

        if (isBlendingEnabled)
        {
            glEnable(GL_BLEND);
        }
        else
        {
            glDisable(GL_BLEND);
        }
    }

    prevButtons = state->buttons;
#endif
}
/* Present the completed frame. No-op off Dreamcast — this build has no
 * windowing backend to present to. */
void swapBuffers()
{
#ifdef __DREAMCAST__
glKosSwapBuffers();
#endif
}
/* Entry point: initialise GL, set a fixed perspective projection,
 * scatter MAX_CUBES randomly placed/drifting cubes, then run the
 * fixed-timestep loop until updateInput() clears isRunning (START). */
int main(int argc, char* argv[])
{
initialize();
// Setup camera frustum
const float aspectRatio = 640.0f / 480.0f;
const float fov = 60;
const float zNear = 0.1f;
const float zFar = 1000.0f;
gluPerspective(fov, aspectRatio, zNear, zFar);
/* NOTE(review): rand() is never seeded, so the generated scene is
 * identical on every run — presumably fine for a benchmark demo. */
for (size_t i = 0; i < MAX_CUBES; i++)
{
const float r = rnd(0.1f, 0.5f);
const float x = rnd(-3.0f, 3.0f);
const float y = rnd(-3.0f, 3.0f);
const float z = rnd(-3.0f, 3.0f);
const float vx = rnd(-2.0f, 2.0f);
const float vy = rnd(-2.0f, 2.0f);
const float vz = rnd(-2.0f, 2.0f);
addCube(r, x, y, z, vx, vy, vz);
}
/* Main loop: logic, input, present, then accumulate fps statistics. */
while (isRunning)
{
updateLogic();
updateInput();
swapBuffers();
runningStats();
}
avgStats();
return 0;
}

View File

@ -145,7 +145,7 @@ int check_start() {
void DrawCube(float x, float z) {
static float pos = 0.0f;
static const float radius = 30.0f;
const static float radius = 30.0f;
pos += 0.001f;

View File

@ -23,11 +23,7 @@ int ImageLoad(char *filename, Image *image) {
}
// seek through the bmp header, up to the width/height:
fseek(file, 10, SEEK_CUR);
uint32_t offset;
fread(&offset, 4, 1, file);
fseek(file, 4, SEEK_CUR);
fseek(file, 18, SEEK_CUR);
// read the width
if ((i = fread(&sizeX, 4, 1, file)) != 1) {
@ -69,7 +65,7 @@ int ImageLoad(char *filename, Image *image) {
}
// seek past the rest of the bitmap header.
fseek(file, offset, SEEK_SET);
fseek(file, 24, SEEK_CUR);
// read the data.
image->data = (char *) malloc(size);

View File

@ -9,7 +9,7 @@
/* A general OpenGL initialization function. Sets all of the initial parameters. */
void InitGL(int Width, int Height) // We call this right after our OpenGL window is created.
{
glClearColor(0.0f, 0.0f, 1.0f, 0.0f); // This Will Clear The Background Color To Black
glClearColor(0.0f, 0.0f, 0.0f, 0.0f); // This Will Clear The Background Color To Black
glClearDepth(1.0); // Enables Clearing Of The Depth Buffer
glDepthFunc(GL_LEQUAL); // The Type Of Depth Test To Do
glEnable(GL_DEPTH_TEST); // Enables Depth Testing
@ -20,7 +20,7 @@ void InitGL(int Width, int Height) // We call this right after our OpenG
gluPerspective(45.0f,(GLfloat)Width/(GLfloat)Height,0.1f,100.0f); // Calculate The Aspect Ratio Of The Window
glMatrixMode(GL_MODELVIEW);
glMatrixMode(GL_MODELVIEW);
}
/* The function called when our window is resized (which shouldn't happen, because we're fullscreen) */

View File

@ -53,10 +53,10 @@ void LoadGLTextures() {
// 2d texture, level of detail 0 (normal), 3 components (red, green, blue), x size from image, y size from image,
// border 0 (normal), rgb color data, unsigned byte data, and finally the data itself.
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, image1->sizeX, image1->sizeY, 0, GL_RGB, GL_UNSIGNED_BYTE, image1->data);
glTexImage2D(GL_TEXTURE_2D, 0, 3, image1->sizeX, image1->sizeY, 0, GL_RGB, GL_UNSIGNED_BYTE, image1->data);
free(image1);
}
};
/* A general OpenGL initialization function. Sets all of the initial parameters. */
void InitGL(int Width, int Height) // We call this right after our OpenGL window is created.
@ -74,7 +74,7 @@ void InitGL(int Width, int Height) // We call this right after our OpenG
gluPerspective(45.0f,(GLfloat)Width/(GLfloat)Height,0.1f,100.0f); // Calculate The Aspect Ratio Of The Window
glMatrixMode(GL_MODELVIEW);
glMatrixMode(GL_MODELVIEW);
}
/* The function called when our window is resized (which shouldn't happen, because we're fullscreen) */

Binary file not shown.

Before

Width:  |  Height:  |  Size: 96 KiB

After

Width:  |  Height:  |  Size: 192 KiB

View File

@ -59,10 +59,10 @@ int ImageLoad(char *filename, Image *image) {
fread(&header, sizeof(header), 1, file);
GLboolean twiddled = (header.type & (1 << 26)) < 1;
GLboolean compressed = (header.type & (1 << 30)) > 0;
GLboolean mipmapped = (header.type & (1 << 31)) > 0;
GLboolean strided = (header.type & (1 << 25)) > 0;
GLboolean twiddled = (header.type & (1 << 25)) < 1;
GLboolean compressed = (header.type & (1 << 29)) > 0;
GLboolean mipmapped = (header.type & (1 << 30)) > 0;
GLboolean strided = (header.type & (1 << 24)) > 0;
GLuint format = (header.type >> 27) & 0b111;
image->data = (char *) malloc (header.size);

View File

@ -10,8 +10,6 @@
#ifdef __DREAMCAST__
#include <kos.h>
#else
#include <SDL.h>
#endif
#include <stdio.h>
@ -19,9 +17,7 @@
#include <GL/glu.h>
#include <GL/glkos.h>
#include <stdlib.h>
#include <stdbool.h>
#include <stdint.h>
#include "../loadbmp.h"
@ -88,16 +84,7 @@ void SetupWorld()
int numtriangles;
FILE *filein;
char oneline[255];
#ifdef __DREAMCAST__
filein = fopen("/rd/world.txt", "rt"); // File To Load World Data From
#else
filein = fopen("../samples/nehe10/romdisk/world.txt", "rt");
#endif
if(!filein) {
fprintf(stderr, "Failed to load world file\n");
exit(1);
}
readstr(filein,oneline);
sscanf(oneline, "NUMPOLLIES %d\n", &numtriangles);
@ -241,13 +228,6 @@ void DrawGLScene(void) {
}
int ReadController(void) {
bool start = false;
bool up = false;
bool down = false;
bool left = false;
bool right = false;
#ifdef __DREAMCAST__
maple_device_t *cont;
cont_state_t *state;
@ -261,27 +241,10 @@ int ReadController(void) {
return 0;
}
start = (state->buttons & CONT_START);
up = (state->buttons & CONT_DPAD_UP);
down = (state->buttons & CONT_DPAD_DOWN);
left = (state->buttons & CONT_DPAD_LEFT);
right = (state->buttons & CONT_DPAD_RIGHT);
#else
int num_keys = 0;
uint8_t* state = SDL_GetKeyboardState(&num_keys);
start = state[SDL_SCANCODE_RETURN];
up = state[SDL_SCANCODE_UP];
down = state[SDL_SCANCODE_DOWN];
left = state[SDL_SCANCODE_LEFT];
right = state[SDL_SCANCODE_RIGHT];
#endif
if(start) {
if(state->buttons & CONT_START)
return 0;
}
if(up) {
if(state->buttons & CONT_DPAD_UP) {
xpos -= (float)sin(heading*piover180) * 0.05f;
zpos -= (float)cos(heading*piover180) * 0.05f;
if (walkbiasangle >= 359.0f)
@ -295,7 +258,8 @@ int ReadController(void) {
walkbias = (float)sin(walkbiasangle * piover180)/20.0f;
}
if(down) {
if(state->buttons & CONT_DPAD_DOWN) {
xpos += (float)sin(heading*piover180) * 0.05f;
zpos += (float)cos(heading*piover180) * 0.05f;
if (walkbiasangle <= 1.0f)
@ -309,17 +273,18 @@ int ReadController(void) {
walkbias = (float)sin(walkbiasangle * piover180)/20.0f;
}
if(left) {
if(state->buttons & CONT_DPAD_LEFT) {
heading += 1.0f;
yrot = heading;
}
if(right) {
if(state->buttons & CONT_DPAD_RIGHT) {
heading -= 1.0f;
yrot = heading;
}
#endif
/* Switch to the blended polygon list if needed */
if(blend) {

View File

@ -157,4 +157,4 @@ NUMPOLLIES 36
2.0 0.0 -0.5 0.0 0.0
3.0 1.0 -0.5 1.0 1.0
2.0 1.0 -0.5 0.0 1.0
2.0 0.0 -0.5 0.0 0.0
2.0 0.0 -0.5 0.0 0.0

View File

@ -132,7 +132,7 @@ void LoadGLTextures() {
// 2d texture, level of detail 0 (normal), 3 components (red, green, blue), x size from image, y size from image,
// border 0 (normal), rgb color data, unsigned byte data, and finally the data itself.
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX8_EXT, image1->width, image1->height, 0, GL_COLOR_INDEX8_TWID_KOS, GL_UNSIGNED_BYTE, image1->data);
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX8_EXT, image1->width, image1->height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE_TWID_KOS, image1->data);
glGenerateMipmapEXT(GL_TEXTURE_2D);
free(image1);

View File

@ -254,8 +254,6 @@ int BMP_Infos(FILE *pFile, uint32_t *width, uint32_t *height)
*width = (uint32_t)BmpInfoHeader.Width;
*height = (uint32_t)BmpInfoHeader.Height;
fseek(pFile, BmpInfoHeader.Size + 14, SEEK_SET);
return 1;
}
@ -272,7 +270,6 @@ int BMP_GetPalette(FILE *pFile)
bitCount = BmpInfoHeader.ClrImportant * sizeof(RGB_QUAD);
if (fread(BmpRgbQuad, 1, bitCount, pFile) != bitCount){
fprintf(stderr, "Failed to read palette: %d\n", bitCount);
return 0;
}
@ -284,8 +281,6 @@ int BMP_GetPalette(FILE *pFile)
}
return 1;
}
fprintf(stderr, "BitCount: %d\n", BmpInfoHeader.BitCount);
return 0;
}
@ -351,7 +346,7 @@ int LoadPalettedBMP(const char* filename, Image* image)
}
if (!BMP_GetPalette(fp)) {
printf("Only 16c BMP are supported for this sample\n");
printf("Only 16c BMP are supported for this sample");
return 0;
}
@ -434,7 +429,7 @@ void LoadGLTextures() {
#ifndef USE_16C_PALETTE
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX8_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image1.data);
#else
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX4_EXT, GL_UNSIGNED_BYTE, image1.data);
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image1.data);
#endif
glBindTexture(GL_TEXTURE_2D, textures[1]); // 2d texture (x and y size)
@ -449,7 +444,7 @@ void LoadGLTextures() {
#ifndef USE_16C_PALETTE
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX8_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image1.data);
#else
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX4_EXT, GL_UNSIGNED_BYTE, image1.data);
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image1.width, image1.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image1.data);
#endif
glBindTexture(GL_TEXTURE_2D, textures[2]);
@ -468,7 +463,7 @@ void LoadGLTextures() {
#ifndef USE_16C_PALETTE
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX8_EXT, image2.width, image2.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image2.data);
#else
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image2.width, image2.height, 0, GL_COLOR_INDEX4_EXT, GL_UNSIGNED_BYTE, image2.data);
glTexImage2D(GL_TEXTURE_2D, 0, GL_COLOR_INDEX4_EXT, image2.width, image2.height, 0, GL_COLOR_INDEX, GL_UNSIGNED_BYTE, image2.data);
#endif
}

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 32 KiB

View File

@ -100,7 +100,7 @@ void do_frame() {
glKosSwapBuffers();
}
time_t begin;
time_t start;
void switch_tests(int ppf) {
printf("Beginning new test: %d polys per frame (%d per second at 60fps)\n",
ppf * 3, ppf * 3 * 60);
@ -113,8 +113,8 @@ void check_switch() {
now = time(NULL);
if(now >= (begin + 5)) {
begin = time(NULL);
if(now >= (start + 5)) {
start = time(NULL);
printf(" Average Frame Rate: ~%f fps (%d pps)\n", avgfps, (int)(polycnt * avgfps * 2));
switch(phase) {
@ -165,7 +165,7 @@ int main(int argc, char **argv) {
/* Start off with something obscene */
switch_tests(200000 / 60);
begin = time(NULL);
start = time(NULL);
for(;;) {
if(check_start())

File diff suppressed because it is too large Load Diff

View File

@ -1,64 +0,0 @@
#include <stddef.h>
#include <time.h>
#include <stdio.h>
#ifdef __DREAMCAST__
#include <kos.h>
#include "../profiler.h"
#endif
#include <GL/gl.h>
#include <GL/glkos.h>
#include "image.h"
#define PROFILE 0
/* Benchmark: re-upload the same static RGB image with glTexImage2D in a
 * tight loop for ~5 seconds, then report the call count and the mean
 * seconds-per-call.  `width`, `height` and `header_data` are expected to
 * come from image.h (TODO confirm — they are not declared in this file). */
int main(int argc, char* argv[]) {
(void) argc;
(void) argv;
fprintf(stdout, "Initializing\n");
glKosInit();
glClearColor(0.5f, 0.0f, 0.5f, 1.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT);
glKosSwapBuffers();
GLuint texture_id = 0;
glGenTextures(1, &texture_id);
glBindTexture(GL_TEXTURE_2D, texture_id);
time_t start = time(NULL);
time_t end = start;
int counter = 0;
fprintf(stderr, "Starting test run...\n");
#ifdef __DREAMCAST__
#if PROFILE
profiler_init("/pc/gmon.out");
profiler_start();
#endif
#endif
/* time() has 1-second granularity; coarse, but acceptable over a
 * 5-second measurement window. */
while((end - start) < 5) {
glTexImage2D(
GL_TEXTURE_2D, 0, GL_RGB, width, height, 0, GL_RGB, GL_UNSIGNED_BYTE, header_data
);
++counter;
end = time(NULL);
}
#ifdef __DREAMCAST__
#if PROFILE
profiler_stop();
profiler_clean_up();
#endif
#endif
fprintf(stderr, "Called glTexImage2D %d times (%.4f per call)\n", counter, (float)(end - start) / (float)(counter));
return 0;
}

View File

@ -68,16 +68,14 @@ int check_start() {
}
void setup() {
GLdcConfig cfg;
glKosInitConfig(&cfg);
cfg.initial_immediate_capacity = 14000;
glKosInitEx(&cfg);
glKosInit();
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glOrtho(0, 640, 0, 480, -100, 100);
glMatrixMode(GL_PROJECTION);
glLoadIdentity();
glDisable(GL_NEARZ_CLIPPING_KOS);
}
void do_frame() {
@ -107,12 +105,10 @@ void do_frame() {
glKosSwapBuffers();
}
time_t begin;
time_t start;
void switch_tests(int ppf) {
printf("Beginning new test: %d polys per frame (%d per second at 60fps)\n",
ppf * 2, ppf * 2 * 60);
fflush(stdout);
avgfps = -1;
polycnt = ppf;
}
@ -122,9 +118,10 @@ void check_switch() {
now = time(NULL);
if(now >= (begin + 5)) {
begin = time(NULL);
if(now >= (start + 5)) {
start = time(NULL);
printf(" Average Frame Rate: ~%f fps (%d pps)\n", avgfps, (int)(polycnt * avgfps * 2));
switch(phase) {
case PHASE_HALVE:
@ -165,27 +162,22 @@ void check_switch() {
case PHASE_FINAL:
break;
}
fflush(stdout);
}
}
#define PROFILE 0
int main(int argc, char **argv) {
#if PROFILE
#ifndef NDEBUG
#ifdef __DREAMCAST__
profiler_init("/pc/gmon.out");
profiler_start();
#endif
#endif
setup();
#if PROFILE
profiler_start();
#endif
/* Start off with something obscene */
switch_tests(200000 / 60);
begin = time(NULL);
start = time(NULL);
uint32_t iterations = 2000;
@ -201,9 +193,11 @@ int main(int argc, char **argv) {
stats();
#if PROFILE
#ifdef __DREAMCAST__
#ifndef NDEBUG
profiler_stop();
profiler_clean_up();
#endif
#endif
return 0;

View File

@ -93,7 +93,7 @@ void do_frame() {
glKosSwapBuffers();
}
time_t begin;
time_t start;
void switch_tests(int ppf) {
printf("Beginning new test: %d polys per frame (%d per second at 60fps)\n",
ppf * 2, ppf * 2 * 60);
@ -106,8 +106,8 @@ void check_switch() {
now = time(NULL);
if(now >= (begin + 5)) {
begin = time(NULL);
if(now >= (start + 5)) {
start = time(NULL);
printf(" Average Frame Rate: ~%f fps (%d pps)\n", avgfps, (int)(polycnt * avgfps * 2));
switch(phase) {
@ -155,7 +155,7 @@ int main(int argc, char **argv) {
/* Start off with something obscene */
switch_tests(220000 / 60);
begin = time(NULL);
start = time(NULL);
for(;;) {
if(check_start())

View File

@ -28,8 +28,6 @@ void InitGL(int Width, int Height) // We call this right after our OpenG
glMatrixMode(GL_MODELVIEW);
glLoadIdentity();
glEnable(GL_CULL_FACE);
}
/* The function called when our window is resized (which shouldn't happen, because we're fullscreen) */
@ -88,13 +86,12 @@ void DrawGLScene()
rotation = (rotation > 360.0f) ? rotation - 360.0f : rotation;
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Clear The Screen And The Depth Buffer
glClearColor(0.5f, 0.5f, 0.5f, 0.5f);
glLoadIdentity(); // Reset The View
glDisable(GL_CULL_FACE);
glPushMatrix();
glTranslatef(0.0f, -1.0f, -movement);
glTranslatef(0.0f, -1.0f, movement);
glRotatef(rotation, 0.0f, 1.0f, 0.0f);
glBegin(GL_TRIANGLES);

View File

@ -1,26 +0,0 @@
# Collect the per-feature test headers that the generator scans.
# NOTE: file(GLOB) will not notice new test_*.h files until CMake re-runs.
file(GLOB GL_TESTS ${CMAKE_CURRENT_SOURCE_DIR}/test_*.h)

set(TEST_GENERATOR_BIN ${CMAKE_SOURCE_DIR}/tools/test_generator.py)
set(TEST_MAIN_FILENAME ${CMAKE_CURRENT_BINARY_DIR}/main.cpp)

# Generate the test-runner entry point from the discovered test headers.
# VERBATIM makes argument escaping platform-independent.
add_custom_command(
    OUTPUT ${TEST_MAIN_FILENAME}
    COMMAND ${TEST_GENERATOR_BIN} --output ${TEST_MAIN_FILENAME} ${TEST_FILES} ${GL_TESTS}
    DEPENDS ${TEST_FILES} ${GL_TESTS} ${TEST_GENERATOR_BIN}
    VERBATIM
)

add_executable(gldc_tests ${TEST_FILES} ${TEST_SOURCES} ${TEST_MAIN_FILENAME})

# Scope the include path to the test target instead of the whole directory,
# and link with an explicit visibility keyword.
target_include_directories(gldc_tests PRIVATE ${CMAKE_SOURCE_DIR})
target_link_libraries(gldc_tests PRIVATE GLdc)

# On PC builds GLdc is compiled as 32-bit, so the tests must match.
if(NOT PLATFORM_DREAMCAST)
    set_target_properties(
        gldc_tests
        PROPERTIES
        COMPILE_OPTIONS "-m32"
        LINK_OPTIONS "-m32"
    )
endif()

View File

@ -1,189 +0,0 @@
#include "tools/test.h"
#include <cstdint>
#include <cassert>
#include <malloc.h>
#include <utility>
#include <GL/gl.h>
#include <GL/glkos.h>
#include "GL/alloc/alloc.h"
/* Round n up to the next multiple of `multiple` (n unchanged if it is
 * already a multiple). `multiple` must be non-zero. */
static inline int round_up(int n, int multiple)
{
    assert(multiple);
    const int groups = (n + multiple - 1) / multiple;
    return groups * multiple;
}
#define POOL_SIZE (16 * 2048)
/* Exercises the VRAM-style block allocator (GL/alloc/alloc.h): a pool of
 * 2048-byte blocks with alignment, straddling-avoidance and defrag logic. */
class AllocatorTests : public test::TestCase {
public:
uint8_t* pool = NULL;
// (src, dst) pairs recorded by the defrag callback, in move order.
std::vector<std::pair<void*, void*>> defrag_moves;
void set_up() {
// The allocator expects a 2048-aligned backing buffer.
pool = (uint8_t*) memalign(2048, POOL_SIZE);
assert(((intptr_t) pool) % 2048 == 0);
}
void tear_down() {
alloc_shutdown(pool);
free(pool);
}
/* Defrag callback: records each block move into defrag_moves. */
static void on_defrag(void* src, void* dst, void* user_data) {
AllocatorTests* self = (AllocatorTests*) user_data;
self->defrag_moves.push_back(std::make_pair(src, dst));
}
/* Freeing a middle allocation then defragging should slide the last
 * allocation down into the hole, exactly one move. */
void test_defrag() {
alloc_init(pool, POOL_SIZE);
alloc_malloc(pool, 256);
void* a2 = alloc_malloc(pool, 256);
void* a3 = alloc_malloc(pool, 256);
alloc_free(pool, a2);
alloc_run_defrag(pool, &AllocatorTests::on_defrag, 5, this);
assert_equal(defrag_moves.size(), 1u); // Moved a3 -> a2
assert_equal(defrag_moves[0].first, a3);
assert_equal(defrag_moves[0].second, a2);
assert_equal(alloc_malloc(pool, 256), a3);
}
void test_poor_alloc_aligned() {
/* If we try to allocate and there are no suitable aligned
* slots available, we fallback to any available unaligned slots */
alloc_init(pool, POOL_SIZE);
// Leave only space for an unaligned block
alloc_malloc(pool, (15 * 2048) - 256);
// Should work, we have space (just) but it's not aligned
void* a1 = alloc_malloc(pool, 2048 + 256);
assert_is_not_null(a1);
assert_equal(a1, pool + ((15 * 2048) - 256));
}
void test_poor_alloc_straddling() {
/*
* If we try to allocate a small block, it should not
* cross a 2048 boundary unless there is no other option */
alloc_init(pool, POOL_SIZE);
alloc_malloc(pool, (15 * 2048) - 256);
void* a1 = alloc_malloc(pool, 512);
assert_true((uintptr_t(a1) % 2048) == 0); // Should've aligned to the last 2048 block
/* Allocate the rest of the last block, this leaves a 256 block in the
* penultimate block */
alloc_malloc(pool, 1536);
alloc_free(pool, a1);
/* No choice but to straddle the boundary */
a1 = alloc_malloc(pool, 768);
}
/* The usable base address is the pool rounded up to 2048, and the block
 * count is whatever whole 2048-byte blocks fit after that. */
void test_alloc_init() {
alloc_init(pool, POOL_SIZE);
void* expected_base_address = (void*) round_up((uintptr_t) pool, 2048);
assert_equal(alloc_next_available(pool, 16), expected_base_address);
assert_equal(alloc_base_address(pool), expected_base_address);
size_t expected_blocks = (
uintptr_t(pool + POOL_SIZE) -
uintptr_t(expected_base_address)
) / 2048;
assert_equal(alloc_block_count(pool), expected_blocks);
}
/* Regression scenario replayed against an 8MiB pool; passes if no
 * crash/assert fires inside the allocator. */
void test_complex_case() {
uint8_t* large_pool = (uint8_t*) malloc(8 * 1024 * 1024);
alloc_init(large_pool, 8 * 1024 * 1024);
alloc_malloc(large_pool, 262144);
alloc_malloc(large_pool, 262144);
void* a1 = alloc_malloc(large_pool, 524288);
alloc_free(large_pool, a1);
alloc_malloc(large_pool, 699056);
alloc_malloc(large_pool, 128);
alloc_shutdown(large_pool);
free(large_pool);
}
/* Second regression scenario: alternating alloc/free of mixed sizes. */
void test_complex_case2() {
uint8_t* large_pool = (uint8_t*) malloc(8 * 1024 * 1024);
alloc_init(large_pool, 8 * 1024 * 1024);
void* a1 = alloc_malloc(large_pool, 131072);
alloc_free(large_pool, a1);
alloc_malloc(large_pool, 174768);
void* a2 = alloc_malloc(large_pool, 131072);
alloc_free(large_pool, a2);
alloc_malloc(large_pool, 174768);
void* a3 = alloc_malloc(large_pool, 128);
alloc_free(large_pool, a3);
alloc_shutdown(large_pool);
free(large_pool);
}
/* Walks the documented allocation strategy: sub-2048 allocations pack
 * into partial blocks, >=2048 requests snap to 2048 boundaries. */
void test_alloc_malloc() {
alloc_init(pool, POOL_SIZE);
uint8_t* base_address = (uint8_t*) alloc_base_address(pool);
void* a1 = alloc_malloc(pool, 1024);
/* First alloc should always be the base address */
assert_equal(a1, base_address);
/* An allocation of <= 2048 (well 1024) will not necessarily be at
* a 2k boundary */
void* expected_next_available = base_address + uintptr_t(1024);
assert_equal(alloc_next_available(pool, 1024), expected_next_available);
/* Requesting 2k though will force to a 2k boundary */
expected_next_available = base_address + uintptr_t(2048);
assert_equal(alloc_next_available(pool, 2048), expected_next_available);
/* Now alloc 2048 bytes, this should be on the 2k boundary */
void* a2 = alloc_malloc(pool, 2048);
assert_equal(a2, expected_next_available);
/* If we try to allocate 1k, this should go in the second half of the
* first block */
expected_next_available = base_address + uintptr_t(1024);
void* a3 = alloc_malloc(pool, 1024);
assert_equal(a3, expected_next_available);
alloc_free(pool, a1);
/* Next allocation would go in the just freed block */
expected_next_available = base_address;
assert_equal(alloc_next_available(pool, 64), expected_next_available);
/* Now allocate 14 more 2048 size blocks, the following one should
* return NULL */
for(int i = 0; i < 14; ++i) {
alloc_malloc(pool, 2048);
}
assert_is_null(alloc_malloc(pool, 2048));
/* But we should still have room in the second block for this */
assert_is_not_null(alloc_malloc(pool, 64));
}
};

View File

@ -1,77 +0,0 @@
#include "tools/test.h"
#include <stdint.h>
#include <GL/gl.h>
#include <GL/glkos.h>
/* Verifies that glTexImage2D selects the expected internal texture format
 * (RGB565 / ARGB4444, twiddled or not) for RGB/RGBA uploads. */
class TexImage2DTests : public test::TestCase {
public:
// 8x8 RGBA8 source image used by every test.
uint8_t image_data[8 * 8 * 4] = {0};
void set_up() {
GLdcConfig config;
glKosInitConfig(&config);
// Twiddling is off by default; individual tests enable it explicitly.
config.texture_twiddle = false;
glKosInitEx(&config);
/* Init image data so each texel RGBA value matches the
* position in the array */
for(int i = 0; i < 8 * 8 * 4; i += 4) {
image_data[i + 0] = i;
image_data[i + 1] = i;
image_data[i + 2] = i;
image_data[i + 3] = i;
}
}
void tear_down() {
glKosShutdown();
}
/* RGB upload, twiddling disabled -> plain RGB565. */
void test_rgb_to_rgb565() {
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, 8, 8, 0, GL_RGB, GL_UNSIGNED_BYTE, image_data);
assert_equal(glGetError(), GL_NO_ERROR);
GLint internalFormat;
glGetIntegerv(GL_TEXTURE_INTERNAL_FORMAT_KOS, &internalFormat);
assert_equal(internalFormat, GL_RGB565_KOS);
}
/* RGB upload with twiddling enabled -> twiddled RGB565. */
void test_rgb_to_rgb565_twiddle() {
glEnable(GL_TEXTURE_TWIDDLE_KOS);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, 8, 8, 0, GL_RGB, GL_UNSIGNED_BYTE, image_data);
glDisable(GL_TEXTURE_TWIDDLE_KOS);
assert_equal(glGetError(), GL_NO_ERROR);
GLint internalFormat;
glGetIntegerv(GL_TEXTURE_INTERNAL_FORMAT_KOS, &internalFormat);
assert_equal(internalFormat, GL_RGB565_TWID_KOS);
}
/* RGBA upload, twiddling disabled -> plain ARGB4444. */
void test_rgba_to_argb4444() {
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 8, 8, 0, GL_RGBA, GL_UNSIGNED_BYTE, image_data);
assert_equal(glGetError(), GL_NO_ERROR);
GLint internalFormat;
glGetIntegerv(GL_TEXTURE_INTERNAL_FORMAT_KOS, &internalFormat);
assert_equal(internalFormat, GL_ARGB4444_KOS);
}
/* RGBA upload with twiddling enabled -> twiddled ARGB4444. */
void test_rgba_to_argb4444_twiddle() {
glEnable(GL_TEXTURE_TWIDDLE_KOS);
glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, 8, 8, 0, GL_RGBA, GL_UNSIGNED_BYTE, image_data);
glDisable(GL_TEXTURE_TWIDDLE_KOS);
assert_equal(glGetError(), GL_NO_ERROR);
GLint internalFormat;
glGetIntegerv(GL_TEXTURE_INTERNAL_FORMAT_KOS, &internalFormat);
assert_equal(internalFormat, GL_ARGB4444_TWID_KOS);
}
};

View File

@ -1,637 +0,0 @@
#include <cstdint>
#include <vector>
#include <cstdio>
#include <cmath>
#include <stdexcept>
#include <cassert>
#define SQ_BASE_ADDRESS 0
#define SPAN_SORT_CFG 0
#define PVR_SET(x, y) (void)(x); (void)(y)
/* Mirror of the driver's 32-byte vertex record: PVR command word,
 * position, UV, inverse-W and packed BGRA colour. */
struct Vertex {
uint32_t flags;
float xyz[3];
float uv[2];
float w;
uint8_t bgra[4];
};
/* Half-extents and centre offsets of the emulated viewport.
 * NOTE(review): hwidth is 320 while the VideoMode height below is also
 * 320 — confirm the 640x480-ish mix is intentional for these tests. */
struct {
float hwidth;
float x_plus_hwidth;
float hheight;
float y_plus_hheight;
} VIEWPORT = {320, 320, 240, 240};
struct VideoMode {
float height;
};
/* Static stand-in for the real video-mode query. */
static VideoMode* GetVideoMode() {
static VideoMode mode = {320.0f};
return &mode;
}
/* PVR list command words, as emitted in the real driver. */
enum GPUCommand {
GPU_CMD_POLYHDR = 0x80840000,
GPU_CMD_VERTEX = 0xe0000000,
GPU_CMD_VERTEX_EOL = 0xf0000000,
GPU_CMD_USERCLIP = 0x20000000,
GPU_CMD_MODIFIER = 0x80000000,
GPU_CMD_SPRITE = 0xA0000000
};
/* Everything handed to _glSubmitHeaderOrVertex is appended here so the
 * tests can inspect the submitted stream. */
static std::vector<Vertex> sent;
/* Linearly interpolate two packed 8-bit-per-channel colours:
 * *out = a + (b - a) * t, with t in [0, 1].
 *
 * Fixed-point blend: f2 = t * 256, f1 = 256 - f2.  Channels are blended
 * two at a time via the 0x00FF00FF mask trick.  The odd bytes (bits 8-15
 * and 24-31) are shifted DOWN before multiplying: multiplying the
 * still-shifted 0xFF00FF00 halves first (as the previous version did)
 * overflows 32 bits for the top byte, which zeroed the alpha channel. */
static inline void interpolateColour(const uint32_t* a, const uint32_t* b, const float t, uint32_t* out) {
    const static uint32_t MASK1 = 0x00FF00FF;
    const static uint32_t MASK2 = 0xFF00FF00;

    const uint32_t f2 = 256 * t;
    const uint32_t f1 = 256 - f2;

    /* Even bytes: blend, shift back down, re-mask.
     * Odd bytes: pre-shift down, blend, result lands on MASK2 directly. */
    *out = (((((*a & MASK1) * f1) + ((*b & MASK1) * f2)) >> 8) & MASK1) |
           (((((*a >> 8) & MASK1) * f1) + (((*b >> 8) & MASK1) * f2)) & MASK2);
}
/* Intersect edge v1->v2 with the near plane z = -w and write the
 * interpolated vertex to vout.
 *
 * With d = w + z (signed distance measure), the parametric intersection
 * is t = d0 / (d0 - d1); the sign/sqrt dance below computes that
 * branchlessly, and epsilon nudges t slightly toward the inside to avoid
 * landing exactly on the plane — presumably to keep the clipped point
 * strictly visible (TODO confirm intent). */
static inline void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) {
/* Clipping time! */
const float d0 = v1->w + v1->xyz[2];
const float d1 = v2->w + v2->xyz[2];
const float sign = ((2.0f * (d1 < d0)) - 1.0f);
const float epsilon = -0.00001f * sign;
const float n = (d0 - d1);
const float r = (1.f / sqrtf(n * n)) * sign; // 1 / (d0 - d1), branchless
float t = fmaf(r, d0, epsilon);
vout->xyz[0] = fmaf(v2->xyz[0] - v1->xyz[0], t, v1->xyz[0]);
vout->xyz[1] = fmaf(v2->xyz[1] - v1->xyz[1], t, v1->xyz[1]);
vout->xyz[2] = fmaf(v2->xyz[2] - v1->xyz[2], t, v1->xyz[2]);
vout->w = fmaf(v2->w - v1->w, t, v1->w);
vout->uv[0] = fmaf(v2->uv[0] - v1->uv[0], t, v1->uv[0]);
vout->uv[1] = fmaf(v2->uv[1] - v1->uv[1], t, v1->uv[1]);
interpolateColour((uint32_t*) v1->bgra, (uint32_t*) v2->bgra, t, (uint32_t*) vout->bgra);
}
/* True for either vertex command word (mid-strip or end-of-strip). */
bool glIsVertex(const uint32_t flags) {
    switch(flags) {
        case GPU_CMD_VERTEX:
        case GPU_CMD_VERTEX_EOL:
            return true;
        default:
            return false;
    }
}
/* True only for the end-of-strip vertex command. */
bool glIsLastVertex(const uint32_t flags) {
    return (flags == GPU_CMD_VERTEX_EOL);
}
/* Test stub for the real store-queue submit: the SQ target pointer is
 * ignored and the record is appended to `sent` for later inspection. */
void _glSubmitHeaderOrVertex(volatile uint32_t*, Vertex* vtx) {
sent.push_back(*vtx);
}
/* 1 / sqrt(x^2) == 1 / |x|.  Kept in the sqrt-of-square form so the
 * behaviour (including huge-x overflow to inf -> 0) matches the
 * original exactly. */
float _glFastInvert(float x) {
    const float squared = x * x;
    return 1.f / __builtin_sqrtf(squared);
}
/* Perspective-divide a clip-space vertex and map it into window
 * coordinates using VIEWPORT; `h` is the framebuffer height used to flip
 * the Y axis.  On exit xyz[2] holds the depth value submitted to the
 * hardware (1/w for perspective, see note below for ortho). */
void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
const float f = _glFastInvert(vertex->w);
/* Convert to NDC and apply viewport */
vertex->xyz[0] = __builtin_fmaf(
VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth
);
vertex->xyz[1] = h - __builtin_fmaf(
VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight
);
/* Orthographic projections need to use invZ otherwise we lose
the depth information. As w == 1, and clip-space range is -w to +w
we add 1.0 to the Z to bring it into range. We add a little extra to
avoid a divide by zero.
*/
vertex->xyz[2] = (vertex->w == 1.0f) ? _glFastInvert(1.0001f + vertex->xyz[2]) : f;
}
/* Copy one vertex record; plain struct assignment suffices since Vertex
 * holds only scalar members. */
void memcpy_vertex(Vertex* dst, Vertex* src) {
    dst[0] = src[0];
}
/* Zclipping is so difficult to get right, that self sample tests all the cases of clipping and makes sure that things work as expected */
#ifdef __DREAMCAST__
static volatile int *pvrdmacfg = (int*)0xA05F6888;
static volatile int *qacr = (int*)0xFF000038;
#else
static int pvrdmacfg[2];
static int qacr[2];
#endif
/* Host-side mock of the store-queue submission path.
 *
 * Walks `n` 32-byte records starting at `src` (a polygon header followed
 * by strip vertices), keeps a 3-vertex sliding window in a small ring
 * buffer, and near-Z clips every triangle against z = -w before
 * perspective-dividing and submitting (recording) the output.
 *
 * visible_mask carries one visibility bit per window vertex: bit 0 is
 * the oldest vertex, bit 2 the newest (see the shift when a vertex is
 * enqueued); each switch case below handles one of the 8 combinations. */
void SceneListSubmit(void* src, int n) {
/* You need at least a header, and 3 vertices to render anything */
if(n < 4) {
return;
}
const float h = GetVideoMode()->height;
PVR_SET(SPAN_SORT_CFG, 0x0);
//Set PVR DMA registers
pvrdmacfg[0] = 1;
pvrdmacfg[1] = 1;
//Set QACR registers
qacr[1] = qacr[0] = 0x11;
volatile uint32_t *d = SQ_BASE_ADDRESS;
int8_t queue_head = 0;
int8_t queue_tail = 0;
/* The most vertices ever in the queue is 5 (as some clipping operations
* produce and additional couple of vertice, but we add one more so the ring buffer doesn't
* trip over itself (e.g. if tail == head we can guarantee it's empty, not full) */
/* NOTE(review): the comment above talks about 5 (+1 spare) entries but
 * the array holds 4 — confirm this capacity is actually sufficient. */
Vertex __attribute__((aligned(32))) queue[4];
const int queue_capacity = sizeof(queue) / sizeof(Vertex);
Vertex* vertex = (Vertex*) src;
uint32_t visible_mask = 0;
#if CLIP_DEBUG
for(int i = 0; i < n; ++i) {
fprintf(stderr, "{%f, %f, %f, %f}, // %x (%x)\n", vertex[i].xyz[0], vertex[i].xyz[1], vertex[i].xyz[2], vertex[i].w, vertex[i].flags, &vertex[i]);
}
fprintf(stderr, "----\n");
#endif
while(n--) {
bool last_vertex = false;
memcpy_vertex(queue + queue_tail, vertex);
++vertex;
switch(queue[queue_tail].flags) {
case GPU_CMD_POLYHDR:
/* Headers pass straight through untouched. */
_glSubmitHeaderOrVertex(d, &queue[queue_tail]);
break;
case GPU_CMD_VERTEX_EOL:
last_vertex = true; // fallthru
case GPU_CMD_VERTEX:
/* Shift the window and set bit 2 if the new vertex is in front
 * of the near plane (z >= -w). */
visible_mask = (visible_mask >> 1) | (queue[queue_tail].xyz[2] >= -queue[queue_tail].w) << 2;
assert(visible_mask < 15);
queue_tail = (queue_tail + 1) % queue_capacity;
default:
break;
}
/* Only process once a full triangle (3 vertices) is queued. */
int counter = (queue_tail - queue_head + queue_capacity) % queue_capacity;
if(counter < 3) {
continue;
}
#if CLIP_DEBUG
fprintf(stderr, "%d\n", visible_mask);
#endif
Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
switch(visible_mask) {
case 0:
/* Fully behind the plane: emit nothing. */
break;
case 7:
/* All visible, push the first vertex and move on */
_glPerspectiveDivideVertex(&queue[queue_head], h);
_glSubmitHeaderOrVertex(d, &queue[queue_head]);
if(last_vertex) {
/* If this was the last vertex in the strip, we need to flush the queue and then
restart it again */
int v1 = (queue_head + 1) % queue_capacity;
int v2 = (queue_head + 2) % queue_capacity;
_glPerspectiveDivideVertex(&queue[v1], h);
_glSubmitHeaderOrVertex(d, &queue[v1]);
_glPerspectiveDivideVertex(&queue[v2], h);
_glSubmitHeaderOrVertex(d, &queue[v2]);
}
break;
case 1:
/* First vertex was visible */
{
Vertex* v0 = &queue[queue_head];
Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
_glClipEdge(v0, v1, &a);
_glClipEdge(v2, v0, &b);
a.flags = GPU_CMD_VERTEX;
/* If v2 was the last in the strip, then b should be. If it wasn't
we'll create a degenerate triangle by adding b twice in a row so that the
strip processing will continue correctly after crossing the plane so it can
cross back*/
b.flags = v2->flags;
_glPerspectiveDivideVertex(v0, h);
_glPerspectiveDivideVertex(&a, h);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, v0);
_glSubmitHeaderOrVertex(d, &a);
_glSubmitHeaderOrVertex(d, &b);
_glSubmitHeaderOrVertex(d, &b);
}
break;
case 2:
/* Second vertex was visible. In self case we need to create a triangle and produce
two new vertices: 1-2, and 2-3. */
{
Vertex* v0 = &queue[queue_head];
const Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
const Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
_glClipEdge(v0, v1, &a);
_glClipEdge(v1, v2, &b);
a.flags = GPU_CMD_VERTEX;
b.flags = v2->flags;
_glPerspectiveDivideVertex(v0, h);
_glPerspectiveDivideVertex(&a, h);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, &a);
_glSubmitHeaderOrVertex(d, v0);
_glSubmitHeaderOrVertex(d, &b);
}
break;
case 3: /* First and second vertex were visible */
{
Vertex* v0 = &queue[queue_head];
Vertex __attribute__((aligned(32))) v1 = queue[(queue_head + 1) % queue_capacity];
Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
_glClipEdge(&v1, v2, &a);
_glClipEdge(v2, v0, &b);
a.flags = v2->flags;
b.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPerspectiveDivideVertex(&v1, h);
_glPerspectiveDivideVertex(&a, h);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, v0);
_glSubmitHeaderOrVertex(d, &v1);
_glSubmitHeaderOrVertex(d, &b);
_glSubmitHeaderOrVertex(d, &v1);
_glSubmitHeaderOrVertex(d, &a);
}
break;
case 4:
/* Third vertex was visible. */
{
Vertex* v0 = &queue[queue_head];
Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity];
_glClipEdge(&v2, v0, &a);
_glClipEdge(v1, &v2, &b);
a.flags = GPU_CMD_VERTEX;
b.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&v2, h);
_glPerspectiveDivideVertex(&a, h);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, &a);
_glSubmitHeaderOrVertex(d, &a);
_glSubmitHeaderOrVertex(d, &b);
_glSubmitHeaderOrVertex(d, &v2);
}
break;
case 5: /* First and third vertex were visible */
{
Vertex* v0 = &queue[queue_head];
Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity];
_glClipEdge(v0, v1, &a);
_glClipEdge(v1, &v2, &b);
a.flags = GPU_CMD_VERTEX;
b.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glPerspectiveDivideVertex(&v2, h);
_glPerspectiveDivideVertex(&a, h);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, v0);
_glSubmitHeaderOrVertex(d, &a);
uint32_t v2_flags = v2.flags;
v2.flags = GPU_CMD_VERTEX;
_glSubmitHeaderOrVertex(d, &v2);
v2.flags = v2_flags;
_glSubmitHeaderOrVertex(d, &b);
_glSubmitHeaderOrVertex(d, &v2);
}
break;
case 6: /* Second and third vertex were visible */
{
Vertex* v0 = &queue[queue_head];
Vertex __attribute__((aligned(32))) v1 = queue[(queue_head + 1) % queue_capacity];
Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity];
_glClipEdge(v0, &v1, &a);
_glClipEdge(&v2, v0, &b);
a.flags = GPU_CMD_VERTEX;
b.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&v1, h);
_glPerspectiveDivideVertex(&v2, h);
_glPerspectiveDivideVertex(&a, h);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, &a);
_glSubmitHeaderOrVertex(d, &v1);
_glSubmitHeaderOrVertex(d, &b);
_glSubmitHeaderOrVertex(d, &v1);
_glSubmitHeaderOrVertex(d, &v2);
}
break;
default:
break;
}
/* End of strip: reset the window; otherwise slide it forward one. */
if(last_vertex) {
visible_mask = queue_head = queue_tail = 0;
} else {
queue_head = (queue_head + 1) % queue_capacity;
}
}
}
/* Convenience (x, y, z, w) tuple used to author test strips. */
struct VertexTmpl {
VertexTmpl(float x, float y, float z, float w):
x(x), y(y), z(z), w(w) {}
float x, y, z, w;
};
std::vector<Vertex> make_vertices(const std::vector<VertexTmpl>& verts) {
std::vector<Vertex> result;
Vertex r;
r.flags = GPU_CMD_POLYHDR;
result.push_back(r);
for(auto& v: verts) {
r.flags = GPU_CMD_VERTEX;
r.xyz[0] = v.x;
r.xyz[1] = v.y;
r.xyz[2] = v.z;
r.uv[0] = 0.0f;
r.uv[1] = 0.0f;
r.w = v.w;
result.push_back(r);
}
result.back().flags = GPU_CMD_VERTEX_EOL;
return result;
}
/* Minimal assert_equal: throws std::runtime_error on mismatch. Uses
 * operator!= so the requirements on T/U match the original exactly. */
template<typename T, typename U>
void check_equal(const T& lhs, const U& rhs) {
    const bool mismatch = (lhs != rhs);
    if(mismatch) {
        throw std::runtime_error("Assertion failed");
    }
}
/* Vertex comparison only checks position and w; flags, UVs and colour
 * are deliberately ignored. */
template<>
void check_equal(const Vertex& lhs, const Vertex& rhs) {
if(lhs.xyz[0] != rhs.xyz[0] ||
lhs.xyz[1] != rhs.xyz[1] ||
lhs.xyz[2] != rhs.xyz[2] ||
lhs.w != rhs.w) {
throw std::runtime_error("Assertion failed");
}
}
/* Single-visible-vertex cases: each submits one triangle and checks the
 * clipped command stream recorded in `sent`. */
bool test_clip_case_001() {
/* The first vertex is visible only */
sent.clear();
auto data = make_vertices({
{0.000000, -2.414213, 3.080808, 5.000000},
{-4.526650, -2.414213, -7.121212, -5.000000},
{4.526650, -2.414213, -7.121212, -5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 5);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
// Because we're sending a single triangle, we end up sending a
// degenerate final vert. But if we were sending more than one triangle
// this would be GPU_CMD_VERTEX twice
check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[4].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[3], sent[4]);
return true;
}
bool test_clip_case_010() {
/* The second vertex is visible only */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, -7.121212, -5.000000},
{0.000000, -2.414213, 3.080808, 5.000000},
{4.526650, -2.414213, -7.121212, -5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 4);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
return true;
}
bool test_clip_case_100() {
/* The third vertex is visible only */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, -7.121212, -5.000000},
{4.526650, -2.414213, -7.121212, -5.000000},
{0.000000, -2.414213, 3.080808, 5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 5);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
// Because we're sending a single triangle, we end up sending a
// degenerate final vert. But if we were sending more than one triangle
// this would be GPU_CMD_VERTEX twice
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[1], sent[2]);
return true;
}
/* Two-visible-vertex cases plus the fully visible case. */
bool test_clip_case_110() {
/* 2nd and 3rd visible */
sent.clear();
auto data = make_vertices({
{0.0, -2.414213, -7.121212, -5.000000},
{-4.526650, -2.414213, 3.080808, 5.000000},
{4.526650, -2.414213, 3.080808, 5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 6);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX);
check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[2], sent[4]);
return true;
}
bool test_clip_case_011() {
/* 1st and 2nd visible */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, 3.080808, 5.000000},
{4.526650, -2.414213, 3.080808, 5.000000},
{0.0, -2.414213, -7.121212, -5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 6);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX);
check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[2], sent[4]);
return true;
}
bool test_clip_case_101() {
/* 1st and 3rd visible */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, 3.080808, 5.000000},
{0.0, -2.414213, -7.121212, -5.000000},
{4.526650, -2.414213, 3.080808, 5.000000},
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 6);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX);
check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[3], sent[5]);
return true;
}
bool test_clip_case_111() {
/* All three vertices visible — no clipping should occur */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, 3.080808, 5.000000},
{0.0, -2.414213, -7.121212, 8.000000},
{4.526650, -2.414213, 3.080808, 5.000000},
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 4);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
return true;
}
/* Longer-strip smoke tests: these only assert that submission completes
 * without tripping the internal asserts (no output checks). */
bool test_start_behind() {
/* Triangle behind the plane, but the strip continues in front */
sent.clear();
auto data = make_vertices({
{-3.021717, -2.414213, -10.155344, -9.935254},
{5.915236, -2.414213, -9.354721, -9.136231},
{-5.915236, -2.414213, -0.264096, -0.063767},
{3.021717, -2.414213, 0.536527, 0.735255},
{-7.361995, -2.414213, 4.681529, 4.871976},
{1.574958, -2.414213, 5.482152, 5.670999},
});
SceneListSubmit(&data[0], data.size());
return true;
}
bool test_longer_strip() {
sent.clear();
auto data = make_vertices({
{-4.384623, -2.414213, -5.699644, -5.488456},
{4.667572, -2.414213, -5.621354, -5.410322},
{-4.667572, -2.414213, 4.319152, 4.510323},
{4.384623, -2.414213, 4.397442, 4.588456},
{-4.809045, -2.414213, 9.328549, 9.509711},
{4.243149, -2.414213, 9.406840, 9.587846},
});
SceneListSubmit(&data[0], data.size());
return true;
}
/* Runs every clip-case regression; each test throws (via check_equal)
 * on failure, so reaching `return 0` means all cases passed. */
int main(int argc, char* argv[]) {
// test_clip_case_000();
test_clip_case_001();
test_clip_case_010();
test_clip_case_100();
test_clip_case_110();
test_clip_case_011();
test_clip_case_101();
test_clip_case_111();
test_start_behind();
test_longer_strip();
return 0;
}

View File

@ -49,7 +49,7 @@ ENDIF()
add_link_options(-L$ENV{KOS_BASE}/lib/dreamcast)
link_libraries(-Wl,--start-group -lstdc++ -lkallisti -lc -lgcc -Wl,--end-group m)
SET(CMAKE_EXECUTABLE_SUFFIX_C ".elf")
SET(CMAKE_EXECUTABLE_SUFFIX ".elf")
SET(CMAKE_EXECUTABLE_SUFFIX_CXX ".elf")
ADD_DEFINITIONS(

View File

@ -1,451 +0,0 @@
/* * Copyright (c) 2011-2017 Luke Benstead https://simulant-engine.appspot.com
*
* This file is part of Simulant.
*
* Simulant is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Simulant is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with Simulant. If not, see <http://www.gnu.org/licenses/>.
*/
#pragma once
#include <vector>
#include <functional>
#include <stdexcept>
#include <iostream>
#include <sstream>
#include <algorithm>
#include <fstream>
#include <memory>
#define assert_equal(expected, actual) _assert_equal((expected), (actual), __FILE__, __LINE__)
#define assert_not_equal(expected, actual) _assert_not_equal((expected), (actual), __FILE__, __LINE__)
#define assert_false(actual) _assert_false((actual), __FILE__, __LINE__)
#define assert_true(actual) _assert_true((actual), __FILE__, __LINE__)
#define assert_close(expected, actual, difference) _assert_close((expected), (actual), (difference), __FILE__, __LINE__)
#define assert_is_null(actual) _assert_is_null((actual), __FILE__, __LINE__)
#define assert_is_not_null(actual) _assert_is_not_null((actual), __FILE__, __LINE__)
#define assert_raises(exception, func) _assert_raises<exception>((func), __FILE__, __LINE__)
#define assert_items_equal(expected, actual) _assert_items_equal((actual), (expected), __FILE__, __LINE__)
#define not_implemented() _not_implemented(__FILE__, __LINE__)
namespace test {
/* Minimal "{0} {1} ..."-style string templating used for assertion
 * messages.  Each variadic format() call substitutes one value per
 * placeholder, threading the placeholder index through Counter. */
class StringFormatter {
public:
StringFormatter(const std::string& templ):
templ_(templ) { }
// Wraps the placeholder index so it can't be confused with a value.
struct Counter {
Counter(uint32_t c): c(c) {}
uint32_t c;
};
/* Base case: substitute {0} with the stringified value. */
template<typename T>
std::string format(T value) {
std::stringstream ss;
ss << value;
return _do_format(0, ss.str());
}
/* Base case with an explicit placeholder index. */
template<typename T>
std::string format(Counter count, T value) {
std::stringstream ss;
ss << value;
return _do_format(count.c, ss.str());
}
/* Recursive case: substitute {0}, then recurse for {1}, {2}, ... */
template<typename T, typename... Args>
std::string format(T value, const Args&... args) {
std::stringstream ss;
ss << value;
return StringFormatter(_do_format(0, ss.str())).format(Counter(1), args...);
}
/* Recursive case continuing from an explicit index. */
template<typename T, typename... Args>
std::string format(Counter count, T value, const Args&... args) {
std::stringstream ss;
ss << value;
return StringFormatter(_do_format(count.c, ss.str())).format(Counter(count.c + 1), args...);
}
/* Replace the first occurrence of "{counter}" in the template. */
std::string _do_format(uint32_t counter, const std::string& value) {
std::stringstream ss; // Can't use to_string on all platforms
ss << counter;
const std::string to_replace = "{" + ss.str() + "}";
std::string output = templ_;
auto replace = [](std::string& str, const std::string& from, const std::string& to) -> bool {
size_t start_pos = str.find(from);
if(start_pos == std::string::npos)
return false;
str.replace(start_pos, from.length(), to);
return true;
};
replace(output, to_replace, value);
return output;
}
private:
std::string templ_;
};
/* Splits a string on '\n' characters. Empty lines (including a
 * trailing newline) produce no entries in the result. */
class StringSplitter {
public:
    StringSplitter(const std::string& str):
        str_(str) {
    }

    /* Returns the non-empty lines of the wrapped string, in order. */
    std::vector<std::string> split() {
        std::vector<std::string> lines;
        std::string current;

        for(char ch: str_) {
            if(ch != '\n') {
                current.push_back(ch);
                continue;
            }
            // Line boundary: flush anything accumulated so far.
            if(!current.empty()) {
                lines.push_back(current);
                current.clear();
            }
        }

        // The final line may not be newline-terminated.
        if(!current.empty()) {
            lines.push_back(current);
        }
        return lines;
    }

private:
    std::string str_;
};
// Internal shorthand for the placeholder formatter; used throughout the
// assertion helpers below to build failure messages.
typedef StringFormatter _Format;
/* Raised by the TestCase assertion helpers when a check fails.
 * Optionally carries the source location of the failing assertion;
 * when unknown, file is empty and line is -1. */
class AssertionError : public std::logic_error {
public:
    /* Failure without a known location. */
    AssertionError(const std::string& what):
        std::logic_error(what),
        file(""),
        line(-1) {}

    /* Failure with a (file, line) location pair. */
    AssertionError(const std::pair<std::string, int> file_and_line, const std::string& what):
        std::logic_error(what),
        file(file_and_line.first),
        line(file_and_line.second) {}

    ~AssertionError() noexcept(true) {}

    std::string file; // Source file of the failed assertion ("" if unknown)
    int line;         // Source line of the failed assertion (-1 if unknown)
};
/* Thrown by the not_implemented() macro to mark a test body that has
 * not been written yet; TestRunner::run() reports it as SKIPPED. */
class NotImplementedError: public std::logic_error {
public:
NotImplementedError(const std::string& file, int line):
std::logic_error(_Format("Not implemented at {0}:{1}").format(file, line)) {}
};
/* Thrown by TestCase::skip_if() to abort a test; TestRunner::run()
 * reports it as SKIPPED rather than failed. */
class SkippedTestError: public std::logic_error {
public:
    SkippedTestError(const std::string& reason):
        std::logic_error(reason) {}
};
/* Base class for all tests.
 *
 * Subclasses add void test_*() methods; the runner calls set_up()
 * before and tear_down() after each one. The _assert_* helpers are
 * normally reached through the assert_* macros, which supply the
 * trailing __FILE__/__LINE__ arguments. Every helper throws
 * test::AssertionError on failure. */
class TestCase {
public:
    virtual ~TestCase() {}

    /* Per-test hooks; override in subclasses as required. */
    virtual void set_up() {}
    virtual void tear_down() {}

    /* Aborts the current test as "skipped" when `flag` is true. */
    void skip_if(const bool& flag, const std::string& reason) {
        if(flag) { throw test::SkippedTestError(reason); }
    }

    /* Fails unless expected == actual (checked with operator!=). */
    template<typename T, typename U>
    void _assert_equal(T expected, U actual, std::string file, int line) {
        if(expected != actual) {
            throw test::AssertionError(
                std::make_pair(file, line),
                test::_Format("{0} does not match {1}").format(actual, expected)
            );
        }
    }

    /* Fails when lhs equals rhs (rhs is cast to T before comparing). */
    template<typename T, typename U>
    void _assert_not_equal(T lhs, U rhs, std::string file, int line) {
        if(lhs == (T) rhs) {
            throw test::AssertionError(
                std::make_pair(file, line),
                test::_Format("{0} should not match {1}").format(lhs, rhs)
            );
        }
    }

    /* Fails unless `actual` converts to true. */
    template<typename T>
    void _assert_true(T actual, std::string file, int line) {
        if(!bool(actual)) {
            throw test::AssertionError(
                std::make_pair(file, line),
                test::_Format("{0} is not true").format(bool(actual) ? "true" : "false")
            );
        }
    }

    /* Fails unless `actual` converts to false. */
    template<typename T>
    void _assert_false(T actual, std::string file, int line) {
        if(bool(actual)) {
            throw test::AssertionError(
                std::make_pair(file, line),
                test::_Format("{0} is not false").format(bool(actual) ? "true" : "false")
            );
        }
    }

    /* Fails unless actual lies within expected +/- difference. */
    template<typename T, typename U, typename V>
    void _assert_close(T expected, U actual, V difference, std::string file, int line) {
        if(actual < expected - difference ||
           actual > expected + difference) {
            throw test::AssertionError(
                std::make_pair(file, line),
                test::_Format("{0} is not close enough to {1}").format(actual, expected)
            );
        }
    }

    /* Fails when the pointer is non-null. */
    template<typename T>
    void _assert_is_null(T* thing, std::string file, int line) {
        if(thing != nullptr) {
            throw test::AssertionError(std::make_pair(file, line), "Pointer was not NULL");
        }
    }

    /* Fails when the pointer is null. */
    template<typename T>
    void _assert_is_not_null(T* thing, std::string file, int line) {
        if(thing == nullptr) {
            throw test::AssertionError(std::make_pair(file, line), "Pointer was unexpectedly NULL");
        }
    }

    /* Fails unless calling func() throws an exception of type T.
     * Exceptions of other types propagate to the caller unchanged. */
    template<typename T, typename Func>
    void _assert_raises(Func func, std::string file, int line) {
        try {
            func();
            throw test::AssertionError(
                std::make_pair(file, line),
                test::_Format("Expected exception ({0}) was not thrown").format(typeid(T).name())
            );
        } catch(T& e) {}
    }

    /* Fails unless both containers have the same size and every item
     * of lhs occurs somewhere in rhs. NOTE: the assert_items_equal
     * macro passes (actual, expected) into (lhs, rhs). */
    template<typename T, typename U>
    void _assert_items_equal(const T& lhs, const U& rhs, std::string file, int line) {
        auto where = std::make_pair(file, line);

        if(lhs.size() != rhs.size()) {
            throw test::AssertionError(where, "Containers are not the same length");
        }

        for(auto item: lhs) {
            if(std::find(rhs.begin(), rhs.end(), item) == rhs.end()) {
                throw test::AssertionError(where, test::_Format("Container does not contain {0}").format(item));
            }
        }
    }

    /* Marks the current test as not implemented (reported as skipped). */
    void _not_implemented(std::string file, int line) {
        throw test::NotImplementedError(file, line);
    }
};
/* Collects registered test cases, executes them with console output,
 * and optionally writes a JUnit-style XML report.
 *
 * Fixes in this revision:
 *  - fclose() was called unconditionally after the fopen() attempt,
 *    so a failed fopen() led to fclose(NULL), which is undefined
 *    behaviour. fclose() is now only reached when the handle is valid.
 *  - class-name extraction no longer performs iterator arithmetic with
 *    std::string::npos when a test name contains no ':'. */
class TestRunner {
public:
    /* Registers all test methods of test-case class T.
     * `methods` holds member-function pointers and `names` their
     * human-readable "Class::method" labels; both are expected in the
     * same order. One shared T instance is created and kept alive for
     * the lifetime of the runner; each stored test wraps its method
     * with set_up()/tear_down() (tear_down runs even on throw). */
    template<typename T, typename U>
    void register_case(std::vector<U> methods, std::vector<std::string> names) {
        std::shared_ptr<TestCase> instance = std::make_shared<T>();
        instances_.push_back(instance); //Hold on to it

        for(std::string name: names) {
            names_.push_back(name);
        }

        for(U& method: methods) {
            std::function<void()> func = std::bind(method, dynamic_cast<T*>(instance.get()));
            tests_.push_back([=]() {
                instance->set_up();
                try {
                    func();
                } catch(...) {
                    // Guarantee tear_down() runs even when the test throws.
                    instance->tear_down();
                    throw;
                }
                instance->tear_down();
            });
        }
    }

    /* Runs every registered test whose name starts with `test_case`
     * (all tests when empty). Prints one status line per test and, when
     * `junit_output` is a non-empty path, writes a JUnit-style XML
     * report there. Returns the number of failed + crashed tests. */
    int32_t run(const std::string& test_case, const std::string& junit_output="") {
        int failed = 0;
        int skipped = 0;
        int ran = 0;
        int crashed = 0;

        // Filter by name prefix when a test case was requested.
        auto new_tests = tests_;
        auto new_names = names_;
        if(!test_case.empty()) {
            new_tests.clear();
            new_names.clear();
            for(uint32_t i = 0; i < names_.size(); ++i) {
                if(names_[i].find(test_case) == 0) {
                    new_tests.push_back(tests_[i]);
                    new_names.push_back(names_[i]);
                }
            }
        }

        std::cout << std::endl << "Running " << new_tests.size() << " tests" << std::endl << std::endl;

        std::vector<std::string> junit_lines;
        junit_lines.push_back("<testsuites>\n");

        std::string klass = "";
        for(std::function<void ()> test: new_tests) {
            std::string name = new_names[ran];

            // Test names look like "Class::method"; group by class for
            // the XML report. Guard against names without ':' — the
            // previous code added npos to an iterator (UB) in that case.
            auto colon = name.find_first_of(":");
            std::string this_klass = (colon == std::string::npos) ?
                name : std::string(name.begin(), name.begin() + colon);
            bool close_klass = ran == (int) new_tests.size() - 1;

            if(this_klass != klass) {
                if(!klass.empty()) {
                    junit_lines.push_back(" </testsuite>\n");
                }
                klass = this_klass;
                junit_lines.push_back(" <testsuite name=\"" + this_klass + "\">\n");
            }

            try {
                junit_lines.push_back(" <testcase name=\"" + new_names[ran] + "\">\n");

                // Pad the name so the OK/FAILED column lines up.
                std::string output = " " + new_names[ran];
                for(int i = output.length(); i < 76; ++i) {
                    output += " ";
                }
                std::cout << output;

                test();
                std::cout << "\033[32m" << " OK " << "\033[0m" << std::endl;
                junit_lines.push_back(" </testcase>\n");
            } catch(test::NotImplementedError& e) {
                std::cout << "\033[34m" << " SKIPPED" << "\033[0m" << std::endl;
                ++skipped;
                junit_lines.push_back(" </testcase>\n");
            } catch(test::SkippedTestError& e) {
                std::cout << "\033[34m" << " SKIPPED" << "\033[0m" << std::endl;
                ++skipped;
                junit_lines.push_back(" </testcase>\n");
            } catch(test::AssertionError& e) {
                std::cout << "\033[33m" << " FAILED " << "\033[0m" << std::endl;
                std::cout << " " << e.what() << std::endl;

                // Echo the offending source line when the file is readable.
                if(!e.file.empty()) {
                    std::cout << " " << e.file << ":" << e.line << std::endl;

                    std::ifstream ifs(e.file);
                    if(ifs.good()) {
                        std::string buffer;
                        std::vector<std::string> lines;
                        while(std::getline(ifs, buffer)) {
                            lines.push_back(buffer);
                        }

                        int line_count = lines.size();
                        if(line_count && e.line <= line_count) {
                            std::cout << lines.at(e.line - 1) << std::endl << std::endl;
                        }
                    }
                }
                ++failed;

                // NOTE(review): e.what() is inserted verbatim; XML special
                // characters in the message are not escaped — confirm
                // downstream consumers tolerate this.
                junit_lines.push_back(" <failure message=\"" + std::string(e.what()) + "\"/>\n");
                junit_lines.push_back(" </testcase>\n");
            } catch(std::exception& e) {
                std::cout << "\033[31m" << " EXCEPT " << std::endl;
                std::cout << " " << e.what() << "\033[0m" << std::endl;
                ++crashed;
                junit_lines.push_back(" <failure message=\"" + std::string(e.what()) + "\"/>\n");
                junit_lines.push_back(" </testcase>\n");
            }
            std::cout << "\033[0m";

            ++ran;

            if(close_klass) {
                junit_lines.push_back(" </testsuite>\n");
            }
        }

        junit_lines.push_back("</testsuites>\n");

        if(!junit_output.empty()) {
            FILE* f = fopen(junit_output.c_str(), "wt");
            if(f) {
                for(auto& line: junit_lines) {
                    fwrite(line.c_str(), sizeof(char), line.length(), f);
                }
                /* Only close a successfully opened handle; the old code
                 * reached fclose(NULL) when fopen() failed. */
                fclose(f);
            }
        }

        std::cout << "-----------------------" << std::endl;
        if(!failed && !crashed && !skipped) {
            std::cout << "All tests passed" << std::endl << std::endl;
        } else {
            if(skipped) {
                std::cout << skipped << " tests skipped";
            }
            if(failed) {
                if(skipped) {
                    std::cout << ", ";
                }
                std::cout << failed << " tests failed";
            }
            if(crashed) {
                if(failed) {
                    std::cout << ", ";
                }
                std::cout << crashed << " tests crashed";
            }
            std::cout << std::endl << std::endl;
        }

        return failed + crashed;
    }

private:
    std::vector<std::shared_ptr<TestCase>> instances_;   // Keeps test-case objects alive
    std::vector<std::function<void()> > tests_;          // Wrapped test bodies
    std::vector<std::string> names_;                     // "Class::method" labels, parallel to tests_
};
} // test

View File

@ -1,212 +0,0 @@
#!/usr/bin/env python3
import argparse
import re
import sys
parser = argparse.ArgumentParser(description="Generate C++ unit tests")
parser.add_argument("--output", type=str, nargs=1, help="The output source file for the generated test main()", required=True)
parser.add_argument("test_files", type=str, nargs="+", help="The list of C++ files containing your tests")
parser.add_argument("--verbose", help="Verbose logging", action="store_true", default=False)
CLASS_REGEX = r"\s*class\s+(\w+)\s*([\:|,]\s*(?:public|private|protected)\s+[\w|::]+\s*)*"
TEST_FUNC_REGEX = r"void\s+(?P<func_name>test_\S[^\(]+)\(\s*(void)?\s*\)"
INCLUDE_TEMPLATE = "#include \"%(file_path)s\""
REGISTER_TEMPLATE = """
runner->register_case<%(class_name)s>(
std::vector<void (%(class_name)s::*)()>({%(members)s}),
{%(names)s}
);"""
MAIN_TEMPLATE = """
#include <functional>
#include <memory>
#include <map>
#include "tools/test.h"
%(includes)s
std::map<std::string, std::string> parse_args(int argc, char* argv[]) {
std::map<std::string, std::string> ret;
for(int i = 1; i < argc; ++i) {
std::string arg = argv[i];
auto eq = arg.find('=');
if(eq != std::string::npos && arg[0] == '-' && arg[1] == '-') {
auto key = std::string(arg.begin(), arg.begin() + eq);
auto value = std::string(arg.begin() + eq + 1, arg.end());
ret[key] = value;
} else if(arg[0] == '-' && arg[1] == '-') {
auto key = arg;
if(i < (argc - 1)) {
auto value = argv[++i];
ret[key] = value;
} else {
ret[key] = "";
}
} else {
ret[arg] = ""; // Positional, not key=value
}
}
return ret;
}
int main(int argc, char* argv[]) {
auto runner = std::make_shared<test::TestRunner>();
auto args = parse_args(argc, argv);
std::string junit_xml;
auto junit_xml_it = args.find("--junit-xml");
if(junit_xml_it != args.end()) {
junit_xml = junit_xml_it->second;
std::cout << " Outputting junit XML to: " << junit_xml << std::endl;
args.erase(junit_xml_it);
}
std::string test_case;
if(args.size()) {
test_case = args.begin()->first;
}
%(registrations)s
return runner->run(test_case, junit_xml);
}
"""
VERBOSE = False
def log_verbose(message):
    """Print `message` only when --verbose was passed (module flag VERBOSE)."""
    # Indentation restored: the diff view had flattened this function.
    if VERBOSE:
        print(message)
def find_tests(files):
    """Scan C++ source files for TestCase subclasses and their test methods.

    Returns a list of (path, class_name, parent_names, test_func_names)
    tuples for every class that derives — directly or transitively —
    from TestCase or SimulantTestCase.

    NOTE(review): indentation reconstructed from a flattened diff view;
    the nesting below follows the control flow (e.g. classes without a
    base-class list are not recorded) — verify against the original file.
    """
    subclasses = []

    # First pass, find all class definitions
    for path in files:
        with open(path, "rt") as f:
            # Collapse the file onto one line so the regexes can match
            # declarations that span multiple source lines.
            source_file_data = f.read().replace("\r\n", "").replace("\n", "")

        while True:
            match = re.search(CLASS_REGEX, source_file_data)
            if not match:
                break

            class_name = match.group().split(":")[0].replace("class", "").strip()

            try:
                parents = match.group().split(":", 1)[1]
            except IndexError:
                # No base-class list: cannot be a TestCase subclass.
                pass
            else:
                parents = [x.strip() for x in parents.split(",")]
                parents = [
                    x.replace("public", "").replace("private", "").replace("protected", "").strip()
                    for x in parents
                ]

                subclasses.append((path, class_name, parents, []))
                log_verbose("Found: %s" % str(subclasses[-1]))

            start = match.end()

            # Find the next opening brace
            while source_file_data[start] in (' ', '\t'):
                start += 1
            start -= 1

            end = start

            if source_file_data[start + 1] == '{':
                # Collect the class body by counting matched braces.
                class_data = []
                brace_counter = 1
                for i in range(start + 2, len(source_file_data)):
                    class_data.append(source_file_data[i])
                    if class_data[-1] == '{': brace_counter += 1
                    if class_data[-1] == '}': brace_counter -= 1
                    if not brace_counter:
                        end = i
                        break

                class_data = "".join(class_data)

                # Pull out every "void test_*()" method in the body and
                # attach it to the most recently recorded class.
                while True:
                    match = re.search(TEST_FUNC_REGEX, class_data)
                    if not match:
                        break
                    subclasses[-1][-1].append(match.group('func_name'))
                    class_data = class_data[match.end():]

            # Resume scanning after this class.
            source_file_data = source_file_data[end:]

    # Now, simplify the list by finding all potential superclasses, and then keeping any classes
    # that subclass them.
    test_case_subclasses = []

    i = 0
    while i < len(subclasses):
        subclass_names = [x.rsplit("::")[-1] for x in subclasses[i][2]]

        # If this subclasses TestCase, or it subclasses any of the already found testcase subclasses
        # then add it to the list
        if "TestCase" in subclass_names or "SimulantTestCase" in subclass_names or any(x[1] in subclasses[i][2] for x in test_case_subclasses):
            if subclasses[i] not in test_case_subclasses:
                test_case_subclasses.append(subclasses[i])
                i = 0  # Go back to the start, as we may have just found another parent class
                continue
        i += 1

    log_verbose("\n".join([str(x) for x in test_case_subclasses]))

    return test_case_subclasses
def main():
    """Generate the C++ test-runner main() source from the discovered tests.

    Parses the command line, scans the given test files, renders the
    include/registration templates and writes the result to --output.
    Returns 0 (process exit status).
    """
    global VERBOSE

    args = parser.parse_args()
    VERBOSE = args.verbose

    testcases = find_tests(args.test_files)

    # One #include per distinct file that contained at least one test case.
    includes = "\n".join([INCLUDE_TEMPLATE % {'file_path': x} for x in set([y[0] for y in testcases])])

    registrations = []
    for path, class_name, superclasses, funcs in testcases:
        BIND_TEMPLATE = "&%(class_name)s::%(func)s"

        members = ", ".join([BIND_TEMPLATE % {'class_name': class_name, 'func': x} for x in funcs])
        names = ", ".join(['"%s::%s"' % (class_name, x) for x in funcs])

        registrations.append(REGISTER_TEMPLATE % {'class_name': class_name, 'members': members, 'names': names})

    registrations = "\n".join(registrations)

    final = MAIN_TEMPLATE % {
        'registrations': registrations,
        'includes': includes
    }

    # Use a context manager so the output is flushed and closed
    # deterministically (the original left the file object unclosed).
    with open(args.output[0], "w") as out:
        out.write(final)

    return 0
# Script entry point: exit with main()'s return code.
if __name__ == '__main__':
    sys.exit(main())