diff --git a/GL/alloc/alloc.c b/GL/alloc/alloc.c
index 6b787d6..6760ff4 100644
--- a/GL/alloc/alloc.c
+++ b/GL/alloc/alloc.c
@@ -6,7 +6,7 @@
 
 #include "alloc.h"
 
-/* This allocator is designed so that all allocations larger
+/* This allocator is designed so that ideally all allocations larger
  * than 2k, fall on a 2k boundary. Smaller allocations will
  * never cross a 2k boundary.
  *
@@ -15,9 +15,9 @@
  * blocks anyway as they have to be 2k aligned (so you'd need to
  * store them in reverse or something)
  *
- * Defragmenting the pool will move allocations less than 2k
- * first, and then shift any full 2k blocks to the start of the
- * address space.
+ * Defragmenting the pool repeatedly walks the allocation list,
+ * moving each allocation to the lowest available address, until
+ * either nothing moves or the caller's iteration limit is reached.
  *
  * The maximum pool size is 8M, made up of:
  *
@@ -41,10 +41,7 @@
  *
  * FIXME:
  *
- * - Allocations < 2048 can still cross boundaries
  * - Only operates on one pool (ignores what you pass)
- * - If there are no 2048 aligned blocks, we should fall-back to unaligned
- * - Defrag not implemented!
  */
 
 #include
 
@@ -90,7 +87,6 @@ typedef struct {
     size_t pool_size;       // Size of the memory pool
     uint8_t* base_address;  // First 2k aligned address in the pool
     size_t block_count;     // Number of 2k blocks in the pool
-    bool defrag_in_progress;
 
     /* It's frustrating that we need to do this dynamically
      * but we need to know the size allocated when we free()...
@@ -104,7 +100,7 @@ typedef struct {
 
 
 static PoolHeader pool_header = {
-    {0}, NULL, 0, NULL, 0, false, NULL
+    {0}, NULL, 0, NULL, 0, NULL
 };
 
 void* alloc_base_address(void* pool) {
@@ -147,8 +143,6 @@ void* alloc_next_available_ex(void* pool, size_t required_size, size_t* start_su
     uint32_t required_subblocks = (required_size / 256);
     if(required_size % 256) required_subblocks += 1;
 
-    // uint8_t* end = pool_header.block_usage + pool_header.block_count;
-
     /* Anything gte to 2048 must be aligned to a 2048 boundary */
     bool requires_alignment = required_size >= 2048;
 
@@ -160,7 +154,6 @@ void* alloc_next_available_ex(void* pool, size_t required_size, size_t* start_su
      * but it's not aligned, or it's straddling a 2k boundary, then we store
      * it here and if we reach the end of the search and find nothing better
      * we use this instead */
-
     uint8_t* poor_option = NULL;
     size_t poor_start_subblock = 0;
 
@@ -180,7 +173,16 @@ void* alloc_next_available_ex(void* pool, size_t required_size, size_t* start_su
         /* Now let's see how many consecutive blocks we can find */
         for(int i = 0; i < 8; ++i) {
             if((t & 0x80) == 0) {
-                if(requires_alignment && found_subblocks == 0 && i != 0) {
-                    // Ignore this subblock, because we want the first subblock to be aligned
-                    // at a 2048 boundary and this one isn't (i != 0)
+                bool block_overflow = (
+                    required_size < 2048 && found_subblocks > 0 && i == 0
+                );
+
+                bool reset_subblocks = (
+                    (requires_alignment && found_subblocks == 0 && i != 0) ||
+                    block_overflow
+                );
+
+                if(reset_subblocks) {
+                    // Restart the run: this subblock either breaks required
+                    // 2048 alignment (i != 0) or would straddle a 2k boundary
                     found_subblocks = 0;
@@ -188,6 +190,12 @@ void* alloc_next_available_ex(void* pool, size_t required_size, size_t* start_su
                 found_subblocks++;
             }
 
+            /* If we reset the run due to an overflow, this free
+             * subblock still needs to be counted */
+            if(block_overflow) {
+                found_subblocks++;
+            }
+
             found_poor_subblocks++;
 
             if(found_subblocks >= required_subblocks) {
@@ -234,7 +242,7 @@ int alloc_init(void* pool, size_t size) {
 
     uint8_t* p = (uint8_t*) pool;
 
-    memset(pool_header.block_usage, 0, sizeof(pool_header.block_usage));
+    memset(pool_header.block_usage, 0, BLOCK_COUNT);
     pool_header.pool = pool;
     pool_header.pool_size = size;
 
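
To make the bitmask bookkeeping concrete: every 2k block is described by one byte of block_usage, one bit per 256-byte subblock, with bit 7 the first subblock in the block. A standalone sketch of the rounding and counting (helper name is illustrative, not part of the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* A request is rounded up to whole 256-byte subblocks,
     * mirroring the calculation in alloc_next_available_ex. */
    static uint32_t subblocks_needed(uint32_t required_size) {
        return (required_size + 255) / 256;
    }

    int main() {
        printf("%u\n", (unsigned) subblocks_needed(300));   /* 2 subblocks (512 bytes) */
        printf("%u\n", (unsigned) subblocks_needed(2048));  /* 8 subblocks = one full block */

        /* A block with its first three subblocks in use: */
        uint8_t usage = 0xE0;  /* binary 11100000 */
        printf("free subblocks: %d\n", 8 - __builtin_popcount(usage));
        return 0;
    }
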
@@ -253,6 +261,10 @@ int alloc_init(void* pool, size_t size) {
 void alloc_shutdown(void* pool) {
     (void) pool;
 
+    if(!pool_header.pool) {
+        return;
+    }
+
     struct AllocEntry* it = pool_header.allocations;
     while(it) {
         struct AllocEntry* next = it->next;
@@ -261,6 +273,7 @@ void alloc_shutdown(void* pool) {
     }
 
     memset(&pool_header, 0, sizeof(pool_header));
+    pool_header.pool = NULL;
 }
 
 static inline uint32_t size_to_subblock_count(size_t size) {
@@ -285,10 +298,6 @@ void* alloc_malloc(void* pool, size_t size) {
     size_t start_subblock, required_subblocks;
     void* ret = alloc_next_available_ex(pool, size, &start_subblock, &required_subblocks);
 
-    if(size >= 2048) {
-        assert(((uintptr_t) ret) % 2048 == 0);
-    }
-
     if(ret) {
         size_t block;
         uint8_t offset;
@@ -367,6 +376,45 @@ void* alloc_malloc(void* pool, size_t size) {
     return ret;
 }
 
+static void alloc_release_blocks(struct AllocEntry* it) {
+    size_t used_subblocks = size_to_subblock_count(it->size);
+    size_t subblock = subblock_from_pointer(it->pointer);
+    size_t block;
+    uint8_t offset;
+    block_and_offset_from_subblock(subblock, &block, &offset);
+
+    uint8_t mask = 0;
+
+    DBG_MSG("Free: size: %d, us: %d, sb: %d, off: %d\n", it->size, used_subblocks, block, offset);
+
+    /* Wipe out any leading subblocks */
+    int c = (used_subblocks < 8) ? used_subblocks : 8;
+    for(int i = 0; i < c; ++i) {
+        mask |= (1 << (7 - (offset + i)));
+        used_subblocks--;
+    }
+
+    if(mask) {
+        pool_header.block_usage[block++] &= ~mask;
+    }
+
+    /* Clear any full blocks in the middle of the allocation */
+    while(used_subblocks > 8) {
+        pool_header.block_usage[block++] = 0;
+        used_subblocks -= 8;
+    }
+
+    /* Wipe out any trailing subblocks */
+    mask = 0;
+    for(size_t i = 0; i < used_subblocks; ++i) {
+        mask |= (1 << (7 - i));
+    }
+
+    if(mask) {
+        pool_header.block_usage[block++] &= ~mask;
+    }
+}
+
 void alloc_free(void* pool, void* p) {
     (void) pool;
 
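
The mask arithmetic in alloc_release_blocks is easiest to follow with a worked example. A standalone sketch (assumed values, same bit layout as the patch) that frees an aligned 2560-byte allocation, i.e. ten subblocks spanning two usage bytes:

    #include <stdint.h>
    #include <stdio.h>

    int main() {
        uint8_t usage[2] = {0xFF, 0xC0};  /* 10 subblocks marked in-use */
        unsigned used = 10, offset = 0, block = 0;

        /* Leading: up to 8 subblocks starting at the offset */
        uint8_t mask = 0;
        unsigned c = (used < 8) ? used : 8;
        for(unsigned i = 0; i < c; ++i) {
            mask |= (uint8_t)(1 << (7 - (offset + i)));
            used--;
        }
        usage[block++] &= (uint8_t)~mask;  /* 0xFF &= ~0xFF -> 0x00 */

        /* Trailing: whatever is left in the next block */
        mask = 0;
        for(unsigned i = 0; i < used; ++i) {
            mask |= (uint8_t)(1 << (7 - i));
        }
        usage[block] &= (uint8_t)~mask;    /* 0xC0 &= ~0xC0 -> 0x00 */

        printf("%02X %02X\n", (unsigned) usage[0], (unsigned) usage[1]);  /* 00 00 */
        return 0;
    }

The middle loop of the real function (clearing whole bytes while more than eight subblocks remain) never fires here because only two subblocks survive the leading pass.
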
@@ -374,42 +422,7 @@ void alloc_free(void* pool, void* p) {
     struct AllocEntry* it = pool_header.allocations;
     struct AllocEntry* last = NULL;
     while(it) {
         if(it->pointer == p) {
-            size_t used_subblocks = size_to_subblock_count(it->size);
-            size_t subblock = subblock_from_pointer(p);
-            size_t block;
-            uint8_t offset;
-            block_and_offset_from_subblock(subblock, &block, &offset);
-
-            uint8_t mask = 0;
-
-            DBG_MSG("Free: size: %d, us: %d, sb: %d, off: %d\n", it->size, used_subblocks, block, offset);
-
-            /* Wipe out any leading subblocks */
-            int c = (used_subblocks < 8) ? used_subblocks : 8;
-            for(int i = 0; i < c; ++i) {
-                mask |= (1 << (7 - (offset + i)));
-                used_subblocks--;
-            }
-
-            if(mask) {
-                pool_header.block_usage[block++] &= ~mask;
-            }
-
-            /* Clear any full blocks in the middle of the allocation */
-            while(used_subblocks > 8) {
-                pool_header.block_usage[block++] = 0;
-                used_subblocks -= 8;
-            }
-
-            /* Wipe out any trailing subblocks */
-            mask = 0;
-            for(size_t i = 0; i < used_subblocks; ++i) {
-                mask |= (1 << (7 - i));
-            }
-
-            if(mask) {
-                pool_header.block_usage[block++] &= ~mask;
-            }
+            alloc_release_blocks(it);
 
             if(last) {
                 last->next = it->next;
@@ -430,24 +443,40 @@ void alloc_free(void* pool, void* p) {
     DBG_MSG("Free done\n");
 }
 
-void alloc_defrag_start(void* pool) {
-    (void) pool;
-    pool_header.defrag_in_progress = true;
-}
+void alloc_run_defrag(void* pool, defrag_address_move callback, int max_iterations, void* user_data) {
 
-void* alloc_defrag_address(void* pool, void* p) {
-    (void) pool;
-    return p;
-}
+    for(int i = 0; i < max_iterations; ++i) {
+        bool move_occurred = false;
 
-void alloc_defrag_commit(void* pool) {
-    (void) pool;
-    pool_header.defrag_in_progress = false;
-}
+        struct AllocEntry* it = pool_header.allocations;
 
-bool alloc_defrag_in_progress(void* pool) {
-    (void) pool;
-    return pool_header.defrag_in_progress;
+        if(!it) {
+            return;
+        }
+
+        while(it) {
+            void* potential_dest = alloc_next_available(pool, it->size);
+            if(potential_dest < it->pointer) {
+                potential_dest = alloc_malloc(pool, it->size);
+                memcpy(potential_dest, it->pointer, it->size);
+
+                /* Mark this block as now free, but don't fiddle with the
+                 * allocation list */
+                alloc_release_blocks(it);
+
+                callback(it->pointer, potential_dest, user_data);
+
+                it->pointer = potential_dest;
+                move_occurred = true;
+            }
+
+            it = it->next;
+        }
+
+        if(!move_occurred) {
+            return;
+        }
+    }
 }
 
 static inline uint8_t count_ones(uint8_t byte) {
@@ -459,6 +488,8 @@
 }
 
 size_t alloc_count_free(void* pool) {
+    (void) pool;
+
     uint8_t* it = pool_header.block_usage;
     uint8_t* end = it + pool_header.block_count;
 
diff --git a/GL/alloc/alloc.h b/GL/alloc/alloc.h
index d69bb9f..49400f9 100644
--- a/GL/alloc/alloc.h
+++ b/GL/alloc/alloc.h
@@ -14,10 +14,8 @@ void alloc_shutdown(void* pool);
 void *alloc_malloc(void* pool, size_t size);
 void alloc_free(void* pool, void* p);
 
-void alloc_defrag_start(void* pool);
-void* alloc_defrag_address(void* pool, void* p);
-void alloc_defrag_commit(void* pool);
-bool alloc_defrag_in_progress(void* pool);
+typedef void (defrag_address_move)(void* src, void* dst, void* user_data);
+void alloc_run_defrag(void* pool, defrag_address_move callback, int max_iterations, void* user_data);
 
 size_t alloc_count_free(void* pool);
 size_t alloc_count_continuous(void* pool);
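
The replacement API is callback-driven: the caller supplies a function that is invoked once per moved allocation, so it can retarget any pointers it still holds. A minimal usage sketch against the header above (the pool setup is illustrative):

    #include <stdio.h>
    #include "GL/alloc/alloc.h"

    static void on_move(void* from, void* to, void* user_data) {
        (void) user_data;
        /* Anything still pointing at `from` must be retargeted to `to` */
        printf("moved %p -> %p\n", from, to);
    }

    int main() {
        static unsigned char pool[16 * 2048] __attribute__((aligned(2048)));

        alloc_init(pool, sizeof(pool));
        void* a = alloc_malloc(pool, 256);
        void* b = alloc_malloc(pool, 256);
        alloc_free(pool, a);  /* leave a hole before b */

        /* Walk the allocation list up to 5 times, or until nothing moves */
        alloc_run_defrag(pool, on_move, 5, NULL);

        alloc_shutdown(pool);
        (void) b;
        return 0;
    }

Passing user_data through to the callback is what lets glDefragmentTextureMemory_KOS below stay a free function while tests can route the callback back into a fixture object.
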
diff --git a/GL/texture.c b/GL/texture.c
index 5fcf473..4871e12 100644
--- a/GL/texture.c
+++ b/GL/texture.c
@@ -78,8 +78,8 @@ static const unsigned short MortonTable256[256] =
 /* Given a 0-based texel location, and an image width/height. Return the
  * new 0-based texel location */
 GL_FORCE_INLINE uint32_t twid_location(uint32_t i, uint32_t w, uint32_t h) {
-    uint16_t y = i % w;
-    uint16_t x = i / w;
+    uint16_t y = i / w;
+    uint16_t x = i % w;
 
     return MortonTable256[y >> 8] << 17 |
            MortonTable256[x >> 8] << 16 |
@@ -1611,14 +1611,21 @@ void APIENTRY glTexImage2D(GLenum target, GLint level, GLint internalFormat,
 
         for(uint32_t i = 0; i < (width * height); ++i) {
             uint32_t newLocation = twid_location(i, width, height);
+            assert(newLocation < (width * height));
+            assert((newLocation / 2) < destBytes);
+            assert((i / 2) < srcBytes);
+
-            // This is the src/dest byte, but we need to figure
-            // out which half based on the odd/even of i
+            // This is the src/dest byte; which nibble to use depends on
+            // the odd/even of i (for src) and of newLocation (for dst)
             src = &((uint8_t*) data)[i / 2];
             dst = &conversionBuffer[newLocation / 2];
-            if(i % 2 == 0) {
-                *dst = (*dst & 0xF) | (*src & 0xF0);
+
+            uint8_t src_value = (i % 2) == 0 ? (*src >> 4) : (*src & 0xF);
+
+            if(newLocation % 2 == 0) {
+                *dst = (*dst & 0xF) | (src_value << 4);
             } else {
-                *dst = (*dst & 0xF0) | (*src & 0xF);
+                *dst = (*dst & 0xF0) | (src_value & 0xF);
             }
         }
     } else {
@@ -2044,21 +2051,20 @@ GLuint _glFreeContiguousTextureMemory() {
     return alloc_count_continuous(ALLOC_BASE);
 }
 
-GLAPI GLvoid APIENTRY glDefragmentTextureMemory_KOS(void) {
-    alloc_defrag_start(ALLOC_BASE);
-
-    GLuint id;
-
-    /* Replace all texture pointers */
-    for(id = 0; id < MAX_TEXTURE_COUNT; id++){
+static void update_data_pointer(void* src, void* dst, void* user_data) {
+    for(size_t id = 0; id < MAX_TEXTURE_COUNT; id++){
         TextureObject* txr = (TextureObject*) named_array_get(&TEXTURE_OBJECTS, id);
-        if(txr){
+        if(txr && txr->data == src) {
+            fprintf(stderr, "Defrag moved %p -> %p\n", src, dst);
             gl_assert(txr->index == id);
-            txr->data = alloc_defrag_address(ALLOC_BASE, txr->data);
+            txr->data = dst;
+            return;
         }
     }
+}
 
-    alloc_defrag_commit(ALLOC_BASE);
+GLAPI GLvoid APIENTRY glDefragmentTextureMemory_KOS(void) {
+    alloc_run_defrag(ALLOC_BASE, update_data_pointer, 5, NULL);
 }
 
 GLAPI void APIENTRY glGetTexImage(GLenum tex, GLint lod, GLenum format, GLenum type, GLvoid* img) {
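
The twid_location fix swaps x and y: for a linear index i into a w-wide image, the row is i / w and the column is i % w. Twiddled (Morton) order then interleaves the bits of x and y, which is what the MortonTable256 lookups compute a byte at a time. A bit-by-bit sketch of the same mapping (illustrative only; assumes power-of-two dimensions as the hardware expects):

    #include <stdint.h>
    #include <stdio.h>

    /* Interleave the bits of x and y, x into the even positions and
     * y into the odd ones, matching twid_location's table lookups. */
    static uint32_t morton(uint16_t x, uint16_t y) {
        uint32_t out = 0;
        for(int b = 0; b < 16; ++b) {
            out |= (uint32_t)((x >> b) & 1) << (2 * b);      /* x -> even bits */
            out |= (uint32_t)((y >> b) & 1) << (2 * b + 1);  /* y -> odd bits */
        }
        return out;
    }

    int main() {
        /* For linear index i in a w-wide image: column is i % w,
         * row is i / w (the assignment the patch corrects). */
        uint32_t w = 8, i = 10;  /* row 1, column 2 */
        printf("%u\n", (unsigned) morton((uint16_t)(i % w), (uint16_t)(i / w)));
        return 0;
    }

With the pre-patch swap, rows and columns traded places in the interleave, transposing the twiddle pattern for non-square images.
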
diff --git a/tests/test_allocator.h b/tests/test_allocator.h
index 5117f52..5967983 100644
--- a/tests/test_allocator.h
+++ b/tests/test_allocator.h
@@ -1,7 +1,9 @@
 #include "tools/test.h"
 
-#include
-#include
+#include <cstdlib>
+#include <malloc.h>
+#include <utility>
+#include <vector>
 
 #include
 #include
@@ -14,22 +16,52 @@
 static inline int round_up(int n, int multiple)
 {
     return ((n + multiple - 1) / multiple) * multiple;
 }
 
+#define POOL_SIZE (16 * 2048)
+
 class AllocatorTests : public test::TestCase {
 public:
-    uint8_t __attribute__((aligned(2048))) pool[16 * 2048];
+    uint8_t* pool = NULL;
+
+    std::vector<std::pair<void*, void*>> defrag_moves;
 
     void set_up() {
+        pool = (uint8_t*) memalign(2048, POOL_SIZE);
         assert(((intptr_t) pool) % 2048 == 0);
     }
 
     void tear_down() {
         alloc_shutdown(pool);
+        free(pool);
+    }
+
+    static void on_defrag(void* src, void* dst, void* user_data) {
+        AllocatorTests* self = (AllocatorTests*) user_data;
+        self->defrag_moves.push_back(std::make_pair(src, dst));
+    }
+
+    void test_defrag() {
+        alloc_init(pool, POOL_SIZE);
+
+        alloc_malloc(pool, 256);
+        void* a2 = alloc_malloc(pool, 256);
+        void* a3 = alloc_malloc(pool, 256);
+
+        alloc_free(pool, a2);
+
+        alloc_run_defrag(pool, &AllocatorTests::on_defrag, 5, this);
+
+        assert_equal(defrag_moves.size(), 1u);  // Moved a3 -> a2
+
+        assert_equal(defrag_moves[0].first, a3);
+        assert_equal(defrag_moves[0].second, a2);
+
+        assert_equal(alloc_malloc(pool, 256), a3);
     }
 
     void test_poor_alloc_aligned() {
         /* If we try to allocate and there are no suitable aligned
          * slots available, we fall back to any available unaligned slots */
-        alloc_init(pool, sizeof(pool));
+        alloc_init(pool, POOL_SIZE);
 
         // Leave only space for an unaligned block
         alloc_malloc(pool, (15 * 2048) - 256);
@@ -44,7 +76,7 @@ public:
         /*
          * If we try to allocate a small block, it should not
          * cross a 2048 boundary unless there is no other option */
-        alloc_init(pool, sizeof(pool));
+        alloc_init(pool, POOL_SIZE);
         alloc_malloc(pool, (15 * 2048) - 256);
         void* a1 = alloc_malloc(pool, 512);
         assert_true((uintptr_t(a1) % 2048) == 0);  // Should've aligned to the last 2048 block
@@ -59,14 +91,14 @@ public:
     }
 
     void test_alloc_init() {
-        alloc_init(pool, sizeof(pool));
+        alloc_init(pool, POOL_SIZE);
 
         void* expected_base_address = (void*) round_up((uintptr_t) pool, 2048);
         assert_equal(alloc_next_available(pool, 16), expected_base_address);
         assert_equal(alloc_base_address(pool), expected_base_address);
 
         size_t expected_blocks = (
-            uintptr_t(pool + sizeof(pool)) -
+            uintptr_t(pool + POOL_SIZE) -
             uintptr_t(expected_base_address)
         ) / 2048;
@@ -109,7 +141,7 @@ public:
     }
 
     void test_alloc_malloc() {
-        alloc_init(pool, sizeof(pool));
+        alloc_init(pool, POOL_SIZE);
 
         uint8_t* base_address = (uint8_t*) alloc_base_address(pool);
         void* a1 = alloc_malloc(pool, 1024);
diff --git a/tools/test.h b/tools/test.h
index 6341a36..b4ef409 100644
--- a/tools/test.h
+++ b/tools/test.h
@@ -289,7 +289,13 @@ public:
         std::function<void()> func = std::bind(method, dynamic_cast<T*>(instance.get()));
 
         tests_.push_back([=]() {
             instance->set_up();
-            func();
+            try {
+                func();
+            } catch(...) {
+                instance->tear_down();
+                throw;
+            }
+            instance->tear_down();
         });
     }
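
The tools/test.h change guarantees tear_down runs whether the test body returns normally or throws, which the new test_defrag relies on to free its memalign'd pool. The same guarantee can be written as a scope guard; a sketch for comparison (illustrative only; the patch's explicit try/catch keeps the control flow visible rather than hiding it in a destructor):

    #include <cstdio>
    #include <functional>
    #include <stdexcept>

    /* Runs its callable on destruction, so tear-down happens on
     * both normal exit and exceptions, the same guarantee as the
     * try/catch added to tools/test.h. */
    struct TearDownGuard {
        std::function<void()> fn;
        ~TearDownGuard() { fn(); }
    };

    int main() {
        try {
            TearDownGuard guard{[] { std::puts("tear_down"); }};
            throw std::runtime_error("test failed");
        } catch(const std::exception& e) {
            std::puts(e.what());  /* guard already ran during unwinding */
        }
        return 0;
    }
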