diff --git a/CMakeLists.txt b/CMakeLists.txt
index d39a310..8578f94 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,7 +25,7 @@ if(NOT PLATFORM_DREAMCAST)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m32")
 endif()
 
-set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast --fast-math")
+set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 --fast-math")
 
 set(
     SOURCES
diff --git a/GL/draw.c b/GL/draw.c
index ab3ade0..5491e39 100644
--- a/GL/draw.c
+++ b/GL/draw.c
@@ -53,39 +53,49 @@ void _glInitAttributePointers() {
 }
 
 GL_FORCE_INLINE GLboolean _glIsVertexDataFastPathCompatible() {
-    /*
-     * We provide a "fast path" if vertex data is provided in
-     * exactly the right format that matches what the PVR can handle.
-     * This function returns true if all the requirements are met.
+    /* The fast path is enabled when all enabled elements of the vertex
+     * match the output format. This means:
+     *
+     * xyz == 3f
+     * uv == 2f
+     * rgba == argb4444
+     * st == 2f
+     * normal == 3f
+     *
+     * When this happens we do inline straight copies of the enabled data
+     * and transforms for positions and normals happen while copying.
      */
 
-    /*
-     * At least these attributes need to be enabled, because we're not going to do any checking
-     * in the loop
-     */
-    if((ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG) != VERTEX_ENABLED_FLAG) return GL_FALSE;
-    if((ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) != UV_ENABLED_FLAG) return GL_FALSE;
-    if((ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) != DIFFUSE_ENABLED_FLAG) return GL_FALSE;
+    if((ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG)) {
+        if(VERTEX_POINTER.size != 3 || VERTEX_POINTER.type != GL_FLOAT) {
+            return GL_FALSE;
+        }
+    }
 
-    // All 3 attribute types must have a stride of 32
-    if(VERTEX_POINTER.stride != 32) return GL_FALSE;
-    if(UV_POINTER.stride != 32) return GL_FALSE;
-    if(DIFFUSE_POINTER.stride != 32) return GL_FALSE;
+    if((ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG)) {
+        if(UV_POINTER.size != 2 || UV_POINTER.type != GL_FLOAT) {
+            return GL_FALSE;
+        }
+    }
 
-    // UV must follow vertex, diffuse must follow UV
-    if((UV_POINTER.ptr - VERTEX_POINTER.ptr) != sizeof(GLfloat) * 3) return GL_FALSE;
-    if((DIFFUSE_POINTER.ptr - UV_POINTER.ptr) != sizeof(GLfloat) * 2) return GL_FALSE;
+    if((ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG)) {
+        /* FIXME: Shouldn't this be a reversed format? */
+        if(DIFFUSE_POINTER.size != GL_BGRA || DIFFUSE_POINTER.type != GL_UNSIGNED_BYTE) {
+            return GL_FALSE;
+        }
+    }
 
-    if(VERTEX_POINTER.type != GL_FLOAT) return GL_FALSE;
-    if(VERTEX_POINTER.size != 3) return GL_FALSE;
+    if((ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG)) {
+        if(ST_POINTER.size != 2 || ST_POINTER.type != GL_FLOAT) {
+            return GL_FALSE;
+        }
+    }
 
-    if(UV_POINTER.type != GL_FLOAT) return GL_FALSE;
-    if(UV_POINTER.size != 2) return GL_FALSE;
-
-    if(DIFFUSE_POINTER.type != GL_UNSIGNED_BYTE) return GL_FALSE;
-
-    /* BGRA is the required color order */
-    if(DIFFUSE_POINTER.size != GL_BGRA) return GL_FALSE;
+    if((ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG)) {
+        if(NORMAL_POINTER.size != 3 || NORMAL_POINTER.type != GL_FLOAT) {
+            return GL_FALSE;
+        }
+    }
 
     return GL_TRUE;
 }
@@ -109,7 +119,7 @@ typedef void (*FloatParseFunc)(GLfloat* out, const GLubyte* in);
 typedef void (*ByteParseFunc)(GLubyte* out, const GLubyte* in);
 typedef void (*PolyBuildFunc)(Vertex* first, Vertex* previous, Vertex* vertex, Vertex* next, const GLsizei i);
 
-static void _readVertexData3f3f(const GLubyte* in, GLubyte* out) {
+static void _readVertexData3f3f(const GLubyte* __restrict__ in, GLubyte* __restrict__ out) {
     vec3cpy(out, in);
 }
 
@@ -265,7 +275,7 @@ static void _readVertexData4ubRevARGB(const GLubyte* __restrict__ input, GLubyte
     argbcpy(output, input);
 }
 
-static void _readVertexData4fRevARGB(const GLubyte* in, GLubyte* output) {
+static void _readVertexData4fRevARGB(const GLubyte* __restrict__ in, GLubyte* __restrict__ output) {
     const float* input = (const float*) in;
 
     output[0] = (GLubyte) clamp(input[0] * 255.0f, 0, 255);
@@ -286,12 +296,12 @@ static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restri
     *((V*) out) = NegZ;
 }
 
-static void _fillWhiteARGB(const GLubyte* input, GLubyte* output) {
+static void _fillWhiteARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) {
     _GL_UNUSED(input);
     *((uint32_t*) output) = ~0;
 }
 
-static void _fillZero2f(const GLubyte* input, GLubyte* out) {
+static void _fillZero2f(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) {
     _GL_UNUSED(input);
     memset(out, sizeof(float) * 2, 0);
 }
@@ -616,25 +626,30 @@ ReadNormalFunc calcReadNormalFunc() {
     }
 }
 
-GL_FORCE_INLINE void _readPositionData(const GLuint first, const GLuint count, const Vertex* output) {
+static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) {
     const GLsizei vstride = (VERTEX_POINTER.stride) ? VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type);
-    const void* vptr = ((GLubyte*) VERTEX_POINTER.ptr + (first * vstride));
+    const GLubyte* vptr = ((GLubyte*) VERTEX_POINTER.ptr + (first * vstride));
 
-    ReadDiffuseFunc func = calcReadPositionFunc();
     GLubyte* out = (GLubyte*) output[0].xyz;
+    uint32_t* flags;
 
     ITERATE(count) {
         func(vptr, out);
         vptr += vstride;
+
+        /* Set the flags which are 4 bytes before the position. Doing it here saves
+         * an additional loop */
+        flags = (uint32_t*) out - 1;
+        *flags = GPU_CMD_VERTEX;
+
         out += sizeof(Vertex);
     }
 }
 
-GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, const Vertex* output) {
+static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count, const Vertex* output) {
     const GLsizei uvstride = (UV_POINTER.stride) ? UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type);
-    const void* uvptr = ((GLubyte*) UV_POINTER.ptr + (first * uvstride));
+    const GLubyte* uvptr = ((GLubyte*) UV_POINTER.ptr + (first * uvstride));
 
-    ReadUVFunc func = calcReadUVFunc();
     GLubyte* out = (GLubyte*) output[0].uv;
 
     ITERATE(count) {
@@ -644,11 +659,10 @@ GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, const V
     }
 }
 
-GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, const VertexExtra* extra) {
+static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) {
     const GLsizei ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
-    const void* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride));
+    const GLubyte* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride));
 
-    ReadUVFunc func = calcReadSTFunc();
     GLubyte* out = (GLubyte*) extra[0].st;
 
     ITERATE(count) {
@@ -658,11 +672,10 @@ GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, const V
     }
 }
 
-GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, const VertexExtra* extra) {
+static void _readNormalData(ReadNormalFunc func, const GLuint first, const GLuint count, const VertexExtra* extra) {
     const GLsizei nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
-    const void* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride));
+    const GLubyte* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride));
 
-    ReadNormalFunc func = calcReadNormalFunc();
     GLubyte* out = (GLubyte*) extra[0].nxyz;
 
     ITERATE(count) {
@@ -689,12 +702,11 @@ GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, con
     }
 }
 
-GL_FORCE_INLINE void _readDiffuseData(const GLuint first, const GLuint count, const Vertex* output) {
+static void _readDiffuseData(ReadDiffuseFunc func, const GLuint first, const GLuint count, const Vertex* output) {
     const GLuint size = (DIFFUSE_POINTER.size == GL_BGRA) ? 4 : DIFFUSE_POINTER.size;
     const GLuint cstride = (DIFFUSE_POINTER.stride) ? DIFFUSE_POINTER.stride : size * byte_size(DIFFUSE_POINTER.type);
     const GLubyte* cptr = ((GLubyte*) DIFFUSE_POINTER.ptr) + (first * cstride);
 
-    ReadDiffuseFunc func = calcReadDiffuseFunc();
     GLubyte* out = (GLubyte*) output[0].bgra;
 
     ITERATE(count) {
@@ -765,43 +777,87 @@ static void generateElements(
     }
 }
 
-static const uint32_t FAST_PATH_BYTE_SIZE = (sizeof(GLfloat) * 3) + (sizeof(GLfloat) * 2) + (sizeof(GLubyte) * 4);
-
-static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first, const GLuint count, const GLenum type) {
+static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first, const GLuint count) {
     Vertex* start = _glSubmissionTargetStart(target);
+
+    const GLuint vstride = (VERTEX_POINTER.stride) ?
+        VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type);
+
+    const GLuint uvstride = (UV_POINTER.stride) ?
+        UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type);
+
+    const GLuint ststride = (ST_POINTER.stride) ?
+        ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
+
+    const GLuint dstride = (DIFFUSE_POINTER.stride) ?
+        DIFFUSE_POINTER.stride : DIFFUSE_POINTER.size * byte_size(DIFFUSE_POINTER.type);
+
+    const GLuint nstride = (NORMAL_POINTER.stride) ?
+        NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
+
+    /* Copy the pos, uv and color directly in one go */
-    const GLubyte* pos = VERTEX_POINTER.ptr;
-    Vertex* it = start;
-
-    ITERATE(count) {
-        it->flags = GPU_CMD_VERTEX;
-        MEMCPY4(it->xyz, pos, FAST_PATH_BYTE_SIZE);
-        it++;
-        pos += VERTEX_POINTER.stride;
-    }
+    const GLubyte* pos = (ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG) ? VERTEX_POINTER.ptr + (first * vstride) : NULL;
+    const GLubyte* uv = (ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) ? UV_POINTER.ptr + (first * uvstride) : NULL;
+    const GLubyte* col = (ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) ? DIFFUSE_POINTER.ptr + (first * dstride) : NULL;
+    const GLubyte* st = (ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) ? ST_POINTER.ptr + (first * ststride) : NULL;
+    const GLubyte* n = (ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) ? NORMAL_POINTER.ptr + (first * nstride) : NULL;
 
     VertexExtra* ve = aligned_vector_at(target->extras, 0);
+    Vertex* it = start;
 
-    _readNormalData(first, count, ve);
-    _readSTData(first, count, ve);
+    const float w = 1.0f;
+
+    uint32_t i = count;
+
+    while(i--) {
+        it->flags = GPU_CMD_VERTEX;
+
+        if(pos) {
+            TransformVertex((const float*) pos, &w, it->xyz, &it->w);
+            pos += vstride;
+        }
+
+        if(uv) {
+            MEMCPY4(it->uv, uv, sizeof(float) * 2);
+            uv += uvstride;
+        }
+
+        if(col) {
+            MEMCPY4(it->bgra, col, sizeof(uint32_t));
+            col += dstride;
+        }
+
+        if(st) {
+            MEMCPY4(ve->st, st, sizeof(float) * 2);
+            st += ststride;
+        }
+
+        if(n) {
+            MEMCPY4(ve->nxyz, n, sizeof(float) * 3);
+            n += nstride;
+        }
+
+        it++;
+        ve++;
+    }
 }
 
-static void generateArrays(SubmissionTarget* target, const GLsizei first, const GLuint count, const GLenum type) {
+static void generateArrays(SubmissionTarget* target, const GLsizei first, const GLuint count) {
     Vertex* start = _glSubmissionTargetStart(target);
 
-    _readPositionData(first, count, start);
-    _readDiffuseData(first, count, start);
-    _readUVData(first, count, start);
-
-    Vertex* it = _glSubmissionTargetStart(target);
-
-    ITERATE(count) {
-        it->flags = GPU_CMD_VERTEX;
-        ++it;
-    }
-
     VertexExtra* ve = aligned_vector_at(target->extras, 0);
-    _readNormalData(first, count, ve);
-    _readSTData(first, count, ve);
+
+    ReadPositionFunc pfunc = calcReadPositionFunc();
+    ReadDiffuseFunc dfunc = calcReadDiffuseFunc();
+    ReadUVFunc uvfunc = calcReadUVFunc();
+    ReadNormalFunc nfunc = calcReadNormalFunc();
+    ReadUVFunc stfunc = calcReadSTFunc();
+
+    _readPositionData(pfunc, first, count, start);
+    _readDiffuseData(dfunc, first, count, start);
+    _readUVData(uvfunc, first, count, start);
+    _readNormalData(nfunc, first, count, ve);
+    _readSTData(stfunc, first, count, ve);
 }
 
 static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei first, const GLuint count,
@@ -812,9 +868,9 @@ static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei
     if(indices) {
         generateElements(target, first, count, indices, type);
     } else if(FAST_PATH_ENABLED) {
-        generateArraysFastPath(target, first, count, type);
+        generateArraysFastPath(target, first, count);
    } else {
-        generateArrays(target, first, count, type);
+        generateArrays(target, first, count);
    }
 
     Vertex* it = _glSubmissionTargetStart(target);
@@ -843,8 +899,6 @@ static void transform(SubmissionTarget* target) {
     /* Perform modelview transform, storing W */
     Vertex* vertex = _glSubmissionTargetStart(target);
 
-    _glApplyRenderMatrix(); /* Apply the Render Matrix Stack */
-
     TransformVertices(vertex, target->count);
 }
 
@@ -1048,13 +1102,18 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
 
     /* Make room for the vertices and header */
     aligned_vector_extend(&target->output->vector, target->count + 1);
+
+    _glApplyRenderMatrix(); /* Apply the Render Matrix Stack */
+
     generate(target, mode, first, count, (GLubyte*) indices, type);
 
     if(doLighting){
         light(target);
     }
 
-    transform(target);
+    if(!FAST_PATH_ENABLED) {
+        transform(target);
+    }
 
     if(_glIsClippingEnabled()) {
 #if DEBUG_CLIPPING
diff --git a/GL/platforms/sh4.h b/GL/platforms/sh4.h
index d84b25e..b1b6823 100644
--- a/GL/platforms/sh4.h
+++ b/GL/platforms/sh4.h
@@ -69,6 +69,25 @@ inline void TransformVec4(float* x) {
 
 }
 
+static inline void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) {
+    register float __x __asm__("fr12") = (xyz[0]);
+    register float __y __asm__("fr13") = (xyz[1]);
+    register float __z __asm__("fr14") = (xyz[2]);
+    register float __w __asm__("fr15") = (*w);
+
+    __asm__ __volatile__(
+        "fldi1 fr15\n"
+        "ftrv xmtrx,fv12\n"
+        : "=f" (__x), "=f" (__y), "=f" (__z), "=f" (__w)
+        : "0" (__x), "1" (__y), "2" (__z), "3" (__w)
+    );
+
+    oxyz[0] = __x;
+    oxyz[1] = __y;
+    oxyz[2] = __z;
+    *ow = __w;
+}
+
 static inline void TransformVertices(Vertex* vertices, const int count) {
     Vertex* it = vertices;
     for(int i = 0; i < count; ++i, ++it) {
diff --git a/GL/platforms/software.c b/GL/platforms/software.c
index 7e904a0..40ccf8f 100644
--- a/GL/platforms/software.c
+++ b/GL/platforms/software.c
@@ -328,3 +328,18 @@ void TransformVertices(Vertex* vertices, const int count) {
         vertices->w = ret[3];
     }
 }
+
+void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow) {
+    float ret[4];
+    ret[0] = xyz[0];
+    ret[1] = xyz[1];
+    ret[2] = xyz[2];
+    ret[3] = *w;
+
+    TransformVec4(ret);
+
+    oxyz[0] = ret[0];
+    oxyz[1] = ret[1];
+    oxyz[2] = ret[2];
+    *ow = ret[3];
+}
diff --git a/GL/platforms/software.h b/GL/platforms/software.h
index 47fa9a6..e3a3a03 100644
--- a/GL/platforms/software.h
+++ b/GL/platforms/software.h
@@ -50,6 +50,7 @@ static inline void TransformNormalNoMod(const float* xIn, float* xOut) {
 }
 
 void TransformVertices(Vertex* vertices, const int count);
+void TransformVertex(const float* xyz, const float* w, float* oxyz, float* ow);
 
 void InitGPU(_Bool autosort, _Bool fsaa);
 
diff --git a/containers/aligned_vector.h b/containers/aligned_vector.h
index 53128b8..c99bbf8 100644
--- a/containers/aligned_vector.h
+++ b/containers/aligned_vector.h
@@ -16,16 +16,22 @@ typedef struct {
 
 #define ALIGNED_VECTOR_CHUNK_SIZE 256u
 
+#define AV_NO_INSTRUMENT inline __attribute__((no_instrument_function))
+#define AV_INLINE_DEBUG AV_NO_INSTRUMENT __attribute__((always_inline))
+#define AV_FORCE_INLINE static AV_INLINE_DEBUG
+
 void aligned_vector_init(AlignedVector* vector, unsigned int element_size);
 void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
 void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count);
 void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
-static inline void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
+
+AV_FORCE_INLINE void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
     assert(index < vector->size);
     return &vector->data[index * vector->element_size];
 }
 void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count);
-static inline void aligned_vector_clear(AlignedVector* vector){
+
+AV_FORCE_INLINE void aligned_vector_clear(AlignedVector* vector){
     vector->size = 0;
 }
 void aligned_vector_shrink_to_fit(AlignedVector* vector);
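
Note on the fast path (illustrative, not part of the patch): with this change the compatibility check keys off each enabled attribute's size and type rather than a fixed 32-byte interleaved layout, and generateArraysFastPath() transforms positions with TransformVertex() while copying, which is why submitVertices() now skips transform() when FAST_PATH_ENABLED is set. The sketch below shows client state that should satisfy _glIsVertexDataFastPathCompatible(); the vertex struct and its names are hypothetical, the ST (second texture coordinate) array is left disabled, and whether FAST_PATH_ENABLED is actually switched on is decided elsewhere in GLdc, outside this diff.

    #include <GL/gl.h>  /* GLdc's GL header; adjust the include path to your setup */

    /* Hypothetical interleaved layout matching the checks above:
     * position 3x GL_FLOAT, uv 2x GL_FLOAT, diffuse 4 bytes in BGRA order,
     * normal 3x GL_FLOAT. Any stride works; when a stride of 0 is passed the
     * code derives it from size * byte_size(type). */
    typedef struct {
        GLfloat x, y, z;
        GLfloat u, v;
        GLubyte bgra[4];
        GLfloat nx, ny, nz;
    } FastPathVertex;               /* name is illustrative */

    static FastPathVertex quad[4];  /* filled elsewhere */

    static void draw_quad(void) {
        const GLsizei stride = sizeof(FastPathVertex);

        glEnableClientState(GL_VERTEX_ARRAY);
        glEnableClientState(GL_TEXTURE_COORD_ARRAY);
        glEnableClientState(GL_COLOR_ARRAY);
        glEnableClientState(GL_NORMAL_ARRAY);

        glVertexPointer(3, GL_FLOAT, stride, &quad[0].x);
        glTexCoordPointer(2, GL_FLOAT, stride, &quad[0].u);
        /* size GL_BGRA with GL_UNSIGNED_BYTE is what the check requires */
        glColorPointer(GL_BGRA, GL_UNSIGNED_BYTE, stride, quad[0].bgra);
        glNormalPointer(GL_FLOAT, stride, &quad[0].nx);

        glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
    }

Because positions are transformed during the copy, the fast path saves both the per-vertex flag-setting loop and the separate TransformVertices() pass over the submission target.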