diff --git a/GL/config.h b/GL/config.h
index f119e9a..7e3be3a 100644
--- a/GL/config.h
+++ b/GL/config.h
@@ -1,9 +1,8 @@
+#pragma once
 #ifndef CONFIG_H
 #define CONFIG_H
 
-
 /* This figure is derived from the needs of Quake 1 */
 #define MAX_TEXTURE_COUNT 1088
 
-
 #endif // CONFIG_H
diff --git a/GL/cygprofile.c b/GL/cygprofile.c
new file mode 100644
index 0000000..293d179
--- /dev/null
+++ b/GL/cygprofile.c
@@ -0,0 +1,227 @@
+/* Based on the idea from Erich Styger */
+/* profiled instrument guided profiling for gldc on hardware */
+
+#include "cygprofile.h"
+#include <kos.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "perfctr.h"
+#include "private.h"
+
+#if CYG_FUNC_TRACE_ENABLED
+
+#define _strcat(x, y, z) strncat(x, z, y)
+
+#ifndef __PE_Error_H
+#define __PE_Error_H
+
+#define ERR_OK 0       /* OK */
+#define ERR_SPEED 1    /* This device does not work in the active speed mode. */
+#define ERR_RANGE 2    /* Parameter out of range. */
+#define ERR_VALUE 3    /* Parameter of incorrect value. */
+#define ERR_OVERFLOW 4 /* Timer overflow. */
+#define ERR_MATH 5     /* Overflow during evaluation. */
+#define ERR_ENABLED 6  /* Device is enabled. */
+#define ERR_DISABLED 7 /* Device is disabled. */
+#define ERR_BUSY 8     /* Device is busy. */
+#define ERR_NOTAVAIL 9 /* Requested value or method not available. */
+#define ERR_RXEMPTY 10 /* No data in receiver. */
+#define ERR_TXFULL 11  /* Transmitter is full. */
+#define ERR_BUSOFF 12  /* Bus not available. */
+#define ERR_OVERRUN 13 /* Overrun error is detected. */
+#define ERR_FRAMING 14 /* Framing error is detected. */
+#define ERR_PARITY 15  /* Parity error is detected. */
+#define ERR_NOISE 16   /* Noise error is detected. */
+#define ERR_IDLE 17    /* Idle error is detected. */
+#define ERR_FAULT 18   /* Fault error is detected. */
+#define ERR_BREAK 19   /* Break char is received during communication. */
+#define ERR_CRC 20     /* CRC error is detected. */
+#define ERR_ARBITR 21  /* A node lost arbitration. This error occurs if two nodes start transmission at the same time. */
+#define ERR_PROTECT 22 /* Protection error is detected. */
+
+#endif /* __PE_Error_H */
+
+#define CYG_RNG_BUF_NOF_ELEMS (8096 * 4)
+/*!< Number of slots in CYG_RNG_buffer; CYG_RNG_Put() indexes slots by HASH_PAIR() modulo this value */
+#define CYG_THUMB_MASK 0xFFFFFFFF
+/*!< Mask ANDed onto addresses when they are printed; all 32 bits are set, so no bit (thumb LSB included) is cleared */
+
+/* Combines two uint32_t values into one hash: x multiplied by 0x1f1f1f1f, then XORed with y */
+#define HASH_PAIR(x, y) (((x)*0x1f1f1f1f) ^ (y))
+
+static bool CYG_Enabled = false; /*!< flag which enables/disables tracing */
+
+/*!
+ * One slot of the trace table: a pair of addresses plus a hit counter.
+ */
+typedef struct
+{
+  //bool isEnter;    /*!< TRUE for __cyg_profile_func_enter(), FALSE for __cyg_profile_func_exit() */
+  void *this_fn;    /*!< first address of the recorded pair, exactly as handed to CYG_Store() */
+  void *call_site;  /*!< second address of the recorded pair; still NULL in a slot that was never written */
+  uint32_t counter; /* hit count: 0 marks an empty slot, CYG_Store() starts it at 1, CYG_RNG_Put() increments it */
+} CYG_RNG_ElementType;
+
+typedef uint32_t CYG_RNG_BufSizeType; /*!< index type for ring buffer */
+
+static CYG_RNG_ElementType CYG_RNG_buffer[CYG_RNG_BUF_NOF_ELEMS]; /*!< ring buffer */
+//static CYG_RNG_BufSizeType CYG_RNG_inIdx;                         /*!< input index */
+static CYG_RNG_BufSizeType CYG_RNG_outIdx; /*!< output index */
+static CYG_RNG_BufSizeType CYG_RNG_inSize; /*!< size/number of elements in buffer */
+
+/*!
+ * \brief Records one call pair in the hash-indexed trace table.
+ *
+ * The pair (this_fn, call_site) selects a starting slot via HASH_PAIR().
+ * A slot with counter == 0 is free and receives a copy of \p elem; a slot
+ * that already holds the same pair has its counter incremented. If the slot
+ * holds a different pair, the next slot is tried (linear probing) so that
+ * colliding pairs are no longer merged into one counter.
+ *
+ * \param elem Trace element to record; its counter is the initial count.
+ * \return ERR_OK on success, ERR_TXFULL if every slot holds another pair.
+ */
+__attribute__((no_instrument_function)) static uint8_t CYG_RNG_Put(CYG_RNG_ElementType *elem) {
+  CYG_RNG_BufSizeType slot = HASH_PAIR((uint32_t)elem->call_site, (uint32_t)elem->this_fn) % CYG_RNG_BUF_NOF_ELEMS;
+  CYG_RNG_BufSizeType probes;
+
+  /* Bounded probe: visit each slot at most once so a full table cannot spin forever. */
+  for (probes = 0; probes < CYG_RNG_BUF_NOF_ELEMS; probes++) {
+    CYG_RNG_ElementType *possible = &CYG_RNG_buffer[slot];
+
+    if (possible->counter == 0) {
+      /* Free slot: first sighting of this pair. */
+      *possible = *elem;
+      return ERR_OK;
+    }
+    if (possible->this_fn == elem->this_fn && possible->call_site == elem->call_site) {
+      /* Same pair seen again: just count it. */
+      possible->counter++;
+      return ERR_OK;
+    }
+
+    /* Occupied by a different pair: step to the next slot, wrapping at the end. */
+    slot++;
+    if (slot == CYG_RNG_BUF_NOF_ELEMS) {
+      slot = 0;
+    }
+  }
+
+  /* Table full and the pair is not present: drop the sample. */
+  return ERR_TXFULL;
+}
+
+/*!
+ * \brief Copies the slot at CYG_RNG_outIdx out of the trace table and advances the read index.
+ * \param elemP Pointer where to store the trace element.
+ * \return ERR_OK if a slot was copied, ERR_RXEMPTY if CYG_RNG_inSize is 0.
+ */
+__attribute__((no_instrument_function)) static uint8_t CYG_RNG_Get(CYG_RNG_ElementType *elemP) {
+  uint8_t res = ERR_OK;
+
+  if (CYG_RNG_inSize == 0) {
+    res = ERR_RXEMPTY; /* nothing left to read; *elemP is left untouched */
+  } else {
+    *elemP = CYG_RNG_buffer[CYG_RNG_outIdx];
+    CYG_RNG_inSize--;
+    CYG_RNG_outIdx++;
+    if (CYG_RNG_outIdx == CYG_RNG_BUF_NOF_ELEMS) {
+      CYG_RNG_outIdx = 0; /* wrap around to the first slot */
+    }
+  }
+  return res;
+}
+
+static uint32_t currentTime[2];
+static uint32_t lastTime;
+
+/*!
+ * \brief Builds a trace element with counter 1 and passes it to CYG_RNG_Put().
+ * \param this_fn Value stored in the element's this_fn field.
+ * \param call_site Value stored in the element's call_site field.
+ * \note Returns nothing; the status code from CYG_RNG_Put() is discarded.
+ */
+__attribute__((no_instrument_function)) static void CYG_Store(void *this_fn, void *call_site) {
+  CYG_RNG_ElementType elem;
+  lastTime = currentTime[0]; /* keep the previous low-word reading */
+  PMCR_Read(1, (unsigned int *)currentTime); /* sample performance counter 1 into currentTime[] */
+  //elem.isEnter = isEnter;
+  elem.call_site = call_site;
+  elem.this_fn = this_fn;
+  elem.counter = 1;  //currentTime[0] - lastTime;
+  CYG_RNG_Put(&elem);
+}
+
+/*!
+ * \brief Function which is called upon function enter. The function call is inserted by the compiler.
+ * \param this_fn First hook argument; forwarded as CYG_Store()'s second argument.
+ * \param call_site Second hook argument; forwarded as CYG_Store()'s first argument.
+ */
+__attribute__((no_instrument_function)) void __cyg_profile_func_enter(void *this_fn, void *call_site) {
+  if (CYG_Enabled) { /* record only while tracing is switched on */
+    CYG_Store(call_site, this_fn); /* NOTE(review): order is swapped relative to CYG_Store(this_fn, call_site) - confirm intended */
+  }
+}
+
+/*!
+ * \brief Function which is called upon function exit. The body is empty, so exits are not recorded.
+ * \param this_fn Unused.
+ * \param call_site Unused.
+ */
+__attribute__((no_instrument_function)) void __cyg_profile_func_exit(__attribute__((unused)) void *this_fn, __attribute__((unused)) void *call_site) {
+}
+
+/*!
+ * \brief Dumps the trace to the console.
+ *
+ * Tracing is switched off first. The first line printed is the offset of
+ * _etext from BASE_ADDRESS; every non-empty slot then follows as
+ * "{ 0x<this_fn> 0x<call_site> <counter>". Each line goes through "%s" so
+ * the formatted text is never itself interpreted as a format string.
+ */
+__attribute__((no_instrument_function)) void CYG_PrintCallTrace(void) {
+  CYG_RNG_BufSizeType i;
+  char buf[40]; /* holds one formatted trace line */
+  CYG_RNG_ElementType elem;
+  uint8_t res;
+
+  CYG_Enabled = false; /* stop recording while the table is being read */
+  printf("0x%08x\n", ((unsigned int)&_etext) - BASE_ADDRESS);
+  CYG_RNG_outIdx = 0; /* always start reading at slot 0 */
+  for (i = 0; i < CYG_RNG_BUF_NOF_ELEMS; i++) {
+    buf[0] = '\0';
+    res = CYG_RNG_Get(&elem);
+    if (res == ERR_OK && elem.call_site != NULL) { /* zeroed (never written) slots have a NULL call_site */
+      snprintf(buf, sizeof(buf), "{ 0x%" PRIXPTR " 0x%" PRIXPTR " %u\r\n", (uintptr_t)(elem.this_fn) & CYG_THUMB_MASK, (uintptr_t)(elem.call_site) & CYG_THUMB_MASK, (unsigned int)elem.counter);
+      printf("%s", buf);
+    }
+  }
+}
+
+/*! \brief Clears the trace table, starts performance counter 1 and enables tracing (no-op if already enabled). */
+__attribute__((no_instrument_function)) void CYG_Init(void) {
+  if (CYG_Enabled) { return; }
+  CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS; /* lets CYG_RNG_Get() walk every slot */
+  CYG_RNG_outIdx = 0;
+  currentTime[0] = currentTime[1] = 0;
+  lastTime = 0;
+  memset(CYG_RNG_buffer, 0, sizeof(CYG_RNG_buffer));
+  PMCR_Init(1, PMCR_ELAPSED_TIME_MODE, PMCR_COUNT_CPU_CYCLES);
+  /* Enable last: the hooks must not record into a table that is still being cleared. */
+  CYG_Enabled = true;
+}
+
+__attribute__((no_instrument_function)) void CYG_Deinit(void) { /* Disables tracing and wipes the trace table. */
+  CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS;
+  CYG_RNG_outIdx = 0;
+  CYG_Enabled = false; /* hooks stop recording from here on */
+  memset(CYG_RNG_buffer, 0, sizeof(CYG_RNG_buffer)); /* NOTE(review): performance counter 1 started by CYG_Init() is not stopped here - confirm intended */
+}
+#else
+
+void CYG_PrintCallTrace(void){} /* no-op stub used when CYG_FUNC_TRACE_ENABLED is 0 */
+void CYG_Init(void){}           /* no-op stub used when CYG_FUNC_TRACE_ENABLED is 0 */
+void CYG_Deinit(void){}         /* no-op stub used when CYG_FUNC_TRACE_ENABLED is 0 */
+
+#endif
diff --git a/GL/cygprofile.h b/GL/cygprofile.h
new file mode 100644
index 0000000..7f2cbcb
--- /dev/null
+++ b/GL/cygprofile.h
@@ -0,0 +1,33 @@
+#pragma once
+#ifndef CYGPROFILE_H_
+#define CYGPROFILE_H_
+
+/* Based on the idea from Erich Styger */
+/* profiled instrument guided profiling for gldc on hardware */
+
+#define NO_INSTRUMENT inline __attribute__((no_instrument_function)) /* inline + no_instrument_function attribute */
+#define INLINE_DEBUG NO_INSTRUMENT __attribute__((always_inline)) /* NO_INSTRUMENT plus always_inline; adds no storage class */
+#define INLINE_ALWAYS static NO_INSTRUMENT __attribute__((always_inline)) /* same as INLINE_DEBUG but with static linkage */
+
+extern char _etext; /* CYG_PrintCallTrace() prints this symbol's address minus BASE_ADDRESS; presumably the linker's end-of-text marker - confirm */
+#define BASE_ADDRESS 0x8c010000 /* subtracted from &_etext in the trace header line; presumably the program load address - confirm */
+
+#define CYG_FUNC_TRACE_ENABLED (1)
+/*!< 1: Trace enabled, 0: trace disabled (cygprofile.c then builds empty stubs and perfctr.c builds nothing) */
+
+/*!
+ * \brief Print the call trace to the terminal.
+ */
+void CYG_PrintCallTrace(void);
+
+/*!
+ * \brief Driver Initialization.
+ */
+void CYG_Init(void);
+
+/*!
+ * \brief Driver De-Initialization.
+ */
+void CYG_Deinit(void);
+
+#endif /* CYGPROFILE_H_ */
\ No newline at end of file
diff --git a/GL/draw.c b/GL/draw.c
index 68331f4..e97e263 100644
--- a/GL/draw.c
+++ b/GL/draw.c
@@ -56,7 +56,7 @@ void _glInitAttributePointers() {
     NORMAL_POINTER.size = 3;
 }
 
-static inline GLuint byte_size(GLenum type) {
+static INLINE_DEBUG GLuint byte_size(GLenum type) {
     switch(type) {
     case GL_BYTE: return sizeof(GLbyte);
     case GL_UNSIGNED_BYTE: return sizeof(GLubyte);
@@ -513,7 +513,7 @@ PVRHeader* _glSubmissionTargetHeader(SubmissionTarget* target) {
     return aligned_vector_at(&target->output->vector, target->header_offset);
 }
 
-Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
+INLINE_DEBUG Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
     assert(target->start_offset < target->output->vector.size);
     return aligned_vector_at(&target->output->vector, target->start_offset);
 }
@@ -1006,6 +1006,7 @@ static void mat_transform_normal3(const float* xyz, const float* xyzOut, const u
 
 static void light(SubmissionTarget* target) {
 
+#if 0
     typedef struct {
         float xyz[3];
         float n[3];
@@ -1057,6 +1058,35 @@ static void light(SubmissionTarget* target) {
         vertex->bgra[G8IDX] = (GLubyte) (255.0f * fminf(total[1], 1.0f));
         vertex->bgra[B8IDX] = (GLubyte) (255.0f * fminf(total[2], 1.0f));
     }
+#endif
+
+    if(!_glIsLightingEnabled()) {
+        return;
+    }
+
+    static AlignedVector* eye_space_data = NULL;
+
+    if(!eye_space_data) {
+        eye_space_data = (AlignedVector*) malloc(sizeof(AlignedVector));
+        aligned_vector_init(eye_space_data, sizeof(EyeSpaceData));
+    }
+
+    aligned_vector_resize(eye_space_data, target->count);
+
+    /* Perform lighting calculations and manipulate the colour */
+    Vertex* vertex = _glSubmissionTargetStart(target);
+    VertexExtra* extra = aligned_vector_at(target->extras, 0);
+    EyeSpaceData* eye_space = (EyeSpaceData*) eye_space_data->data;
+
+    _glMatrixLoadModelView();
+    mat_transform3(vertex->xyz, eye_space->xyz, target->count, sizeof(Vertex), sizeof(EyeSpaceData));
+
+    _glMatrixLoadNormal();
+    mat_transform_normal3(extra->nxyz, eye_space->n, target->count, sizeof(VertexExtra), sizeof(EyeSpaceData));
+
+    EyeSpaceData* ES = aligned_vector_at(eye_space_data, 0);
+    _glPerformLighting(vertex, ES, target->count);
+
 }
 
 static void divide(SubmissionTarget* target) {
diff --git a/GL/framebuffer.c b/GL/framebuffer.c
index 7ab7995..5ca2824 100644
--- a/GL/framebuffer.c
+++ b/GL/framebuffer.c
@@ -1,5 +1,6 @@
 #include <stdio.h>
 #include "private.h"
+#include "config.h"
 #include "../include/glkos.h"
 #include "../include/glext.h"
 
@@ -94,62 +95,62 @@ void APIENTRY glFramebufferTexture2DEXT(GLenum target, GLenum attachment, GLenum
     ACTIVE_FRAMEBUFFER->texture_id = texture;
 }
 
-static inline GLuint A1555(GLuint v) {
+static INLINE_DEBUG GLuint A1555(GLuint v) {
     const GLuint MASK = (1 << 15);
     return (v & MASK) >> 15;
 }
 
-static inline GLuint R1555(GLuint v) {
+static INLINE_DEBUG GLuint R1555(GLuint v) {
     const GLuint MASK = (31 << 10);
     return (v & MASK) >> 10;
 }
 
-static inline GLuint G1555(GLuint v) {
+static INLINE_DEBUG GLuint G1555(GLuint v) {
     const GLuint MASK = (31 << 5);
     return (v & MASK) >> 5;
 }
 
-static inline GLuint B1555(GLuint v) {
+static INLINE_DEBUG GLuint B1555(GLuint v) {
     const GLuint MASK = (31 << 0);
     return (v & MASK) >> 0;
 }
 
-static inline GLuint A4444(GLuint v) {
+static INLINE_DEBUG GLuint A4444(GLuint v) {
     const GLuint MASK = (0xF << 12);
     return (v & MASK) >> 12;
 }
 
-static inline GLuint R4444(GLuint v) {
+static INLINE_DEBUG GLuint R4444(GLuint v) {
     const GLuint MASK = (0xF << 8);
     return (v & MASK) >> 8;
 }
 
-static inline GLuint G4444(GLuint v) {
+static INLINE_DEBUG GLuint G4444(GLuint v) {
     const GLuint MASK = (0xF << 4);
     return (v & MASK) >> 4;
 }
 
-static inline GLuint B4444(GLuint v) {
+static INLINE_DEBUG GLuint B4444(GLuint v) {
     const GLuint MASK = (0xF << 0);
     return (v & MASK) >> 0;
 }
 
-static inline GLuint R565(GLuint v) {
+static INLINE_DEBUG GLuint R565(GLuint v) {
     const GLuint MASK = (31 << 11);
     return (v & MASK) >> 11;
 }
 
-static inline GLuint G565(GLuint v) {
+static INLINE_DEBUG GLuint G565(GLuint v) {
     const GLuint MASK = (63 << 5);
     return (v & MASK) >> 5;
 }
 
-static inline GLuint B565(GLuint v) {
+static INLINE_DEBUG GLuint B565(GLuint v) {
     const GLuint MASK = (31 << 0);
     return (v & MASK) >> 0;
 }
 
-GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) {
+static NO_INSTRUMENT GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) {
     GLushort* s1 = ((GLushort*) src);
     GLushort* s2 = ((GLushort*) src) + 1;
     GLushort* s3 = ((GLushort*) src) + srcWidth;
diff --git a/GL/gldc.c b/GL/gldc.c
index 22b8c2e..9e4f2df 100644
--- a/GL/gldc.c
+++ b/GL/gldc.c
@@ -19,3 +19,7 @@
 #include "matrix.c"
 #include "state.c"
 #include "texture.c"
+
+#include "../containers/stack.c"
+#include "../containers/aligned_vector.c"
+#include "../containers/named_array.c"
\ No newline at end of file
diff --git a/GL/lighting.c b/GL/lighting.c
index c756df8..10038a5 100644
--- a/GL/lighting.c
+++ b/GL/lighting.c
@@ -281,98 +281,143 @@ static inline float FPOW(float b, float p) {
     return FEXP(FLOG(b) * p);
 }
 
-void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) __attribute__((optimize("fast-math")));
-void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) {
-    LightSource* l = &LIGHTS[light];
-
-    struct vec3f L = {
-        l->position[0],
-        l->position[1],
-        l->position[2]
-    };
-
-    if(!l->is_directional) {
-        L.x -= pos[0];
-        L.y -= pos[1];
-        L.z -= pos[2];
-    }
-
-    struct vec3f N = {
-        normal[0],
-        normal[1],
-        normal[2]
-    };
-
-    struct vec3f V = {
-        pos[0],
-        pos[1],
-        pos[2]
-    };
-
-    GLfloat d;
-    vec3f_length(L.x, L.y, L.z, d);
-
-    GLfloat oneOverL = 1.0f / d;
-
-    L.x *= oneOverL;
-    L.y *= oneOverL;
-    L.z *= oneOverL;
-
-    vec3f_normalize(V.x, V.y, V.z);
-
-    GLfloat NdotL, VdotN;
-    vec3f_dot(N.x, N.y, N.z, L.x, L.y, L.z, NdotL);
-    vec3f_dot(V.x, V.y, V.z, N.x, N.y, N.z, VdotN);
-
-    GLfloat VdotR = VdotN - NdotL;
-    GLfloat specularPower = FPOW(VdotR > 0 ? VdotR : 0, MATERIAL.exponent);
-
-    GLboolean colorMaterial = _glIsColorMaterialEnabled();
-
-    GLfloat mD [] = {
-        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.diffuse[0],
-        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.diffuse[1],
-        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.diffuse[2],
-        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.diffuse[3]
-    };
-
-    GLfloat mA [] = {
-        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.ambient[0],
-        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.ambient[1],
-        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.ambient[2],
-        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.ambient[3]
-    };
-
-    GLfloat mS [] = {
-        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.specular[0],
-        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.specular[1],
-        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.specular[2],
-        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.specular[3]
-    };
-
-    colour[0] = l->ambient[0] * mA[0];
-    colour[1] = l->ambient[1] * mA[1];
-    colour[2] = l->ambient[2] * mA[2];
-    colour[3] = mD[3];
-
-    if(NdotL >= 0) {
-        colour[0] += (l->diffuse[0] * mD[0] * NdotL + l->specular[0] * mS[0] * specularPower);
-        colour[1] += (l->diffuse[1] * mD[1] * NdotL + l->specular[1] * mS[1] * specularPower);
-        colour[2] += (l->diffuse[2] * mD[2] * NdotL + l->specular[2] * mS[2] * specularPower);
-    }
-
-    if(!l->is_directional) {
-        GLfloat att = (
-            1.0f / (l->constant_attenuation + (l->linear_attenuation * d) + (l->quadratic_attenuation * d * d))
-        );
-
-        colour[0] *= att;
-        colour[1] *= att;
-        colour[2] *= att;
-    }
-
-    if(colour[0] > 1.0f) colour[0] = 1.0f;
-    if(colour[1] > 1.0f) colour[1] = 1.0f;
-    if(colour[2] > 1.0f) colour[2] = 1.0f;
-    if(colour[3] > 1.0f) colour[3] = 1.0f;
+#define LIGHT_COMPONENT(C) { /* adds one light's contribution for channel C to final[C]; MA, MD, MS, light, LdotN, NdotH, att, spot and final come from the expansion site */ \
+    const GLfloat* acm = &MA[C]; /* ambient material term */ \
+    const GLfloat* dcm = &MD[C]; /* diffuse material term */ \
+    const GLfloat* scm = &MS[C]; /* specular material term */ \
+    const GLfloat* scli = &light->specular[C]; \
+    const GLfloat* dcli = &light->diffuse[C]; \
+    const GLfloat* acli = &light->ambient[C]; \
+    const GLfloat* srm = &MATERIAL.exponent; /* exponent applied to the specular base */ \
+    const GLfloat fi = (LdotN == 0) ? 0 : 1; /* zeroes the specular base when LdotN is exactly 0 */ \
+    GLfloat component = (*acm * *acli); \
+    component += (LdotN * *dcm * *dcli); \
+    component += (FPOW((fi * NdotH), *srm) * *scm * *scli); \
+    component *= att; \
+    component *= spot; \
+    final[C] += component; \
 }
+
+static inline float vec3_dot_limited( /* 3-component dot product, clamped so the result is never negative */
+        const float* x1, const float* y1, const float* z1, /* first vector, one pointer per component */
+        const float* x2, const float* y2, const float* z2) { /* second vector, one pointer per component */
+
+    float ret;
+    vec3f_dot(*x1, *y1, *z1, *x2, *y2, *z2, ret); /* presumably writes the dot product into ret - vec3f_dot is defined elsewhere */
+    return (ret < 0) ? 0 : ret;
+}
+
+/* Applies per-vertex lighting to `count` vertices, overwriting each vertex's bgra.
+ * vertices: colours to light (read when GL_COLOR_MATERIAL is on, always written);
+ * es: eye-space position/normal for each vertex, in the same order as vertices. */
+void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count) {
+    int8_t i;
+    int32_t j;
+    const LightSource* light = NULL;
+    const GLboolean colorMaterial = _glIsColorMaterialEnabled();
+    const GLboolean isDiffuseCM = isDiffuseColorMaterial();
+    const GLboolean isAmbientCM = isAmbientColorMaterial();
+    const GLboolean isSpecularCM = isSpecularColorMaterial();
+
+    static GLfloat CM[4];
+     /* So the DC has 16 floating point registers, that means
+     * we need to limit the number of floats as much as possible
+     * to give the compiler a good enough chance to do the right
+     * thing */
+
+    Vertex* vertex = vertices;
+    const EyeSpaceData* data = es;
+
+    static const float ONE_OVER_255 = 1.0f / 255.0f;
+
+    for(j = 0; j < count; ++j, ++vertex, ++data) {
+        /* When GL_COLOR_MATERIAL is on, we need to pull out
+         * the passed in diffuse and use it */
+        const GLfloat* MD = MATERIAL.diffuse;
+        const GLfloat* MA = MATERIAL.ambient;
+        const GLfloat* MS = MATERIAL.specular;
+
+        if(colorMaterial) {
+            CM[0] = ((GLfloat) vertex->bgra[R8IDX]) * ONE_OVER_255;
+            CM[1] = ((GLfloat) vertex->bgra[G8IDX]) * ONE_OVER_255;
+            CM[2] = ((GLfloat) vertex->bgra[B8IDX]) * ONE_OVER_255;
+            CM[3] = ((GLfloat) vertex->bgra[A8IDX]) * ONE_OVER_255;
+
+            MD = (isDiffuseCM) ? CM : MATERIAL.diffuse;
+            MA = (isAmbientCM) ? CM : MATERIAL.ambient;
+            MS = (isSpecularCM) ? CM : MATERIAL.specular;
+        }
+
+        float final[4];
+
+        /* Initial, non-light related values */
+        final[0] = (SCENE_AMBIENT[0] * MA[0]) + MATERIAL.emissive[0];
+        final[1] = (SCENE_AMBIENT[1] * MA[1]) + MATERIAL.emissive[1];
+        final[2] = (SCENE_AMBIENT[2] * MA[2]) + MATERIAL.emissive[2];
+        final[3] = MD[3];
+
+        float Vx, Vy, Vz;
+        Vx = -data->xyz[0];
+        Vy = -data->xyz[1];
+        Vz = -data->xyz[2];
+        vec3f_normalize(Vx, Vy, Vz);
+
+        for(i = 0; i < MAX_LIGHTS; ++i) {
+            if(!_glIsLightEnabled(i)) continue;
+
+            /* Calc light specific parameters */
+            light = &LIGHTS[i];
+            const GLboolean directional = (light->position[3] == 0.0f); /* w == 0: position[] is a direction, not a point */
+            float Lx, Ly, Lz, D;
+            float Hx, Hy, Hz;
+            const float* Nx = &data->n[0];
+            const float* Ny = &data->n[1];
+            const float* Nz = &data->n[2];
+            /* Positional light: L runs from the vertex to the light. Directional light: L is position[] itself. */
+            Lx = directional ? light->position[0] : light->position[0] - data->xyz[0];
+            Ly = directional ? light->position[1] : light->position[1] - data->xyz[1];
+            Lz = directional ? light->position[2] : light->position[2] - data->xyz[2];
+            vec3f_length(Lx, Ly, Lz, D);
+
+            {
+                /* Normalize L - scoping ensures Llen is temporary */
+                const float Llen = 1.0f / D;
+                Lx *= Llen;
+                Ly *= Llen;
+                Lz *= Llen;
+            }
+
+            Hx = (Lx + Vx);
+            Hy = (Ly + Vy);
+            Hz = (Lz + Vz);
+            vec3f_normalize(Hx, Hy, Hz);
+
+            const float LdotN = vec3_dot_limited(
+                &Lx, &Ly, &Lz,
+                Nx, Ny, Nz
+            );
+
+            const float NdotH = vec3_dot_limited(
+                Nx, Ny, Nz,
+                &Hx, &Hy, &Hz
+            );
+
+            /* Directional lights are not attenuated with distance */
+            const float att = directional ? 1.0f : 1.0f / (
+                light->constant_attenuation + (light->linear_attenuation * D) + (light->quadratic_attenuation * D * D)
+            );
+
+            const float spot = 1.0f;
+
+            LIGHT_COMPONENT(0);
+            LIGHT_COMPONENT(1);
+            LIGHT_COMPONENT(2);
+        }
+
+        vertex->bgra[R8IDX] = (GLubyte)(fminf(final[0] * 255.0f, 255.0f));
+        vertex->bgra[G8IDX] = (GLubyte)(fminf(final[1] * 255.0f, 255.0f));
+        vertex->bgra[B8IDX] = (GLubyte)(fminf(final[2] * 255.0f, 255.0f));
+        vertex->bgra[A8IDX] = (GLubyte)(fminf(final[3] * 255.0f, 255.0f));
+    }
+}
+
diff --git a/GL/matrix.c b/GL/matrix.c
index 75b1e56..5d3a629 100644
--- a/GL/matrix.c
+++ b/GL/matrix.c
@@ -476,84 +476,57 @@ void APIENTRY glDepthRange(GLclampf n, GLclampf f) {
     DEPTH_RANGE_MULTIPLIER_H = (n + f) / 2.0f;
 }
 
+#include "sh4_math.h"
+
 /* Vector Cross Product - Used by glhLookAtf2 */
-static inline void vec3f_cross(const GLfloat* v1, const GLfloat* v2, GLfloat* result) {
-    result[0] = v1[1] * v2[2] - v1[2] * v2[1];
-    result[1] = v1[2] * v2[0] - v1[0] * v2[2];
-    result[2] = v1[0] * v2[1] - v1[1] * v2[0];
+static inline void vec3f_cross(const GLfloat* v1, const GLfloat* v2, GLfloat* result) { /* result = v1 x v2; inputs are only read, result must not alias them */
+    result[0] = (v1[1] * v2[2]) - (v1[2] * v2[1]);
+    result[1] = (v1[2] * v2[0]) - (v1[0] * v2[2]);
+    result[2] = (v1[0] * v2[1]) - (v1[1] * v2[0]);
 }
 
-/* glhLookAtf2 adapted from http://www.opengl.org/wiki/GluLookAt_code */
-void glhLookAtf2(const GLfloat* eyePosition3D,
-                 const GLfloat* center3D,
-                 const GLfloat* upVector3D) {
 
-    /* Look-At Matrix */
-    static Matrix4x4 MatrixLookAt __attribute__((aligned(32))) = {
-        1.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 1.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 1.0f
-    };
+static inline void vec3f_normalize_sh4(float *v){
+    float	length, ilength;
 
-    GLfloat forward[3];
-    GLfloat side[3];
-    GLfloat up[3];
-
-    vec3f_sub_normalize(center3D[0], center3D[1], center3D[2],
-                        eyePosition3D[0], eyePosition3D[1], eyePosition3D[2],
-                        forward[0], forward[1], forward[2]);
-
-    //Side = forward x up
-    vec3f_cross(forward, upVector3D, side);
-    vec3f_normalize(side[0], side[1], side[2]);
-
-    //Recompute up as: up = side x forward
-    vec3f_cross(side, forward, up);
-
-    MatrixLookAt[M0] = side[0];
-    MatrixLookAt[M4] = side[1];
-    MatrixLookAt[M8] = side[2];
-    MatrixLookAt[M12] = 0;
-
-    MatrixLookAt[M1] = up[0];
-    MatrixLookAt[M5] = up[1];
-    MatrixLookAt[M9] = up[2];
-    MatrixLookAt[M13] = 0;
-
-    MatrixLookAt[M2] = -forward[0];
-    MatrixLookAt[M6] = -forward[1];
-    MatrixLookAt[M10] = -forward[2];
-    MatrixLookAt[M14] = 0;
-
-    MatrixLookAt[M3] = MatrixLookAt[11] = MatrixLookAt[15] = 0;
-    MatrixLookAt[M15] = 1;
-
-    static Matrix4x4 trn __attribute__((aligned(32))) = {
-        1.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 1.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 1.0f
-    };
-
-    trn[M12] = -eyePosition3D[0];
-    trn[M13] = -eyePosition3D[1];
-    trn[M14] = -eyePosition3D[2];
-
-    // Does not modify internal Modelview matrix
-    upload_matrix(&MatrixLookAt);
-    multiply_matrix(&trn);
-    multiply_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)));
-    download_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)));
+	ilength = MATH_fsrra(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
+	length = MATH_Invert(ilength);
+	if (length)
+	{
+		v[0] *= ilength;
+		v[1] *= ilength;
+		v[2] *= ilength;
+	}
 }
 
 void gluLookAt(GLfloat eyex, GLfloat eyey, GLfloat eyez, GLfloat centerx,
                GLfloat centery, GLfloat centerz, GLfloat upx, GLfloat upy,
                GLfloat upz) {
-    GLfloat eye [] = { eyex, eyey, eyez };
-    GLfloat point [] = { centerx, centery, centerz };
-    GLfloat up [] = { upx, upy, upz };
-    glhLookAtf2(eye, point, up);
+    GLfloat m [16];
+   	GLfloat f [3];
+	GLfloat u [3];
+	GLfloat s [3];
+
+	f[0] = centerx - eyex;
+	f[1] = centery - eyey;
+	f[2] = centerz - eyez;
+
+	u[0] = upx;
+	u[1] = upy;
+	u[2] = upz;
+
+    vec3f_normalize_sh4(f);
+	vec3f_cross(f, u, s);
+    vec3f_normalize_sh4(s);
+	vec3f_cross(s, f, u);
+
+	m[0] =  s[0]; m[4] =  s[1]; m[8] =   s[2]; m[12] = 0.0f;
+	m[1] =  u[0]; m[5] =  u[1]; m[9] =   u[2]; m[13] = 0.0f;
+	m[2] = -f[0]; m[6] = -f[1]; m[10] = -f[2]; m[14] = 0.0f;
+    m[3] =   0.0f; m[7] =   0.0f; m[11] =   0.0f; m[15] = 1.0f;
+
+	glMultMatrixf(m);
+	glTranslatef(-eyex, -eyey, -eyez);
 }
 
 void _glApplyRenderMatrix() {
diff --git a/GL/perfctr.c b/GL/perfctr.c
new file mode 100644
index 0000000..4b6b3bc
--- /dev/null
+++ b/GL/perfctr.c
@@ -0,0 +1,247 @@
+// ---- perfctr.c - SH7091 Performance Counter Module Code ----
+//
+// This file is part of the DreamHAL project, a hardware abstraction library
+// primarily intended for use on the SH7091 found in hardware such as the SEGA
+// Dreamcast game console.
+//
+// The performance counter module is hereby released into the public domain in
+// the hope that it may prove useful. Now go profile some code and hit 60 fps! :)
+//
+// --Moopthehedgehog
+
+// See perfctr.h for more of my notes and documentation on these counters.
+#include "perfctr.h"
+#include "cygprofile.h"
+#if CYG_FUNC_TRACE_ENABLED
+
+static unsigned char pmcr_enabled = 0;
+
+//
+// Initialize performance counters. It's just a clear -> enable.
+// It's good practice to clear a counter before starting it for the first time.
+//
+// Also: Disabling and re-enabling the counters doesn't reset them; the clearing
+// needs to happen while a counter is disabled to reset it.
+//
+// You can disable and re-enable with a different mode without explicitly
+// clearing and have it keep going, continuing from where it left off.
+//
+
+__attribute__((no_instrument_function)) void PMCR_Init(int which, unsigned short mode, unsigned char count_type) // Will do nothing if perfcounter is already running!
+{
+	// Don't do anything if being asked to enable an already-enabled counter
+	// (pmcr_enabled: bit 0 set = counter 1 enabled, bit 1 set = counter 2 enabled; any `which` other than 1, 2 or 3 is ignored)
+	if( (which == 1) && ((!pmcr_enabled) || (pmcr_enabled == 2)) )
+	{
+		// counter 1: clear it and start it
+		PMCR_Enable(1, mode, count_type, PMCR_RESET_COUNTER);
+	}
+	else if( (which == 2) && ((!pmcr_enabled) || (pmcr_enabled == 1)) )
+	{
+		// counter 2: clear it and start it
+		PMCR_Enable(2, mode, count_type, PMCR_RESET_COUNTER);
+	}
+	else if( (which == 3) && (!pmcr_enabled) ) // Both, only when neither counter is enabled yet
+	{
+		PMCR_Enable(3, mode, count_type, PMCR_RESET_COUNTER);
+	}
+}
+
+// Enable "undocumented" performance counters (well, they were undocumented at one point. They're documented now!)
+__attribute__((no_instrument_function)) void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_count) // Will do nothing if perfcounter is already running!
+{
+	// Don't do anything if count_type or reset_count are invalid (i.e. either one is anything other than 0 or 1)
+	if((count_type | reset_count) > 1)
+	{
+		return;
+	}
+
+	// Build config from parameters: run bit, optional reset bit, clock type and counting mode
+	unsigned short pmcr_ctrl = PMCR_RUN_COUNTER | (reset_count << PMCR_RESET_COUNTER_SHIFT) | (count_type << PMCR_CLOCK_TYPE_SHIFT) | mode;
+
+	// Don't do anything if being asked to enable an already-enabled counter
+	if( (which == 1) && ((!pmcr_enabled) || (pmcr_enabled == 2)) )
+	{
+		// counter 1
+		*((volatile unsigned short*)PMCR1_CTRL_REG) = pmcr_ctrl;
+
+		pmcr_enabled += 1; // sets bit 0; the condition above guarantees it was clear
+	}
+	else if( (which == 2) && ((!pmcr_enabled) || (pmcr_enabled == 1)) )
+	{
+		// counter 2
+		*((volatile unsigned short*)PMCR2_CTRL_REG) = pmcr_ctrl;
+
+		pmcr_enabled += 2; // sets bit 1; the condition above guarantees it was clear
+	}
+	else if( (which == 3) && (!pmcr_enabled) )
+	{
+		// Both
+		*((volatile unsigned short*)PMCR1_CTRL_REG) = pmcr_ctrl;
+		*((volatile unsigned short*)PMCR2_CTRL_REG) = pmcr_ctrl;
+
+		pmcr_enabled = 3;
+	}
+}
+
+// For reference:
+// #define PMCTR1H_REG 0xFF100004
+// #define PMCTR1L_REG 0xFF100008
+
+// #define PMCTR2H_REG 0xFF10000C
+// #define PMCTR2L_REG 0xFF100010
+
+static const unsigned int pmcr1_regh = PMCTR1H_REG;
+static const unsigned int pmcr1_regl = PMCTR1L_REG;
+
+static const unsigned int pmcr2_regh = PMCTR2H_REG;
+static const unsigned int pmcr2_regl = PMCTR2L_REG;
+
+// Sorry, can only read one counter at a time!
+// out_array should be an array consisting of 2x unsigned ints.
+__attribute__((no_instrument_function)) void PMCR_Read(int which, volatile unsigned int *out_array)
+{
+ // If no counter is enabled, both words are set to 0; if `which` is not an enabled counter while another one is, they are set to 0xffff / 0xffffffff.
+
+	// little endian (big endian would need to flip [0] and [1])
+
+	// Note: These reads really do need to be done in assembly: unfortunately it
+	// appears that using C causes GCC to insert a branch right smack in between
+	// the high and low reads of perf counter 2 (with a nop, so it's literally
+	// delaying the reads by several cycles!), which is totally insane. Doing it
+	// the assembly way ensures that nothing ridiculous like that happens. It's
+	// also portable between versions of GCC that do put the nonsensical branch in.
+	//
+	// One thing that would be nice is if SH4 had the movi20s instruction to make
+	// absolute addresses in 3 cycles, but only the SH2A has that... :(
+	if( (which == 1) && (pmcr_enabled & 0x1) )
+	{
+		// counter 1: out_array[1] receives the high word (masked to 16 bits), out_array[0] the low word
+//		out_array[1] = *((volatile unsigned int*)PMCTR1H_REG) & 0xffff;
+//		out_array[0] = *((volatile unsigned int*)PMCTR1L_REG);
+		asm volatile("mov.l %[reg1h],r1\n\t" // load counter address (high)
+								 "mov.l %[reg1l],r2\n\t" // load counter address (low)
+								 "mov.l @r1,r1\n\t" // read counter (high)
+								 "mov.l @r2,r2\n\t" // read counter (low)
+								 "extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff
+								 "mov.l r1,%[outh]\n\t" // get data to memory
+								 "mov.l r2,%[outl]\n\t" // get data to memory
+		: [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0])
+		: [reg1h] "m" (pmcr1_regh), [reg1l] "m" (pmcr1_regl) // SH4 can't mov an immediate longword into a register...
+		: "r1", "r2"
+		);
+	}
+	else if( (which == 2) && (pmcr_enabled & 0x2) )
+	{
+		// counter 2: same sequence as counter 1, using the counter 2 register addresses
+//		out_array[1] = *((volatile unsigned int*)PMCTR2H_REG) & 0xffff;
+//		out_array[0] = *((volatile unsigned int*)PMCTR2L_REG);
+		asm volatile("mov.l %[reg2h],r1\n\t" // load counter address (high)
+								 "mov.l %[reg2l],r2\n\t" // load counter address (low)
+								 "mov.l @r1,r1\n\t" // read counter (high)
+								 "mov.l @r2,r2\n\t" // read counter (low)
+								 "extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff
+								 "mov.l r1,%[outh]\n\t" // get data to memory
+								 "mov.l r2,%[outl]\n\t" // get data to memory
+		: [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0])
+		: [reg2h] "m" (pmcr2_regh), [reg2l] "m" (pmcr2_regl) // SH4 can't mov an immediate longword into a register...
+		: "r1", "r2"
+		);
+	}
+	else if(!pmcr_enabled) // no counter enabled at all
+	{
+		out_array[1] = 0;
+		out_array[0] = 0;
+	}
+	else // Invalid: some counter is enabled, but not the one asked for (or `which` is not 1 or 2)
+	{
+		out_array[1] = 0xffff;
+		out_array[0] = 0xffffffff;
+	}
+}
+
+// Reset counter to 0 and start it again. Does nothing unless the requested counter(s) are currently enabled.
+// NOTE: It does not appear to be possible to clear a counter while it is running.
+__attribute__((no_instrument_function)) void PMCR_Restart(int which, unsigned short mode, unsigned char count_type)
+{
+	if( (which == 1) && (pmcr_enabled & 0x1) )
+ 	{
+ 		// counter 1: stop first, then re-enable with the reset flag
+		PMCR_Stop(1);
+		PMCR_Enable(1, mode, count_type, PMCR_RESET_COUNTER);
+ 	}
+	else if( (which == 2) && (pmcr_enabled & 0x2) )
+ 	{
+ 		// counter 2: stop first, then re-enable with the reset flag
+		PMCR_Stop(2);
+		PMCR_Enable(2, mode, count_type, PMCR_RESET_COUNTER);
+ 	}
+	else if( (which == 3) && (pmcr_enabled == 3) )
+ 	{
+		// Both, only when both are currently enabled
+		PMCR_Stop(3);
+		PMCR_Enable(3, mode, count_type, PMCR_RESET_COUNTER);
+ 	}
+}
+
+// Clearing only works when the counter is disabled. Otherwise, stopping the
+// counter via setting the 0x2000 bit holds the data in the data registers,
+// whereas disabling without setting that bit reads back as all 0 (but doesn't
+// clear the counters for next start). This function just stops a running
+// counter and does nothing if the counter is already stopped or disabled, as
+// clearing is handled by PMCR_Enable().
+__attribute__((no_instrument_function)) void PMCR_Stop(int which)
+{
+	if( (which == 1) && (pmcr_enabled & 0x1) )
+	{
+		// counter 1
+		*((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_STOP_COUNTER;
+
+		pmcr_enabled &= 0x2; // clear bit 0, keep counter 2's bit
+	}
+	else if( (which == 2) && (pmcr_enabled & 0x2) )
+	{
+		// counter 2
+		*((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_STOP_COUNTER;
+
+		pmcr_enabled &= 0x1; // clear bit 1, keep counter 1's bit
+	}
+	else if( (which == 3) && (pmcr_enabled == 3) )
+	{
+		// Both, only when both are currently enabled
+		*((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_STOP_COUNTER;
+		*((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_STOP_COUNTER;
+
+		pmcr_enabled = 0;
+	}
+}
+
+// Note that disabling does NOT clear the counter.
+// It may appear that way because reading a disabled counter returns 0, but re-
+// enabling without first clearing will simply continue where it left off.
+__attribute__((no_instrument_function)) void PMCR_Disable(int which)
+{
+	if(which == 1) // unlike PMCR_Stop(), the register is written regardless of pmcr_enabled
+	{
+		// counter 1
+		*((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_DISABLE_COUNTER;
+
+		pmcr_enabled &= 0x2; // clear bit 0, keep counter 2's bit
+	}
+	else if(which == 2)
+	{
+		// counter 2
+		*((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_DISABLE_COUNTER;
+
+		pmcr_enabled &= 0x1; // clear bit 1, keep counter 1's bit
+	}
+	else if(which == 3)
+	{
+		// Both
+		*((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_DISABLE_COUNTER;
+		*((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_DISABLE_COUNTER;
+
+		pmcr_enabled = 0;
+	}
+}
+#endif
diff --git a/GL/perfctr.h b/GL/perfctr.h
new file mode 100644
index 0000000..3cd7467
--- /dev/null
+++ b/GL/perfctr.h
@@ -0,0 +1,316 @@
+// ---- perfctr.h - SH7091 Performance Counter Module Header ----
+//
+// This file is part of the DreamHAL project, a hardware abstraction library
+// primarily intended for use on the SH7091 found in hardware such as the SEGA
+// Dreamcast game console.
+//
+// The performance counter module is hereby released into the public domain in
+// the hope that it may prove useful. Now go profile some code and hit 60 fps! :)
+//
+// --Moopthehedgehog
+//
+
+#ifndef __PERFCTR_H__
+#define __PERFCTR_H__
+
+//
+// -- General SH4 Performance Counter Notes --
+//
+// There are 2 performance counters that can measure elapsed time. They are each
+// 48-bit counters. They are part of the so-called "ASE" subsystem, which you can
+// read about in chapter 13 of the "SuperH™ (SH) 32-bit RISC series SH-4, ST40
+// system architecture, volume 1: system":
+// https://www.st.com/content/ccc/resource/technical/document/user_manual/36/75/05/ac/e8/7e/42/2d/CD00147163.pdf/files/CD00147163.pdf/jcr:content/translations/en.CD00147163.pdf
+//
+// They can count cycles, so that's 199.5MHz (not 200MHz!!) a.k.a. roughly 5 ns
+// increments. At 5 ns increments, a 48-bit cycle counter can run continuously
+// for 16.33 days. It's actually 16 days, 7 hours, 55 minutes, and 2 seconds,
+// depending on how close the bus clock is to 99.75MHz. There is also a second
+// mode that counts cycles according to a ratio between the CPU frequency and
+// the system bus clock, and it increments the counter by 12 every bus cycle.
+// This second mode is detailed in the description for PMCR_CLOCK_TYPE in this
+// file, and it is recommended for use when the CPU frequency is not a runtime
+// constant.
+//
+// Side note: The counters don't have an overflow interrupt or overflow bit.
+// (I did actually run one to 48-bit overflow in elapsed time mode using the
+// ratio method to check this. They don't appear to sign-extend the upper 16
+// bits in elapsed time mode, either.)
+//
+// The two counters are functionally identical. I would recommend using the
+// PMCR_Init() function to start one (or both) up the first time.
+//
+// -- Configuration Address Info --
+//
+// Addresses for these counters can be easily seen here, in lxdream's source code:
+// https://github.com/lutris/lxdream/blob/master/src/sh4/sh4mmio.h
+//
+// They are also on display in the Linux kernel, but at the time of writing appear
+// to be set incorrectly (the clock mode at bit 0x100 is never set or cleared,
+// for example, so they're at the mercy of whatever the hardware defaults are):
+// http://git.lpclinux.com/cgit/linux-2.6.28.2-lpc313x/plain/arch/sh/oprofile/op_model_sh7750.c
+// https://github.com/torvalds/linux/blob/master/arch/sh/kernel/cpu/sh4/perf_event.c
+// ...It also appears as though they may not be handling bus ratio mode correctly,
+// which appears to be the default mode on the Dreamcast in all my tests.
+//
+// You can also find these addresses by ripping a copy of Virtua Fighter 3 that
+// you own for Dreamcast and looking at the raw byte code (or a raw disassembly)
+// of its main program binary. It would appear as though they were timing a loop
+// with the low half of perf counter 1 in elapsed time mode. Definitely seems
+// like a good thing to do when targeting 60fps! Shenmue Disc 4 also uses the
+// same configuration, but what's being timed is not as clear.
+//
+// Another place you can actually find both control addresses 0xFF00008x and all
+// data addresses 0xFF10000x is in binaries of ancient, freely available versions
+// of CodeScape. Literally all you need to do is open an SH7750-related DLL in a
+// hex editor and do a search to find the control register addresses, and the
+// data addresses are equally plain to see in any relevant performance profiling
+// firmware. There's no effort or decryption required to find them whatsoever;
+// all you need is an old trial version and a hex editor.
+//
+// However, something even better than all of that is if you search for "SH4
+// 0xFF000084" (without quotes) online you'll find an old forum where some logs
+// were posted of the terminal/command prompt output from some STMicro JTAG tool,
+// which not only has the address registers but also clearly characterizes their
+// size as 16-bit:
+// https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2
+//
+// -- Event Mode Info --
+//
+// Specific information on each counter mode can be found in the document titled
+// "SuperH™ Family E10A-USB Emulator: Additional Document for User’s Manual:
+// Supplementary Information on Using the SH7750R Renesas Microcomputer Development Environment System"
+// which is available on Renesas's website, in the "Documents" section of the
+// E10A-USB product page:
+// https://www.renesas.com/us/en/products/software-tools/tools/emulator/e10a-usb.html
+// At the time of writing (12/2019), the E10A-USB adapter is still available
+// for purchase, and it is priced around $1200 (USD).
+//
+// Appendix C of the "ST40 Micro Toolset Manual" also has these modes documented:
+// https://www.st.com/content/ccc/resource/technical/document/user_manual/c5/98/11/89/50/68/41/66/CD17379953.pdf/files/CD17379953.pdf/jcr:content/translations/en.CD17379953.pdf
+//
+// See here for the hexadecimal values corresponding to each mode (pg. 370):
+// http://www.macmadigan.com/BusaECU/Renesas%20documents/Hitachi_codescape_CS40_light_userguides.pdf
+// You can also find the same "Counter Description Table" in user's guide PDFs
+// bundled in ancient demo versions of CodeScape 3 from 2000 (e.g.
+// CSDemo_272.exe), which can still be found in the Internet Archive.
+// http://web.archive.org/web/*/http://codescape.com/dl/CSDemo/*
+//
+// See here for a support document on Lauterbach's SH2, SH3, and SH4 debugger,
+// which contains units for each mode (e.g. which measure time and which just
+// count): https://www.lauterbach.com/frames.html?home.html (It's in Downloads
+// -> Trace32 Help System -> it's the file called "SH2, SH3 and SH4 Debugger"
+// with the filename debugger_sh4.pdf).
+//
+
+//
+// --- Performance Counter Registers ---
+//
+
+// These registers are 16 bits only and configure the performance counters
+#define PMCR1_CTRL_REG 0xFF000084
+#define PMCR2_CTRL_REG 0xFF000088
+
+// These registers are 32 bits each and hold the high and low parts of each counter
+#define PMCTR1H_REG 0xFF100004
+#define PMCTR1L_REG 0xFF100008
+
+#define PMCTR2H_REG 0xFF10000C
+#define PMCTR2L_REG 0xFF100010
+
+//
+// --- Performance Counter Configuration Flags ---
+//
+
+// These bits' functions are currently unknown, but they may simply be reserved.
+// It's possible that there's a [maybe expired?] patent that details the
+// configuration registers, though I haven't been able to find one. Places to
+// check would be Google Patents and the Japanese Patent Office--maybe someone
+// else can find something?
+//
+// Some notes:
+// Writing 1 to all of these bits reads back as 0, so it looks like they aren't
+// config bits. It's possible they are write-only like the stop bit, though,
+// or that they're just reserved-write-0-only. It appears that they are always
+// written with zeros in software that uses them, so that's confirmed safe to do.
+//
+// Also, after running counter 1 to overflow, it appears there's no overflow bit
+// (maybe the designers thought 48-bits would be so much to count to that they
+// didn't bother implementing one?). The upper 16-bits of the counter high
+// register are also not sign-extension bits. They may be a hidden config area,
+// but probably not because big endian mode would swap the byte order.
+#define PMCR_UNKNOWN_BIT_0040 0x0040
+#define PMCR_UNKNOWN_BIT_0080 0x0080
+#define PMCR_UNKNOWN_BIT_0200 0x0200
+#define PMCR_UNKNOWN_BIT_0400 0x0400
+#define PMCR_UNKNOWN_BIT_0800 0x0800
+#define PMCR_UNKNOWN_BIT_1000 0x1000
+
+// PMCR_MODE_CLEAR_INVERTED just clears the event mode if it's inverted with
+// '~', and event modes are listed below.
+#define PMCR_MODE_CLEAR_INVERTED 0x003f
+
+// PMCR_CLOCK_TYPE sets the counters to count clock cycles or CPU/bus ratio mode
+// cycles (where T = C x B / 24 and T is time, C is count, and B is time
+// of one bus cycle). Note: B = 1/99753008 or so, but it may vary, as mine is
+// actually 1/99749010-ish; the target frequency is probably meant to be 99.75MHz.
+//
+// See the ST40 or Renesas SH7750R documents described in the above "Event Mode
+// Info" section for more details about that formula.
+//
+// Set PMCR_CLOCK_TYPE to 0 for CPU cycle counting, where 1 count = 1 cycle, or
+// set it to 1 to use the above formula. Renesas documentation recommends using
+// the ratio version (set the bit to 1) when user programs alter CPU clock
+// frequencies. This header has some definitions later on to help with this.
+#define PMCR_CLOCK_TYPE 0x0100
+#define PMCR_CLOCK_TYPE_SHIFT 8
+
+// PMCR_STOP_COUNTER is write-only, as it always reads back as 0. It does what
+// the name suggests: when this bit is written to, the counter stops. However,
+// if written to while the counter is disabled or stopped, the counter's high
+// and low registers are reset to 0.
+//
+// Using PMCR_STOP_COUNTER to stop the counter has the effect of holding the
+// data in the data registers while stopped, unlike PMCR_DISABLE_COUNTER, and
+// this bit needs to be written to again (e.g. on next start) in order to
+// actually clear the counter data for another run. If not explicitly cleared,
+// the counter will continue from where it left off before being stopped.
+#define PMCR_STOP_COUNTER 0x2000
+#define PMCR_RESET_COUNTER_SHIFT 13
+
+// Bits 0xC000 both need to be set to 1 for the counters to actually begin
+// counting. I have seen that the Linux kernel actually separates them out into
+// two separate labelled bits (PMEN and PMST) for some reason, however they do
+// not appear to do anything separately. Perhaps this is a two-bit mode where
+// 1-1 is run, 1-0 and 0-1 are ???, and 0-0 is off.
+#define PMCR_RUN_COUNTER 0xC000
+#define PMCR_RUN_SHIFT 14
+// Interestingly, the output here writes 0x6000 to the counter config registers,
+// which would be the "PMST" bit and the "RESET" bit:
+// https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2
+
+// To disable a counter, just write 0 to its config register. This will not
+// reset the counter to 0, as that requires an explicit clear via setting the
+// PMCR_STOP_COUNTER bit. What's odd is that a disabled counter's data
+// registers read back as all 0, but re-enabling it without a clear will
+// continue from the last value before disabling.
+#define PMCR_DISABLE_COUNTER 0x0000
+
+// These definitions merely separate out the two PMCR_RUN_COUNTER bits, and
+// they are included here for documentation purposes.
+
+// PMST may mean PMCR START. It's consistently used to enable the counter.
+// I'm just calling it PMST here for lack of a better name, since this is what
+// the Linux kernel and lxdream call it. It could also have something to do with
+// a mode specific to STMicroelectronics.
+#define PMCR_PMST_BIT 0x4000
+#define PMCR_PMST_SHIFT 14
+
+// Likewise PMEN may mean PMCR ENABLE
+#define PMCR_PMEN_BIT 0x8000
+#define PMCR_PMEN_SHIFT 15
+
+//
+// --- Performance Counter Event Code Definitions ---
+//
+// Interestingly enough, it so happens that the SEGA Dreamcast's CPU seems to
+// contain the same performance counter functionality as SH4 debug adapters for
+// the SH7750R. Awesome!
+//
+
+//                MODE DEFINITION                  VALUE   MEASUREMENT TYPE & NOTES
+#define PMCR_INIT_NO_MODE                           0x00 // None; Just here to be complete
+#define PMCR_OPERAND_READ_ACCESS_MODE               0x01 // Quantity; With cache
+#define PMCR_OPERAND_WRITE_ACCESS_MODE              0x02 // Quantity; With cache
+#define PMCR_UTLB_MISS_MODE                         0x03 // Quantity
+#define PMCR_OPERAND_CACHE_READ_MISS_MODE           0x04 // Quantity
+#define PMCR_OPERAND_CACHE_WRITE_MISS_MODE          0x05 // Quantity
+#define PMCR_INSTRUCTION_FETCH_MODE                 0x06 // Quantity; With cache
+#define PMCR_INSTRUCTION_TLB_MISS_MODE              0x07 // Quantity
+#define PMCR_INSTRUCTION_CACHE_MISS_MODE            0x08 // Quantity
+#define PMCR_ALL_OPERAND_ACCESS_MODE                0x09 // Quantity
+#define PMCR_ALL_INSTRUCTION_FETCH_MODE             0x0a // Quantity
+#define PMCR_ON_CHIP_RAM_OPERAND_ACCESS_MODE        0x0b // Quantity
+// No 0x0c
+#define PMCR_ON_CHIP_IO_ACCESS_MODE                 0x0d // Quantity
+#define PMCR_OPERAND_ACCESS_MODE                    0x0e // Quantity; With cache, counts both reads and writes
+#define PMCR_OPERAND_CACHE_MISS_MODE                0x0f // Quantity
+#define PMCR_BRANCH_ISSUED_MODE                     0x10 // Quantity; Not the same as branch taken!
+#define PMCR_BRANCH_TAKEN_MODE                      0x11 // Quantity
+#define PMCR_SUBROUTINE_ISSUED_MODE                 0x12 // Quantity; Issued a BSR, BSRF, JSR, JSR/N
+#define PMCR_INSTRUCTION_ISSUED_MODE                0x13 // Quantity
+#define PMCR_PARALLEL_INSTRUCTION_ISSUED_MODE       0x14 // Quantity
+#define PMCR_FPU_INSTRUCTION_ISSUED_MODE            0x15 // Quantity
+#define PMCR_INTERRUPT_COUNTER_MODE                 0x16 // Quantity
+#define PMCR_NMI_COUNTER_MODE                       0x17 // Quantity
+#define PMCR_TRAPA_INSTRUCTION_COUNTER_MODE         0x18 // Quantity
+#define PMCR_UBC_A_MATCH_MODE                       0x19 // Quantity
+#define PMCR_UBC_B_MATCH_MODE                       0x1a // Quantity
+// No 0x1b-0x20
+#define PMCR_INSTRUCTION_CACHE_FILL_MODE            0x21 // Cycles
+#define PMCR_OPERAND_CACHE_FILL_MODE                0x22 // Cycles
+#define PMCR_ELAPSED_TIME_MODE                      0x23 // Cycles; For 200MHz CPU: 5ns per count in 1 cycle = 1 count mode, or around 417.715ps per count (increments by 12) in CPU/bus ratio mode
+#define PMCR_PIPELINE_FREEZE_BY_ICACHE_MISS_MODE    0x24 // Cycles
+#define PMCR_PIPELINE_FREEZE_BY_DCACHE_MISS_MODE    0x25 // Cycles
+// No 0x26
+#define PMCR_PIPELINE_FREEZE_BY_BRANCH_MODE         0x27 // Cycles
+#define PMCR_PIPELINE_FREEZE_BY_CPU_REGISTER_MODE   0x28 // Cycles
+#define PMCR_PIPELINE_FREEZE_BY_FPU_MODE            0x29 // Cycles
+
+//
+// --- Performance Counter Support Definitions ---
+//
+
+// This definition can be passed as the init/enable/restart functions'
+// count_type parameter to use the 1 cycle = 1 count mode. This is how the
+// counter can be made to run for 16.3 days.
+#define PMCR_COUNT_CPU_CYCLES 0
+// Likewise this uses the CPU/bus ratio method
+#define PMCR_COUNT_RATIO_CYCLES 1
+
+// These definitions are for the enable function and specify whether to reset
+// a counter to 0 or to continue from where it left off
+#define PMCR_CONTINUE_COUNTER 0
+#define PMCR_RESET_COUNTER 1
+
+//
+// --- Performance Counter Miscellaneous Definitions ---
+//
+// For convenience; assume stock bus clock of 99.75MHz
+// (Bus clock is the external CPU clock, not the peripheral bus clock)
+//
+
+#define PMCR_SH4_CPU_FREQUENCY 199500000
+#define PMCR_CPU_CYCLES_MAX_SECONDS 1410902
+#define PMCR_SH4_BUS_FREQUENCY 99750000
+#define PMCR_SH4_BUS_FREQUENCY_SCALED 2394000000 // 99.75MHz x 24
+#define PMCR_BUS_RATIO_MAX_SECONDS 117575
+
+//
+// --- Performance Counter Functions ---
+//
+// See perfctr.c file for more details about each function and some more usage notes.
+//
+// Note: PMCR_Init() and PMCR_Enable() will do nothing if the perf counter is already running!
+//
+
+// Clear counter and enable
+void PMCR_Init(int which, unsigned short mode, unsigned char count_type);
+
+// Enable one or both of these "undocumented" performance counters.
+void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_counter);
+
+// Disable, clear, and re-enable with new mode (or same mode)
+void PMCR_Restart(int which, unsigned short mode, unsigned char count_type);
+
+// Read a counter
+// out_array is specifically uint32 out_array[2] -- 48-bit value needs a 64-bit storage unit
+void PMCR_Read(int which, volatile unsigned int *out_array);
+
+// Stop counter(s) (without clearing)
+void PMCR_Stop(int which);
+
+// Disable counter(s) (without clearing)
+void PMCR_Disable(int which);
+
+#endif /* __PERFCTR_H__ */
diff --git a/GL/private.h b/GL/private.h
index 6987723..344de6b 100644
--- a/GL/private.h
+++ b/GL/private.h
@@ -6,6 +6,7 @@
 #include "../include/gl.h"
 #include "../containers/aligned_vector.h"
 #include "../containers/named_array.h"
+#include "cygprofile.h"
 
 extern void* memcpy4 (void *dest, const void *src, size_t count);
 
@@ -249,6 +250,11 @@ typedef struct {
     GLint size;
 } AttribPointer;
 
+typedef struct {
+    float xyz[3];
+    float n[3];
+} EyeSpaceData;
+
 GLboolean _glCheckValidEnum(GLint param, GLint* values, const char* func);
 
 GLuint* _glGetEnabledAttributes();
@@ -280,7 +286,7 @@ GLuint _glGetMipmapLevelCount(TextureObject* obj);
 GLboolean _glIsLightingEnabled();
 GLboolean _glIsLightEnabled(GLubyte light);
 GLboolean _glIsColorMaterialEnabled();
-void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour);
+void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count);
 
 unsigned char _glIsClippingEnabled();
 void _glEnableClipping(unsigned char v);
diff --git a/GL/profiler.c b/GL/profiler.c
index 968bc9f..c74d84a 100644
--- a/GL/profiler.c
+++ b/GL/profiler.c
@@ -6,6 +6,8 @@
 #include "profiler.h"
 #include "../containers/aligned_vector.h"
 
+#if PROFILING_COMPILED
+
 #define MAX_PATH 256
 
 typedef struct {
@@ -141,3 +143,4 @@ void profiler_print_stats() {
         fprintf(stderr, "%-60s%-20f%-20f%" PRIu64 "\n", result->name, (double)avg, (double)ms, result->total_calls);
     }
 }
+#endif
diff --git a/GL/profiler.h b/GL/profiler.h
index acaf8bf..acf07ed 100644
--- a/GL/profiler.h
+++ b/GL/profiler.h
@@ -7,12 +7,26 @@ typedef struct {
     uint64_t start_time_in_us;
 } Profiler;
 
+#define PROFILING_COMPILED 0
 
+#if PROFILING_COMPILED
 Profiler* profiler_push(const char* name);
-void profiler_checkpoint(const char* name);
-void profiler_pop();
+void _profiler_checkpoint(const char* name);
+void _profiler_pop();
 
-void profiler_print_stats();
+void _profiler_print_stats();
 
-void profiler_enable();
-void profiler_disable();
+void _profiler_enable();
+void _profiler_disable();
+
+#else /* profiling compiled out: every call becomes a no-op expression, safe in unbraced if/else */
+#define profiler_push(name) ((void)0)
+#define profiler_checkpoint(name) ((void)0)
+#define profiler_pop() ((void)0)
+
+#define profiler_print_stats() ((void)0)
+
+#define profiler_enable() ((void)0)
+#define profiler_disable() ((void)0)
+
+#endif
diff --git a/GL/sh4_math.h b/GL/sh4_math.h
new file mode 100644
index 0000000..ad1cd7e
--- /dev/null
+++ b/GL/sh4_math.h
@@ -0,0 +1,1448 @@
+// ---- sh4_math.h - SH7091 Math Module ----
+//
+// Version 1.0.3
+//
+// This file is part of the DreamHAL project, a hardware abstraction library
+// primarily intended for use on the SH7091 found in hardware such as the SEGA
+// Dreamcast game console.
+//
+// This math module is hereby released into the public domain in the hope that it
+// may prove useful. Now go hit 60 fps! :)
+//
+// --Moopthehedgehog
+//
+
+// Notes:
+// - GCC 4 users have a different return type for the fsca functions due to an
+//  internal compiler error regarding complex numbers; no issue under GCC 9.2.0
+// - Using -m4 instead of -m4-single-only completely breaks the matrix and
+//  vector operations
+// - Function inlining must be enabled and not blocked by compiler options such
+//  as -ffunction-sections, as blocking inlining will result in significant
+//  performance degradation for the vector and matrix functions employing a
+//  RETURN_VECTOR_STRUCT return type. I have added compiler hints and attributes
+//  "static inline __attribute__((always_inline))" to mitigate this, so in most
+//  cases the functions should be inlined regardless. If in doubt, check the
+//  compiler asm output!
+//
+
+#ifndef __SH4_MATH_H_
+#define __SH4_MATH_H_
+
+#define GNUC_FSCA_ERROR_VERSION 4
+
+//
+// Fast SH4 hardware math functions
+//
+//
+// High-accuracy users beware, the fsrra functions have an error of +/- 2^-21
+// per http://www.shared-ptr.com/sh_insns.html
+//
+
+//==============================================================================
+// Definitions
+//==============================================================================
+//
+// Structures, useful definitions, and reference comments
+//
+
+// Front matrix format:
+//
+//    FV0 FV4 FV8  FV12
+//    --- --- ---  ----
+//  [ fr0 fr4 fr8  fr12 ]
+//  [ fr1 fr5 fr9  fr13 ]
+//  [ fr2 fr6 fr10 fr14 ]
+//  [ fr3 fr7 fr11 fr15 ]
+//
+// Back matrix, XMTRX, is similar, although it has no FVn vector groups:
+//
+//  [ xf0 xf4 xf8  xf12 ]
+//  [ xf1 xf5 xf9  xf13 ]
+//  [ xf2 xf6 xf10 xf14 ]
+//  [ xf3 xf7 xf11 xf15 ]
+//
+
+typedef struct __attribute__((aligned(32))) {
+  float fr0;
+  float fr1;
+  float fr2;
+  float fr3;
+  float fr4;
+  float fr5;
+  float fr6;
+  float fr7;
+  float fr8;
+  float fr9;
+  float fr10;
+  float fr11;
+  float fr12;
+  float fr13;
+  float fr14;
+  float fr15;
+} ALL_FLOATS_STRUCT;
+
+// Return structs should be defined locally so that GCC optimizes them into
+// register usage instead of memory accesses.
+typedef struct {
+  float z1;
+  float z2;
+  float z3;
+  float z4;
+} RETURN_VECTOR_STRUCT;
+
+#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
+typedef struct {
+  float sine;
+  float cosine;
+} RETURN_FSCA_STRUCT;
+#endif
+
+// Identity Matrix
+//
+//    FV0 FV4 FV8 FV12
+//    --- --- --- ----
+//  [  1   0   0   0  ]
+//  [  0   1   0   0  ]
+//  [  0   0   1   0  ]
+//  [  0   0   0   1  ]
+//
+
+static const ALL_FLOATS_STRUCT identity_matrix = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f};
+
+//==============================================================================
+// Basic math functions
+//==============================================================================
+//
+// The following functions are available.
+// Please see their definitions for other usage info, otherwise they may not
+// work for you.
+//
+/*
+  // |x|
+  float MATH_fabs(float x)
+
+  // sqrt(x)
+  float MATH_fsqrt(float x)
+
+  // a*b+c
+  float MATH_fmac(float a, float b, float c)
+
+  // a*b-c
+  float MATH_fmac_Dec(float a, float b, float c)
+*/
+
+// |x|
+// This one works on ARM and x86, too!
+static inline __attribute__((always_inline)) float MATH_fabs(float x)
+{
+  asm volatile ("FABS %[floatx]\n"
+    : [floatx] "+f" (x) // outputs, "+" means r/w
+    : // no inputs
+    : // no clobbers
+  );
+
+  return x;
+}
+
+// sqrt(x)
+// This one works on ARM and x86, too!
+// NOTE: There is a much faster version (MATH_Fast_Sqrt()) in the fsrra section of
+// this file. Chances are you probably want that one.
+static inline __attribute__((always_inline)) float MATH_fsqrt(float x)
+{
+  asm volatile ("fsqrt %[floatx]\n"
+    : [floatx] "+f" (x) // outputs, "+" means r/w
+    : // no inputs
+    : // no clobbers
+  );
+
+  return x;
+}
+
+// a*b+c in a single fmac; the asm names fr0 directly, so 'a' is bound to it via "w"
+static inline __attribute__((always_inline)) float MATH_fmac(float a, float b, float c)
+{
+  asm volatile ("fmac fr0, %[floatb], %[floatc]\n"
+    : [floatc] "+f" (c) // outputs, "+" means r/w
+    : "w" (a), [floatb] "f" (b) // inputs, "w" means fr0
+    : // no clobbers
+  );
+
+  return c;
+}
+
+// a*b-c: c is negated in place first, then fmac adds a*b (a in fr0) onto it
+static inline __attribute__((always_inline)) float MATH_fmac_Dec(float a, float b, float c)
+{
+  asm volatile ("fneg %[floatc]\n\t"
+    "fmac fr0, %[floatb], %[floatc]\n"
+    : [floatc] "+&f" (c) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
+    : "w" (a), [floatb] "f" (b) // inputs, "w" means fr0
+    : // no clobbers
+  );
+
+  return c;
+}
+
+//==============================================================================
+// Fun with fsrra, which does 1/sqrt(x) in one cycle
+//==============================================================================
+//
+// Error is +/- 2^-21 per http://www.shared-ptr.com/sh_insns.html
+//
+// The following functions are available.
+// Please see their definitions for other usage info, otherwise they may not
+// work for you.
+//
+/*
+  // 1/x
+  float MATH_Invert(float x)
+
+  // 1/sqrt(x)
+  float MATH_fsrra(float x)
+
+  // A faster divide than the 'fdiv' instruction
+  float MATH_Fast_Divide(float numerator, float denominator)
+
+  // A faster square root than the 'fsqrt' instruction
+  float MATH_Fast_Sqrt(float x)
+*/
+
+// 1/x, for x > 0 only
+// Computed as (1.0f / sqrt(x)) ^ 2, so the sign of x cannot survive the squaring.
+// This is about 3x faster than fdiv!
+static inline __attribute__((always_inline)) float MATH_Invert(float x)
+{
+  asm volatile ("fsrra %[one_div_sqrt]\n\t"
+  "fmul %[one_div_sqrt], %[one_div_sqrt]\n"
+  : [one_div_sqrt] "+f" (x) // outputs, "+" means r/w
+  : // no inputs
+  : // no clobbers
+  );
+
+  return x;
+}
+
+// 1/sqrt(x)
+static inline __attribute__((always_inline)) float MATH_fsrra(float x)
+{
+  asm volatile ("fsrra %[one_div_sqrt]\n"
+  : [one_div_sqrt] "+f" (x) // outputs, "+" means r/w
+  : // no inputs
+  : // no clobbers
+  );
+
+  return x;
+}
+
+// It's faster to do this than to do an fdiv: about half the cycles (~7 vs ~14).
+// Only fdiv can do doubles, and this needs denominator > 0 (its sign is squared away).
+// Of course, not having to divide at all is generally the best way to go. :P
+static inline __attribute__((always_inline)) float MATH_Fast_Divide(float numerator, float denominator)
+{
+  asm volatile ("fsrra %[div_denom]\n\t"
+  "fmul %[div_denom], %[div_denom]\n\t"
+  "fmul %[div_numer], %[div_denom]\n"
+  : [div_denom] "+&f" (denominator) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
+  : [div_numer] "f" (numerator) // inputs
+  : // clobbers
+  );
+
+  return denominator;
+}
+
+// fast sqrt(x), built as 1 / (1/sqrt(x)) out of MATH_fsrra() and MATH_Invert()
+// Crazy thing: invert(fsrra(x)) is actually about 3x faster than fsqrt.
+// Error is +/- 2^-21 per http://www.shared-ptr.com/sh_insns.html
+static inline __attribute__((always_inline)) float MATH_Fast_Sqrt(float x)
+{
+  return MATH_Invert(MATH_fsrra(x));
+}
+
+//==============================================================================
+// Fun with fsca, which does simultaneous sine and cosine in 3 cycles
+//==============================================================================
+//
+// NOTE: GCC 4.7 has a bug that prevents it from working with fsca and complex
+// numbers in m4-single-only mode, so GCC 4 users will get a RETURN_FSCA_STRUCT
+// instead of a _Complex float. This may be much slower in some instances.
+//
+// VERY IMPORTANT USAGE INFORMATION (sine and cosine functions):
+//
+// Due to the nature in which the fsca instruction behaves, you MUST do the
+// following in your code to get sine and cosine from these functions:
+//
+//  _Complex float sine_cosine = [Call the fsca function here]
+//  float sine_value = __real__ sine_cosine;
+//  float cosine_value = __imag__ sine_cosine;
+//  Your output is now in sine_value and cosine_value.
+//
+// This is necessary because fsca outputs both sine and cosine simultaneously
+// and uses a double register to do so. The fsca functions do not actually
+// return a double--they return two floats--and using a complex float here is
+// just a bit of hacking the C language to make GCC do something that's legal in
+// assembly according to the SH4 calling convention (i.e. multiple return values
+// stored in floating point registers FR0-FR3). This is better than using a
+// struct of floats for optimization purposes--this will operate at peak
+// performance even at -O0, whereas a struct will not be fast at low
+// optimization levels due to memory accesses.
+//
+// Technically you may be able to use the complex return values as a complex
+// number if you wanted to, but that's probably not what you're after and they'd
+// be flipped anyways (in mathematical convention, sine is the imaginary part).
+//
+
+// Notes:
+// - From http://www.shared-ptr.com/sh_insns.html:
+//      The input angle is specified as a signed fraction in twos complement. The result of sin and cos is a single-precision floating-point number.
+//      0x7FFFFFFF to 0x00000001: 360×2^15−360/2^16 to 360/2^16 degrees
+//      0x00000000: 0 degree
+//      0xFFFFFFFF to 0x80000000: −360/2^16 to −360×2^15 degrees
+// - fsca format is 2^16 is 360 degrees, so a value of 1 is actually
+//  1/182.044444444 of a degree
+// - fsca does a %360 automatically for values over 360 degrees
+
+// The following functions are available.
+// Please see their definitions for other usage info, otherwise they may not
+// work for you.
+//
+/*
+  // For integer input in native fsca units (fastest)
+  _Complex float MATH_fsca_Int(unsigned int input_int)
+
+  // For integer input in degrees
+  _Complex float MATH_fsca_Int_Deg(unsigned int input_int)
+
+  // For integer input in radians
+  _Complex float MATH_fsca_Int_Rad(unsigned int input_int)
+
+  // For float input in native fsca units
+  _Complex float MATH_fsca_Float(float input_float)
+
+  // For float input in degrees
+  _Complex float MATH_fsca_Float_Deg(float input_float)
+
+  // For float input in radians
+  _Complex float MATH_fsca_Float_Rad(float input_float)
+*/
+
+//------------------------------------------------------------------------------
+#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION
+//------------------------------------------------------------------------------
+//
+// This set of fsca functions is specifically for old versions of GCC.
+// See later for functions for newer versions of GCC.
+//
+
+//
+// Integer input (faster)
+//
+
+// For int input, input_int is in native fsca units (fastest)
+static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int(unsigned int input_int)
+{
+  register float __sine __asm__("fr0");
+  register float __cosine __asm__("fr1");
+
+  asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle)
+    "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine
+    : "=w" (__sine), "=f" (__cosine) // outputs
+    : [input_number] "r" (input_int)  // inputs
+    : "fpul" // clobbers
+  );
+
+  RETURN_FSCA_STRUCT output = {__sine, __cosine};
+  return output;
+}
+
+// For int input, input_int is in degrees
+static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int_Deg(unsigned int input_int)
+{
+  // normalize whole number input degrees to fsca format: 1527099483 / 2^23 ~= 65536 / 360, done as a 64-bit multiply and shift
+  input_int = ((1527099483ULL * input_int) >> 23);
+
+  register float __sine __asm__("fr0");
+  register float __cosine __asm__("fr1");
+
+  asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle)
+    "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine
+    : "=w" (__sine), "=f" (__cosine) // outputs
+    : [input_number] "r" (input_int)  // inputs
+    : "fpul" // clobbers
+  );
+
+  RETURN_FSCA_STRUCT output = {__sine, __cosine};
+  return output;
+}
+
+// For int input, input_int is in radians
+static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int_Rad(unsigned int input_int)
+{
+  // normalize whole number input rads to fsca format: 2734261102 / 2^18 ~= 65536 / (2*pi), done as a 64-bit multiply and shift
+  input_int = ((2734261102ULL * input_int) >> 18);
+
+  register float __sine __asm__("fr0");
+  register float __cosine __asm__("fr1");
+
+  asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle)
+    "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine
+    : "=w" (__sine), "=f" (__cosine) // outputs
+    : [input_number] "r" (input_int)  // inputs
+    : "fpul" // clobbers
+  );
+
+  RETURN_FSCA_STRUCT output = {__sine, __cosine};
+  return output;
+}
+
+//
+// Float input (slower)
+//
+
+// Float input already expressed in native fsca units; it is truncated to an integer before use
+static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float(float input_float)
+{
+  register float sin_fr0 __asm__("fr0");
+  register float cos_fr1 __asm__("fr1");
+
+  __asm__ __volatile__ ("ftrc %[angle], FPUL\n\t" // float-to-int conversion straight into FPUL
+    "fsca FPUL, DR0\n" // one instruction yields both results in the FR0/FR1 pair
+    : "=w" (sin_fr0), "=f" (cos_fr1) // outputs
+    : [angle] "f" (input_float) // inputs
+    : "fpul" // clobbers
+  );
+
+  const RETURN_FSCA_STRUCT result = {sin_fr0, cos_fr1};
+  return result;
+}
+
+// Float input in degrees: scaled to fsca units, truncated, then fed to fsca
+static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float_Deg(float input_float)
+{
+  const float fsca_angle = input_float * 182.044444444f;
+
+  register float sin_fr0 __asm__("fr0");
+  register float cos_fr1 __asm__("fr1");
+
+  __asm__ __volatile__ ("ftrc %[angle], FPUL\n\t" // float-to-int conversion straight into FPUL
+    "fsca FPUL, DR0\n" // one instruction yields both results in the FR0/FR1 pair
+    : "=w" (sin_fr0), "=f" (cos_fr1) // outputs
+    : [angle] "f" (fsca_angle) // inputs
+    : "fpul" // clobbers
+  );
+
+  const RETURN_FSCA_STRUCT result = {sin_fr0, cos_fr1};
+  return result;
+}
+
+// Float input in radians: scaled to fsca units, truncated, then fed to fsca
+static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float_Rad(float input_float)
+{
+  const float fsca_angle = input_float * 10430.3783505f;
+
+  register float sin_fr0 __asm__("fr0");
+  register float cos_fr1 __asm__("fr1");
+
+  __asm__ __volatile__ ("ftrc %[angle], FPUL\n\t" // float-to-int conversion straight into FPUL
+    "fsca FPUL, DR0\n" // one instruction yields both results in the FR0/FR1 pair
+    : "=w" (sin_fr0), "=f" (cos_fr1) // outputs
+    : [angle] "f" (fsca_angle) // inputs
+    : "fpul" // clobbers
+  );
+
+  const RETURN_FSCA_STRUCT result = {sin_fr0, cos_fr1};
+  return result;
+}
+
+//------------------------------------------------------------------------------
+#else
+//------------------------------------------------------------------------------
+//
+// This set of fsca functions is specifically for newer versions of GCC. They
+// work fine under GCC 9.2.0.
+//
+
+//
+// Integer input (faster)
+//
+
+// Integer input already in native fsca units (fastest path)
+static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int(unsigned int input_int)
+{
+  _Complex float sin_cos;
+
+  __asm__ __volatile__ ("lds %[angle], FPUL\n\t" // move the angle from a general register into FPUL
+    "fsca FPUL, %[pair]\n" // one instruction fills the whole register pair
+    : [pair] "=d" (sin_cos) // outputs
+    : [angle] "r" (input_int) // inputs
+    : "fpul" // clobbers
+  );
+
+  return sin_cos;
+}
+
+// Integer input in whole degrees
+static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int_Deg(unsigned int input_int)
+{
+  // fixed-point rescale of degrees to fsca units: multiply by 1527099483 / 2^23 (about 182.0444)
+  const unsigned int fsca_angle = (unsigned int)((1527099483ULL * input_int) >> 23);
+
+  _Complex float sin_cos;
+
+  __asm__ __volatile__ ("lds %[angle], FPUL\n\t" // move the scaled angle from a general register into FPUL
+    "fsca FPUL, %[pair]\n" // one instruction fills the whole register pair
+    : [pair] "=d" (sin_cos) // outputs
+    : [angle] "r" (fsca_angle) // inputs
+    : "fpul" // clobbers
+  );
+
+  return sin_cos;
+}
+
+// Integer input in whole radians
+static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int_Rad(unsigned int input_int)
+{
+  // fixed-point rescale of radians to fsca units: multiply by 2734261102 / 2^18 (about 10430.378)
+  const unsigned int fsca_angle = (unsigned int)((2734261102ULL * input_int) >> 18);
+
+  _Complex float sin_cos;
+
+  __asm__ __volatile__ ("lds %[angle], FPUL\n\t" // move the scaled angle from a general register into FPUL
+    "fsca FPUL, %[pair]\n" // one instruction fills the whole register pair
+    : [pair] "=d" (sin_cos) // outputs
+    : [angle] "r" (fsca_angle) // inputs
+    : "fpul" // clobbers
+  );
+
+  return sin_cos;
+}
+
+//
+// Float input (slower)
+//
+
+// Float input already expressed in native fsca units; it is truncated to an integer before use
+static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float(float input_float)
+{
+  _Complex float sin_cos;
+
+  __asm__ __volatile__ ("ftrc %[angle], FPUL\n\t" // float-to-int conversion straight into FPUL
+    "fsca FPUL, %[pair]\n" // one instruction fills the whole register pair
+    : [pair] "=d" (sin_cos) // outputs
+    : [angle] "f" (input_float) // inputs
+    : "fpul" // clobbers
+  );
+
+  return sin_cos;
+}
+
+// Float input in degrees: scaled to fsca units, truncated, then fed to fsca
+static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float_Deg(float input_float)
+{
+  const float fsca_angle = input_float * 182.044444444f;
+
+  _Complex float sin_cos;
+
+  __asm__ __volatile__ ("ftrc %[angle], FPUL\n\t" // float-to-int conversion straight into FPUL
+    "fsca FPUL, %[pair]\n" // one instruction fills the whole register pair
+    : [pair] "=d" (sin_cos) // outputs
+    : [angle] "f" (fsca_angle) // inputs
+    : "fpul" // clobbers
+  );
+
+  return sin_cos;
+}
+
+// Float input in radians: scaled to fsca units, truncated, then fed to fsca
+static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float_Rad(float input_float)
+{
+  const float fsca_angle = input_float * 10430.3783505f;
+
+  _Complex float sin_cos;
+
+  __asm__ __volatile__ ("ftrc %[angle], FPUL\n\t" // float-to-int conversion straight into FPUL
+    "fsca FPUL, %[pair]\n" // one instruction fills the whole register pair
+    : [pair] "=d" (sin_cos) // outputs
+    : [angle] "f" (fsca_angle) // inputs
+    : "fpul" // clobbers
+  );
+
+  return sin_cos;
+}
+
+//------------------------------------------------------------------------------
+#endif
+//------------------------------------------------------------------------------
+
+//==============================================================================
+// Hardware vector and matrix operations
+//==============================================================================
+//
+// These functions each have very specific usage instructions. Please be sure to
+// read them before use or else they won't seem to work right!
+//
+// The following functions are available.
+// Please see their definitions for important usage info, otherwise they may not
+// work for you.
+//
+/*
+  // Inner/dot product (4x1 vec . 4x1 vec = scalar)
+  float MATH_fipr(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
+
+  // Cross product with bonus multiply (vec X vec = orthogonal vec, with an extra a*b=c)
+  RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b)
+
+  // Cross product (vec X vec = orthogonal vec)
+  RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3)
+
+  // Outer product (vec (X) vec = 4x4 matrix)
+  void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
+
+  // Matrix transform (4x4 matrix * 4x1 vec = 4x1 vec)
+  RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4)
+
+  // 4x4 Matrix transpose (XMTRX^T)
+  void MATH_Matrix_Transpose(void)
+
+  // 4x4 Matrix product (XMTRX and one from memory)
+  void MATH_Matrix_Product(ALL_FLOATS_STRUCT * front_matrix)
+
+  // 4x4 Matrix product (two from memory)
+  void MATH_Load_Matrix_Product(ALL_FLOATS_STRUCT * matrix1, ALL_FLOATS_STRUCT * matrix2)
+
+  // Load 4x4 XMTRX from memory
+  void MATH_Load_XMTRX(ALL_FLOATS_STRUCT * back_matrix)
+
+  // Store 4x4 XMTRX to memory
+  ALL_FLOATS_STRUCT * MATH_Store_XMTRX(ALL_FLOATS_STRUCT * destination)
+
+  // Get 4x1 column vector from XMTRX
+  RETURN_VECTOR_STRUCT MATH_Get_XMTRX_Vector(unsigned int which)
+
+  // Get 2x2 matrix from XMTRX quadrant
+  RETURN_VECTOR_STRUCT MATH_Get_XMTRX_2x2(unsigned int which)
+*/
+
+// Inner/dot product: vec . vec = scalar
+//                       _    _
+//                      |  y1  |
+//  [ x1 x2 x3 x4 ]  .  |  y2  | = scalar
+//                      |  y3  |
+//                      |_ y4 _|
+//
+// SH4 calling convention states we get 8 float arguments. Perfect!
+static inline __attribute__((always_inline)) float MATH_fipr(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
+{
+  // Dot product of (x1..x4) and (y1..y4) using a single fipr instruction.
+  // The operands must sit in FR4-FR7 (vector FV4) and FR8-FR11 (vector FV8); the register vars below pin them there.
+
+  // Plain copies are taken first, following GCC's advice for local register variables:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float lhs0 = x1;
+  float lhs1 = x2;
+  float lhs2 = x3;
+  float lhs3 = x4;
+
+  float rhs0 = y1;
+  float rhs1 = y2;
+  float rhs2 = y3;
+  float rhs3 = y4;
+
+  // left operand: vector FV4
+  register float fv4_0 __asm__("fr4") = lhs0;
+  register float fv4_1 __asm__("fr5") = lhs1;
+  register float fv4_2 __asm__("fr6") = lhs2;
+  register float fv4_3 __asm__("fr7") = lhs3;
+
+  // right operand: vector FV8
+  register float fv8_0 __asm__("fr8") = rhs0;
+  register float fv8_1 __asm__("fr9") = rhs1;
+  register float fv8_2 __asm__("fr10") = rhs2;
+  register float fv8_3 __asm__("fr11") = rhs3;
+
+  // all eight floats are consumed by the one instruction
+  __asm__ __volatile__ ("fipr FV4, FV8\n"
+  : "+f" (fv8_3) // read/write: the result replaces the value in FR11
+  : "f" (fv4_0), "f" (fv4_1), "f" (fv4_2), "f" (fv4_3), "f" (fv8_0), "f" (fv8_1), "f" (fv8_2) // inputs
+  : // clobbers
+  );
+
+  return fv8_3;
+}
+
+// Cross product: vec X vec = orthogonal vec
+//   _    _       _    _       _    _
+//  |  x1  |     |  y1  |     |  z1  |
+//  |  x2  |  X  |  y2  |  =  |  z2  |
+//  |_ x3 _|     |_ y3 _|     |_ z3 _|
+//
+// With bonus multiply:
+//
+//      a     *     b      =      c
+//
+// IMPORTANT USAGE INFORMATION (cross product):
+//
+// Return vector struct maps as below to the above diagram:
+//
+//  typedef struct {
+//   float z1;
+//   float z2;
+//   float z3;
+//   float z4; // c is stored in z4, and c = a*b if using 'with mult' version (else c = 0)
+// } RETURN_VECTOR_STRUCT;
+//
+//  For people familiar with the unit vector notation, z1 == 'i', z2 == 'j',
+//  and z3 == 'k'.
+//
+// The cross product matrix will also be stored in XMTRX after this, so calling
+// MATH_Matrix_Transform() on a vector after using this function will do a cross
+// product with the same x1-x3 values and a multiply with the same 'a' value
+// as used in this function. In that situation, 'a' will be multiplied with
+// the x4 parameter of MATH_Matrix_Transform(). a = 0 if not using the 'with mult'
+// version of the cross product function.
+//
+// For reference, XMTRX will look like this:
+//
+//  [  0 -x3 x2 0 ]
+//  [  x3 0 -x1 0 ]
+//  [ -x2 x1 0  0 ]
+//  [  0  0  0  a ] (<-- a = 0 if not using 'with mult')
+//
+// Similarly to how the sine and cosine functions use fsca and return 2 floats,
+// the cross product functions actually return 4 floats. The first 3 are the
+// cross product output, and the 4th is a*b. The SH4 only multiplies 4x4
+// matrices with 4x1 vectors, which is why the output is like that--but it means
+// we also get a bonus float multiplication while we do our cross product!
+//
+
+// Please do not call this function directly (notice the weird syntax); call
+// MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead.
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product_with_Mult(float x3, float a, float y3, float b, float x2, float x1, float y1, float y2)
+{
+  // Returns {x cross y, a*b}: builds the cross-product matrix in the front bank, swaps banks with frchg, and runs one ftrv.
+  // FR4-FR11 are the regs that are passed in, in that order (hence the scrambled parameter list); the register vars below pin them.
+
+  // Temporary variables are necessary per GCC to avoid clobbering:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float tx1 = x1;
+  float tx2 = x2;
+  float tx3 = x3;
+  float ta = a;
+
+  float ty1 = y1;
+  float ty2 = y2;
+  float ty3 = y3;
+  float tb = b;
+
+  register float __x1 __asm__("fr9") = tx1; // need to negate (need to move to fr6, then negate fr9)
+  register float __x2 __asm__("fr8") = tx2; // in place for matrix (a negated copy goes in fr2, see __z3 below)
+  register float __x3 __asm__("fr4") = tx3; // need to negate (move to fr1 first, then negate fr4)
+  register float __a __asm__("fr5") = ta; // moved to fr15 by the asm, after which fr5 is zeroed
+
+  register float __y1 __asm__("fr10") = ty1; // y1/y2 travel to the back bank as the DR10 pair
+  register float __y2 __asm__("fr11") = ty2;
+  register float __y3 __asm__("fr6") = ty3; // y3/b travel to the back bank as the DR6 pair
+  register float __b __asm__("fr7") = tb;
+
+  register float __z1 __asm__("fr0") = 0.0f; // z1
+  register float __z2 __asm__("fr1") = 0.0f; // z2 (not moving x3 here yet since a double 0 is needed)
+  register float __z3 __asm__("fr2") = tx2; // z3 (this handles putting x2 in fr2)
+  register float __c __asm__("fr3") = 0.0f; // c
+
+  // This actually does a matrix transform to do the cross product.
+  // It's this:
+  //                   _    _       _            _
+  //  [  0 -x3 x2 0 ] |  y1  |     | -x3y2 + x2y3 |
+  //  [  x3 0 -x1 0 ] |  y2  |  =  |  x3y1 - x1y3 |
+  //  [ -x2 x1 0  0 ] |  y3  |     | -x2y1 + x1y2 |
+  //  [  0  0  0  a ] |_ b  _|     |_      c     _|
+  //
+
+  asm volatile (
+    // set up back bank's FV0
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
+
+    // Save FR12-FR15, which are supposed to be preserved across function calls.
+    // This stops them from getting clobbered and saves 4 stack pushes (memory accesses).
+    "fmov DR12, XD12\n\t"
+    "fmov DR14, XD14\n\t"
+
+    "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1)
+    "fmov DR6, XD2\n\t" // fmov 'y3' and 'b' from FR6, FR7 into position (XF2, XF3)
+
+    // pair move zeros for some speed in setting up front bank for matrix
+    "fmov DR0, DR10\n\t" // clear FR10, FR11
+    "fmov DR0, DR12\n\t" // clear FR12, FR13
+    "fschg\n\t" // switch back to single moves
+    // prepare front bank for XMTRX
+    "fmov FR5, FR15\n\t" // fmov 'a' into position
+    "fmov FR0, FR14\n\t" // clear out FR14
+    "fmov FR0, FR7\n\t" // clear out FR7
+    "fmov FR0, FR5\n\t" // clear out FR5
+
+    "fneg FR2\n\t" // set up 'x2'
+    "fmov FR9, FR6\n\t" // set up 'x1'
+    "fneg FR9\n\t"
+    "fmov FR4, FR1\n\t" // set up 'x3'
+    "fneg FR4\n\t"
+    // flip banks and matrix multiply
+    "frchg\n\t" // the matrix just built becomes XMTRX; the bank holding y1, y2, y3, b and the saved FR12-FR15 becomes the front bank
+    "ftrv XMTRX, FV0\n"
+  : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0)
+  : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__a), "f" (__b) // inputs
+  : // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved). NOTE(review): FR4-FR11 are overwritten above yet declared as plain inputs -- confirm GCC never reuses them afterwards
+  );
+
+  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c};
+  return output;
+}
+
+// Please do not call this function directly (notice the weird syntax); call
+// MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead.
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product(float x3, float zero, float x1, float y3, float x2, float x1_2, float y1, float y2)
+{
+  // Returns {x cross y, 0}: builds the cross-product matrix in the front bank, swaps banks with frchg, and runs one ftrv.
+  // FR4-FR11 are the regs that are passed in, in that order (x1 is passed twice, as x1 and x1_2); the register vars below pin them.
+
+  // Temporary variables are necessary per GCC to avoid clobbering:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float tx1 = x1;
+  float tx2 = x2;
+  float tx3 = x3;
+  float tx1_2 = x1_2;
+
+  float ty1 = y1;
+  float ty2 = y2;
+  float ty3 = y3;
+  float tzero = zero;
+
+  register float __x1 __asm__("fr6") = tx1; // in place
+  register float __x2 __asm__("fr8") = tx2; // in place (fmov to fr2, negate fr2)
+  register float __x3 __asm__("fr4") = tx3; // need to negate (fmov to fr1, negate fr4)
+
+  register float __zero __asm__("fr5") = tzero; // in place
+  register float __x1_2 __asm__("fr9") = tx1_2; // need to negate
+
+  register float __y1 __asm__("fr10") = ty1;
+  register float __y2 __asm__("fr11") = ty2;
+  // no __y3 needed in this function (y3 is staged in fr2 through __z3 instead)
+
+  register float __z1 __asm__("fr0") = tzero; // z1
+  register float __z2 __asm__("fr1") = tzero; // z2
+  register float __z3 __asm__("fr2") = ty3; // z3 (holds y3 on entry so the DR2 pair move carries y3 and 0 to the back bank)
+  register float __c __asm__("fr3") = tzero; // c
+
+  // This actually does a matrix transform to do the cross product.
+  // It's this:
+  //                   _    _       _            _
+  //  [  0 -x3 x2 0 ] |  y1  |     | -x3y2 + x2y3 |
+  //  [  x3 0 -x1 0 ] |  y2  |  =  |  x3y1 - x1y3 |
+  //  [ -x2 x1 0  0 ] |  y3  |     | -x2y1 + x1y2 |
+  //  [  0  0  0  0 ] |_ 0  _|     |_      0     _|
+  //
+
+  asm volatile (
+    // zero out FR7. For some reason, if this is done in C after __z3 is set:
+    // register float __y3 __asm__("fr7") = tzero;
+    // then GCC will emit a spurious stack push (pushing FR12). So just zero it here.
+    "fmov FR5, FR7\n\t"
+    // set up back bank's FV0
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
+
+    // Save FR12-FR15, which are supposed to be preserved across function calls.
+    // This stops them from getting clobbered and saves 4 stack pushes (memory accesses).
+    "fmov DR12, XD12\n\t"
+    "fmov DR14, XD14\n\t"
+
+    "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1)
+    "fmov DR2, XD2\n\t" // fmov 'y3' and '0' from FR2, FR3 into position (XF2, XF3)
+
+    // pair move zeros for some speed in setting up front bank for matrix
+    "fmov DR0, DR10\n\t" // clear FR10, FR11
+    "fmov DR0, DR12\n\t" // clear FR12, FR13
+    "fmov DR0, DR14\n\t" // clear FR14, FR15
+    "fschg\n\t" // switch back to single moves
+    // prepare front bank for XMTRX
+    "fneg FR9\n\t" // set up 'x1'
+    "fmov FR8, FR2\n\t" // set up 'x2'
+    "fneg FR2\n\t"
+    "fmov FR4, FR1\n\t" // set up 'x3'
+    "fneg FR4\n\t"
+    // flip banks and matrix multiply
+    "frchg\n\t" // the matrix just built becomes XMTRX; the bank holding y1, y2, y3, 0 and the saved FR12-FR15 becomes the front bank
+    "ftrv XMTRX, FV0\n"
+  : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0)
+  : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__zero), "f" (__x1_2) // inputs
+  : "fr7" // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved). NOTE(review): FR4, FR9-FR11 are overwritten above yet declared as plain inputs -- confirm GCC never reuses them afterwards
+  );
+
+  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c};
+  return output;
+}
+
+//------------------------------------------------------------------------------
+// Functions that wrap the xMATH_do_Cross_Product[_with_Mult]() functions to make
+// it easier to organize parameters
+//------------------------------------------------------------------------------
+
+// Cross product (x1,x2,x3) X (y1,y2,y3) with a bonus float multiply (c = a * b, returned as the 4th element)
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b)
+{
+  return xMATH_do_Cross_Product_with_Mult(x3, a, y3, b, x2, x1, y1, y2); // arguments reordered to the helper's register-driven parameter order
+}
+
+// Plain cross product (x1,x2,x3) X (y1,y2,y3); does not use the bonus float multiply (c = 0 and a in the cross product matrix will be 0)
+// This is a tiny bit faster than 'with_mult' (about 2 cycles faster)
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3)
+{
+  return xMATH_do_Cross_Product(x3, 0.0f, x1, y3, x2, x1, y1, y2); // x1 is passed twice on purpose: the helper takes it as both 'x1' and 'x1_2'
+}
+
+// Outer product: vec (X) vec = matrix
+//   _    _
+//  |  x1  |
+//  |  x2  |  (X)  [ y1 y2 y3 y4 ] = 4x4 matrix
+//  |  x3  |
+//  |_ x4 _|
+//
+// This returns the floats in the back bank (XF0-15), which are inaccessible
+// outside of using frchg or paired-move fmov. It's meant to set up a matrix for
+// use with other matrix functions. GCC also does not touch the XFn bank.
+// This will also wipe out anything stored in the float registers, as it uses the
+// whole FPU register file (all 32 of the float registers).
+static inline __attribute__((always_inline)) void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
+{
+  // Builds the outer product of (x1..x4) and (y1..y4) as a 4x4 matrix product and leaves it in the back bank; nothing is returned.
+  // FR4-FR11 are the regs that are passed in, in that order; the register vars below pin them.
+
+  // Temporary variables are necessary per GCC to avoid clobbering:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float tx1 = x1;
+  float tx2 = x2;
+  float tx3 = x3;
+  float tx4 = x4;
+
+  float ty1 = y1;
+  float ty2 = y2;
+  float ty3 = y3;
+  float ty4 = y4;
+
+  // vector FV4
+  register float __x1 __asm__("fr4") = tx1;
+  register float __x2 __asm__("fr5") = tx2;
+  register float __x3 __asm__("fr6") = tx3;
+  register float __x4 __asm__("fr7") = tx4;
+
+  // vector FV8
+  register float __y1 __asm__("fr8") = ty1;
+  register float __y2 __asm__("fr9") = ty2;
+  register float __y3 __asm__("fr10") = ty3; // in place already
+  register float __y4 __asm__("fr11") = ty4;
+
+  // This actually does a 4x4 matrix multiply to do the outer product.
+  // It's this:
+  //
+  //  [ x1 x1 x1 x1 ] [ y1 0 0 0 ]     [ x1y1 x1y2 x1y3 x1y4 ]
+  //  [ x2 x2 x2 x2 ] [ 0 y2 0 0 ]  =  [ x2y1 x2y2 x2y3 x2y4 ]
+  //  [ x3 x3 x3 x3 ] [ 0 0 y3 0 ]     [ x3y1 x3y2 x3y3 x3y4 ]
+  //  [ x4 x4 x4 x4 ] [ 0 0 0 y4 ]     [ x4y1 x4y2 x4y3 x4y4 ]
+  //
+
+  asm volatile (
+    // zero out unoccupied front floats to make a double 0 in DR2
+    "fldi0 FR2\n\t"
+    "fmov FR2, FR3\n\t"
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
+    // fmov 'x1' and 'x2' from FR4, FR5 into position (XF0,4,8,12, XF1,5,9,13)
+    "fmov DR4, XD0\n\t"
+    "fmov DR4, XD4\n\t"
+    "fmov DR4, XD8\n\t"
+    "fmov DR4, XD12\n\t"
+    // fmov 'x3' and 'x4' from FR6, FR7 into position (XF2,6,10,14, XF3,7,11,15)
+    "fmov DR6, XD2\n\t"
+    "fmov DR6, XD6\n\t"
+    "fmov DR6, XD10\n\t"
+    "fmov DR6, XD14\n\t"
+    // set up front floats (y1-y4)
+    "fmov DR8, DR0\n\t" // y1 -> FR0 (FR1 also receives y2; zeroed below)
+    "fmov DR8, DR4\n\t" // y2 -> FR5 (FR4 also receives y1; zeroed below)
+    "fmov DR10, DR14\n\t" // y4 -> FR15 (FR14 also receives y3; zeroed below)
+    // finish zeroing out front floats
+    "fmov DR2, DR6\n\t"
+    "fmov DR2, DR8\n\t"
+    "fmov DR2, DR12\n\t"
+    "fschg\n\t" // switch back to single-move mode
+    "fmov FR2, FR1\n\t" // the four single moves clear the leftovers so only the diagonal y1, y2, y3, y4 remains
+    "fmov FR2, FR4\n\t"
+    "fmov FR2, FR11\n\t"
+    "fmov FR2, FR14\n\t"
+    // finally, matrix multiply 4x4
+    "ftrv XMTRX, FV0\n\t"
+    "ftrv XMTRX, FV4\n\t"
+    "ftrv XMTRX, FV8\n\t"
+    "ftrv XMTRX, FV12\n\t"
+    // Save output in XF regs
+    "frchg\n"
+  : // no outputs
+  : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__x4), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__y4) // inputs
+  : "fr0", "fr1", "fr2", "fr3", "fr12", "fr13", "fr14", "fr15" // clobbers, can't avoid it. NOTE(review): FR4-FR11 are also overwritten above yet declared as plain inputs -- confirm GCC never reuses them afterwards
+  );
+  // GCC will restore FR12-FR15 from the stack after this, so we really can't keep the output in the front bank.
+}
+
+// Matrix transform: matrix * vector = vector
+//                   _    _       _    _
+//  [ ----------- ] |  x1  |     |  z1  |
+//  [ ---XMTRX--- ] |  x2  |  =  |  z2  |
+//  [ ----------- ] |  x3  |     |  z3  |
+//  [ ----------- ] |_ x4 _|     |_ z4 _|
+//
+// IMPORTANT USAGE INFORMATION (matrix transform):
+//
+// Return vector struct maps 1:1 to the above diagram:
+//
+//  typedef struct {
+//   float z1;
+//   float z2;
+//   float z3;
+//   float z4;
+// } RETURN_VECTOR_STRUCT;
+//
+// Similarly to how the sine and cosine functions use fsca and return 2 floats,
+// the matrix transform function actually returns 4 floats. The SH4 only multiplies
+// 4x4 matrices with 4x1 vectors, which is why the output is like that.
+//
+// Multiply a matrix stored in the back bank (XMTRX) with an input vector
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4)
+{
+  // Multiplies XMTRX by the column vector (x1, x2, x3, x4) with one ftrv and returns the four results.
+  // The vector is staged in FR0-FR3 (FV0), which ftrv overwrites in place with the product.
+
+  // Plain copies are taken first, following GCC's advice for local register variables:
+  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables
+
+  float in0 = x1;
+  float in1 = x2;
+  float in2 = x3;
+  float in3 = x4;
+
+  // FV0 serves as both the input vector and the output vector
+  register float fv0_0 __asm__("fr0") = in0;
+  register float fv0_1 __asm__("fr1") = in1;
+  register float fv0_2 __asm__("fr2") = in2;
+  register float fv0_3 __asm__("fr3") = in3;
+
+  __asm__ __volatile__ ("ftrv XMTRX, FV0\n\t"
+    // the result is produced in FV0, where the SH4 calling convention returns it
+    : "+w" (fv0_0), "+f" (fv0_1), "+f" (fv0_2), "+f" (fv0_3) // read/write operands
+    : // no inputs
+    : // no clobbers
+  );
+
+  const RETURN_VECTOR_STRUCT result = {fv0_0, fv0_1, fv0_2, fv0_3};
+  return result;
+}
+
+// Matrix Transpose
+//
+// This does a matrix transpose on the matrix in XMTRX, which swaps rows with
+// columns as follows (math notation is [XMTRX]^T):
+//
+//  [ a b c d ] T   [ a e i m ]
+//  [ e f g h ]  =  [ b f j n ]
+//  [ i j k l ]     [ c g k o ]
+//  [ m n o p ]     [ d h l p ]
+//
+// PLEASE NOTE: It is faster to avoid the need for a transpose altogether by
+// structuring matrices and vectors accordingly.
+static inline __attribute__((always_inline)) void MATH_Matrix_Transpose(void)
+{
+  asm volatile ("frchg\n\t" // swap banks so XMTRX is addressable as FR0-FR15 (fmov for singles only works on front bank)
+    // FR0, FR5, FR10, and FR15 are already in place (they are the diagonal)
+    // swap FR1 and FR4 (each swap below uses FPUL as the scratch location)
+    "flds FR1, FPUL\n\t"
+    "fmov FR4, FR1\n\t"
+    "fsts FPUL, FR4\n\t"
+    // swap FR2 and FR8
+    "flds FR2, FPUL\n\t"
+    "fmov FR8, FR2\n\t"
+    "fsts FPUL, FR8\n\t"
+    // swap FR3 and FR12
+    "flds FR3, FPUL\n\t"
+    "fmov FR12, FR3\n\t"
+    "fsts FPUL, FR12\n\t"
+    // swap FR6 and FR9
+    "flds FR6, FPUL\n\t"
+    "fmov FR9, FR6\n\t"
+    "fsts FPUL, FR9\n\t"
+    // swap FR7 and FR13
+    "flds FR7, FPUL\n\t"
+    "fmov FR13, FR7\n\t"
+    "fsts FPUL, FR13\n\t"
+    // swap FR11 and FR14
+    "flds FR11, FPUL\n\t"
+    "fmov FR14, FR11\n\t"
+    "fsts FPUL, FR14\n\t"
+    // restore XMTRX to back bank (the second frchg undoes the first, so the caller's front bank returns untouched)
+    "frchg\n"
+    : // no outputs
+    : // no inputs
+    : "fpul" // clobbers (FPUL is the swap scratch)
+  );
+}
+
+// Matrix product: matrix * matrix = matrix
+//
+// These use the whole dang floating point unit.
+//
+//  [ ----------- ] [ ----------- ]     [ ----------- ]
+//  [ ---Back---- ] [ ---Front--- ]  =  [ ---XMTRX--- ]
+//  [ ---Matrix-- ] [ ---Matrix-- ]     [ ----------- ]
+//  [ --(XMTRX)-- ] [ ----------- ]     [ ----------- ]
+//
+// Multiply the matrix stored in the back bank (XMTRX) with the 4x4 matrix read from front_matrix (16 floats, 64 bytes).
+// Output is stored in the back bank (XMTRX). FR0-FR15 and r1 are overwritten; the memory at front_matrix is only read.
+static inline __attribute__((always_inline)) void MATH_Matrix_Product(ALL_FLOATS_STRUCT * front_matrix)
+{
+  asm volatile ("pref @%[fmtrx]\n\t" // Prefetching should help a bit
+    // gotta wait for 6 clocks (30ns) memory access time for pref to work
+    "mov #32, r1\n\t"
+    "add %[fmtrx], r1\n\t" // store offset by 32 in r1
+    "pref @r1\n\t" // Get a head start prefetching the second half of the 64-byte data
+    // NOPs are in the MT group, so they are executed in parallel...
+    // all these nops should equal 2 cycles in this context...
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "fschg\n\t" // switch fmov to paired moves
+    "fmov.d @%[fmtrx]+, DR0\n\t" // each load takes 8 bytes and post-increments the pointer
+    "fmov.d @%[fmtrx]+, DR2\n\t"
+    "fmov.d @%[fmtrx]+, DR4\n\t"
+    "fmov.d @%[fmtrx]+, DR6\n\t"
+    "fmov.d @%[fmtrx]+, DR8\n\t"
+    "fmov.d @%[fmtrx]+, DR10\n\t"
+    "fmov.d @%[fmtrx]+, DR12\n\t"
+    "fmov.d @%[fmtrx], DR14\n\t"
+    "fschg\n\t" // switch back to single moves
+    // matrix multiply 4x4
+    "ftrv XMTRX, FV0\n\t"
+    "ftrv XMTRX, FV4\n\t"
+    "ftrv XMTRX, FV8\n\t"
+    "ftrv XMTRX, FV12\n\t"
+    // Save output in XF regs
+    "frchg\n"
+    : [fmtrx] "+r" ((unsigned int)front_matrix) // outputs, "+" means r/w
+    : // no inputs
+    : "r1", "memory", "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers ("memory" because the asm reads *front_matrix through a bare address, so pending stores must be completed first; GCC doesn't know about back bank, so writing to it isn't clobbered)
+  );
+}
+
+// Load two 4x4 matrices (64 bytes each) and multiply them, storing the output into the back bank (XMTRX)
+// matrix1 is loaded into the back bank, matrix2 into the front bank; both are only read. FR0-FR15, r0 and r1 are overwritten.
+// MATH_Load_Matrix_Product() is slightly faster than doing this:
+//    MATH_Load_XMTRX(matrix1)
+//    MATH_Matrix_Product(matrix2)
+// as it saves having to do 2 extraneous 'fschg' instructions.
+//
+static inline __attribute__((always_inline)) void MATH_Load_Matrix_Product(ALL_FLOATS_STRUCT * matrix1, ALL_FLOATS_STRUCT * matrix2)
+{
+  asm volatile ("pref @%[bmtrx]\n\t" // Prefetching should help a bit
+    // gotta wait for 6 clocks (30ns) memory access time for pref to work
+    "mov #32, r0\n\t"
+    "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait
+    "mov r0, r1\n\t" // This is parallel-issue
+    "add %[bmtrx], r0\n\t" // store offset by 32 in r0
+    "pref @r0\n\t" // Get a head start prefetching the second half of the 64-byte data
+    "add %[fmtrx], r1\n\t" // store offset by 32 in r1
+    "pref @r1\n\t" // likewise for other matrix
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
+    // back matrix
+    "fmov.d @%[bmtrx]+, XD0\n\t"
+    "fmov.d @%[bmtrx]+, XD2\n\t"
+    "fmov.d @%[bmtrx]+, XD4\n\t"
+    "fmov.d @%[bmtrx]+, XD6\n\t"
+    "fmov.d @%[bmtrx]+, XD8\n\t"
+    "fmov.d @%[bmtrx]+, XD10\n\t"
+    "fmov.d @%[bmtrx]+, XD12\n\t"
+    "fmov.d @%[bmtrx], XD14\n\t"
+    // front matrix
+    "fmov.d @%[fmtrx]+, DR0\n\t"
+    "fmov.d @%[fmtrx]+, DR2\n\t"
+    "fmov.d @%[fmtrx]+, DR4\n\t"
+    "fmov.d @%[fmtrx]+, DR6\n\t"
+    "fmov.d @%[fmtrx]+, DR8\n\t"
+    "fmov.d @%[fmtrx]+, DR10\n\t"
+    "fmov.d @%[fmtrx]+, DR12\n\t"
+    "fmov.d @%[fmtrx], DR14\n\t"
+    "fschg\n\t" // switch back to single moves
+    // matrix multiply 4x4
+    "ftrv XMTRX, FV0\n\t"
+    "ftrv XMTRX, FV4\n\t"
+    "ftrv XMTRX, FV8\n\t"
+    "ftrv XMTRX, FV12\n\t"
+    // Save output in XF regs
+    "frchg\n"
+    : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
+    : // no inputs
+    : "r0", "r1", "memory", "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers ("memory" because the asm reads both matrices through bare addresses, so pending stores must be completed first; GCC doesn't know about back bank, so writing to it isn't clobbered)
+  );
+}
+
+//------------------------------------------------------------------------------
+// Matrix load and store operations
+//------------------------------------------------------------------------------
+
+// Load a 4x4 matrix (16 floats, 64 bytes) from back_matrix into the back bank (XMTRX); the memory is only read, r1 is overwritten
+static inline __attribute__((always_inline)) void MATH_Load_XMTRX(ALL_FLOATS_STRUCT * back_matrix)
+{
+  asm volatile ("pref @%[bmtrx]\n\t" // Prefetching should help a bit
+    // gotta wait for 6 clocks (30ns) memory access time for pref to work
+    "mov #32, r1\n\t"
+    "add %[bmtrx], r1\n\t" // store offset by 32 in r1
+    "pref @r1\n\t" // Get a head start prefetching the second half of the 64-byte data
+    // NOPs are in the MT group, so they are executed in parallel...
+    // all these nops should equal 2 cycles in this context...
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
+    "fmov.d @%[bmtrx]+, XD0\n\t" // each load takes 8 bytes and post-increments the pointer: XD0 comes from offset 0
+    "fmov.d @%[bmtrx]+, XD2\n\t"
+    "fmov.d @%[bmtrx]+, XD4\n\t"
+    "fmov.d @%[bmtrx]+, XD6\n\t"
+    "fmov.d @%[bmtrx]+, XD8\n\t"
+    "fmov.d @%[bmtrx]+, XD10\n\t"
+    "fmov.d @%[bmtrx]+, XD12\n\t"
+    "fmov.d @%[bmtrx], XD14\n\t" // XD14 comes from offset 56
+    "fschg\n" // switch back to single moves
+    : [bmtrx] "+r" ((unsigned int)back_matrix) // outputs, "+" means r/w
+    : // no inputs
+    : "r1", "memory" // clobbers ("memory" because the asm reads *back_matrix through a bare address, so pending stores must be completed first; GCC doesn't know about back bank, so writing to it isn't clobbered)
+  );
+}
+
+// Store XMTRX to memory: writes the 16 back-bank floats to *destination in the layout MATH_Load_XMTRX() reads (XD0 at offset 0) and returns destination
+static inline __attribute__((always_inline)) ALL_FLOATS_STRUCT * MATH_Store_XMTRX(ALL_FLOATS_STRUCT * destination)
+{
+  char * output = ((char*)destination) + sizeof(ALL_FLOATS_STRUCT); // one past the end of the struct; ALL_FLOATS_STRUCT should be 64 bytes
+
+  asm volatile ("pref @%[dest_base]\n\t"
+    // gotta wait for 6 clocks (30ns) memory access time for pref to work
+    "mov #32, r1\n\t"
+    "add %[dest_base], r1\n\t" // store offset by 32 in r1
+    "pref @r1\n\t" // Get a head start prefetching the second half of the 64-byte data
+    // NOPs are in the MT group, so they are executed in parallel...
+    // all these nops should equal 2 cycles in this context...
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "nop\n\t"
+    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
+    "fmov.d XD14, @-%[out_mtrx]\n\t" // These do *(--output) = XDn, so the last pair goes first (offset 56)
+    "fmov.d XD12, @-%[out_mtrx]\n\t"
+    "fmov.d XD10, @-%[out_mtrx]\n\t"
+    "fmov.d XD8, @-%[out_mtrx]\n\t"
+    "fmov.d XD6, @-%[out_mtrx]\n\t"
+    "fmov.d XD4, @-%[out_mtrx]\n\t"
+    "fmov.d XD2, @-%[out_mtrx]\n\t"
+    "fmov.d XD0, @-%[out_mtrx]\n\t" // first pair lands at offset 0
+    "fschg\n" // switch back to single moves
+    : [out_mtrx] "+&r" ((unsigned int)output) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed
+    : [dest_base] "r" ((unsigned int)destination) // inputs
+    : "r1", "memory" // clobbers
+  );
+
+  return destination;
+}
+
+// Returns FV0, 4, 8, or 12 from XMTRX
+//
+// Sorry, it has to be done 4 at a time like this due to calling convention
+// limits; under optimal optimization conditions, we only get 4 float registers
+// for return values; any more and they get pushed to memory.
+//
+// IMPORTANT USAGE INFORMATION (get XMTRX vector)
+//
+// XMTRX format, using the front bank's FVn notation:
+//
+//    FV0 FV4 FV8  FV12
+//    --- --- ---  ----
+//  [ xf0 xf4 xf8  xf12 ]
+//  [ xf1 xf5 xf9  xf13 ]
+//  [ xf2 xf6 xf10 xf14 ]
+//  [ xf3 xf7 xf11 xf15 ]
+//
+// Return vector maps to XMTRX as below depending on the FVn value passed in:
+//
+//  typedef struct {
+//   float z1; // will contain xf0, 4, 8 or 12
+//   float z2; // will contain xf1, 5, 9, or 13
+//   float z3; // will contain xf2, 6, 10, or 14
+//   float z4; // will contain xf3, 7, 11, or 15
+// } RETURN_VECTOR_STRUCT;
+//
+// Valid values of 'which' are 0, 4, 8, or 12, corresponding to FV0, FV4, FV8,
+// or FV12, respectively. Other values will return 0 in all four return values.
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Get_XMTRX_Vector(unsigned int which)
+{
+  register float __z1 __asm__("fr0"); // results are pinned to FR0-FR3 because the asm below writes DR0/DR2 by name
+  register float __z2 __asm__("fr1");
+  register float __z3 __asm__("fr2");
+  register float __z4 __asm__("fr3");
+
+  // Note: only paired moves can access XDn regs
+  asm volatile ("cmp/eq #0, %[select]\n\t" // if(which == 0), 1 -> T else 0 -> T
+    "bt.s 0f\n\t" // do FV0 (delayed branch: the next instruction runs in its delay slot whether or not it is taken)
+    " cmp/eq #4, %[select]\n\t" // if(which == 4), 1 -> T else 0 -> T
+    "bt.s 4f\n\t" // do FV4
+    " cmp/eq #8, %[select]\n\t" // if(which == 8), 1 -> T else 0 -> T
+    "bt.s 8f\n\t" // do FV8
+    " cmp/eq #12, %[select]\n\t" // if(which == 12), 1 -> T else 0 -> T
+    "bf.s 1f\n" // exit if not even FV12 was true, otherwise do FV12
+  "12:\n\t"
+    " fschg\n\t" // paired moves for FV12 (and exit case)
+    "fmov XD14, DR2\n\t"
+    "fmov XD12, DR0\n\t"
+    "bt.s 2f\n" // done (T is still 1 from the #12 compare, so this is always taken here)
+  "8:\n\t"
+    " fschg\n\t" // paired moves for FV8, back to singles for FV12
+    "fmov XD10, DR2\n\t"
+    "fmov XD8, DR0\n\t"
+    "bf.s 2f\n" // done (T is 0 on this path because the delay-slot compare before the jump here failed)
+  "4:\n\t"
+    " fschg\n\t" // paired moves for FV4, back to singles for FV8
+    "fmov XD6, DR2\n\t"
+    "fmov XD4, DR0\n\t"
+    "bf.s 2f\n" // done
+  "0:\n\t"
+    " fschg\n\t" // paired moves for FV0, back to singles for FV4
+    "fmov XD2, DR2\n\t"
+    "fmov XD0, DR0\n\t"
+    "bf.s 2f\n" // done
+  "1:\n\t"
+    " fschg\n\t" // back to singles for FV0 and exit case
+    "fldi0 FR0\n\t" // FR0-3 get zeroed out, then
+    "fmov FR0, FR1\n\t"
+    "fmov FR0, FR2\n\t"
+    "fmov FR0, FR3\n"
+  "2:\n"
+    : "=w" (__z1), "=f" (__z2), "=f" (__z3), "=f" (__z4) // outputs (bound to FR0-FR3 by the register variables above)
+    : [select] "z" (which) // inputs ("z" puts 'which' in r0, which cmp/eq #imm requires)
+    : "t" // clobbers (the T bit is overwritten by the cmp/eq instructions)
+  );
+
+  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __z4};
+  return output;
+}
+
+// Returns a 2x2 matrix from a quadrant of XMTRX
+//
+// Sorry, it has to be done 4 at a time like this due to calling convention
+// limits; under optimal optimization conditions, we only get 4 float registers
+// for return values; any more and they get pushed to memory.
+//
+// IMPORTANT USAGE INFORMATION (get XMTRX 2x2)
+//
+// Each 2x2 quadrant is of the form:
+//
+//  [ a b ]
+//  [ c d ]
+//
+// Return vector maps to the 2x2 matrix as below:
+//
+//  typedef struct {
+//   float z1; // a
+//   float z2; // c
+//   float z3; // b
+//   float z4; // d
+// } RETURN_VECTOR_STRUCT;
+//
+//  (So the function does a 2x2 transpose in storing the values relative to the
+//  order stored in XMTRX.)
+//
+// Valid values of 'which' are 1, 2, 3, or 4, corresponding to the following
+// quadrants of XMTRX:
+//
+//       1             2
+//  [ xf0 xf4 ] | [ xf8 xf12 ]
+//  [ xf1 xf5 ] | [ xf9 xf13 ]
+//  --   3   -- |  --  4  --
+//  [ xf2 xf6 ] | [ xf10 xf14 ]
+//  [ xf3 xf7 ] | [ xf11 xf15 ]
+//
+// Other input values will return 0 in all four return floats.
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Get_XMTRX_2x2(unsigned int which)
+{
+  register float __z1 __asm__("fr0"); // results are pinned to FR0-FR3 because the asm below writes DR0/DR2 by name
+  register float __z2 __asm__("fr1");
+  register float __z3 __asm__("fr2");
+  register float __z4 __asm__("fr3");
+
+  // Note: only paired moves can access XDn regs
+  asm volatile ("cmp/eq #1, %[select]\n\t" // if(which == 1), 1 -> T else 0 -> T
+    "bt.s 1f\n\t" // do quadrant 1 (delayed branch: the next instruction runs in its delay slot whether or not it is taken)
+    " cmp/eq #2, %[select]\n\t" // if(which == 2), 1 -> T else 0 -> T
+    "bt.s 2f\n\t" // do quadrant 2
+    " cmp/eq #3, %[select]\n\t" // if(which == 3), 1 -> T else 0 -> T
+    "bt.s 3f\n\t" // do quadrant 3
+    " cmp/eq #4, %[select]\n\t" // if(which == 4), 1 -> T else 0 -> T
+    "bf.s 0f\n" // exit if nothing was true, otherwise do quadrant 4
+  "4:\n\t"
+    " fschg\n\t" // paired moves for quadrant 4 (and exit case)
+    "fmov XD14, DR2\n\t"
+    "fmov XD10, DR0\n\t"
+    "bt.s 5f\n" // done (T is still 1 from the #4 compare, so this is always taken here)
+  "3:\n\t"
+    " fschg\n\t" // paired moves for quadrant 3, back to singles for 4
+    "fmov XD6, DR2\n\t"
+    "fmov XD2, DR0\n\t"
+    "bf.s 5f\n" // done (T is 0 on this path because the delay-slot compare before the jump here failed)
+  "2:\n\t"
+    " fschg\n\t" // paired moves for quadrant 2, back to singles for 3
+    "fmov XD12, DR2\n\t"
+    "fmov XD8, DR0\n\t"
+    "bf.s 5f\n" // done
+  "1:\n\t"
+    " fschg\n\t" // paired moves for quadrant 1, back to singles for 2
+    "fmov XD4, DR2\n\t"
+    "fmov XD0, DR0\n\t"
+    "bf.s 5f\n" // done
+  "0:\n\t"
+    " fschg\n\t" // back to singles for quadrant 1 and exit case
+    "fldi0 FR0\n\t" // FR0-3 get zeroed out, then
+    "fmov FR0, FR1\n\t"
+    "fmov FR0, FR2\n\t"
+    "fmov FR0, FR3\n"
+  "5:\n"
+    : "=w" (__z1), "=f" (__z2), "=f" (__z3), "=f" (__z4) // outputs (bound to FR0-FR3 by the register variables above)
+    : [select] "z" (which) // inputs ("z" puts 'which' in r0, which cmp/eq #imm requires)
+    : "t" // clobbers (the T bit is overwritten by the cmp/eq instructions)
+  );
+
+  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __z4};
+  return output;
+}
+
+// It is not possible to return an entire 4x4 matrix in registers, as the only
+// registers allowed for return values are R0-R3 and FR0-FR3. All others are
+// marked caller save, which means they could be restored from stack and clobber
+// anything returned in them.
+//
+// In general, writing the entire required math routine in one asm function is
+// the best way to go for performance reasons anyways, and in that situation one
+// can just throw calling convention to the wind until returning back to C.
+
+#endif /* __SH4_MATH_H_ */
\ No newline at end of file
diff --git a/GL/texture.c b/GL/texture.c
index 7160e94..bf00983 100644
--- a/GL/texture.c
+++ b/GL/texture.c
@@ -743,11 +743,11 @@ GLint _cleanInternalFormat(GLint internalFormat) {
 
 typedef void (*TextureConversionFunc)(const GLubyte*, GLubyte*);
 
-static inline void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) {
     *((GLushort*) dest) = (source[3] & 0xF0) << 8 | (source[0] & 0xF0) << 4 | (source[1] & 0xF0) | (source[2] & 0xF0) >> 4;
 }
 
-static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
     /* Noop */
     GLubyte* dst = (GLubyte*) dest;
     dst[0] = source[0];
@@ -756,11 +756,11 @@ static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
     dst[3] = source[3];
 }
 
-static inline void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) {
     *((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3);
 }
 
-static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
     /* Noop */
     GLubyte* dst = (GLubyte*) dest;
     dst[0] = source[0];
@@ -769,24 +769,24 @@ static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
     dst[3] = 255;
 }
 
-static inline void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) {
     *((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3);
 }
 
-static inline void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) {
     *((GLushort*) dest) = ((source[3] & 0b11111000) << 8);
 }
 
-static inline void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) {
     *((GLushort*) dest) = (source[0] & 0b11111000) << 8;
 }
 
-static inline void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) {
     GLushort* src = (GLushort*) source;
     *((GLushort*) dest) = ((*src & 0x000F) << 12) | *src >> 4;
 }
 
-static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
     GLushort src = *((GLushort*) source);
     GLubyte* dst = (GLubyte*) dest;
 
@@ -796,7 +796,7 @@ static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
     dst[3] = ((src & 0x000F)) * 2;
 }
 
-static inline void _i8_to_i8(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _i8_to_i8(const GLubyte* source, GLubyte* dest) {
     /* For indexes */
     GLubyte* dst = (GLubyte*) dest;
     *dst = *source;
diff --git a/containers/aligned_vector.c b/containers/aligned_vector.c
index 708b187..1657a60 100644
--- a/containers/aligned_vector.c
+++ b/containers/aligned_vector.c
@@ -3,6 +3,8 @@
 #include <math.h>
 #include <assert.h>
 #include <stdio.h>
+#include <dc/sq.h>
+#include <kos/string.h>
 
 #if defined(__APPLE__) || defined(__WIN32__)
 /* Linux + Kos define this, OSX does not, so just use malloc there */
@@ -25,7 +27,7 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size) {
 }
 
 
-static inline unsigned int round_to_chunk_size(unsigned int val) {
+static INLINE_DEBUG unsigned int round_to_chunk_size(unsigned int val) {
     const unsigned int n = val;
     const unsigned int m = ALIGNED_VECTOR_CHUNK_SIZE;
 
@@ -107,33 +109,12 @@ void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_co
     }
 }
 
-void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
-    #if 0
-    if(index >= vector->size){
-        char msg[60];
-        sprintf(msg, "Vector OOB: %d %d wanted %d\n", vector->capacity, vector->size, index);
-        //aligned_vector_resize(vector, index);
-        assert_msg(index < vector->size, msg);
-    }
-    #endif
-    assert(index < vector->size);
-    return &vector->data[index * vector->element_size];
-}
-
-void* aligned_vector_back(AlignedVector* vector) {
-    return aligned_vector_at(vector, vector->size - 1);
-}
-
 void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) {
     const unsigned int current = vector->size;
     aligned_vector_resize(vector, vector->size + additional_count);
     return aligned_vector_at(vector, current);
 }
 
-void aligned_vector_clear(AlignedVector* vector) {
-    vector->size = 0;
-}
-
 void aligned_vector_shrink_to_fit(AlignedVector* vector) {
     if(vector->size == 0) {
         free(vector->data);
diff --git a/containers/aligned_vector.h b/containers/aligned_vector.h
index a002ece..d672b47 100644
--- a/containers/aligned_vector.h
+++ b/containers/aligned_vector.h
@@ -5,6 +5,8 @@
 extern "C" {
 #endif
 
+#include "../GL/cygprofile.h"
+
 typedef struct {
     unsigned int size;
     unsigned int capacity;
@@ -18,12 +20,27 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size);
 void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
 void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count);
 void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
-void* aligned_vector_at(const AlignedVector* vector, const unsigned int index);
+INLINE_ALWAYS void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
+    /* Returns the address of element `index`, i.e. `index * element_size`
+     * bytes into vector->data.
+     *
+     * No bounds check is performed here: the debug checks that previously
+     * sat in this function (an assert_msg() "Vector OOB" report and a plain
+     * assert(index < vector->size)) were already compiled out with #if 0, so
+     * the caller is responsible for keeping index < vector->size. */
+    const unsigned int byte_offset = index * vector->element_size;
+
+    return &vector->data[byte_offset];
+}
 void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count);
-void aligned_vector_clear(AlignedVector* vector);
+INLINE_ALWAYS void aligned_vector_clear(AlignedVector* vector){
+    vector->size = 0; /* only the element count is reset; capacity and data are not touched here */
+}
 void aligned_vector_shrink_to_fit(AlignedVector* vector);
 void aligned_vector_cleanup(AlignedVector* vector);
-void* aligned_vector_back(AlignedVector* vector);
+INLINE_ALWAYS void* aligned_vector_back(AlignedVector* vector){
+    return aligned_vector_at(vector, vector->size - 1); /* last element; NOTE: size is unsigned, so size == 0 wraps the index around - do not call on an empty vector */
+}
 
 #ifdef __cplusplus
 }
diff --git a/containers/named_array.c b/containers/named_array.c
index cf06373..6efa7dc 100644
--- a/containers/named_array.c
+++ b/containers/named_array.c
@@ -44,13 +44,6 @@ void named_array_init(NamedArray* array, unsigned int element_size, unsigned int
     memset(array->elements, 0, element_size * max_elements);
 }
 
-char named_array_used(NamedArray* array, unsigned int id) {
-    unsigned int i = id / 8;
-    unsigned int j = id % 8;
-
-    unsigned char v = array->used_markers[i] & (unsigned char) (1 << j);
-    return !!(v);
-}
 
 void* named_array_alloc(NamedArray* array, unsigned int* new_id) {
     unsigned int i = 0, j = 0;
diff --git a/containers/named_array.h b/containers/named_array.h
index a0f6c97..5877059 100644
--- a/containers/named_array.h
+++ b/containers/named_array.h
@@ -5,6 +5,8 @@
 extern "C" {
 #endif
 
+#include "../GL/cygprofile.h"
+
 typedef struct {
     unsigned int element_size;
     unsigned int max_element_count;
@@ -14,7 +16,13 @@ typedef struct {
 } NamedArray;
 
 void named_array_init(NamedArray* array, unsigned int element_size, unsigned int max_elements);
-char named_array_used(NamedArray* array, unsigned int id);
+INLINE_ALWAYS char named_array_used(NamedArray* array, unsigned int id) {
+    /* Tests bit (id % 8) of byte (id / 8) in array->used_markers and
+     * returns 1 if it is set, 0 otherwise. `id` is not range-checked. */
+    const unsigned char marker_byte = (unsigned char) array->used_markers[id >> 3];
+    const unsigned char bit_mask = (unsigned char) (1 << (id & 7));
+    return (marker_byte & bit_mask) != 0;
+}
 
 void* named_array_alloc(NamedArray* array, unsigned int* new_id);
 void* named_array_reserve(NamedArray* array, unsigned int id);