feat: implement proper changes from profiling

- math - inlining
2020-02-18 11:48:37 -05:00 · 2020-02-18 11:48:37 -05:00 · 3a4f09bef2
commit 3a4f09bef2
parent a2dcfcf997
19 changed files with 2572 additions and 227 deletions
--- a/GL/config.h
+++ b/GL/config.h
@ -1,9 +1,8 @@
 #pragma once
 #ifndef CONFIG_H
 #define CONFIG_H
 /* This figure is derived from the needs of Quake 1 */
 #define MAX_TEXTURE_COUNT 1088
 #endif // CONFIG_H
--- a/GL/cygprofile.c
+++ b/GL/cygprofile.c
@ -0,0 +1,227 @@
 /* Based on the idea from Erich Styger */
 /* profiled instrument guided profiling for gldc on hardware */
 #include "cygprofile.h"
 #include <kos.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include "perfctr.h"
 #include "private.h"
 #if CYG_FUNC_TRACE_ENABLED
 #define _strcat(x, y, z) strncat(x, z, y)
 #ifndef __PE_Error_H
 #define __PE_Error_H
 #define ERR_OK 0       /* OK */
 #define ERR_SPEED 1    /* This device does not work in the active speed mode. */
 #define ERR_RANGE 2    /* Parameter out of range. */
 #define ERR_VALUE 3    /* Parameter of incorrect value. */
 #define ERR_OVERFLOW 4 /* Timer overflow. */
 #define ERR_MATH 5     /* Overflow during evaluation. */
 #define ERR_ENABLED 6  /* Device is enabled. */
 #define ERR_DISABLED 7 /* Device is disabled. */
 #define ERR_BUSY 8     /* Device is busy. */
 #define ERR_NOTAVAIL 9 /* Requested value or method not available. */
 #define ERR_RXEMPTY 10 /* No data in receiver. */
 #define ERR_TXFULL 11  /* Transmitter is full. */
 #define ERR_BUSOFF 12  /* Bus not available. */
 #define ERR_OVERRUN 13 /* Overrun error is detected. */
 #define ERR_FRAMING 14 /* Framing error is detected. */
 #define ERR_PARITY 15  /* Parity error is detected. */
 #define ERR_NOISE 16   /* Noise error is detected. */
 #define ERR_IDLE 17    /* Idle error is detected. */
 #define ERR_FAULT 18   /* Fault error is detected. */
 #define ERR_BREAK 19   /* Break char is received during communication. */
 #define ERR_CRC 20     /* CRC error is detected. */
 #define ERR_ARBITR 21  /* A node lost arbitration. This error occurs if two nodes start transmission at the same time. */
 #define ERR_PROTECT 22 /* Protection error is detected. */
 #endif /* __PE_Error_H */
 #define CYG_RNG_BUF_NOF_ELEMS (8096 * 4)
 /*!< Number of elements in the ring buffer which is used to record function calls */
 #define CYG_THUMB_MASK 0xFFFFFFFF
 /*!< address mask applied when printing; 0xFFFFFFFF keeps every bit, so no LSB (thumb) bit is actually stripped */
 /* Hashing function for two uint32_ts */
 #define HASH_PAIR(x, y) (((x)*0x1f1f1f1f) ^ (y))
 static bool CYG_Enabled = false; /*!< flag which enables/disables tracing */
 /*!
 * Element in ring buffer to store the trace information.
 */
 typedef struct
 {
  //bool isEnter;    /*!< TRUE for __cyg_profile_func_enter(), FALSE for __cyg_profile_func_exit() */
  void *this_fn;    /*!< address (with thumb bit) of the (caller) function */
  void *call_site;  /*!< return address to the function which called this_fn */
  uint32_t counter; /* also contains isEnter as highest bit */
 } CYG_RNG_ElementType;
 typedef uint32_t CYG_RNG_BufSizeType; /*!< index type for ring buffer */
 static CYG_RNG_ElementType CYG_RNG_buffer[CYG_RNG_BUF_NOF_ELEMS]; /*!< ring buffer */
 //static CYG_RNG_BufSizeType CYG_RNG_inIdx;                         /*!< input index */
 static CYG_RNG_BufSizeType CYG_RNG_outIdx; /*!< output index */
 static CYG_RNG_BufSizeType CYG_RNG_inSize; /*!< size/number of elements in buffer */
 /*!
 * \brief Records one call edge in the trace table.
 *
 * The table is used as an open-addressed hash keyed on the
 * (call_site, this_fn) pair. A slot whose counter is 0 is free. The first
 * sighting of a pair copies \p elem into a free slot; every later sighting
 * increments that slot's counter. Collisions are resolved by linear probing,
 * so two different pairs that hash to the same index are kept apart instead
 * of being merged into one counter.
 *
 * \param elem Trace element to record. Its counter must be non-zero, because
 *             0 is the "free slot" marker.
 * \return ERR_OK on success, ERR_TXFULL if every slot is owned by another pair.
 */
 __attribute__((no_instrument_function)) static uint8_t CYG_RNG_Put(CYG_RNG_ElementType *elem) {
  const CYG_RNG_BufSizeType home = HASH_PAIR((uint32_t)elem->call_site, (uint32_t)elem->this_fn) % CYG_RNG_BUF_NOF_ELEMS;
  CYG_RNG_BufSizeType idx = home;
  do {
    CYG_RNG_ElementType *slot = &CYG_RNG_buffer[idx];
    if (slot->counter == 0) {
      /* Free slot: this pair has not been seen before. */
      *slot = *elem;
      return ERR_OK;
    }
    if (slot->this_fn == elem->this_fn && slot->call_site == elem->call_site) {
      /* Same edge again: just count it. */
      slot->counter++;
      return ERR_OK;
    }
    /* Slot belongs to a different pair: probe the next one, wrapping at the end. */
    idx++;
    if (idx == CYG_RNG_BUF_NOF_ELEMS) {
      idx = 0;
    }
  } while (idx != home);
  /* Walked the whole table without finding a home for this pair. */
  return ERR_TXFULL;
 }
 /*!
 * \brief Copies out the slot at the output index and advances that index.
 * \param elemP Destination for the copied slot.
 * \return ERR_RXEMPTY when the remaining-element count is zero, ERR_OK otherwise.
 */
 __attribute__((no_instrument_function)) static uint8_t CYG_RNG_Get(CYG_RNG_ElementType *elemP) {
  if (CYG_RNG_inSize == 0) {
    return ERR_RXEMPTY;
  }
  *elemP = CYG_RNG_buffer[CYG_RNG_outIdx++];
  CYG_RNG_inSize--;
  /* Wrap the read position at the end of the table. */
  if (CYG_RNG_outIdx == CYG_RNG_BUF_NOF_ELEMS) {
    CYG_RNG_outIdx = 0;
  }
  return ERR_OK;
 }
 static uint32_t currentTime[2];
 static uint32_t lastTime;
 /*!
 * \brief Stores a trace element into the ring buffer.
 * \param this_fn Address of the caller function.
 * \param call_site Return address to the function which called this_fn
 * \return Error code, ERR_OK if everything is ok.
 */
 __attribute__((no_instrument_function)) static void CYG_Store(void *this_fn, void *call_site) {
  CYG_RNG_ElementType elem;
  lastTime = currentTime[0];
  PMCR_Read(1, (unsigned int *)currentTime);
  //elem.isEnter = isEnter;
  elem.call_site = call_site;
  elem.this_fn = this_fn;
  elem.counter = 1;  //currentTime[0] - lastTime;
  CYG_RNG_Put(&elem);
 }
 /*!
 * \brief Function-entry hook; the call is inserted by the compiler.
 *
 * Does nothing unless tracing is enabled, otherwise forwards the address pair
 * to CYG_Store().
 * \param this_fn First hook argument, forwarded as CYG_Store()'s call_site.
 * \param call_site Second hook argument, forwarded as CYG_Store()'s this_fn.
 */
 __attribute__((no_instrument_function)) void __cyg_profile_func_enter(void *this_fn, void *call_site) {
  if (CYG_Enabled) {
    /* NOTE(review): the arguments are passed in swapped order relative to
     * CYG_Store(this_fn, call_site), so the stored this_fn field holds
     * call_site and vice versa. Presumably deliberate so the dump lists the
     * caller first; confirm against whatever consumes the trace output. */
    CYG_Store(call_site, this_fn);
  }
 }
 /*!
 * \brief Function-exit hook; the call is inserted by the compiler.
 *
 * Intentionally empty: only the entry hook records anything.
 * \param this_fn Unused.
 * \param call_site Unused.
 */
 __attribute__((no_instrument_function)) void __cyg_profile_func_exit(__attribute__((unused)) void *this_fn, __attribute__((unused)) void *call_site) {
 }
 /*!
 * \brief Dumps the trace to the console.
 */
 __attribute__((no_instrument_function)) void CYG_PrintCallTrace(void) {
  CYG_RNG_BufSizeType i;
  char buf[40];
  CYG_RNG_ElementType elem;
  uint8_t res;
  CYG_Enabled = false;
  printf("0x%08x\n", ((unsigned int)&_etext) - BASE_ADDRESS);
  //printf("Function Trace:\r\n");
  CYG_RNG_outIdx = 0;
  for (i = 0; i < CYG_RNG_BUF_NOF_ELEMS; i++) {
    buf[0] = '\0';
    res = CYG_RNG_Get(&elem);
    if (res == ERR_OK && elem.call_site != NULL) {
      snprintf(buf, sizeof(buf), "{ 0x%" PRIXPTR " 0x%" PRIXPTR " %u\r\n", (uintptr_t)(elem.this_fn) & CYG_THUMB_MASK, (uintptr_t)(elem.call_site) & CYG_THUMB_MASK, (unsigned int)elem.counter);
      printf(buf);
    } else {
      //printf("ERROR getting element!\r\n");
    }
  }
  //printf("Function Trace: done!\r\n");
 }
 /*!
 * \brief Resets the trace table and starts tracing.
 *
 * Does nothing when tracing is already enabled. Otherwise clears the table,
 * the reader state and the time samples, starts performance counter 1 in
 * elapsed-time / CPU-cycle mode, and only then raises the enable flag.
 */
 __attribute__((no_instrument_function)) void CYG_Init(void) {
  if (CYG_Enabled) {
    return;
  }
  CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS;
  CYG_RNG_outIdx = 0;
  currentTime[0] = currentTime[1] = 0;
  lastTime = 0;
  memset(CYG_RNG_buffer, 0, sizeof(CYG_RNG_buffer));
  PMCR_Init(1, PMCR_ELAPSED_TIME_MODE, PMCR_COUNT_CPU_CYCLES);
  /* Enable last: an instrumented function running while the flag is set
   * (e.g. from an interrupt) would otherwise record into a table that is
   * about to be wiped, and sample a counter that has not been started. */
  CYG_Enabled = true;
 }
 /*!
 * \brief Stops tracing and wipes the trace table and reader state.
 */
 __attribute__((no_instrument_function)) void CYG_Deinit(void) {
  CYG_Enabled = false;
  CYG_RNG_outIdx = 0;
  CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS;
  memset(CYG_RNG_buffer, 0, sizeof CYG_RNG_buffer);
 }
 #else
 /* Tracing compiled out (CYG_FUNC_TRACE_ENABLED is 0): the public entry
  * points are kept as no-ops so callers still build and link. */
 void CYG_PrintCallTrace(void){}
 void CYG_Init(void){}
 void CYG_Deinit(void){}
 #endif
--- a/GL/cygprofile.h
+++ b/GL/cygprofile.h
@ -0,0 +1,33 @@
 #pragma once
 #ifndef CYGPROFILE_H_
 #define CYGPROFILE_H_
 /* Based on the idea from Erich Styger */
 /* profiled instrument guided profiling for gldc on hardware */
 /* inline + no_instrument_function: the function gets no enter/exit hooks. */
 #define NO_INSTRUMENT inline __attribute__((no_instrument_function))
 /* NO_INSTRUMENT plus always_inline; linkage is left to the use site. */
 #define INLINE_DEBUG NO_INSTRUMENT __attribute__((always_inline))
 /* Same as INLINE_DEBUG but with internal (static) linkage. */
 #define INLINE_ALWAYS static NO_INSTRUMENT __attribute__((always_inline))
 /* Only its address is used (printed relative to BASE_ADDRESS by the trace dump).
  * Presumably the linker-provided end-of-text symbol - TODO confirm. */
 extern char _etext;
 /* Subtracted from &_etext in the trace dump; presumably the program load address - TODO confirm. */
 #define BASE_ADDRESS 0x8c010000
 #define CYG_FUNC_TRACE_ENABLED (1)
 /*!< 1: Trace enabled, 0: trace disabled */
 /*!
 * \brief Print the call trace to the terminal.
 */
 void CYG_PrintCallTrace(void);
 /*!
 * \brief Driver Initialization.
 */
 void CYG_Init(void);
 /*!
 * \brief Driver De-Initialization.
 */
 void CYG_Deinit(void);
 #endif /* CYGPROFILE_H_ */
--- a/GL/draw.c
+++ b/GL/draw.c
@ -56,7 +56,7 @@ void _glInitAttributePointers() {
    NORMAL_POINTER.size = 3;
 }
-static inline GLuint byte_size(GLenum type) {
+static INLINE_DEBUG GLuint byte_size(GLenum type) {
    switch(type) {
    case GL_BYTE: return sizeof(GLbyte);
    case GL_UNSIGNED_BYTE: return sizeof(GLubyte);
@ -513,7 +513,7 @@ PVRHeader* _glSubmissionTargetHeader(SubmissionTarget* target) {
    return aligned_vector_at(&target->output->vector, target->header_offset);
 }
-Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
+INLINE_DEBUG Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
    assert(target->start_offset < target->output->vector.size);
    return aligned_vector_at(&target->output->vector, target->start_offset);
 }
@ -1006,6 +1006,7 @@ static void mat_transform_normal3(const float* xyz, const float* xyzOut, const u
 static void light(SubmissionTarget* target) {
 #if 0
    typedef struct {
        float xyz[3];
        float n[3];
@ -1057,6 +1058,35 @@ static void light(SubmissionTarget* target) {
        vertex->bgra[G8IDX] = (GLubyte) (255.0f * fminf(total[1], 1.0f));
        vertex->bgra[B8IDX] = (GLubyte) (255.0f * fminf(total[2], 1.0f));
    }
 #endif
    if(!_glIsLightingEnabled()) {
        return;
    }
    static AlignedVector* eye_space_data = NULL;
    if(!eye_space_data) {
        eye_space_data = (AlignedVector*) malloc(sizeof(AlignedVector));
        aligned_vector_init(eye_space_data, sizeof(EyeSpaceData));
    }
    aligned_vector_resize(eye_space_data, target->count);
    /* Perform lighting calculations and manipulate the colour */
    Vertex* vertex = _glSubmissionTargetStart(target);
    VertexExtra* extra = aligned_vector_at(target->extras, 0);
    EyeSpaceData* eye_space = (EyeSpaceData*) eye_space_data->data;
    _glMatrixLoadModelView();
    mat_transform3(vertex->xyz, eye_space->xyz, target->count, sizeof(Vertex), sizeof(EyeSpaceData));
    _glMatrixLoadNormal();
    mat_transform_normal3(extra->nxyz, eye_space->n, target->count, sizeof(VertexExtra), sizeof(EyeSpaceData));
    EyeSpaceData* ES = aligned_vector_at(eye_space_data, 0);
    _glPerformLighting(vertex, ES, target->count);
 }
 static void divide(SubmissionTarget* target) {
--- a/GL/framebuffer.c
+++ b/GL/framebuffer.c
@ -1,5 +1,6 @@
 #include <stdio.h>
 #include "private.h"
 #include "config.h"
 #include "../include/glkos.h"
 #include "../include/glext.h"
@ -94,62 +95,62 @@ void APIENTRY glFramebufferTexture2DEXT(GLenum target, GLenum attachment, GLenum
    ACTIVE_FRAMEBUFFER->texture_id = texture;
 }
-static inline GLuint A1555(GLuint v) {
+static INLINE_DEBUG GLuint A1555(GLuint v) {
    const GLuint MASK = (1 << 15);
    return (v & MASK) >> 15;
 }
-static inline GLuint R1555(GLuint v) {
+static INLINE_DEBUG GLuint R1555(GLuint v) {
    const GLuint MASK = (31 << 10);
    return (v & MASK) >> 10;
 }
-static inline GLuint G1555(GLuint v) {
+static INLINE_DEBUG GLuint G1555(GLuint v) {
    const GLuint MASK = (31 << 5);
    return (v & MASK) >> 5;
 }
-static inline GLuint B1555(GLuint v) {
+static INLINE_DEBUG GLuint B1555(GLuint v) {
    const GLuint MASK = (31 << 0);
    return (v & MASK) >> 0;
 }
-static inline GLuint A4444(GLuint v) {
+static INLINE_DEBUG GLuint A4444(GLuint v) {
    const GLuint MASK = (0xF << 12);
    return (v & MASK) >> 12;
 }
-static inline GLuint R4444(GLuint v) {
+static INLINE_DEBUG GLuint R4444(GLuint v) {
    const GLuint MASK = (0xF << 8);
    return (v & MASK) >> 8;
 }
-static inline GLuint G4444(GLuint v) {
+static INLINE_DEBUG GLuint G4444(GLuint v) {
    const GLuint MASK = (0xF << 4);
    return (v & MASK) >> 4;
 }
-static inline GLuint B4444(GLuint v) {
+static INLINE_DEBUG GLuint B4444(GLuint v) {
    const GLuint MASK = (0xF << 0);
    return (v & MASK) >> 0;
 }
-static inline GLuint R565(GLuint v) {
+static INLINE_DEBUG GLuint R565(GLuint v) {
    const GLuint MASK = (31 << 11);
    return (v & MASK) >> 11;
 }
-static inline GLuint G565(GLuint v) {
+static INLINE_DEBUG GLuint G565(GLuint v) {
    const GLuint MASK = (63 << 5);
    return (v & MASK) >> 5;
 }
-static inline GLuint B565(GLuint v) {
+static INLINE_DEBUG GLuint B565(GLuint v) {
    const GLuint MASK = (31 << 0);
    return (v & MASK) >> 0;
 }
-GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) {
+static NO_INSTRUMENT GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) {
    GLushort* s1 = ((GLushort*) src);
    GLushort* s2 = ((GLushort*) src) + 1;
    GLushort* s3 = ((GLushort*) src) + srcWidth;
--- a/GL/gldc.c
+++ b/GL/gldc.c
@ -19,3 +19,7 @@
 #include "matrix.c"
 #include "state.c"
 #include "texture.c"
 #include "../containers/stack.c"
 #include "../containers/aligned_vector.c"
 #include "../containers/named_array.c"
--- a/GL/lighting.c
+++ b/GL/lighting.c
@ -281,98 +281,143 @@ static inline float FPOW(float b, float p) {
    return FEXP(FLOG(b) * p);
 }
-void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) __attribute__((optimize("fast-math")));
+#define LIGHT_COMPONENT(C) { \
-void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) {
+    const GLfloat* acm = &MA[C]; \
-    LightSource* l = &LIGHTS[light];
+    const GLfloat* dcm = &MD[C]; \
-
+    const GLfloat* scm = &MS[C]; \
-    struct vec3f L = {
+    const GLfloat* scli = &light->specular[C]; \
-        l->position[0],
+    const GLfloat* dcli = &light->diffuse[C]; \
-        l->position[1],
+    const GLfloat* acli = &light->ambient[C]; \
-        l->position[2]
+    const GLfloat* srm = &MATERIAL.exponent; \
-    };
+    const GLfloat fi = (LdotN == 0) ? 0 : 1; \
-
+    GLfloat component = (*acm * *acli); \
-    if(!l->is_directional) {
+    component += (LdotN * *dcm * *dcli); \
-        L.x -= pos[0];
+    component += (FPOW((fi * NdotH), *srm) * *scm * *scli); \
-        L.y -= pos[1];
+    component *= att; \
-        L.z -= pos[2];
+    component *= spot; \
-    }
+    final[C] += component; \
    struct vec3f N = {
        normal[0],
        normal[1],
        normal[2]
    };
    struct vec3f V = {
        pos[0],
        pos[1],
        pos[2]
    };
    GLfloat d;
    vec3f_length(L.x, L.y, L.z, d);
    GLfloat oneOverL = 1.0f / d;
    L.x *= oneOverL;
    L.y *= oneOverL;
    L.z *= oneOverL;
    vec3f_normalize(V.x, V.y, V.z);
    GLfloat NdotL, VdotN;
    vec3f_dot(N.x, N.y, N.z, L.x, L.y, L.z, NdotL);
    vec3f_dot(V.x, V.y, V.z, N.x, N.y, N.z, VdotN);
    GLfloat VdotR = VdotN - NdotL;
    GLfloat specularPower = FPOW(VdotR > 0 ? VdotR : 0, MATERIAL.exponent);
    GLboolean colorMaterial = _glIsColorMaterialEnabled();
    GLfloat mD [] = {
        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.diffuse[0],
        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.diffuse[1],
        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.diffuse[2],
        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.diffuse[3]
    };
    GLfloat mA [] = {
        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.ambient[0],
        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.ambient[1],
        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.ambient[2],
        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.ambient[3]
    };
    GLfloat mS [] = {
        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.specular[0],
        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.specular[1],
        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.specular[2],
        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.specular[3]
    };
    colour[0] = l->ambient[0] * mA[0];
    colour[1] = l->ambient[1] * mA[1];
    colour[2] = l->ambient[2] * mA[2];
    colour[3] = mD[3];
    if(NdotL >= 0) {
        colour[0] += (l->diffuse[0] * mD[0] * NdotL + l->specular[0] * mS[0] * specularPower);
        colour[1] += (l->diffuse[1] * mD[1] * NdotL + l->specular[1] * mS[1] * specularPower);
        colour[2] += (l->diffuse[2] * mD[2] * NdotL + l->specular[2] * mS[2] * specularPower);
    }
    if(!l->is_directional) {
        GLfloat att = (
            1.0f / (l->constant_attenuation + (l->linear_attenuation * d) + (l->quadratic_attenuation * d * d))
        );
        colour[0] *= att;
        colour[1] *= att;
        colour[2] *= att;
    }
    if(colour[0] > 1.0f) colour[0] = 1.0f;
    if(colour[1] > 1.0f) colour[1] = 1.0f;
    if(colour[2] > 1.0f) colour[2] = 1.0f;
    if(colour[3] > 1.0f) colour[3] = 1.0f;
 }
 /* Dot product of two 3-vectors passed by component pointer, with negative
  * results replaced by zero. */
 static inline float vec3_dot_limited(
        const float* x1, const float* y1, const float* z1,
        const float* x2, const float* y2, const float* z2) {
    float dot;
    vec3f_dot(*x1, *y1, *z1, *x2, *y2, *z2, dot);
    if(dot < 0) {
        return 0;
    }
    return dot;
 }
 /* Lights `count` vertices in place.
  *
  * For every vertex the colour starts as scene ambient * material ambient +
  * material emissive (alpha taken from the diffuse material), then each
  * enabled light adds its ambient/diffuse/specular contribution through
  * LIGHT_COMPONENT. The result is clamped to 255 and written back to
  * vertex->bgra.
  *
  * vertices: vertices whose bgra is read (when colour material is on) and
  *           overwritten.
  * es:       one EyeSpaceData per vertex, supplying position (xyz) and
  *           normal (n) used for the lighting vectors.
  * count:    number of vertices / EyeSpaceData entries.
  *
  * A light whose position[3] is 0 is treated as directional: its position is
  * used as the light direction as-is (no vertex position subtracted) and it
  * is not attenuated.
  */
 void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count) {
    int8_t i;
    int32_t j;
    const LightSource* light = NULL;
    const GLboolean colorMaterial = _glIsColorMaterialEnabled();
    const GLboolean isDiffuseCM = isDiffuseColorMaterial();
    const GLboolean isAmbientCM = isAmbientColorMaterial();
    const GLboolean isSpecularCM = isSpecularColorMaterial();
    static GLfloat CM[4];
    /* So the DC has 16 floating point registers, that means
     * we need to limit the number of floats as much as possible
     * to give the compiler a good enough chance to do the right
     * thing */
    Vertex* vertex = vertices;
    const EyeSpaceData* data = es;
    static const float ONE_OVER_255 = 1.0f / 255.0f;
    for(j = 0; j < count; ++j, ++vertex, ++data) {
        /* When GL_COLOR_MATERIAL is on, we need to pull out
         * the passed in diffuse and use it */
        const GLfloat* MD = MATERIAL.diffuse;
        const GLfloat* MA = MATERIAL.ambient;
        const GLfloat* MS = MATERIAL.specular;
        if(colorMaterial) {
            CM[0] = ((GLfloat) vertex->bgra[R8IDX]) * ONE_OVER_255;
            CM[1] = ((GLfloat) vertex->bgra[G8IDX]) * ONE_OVER_255;
            CM[2] = ((GLfloat) vertex->bgra[B8IDX]) * ONE_OVER_255;
            CM[3] = ((GLfloat) vertex->bgra[A8IDX]) * ONE_OVER_255;
            MD = (isDiffuseCM) ? CM : MATERIAL.diffuse;
            MA = (isAmbientCM) ? CM : MATERIAL.ambient;
            MS = (isSpecularCM) ? CM : MATERIAL.specular;
        }
        float final[4];
        /* Initial, non-light related values */
        final[0] = (SCENE_AMBIENT[0] * MA[0]) + MATERIAL.emissive[0];
        final[1] = (SCENE_AMBIENT[1] * MA[1]) + MATERIAL.emissive[1];
        final[2] = (SCENE_AMBIENT[2] * MA[2]) + MATERIAL.emissive[2];
        final[3] = MD[3];
        /* V: unit vector from the vertex back towards the eye-space origin */
        float Vx, Vy, Vz;
        Vx = -data->xyz[0];
        Vy = -data->xyz[1];
        Vz = -data->xyz[2];
        vec3f_normalize(Vx, Vy, Vz);
        for(i = 0; i < MAX_LIGHTS; ++i) {
            if(!_glIsLightEnabled(i)) continue;
            /* Calc light specific parameters */
            light = &LIGHTS[i];
            float Lx, Ly, Lz, D;
            float Hx, Hy, Hz;
            const float* Nx = &data->n[0];
            const float* Ny = &data->n[1];
            const float* Nz = &data->n[2];
            /* position[3] == 0 marks a directional light (same test the
             * attenuation below relies on) */
            const GLboolean directional = (light->position[3] == 0.0f);
            Lx = light->position[0];
            Ly = light->position[1];
            Lz = light->position[2];
            if(!directional) {
                /* Positional light: L points from the vertex to the light.
                 * A directional light keeps its position as the direction;
                 * subtracting the vertex position would make the direction
                 * vary per vertex. */
                Lx -= data->xyz[0];
                Ly -= data->xyz[1];
                Lz -= data->xyz[2];
            }
            vec3f_length(Lx, Ly, Lz, D);
            {
                /* Normalize L - scoping ensures Llen is temporary */
                const float Llen = 1.0f / D;
                Lx *= Llen;
                Ly *= Llen;
                Lz *= Llen;
            }
            /* H: normalized half vector between L and V */
            Hx = (Lx + Vx);
            Hy = (Ly + Vy);
            Hz = (Lz + Vz);
            vec3f_normalize(Hx, Hy, Hz);
            const float LdotN = vec3_dot_limited(
                &Lx, &Ly, &Lz,
                Nx, Ny, Nz
            );
            const float NdotH = vec3_dot_limited(
                Nx, Ny, Nz,
                &Hx, &Hy, &Hz
            );
            /* Distance attenuation only applies to positional lights */
            const float att = directional ? 1.0f :
                1.0f / (light->constant_attenuation + (light->linear_attenuation * D) + (light->quadratic_attenuation * D * D));
            /* Spotlight factor is fixed at 1 here */
            const float spot = 1.0f;
            LIGHT_COMPONENT(0);
            LIGHT_COMPONENT(1);
            LIGHT_COMPONENT(2);
        }
        vertex->bgra[R8IDX] = (GLubyte)(fminf(final[0] * 255.0f, 255.0f));
        vertex->bgra[G8IDX] = (GLubyte)(fminf(final[1] * 255.0f, 255.0f));
        vertex->bgra[B8IDX] = (GLubyte)(fminf(final[2] * 255.0f, 255.0f));
        vertex->bgra[A8IDX] = (GLubyte)(fminf(final[3] * 255.0f, 255.0f));
    }
 }
--- a/GL/matrix.c
+++ b/GL/matrix.c
@ -476,84 +476,57 @@ void APIENTRY glDepthRange(GLclampf n, GLclampf f) {
    DEPTH_RANGE_MULTIPLIER_H = (n + f) / 2.0f;
 }
 #include "sh4_math.h"
 /* Vector Cross Product - Used by glhLookAtf2 */
-static inline void vec3f_cross(const GLfloat* v1, const GLfloat* v2, GLfloat* result) {
+static inline void vec3f_cross(GLfloat* v1, GLfloat* v2, GLfloat* result) {
-    result[0] = v1[1] * v2[2] - v1[2] * v2[1];
+    result[0] = (v1[1] * v2[2]) - (v1[2] * v2[1]);
-    result[1] = v1[2] * v2[0] - v1[0] * v2[2];
+    result[1] = (v1[2] * v2[0]) - (v1[0] * v2[2]);
-    result[2] = v1[0] * v2[1] - v1[1] * v2[0];
+    result[2] = (v1[0] * v2[1]) - (v1[1] * v2[0]);
 }
 /* glhLookAtf2 adapted from http://www.opengl.org/wiki/GluLookAt_code */
 void glhLookAtf2(const GLfloat* eyePosition3D,
                 const GLfloat* center3D,
                 const GLfloat* upVector3D) {
-    /* Look-At Matrix */
+static inline void vec3f_normalize_sh4(float *v){
-    static Matrix4x4 MatrixLookAt __attribute__((aligned(32))) = {
+    float	length, ilength;
        1.0f, 0.0f, 0.0f, 0.0f,
        0.0f, 1.0f, 0.0f, 0.0f,
        0.0f, 0.0f, 1.0f, 0.0f,
        0.0f, 0.0f, 0.0f, 1.0f
    };
-    GLfloat forward[3];
+	ilength = MATH_fsrra(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
-    GLfloat side[3];
+	length = MATH_Invert(ilength);
-    GLfloat up[3];
+	if (length)
-
+	{
-    vec3f_sub_normalize(center3D[0], center3D[1], center3D[2],
+		v[0] *= ilength;
-                        eyePosition3D[0], eyePosition3D[1], eyePosition3D[2],
+		v[1] *= ilength;
-                        forward[0], forward[1], forward[2]);
+		v[2] *= ilength;
-
+	}
    //Side = forward x up
    vec3f_cross(forward, upVector3D, side);
    vec3f_normalize(side[0], side[1], side[2]);
    //Recompute up as: up = side x forward
    vec3f_cross(side, forward, up);
    MatrixLookAt[M0] = side[0];
    MatrixLookAt[M4] = side[1];
    MatrixLookAt[M8] = side[2];
    MatrixLookAt[M12] = 0;
    MatrixLookAt[M1] = up[0];
    MatrixLookAt[M5] = up[1];
    MatrixLookAt[M9] = up[2];
    MatrixLookAt[M13] = 0;
    MatrixLookAt[M2] = -forward[0];
    MatrixLookAt[M6] = -forward[1];
    MatrixLookAt[M10] = -forward[2];
    MatrixLookAt[M14] = 0;
    MatrixLookAt[M3] = MatrixLookAt[11] = MatrixLookAt[15] = 0;
    MatrixLookAt[M15] = 1;
    static Matrix4x4 trn __attribute__((aligned(32))) = {
        1.0f, 0.0f, 0.0f, 0.0f,
        0.0f, 1.0f, 0.0f, 0.0f,
        0.0f, 0.0f, 1.0f, 0.0f,
        0.0f, 0.0f, 0.0f, 1.0f
    };
    trn[M12] = -eyePosition3D[0];
    trn[M13] = -eyePosition3D[1];
    trn[M14] = -eyePosition3D[2];
    // Does not modify internal Modelview matrix
    upload_matrix(&MatrixLookAt);
    multiply_matrix(&trn);
    multiply_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)));
    download_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)));
 }
 void gluLookAt(GLfloat eyex, GLfloat eyey, GLfloat eyez, GLfloat centerx,
               GLfloat centery, GLfloat centerz, GLfloat upx, GLfloat upy,
               GLfloat upz) {
-    GLfloat eye [] = { eyex, eyey, eyez };
+    GLfloat m [16];
-    GLfloat point [] = { centerx, centery, centerz };
+   	GLfloat f [3];
-    GLfloat up [] = { upx, upy, upz };
+	GLfloat u [3];
-    glhLookAtf2(eye, point, up);
+	GLfloat s [3];
 	f[0] = centerx - eyex;
 	f[1] = centery - eyey;
 	f[2] = centerz - eyez;
 	u[0] = upx;
 	u[1] = upy;
 	u[2] = upz;
    vec3f_normalize_sh4(f);
 	vec3f_cross(f, u, s);
    vec3f_normalize_sh4(s);
 	vec3f_cross(s, f, u);
 	m[0] =  s[0]; m[4] =  s[1]; m[8] =   s[2]; m[12] = 0.0f;
 	m[1] =  u[0]; m[5] =  u[1]; m[9] =   u[2]; m[13] = 0.0f;
 	m[2] = -f[0]; m[6] = -f[1]; m[10] = -f[2]; m[14] = 0.0f;
    m[3] =   0.0f; m[7] =   0.0f; m[11] =   0.0f; m[15] = 1.0f;
 	glMultMatrixf(m);
 	glTranslatef(-eyex, -eyey, -eyez);
 }
 void _glApplyRenderMatrix() {
--- a/GL/perfctr.c
+++ b/GL/perfctr.c
@ -0,0 +1,247 @@
 // ---- perfctr.c - SH7091 Performance Counter Module Code ----
 //
 // This file is part of the DreamHAL project, a hardware abstraction library
 // primarily intended for use on the SH7091 found in hardware such as the SEGA
 // Dreamcast game console.
 //
 // The performance counter module is hereby released into the public domain in
 // the hope that it may prove useful. Now go profile some code and hit 60 fps! :)
 //
 // --Moopthehedgehog
 // See perfctr.h for more of my notes and documentation on these counters.
 #include "perfctr.h"
 #include "cygprofile.h"
 #if CYG_FUNC_TRACE_ENABLED
 static unsigned char pmcr_enabled = 0;
 //
 // Clear-and-start entry point for the performance counters.
 // Clearing a counter before its first start is good practice.
 //
 // Disabling and re-enabling a counter does not reset it; the clear has to
 // happen while the counter is disabled.
 //
 // A counter can be disabled and re-enabled in a different mode without an
 // explicit clear, in which case it continues from where it left off.
 //
 // Does nothing if the requested counter is already running.
 //
 __attribute__((no_instrument_function)) void PMCR_Init(int which, unsigned short mode, unsigned char count_type)
 {
 	unsigned char may_start = 0;

 	switch(which)
 	{
 		case 1: // counter 1: idle, or only counter 2 running
 			may_start = (pmcr_enabled == 0) || (pmcr_enabled == 2);
 			break;
 		case 2: // counter 2: idle, or only counter 1 running
 			may_start = (pmcr_enabled == 0) || (pmcr_enabled == 1);
 			break;
 		case 3: // both: neither may be running
 			may_start = (pmcr_enabled == 0);
 			break;
 		default: // unknown selector: nothing to do
 			break;
 	}

 	if(may_start)
 	{
 		PMCR_Enable(which, mode, count_type, PMCR_RESET_COUNTER);
 	}
 }
 // Starts one or both performance counters with the given mode, clock type
 // and reset flag. Does nothing if count_type or reset_count is not 0/1, or
 // if the requested counter is already running.
 __attribute__((no_instrument_function)) void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_count)
 {
 	// Both flags must be 0 or 1
 	if((count_type | reset_count) > 1)
 	{
 		return;
 	}

 	// Control word assembled from the parameters
 	const unsigned short pmcr_ctrl = PMCR_RUN_COUNTER | (reset_count << PMCR_RESET_COUNTER_SHIFT) | (count_type << PMCR_CLOCK_TYPE_SHIFT) | mode;
 	volatile unsigned short * const ctrl1 = (volatile unsigned short*)PMCR1_CTRL_REG;
 	volatile unsigned short * const ctrl2 = (volatile unsigned short*)PMCR2_CTRL_REG;

 	switch(which)
 	{
 		case 1: // counter 1
 			if((pmcr_enabled == 0) || (pmcr_enabled == 2))
 			{
 				*ctrl1 = pmcr_ctrl;
 				pmcr_enabled += 1;
 			}
 			break;
 		case 2: // counter 2
 			if((pmcr_enabled == 0) || (pmcr_enabled == 1))
 			{
 				*ctrl2 = pmcr_ctrl;
 				pmcr_enabled += 2;
 			}
 			break;
 		case 3: // both
 			if(pmcr_enabled == 0)
 			{
 				*ctrl1 = pmcr_ctrl;
 				*ctrl2 = pmcr_ctrl;
 				pmcr_enabled = 3;
 			}
 			break;
 		default:
 			break;
 	}
 }
 // For reference:
 // #define PMCTR1H_REG 0xFF100004
 // #define PMCTR1L_REG 0xFF100008
 // #define PMCTR2H_REG 0xFF10000C
 // #define PMCTR2L_REG 0xFF100010
 static const unsigned int pmcr1_regh = PMCTR1H_REG;
 static const unsigned int pmcr1_regl = PMCTR1L_REG;
 static const unsigned int pmcr2_regh = PMCTR2H_REG;
 static const unsigned int pmcr2_regl = PMCTR2L_REG;
 // Reads one performance counter into out_array.
 // Sorry, can only read one counter at a time!
 // out_array should be an array consisting of 2x unsigned ints.
 //
 // which:     1 or 2, selecting the counter to read.
 // out_array: [0] receives the low 32 bits, [1] receives the high word
 //            masked to 16 bits.
 //
 // If the selected counter is running its value is stored. If no counter is
 // enabled at all, both words are set to 0. Any other combination (bad
 // selector, or the selected counter is not the running one) stores
 // 0xffff / 0xffffffff.
 __attribute__((no_instrument_function)) void PMCR_Read(int which, volatile unsigned int *out_array)
 {
 // if pmcr is not enabled, this function will just return 0
 	// little endian (big endian would need to flip [0] and [1])
 	// Note: These reads really do need to be done in assembly: unfortunately it
 	// appears that using C causes GCC to insert a branch right smack in between
 	// the high and low reads of perf counter 2 (with a nop, so it's literally
 	// delaying the reads by several cycles!), which is totally insane. Doing it
 	// the assembly way ensures that nothing ridiculous like that happens. It's
 	// also portable between versions of GCC that do put the nonsensical branch in.
 	//
 	// One thing that would be nice is if SH4 had the movi20s instruction to make
 	// absolute addresses in 3 cycles, but only the SH2A has that... :(
 	if( (which == 1) && (pmcr_enabled & 0x1) )
 	{
 		// counter 1
 //		out_array[1] = *((volatile unsigned int*)PMCTR1H_REG) & 0xffff;
 //		out_array[0] = *((volatile unsigned int*)PMCTR1L_REG);
 		asm volatile("mov.l %[reg1h],r1\n\t" // load counter address (high)
 								 "mov.l %[reg1l],r2\n\t" // load counter address (low)
 								 "mov.l @r1,r1\n\t" // read counter (high)
 								 "mov.l @r2,r2\n\t" // read counter (low)
 								 "extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff
 								 "mov.l r1,%[outh]\n\t" // get data to memory
 								 "mov.l r2,%[outl]\n\t" // get data to memory
 		: [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0])
 		: [reg1h] "m" (pmcr1_regh), [reg1l] "m" (pmcr1_regl) // SH4 can't mov an immediate longword into a register...
 		: "r1", "r2"
 		);
 	}
 	else if( (which == 2) && (pmcr_enabled & 0x2) )
 	{
 		// counter 2
 //		out_array[1] = *((volatile unsigned int*)PMCTR2H_REG) & 0xffff;
 //		out_array[0] = *((volatile unsigned int*)PMCTR2L_REG);
 		asm volatile("mov.l %[reg2h],r1\n\t" // load counter address (high)
 								 "mov.l %[reg2l],r2\n\t" // load counter address (low)
 								 "mov.l @r1,r1\n\t" // read counter (high)
 								 "mov.l @r2,r2\n\t" // read counter (low)
 								 "extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff
 								 "mov.l r1,%[outh]\n\t" // get data to memory
 								 "mov.l r2,%[outl]\n\t" // get data to memory
 		: [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0])
 		: [reg2h] "m" (pmcr2_regh), [reg2l] "m" (pmcr2_regl) // SH4 can't mov an immediate longword into a register...
 		: "r1", "r2"
 		);
 	}
 	else if(!pmcr_enabled)
 	{
 		// Nothing is running: report zero
 		out_array[1] = 0;
 		out_array[0] = 0;
 	}
 	else // Invalid
 	{
 		// All-ones marker (high word limited to 16 bits)
 		out_array[1] = 0xffff;
 		out_array[0] = 0xffffffff;
 	}
 }
 // Stops the selected running counter(s) and starts them again from 0.
 // NOTE: It does not appear to be possible to clear a counter while it is running.
 // Does nothing if the selected counter is not currently running.
 __attribute__((no_instrument_function)) void PMCR_Restart(int which, unsigned short mode, unsigned char count_type)
 {
 	unsigned char running;

 	switch(which)
 	{
 		case 1: // counter 1
 			running = (pmcr_enabled & 0x1) != 0;
 			break;
 		case 2: // counter 2
 			running = (pmcr_enabled & 0x2) != 0;
 			break;
 		case 3: // both must be running
 			running = (pmcr_enabled == 3);
 			break;
 		default:
 			running = 0;
 			break;
 	}

 	if(running)
 	{
 		PMCR_Stop(which);
 		PMCR_Enable(which, mode, count_type, PMCR_RESET_COUNTER);
 	}
 }
 // Stops a running counter; does nothing if the selected counter is already
 // stopped or disabled.
 //
 // Clearing only works while a counter is disabled. Stopping via the 0x2000
 // bit holds the data in the data registers, whereas disabling without that
 // bit reads back as all 0 (but does not clear the counter for the next
 // start). Clearing itself is handled by PMCR_Enable().
 __attribute__((no_instrument_function)) void PMCR_Stop(int which)
 {
 	volatile unsigned short * const ctrl1 = (volatile unsigned short*)PMCR1_CTRL_REG;
 	volatile unsigned short * const ctrl2 = (volatile unsigned short*)PMCR2_CTRL_REG;

 	switch(which)
 	{
 		case 1: // counter 1
 			if(pmcr_enabled & 0x1)
 			{
 				*ctrl1 = PMCR_STOP_COUNTER;
 				pmcr_enabled &= 0x2;
 			}
 			break;
 		case 2: // counter 2
 			if(pmcr_enabled & 0x2)
 			{
 				*ctrl2 = PMCR_STOP_COUNTER;
 				pmcr_enabled &= 0x1;
 			}
 			break;
 		case 3: // both, only when both are running
 			if(pmcr_enabled == 3)
 			{
 				*ctrl1 = PMCR_STOP_COUNTER;
 				*ctrl2 = PMCR_STOP_COUNTER;
 				pmcr_enabled = 0;
 			}
 			break;
 		default:
 			break;
 	}
 }
 // Disables the selected counter(s) unconditionally.
 // Note that disabling does NOT clear the counter. It may look that way
 // because a disabled counter reads back as 0, but re-enabling without first
 // clearing simply continues where it left off.
 __attribute__((no_instrument_function)) void PMCR_Disable(int which)
 {
 	volatile unsigned short * const ctrl1 = (volatile unsigned short*)PMCR1_CTRL_REG;
 	volatile unsigned short * const ctrl2 = (volatile unsigned short*)PMCR2_CTRL_REG;

 	switch(which)
 	{
 		case 1: // counter 1
 			*ctrl1 = PMCR_DISABLE_COUNTER;
 			pmcr_enabled &= 0x2;
 			break;
 		case 2: // counter 2
 			*ctrl2 = PMCR_DISABLE_COUNTER;
 			pmcr_enabled &= 0x1;
 			break;
 		case 3: // both
 			*ctrl1 = PMCR_DISABLE_COUNTER;
 			*ctrl2 = PMCR_DISABLE_COUNTER;
 			pmcr_enabled = 0;
 			break;
 		default:
 			break;
 	}
 }
 #endif
--- a/GL/perfctr.h
+++ b/GL/perfctr.h
@ -0,0 +1,316 @@
 // ---- perfctr.h - SH7091 Performance Counter Module Header ----
 //
 // This file is part of the DreamHAL project, a hardware abstraction library
 // primarily intended for use on the SH7091 found in hardware such as the SEGA
 // Dreamcast game console.
 //
 // The performance counter module is hereby released into the public domain in
 // the hope that it may prove useful. Now go profile some code and hit 60 fps! :)
 //
 // --Moopthehedgehog
 //
 #ifndef __PERFCTR_H__
 #define __PERFCTR_H__
 //
 // -- General SH4 Performance Counter Notes --
 //
 // There are 2 performance counters that can measure elapsed time. They are each
 // 48-bit counters. They are part of the so-called "ASE" subsystem, which you can
 // read about in chapter 13 of the "SuperH™ (SH) 32-bit RISC series SH-4, ST40
 // system architecture, volume 1: system":
 // https://www.st.com/content/ccc/resource/technical/document/user_manual/36/75/05/ac/e8/7e/42/2d/CD00147163.pdf/files/CD00147163.pdf/jcr:content/translations/en.CD00147163.pdf
 //
 // They can count cycles, so that's 199.5MHz (not 200MHz!!) a.k.a. roughly 5 ns
 // increments. At 5 ns increments, a 48-bit cycle counter can run continuously
 // for 16.33 days. It's actually 16 days, 7 hours, 55 minutes, and 2 seconds,
 // depending on how close the bus clock is to 99.75MHz. There is also a second
 // mode that counts cycles according to a ratio between the CPU frequency and
 // the system bus clock, and it increments the counter by 12 every bus cycle.
 // This second mode is detailed in the description for PMCR_CLOCK_TYPE in this
 // file, and it is recommended for use when the CPU frequency is not a runtime
 // constant.
 //
 // Side note: The counters don't have an overflow interrupt or overflow bit.
 // (I did actually run one to 48-bit overflow in elapsed time mode using the
 // ratio method to check this. They don't appear to sign-extend the upper 16
 // bits in elapsed time mode, either.)
 //
 // The two counters are functionally identical. I would recommend using the
 // PMCR_Init() function to start one (or both) up the first time.
 //
 // -- Configuration Address Info --
 //
 // Addresses for these counters can be easily seen here, in lxdream's source code:
 // https://github.com/lutris/lxdream/blob/master/src/sh4/sh4mmio.h
 //
 // They are also on display in the Linux kernel, but at the time of writing appear
 // to be set incorrectly (the clock mode at bit 0x100 is never set or cleared,
 // for example, so they're at the mercy of whatever the hardware defaults are):
 // http://git.lpclinux.com/cgit/linux-2.6.28.2-lpc313x/plain/arch/sh/oprofile/op_model_sh7750.c
 // https://github.com/torvalds/linux/blob/master/arch/sh/kernel/cpu/sh4/perf_event.c
 // ...It also appears as though they may not be handling bus ratio mode correctly,
 // which appears to be the default mode on the Dreamcast in all my tests.
 //
 // You can also find these addresses by ripping a copy of Virtua Fighter 3 that
 // you own for Dreamcast and looking at the raw byte code (or a raw disassembly)
 // of its main program binary. It would appear as though they were timing a loop
 // with the low half of perf counter 1 in elapsed time mode. Definitely seems
 // like a good thing to do when targeting 60fps! Shenmue Disc 4 also uses the
 // same configuration, but what's being timed is not as clear.
 //
 // Another place you can actually find both control addresses 0xFF00008x and all
 // data addresses 0xFF10000x is in binaries of ancient, freely available versions
 // of CodeScape. Literally all you need to do is open an SH7750-related DLL in a
 // hex editor and do a search to find the control register addresses, and the
 // data addresses are equally plain to see in any relevant performance profiling
 // firmware. There's no effort or decryption required to find them whatsoever;
 // all you need is an old trial version and a hex editor.
 //
 // However, something even better than all of that is if you search for "SH4
 // 0xFF000084" (without quotes) online you'll find an old forum where some logs
 // were posted of the terminal/command prompt output from some STMicro JTAG tool,
 // which not only has the address registers but also clearly characterizes their
 // size as 16-bit:
 // https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2
 //
 // -- Event Mode Info --
 //
 // Specific information on each counter mode can be found in the document titled
 // "SuperH™ Family E10A-USB Emulator: Additional Document for User’s Manual:
 // Supplementary Information on Using the SH7750R Renesas Microcomputer Development Environment System"
 // which is available on Renesas's website, in the "Documents" section of the
 // E10A-USB product page:
 // https://www.renesas.com/us/en/products/software-tools/tools/emulator/e10a-usb.html
 // At the time of writing (12/2019), the E10A-USB adapter is still available
 // for purchase, and it is priced around $1200 (USD).
 //
 // Appendix C of the "ST40 Micro Toolset Manual" also has these modes documented:
 // https://www.st.com/content/ccc/resource/technical/document/user_manual/c5/98/11/89/50/68/41/66/CD17379953.pdf/files/CD17379953.pdf/jcr:content/translations/en.CD17379953.pdf
 //
 // See here for the hexadecimal values corresponding to each mode (pg. 370):
 // http://www.macmadigan.com/BusaECU/Renesas%20documents/Hitachi_codescape_CS40_light_userguides.pdf
 // You can also find the same "Counter Description Table" in user's guide PDFs
 // bundled in ancient demo versions of CodeScape 3 from 2000 (e.g.
 // CSDemo_272.exe), which can still be found in the Internet Archive.
 // http://web.archive.org/web/*/http://codescape.com/dl/CSDemo/*
 //
 // See here for a support document on Lauterbach's SH2, SH3, and SH4 debugger,
 // which contains units for each mode (e.g. which measure time and which just
 // count): https://www.lauterbach.com/frames.html?home.html (It's in Downloads
 // -> Trace32 Help System -> it's the file called "SH2, SH3 and SH4 Debugger"
 // with the filename debugger_sh4.pdf).
 //
 //
 // --- Performance Counter Registers ---
 //
 // Control registers, one per counter. They are 16 bits only and configure the
 // performance counters.
 #define PMCR1_CTRL_REG 0xFF000084
 #define PMCR2_CTRL_REG 0xFF000088
 // Data registers. They are 32 bits each and hold the high (H) and low (L)
 // parts of each 48-bit counter.
 #define PMCTR1H_REG 0xFF100004
 #define PMCTR1L_REG 0xFF100008
 #define PMCTR2H_REG 0xFF10000C
 #define PMCTR2L_REG 0xFF100010
 //
 // --- Performance Counter Configuration Flags ---
 //
 // These bits' functions are currently unknown, but they may simply be reserved.
 // It's possible that there's a [maybe expired?] patent that details the
 // configuration registers, though I haven't been able to find one. Places to
 // check would be Google Patents and the Japanese Patent Office--maybe someone
 // else can find something?
 //
 // Some notes:
 // Writing 1 to all of these bits reads back as 0, so it looks like they aren't
 // config bits. It's possible they are write-only like the stop bit, though,
 // or that they're just reserved-write-0-only. It appears that they are always
 // written with zeros in software that uses them, so that's confirmed safe to do.
 //
 // Also, after running counter 1 to overflow, it appears there's no overflow bit
 // (maybe the designers thought 48-bits would be so much to count to that they
 // didn't bother implementing one?). The upper 16-bits of the counter high
 // register are also not sign-extension bits. They may be a hidden config area,
 // but probably not because big endian mode would swap the byte order.
 // One mask per control-register bit whose function is unknown.
 #define PMCR_UNKNOWN_BIT_0040 0x0040
 #define PMCR_UNKNOWN_BIT_0080 0x0080
 #define PMCR_UNKNOWN_BIT_0200 0x0200
 #define PMCR_UNKNOWN_BIT_0400 0x0400
 #define PMCR_UNKNOWN_BIT_0800 0x0800
 #define PMCR_UNKNOWN_BIT_1000 0x1000
 // PMCR_MODE_CLEAR_INVERTED covers the low 6 bits (0x3f), which is wide enough
 // for every event mode code listed below. AND-ing a control value with
 // ~PMCR_MODE_CLEAR_INVERTED (note the '~') clears the event mode.
 #define PMCR_MODE_CLEAR_INVERTED 0x003f
 // PMCR_CLOCK_TYPE sets the counters to count clock cycles or CPU/bus ratio mode
 // cycles (where T = C x B / 24 and T is time, C is count, and B is time
 // of one bus cycle). Note: B = 1/99753008 or so, but it may vary, as mine is
 // actually 1/99749010-ish; the target frequency is probably meant to be 99.75MHz.
 //
 // See the ST40 or Renesas SH7750R documents described in the above "Event Mode
 // Info" section for more details about that formula.
 //
 // Set PMCR_CLOCK_TYPE to 0 for CPU cycle counting, where 1 count = 1 cycle, or
 // set it to 1 to use the above formula. Renesas documentation recommends using
 // the ratio version (set the bit to 1) when user programs alter CPU clock
 // frequencies. This header has some definitions later on to help with this.
 #define PMCR_CLOCK_TYPE 0x0100
 // Bit position of PMCR_CLOCK_TYPE (1 << 8 == 0x0100)
 #define PMCR_CLOCK_TYPE_SHIFT 8
 // PMCR_STOP_COUNTER is write-only, as it always reads back as 0. It does what
 // the name suggests: when this bit is written to, the counter stops. However,
 // if written to while the counter is disabled or stopped, the counter's high
 // and low registers are reset to 0.
 //
 // Using PMCR_STOP_COUNTER to stop the counter has the effect of holding the
 // data in the data registers while stopped, unlike PMCR_DISABLE_COUNTER, and
 // this bit needs to be written to again (e.g. on next start) in order to
 // actually clear the counter data for another run. If not explicitly cleared,
 // the counter will continue from where it left off before being stopped.
 #define PMCR_STOP_COUNTER 0x2000
 // Bit position of PMCR_STOP_COUNTER (1 << 13 == 0x2000). The name reflects
 // that the same bit resets the counter when written while it is not running.
 #define PMCR_RESET_COUNTER_SHIFT 13
 // Bits 0xC000 both need to be set to 1 for the counters to actually begin
 // counting. I have seen that the Linux kernel actually separates them out into
 // two separate labelled bits (PMEN and PMST) for some reason, however they do
 // not appear to do anything separately. Perhaps this is a two-bit mode where
 // 1-1 is run, 1-0 and 0-1 are ???, and 0-0 is off.
 #define PMCR_RUN_COUNTER 0xC000
 // Bit position of the lower of the two run bits (0x3 << 14 == 0xC000)
 #define PMCR_RUN_SHIFT 14
 // Interestingly, the output here writes 0x6000 to the counter config registers,
 // which would be the "PMST" bit and the "RESET" bit:
 // https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2
 // To disable a counter, just write 0 to its config register. This will not
 // reset the counter to 0, as that requires an explicit clear via setting the
 // PMCR_STOP_COUNTER bit. What's odd is that a disabled counter's data
 // registers read back as all 0, but re-enabling it without a clear will
 // continue from the last value before disabling.
 #define PMCR_DISABLE_COUNTER 0x0000
 // These definitions merely separate out the two PMCR_RUN_COUNTER bits
 // (PMCR_PMST_BIT | PMCR_PMEN_BIT == PMCR_RUN_COUNTER), and they are included
 // here for documentation purposes. Each *_SHIFT value is the bit position of
 // the matching *_BIT mask.
 // PMST may mean PMCR START. It's consistently used to enable the counter.
 // I'm just calling it PMST here for lack of a better name, since this is what
 // the Linux kernel and lxdream call it. It could also have something to do with
 // a mode specific to STMicroelectronics.
 #define PMCR_PMST_BIT 0x4000
 #define PMCR_PMST_SHIFT 14
 // Likewise PMEN may mean PMCR ENABLE
 #define PMCR_PMEN_BIT 0x8000
 #define PMCR_PMEN_SHIFT 15
 //
 // --- Performance Counter Event Code Definitions ---
 //
 // Interestingly enough, it so happens that the SEGA Dreamcast's CPU seems to
 // contain the same performance counter functionality as SH4 debug adapters for
 // the SH7750R. Awesome!
 //
 //                MODE DEFINITION                  VALUE   MEASUREMENT TYPE & NOTES
 #define PMCR_INIT_NO_MODE                           0x00 // None; Just here to be complete
 #define PMCR_OPERAND_READ_ACCESS_MODE               0x01 // Quantity; With cache
 #define PMCR_OPERAND_WRITE_ACCESS_MODE              0x02 // Quantity; With cache
 #define PMCR_UTLB_MISS_MODE                         0x03 // Quantity
 #define PMCR_OPERAND_CACHE_READ_MISS_MODE           0x04 // Quantity
 #define PMCR_OPERAND_CACHE_WRITE_MISS_MODE          0x05 // Quantity
 #define PMCR_INSTRUCTION_FETCH_MODE                 0x06 // Quantity; With cache
 #define PMCR_INSTRUCTION_TLB_MISS_MODE              0x07 // Quantity
 #define PMCR_INSTRUCTION_CACHE_MISS_MODE            0x08 // Quantity
 #define PMCR_ALL_OPERAND_ACCESS_MODE                0x09 // Quantity
 #define PMCR_ALL_INSTRUCTION_FETCH_MODE             0x0a // Quantity
 #define PMCR_ON_CHIP_RAM_OPERAND_ACCESS_MODE        0x0b // Quantity
 // No 0x0c
 #define PMCR_ON_CHIP_IO_ACCESS_MODE                 0x0d // Quantity
 #define PMCR_OPERAND_ACCESS_MODE                    0x0e // Quantity; With cache, counts both reads and writes
 #define PMCR_OPERAND_CACHE_MISS_MODE                0x0f // Quantity
 #define PMCR_BRANCH_ISSUED_MODE                     0x10 // Quantity; Not the same as branch taken!
 #define PMCR_BRANCH_TAKEN_MODE                      0x11 // Quantity
 #define PMCR_SUBROUTINE_ISSUED_MODE                 0x12 // Quantity; Issued a BSR, BSRF, JSR, JSR/N
 #define PMCR_INSTRUCTION_ISSUED_MODE                0x13 // Quantity
 #define PMCR_PARALLEL_INSTRUCTION_ISSUED_MODE       0x14 // Quantity
 #define PMCR_FPU_INSTRUCTION_ISSUED_MODE            0x15 // Quantity
 #define PMCR_INTERRUPT_COUNTER_MODE                 0x16 // Quantity
 #define PMCR_NMI_COUNTER_MODE                       0x17 // Quantity
 #define PMCR_TRAPA_INSTRUCTION_COUNTER_MODE         0x18 // Quantity
 #define PMCR_UBC_A_MATCH_MODE                       0x19 // Quantity
 #define PMCR_UBC_B_MATCH_MODE                       0x1a // Quantity
 // No 0x1b-0x20
 #define PMCR_INSTRUCTION_CACHE_FILL_MODE            0x21 // Cycles
 #define PMCR_OPERAND_CACHE_FILL_MODE                0x22 // Cycles
 #define PMCR_ELAPSED_TIME_MODE                      0x23 // Cycles; For 200MHz CPU: 5ns per count in 1 cycle = 1 count mode, or around 417.715ps per count (increments by 12) in CPU/bus ratio mode
 #define PMCR_PIPELINE_FREEZE_BY_ICACHE_MISS_MODE    0x24 // Cycles
 #define PMCR_PIPELINE_FREEZE_BY_DCACHE_MISS_MODE    0x25 // Cycles
 // No 0x26
 #define PMCR_PIPELINE_FREEZE_BY_BRANCH_MODE         0x27 // Cycles
 #define PMCR_PIPELINE_FREEZE_BY_CPU_REGISTER_MODE   0x28 // Cycles
 #define PMCR_PIPELINE_FREEZE_BY_FPU_MODE            0x29 // Cycles
 //
 // --- Performance Counter Support Definitions ---
 //
 // Values for the count_type parameter of PMCR_Init(), PMCR_Enable() and
 // PMCR_Restart().
 // PMCR_COUNT_CPU_CYCLES selects the 1 cycle = 1 count mode. This is how the
 // counter can be made to run for 16.3 days.
 #define PMCR_COUNT_CPU_CYCLES 0
 // Likewise this uses the CPU/bus ratio method
 #define PMCR_COUNT_RATIO_CYCLES 1
 // Values for the reset_counter parameter of PMCR_Enable(): they specify
 // whether to reset a counter to 0 or to continue from where it left off
 #define PMCR_CONTINUE_COUNTER 0
 #define PMCR_RESET_COUNTER 1
 //
 // --- Performance Counter Miscellaneous Definitions ---
 //
 // For convenience; assume stock bus clock of 99.75MHz
 // (Bus clock is the external CPU clock, not the peripheral bus clock)
 //
 // CPU clock in Hz (twice the bus clock below)
 #define PMCR_SH4_CPU_FREQUENCY 199500000
 // Whole seconds a 48-bit counter lasts in 1 cycle = 1 count mode:
 // 2^48 / 199500000 ~= 1410902 (roughly 16.33 days)
 #define PMCR_CPU_CYCLES_MAX_SECONDS 1410902
 // Bus clock in Hz
 #define PMCR_SH4_BUS_FREQUENCY 99750000
 // Counts per second in CPU/bus ratio mode. Note that the value is larger than
 // 2^31 - 1, so this literal does not fit in a 32-bit int.
 #define PMCR_SH4_BUS_FREQUENCY_SCALED 2394000000 // 99.75MHz x 24
 // Whole seconds a 48-bit counter lasts in CPU/bus ratio mode:
 // 2^48 / 2394000000 ~= 117575 (roughly 32.7 hours)
 #define PMCR_BUS_RATIO_MAX_SECONDS 117575
 //
 // --- Performance Counter Functions ---
 //
 // See perfctr.c file for more details about each function and some more usage notes.
 //
 // Note: PMCR_Init() and PMCR_Enable() will do nothing if the perf counter is already running!
 //
 // Parameters shared by the functions below:
 //   which      - 1 = counter 1, 2 = counter 2, 3 = both counters
 //   mode       - one of the PMCR_*_MODE event codes defined in this header
 //   count_type - PMCR_COUNT_CPU_CYCLES or PMCR_COUNT_RATIO_CYCLES
 //
 // Clear counter and enable
 void PMCR_Init(int which, unsigned short mode, unsigned char count_type);
 // Enable one or both of these "undocumented" performance counters.
 // reset_counter: PMCR_RESET_COUNTER to start from 0, PMCR_CONTINUE_COUNTER to
 // continue from where the counter left off.
 void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_counter);
 // Stop a running counter, clear it, and re-enable it with a new mode (or the
 // same mode)
 void PMCR_Restart(int which, unsigned short mode, unsigned char count_type);
 // Read a counter
 // out_array is specifically uint32 out_array[2] -- 48-bit value needs a 64-bit storage unit
 void PMCR_Read(int which, volatile unsigned int *out_array);
 // Stop counter(s) (without clearing); the count stays in the data registers
 void PMCR_Stop(int which);
 // Disable counter(s) (without clearing); a disabled counter reads back as 0
 void PMCR_Disable(int which);
 #endif /* __PERFCTR_H__ */
--- a/GL/private.h
+++ b/GL/private.h
@ -6,6 +6,7 @@
 #include "../include/gl.h"
 #include "../containers/aligned_vector.h"
 #include "../containers/named_array.h"
 #include "cygprofile.h"
 extern void* memcpy4 (void *dest, const void *src, size_t count);
@ -249,6 +250,11 @@ typedef struct {
    GLint size;
 } AttribPointer;
 /* Two 3-float vectors per entry; an array of these is passed to
  * _glPerformLighting() as `es`.
  * NOTE(review): presumably the eye-space position (xyz) and normal (n) of a
  * vertex -- confirm against the lighting code. */
 typedef struct {
    float xyz[3];
    float n[3];
 } EyeSpaceData;
 GLboolean _glCheckValidEnum(GLint param, GLint* values, const char* func);
 GLuint* _glGetEnabledAttributes();
@ -280,7 +286,7 @@ GLuint _glGetMipmapLevelCount(TextureObject* obj);
 GLboolean _glIsLightingEnabled();
 GLboolean _glIsLightEnabled(GLubyte light);
 GLboolean _glIsColorMaterialEnabled();
-void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour);
+void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count);
 unsigned char _glIsClippingEnabled();
 void _glEnableClipping(unsigned char v);
--- a/GL/profiler.c
+++ b/GL/profiler.c
@ -6,6 +6,8 @@
 #include "profiler.h"
 #include "../containers/aligned_vector.h"
 #if PROFILING_COMPILED
 #define MAX_PATH 256
 typedef struct {
@ -141,3 +143,4 @@ void profiler_print_stats() {
        fprintf(stderr, "%-60s%-20f%-20f%" PRIu64 "\n", result->name, (double)avg, (double)ms, result->total_calls);
    }
 }
 #endif
--- a/GL/profiler.h
+++ b/GL/profiler.h
@ -7,12 +7,26 @@ typedef struct {
    uint64_t start_time_in_us;
 } Profiler;
 #define PROFILING_COMPILED 0
 #if PROFILING_COMPILED
 Profiler* profiler_push(const char* name);
-void profiler_checkpoint(const char* name);
+void _profiler_checkpoint(const char* name);
-void profiler_pop();
+void _profiler_pop();
-void profiler_print_stats();
+void _profiler_print_stats();
-void profiler_enable();
+void _profiler_enable();
-void profiler_disable();
+void _profiler_disable();
 #else
 /* Profiling is compiled out: every call expands to a no-op expression.
  * The definitions deliberately carry no trailing ';' so that
  * `profiler_pop();` remains exactly one statement and is safe inside an
  * un-braced if/else. The arguments are never evaluated. */
 #define profiler_push(name) ((void)0)
 #define profiler_checkpoint(name) ((void)0)
 #define profiler_pop() ((void)0)
 #define profiler_print_stats() ((void)0)
 #define profiler_enable() ((void)0)
 #define profiler_disable() ((void)0)
 #endif
--- a/GL/sh4_math.h
+++ b/GL/sh4_math.h
--- a/GL/texture.c
+++ b/GL/texture.c
@ -743,11 +743,11 @@ GLint _cleanInternalFormat(GLint internalFormat) {
 typedef void (*TextureConversionFunc)(const GLubyte*, GLubyte*);
-static inline void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) {
    /* Keeps the top 4 bits of each of the four source bytes and packs them into
     * one 16-bit value: source[3] -> bits 15-12, source[0] -> bits 11-8,
     * source[1] -> bits 7-4, source[2] -> bits 3-0. The result is stored
     * through dest as a single GLushort. */
    *((GLushort*) dest) = (source[3] & 0xF0) << 8 | (source[0] & 0xF0) << 4 | (source[1] & 0xF0) | (source[2] & 0xF0) >> 4;
 }
-static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    /* Noop */
    GLubyte* dst = (GLubyte*) dest;
    dst[0] = source[0];
@ -756,11 +756,11 @@ static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    dst[3] = source[3];
 }
-static inline void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) {
    /* Packs three source bytes into one 16-bit value: top 5 bits of source[0]
     * -> bits 15-11, top 6 bits of source[1] -> bits 10-5, top 5 bits of
     * source[2] -> bits 4-0. source[3] is not read. */
    *((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3);
 }
-static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    /* Noop */
    GLubyte* dst = (GLubyte*) dest;
    dst[0] = source[0];
@ -769,24 +769,24 @@ static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    dst[3] = 255;
 }
-static inline void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) {
    /* Packs the three source bytes into one 16-bit value: top 5 bits of
     * source[0] -> bits 15-11, top 6 bits of source[1] -> bits 10-5, top 5
     * bits of source[2] -> bits 4-0. */
    *((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3);
 }
-static inline void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) {
    /* Writes a 16-bit value that carries only the top 5 bits of source[3], in
     * bits 15-11; all lower bits are zero.
     * NOTE(review): if the destination is a 4-bit-alpha format, bit 11 lies
     * outside the alpha field -- confirm the intended target format. */
    *((GLushort*) dest) = ((source[3] & 0b11111000) << 8);
 }
-static inline void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) {
    /* Places the top 5 bits of the single source byte in bits 15-11 of the
     * 16-bit output; the remaining bits are zero. */
    *((GLushort*) dest) = (source[0] & 0b11111000) << 8;
 }
-static inline void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) {
    /* Reads the source as one 16-bit value and rotates it right by one nibble:
     * the lowest 4 bits move to bits 15-12 and the upper 12 bits shift down
     * by 4. */
    GLushort* src = (GLushort*) source;
    *((GLushort*) dest) = ((*src & 0x000F) << 12) | *src >> 4;
 }
-static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    GLushort src = *((GLushort*) source);
    GLubyte* dst = (GLubyte*) dest;
@ -796,7 +796,7 @@ static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    dst[3] = ((src & 0x000F)) * 2;
 }
-static inline void _i8_to_i8(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _i8_to_i8(const GLubyte* source, GLubyte* dest) {
    /* For indexes */
    GLubyte* dst = (GLubyte*) dest;
    *dst = *source;
--- a/containers/aligned_vector.c
+++ b/containers/aligned_vector.c
@ -3,6 +3,8 @@
 #include <math.h>
 #include <assert.h>
 #include <stdio.h>
 #include <dc/sq.h>
 #include <kos/string.h>
 #if defined(__APPLE__) || defined(__WIN32__)
 /* Linux + Kos define this, OSX does not, so just use malloc there */
@ -25,7 +27,7 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size) {
 }
-static inline unsigned int round_to_chunk_size(unsigned int val) {
+static INLINE_DEBUG unsigned int round_to_chunk_size(unsigned int val) {
    const unsigned int n = val;
    const unsigned int m = ALIGNED_VECTOR_CHUNK_SIZE;
@ -107,33 +109,12 @@ void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_co
    }
 }
 /* Returns the address of element `index`, i.e. &data[index * element_size].
  * The index is checked against size with assert(). */
 void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
    /* Disabled variant of the bounds check that would also report capacity,
     * size and the requested index. */
    #if 0
    if(index >= vector->size){
        char msg[60];
        sprintf(msg, "Vector OOB: %d %d wanted %d\n", vector->capacity, vector->size, index);
        //aligned_vector_resize(vector, index);
        assert_msg(index < vector->size, msg);
    }
    #endif
    assert(index < vector->size);
    return &vector->data[index * vector->element_size];
 }
 /* Returns a pointer to the final element, i.e. the one at index size - 1. */
 void* aligned_vector_back(AlignedVector* vector) {
    const unsigned int last_index = vector->size - 1;
    return aligned_vector_at(vector, last_index);
 }
 /* Resizes the vector to size + additional_count elements and returns
  * aligned_vector_at() for the previous size, i.e. the first added slot. */
 void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) {
    const unsigned int first_new = vector->size;
    const unsigned int grown_size = first_new + additional_count;
    aligned_vector_resize(vector, grown_size);
    return aligned_vector_at(vector, first_new);
 }
 /* Empties the vector by resetting size to 0. Nothing else is touched here, so
  * data and capacity keep their current values. */
 void aligned_vector_clear(AlignedVector* vector) {
    vector->size = 0;
 }
 void aligned_vector_shrink_to_fit(AlignedVector* vector) {
    if(vector->size == 0) {
        free(vector->data);
--- a/containers/aligned_vector.h
+++ b/containers/aligned_vector.h
@ -5,6 +5,8 @@
 extern "C" {
 #endif
 #include "../GL/cygprofile.h"
 typedef struct {
    unsigned int size;
    unsigned int capacity;
@ -18,12 +20,27 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size);
 void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
 void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count);
 void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
-void* aligned_vector_at(const AlignedVector* vector, const unsigned int index);
+INLINE_ALWAYS void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
    /* Returns the address of element `index`, i.e.
     * &data[index * element_size]. Both bounds checks below sit inside
     * `#if 0`, so they are compiled out and an out-of-range index is not
     * detected by this function. */
    #if 0
    if(index >= vector->size){
        char msg[60];
        sprintf(msg, "Vector OOB: %d %d wanted %d\n", vector->capacity, vector->size, index);
        //aligned_vector_resize(vector, index);
        assert_msg(index < vector->size, msg);
    }
    assert(index < vector->size); /* Check here */
    #endif
    return &vector->data[index * vector->element_size];
 }
 void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count);
-void aligned_vector_clear(AlignedVector* vector);
+INLINE_ALWAYS void aligned_vector_clear(AlignedVector* vector){
    /* Empties the vector by resetting size; data and capacity are not
     * modified here. */
    vector->size = 0;
 }
 void aligned_vector_shrink_to_fit(AlignedVector* vector);
 void aligned_vector_cleanup(AlignedVector* vector);
-void* aligned_vector_back(AlignedVector* vector);
+INLINE_ALWAYS void* aligned_vector_back(AlignedVector* vector){
    /* Pointer to the last element (index size - 1). size is unsigned, so for
     * an empty vector the index wraps around instead of going negative. */
    return aligned_vector_at(vector, vector->size - 1);
 }
 #ifdef __cplusplus
 }
--- a/containers/named_array.c
+++ b/containers/named_array.c
@ -44,13 +44,6 @@ void named_array_init(NamedArray* array, unsigned int element_size, unsigned int
    memset(array->elements, 0, element_size * max_elements);
 }
 /* Reports whether `id` is flagged in the used_markers bitmap, which stores
  * eight ids per entry (entry id / 8, bit id % 8). Returns 1 when the bit is
  * set and 0 otherwise. */
 char named_array_used(NamedArray* array, unsigned int id) {
    const unsigned char mask = (unsigned char) (1 << (id % 8));
    const unsigned char hit = array->used_markers[id / 8] & mask;
    return hit != 0;
 }
 void* named_array_alloc(NamedArray* array, unsigned int* new_id) {
    unsigned int i = 0, j = 0;
--- a/containers/named_array.h
+++ b/containers/named_array.h
@ -5,6 +5,8 @@
 extern "C" {
 #endif
 #include "../GL/cygprofile.h"
 typedef struct {
    unsigned int element_size;
    unsigned int max_element_count;
@ -14,7 +16,13 @@ typedef struct {
 } NamedArray;
 void named_array_init(NamedArray* array, unsigned int element_size, unsigned int max_elements);
-char named_array_used(NamedArray* array, unsigned int id);
+INLINE_ALWAYS char named_array_used(NamedArray* array, unsigned int id) {
    /* used_markers is read as a bitmap with eight ids per entry: i selects
     * the entry and j the bit inside it. `id` is not range-checked here. */
    const unsigned int i = id / 8;
    const unsigned int j = id % 8;
    unsigned char v = array->used_markers[i] & (unsigned char) (1 << j);
    /* Normalise the masked bit to 0 or 1. */
    return !!(v);
 }
 void* named_array_alloc(NamedArray* array, unsigned int* new_id);
 void* named_array_reserve(NamedArray* array, unsigned int id);