feat: implement proper changes from profiling

- math - inlining
2020-02-18 11:48:37 -05:00 · 2020-02-18 11:48:37 -05:00 · 3a4f09bef2
commit 3a4f09bef2
parent a2dcfcf997
19 changed files with 2572 additions and 227 deletions
--- a/GL/config.h
+++ b/GL/config.h
@ -1,9 +1,8 @@
+#pragma once
 #ifndef CONFIG_H
 #define CONFIG_H

-
 /* This figure is derived from the needs of Quake 1 */
 #define MAX_TEXTURE_COUNT 1088

-
 #endif // CONFIG_H
--- a/GL/cygprofile.c
+++ b/GL/cygprofile.c
@ -0,0 +1,227 @@
+/* Based on the idea from Erich Styger */
+/* profiled instrument guided profiling for gldc on hardware */
+
+#include "cygprofile.h"
+#include <kos.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include "perfctr.h"
+#include "private.h"
+
+#if CYG_FUNC_TRACE_ENABLED
+
+#define _strcat(x, y, z) strncat(x, z, y)
+
+#ifndef __PE_Error_H
+#define __PE_Error_H
+
+#define ERR_OK 0       /* OK */
+#define ERR_SPEED 1    /* This device does not work in the active speed mode. */
+#define ERR_RANGE 2    /* Parameter out of range. */
+#define ERR_VALUE 3    /* Parameter of incorrect value. */
+#define ERR_OVERFLOW 4 /* Timer overflow. */
+#define ERR_MATH 5     /* Overflow during evaluation. */
+#define ERR_ENABLED 6  /* Device is enabled. */
+#define ERR_DISABLED 7 /* Device is disabled. */
+#define ERR_BUSY 8     /* Device is busy. */
+#define ERR_NOTAVAIL 9 /* Requested value or method not available. */
+#define ERR_RXEMPTY 10 /* No data in receiver. */
+#define ERR_TXFULL 11  /* Transmitter is full. */
+#define ERR_BUSOFF 12  /* Bus not available. */
+#define ERR_OVERRUN 13 /* Overrun error is detected. */
+#define ERR_FRAMING 14 /* Framing error is detected. */
+#define ERR_PARITY 15  /* Parity error is detected. */
+#define ERR_NOISE 16   /* Noise error is detected. */
+#define ERR_IDLE 17    /* Idle error is detected. */
+#define ERR_FAULT 18   /* Fault error is detected. */
+#define ERR_BREAK 19   /* Break char is received during communication. */
+#define ERR_CRC 20     /* CRC error is detected. */
+#define ERR_ARBITR 21  /* A node lost arbitration. This error occurs if two nodes start transmission at the same time. */
+#define ERR_PROTECT 22 /* Protection error is detected. */
+
+#endif /* __PE_Error_H */
+
+#define CYG_RNG_BUF_NOF_ELEMS (8096 * 4)
+/*!< Number of elements in the ring buffer which is used to record function calls */
+#define CYG_THUMB_MASK 0xFFFFFFFF
+/*!< address mask applied when printing; 0xFFFFFFFF keeps every bit, so no LSB (thumb) bit is actually stripped */
+
+/* Hashing function for two uint32_ts */
+#define HASH_PAIR(x, y) (((x)*0x1f1f1f1f) ^ (y))
+
+static bool CYG_Enabled = false; /*!< flag which enables/disables tracing */
+
+/*!
+ * Element in ring buffer to store the trace information.
+ */
+typedef struct
+{
+  //bool isEnter;    /*!< TRUE for __cyg_profile_func_enter(), FALSE for __cyg_profile_func_exit() */
+  void *this_fn;    /*!< address (with thumb bit) of the (caller) function */
+  void *call_site;  /*!< return address to the function which called this_fn */
+  uint32_t counter; /* also contains isEnter as highest bit */
+} CYG_RNG_ElementType;
+
+typedef uint32_t CYG_RNG_BufSizeType; /*!< index type for ring buffer */
+
+static CYG_RNG_ElementType CYG_RNG_buffer[CYG_RNG_BUF_NOF_ELEMS]; /*!< ring buffer */
+//static CYG_RNG_BufSizeType CYG_RNG_inIdx;                         /*!< input index */
+static CYG_RNG_BufSizeType CYG_RNG_outIdx; /*!< output index */
+static CYG_RNG_BufSizeType CYG_RNG_inSize; /*!< size/number of elements in buffer */
+
+/*!
+ * \brief Stores a trace element into the ring buffer.
+ * \param elem Trace element to put into the buffer.
+ * \return Error code, ERR_OK if everything is ok.
+ */
+__attribute__((no_instrument_function)) static uint8_t CYG_RNG_Put(CYG_RNG_ElementType *elem) {
+  /*
+   * The buffer is used as an open-addressed hash table keyed by the
+   * (call_site, this_fn) pair. A slot whose counter is 0 is free.
+   *
+   * Returns ERR_OK when the pair was inserted or its counter was bumped,
+   * ERR_TXFULL when every slot is taken by a different pair.
+   */
+  const uint32_t home = HASH_PAIR((uint32_t)(uintptr_t)elem->call_site, (uint32_t)(uintptr_t)elem->this_fn) % CYG_RNG_BUF_NOF_ELEMS;
+
+  /* Linear probing, bounded to one full sweep of the table. Comparing the key
+   * before counting keeps two different call edges that hash to the same home
+   * slot from being merged into one counter. */
+  for (uint32_t probe = 0; probe < CYG_RNG_BUF_NOF_ELEMS; probe++) {
+    CYG_RNG_ElementType *slot = &CYG_RNG_buffer[(home + probe) % CYG_RNG_BUF_NOF_ELEMS];
+
+    if (slot->counter == 0) {
+      /* Free slot: first sighting of this pair. */
+      *slot = *elem;
+      return ERR_OK;
+    }
+    if (slot->this_fn == elem->this_fn && slot->call_site == elem->call_site) {
+      /* Same pair seen again: just count it. */
+      slot->counter++;
+      return ERR_OK;
+    }
+  }
+  return ERR_TXFULL;
+}
+
+/*!
+ * \brief Gets a trace element from the ring buffer.
+ * \param elem Pointer where to store the trace element.
+ * \return Error code, ERR_OK if everything is ok.
+ */
+__attribute__((no_instrument_function)) static uint8_t CYG_RNG_Get(CYG_RNG_ElementType *elemP) {
+  /* Nothing left to hand out. */
+  if (CYG_RNG_inSize == 0) {
+    return ERR_RXEMPTY;
+  }
+
+  /* Copy out the element under the read cursor, then advance the cursor,
+   * wrapping back to 0 at the end of the buffer. */
+  const CYG_RNG_BufSizeType slot = CYG_RNG_outIdx;
+  *elemP = CYG_RNG_buffer[slot];
+  CYG_RNG_inSize -= 1;
+  CYG_RNG_outIdx = (slot + 1 == CYG_RNG_BUF_NOF_ELEMS) ? 0 : slot + 1;
+  return ERR_OK;
+}
+
+static uint32_t currentTime[2];
+static uint32_t lastTime;
+
+/*!
+ * \brief Stores a trace element into the ring buffer.
+ * \param this_fn Address of the caller function.
+ * \param call_site Return address to the function which called this_fn
+ * \return Error code, ERR_OK if everything is ok.
+ */
+__attribute__((no_instrument_function)) static void CYG_Store(void *this_fn, void *call_site) {
+  /* Keep the previous low word, then refresh currentTime[] from performance
+   * counter 1. The timing values are not stored in the record. */
+  lastTime = currentTime[0];
+  PMCR_Read(1, (unsigned int *)currentTime);
+
+  /* Every record is submitted with a count of one. */
+  CYG_RNG_ElementType record;
+  record.this_fn = this_fn;
+  record.call_site = call_site;
+  record.counter = 1;
+  CYG_RNG_Put(&record);
+}
+
+/*!
+ * \brief Function which is called upon function enter. The function call is inserted by the compiler.
+ * \param this_fn Address of the caller function.
+ * \param call_site Return address to the function which called this_fn
+ */
+__attribute__((no_instrument_function)) void __cyg_profile_func_enter(void *this_fn, void *call_site) {
+  if (!CYG_Enabled) {
+    return;
+  }
+  /* NOTE(review): the arguments are handed over in the opposite order to
+   * CYG_Store(this_fn, call_site), so the record's this_fn field ends up
+   * holding the call site and vice versa. Confirm what the trace consumer
+   * expects before changing the order. */
+  CYG_Store(call_site, this_fn);
+}
+
+/*!
+ * \brief Function which is called upon function exit. The function call is inserted by the compiler.
+ * \param this_fn Address of the caller function.
+ * \param call_site Return address to the function which called this_fn
+ */
+__attribute__((no_instrument_function)) void __cyg_profile_func_exit(__attribute__((unused)) void *this_fn, __attribute__((unused)) void *call_site) {
+  /* Empty body: nothing is recorded on function exit. */
+}
+
+/*!
+ * \brief Dumps the trace to the console.
+ */
+__attribute__((no_instrument_function)) void CYG_PrintCallTrace(void) {
+  CYG_RNG_BufSizeType i;
+  CYG_RNG_ElementType elem;
+
+  /* Stop recording so the table does not change while it is being dumped.
+   * Tracing is not switched back on here. */
+  CYG_Enabled = false;
+
+  /* First line: offset of _etext from BASE_ADDRESS. */
+  printf("0x%08x\n", ((unsigned int)&_etext) - BASE_ADDRESS);
+
+  CYG_RNG_outIdx = 0;
+  for (i = 0; i < CYG_RNG_BUF_NOF_ELEMS; i++) {
+    /* Read failures and unused slots (NULL call_site) are skipped silently. */
+    if (CYG_RNG_Get(&elem) != ERR_OK || elem.call_site == NULL) {
+      continue;
+    }
+    /* Print with a literal format string instead of passing a formatted
+     * buffer as the format argument of printf(). */
+    printf("{ 0x%" PRIXPTR " 0x%" PRIXPTR " %u\r\n", (uintptr_t)(elem.this_fn) & CYG_THUMB_MASK, (uintptr_t)(elem.call_site) & CYG_THUMB_MASK, (unsigned int)elem.counter);
+  }
+}
+
+__attribute__((no_instrument_function)) void CYG_Init(void) {
+  /* Already tracing: leave the collected data alone. */
+  if (CYG_Enabled) {
+    return;
+  }
+
+  /* Bring every piece of state to its starting value first... */
+  memset(CYG_RNG_buffer, 0, sizeof(CYG_RNG_buffer));
+  CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS;
+  CYG_RNG_outIdx = 0;
+  currentTime[0] = currentTime[1] = 0;
+  lastTime = 0;
+  PMCR_Init(1, PMCR_ELAPSED_TIME_MODE, PMCR_COUNT_CPU_CYCLES);
+
+  /* ...and only then open the gate. Enabling earlier would let an
+   * instrumented call that arrives during set-up be recorded and then wiped
+   * by the memset, or be timed against a counter that is not running yet. */
+  CYG_Enabled = true;
+}
+
+__attribute__((no_instrument_function)) void CYG_Deinit(void) {
+  /* Rewind the read cursor and mark the whole buffer as readable again. */
+  CYG_RNG_outIdx = 0;
+  CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS;
+
+  /* Stop recording, then discard everything that was collected. */
+  CYG_Enabled = false;
+  memset(CYG_RNG_buffer, 0, sizeof CYG_RNG_buffer);
+}
+#else
+
+/* Tracing is compiled out: the public entry points are no-ops. */
+void CYG_PrintCallTrace(void) {
+}
+
+void CYG_Init(void) {
+}
+
+void CYG_Deinit(void) {
+}
+
+#endif
--- a/GL/cygprofile.h
+++ b/GL/cygprofile.h
@ -0,0 +1,33 @@
+#pragma once
+#ifndef CYGPROFILE_H_
+#define CYGPROFILE_H_
+
+/* Based on the idea from Erich Styger */
+/* profiled instrument guided profiling for gldc on hardware */
+
+#define NO_INSTRUMENT inline __attribute__((no_instrument_function))
+#define INLINE_DEBUG NO_INSTRUMENT __attribute__((always_inline))
+#define INLINE_ALWAYS static NO_INSTRUMENT __attribute__((always_inline))
+
+extern char _etext;
+#define BASE_ADDRESS 0x8c010000
+
+#define CYG_FUNC_TRACE_ENABLED (1)
+/*!< 1: Trace enabled, 0: trace disabled */
+
+/*!
+ * \brief Print the call trace to the terminal.
+ */
+void CYG_PrintCallTrace(void);
+
+/*!
+ * \brief Driver Initialization.
+ */
+void CYG_Init(void);
+
+/*!
+ * \brief Driver De-Initialization.
+ */
+void CYG_Deinit(void);
+
+#endif /* CYGPROFILE_H_ */
--- a/GL/draw.c
+++ b/GL/draw.c
@ -56,7 +56,7 @@ void _glInitAttributePointers() {
    NORMAL_POINTER.size = 3;
 }

-static inline GLuint byte_size(GLenum type) {
+static INLINE_DEBUG GLuint byte_size(GLenum type) {
    switch(type) {
    case GL_BYTE: return sizeof(GLbyte);
    case GL_UNSIGNED_BYTE: return sizeof(GLubyte);
@ -513,7 +513,7 @@ PVRHeader* _glSubmissionTargetHeader(SubmissionTarget* target) {
    return aligned_vector_at(&target->output->vector, target->header_offset);
 }

-Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
+INLINE_DEBUG Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
    assert(target->start_offset < target->output->vector.size);
    return aligned_vector_at(&target->output->vector, target->start_offset);
 }
@ -1006,6 +1006,7 @@ static void mat_transform_normal3(const float* xyz, const float* xyzOut, const u

 static void light(SubmissionTarget* target) {

+#if 0
    typedef struct {
        float xyz[3];
        float n[3];
@ -1057,6 +1058,35 @@ static void light(SubmissionTarget* target) {
        vertex->bgra[G8IDX] = (GLubyte) (255.0f * fminf(total[1], 1.0f));
        vertex->bgra[B8IDX] = (GLubyte) (255.0f * fminf(total[2], 1.0f));
    }
+#endif
+
+    if(!_glIsLightingEnabled()) {
+        return;
+    }
+
+    /* Scratch buffer of eye-space positions and normals. It is allocated on
+     * the first call and then reused (and resized) on every later call. */
+    static AlignedVector* eye_space_data = NULL;
+
+    if(!eye_space_data) {
+        AlignedVector* fresh = malloc(sizeof *fresh);
+        if(!fresh) {
+            /* Out of memory: leave the vertex colours untouched instead of
+             * initialising through a NULL pointer. The static stays NULL, so
+             * the next call retries the allocation. */
+            return;
+        }
+        aligned_vector_init(fresh, sizeof(EyeSpaceData));
+        eye_space_data = fresh;
+    }
+
+    aligned_vector_resize(eye_space_data, target->count);
+
+    /* Perform lighting calculations and manipulate the colour */
+    Vertex* vertex = _glSubmissionTargetStart(target);
+    VertexExtra* extra = aligned_vector_at(target->extras, 0);
+    EyeSpaceData* eye_space = (EyeSpaceData*) eye_space_data->data;
+
+    /* Positions are transformed with the modelview matrix... */
+    _glMatrixLoadModelView();
+    mat_transform3(vertex->xyz, eye_space->xyz, target->count, sizeof(Vertex), sizeof(EyeSpaceData));
+
+    /* ...and normals with the normal matrix, into the same scratch records. */
+    _glMatrixLoadNormal();
+    mat_transform_normal3(extra->nxyz, eye_space->n, target->count, sizeof(VertexExtra), sizeof(EyeSpaceData));
+
+    EyeSpaceData* ES = aligned_vector_at(eye_space_data, 0);
+    _glPerformLighting(vertex, ES, target->count);
+
 }

 static void divide(SubmissionTarget* target) {
--- a/GL/framebuffer.c
+++ b/GL/framebuffer.c
@ -1,5 +1,6 @@
 #include <stdio.h>
 #include "private.h"
+#include "config.h"
 #include "../include/glkos.h"
 #include "../include/glext.h"

@ -94,62 +95,62 @@ void APIENTRY glFramebufferTexture2DEXT(GLenum target, GLenum attachment, GLenum
    ACTIVE_FRAMEBUFFER->texture_id = texture;
 }

-static inline GLuint A1555(GLuint v) {
+static INLINE_DEBUG GLuint A1555(GLuint v) {
    const GLuint MASK = (1 << 15);
    return (v & MASK) >> 15;
 }

-static inline GLuint R1555(GLuint v) {
+static INLINE_DEBUG GLuint R1555(GLuint v) {
    const GLuint MASK = (31 << 10);
    return (v & MASK) >> 10;
 }

-static inline GLuint G1555(GLuint v) {
+static INLINE_DEBUG GLuint G1555(GLuint v) {
    const GLuint MASK = (31 << 5);
    return (v & MASK) >> 5;
 }

-static inline GLuint B1555(GLuint v) {
+static INLINE_DEBUG GLuint B1555(GLuint v) {
    const GLuint MASK = (31 << 0);
    return (v & MASK) >> 0;
 }

-static inline GLuint A4444(GLuint v) {
+static INLINE_DEBUG GLuint A4444(GLuint v) {
    const GLuint MASK = (0xF << 12);
    return (v & MASK) >> 12;
 }

-static inline GLuint R4444(GLuint v) {
+static INLINE_DEBUG GLuint R4444(GLuint v) {
    const GLuint MASK = (0xF << 8);
    return (v & MASK) >> 8;
 }

-static inline GLuint G4444(GLuint v) {
+static INLINE_DEBUG GLuint G4444(GLuint v) {
    const GLuint MASK = (0xF << 4);
    return (v & MASK) >> 4;
 }

-static inline GLuint B4444(GLuint v) {
+static INLINE_DEBUG GLuint B4444(GLuint v) {
    const GLuint MASK = (0xF << 0);
    return (v & MASK) >> 0;
 }

-static inline GLuint R565(GLuint v) {
+static INLINE_DEBUG GLuint R565(GLuint v) {
    const GLuint MASK = (31 << 11);
    return (v & MASK) >> 11;
 }

-static inline GLuint G565(GLuint v) {
+static INLINE_DEBUG GLuint G565(GLuint v) {
    const GLuint MASK = (63 << 5);
    return (v & MASK) >> 5;
 }

-static inline GLuint B565(GLuint v) {
+static INLINE_DEBUG GLuint B565(GLuint v) {
    const GLuint MASK = (31 << 0);
    return (v & MASK) >> 0;
 }

-GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) {
+static NO_INSTRUMENT GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) {
    GLushort* s1 = ((GLushort*) src);
    GLushort* s2 = ((GLushort*) src) + 1;
    GLushort* s3 = ((GLushort*) src) + srcWidth;
--- a/GL/gldc.c
+++ b/GL/gldc.c
@ -19,3 +19,7 @@
 #include "matrix.c"
 #include "state.c"
 #include "texture.c"
+
+#include "../containers/stack.c"
+#include "../containers/aligned_vector.c"
+#include "../containers/named_array.c"
--- a/GL/lighting.c
+++ b/GL/lighting.c
@ -281,98 +281,143 @@ static inline float FPOW(float b, float p) {
    return FEXP(FLOG(b) * p);
 }

-void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) __attribute__((optimize("fast-math")));
-void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) {
-    LightSource* l = &LIGHTS[light];
-
-    struct vec3f L = {
-        l->position[0],
-        l->position[1],
-        l->position[2]
-    };
-
-    if(!l->is_directional) {
-        L.x -= pos[0];
-        L.y -= pos[1];
-        L.z -= pos[2];
-    }
-
-    struct vec3f N = {
-        normal[0],
-        normal[1],
-        normal[2]
-    };
-
-    struct vec3f V = {
-        pos[0],
-        pos[1],
-        pos[2]
-    };
-
-    GLfloat d;
-    vec3f_length(L.x, L.y, L.z, d);
-
-    GLfloat oneOverL = 1.0f / d;
-
-    L.x *= oneOverL;
-    L.y *= oneOverL;
-    L.z *= oneOverL;
-
-    vec3f_normalize(V.x, V.y, V.z);
-
-    GLfloat NdotL, VdotN;
-    vec3f_dot(N.x, N.y, N.z, L.x, L.y, L.z, NdotL);
-    vec3f_dot(V.x, V.y, V.z, N.x, N.y, N.z, VdotN);
-
-    GLfloat VdotR = VdotN - NdotL;
-    GLfloat specularPower = FPOW(VdotR > 0 ? VdotR : 0, MATERIAL.exponent);
-
-    GLboolean colorMaterial = _glIsColorMaterialEnabled();
-
-    GLfloat mD [] = {
-        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.diffuse[0],
-        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.diffuse[1],
-        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.diffuse[2],
-        (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.diffuse[3]
-    };
-
-    GLfloat mA [] = {
-        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.ambient[0],
-        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.ambient[1],
-        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.ambient[2],
-        (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.ambient[3]
-    };
-
-    GLfloat mS [] = {
-        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.specular[0],
-        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.specular[1],
-        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.specular[2],
-        (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.specular[3]
-    };
-
-    colour[0] = l->ambient[0] * mA[0];
-    colour[1] = l->ambient[1] * mA[1];
-    colour[2] = l->ambient[2] * mA[2];
-    colour[3] = mD[3];
-
-    if(NdotL >= 0) {
-        colour[0] += (l->diffuse[0] * mD[0] * NdotL + l->specular[0] * mS[0] * specularPower);
-        colour[1] += (l->diffuse[1] * mD[1] * NdotL + l->specular[1] * mS[1] * specularPower);
-        colour[2] += (l->diffuse[2] * mD[2] * NdotL + l->specular[2] * mS[2] * specularPower);
-    }
-
-    if(!l->is_directional) {
-        GLfloat att = (
-            1.0f / (l->constant_attenuation + (l->linear_attenuation * d) + (l->quadratic_attenuation * d * d))
-        );
-
-        colour[0] *= att;
-        colour[1] *= att;
-        colour[2] *= att;
-    }
-
-    if(colour[0] > 1.0f) colour[0] = 1.0f;
-    if(colour[1] > 1.0f) colour[1] = 1.0f;
-    if(colour[2] > 1.0f) colour[2] = 1.0f;
-    if(colour[3] > 1.0f) colour[3] = 1.0f;
+/*
+ * LIGHT_COMPONENT(C): adds one light's contribution for colour channel C to
+ * final[C].
+ *
+ * The macro reads these names from the scope it is expanded in: MA, MD, MS
+ * (material ambient/diffuse/specular arrays), light, LdotN, NdotH, att, spot
+ * and final. It also reads the global MATERIAL.exponent.
+ *
+ * component = ambient + LdotN * diffuse + FPOW(fi * NdotH, exponent) * specular,
+ * then scaled by att and spot. fi is 0 when LdotN is exactly 0 and 1
+ * otherwise, so the specular base is forced to 0 in that case.
+ *
+ * The expansion is a bare { } block, not do { } while (0).
+ */
+#define LIGHT_COMPONENT(C) { \
+    const GLfloat* acm = &MA[C]; \
+    const GLfloat* dcm = &MD[C]; \
+    const GLfloat* scm = &MS[C]; \
+    const GLfloat* scli = &light->specular[C]; \
+    const GLfloat* dcli = &light->diffuse[C]; \
+    const GLfloat* acli = &light->ambient[C]; \
+    const GLfloat* srm = &MATERIAL.exponent; \
+    const GLfloat fi = (LdotN == 0) ? 0 : 1; \
+    GLfloat component = (*acm * *acli); \
+    component += (LdotN * *dcm * *dcli); \
+    component += (FPOW((fi * NdotH), *srm) * *scm * *scli); \
+    component *= att; \
+    component *= spot; \
+    final[C] += component; \
 }
+
+static inline float vec3_dot_limited(
+        const float* x1, const float* y1, const float* z1,
+        const float* x2, const float* y2, const float* z2) {
+
+    /* Dot product of (x1, y1, z1) and (x2, y2, z2); a negative result is
+     * replaced by 0. */
+    float result;
+    vec3f_dot(*x1, *y1, *z1, *x2, *y2, *z2, result);
+    if(result < 0) {
+        return 0;
+    }
+    return result;
+}
+
+/*
+ * Lights `count` vertices in place.
+ *
+ * vertices: vertices whose bgra colour is read (as the colour-material input
+ *           when GL_COLOR_MATERIAL is enabled) and then overwritten with the
+ *           lit colour.
+ * es:       one EyeSpaceData per vertex, holding the position (xyz) and
+ *           normal (n) used for the lighting maths.
+ * count:    number of entries in both arrays.
+ *
+ * For every vertex the colour starts at scene ambient * material ambient +
+ * material emissive, and each enabled light then adds its ambient, diffuse
+ * and specular terms through LIGHT_COMPONENT. Alpha is the material (or
+ * colour-material) diffuse alpha. Each channel is scaled by 255 and capped at
+ * 255 before being stored.
+ */
+void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count) {
+    int8_t i;
+    int32_t j;
+
+    const LightSource* light = NULL;
+
+    const GLboolean colorMaterial = _glIsColorMaterialEnabled();
+    const GLboolean isDiffuseCM = isDiffuseColorMaterial();
+    const GLboolean isAmbientCM = isAmbientColorMaterial();
+    const GLboolean isSpecularCM = isSpecularColorMaterial();
+
+    /* Per-vertex colour converted to floats; static, so this function is not
+     * reentrant. */
+    static GLfloat CM[4];
+
+     /* So the DC has 16 floating point registers, that means
+     * we need to limit the number of floats as much as possible
+     * to give the compiler a good enough chance to do the right
+     * thing */
+
+    Vertex* vertex = vertices;
+    const EyeSpaceData* data = es;
+
+    static const float ONE_OVER_255 = 1.0f / 255.0f;
+
+    for(j = 0; j < count; ++j, ++vertex, ++data) {
+        /* When GL_COLOR_MATERIAL is on, we need to pull out
+         * the passed in diffuse and use it */
+        const GLfloat* MD = MATERIAL.diffuse;
+        const GLfloat* MA = MATERIAL.ambient;
+        const GLfloat* MS = MATERIAL.specular;
+
+        if(colorMaterial) {
+            CM[0] = ((GLfloat) vertex->bgra[R8IDX]) * ONE_OVER_255;
+            CM[1] = ((GLfloat) vertex->bgra[G8IDX]) * ONE_OVER_255;
+            CM[2] = ((GLfloat) vertex->bgra[B8IDX]) * ONE_OVER_255;
+            CM[3] = ((GLfloat) vertex->bgra[A8IDX]) * ONE_OVER_255;
+
+            MD = (isDiffuseCM) ? CM : MATERIAL.diffuse;
+            MA = (isAmbientCM) ? CM : MATERIAL.ambient;
+            MS = (isSpecularCM) ? CM : MATERIAL.specular;
+        }
+
+        float final[4];
+
+        /* Initial, non-light related values */
+        final[0] = (SCENE_AMBIENT[0] * MA[0]) + MATERIAL.emissive[0];
+        final[1] = (SCENE_AMBIENT[1] * MA[1]) + MATERIAL.emissive[1];
+        final[2] = (SCENE_AMBIENT[2] * MA[2]) + MATERIAL.emissive[2];
+        final[3] = MD[3];
+
+        /* V: unit vector from the vertex towards the eye-space origin */
+        float Vx, Vy, Vz;
+        Vx = -data->xyz[0];
+        Vy = -data->xyz[1];
+        Vz = -data->xyz[2];
+        vec3f_normalize(Vx, Vy, Vz);
+
+        for(i = 0; i < MAX_LIGHTS; ++i) {
+            if(!_glIsLightEnabled(i)) continue;
+
+            /* Calc light specific parameters */
+            light = &LIGHTS[i];
+
+            float Lx, Ly, Lz, D;
+            float Hx, Hy, Hz;
+            const float* Nx = &data->n[0];
+            const float* Ny = &data->n[1];
+            const float* Nz = &data->n[2];
+
+            /* A light whose position has w == 0 is directional: its xyz is
+             * already the direction towards the light, so the vertex
+             * position must not be subtracted from it. Only positional
+             * lights use the vertex-to-light vector. */
+            const GLboolean isDirectional = (light->position[3] == 0.0f);
+
+            if(isDirectional) {
+                Lx = light->position[0];
+                Ly = light->position[1];
+                Lz = light->position[2];
+            } else {
+                Lx = light->position[0] - data->xyz[0];
+                Ly = light->position[1] - data->xyz[1];
+                Lz = light->position[2] - data->xyz[2];
+            }
+            vec3f_length(Lx, Ly, Lz, D);
+
+            {
+                /* Normalize L - scoping ensures Llen is temporary */
+                const float Llen = 1.0f / D;
+                Lx *= Llen;
+                Ly *= Llen;
+                Lz *= Llen;
+            }
+
+            /* H: normalized half vector between L and V */
+            Hx = (Lx + Vx);
+            Hy = (Ly + Vy);
+            Hz = (Lz + Vz);
+            vec3f_normalize(Hx, Hy, Hz);
+
+            const float LdotN = vec3_dot_limited(
+                &Lx, &Ly, &Lz,
+                Nx, Ny, Nz
+            );
+
+            const float NdotH = vec3_dot_limited(
+                Nx, Ny, Nz,
+                &Hx, &Hy, &Hz
+            );
+
+            /* Directional lights are not attenuated; positional lights use
+             * the constant/linear/quadratic falloff over the distance D. */
+            const float att = isDirectional ? 1.0f :
+                1.0f / (light->constant_attenuation + (light->linear_attenuation * D) + (light->quadratic_attenuation * D * D));
+
+            /* No spotlight term is computed; the factor is fixed at 1. */
+            const float spot = 1.0f;
+
+            LIGHT_COMPONENT(0);
+            LIGHT_COMPONENT(1);
+            LIGHT_COMPONENT(2);
+        }
+
+        vertex->bgra[R8IDX] = (GLubyte)(fminf(final[0] * 255.0f, 255.0f));
+        vertex->bgra[G8IDX] = (GLubyte)(fminf(final[1] * 255.0f, 255.0f));
+        vertex->bgra[B8IDX] = (GLubyte)(fminf(final[2] * 255.0f, 255.0f));
+        vertex->bgra[A8IDX] = (GLubyte)(fminf(final[3] * 255.0f, 255.0f));
+    }
+}
+
--- a/GL/matrix.c
+++ b/GL/matrix.c
@ -476,84 +476,57 @@ void APIENTRY glDepthRange(GLclampf n, GLclampf f) {
    DEPTH_RANGE_MULTIPLIER_H = (n + f) / 2.0f;
 }

+#include "sh4_math.h"
+
 /* Vector Cross Product - Used by glhLookAtf2 */
-static inline void vec3f_cross(const GLfloat* v1, const GLfloat* v2, GLfloat* result) {
-    result[0] = v1[1] * v2[2] - v1[2] * v2[1];
-    result[1] = v1[2] * v2[0] - v1[0] * v2[2];
-    result[2] = v1[0] * v2[1] - v1[1] * v2[0];
+/* Writes the cross product v1 x v2 into result. v1 and v2 are only read.
+ * result must not alias v1 or v2: result[0] is stored before v1[0] and v2[0]
+ * are read for the remaining components. */
+static inline void vec3f_cross(const GLfloat* v1, const GLfloat* v2, GLfloat* result) {
+    result[0] = (v1[1] * v2[2]) - (v1[2] * v2[1]);
+    result[1] = (v1[2] * v2[0]) - (v1[0] * v2[2]);
+    result[2] = (v1[0] * v2[1]) - (v1[1] * v2[0]);
 }

-/* glhLookAtf2 adapted from http://www.opengl.org/wiki/GluLookAt_code */
-void glhLookAtf2(const GLfloat* eyePosition3D,
-                 const GLfloat* center3D,
-                 const GLfloat* upVector3D) {

-    /* Look-At Matrix */
-    static Matrix4x4 MatrixLookAt __attribute__((aligned(32))) = {
-        1.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 1.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 1.0f
-    };
+static inline void vec3f_normalize_sh4(float *v){
+    float	length, ilength;

-    GLfloat forward[3];
-    GLfloat side[3];
-    GLfloat up[3];
-
-    vec3f_sub_normalize(center3D[0], center3D[1], center3D[2],
-                        eyePosition3D[0], eyePosition3D[1], eyePosition3D[2],
-                        forward[0], forward[1], forward[2]);
-
-    //Side = forward x up
-    vec3f_cross(forward, upVector3D, side);
-    vec3f_normalize(side[0], side[1], side[2]);
-
-    //Recompute up as: up = side x forward
-    vec3f_cross(side, forward, up);
-
-    MatrixLookAt[M0] = side[0];
-    MatrixLookAt[M4] = side[1];
-    MatrixLookAt[M8] = side[2];
-    MatrixLookAt[M12] = 0;
-
-    MatrixLookAt[M1] = up[0];
-    MatrixLookAt[M5] = up[1];
-    MatrixLookAt[M9] = up[2];
-    MatrixLookAt[M13] = 0;
-
-    MatrixLookAt[M2] = -forward[0];
-    MatrixLookAt[M6] = -forward[1];
-    MatrixLookAt[M10] = -forward[2];
-    MatrixLookAt[M14] = 0;
-
-    MatrixLookAt[M3] = MatrixLookAt[11] = MatrixLookAt[15] = 0;
-    MatrixLookAt[M15] = 1;
-
-    static Matrix4x4 trn __attribute__((aligned(32))) = {
-        1.0f, 0.0f, 0.0f, 0.0f,
-        0.0f, 1.0f, 0.0f, 0.0f,
-        0.0f, 0.0f, 1.0f, 0.0f,
-        0.0f, 0.0f, 0.0f, 1.0f
-    };
-
-    trn[M12] = -eyePosition3D[0];
-    trn[M13] = -eyePosition3D[1];
-    trn[M14] = -eyePosition3D[2];
-
-    // Does not modify internal Modelview matrix
-    upload_matrix(&MatrixLookAt);
-    multiply_matrix(&trn);
-    multiply_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)));
-    download_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)));
+	ilength = MATH_fsrra(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
+	length = MATH_Invert(ilength);
+	if (length)
+	{
+		v[0] *= ilength;
+		v[1] *= ilength;
+		v[2] *= ilength;
+	}
 }

 void gluLookAt(GLfloat eyex, GLfloat eyey, GLfloat eyez, GLfloat centerx,
                GLfloat centery, GLfloat centerz, GLfloat upx, GLfloat upy,
                GLfloat upz) {
-    GLfloat eye [] = { eyex, eyey, eyez };
-    GLfloat point [] = { centerx, centery, centerz };
-    GLfloat up [] = { upx, upy, upz };
-    glhLookAtf2(eye, point, up);
+    /* forward: from the eye towards the look-at point */
+    GLfloat forward[3] = { centerx - eyex, centery - eyey, centerz - eyez };
+    GLfloat up[3] = { upx, upy, upz };
+    GLfloat side[3];
+
+    /* side = forward x up, then up is recomputed as side x forward */
+    vec3f_normalize_sh4(forward);
+    vec3f_cross(forward, up, side);
+    vec3f_normalize_sh4(side);
+    vec3f_cross(side, forward, up);
+
+    /* Elements 0/4/8 hold side, 1/5/9 hold up, 2/6/10 hold -forward;
+     * the remaining elements are those of the identity matrix. */
+    GLfloat m[16] = {
+        side[0], up[0], -forward[0], 0.0f,
+        side[1], up[1], -forward[1], 0.0f,
+        side[2], up[2], -forward[2], 0.0f,
+        0.0f,    0.0f,  0.0f,        1.0f
+    };
+
+    /* Multiply the current matrix by the rotation, then by the eye translation. */
+    glMultMatrixf(m);
+    glTranslatef(-eyex, -eyey, -eyez);
 }

 void _glApplyRenderMatrix() {
--- a/GL/perfctr.c
+++ b/GL/perfctr.c
@ -0,0 +1,247 @@
+// ---- perfctr.c - SH7091 Performance Counter Module Code ----
+//
+// This file is part of the DreamHAL project, a hardware abstraction library
+// primarily intended for use on the SH7091 found in hardware such as the SEGA
+// Dreamcast game console.
+//
+// The performance counter module is hereby released into the public domain in
+// the hope that it may prove useful. Now go profile some code and hit 60 fps! :)
+//
+// --Moopthehedgehog
+
+// See perfctr.h for more of my notes and documentation on these counters.
+#include "perfctr.h"
+#include "cygprofile.h"
+#if CYG_FUNC_TRACE_ENABLED
+
+static unsigned char pmcr_enabled = 0;
+
+//
+// Initialize performance counters. It's just a clear -> enable.
+// It's good practice to clear a counter before starting it for the first time.
+//
+// Also: Disabling and re-enabling the counters doesn't reset them; the clearing
+// needs to happen while a counter is disabled to reset it.
+//
+// You can disable and re-enable with a different mode without explicitly
+// clearing and have it keep going, continuing from where it left off.
+//
+
+__attribute__((no_instrument_function)) void PMCR_Init(int which, unsigned short mode, unsigned char count_type) // Will do nothing if perfcounter is already running!
+{
+	// PMCR_Enable() performs the same "is this counter already running?"
+	// checks for which == 1, 2 and 3 and does nothing for any other value,
+	// so initialising is simply an enable with the reset flag set.
+	PMCR_Enable(which, mode, count_type, PMCR_RESET_COUNTER);
+}
+
+// Enable "undocumented" performance counters (well, they were undocumented at one point. They're documented now!)
+__attribute__((no_instrument_function)) void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_count) // Will do nothing if perfcounter is already running!
+{
+	// Both flags must be 0 or 1; bail out if either has a higher bit set.
+	if((count_type | reset_count) > 1)
+	{
+		return;
+	}
+
+	// Control word: run bit, optional reset, clock type and counting mode.
+	const unsigned short ctrl_word = PMCR_RUN_COUNTER | (reset_count << PMCR_RESET_COUNTER_SHIFT) | (count_type << PMCR_CLOCK_TYPE_SHIFT) | mode;
+	volatile unsigned short *const ctrl1 = (volatile unsigned short*)PMCR1_CTRL_REG;
+	volatile unsigned short *const ctrl2 = (volatile unsigned short*)PMCR2_CTRL_REG;
+
+	// pmcr_enabled is a bitmask: bit 0 = counter 1, bit 1 = counter 2.
+	// A counter that is already marked as running is left untouched.
+	switch(which)
+	{
+	case 1: // counter 1
+		if((pmcr_enabled == 0) || (pmcr_enabled == 2))
+		{
+			*ctrl1 = ctrl_word;
+			pmcr_enabled += 1;
+		}
+		break;
+
+	case 2: // counter 2
+		if((pmcr_enabled == 0) || (pmcr_enabled == 1))
+		{
+			*ctrl2 = ctrl_word;
+			pmcr_enabled += 2;
+		}
+		break;
+
+	case 3: // both, only when neither is running
+		if(pmcr_enabled == 0)
+		{
+			*ctrl1 = ctrl_word;
+			*ctrl2 = ctrl_word;
+			pmcr_enabled = 3;
+		}
+		break;
+
+	default:
+		break;
+	}
+}
+
+// For reference:
+// #define PMCTR1H_REG 0xFF100004
+// #define PMCTR1L_REG 0xFF100008
+
+// #define PMCTR2H_REG 0xFF10000C
+// #define PMCTR2L_REG 0xFF100010
+
+static const unsigned int pmcr1_regh = PMCTR1H_REG;
+static const unsigned int pmcr1_regl = PMCTR1L_REG;
+
+static const unsigned int pmcr2_regh = PMCTR2H_REG;
+static const unsigned int pmcr2_regl = PMCTR2L_REG;
+
+// Sorry, can only read one counter at a time!
+// out_array should be an array consisting of 2x unsigned ints.
+__attribute__((no_instrument_function)) void PMCR_Read(int which, volatile unsigned int *out_array)
+{
+	// Reads one performance counter into out_array: out_array[0] receives the
+	// low 32 bits and out_array[1] the high word masked to its low 16 bits.
+	//
+	// `which` must be 1 or 2 and that counter must be marked enabled in
+	// pmcr_enabled. Otherwise: if no counter is enabled at all, both words
+	// are set to 0; in every remaining case (e.g. which == 3, or the other
+	// counter is the one running) the words are set to 0xffff / 0xffffffff.
+ // if pmcr is not enabled, this function will just return 0
+
+	// little endian (big endian would need to flip [0] and [1])
+
+	// Note: These reads really do need to be done in assembly: unfortunately it
+	// appears that using C causes GCC to insert a branch right smack in between
+	// the high and low reads of perf counter 2 (with a nop, so it's literally
+	// delaying the reads by several cycles!), which is totally insane. Doing it
+	// the assembly way ensures that nothing ridiculous like that happens. It's
+	// also portable between versions of GCC that do put the nonsensical branch in.
+	//
+	// One thing that would be nice is if SH4 had the movi20s instruction to make
+	// absolute addresses in 3 cycles, but only the SH2A has that... :(
+	if( (which == 1) && (pmcr_enabled & 0x1) )
+	{
+		// counter 1
+//		out_array[1] = *((volatile unsigned int*)PMCTR1H_REG) & 0xffff;
+//		out_array[0] = *((volatile unsigned int*)PMCTR1L_REG);
+		// The register addresses come from the static consts pmcr1_regh/regl;
+		// r1 and r2 are used as scratch and declared as clobbered.
+		asm volatile("mov.l %[reg1h],r1\n\t" // load counter address (high)
+								 "mov.l %[reg1l],r2\n\t" // load counter address (low)
+								 "mov.l @r1,r1\n\t" // read counter (high)
+								 "mov.l @r2,r2\n\t" // read counter (low)
+								 "extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff
+								 "mov.l r1,%[outh]\n\t" // get data to memory
+								 "mov.l r2,%[outl]\n\t" // get data to memory
+		: [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0])
+		: [reg1h] "m" (pmcr1_regh), [reg1l] "m" (pmcr1_regl) // SH4 can't mov an immediate longword into a register...
+		: "r1", "r2"
+		);
+	}
+	else if( (which == 2) && (pmcr_enabled & 0x2) )
+	{
+		// counter 2
+//		out_array[1] = *((volatile unsigned int*)PMCTR2H_REG) & 0xffff;
+//		out_array[0] = *((volatile unsigned int*)PMCTR2L_REG);
+		// Same sequence as for counter 1, using pmcr2_regh/regl.
+		asm volatile("mov.l %[reg2h],r1\n\t" // load counter address (high)
+								 "mov.l %[reg2l],r2\n\t" // load counter address (low)
+								 "mov.l @r1,r1\n\t" // read counter (high)
+								 "mov.l @r2,r2\n\t" // read counter (low)
+								 "extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff
+								 "mov.l r1,%[outh]\n\t" // get data to memory
+								 "mov.l r2,%[outl]\n\t" // get data to memory
+		: [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0])
+		: [reg2h] "m" (pmcr2_regh), [reg2l] "m" (pmcr2_regl) // SH4 can't mov an immediate longword into a register...
+		: "r1", "r2"
+		);
+	}
+	else if(!pmcr_enabled)
+	{
+		// No counter is running at all: report zero.
+		out_array[1] = 0;
+		out_array[0] = 0;
+	}
+	else // Invalid
+	{
+		// Some counter is running, but not the one asked for (or `which`
+		// is not 1 or 2): report all-ones.
+		out_array[1] = 0xffff;
+		out_array[0] = 0xffffffff;
+	}
+}
+
+// Reset counter to 0 and start it again
+// NOTE: It does not appear to be possible to clear a counter while it is running.
+__attribute__((no_instrument_function)) void PMCR_Restart(int which, unsigned short mode, unsigned char count_type)
+{
+	// Only a counter that is currently marked as running is restarted;
+	// for which == 3 both counters must be running.
+	const int running =
+		((which == 1) && (pmcr_enabled & 0x1)) ||
+		((which == 2) && (pmcr_enabled & 0x2)) ||
+		((which == 3) && (pmcr_enabled == 3));
+
+	if(!running)
+	{
+		return;
+	}
+
+	// Stop first, then enable again with the reset flag set.
+	PMCR_Stop(which);
+	PMCR_Enable(which, mode, count_type, PMCR_RESET_COUNTER);
+}
+
+// Clearing only works when the counter is disabled. Otherwise, stopping the
+// counter via setting the 0x2000 bit holds the data in the data registers,
+// whereas disabling without setting that bit reads back as all 0 (but doesn't
+// clear the counters for next start). This function just stops a running
+// counter and does nothing if the counter is already stopped or disabled, as
+// clearing is handled by PMCR_Enable().
+__attribute__((no_instrument_function)) void PMCR_Stop(int which)
+{
+	volatile unsigned short *const ctrl1 = (volatile unsigned short*)PMCR1_CTRL_REG;
+	volatile unsigned short *const ctrl2 = (volatile unsigned short*)PMCR2_CTRL_REG;
+
+	// Each case acts only if the requested counter(s) are marked as running,
+	// writes the stop value and clears the matching bit(s) in pmcr_enabled.
+	switch(which)
+	{
+	case 1: // counter 1
+		if(pmcr_enabled & 0x1)
+		{
+			*ctrl1 = PMCR_STOP_COUNTER;
+			pmcr_enabled &= 0x2;
+		}
+		break;
+
+	case 2: // counter 2
+		if(pmcr_enabled & 0x2)
+		{
+			*ctrl2 = PMCR_STOP_COUNTER;
+			pmcr_enabled &= 0x1;
+		}
+		break;
+
+	case 3: // both, only when both are running
+		if(pmcr_enabled == 3)
+		{
+			*ctrl1 = PMCR_STOP_COUNTER;
+			*ctrl2 = PMCR_STOP_COUNTER;
+			pmcr_enabled = 0;
+		}
+		break;
+
+	default:
+		break;
+	}
+}
+
+// Note that disabling does NOT clear the counter.
+// It may appear that way because reading a disabled counter returns 0, but re-
+// enabling without first clearing will simply continue where it left off.
+__attribute__((no_instrument_function)) void PMCR_Disable(int which)
+{
+	volatile unsigned short *const ctrl1 = (volatile unsigned short*)PMCR1_CTRL_REG;
+	volatile unsigned short *const ctrl2 = (volatile unsigned short*)PMCR2_CTRL_REG;
+
+	// Unlike PMCR_Stop(), the disable value is written regardless of whether
+	// the counter is currently marked as running.
+	switch(which)
+	{
+	case 1: // counter 1
+		*ctrl1 = PMCR_DISABLE_COUNTER;
+		pmcr_enabled &= 0x2;
+		break;
+
+	case 2: // counter 2
+		*ctrl2 = PMCR_DISABLE_COUNTER;
+		pmcr_enabled &= 0x1;
+		break;
+
+	case 3: // both
+		*ctrl1 = PMCR_DISABLE_COUNTER;
+		*ctrl2 = PMCR_DISABLE_COUNTER;
+		pmcr_enabled = 0;
+		break;
+
+	default:
+		break;
+	}
+}
+#endif
--- a/GL/perfctr.h
+++ b/GL/perfctr.h
@ -0,0 +1,316 @@
+// ---- perfctr.h - SH7091 Performance Counter Module Header ----
+//
+// This file is part of the DreamHAL project, a hardware abstraction library
+// primarily intended for use on the SH7091 found in hardware such as the SEGA
+// Dreamcast game console.
+//
+// The performance counter module is hereby released into the public domain in
+// the hope that it may prove useful. Now go profile some code and hit 60 fps! :)
+//
+// --Moopthehedgehog
+//
+
+#ifndef __PERFCTR_H__
+#define __PERFCTR_H__
+
+//
+// -- General SH4 Performance Counter Notes --
+//
+// There are 2 performance counters that can measure elapsed time. They are each
+// 48-bit counters. They are part of the so-called "ASE" subsystem, which you can
+// read about in chapter 13 of the "SuperH™ (SH) 32-bit RISC series SH-4, ST40
+// system architecture, volume 1: system":
+// https://www.st.com/content/ccc/resource/technical/document/user_manual/36/75/05/ac/e8/7e/42/2d/CD00147163.pdf/files/CD00147163.pdf/jcr:content/translations/en.CD00147163.pdf
+//
+// They can count cycles, so that's 199.5MHz (not 200MHz!!) a.k.a. roughly 5 ns
+// increments. At 5 ns increments, a 48-bit cycle counter can run continuously
+// for 16.33 days. It's actually 16 days, 7 hours, 55 minutes, and 2 seconds,
+// depending on how close the bus clock is to 99.75MHz. There is also a second
+// mode that counts cycles according to a ratio between the CPU frequency and
+// the system bus clock, and it increments the counter by 12 every bus cycle.
+// This second mode is detailed in the description for PMCR_CLOCK_TYPE in this
+// file, and it is recommended for use when the CPU frequency is not a runtime
+// constant.
+//
+// Side note: The counters don't have an overflow interrupt or overflow bit.
+// (I did actually run one to 48-bit overflow in elapsed time mode using the
+// ratio method to check this. They don't appear to sign-extend the upper 16
+// bits in elapsed time mode, either.)
+//
+// The two counters are functionally identical. I would recommend using the
+// PMCR_Init() function to start one (or both) up the first time.
+//
+// -- Configuration Address Info --
+//
+// Addresses for these counters can be easily seen here, in lxdream's source code:
+// https://github.com/lutris/lxdream/blob/master/src/sh4/sh4mmio.h
+//
+// They are also on display in the Linux kernel, but at the time of writing appear
+// to be set incorrectly (the clock mode at bit 0x100 is never set or cleared,
+// for example, so they're at the mercy of whatever the hardware defaults are):
+// http://git.lpclinux.com/cgit/linux-2.6.28.2-lpc313x/plain/arch/sh/oprofile/op_model_sh7750.c
+// https://github.com/torvalds/linux/blob/master/arch/sh/kernel/cpu/sh4/perf_event.c
+// ...It also appears as though they may not be handling bus ratio mode correctly,
+// which appears to be the default mode on the Dreamcast in all my tests.
+//
+// You can also find these addresses by ripping a copy of Virtua Fighter 3 that
+// you own for Dreamcast and looking at the raw byte code (or a raw disassembly)
+// of its main program binary. It would appear as though they were timing a loop
+// with the low half of perf counter 1 in elapsed time mode. Definitely seems
+// like a good thing to do when targeting 60fps! Shenmue Disc 4 also uses the
+// same configuration, but what's being timed is not as clear.
+//
+// Another place you can actually find both control addresses 0xFF00008x and all
+// data addresses 0xFF10000x is in binaries of ancient, freely available versions
+// of CodeScape. Literally all you need to do is open an SH7750-related DLL in a
+// hex editor and do a search to find the control register addresses, and the
+// data addresses are equally plain to see in any relevant performance profiling
+// firmware. There's no effort or decryption required to find them whatsoever;
+// all you need is an old trial version and a hex editor.
+//
+// However, something even better than all of that is if you search for "SH4
+// 0xFF000084" (without quotes) online you'll find an old forum where some logs
+// were posted of the terminal/command prompt output from some STMicro JTAG tool,
+// which not only has the address registers but also clearly characterizes their
+// size as 16-bit:
+// https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2
+//
+// -- Event Mode Info --
+//
+// Specific information on each counter mode can be found in the document titled
+// "SuperH™ Family E10A-USB Emulator: Additional Document for User’s Manual:
+// Supplementary Information on Using the SH7750R Renesas Microcomputer Development Environment System"
+// which is available on Renesas's website, in the "Documents" section of the
+// E10A-USB product page:
+// https://www.renesas.com/us/en/products/software-tools/tools/emulator/e10a-usb.html
+// At the time of writing (12/2019), the E10A-USB adapter is still available
+// for purchase, and it is priced around $1200 (USD).
+//
+// Appendix C of the "ST40 Micro Toolset Manual" also has these modes documented:
+// https://www.st.com/content/ccc/resource/technical/document/user_manual/c5/98/11/89/50/68/41/66/CD17379953.pdf/files/CD17379953.pdf/jcr:content/translations/en.CD17379953.pdf
+//
+// See here for the hexadecimal values corresponding to each mode (pg. 370):
+// http://www.macmadigan.com/BusaECU/Renesas%20documents/Hitachi_codescape_CS40_light_userguides.pdf
+// You can also find the same "Counter Description Table" in user's guide PDFs
+// bundled in ancient demo versions of CodeScape 3 from 2000 (e.g.
+// CSDemo_272.exe), which can still be found in the Internet Archive.
+// http://web.archive.org/web/*/http://codescape.com/dl/CSDemo/*
+//
+// See here for a support document on Lauterbach's SH2, SH3, and SH4 debugger,
+// which contains units for each mode (e.g. which measure time and which just
+// count): https://www.lauterbach.com/frames.html?home.html (It's in Downloads
+// -> Trace32 Help System -> it's the file called "SH2, SH3 and SH4 Debugger"
+// with the filename debugger_sh4.pdf).
+//
+
+//
+// --- Performance Counter Registers ---
+//
+
+// These registers are 16 bits only and configure the performance counters
+// (one control register per counter)
+#define PMCR1_CTRL_REG 0xFF000084
+#define PMCR2_CTRL_REG 0xFF000088
+
+// These registers are 32 bits each and hold the high and low parts of each
+// counter (H = high part, L = low part)
+#define PMCTR1H_REG 0xFF100004
+#define PMCTR1L_REG 0xFF100008
+
+#define PMCTR2H_REG 0xFF10000C
+#define PMCTR2L_REG 0xFF100010
+
+//
+// --- Performance Counter Configuration Flags ---
+//
+
+// These bits' functions are currently unknown, but they may simply be reserved.
+// It's possible that there's a [maybe expired?] patent that details the
+// configuration registers, though I haven't been able to find one. Places to
+// check would be Google Patents and the Japanese Patent Office--maybe someone
+// else can find something?
+//
+// Some notes:
+// Writing 1 to all of these bits reads back as 0, so it looks like they aren't
+// config bits. It's possible they are write-only like the stop bit, though,
+// or that they're just reserved-write-0-only. It appears that they are always
+// written with zeros in software that uses them, so that's confirmed safe to do.
+//
+// Also, after running counter 1 to overflow, it appears there's no overflow bit
+// (maybe the designers thought 48-bits would be so much to count to that they
+// didn't bother implementing one?). The upper 16-bits of the counter high
+// register are also not sign-extension bits. They may be a hidden config area,
+// but probably not because big endian mode would swap the byte order.
+// Control register bits with no known function. Each name carries the bit's
+// mask value; writing them as 0 is the only usage confirmed to be safe.
+#define PMCR_UNKNOWN_BIT_0040 0x0040
+#define PMCR_UNKNOWN_BIT_0080 0x0080
+#define PMCR_UNKNOWN_BIT_0200 0x0200
+#define PMCR_UNKNOWN_BIT_0400 0x0400
+#define PMCR_UNKNOWN_BIT_0800 0x0800
+#define PMCR_UNKNOWN_BIT_1000 0x1000
+
+// PMCR_MODE_CLEAR_INVERTED is the mask of the event mode field (the low 6 bits
+// of a control value). AND a control value with ~PMCR_MODE_CLEAR_INVERTED to
+// clear the event mode; the event modes themselves are listed below.
+#define PMCR_MODE_CLEAR_INVERTED 0x003f
+
+// PMCR_CLOCK_TYPE sets the counters to count clock cycles or CPU/bus ratio mode
+// cycles (where T = C x B / 24 and T is time, C is count, and B is time
+// of one bus cycle). Note: B = 1/99753008 or so, but it may vary, as mine is
+// actually 1/99749010-ish; the target frequency is probably meant to be 99.75MHz.
+//
+// See the ST40 or Renesas SH7750R documents described in the above "Event Mode
+// Info" section for more details about that formula.
+//
+// Set PMCR_CLOCK_TYPE to 0 for CPU cycle counting, where 1 count = 1 cycle, or
+// set it to 1 to use the above formula. Renesas documentation recommends using
+// the ratio version (set the bit to 1) when user programs alter CPU clock
+// frequencies. This header has some definitions later on to help with this.
+#define PMCR_CLOCK_TYPE 0x0100
+#define PMCR_CLOCK_TYPE_SHIFT 8 // Bit position of PMCR_CLOCK_TYPE (0x0100 == 1 << 8)
+
+// PMCR_STOP_COUNTER is write-only, as it always reads back as 0. It does what
+// the name suggests: when this bit is written to, the counter stops. However,
+// if written to while the counter is disabled or stopped, the counter's high
+// and low registers are reset to 0.
+//
+// Using PMCR_STOP_COUNTER to stop the counter has the effect of holding the
+// data in the data registers while stopped, unlike PMCR_DISABLE_COUNTER, and
+// this bit needs to be written to again (e.g. on next start) in order to
+// actually clear the counter data for another run. If not explicitly cleared,
+// the counter will continue from where it left off before being stopped.
+#define PMCR_STOP_COUNTER 0x2000
+// Bit position of PMCR_STOP_COUNTER (0x2000 == 1 << 13). It is named "reset"
+// because the same bit clears a counter that is already stopped or disabled.
+#define PMCR_RESET_COUNTER_SHIFT 13
+
+// Bits 0xC000 both need to be set to 1 for the counters to actually begin
+// counting. I have seen that the Linux kernel actually separates them out into
+// two separate labelled bits (PMEN and PMST) for some reason, however they do
+// not appear to do anything separately. Perhaps this is a two-bit mode where
+// 1-1 is run, 1-0 and 0-1 are ???, and 0-0 is off.
+#define PMCR_RUN_COUNTER 0xC000
+#define PMCR_RUN_SHIFT 14 // Position of the lower of the two run bits (0xC000 == 0x3 << 14)
+// Interestingly, the output here writes 0x6000 to the counter config registers,
+// which would be the "PMST" bit and the "RESET" bit:
+// https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2
+
+// To disable a counter, just write 0 to its config register. This will not
+// reset the counter to 0, as that requires an explicit clear via setting the
+// PMCR_STOP_COUNTER bit. What's odd is that a disabled counter's data
+// registers read back as all 0, but re-enabling it without a clear will
+// continue from the last value before disabling.
+#define PMCR_DISABLE_COUNTER 0x0000 // No control bits set, so both PMCR_RUN_COUNTER bits are clear
+
+// These definitions merely separate out the two PMCR_RUN_COUNTER bits, and
+// they are included here for documentation purposes.
+
+// PMST may mean PMCR START. It's consistently used to enable the counter.
+// I'm just calling it PMST here for lack of a better name, since this is what
+// the Linux kernel and lxdream call it. It could also have something to do with
+// a mode specific to STMicroelectronics.
+#define PMCR_PMST_BIT 0x4000
+#define PMCR_PMST_SHIFT 14 // Bit position of PMCR_PMST_BIT (0x4000 == 1 << 14)
+
+// Likewise PMEN may mean PMCR ENABLE
+#define PMCR_PMEN_BIT 0x8000
+#define PMCR_PMEN_SHIFT 15 // Bit position of PMCR_PMEN_BIT (0x8000 == 1 << 15)
+
+//
+// --- Performance Counter Event Code Definitions ---
+//
+// Interestingly enough, it so happens that the SEGA Dreamcast's CPU seems to
+// contain the same performance counter functionality as SH4 debug adapters for
+// the SH7750R. Awesome!
+//
+
+//                MODE DEFINITION                  VALUE   MEASUREMENT TYPE & NOTES
+#define PMCR_INIT_NO_MODE                           0x00 // None; Just here to be complete
+#define PMCR_OPERAND_READ_ACCESS_MODE               0x01 // Quantity; With cache
+#define PMCR_OPERAND_WRITE_ACCESS_MODE              0x02 // Quantity; With cache
+#define PMCR_UTLB_MISS_MODE                         0x03 // Quantity
+#define PMCR_OPERAND_CACHE_READ_MISS_MODE           0x04 // Quantity
+#define PMCR_OPERAND_CACHE_WRITE_MISS_MODE          0x05 // Quantity
+#define PMCR_INSTRUCTION_FETCH_MODE                 0x06 // Quantity; With cache
+#define PMCR_INSTRUCTION_TLB_MISS_MODE              0x07 // Quantity
+#define PMCR_INSTRUCTION_CACHE_MISS_MODE            0x08 // Quantity
+#define PMCR_ALL_OPERAND_ACCESS_MODE                0x09 // Quantity
+#define PMCR_ALL_INSTRUCTION_FETCH_MODE             0x0a // Quantity
+#define PMCR_ON_CHIP_RAM_OPERAND_ACCESS_MODE        0x0b // Quantity
+// No 0x0c
+#define PMCR_ON_CHIP_IO_ACCESS_MODE                 0x0d // Quantity
+#define PMCR_OPERAND_ACCESS_MODE                    0x0e // Quantity; With cache, counts both reads and writes
+#define PMCR_OPERAND_CACHE_MISS_MODE                0x0f // Quantity
+#define PMCR_BRANCH_ISSUED_MODE                     0x10 // Quantity; Not the same as branch taken!
+#define PMCR_BRANCH_TAKEN_MODE                      0x11 // Quantity
+#define PMCR_SUBROUTINE_ISSUED_MODE                 0x12 // Quantity; Issued a BSR, BSRF, JSR, JSR/N
+#define PMCR_INSTRUCTION_ISSUED_MODE                0x13 // Quantity
+#define PMCR_PARALLEL_INSTRUCTION_ISSUED_MODE       0x14 // Quantity
+#define PMCR_FPU_INSTRUCTION_ISSUED_MODE            0x15 // Quantity
+#define PMCR_INTERRUPT_COUNTER_MODE                 0x16 // Quantity
+#define PMCR_NMI_COUNTER_MODE                       0x17 // Quantity
+#define PMCR_TRAPA_INSTRUCTION_COUNTER_MODE         0x18 // Quantity
+#define PMCR_UBC_A_MATCH_MODE                       0x19 // Quantity
+#define PMCR_UBC_B_MATCH_MODE                       0x1a // Quantity
+// No 0x1b-0x20
+#define PMCR_INSTRUCTION_CACHE_FILL_MODE            0x21 // Cycles
+#define PMCR_OPERAND_CACHE_FILL_MODE                0x22 // Cycles
+#define PMCR_ELAPSED_TIME_MODE                      0x23 // Cycles; For 200MHz CPU: 5ns per count in 1 cycle = 1 count mode, or around 417.715ps per count (increments by 12) in CPU/bus ratio mode
+#define PMCR_PIPELINE_FREEZE_BY_ICACHE_MISS_MODE    0x24 // Cycles
+#define PMCR_PIPELINE_FREEZE_BY_DCACHE_MISS_MODE    0x25 // Cycles
+// No 0x26
+#define PMCR_PIPELINE_FREEZE_BY_BRANCH_MODE         0x27 // Cycles
+#define PMCR_PIPELINE_FREEZE_BY_CPU_REGISTER_MODE   0x28 // Cycles
+#define PMCR_PIPELINE_FREEZE_BY_FPU_MODE            0x29 // Cycles
+
+//
+// --- Performance Counter Support Definitions ---
+//
+
+// This definition can be passed as the init/enable/restart functions'
+// count_type parameter to use the 1 cycle = 1 count mode. This is how the
+// counter can be made to run for 16.3 days (see PMCR_CPU_CYCLES_MAX_SECONDS).
+#define PMCR_COUNT_CPU_CYCLES 0
+// Likewise this uses the CPU/bus ratio method (see PMCR_CLOCK_TYPE for the
+// formula that converts such a count to time)
+#define PMCR_COUNT_RATIO_CYCLES 1
+
+// These definitions are for the enable function's reset_counter parameter and
+// specify whether to reset a counter to 0 or to continue from where it left off
+#define PMCR_CONTINUE_COUNTER 0
+#define PMCR_RESET_COUNTER 1
+
+//
+// --- Performance Counter Miscellaneous Definitions ---
+//
+// For convenience; assume stock bus clock of 99.75MHz
+// (Bus clock is the external CPU clock, not the peripheral bus clock)
+//
+
+#define PMCR_SH4_CPU_FREQUENCY 199500000 // Hz; 2 x PMCR_SH4_BUS_FREQUENCY
+#define PMCR_CPU_CYCLES_MAX_SECONDS 1410902 // 2^48 counts / 199500000 counts per second, rounded down
+#define PMCR_SH4_BUS_FREQUENCY 99750000 // Hz
+// NOTE: the value below is larger than a signed 32-bit int can hold
+#define PMCR_SH4_BUS_FREQUENCY_SCALED 2394000000 // 99.75MHz x 24
+#define PMCR_BUS_RATIO_MAX_SECONDS 117575 // 2^48 counts / 2394000000 counts per second, rounded down
+
+//
+// --- Performance Counter Functions ---
+//
+// See perfctr.c file for more details about each function and some more usage notes.
+//
+// Note: PMCR_Init() and PMCR_Enable() will do nothing if the perf counter is already running!
+//
+
+// Parameters shared by the functions below:
+//   which         - 1 = counter 1, 2 = counter 2, 3 = both counters
+//   mode          - one of the event mode definitions above (e.g. PMCR_ELAPSED_TIME_MODE)
+//   count_type    - PMCR_COUNT_CPU_CYCLES or PMCR_COUNT_RATIO_CYCLES
+//   reset_counter - PMCR_RESET_COUNTER or PMCR_CONTINUE_COUNTER
+
+// Clear counter and enable
+void PMCR_Init(int which, unsigned short mode, unsigned char count_type);
+
+// Enable one or both of these "undocumented" performance counters.
+void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_counter);
+
+// Disable, clear, and re-enable with new mode (or same mode)
+void PMCR_Restart(int which, unsigned short mode, unsigned char count_type);
+
+// Read a counter
+// out_array is specifically uint32 out_array[2] -- 48-bit value needs a 64-bit storage unit
+void PMCR_Read(int which, volatile unsigned int *out_array);
+
+// Stop counter(s) (without clearing); only acts on counters that are running
+void PMCR_Stop(int which);
+
+// Disable counter(s) (without clearing)
+void PMCR_Disable(int which);
+
+#endif /* __PERFCTR_H__ */
--- a/GL/private.h
+++ b/GL/private.h
@ -6,6 +6,7 @@
 #include "../include/gl.h"
 #include "../containers/aligned_vector.h"
 #include "../containers/named_array.h"
+#include "cygprofile.h"

 extern void* memcpy4 (void *dest, const void *src, size_t count);

@ -249,6 +250,11 @@ typedef struct {
    GLint size;
 } AttribPointer;

+/* Per-vertex input handed to _glPerformLighting().
+ * NOTE(review): the name suggests the values are in eye space -- confirm
+ * against the code that fills this struct in. */
+typedef struct {
+    float xyz[3]; /* presumably the position (x, y, z) -- TODO confirm */
+    float n[3];   /* presumably the normal vector -- TODO confirm */
+} EyeSpaceData;
+
 GLboolean _glCheckValidEnum(GLint param, GLint* values, const char* func);

 GLuint* _glGetEnabledAttributes();
@ -280,7 +286,7 @@ GLuint _glGetMipmapLevelCount(TextureObject* obj);
 GLboolean _glIsLightingEnabled();
 GLboolean _glIsLightEnabled(GLubyte light);
 GLboolean _glIsColorMaterialEnabled();
-void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour);
+/* NOTE(review): presumably lights `count` vertices in place, using the matching
+ * entries of `es` as per-vertex input -- confirm against the definition. */
+void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count);

 unsigned char _glIsClippingEnabled();
 void _glEnableClipping(unsigned char v);
--- a/GL/profiler.c
+++ b/GL/profiler.c
@ -6,6 +6,8 @@
 #include "profiler.h"
 #include "../containers/aligned_vector.h"

+#if PROFILING_COMPILED
+
 #define MAX_PATH 256

 typedef struct {
@ -141,3 +143,4 @@ void profiler_print_stats() {
        fprintf(stderr, "%-60s%-20f%-20f%" PRIu64 "\n", result->name, (double)avg, (double)ms, result->total_calls);
    }
 }
+#endif
--- a/GL/profiler.h
+++ b/GL/profiler.h
@ -7,12 +7,26 @@ typedef struct {
    uint64_t start_time_in_us;
 } Profiler;

+/* Set to 1 (here, or from the build with -DPROFILING_COMPILED=1) to compile the
+ * profiler in. With 0, every profiler_*() call becomes a no-op macro. */
+#ifndef PROFILING_COMPILED
+#define PROFILING_COMPILED 0
+#endif
 
+#if PROFILING_COMPILED
+/* These names must match the definitions in profiler.c and the no-op macros
+ * in the #else branch, so callers compile the same way in both configurations. */
 Profiler* profiler_push(const char* name);
 void profiler_checkpoint(const char* name);
 void profiler_pop();
 
 void profiler_print_stats();
 
 void profiler_enable();
 void profiler_disable();
+
+#else
+/* Profiling is compiled out: every call expands to a no-op expression.
+ * The expansion deliberately carries no semicolon of its own, so that
+ * `if (x) profiler_pop(); else ...` still parses as a single statement. */
+#define profiler_push(name) ((void)0)
+#define profiler_checkpoint(name) ((void)0)
+#define profiler_pop() ((void)0)
+
+#define profiler_print_stats() ((void)0)
+
+#define profiler_enable() ((void)0)
+#define profiler_disable() ((void)0)
+
+#endif
--- a/GL/sh4_math.h
+++ b/GL/sh4_math.h
--- a/GL/texture.c
+++ b/GL/texture.c
@ -743,11 +743,11 @@ GLint _cleanInternalFormat(GLint internalFormat) {

 typedef void (*TextureConversionFunc)(const GLubyte*, GLubyte*);

-static inline void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) {
    *((GLushort*) dest) = (source[3] & 0xF0) << 8 | (source[0] & 0xF0) << 4 | (source[1] & 0xF0) | (source[2] & 0xF0) >> 4;
 }

-static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    /* Noop */
    GLubyte* dst = (GLubyte*) dest;
    dst[0] = source[0];
@ -756,11 +756,11 @@ static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    dst[3] = source[3];
 }

-static inline void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) {
    *((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3);
 }

-static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    /* Noop */
    GLubyte* dst = (GLubyte*) dest;
    dst[0] = source[0];
@ -769,24 +769,24 @@ static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    dst[3] = 255;
 }

-static inline void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) {
    *((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3);
 }

-static inline void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) {
    *((GLushort*) dest) = ((source[3] & 0b11111000) << 8);
 }

-static inline void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) {
    *((GLushort*) dest) = (source[0] & 0b11111000) << 8;
 }

-static inline void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) {
    GLushort* src = (GLushort*) source;
    *((GLushort*) dest) = ((*src & 0x000F) << 12) | *src >> 4;
 }

-static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    GLushort src = *((GLushort*) source);
    GLubyte* dst = (GLubyte*) dest;

@ -796,7 +796,7 @@ static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
    dst[3] = ((src & 0x000F)) * 2;
 }

-static inline void _i8_to_i8(const GLubyte* source, GLubyte* dest) {
+static INLINE_DEBUG void _i8_to_i8(const GLubyte* source, GLubyte* dest) {
    /* For indexes */
    GLubyte* dst = (GLubyte*) dest;
    *dst = *source;
--- a/containers/aligned_vector.c
+++ b/containers/aligned_vector.c
@ -3,6 +3,8 @@
 #include <math.h>
 #include <assert.h>
 #include <stdio.h>
+#include <dc/sq.h>
+#include <kos/string.h>

 #if defined(__APPLE__) || defined(__WIN32__)
 /* Linux + Kos define this, OSX does not, so just use malloc there */
@ -25,7 +27,7 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size) {
 }


-static inline unsigned int round_to_chunk_size(unsigned int val) {
+static INLINE_DEBUG unsigned int round_to_chunk_size(unsigned int val) {
    const unsigned int n = val;
    const unsigned int m = ALIGNED_VECTOR_CHUNK_SIZE;

@ -107,33 +109,12 @@ void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_co
    }
 }

-void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
-    #if 0
-    if(index >= vector->size){
-        char msg[60];
-        sprintf(msg, "Vector OOB: %d %d wanted %d\n", vector->capacity, vector->size, index);
-        //aligned_vector_resize(vector, index);
-        assert_msg(index < vector->size, msg);
-    }
-    #endif
-    assert(index < vector->size);
-    return &vector->data[index * vector->element_size];
-}
-
-void* aligned_vector_back(AlignedVector* vector) {
-    return aligned_vector_at(vector, vector->size - 1);
-}
-
 void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) {
    const unsigned int current = vector->size;
    aligned_vector_resize(vector, vector->size + additional_count);
    return aligned_vector_at(vector, current);
 }

-void aligned_vector_clear(AlignedVector* vector) {
-    vector->size = 0;
-}
-
 void aligned_vector_shrink_to_fit(AlignedVector* vector) {
    if(vector->size == 0) {
        free(vector->data);
--- a/containers/aligned_vector.h
+++ b/containers/aligned_vector.h
@ -5,6 +5,8 @@
 extern "C" {
 #endif

+#include "../GL/cygprofile.h"
+
 typedef struct {
    unsigned int size;
    unsigned int capacity;
@ -18,12 +20,27 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size);
 void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
 void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count);
 void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
-void* aligned_vector_at(const AlignedVector* vector, const unsigned int index);
+/* Returns the address of element `index` inside the vector's storage.
+ * No bounds check is performed, so the caller must pass an index that is
+ * smaller than vector->size. */
+INLINE_ALWAYS void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
+    /* Elements are element_size bytes apart, starting at data */
+    return vector->data + index * vector->element_size;
+}
 void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count);
-void aligned_vector_clear(AlignedVector* vector);
+/* Marks the vector as empty by resetting its size to 0; nothing else is touched here. */
+INLINE_ALWAYS void aligned_vector_clear(AlignedVector* vector){
+    vector->size = 0;
+}
 void aligned_vector_shrink_to_fit(AlignedVector* vector);
 void aligned_vector_cleanup(AlignedVector* vector);
-void* aligned_vector_back(AlignedVector* vector);
+/* Returns the address of the last element. The vector must not be empty:
+ * with size == 0 the unsigned index wraps around and the result is invalid. */
+INLINE_ALWAYS void* aligned_vector_back(AlignedVector* vector){
+    const unsigned int last_index = vector->size - 1;
+    return aligned_vector_at(vector, last_index);
+}

 #ifdef __cplusplus
 }
--- a/containers/named_array.c
+++ b/containers/named_array.c
@ -44,13 +44,6 @@ void named_array_init(NamedArray* array, unsigned int element_size, unsigned int
    memset(array->elements, 0, element_size * max_elements);
 }

-char named_array_used(NamedArray* array, unsigned int id) {
-    unsigned int i = id / 8;
-    unsigned int j = id % 8;
-
-    unsigned char v = array->used_markers[i] & (unsigned char) (1 << j);
-    return !!(v);
-}

 void* named_array_alloc(NamedArray* array, unsigned int* new_id) {
    unsigned int i = 0, j = 0;
--- a/containers/named_array.h
+++ b/containers/named_array.h
@ -5,6 +5,8 @@
 extern "C" {
 #endif

+#include "../GL/cygprofile.h"
+
 typedef struct {
    unsigned int element_size;
    unsigned int max_element_count;
@ -14,7 +16,13 @@ typedef struct {
 } NamedArray;

 void named_array_init(NamedArray* array, unsigned int element_size, unsigned int max_elements);
-char named_array_used(NamedArray* array, unsigned int id);
+/* Returns 1 if `id` is flagged in the array's used_markers bitmap, else 0.
+ * The bitmap stores eight ids per byte; `id` is not range-checked here. */
+INLINE_ALWAYS char named_array_used(NamedArray* array, unsigned int id) {
+    const unsigned int byte_index = id >> 3;                          /* id / 8 */
+    const unsigned char bit_mask = (unsigned char) (1 << (id & 7u));  /* bit id % 8 */
+
+    return (array->used_markers[byte_index] & bit_mask) != 0;
+}

 void* named_array_alloc(NamedArray* array, unsigned int* new_id);
 void* named_array_reserve(NamedArray* array, unsigned int id);