diff --git a/GL/config.h b/GL/config.h index f119e9a..7e3be3a 100644 --- a/GL/config.h +++ b/GL/config.h @@ -1,9 +1,8 @@ +#pragma once #ifndef CONFIG_H #define CONFIG_H - /* This figure is derived from the needs of Quake 1 */ #define MAX_TEXTURE_COUNT 1088 - #endif // CONFIG_H diff --git a/GL/cygprofile.c b/GL/cygprofile.c new file mode 100644 index 0000000..293d179 --- /dev/null +++ b/GL/cygprofile.c @@ -0,0 +1,227 @@ +/* Based on the idea from Erich Styger */ +/* profiled instrument guided profiling for gldc on hardware */ + +#include "cygprofile.h" +#include +#include +#include +#include +#include "perfctr.h" +#include "private.h" + +#if CYG_FUNC_TRACE_ENABLED + +#define _strcat(x, y, z) strncat(x, z, y) + +#ifndef __PE_Error_H +#define __PE_Error_H + +#define ERR_OK 0 /* OK */ +#define ERR_SPEED 1 /* This device does not work in the active speed mode. */ +#define ERR_RANGE 2 /* Parameter out of range. */ +#define ERR_VALUE 3 /* Parameter of incorrect value. */ +#define ERR_OVERFLOW 4 /* Timer overflow. */ +#define ERR_MATH 5 /* Overflow during evaluation. */ +#define ERR_ENABLED 6 /* Device is enabled. */ +#define ERR_DISABLED 7 /* Device is disabled. */ +#define ERR_BUSY 8 /* Device is busy. */ +#define ERR_NOTAVAIL 9 /* Requested value or method not available. */ +#define ERR_RXEMPTY 10 /* No data in receiver. */ +#define ERR_TXFULL 11 /* Transmitter is full. */ +#define ERR_BUSOFF 12 /* Bus not available. */ +#define ERR_OVERRUN 13 /* Overrun error is detected. */ +#define ERR_FRAMING 14 /* Framing error is detected. */ +#define ERR_PARITY 15 /* Parity error is detected. */ +#define ERR_NOISE 16 /* Noise error is detected. */ +#define ERR_IDLE 17 /* Idle error is detected. */ +#define ERR_FAULT 18 /* Fault error is detected. */ +#define ERR_BREAK 19 /* Break char is received during communication. */ +#define ERR_CRC 20 /* CRC error is detected. */ +#define ERR_ARBITR 21 /* A node loses arbitration.
This error occurs if two nodes start transmission at the same time. */ +#define ERR_PROTECT 22 /* Protection error is detected. */ + +#endif /* __PE_Error_H */ + +#define CYG_RNG_BUF_NOF_ELEMS (8096 * 4) +/*!< Number of elements in the ring buffer which is used to record function calls */ +#define CYG_THUMB_MASK 0xFFFFFFFF +/*!< address mask applied when printing; 0xFFFFFFFF keeps every bit (no thumb LSB is stripped) */ + +/* Hashing function for two uint32_ts */ +#define HASH_PAIR(x, y) (((x)*0x1f1f1f1f) ^ (y)) + +static bool CYG_Enabled = false; /*!< flag which enables/disables tracing */ + +/*! + * Element in ring buffer to store the trace information. + */ +typedef struct +{ + //bool isEnter; /*!< TRUE for __cyg_profile_func_enter(), FALSE for __cyg_profile_func_exit() */ + void *this_fn; /*!< address (with thumb bit) of the (caller) function */ + void *call_site; /*!< return address to the function which called this_fn */ + uint32_t counter; /* hit count of the slot; 0 marks an unused slot */ +} CYG_RNG_ElementType; + +typedef uint32_t CYG_RNG_BufSizeType; /*!< index type for ring buffer */ + +static CYG_RNG_ElementType CYG_RNG_buffer[CYG_RNG_BUF_NOF_ELEMS]; /*!< ring buffer */ +//static CYG_RNG_BufSizeType CYG_RNG_inIdx; /*!< input index */ +static CYG_RNG_BufSizeType CYG_RNG_outIdx; /*!< output index */ +static CYG_RNG_BufSizeType CYG_RNG_inSize; /*!< size/number of elements in buffer */ + +/*! + * \brief Stores a trace element into the ring buffer. + * \param elem Trace element to put into the buffer. + * \return Error code, ERR_OK if everything is ok.
+ */ +__attribute__((no_instrument_function)) static uint8_t CYG_RNG_Put(CYG_RNG_ElementType *elem) { + uint8_t res = ERR_OK; + +#if 0 + if (CYG_RNG_inSize == CYG_RNG_BUF_NOF_ELEMS) + { + res = ERR_TXFULL; + CYG_RNG_inSize--; + CYG_PrintCallTrace(); + //CYG_RNG_inIdx = 0; + CYG_RNG_outIdx = 0; + CYG_RNG_inSize = 0; + return CYG_RNG_Put(elem); + } + else + { + //CYG_RNG_buffer[CYG_RNG_inIdx] = *elem; + + /* + CYG_RNG_inIdx++; + if (CYG_RNG_inIdx == CYG_RNG_BUF_NOF_ELEMS) + { + CYG_RNG_inIdx = 0; + } + */ + CYG_RNG_inSize++; + } +#endif + CYG_RNG_ElementType *possible = &CYG_RNG_buffer[HASH_PAIR((uint32_t)elem->call_site, (uint32_t)elem->this_fn) % CYG_RNG_BUF_NOF_ELEMS]; + if (possible->counter /*& 0x0FFFFFFF*/ == 0) { + *possible = *elem; + } else { + possible->counter++; + } + return res; +} + +/*! + * \brief Gets a trace element from the ring buffer. + * \param elem Pointer where to store the trace element. + * \return Error code, ERR_OK if everything is ok. + */ +__attribute__((no_instrument_function)) static uint8_t CYG_RNG_Get(CYG_RNG_ElementType *elemP) { + uint8_t res = ERR_OK; + + if (CYG_RNG_inSize == 0) { + res = ERR_RXEMPTY; + } else { + *elemP = CYG_RNG_buffer[CYG_RNG_outIdx]; + CYG_RNG_inSize--; + CYG_RNG_outIdx++; + if (CYG_RNG_outIdx == CYG_RNG_BUF_NOF_ELEMS) { + CYG_RNG_outIdx = 0; + } + } + return res; +} + +static uint32_t currentTime[2]; +static uint32_t lastTime; + +/*! + * \brief Stores a trace element into the ring buffer. + * \param this_fn Address of the caller function. + * \param call_site Return address to the function which called this_fn + * \return Error code, ERR_OK if everything is ok. 
+ */ +__attribute__((no_instrument_function)) static void CYG_Store(void *this_fn, void *call_site) { + CYG_RNG_ElementType elem; + lastTime = currentTime[0]; + PMCR_Read(1, (unsigned int *)currentTime); + //elem.isEnter = isEnter; + elem.call_site = call_site; + elem.this_fn = this_fn; + elem.counter = 1; //currentTime[0] - lastTime; + CYG_RNG_Put(&elem); +} + +/*! + * \brief Function which is called upon function enter. The function call is inserted by the compiler. + * \param this_fn Address of the caller function. + * \param call_site Return address to the function which called this_fn + */ +__attribute__((no_instrument_function)) void __cyg_profile_func_enter(void *this_fn, void *call_site) { + if (CYG_Enabled) { + CYG_Store(call_site, this_fn); + } +} + +/*! + * \brief Function which is called upon function exit. The function call is inserted by the compiler. + * \param this_fn Address of the caller function. + * \param call_site Return address to the function which called this_fn + */ +__attribute__((no_instrument_function)) void __cyg_profile_func_exit(__attribute__((unused)) void *this_fn, __attribute__((unused)) void *call_site) { +} + +/*! + * \brief Dumps the trace to the console. 
+ */ +__attribute__((no_instrument_function)) void CYG_PrintCallTrace(void) { + CYG_RNG_BufSizeType i; + char buf[40]; + CYG_RNG_ElementType elem; + uint8_t res; + + CYG_Enabled = false; + printf("0x%08x\n", ((unsigned int)&_etext) - BASE_ADDRESS); + //printf("Function Trace:\r\n"); + CYG_RNG_outIdx = 0; + for (i = 0; i < CYG_RNG_BUF_NOF_ELEMS; i++) { + buf[0] = '\0'; + res = CYG_RNG_Get(&elem); + if (res == ERR_OK && elem.call_site != NULL) { + snprintf(buf, sizeof(buf), "{ 0x%" PRIXPTR " 0x%" PRIXPTR " %u\r\n", (uintptr_t)(elem.this_fn) & CYG_THUMB_MASK, (uintptr_t)(elem.call_site) & CYG_THUMB_MASK, (unsigned int)elem.counter); + + printf("%s", buf); /* buf is data: never pass a non-literal buffer as the format string */ + } else { + //printf("ERROR getting element!\r\n"); + } + } + //printf("Function Trace: done!\r\n"); +} + +__attribute__((no_instrument_function)) void CYG_Init(void) { + if (CYG_Enabled) { + return; + } + CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS; + CYG_RNG_outIdx = 0; + CYG_Enabled = true; + currentTime[0] = currentTime[1] = 0; + lastTime = 0; + memset(CYG_RNG_buffer, 0, sizeof(CYG_RNG_buffer)); + PMCR_Init(1, PMCR_ELAPSED_TIME_MODE, PMCR_COUNT_CPU_CYCLES); +} + +__attribute__((no_instrument_function)) void CYG_Deinit(void) { + CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS; + CYG_RNG_outIdx = 0; + CYG_Enabled = false; + memset(CYG_RNG_buffer, 0, sizeof(CYG_RNG_buffer)); +} +#else + +void CYG_PrintCallTrace(void){} +void CYG_Init(void){} +void CYG_Deinit(void){} + +#endif diff --git a/GL/cygprofile.h b/GL/cygprofile.h new file mode 100644 index 0000000..7f2cbcb --- /dev/null +++ b/GL/cygprofile.h @@ -0,0 +1,33 @@ +#pragma once +#ifndef CYGPROFILE_H_ +#define CYGPROFILE_H_ + +/* Based on the idea from Erich Styger */ +/* profiled instrument guided profiling for gldc on hardware */ + +#define NO_INSTRUMENT inline __attribute__((no_instrument_function)) +#define INLINE_DEBUG NO_INSTRUMENT __attribute__((always_inline)) +#define INLINE_ALWAYS static NO_INSTRUMENT __attribute__((always_inline)) + +extern char _etext; +#define
BASE_ADDRESS 0x8c010000 + +#define CYG_FUNC_TRACE_ENABLED (1) +/*!< 1: Trace enabled, 0: trace disabled */ + +/*! + * \brief Print the call trace to the terminal. + */ +void CYG_PrintCallTrace(void); + +/*! + * \brief Driver Initialization. + */ +void CYG_Init(void); + +/*! + * \brief Driver De-Initialization. + */ +void CYG_Deinit(void); + +#endif /* CYGPROFILE_H_ */ \ No newline at end of file diff --git a/GL/draw.c b/GL/draw.c index 68331f4..e97e263 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -56,7 +56,7 @@ void _glInitAttributePointers() { NORMAL_POINTER.size = 3; } -static inline GLuint byte_size(GLenum type) { +static INLINE_DEBUG GLuint byte_size(GLenum type) { switch(type) { case GL_BYTE: return sizeof(GLbyte); case GL_UNSIGNED_BYTE: return sizeof(GLubyte); @@ -513,7 +513,7 @@ PVRHeader* _glSubmissionTargetHeader(SubmissionTarget* target) { return aligned_vector_at(&target->output->vector, target->header_offset); } -Vertex* _glSubmissionTargetStart(SubmissionTarget* target) { +INLINE_DEBUG Vertex* _glSubmissionTargetStart(SubmissionTarget* target) { assert(target->start_offset < target->output->vector.size); return aligned_vector_at(&target->output->vector, target->start_offset); } @@ -1006,6 +1006,7 @@ static void mat_transform_normal3(const float* xyz, const float* xyzOut, const u static void light(SubmissionTarget* target) { +#if 0 typedef struct { float xyz[3]; float n[3]; @@ -1057,6 +1058,35 @@ static void light(SubmissionTarget* target) { vertex->bgra[G8IDX] = (GLubyte) (255.0f * fminf(total[1], 1.0f)); vertex->bgra[B8IDX] = (GLubyte) (255.0f * fminf(total[2], 1.0f)); } +#endif + + if(!_glIsLightingEnabled()) { + return; + } + + static AlignedVector* eye_space_data = NULL; + + if(!eye_space_data) { + eye_space_data = (AlignedVector*) malloc(sizeof(AlignedVector)); + aligned_vector_init(eye_space_data, sizeof(EyeSpaceData)); + } + + aligned_vector_resize(eye_space_data, target->count); + + /* Perform lighting calculations and manipulate the colour */ + 
Vertex* vertex = _glSubmissionTargetStart(target); + VertexExtra* extra = aligned_vector_at(target->extras, 0); + EyeSpaceData* eye_space = (EyeSpaceData*) eye_space_data->data; + + _glMatrixLoadModelView(); + mat_transform3(vertex->xyz, eye_space->xyz, target->count, sizeof(Vertex), sizeof(EyeSpaceData)); + + _glMatrixLoadNormal(); + mat_transform_normal3(extra->nxyz, eye_space->n, target->count, sizeof(VertexExtra), sizeof(EyeSpaceData)); + + EyeSpaceData* ES = aligned_vector_at(eye_space_data, 0); + _glPerformLighting(vertex, ES, target->count); + } static void divide(SubmissionTarget* target) { diff --git a/GL/framebuffer.c b/GL/framebuffer.c index 7ab7995..5ca2824 100644 --- a/GL/framebuffer.c +++ b/GL/framebuffer.c @@ -1,5 +1,6 @@ #include #include "private.h" +#include "config.h" #include "../include/glkos.h" #include "../include/glext.h" @@ -94,62 +95,62 @@ void APIENTRY glFramebufferTexture2DEXT(GLenum target, GLenum attachment, GLenum ACTIVE_FRAMEBUFFER->texture_id = texture; } -static inline GLuint A1555(GLuint v) { +static INLINE_DEBUG GLuint A1555(GLuint v) { const GLuint MASK = (1 << 15); return (v & MASK) >> 15; } -static inline GLuint R1555(GLuint v) { +static INLINE_DEBUG GLuint R1555(GLuint v) { const GLuint MASK = (31 << 10); return (v & MASK) >> 10; } -static inline GLuint G1555(GLuint v) { +static INLINE_DEBUG GLuint G1555(GLuint v) { const GLuint MASK = (31 << 5); return (v & MASK) >> 5; } -static inline GLuint B1555(GLuint v) { +static INLINE_DEBUG GLuint B1555(GLuint v) { const GLuint MASK = (31 << 0); return (v & MASK) >> 0; } -static inline GLuint A4444(GLuint v) { +static INLINE_DEBUG GLuint A4444(GLuint v) { const GLuint MASK = (0xF << 12); return (v & MASK) >> 12; } -static inline GLuint R4444(GLuint v) { +static INLINE_DEBUG GLuint R4444(GLuint v) { const GLuint MASK = (0xF << 8); return (v & MASK) >> 8; } -static inline GLuint G4444(GLuint v) { +static INLINE_DEBUG GLuint G4444(GLuint v) { const GLuint MASK = (0xF << 4); return (v & 
MASK) >> 4; } -static inline GLuint B4444(GLuint v) { +static INLINE_DEBUG GLuint B4444(GLuint v) { const GLuint MASK = (0xF << 0); return (v & MASK) >> 0; } -static inline GLuint R565(GLuint v) { +static INLINE_DEBUG GLuint R565(GLuint v) { const GLuint MASK = (31 << 11); return (v & MASK) >> 11; } -static inline GLuint G565(GLuint v) { +static INLINE_DEBUG GLuint G565(GLuint v) { const GLuint MASK = (63 << 5); return (v & MASK) >> 5; } -static inline GLuint B565(GLuint v) { +static INLINE_DEBUG GLuint B565(GLuint v) { const GLuint MASK = (31 << 0); return (v & MASK) >> 0; } -GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) { +static NO_INSTRUMENT GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) { GLushort* s1 = ((GLushort*) src); GLushort* s2 = ((GLushort*) src) + 1; GLushort* s3 = ((GLushort*) src) + srcWidth; diff --git a/GL/gldc.c b/GL/gldc.c index 22b8c2e..9e4f2df 100644 --- a/GL/gldc.c +++ b/GL/gldc.c @@ -19,3 +19,7 @@ #include "matrix.c" #include "state.c" #include "texture.c" + +#include "../containers/stack.c" +#include "../containers/aligned_vector.c" +#include "../containers/named_array.c" \ No newline at end of file diff --git a/GL/lighting.c b/GL/lighting.c index c756df8..10038a5 100644 --- a/GL/lighting.c +++ b/GL/lighting.c @@ -281,98 +281,143 @@ static inline float FPOW(float b, float p) { return FEXP(FLOG(b) * p); } -void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) __attribute__((optimize("fast-math"))); -void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) { - LightSource* l = &LIGHTS[light]; - - struct vec3f L = { - l->position[0], - l->position[1], - l->position[2] - }; - - if(!l->is_directional) { - L.x -= pos[0]; - L.y -= pos[1]; - L.z -= pos[2]; - } 
- - struct vec3f N = { - normal[0], - normal[1], - normal[2] - }; - - struct vec3f V = { - pos[0], - pos[1], - pos[2] - }; - - GLfloat d; - vec3f_length(L.x, L.y, L.z, d); - - GLfloat oneOverL = 1.0f / d; - - L.x *= oneOverL; - L.y *= oneOverL; - L.z *= oneOverL; - - vec3f_normalize(V.x, V.y, V.z); - - GLfloat NdotL, VdotN; - vec3f_dot(N.x, N.y, N.z, L.x, L.y, L.z, NdotL); - vec3f_dot(V.x, V.y, V.z, N.x, N.y, N.z, VdotN); - - GLfloat VdotR = VdotN - NdotL; - GLfloat specularPower = FPOW(VdotR > 0 ? VdotR : 0, MATERIAL.exponent); - - GLboolean colorMaterial = _glIsColorMaterialEnabled(); - - GLfloat mD [] = { - (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.diffuse[0], - (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.diffuse[1], - (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.diffuse[2], - (colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.diffuse[3] - }; - - GLfloat mA [] = { - (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.ambient[0], - (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.ambient[1], - (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.ambient[2], - (colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.ambient[3] - }; - - GLfloat mS [] = { - (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.specular[0], - (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.specular[1], - (colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.specular[2], - (colorMaterial && isSpecularColorMaterial()) ? 
((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.specular[3] - }; - - colour[0] = l->ambient[0] * mA[0]; - colour[1] = l->ambient[1] * mA[1]; - colour[2] = l->ambient[2] * mA[2]; - colour[3] = mD[3]; - - if(NdotL >= 0) { - colour[0] += (l->diffuse[0] * mD[0] * NdotL + l->specular[0] * mS[0] * specularPower); - colour[1] += (l->diffuse[1] * mD[1] * NdotL + l->specular[1] * mS[1] * specularPower); - colour[2] += (l->diffuse[2] * mD[2] * NdotL + l->specular[2] * mS[2] * specularPower); - } - - if(!l->is_directional) { - GLfloat att = ( - 1.0f / (l->constant_attenuation + (l->linear_attenuation * d) + (l->quadratic_attenuation * d * d)) - ); - - colour[0] *= att; - colour[1] *= att; - colour[2] *= att; - } - - if(colour[0] > 1.0f) colour[0] = 1.0f; - if(colour[1] > 1.0f) colour[1] = 1.0f; - if(colour[2] > 1.0f) colour[2] = 1.0f; - if(colour[3] > 1.0f) colour[3] = 1.0f; +#define LIGHT_COMPONENT(C) { \ + const GLfloat* acm = &MA[C]; \ + const GLfloat* dcm = &MD[C]; \ + const GLfloat* scm = &MS[C]; \ + const GLfloat* scli = &light->specular[C]; \ + const GLfloat* dcli = &light->diffuse[C]; \ + const GLfloat* acli = &light->ambient[C]; \ + const GLfloat* srm = &MATERIAL.exponent; \ + const GLfloat fi = (LdotN == 0) ? 0 : 1; \ + GLfloat component = (*acm * *acli); \ + component += (LdotN * *dcm * *dcli); \ + component += (FPOW((fi * NdotH), *srm) * *scm * *scli); \ + component *= att; \ + component *= spot; \ + final[C] += component; \ } + +static inline float vec3_dot_limited( + const float* x1, const float* y1, const float* z1, + const float* x2, const float* y2, const float* z2) { + + float ret; + vec3f_dot(*x1, *y1, *z1, *x2, *y2, *z2, ret); + return (ret < 0) ? 
0 : ret; +} + +void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count) { + int8_t i; + int32_t j; + + const LightSource* light = NULL; + + const GLboolean colorMaterial = _glIsColorMaterialEnabled(); + const GLboolean isDiffuseCM = isDiffuseColorMaterial(); + const GLboolean isAmbientCM = isAmbientColorMaterial(); + const GLboolean isSpecularCM = isSpecularColorMaterial(); + + static GLfloat CM[4]; + + /* So the DC has 16 floating point registers, that means + * we need to limit the number of floats as much as possible + * to give the compiler a good enough chance to do the right + * thing */ + + Vertex* vertex = vertices; + const EyeSpaceData* data = es; + + static const float ONE_OVER_255 = 1.0f / 255.0f; + + for(j = 0; j < count; ++j, ++vertex, ++data) { + /* When GL_COLOR_MATERIAL is on, we need to pull out + * the passed in diffuse and use it */ + const GLfloat* MD = MATERIAL.diffuse; + const GLfloat* MA = MATERIAL.ambient; + const GLfloat* MS = MATERIAL.specular; + + if(colorMaterial) { + CM[0] = ((GLfloat) vertex->bgra[R8IDX]) * ONE_OVER_255; + CM[1] = ((GLfloat) vertex->bgra[G8IDX]) * ONE_OVER_255; + CM[2] = ((GLfloat) vertex->bgra[B8IDX]) * ONE_OVER_255; + CM[3] = ((GLfloat) vertex->bgra[A8IDX]) * ONE_OVER_255; + + MD = (isDiffuseCM) ? CM : MATERIAL.diffuse; + MA = (isAmbientCM) ? CM : MATERIAL.ambient; + MS = (isSpecularCM) ? 
CM : MATERIAL.specular; + } + + float final[4]; + + /* Initial, non-light related values */ + final[0] = (SCENE_AMBIENT[0] * MA[0]) + MATERIAL.emissive[0]; + final[1] = (SCENE_AMBIENT[1] * MA[1]) + MATERIAL.emissive[1]; + final[2] = (SCENE_AMBIENT[2] * MA[2]) + MATERIAL.emissive[2]; + final[3] = MD[3]; + + float Vx, Vy, Vz; + Vx = -data->xyz[0]; + Vy = -data->xyz[1]; + Vz = -data->xyz[2]; + vec3f_normalize(Vx, Vy, Vz); + + for(i = 0; i < MAX_LIGHTS; ++i) { + if(!_glIsLightEnabled(i)) continue; + + /* Calc light specific parameters. A light with position[3] == 0 is directional: position[] already holds the direction towards the light, so the vertex position is only subtracted for positional lights. */ + light = &LIGHTS[i]; + + float Lx, Ly, Lz, D; + float Hx, Hy, Hz; + const float* Nx = &data->n[0]; + const float* Ny = &data->n[1]; + const float* Nz = &data->n[2]; + + Lx = (light->position[3] == 0.0f) ? light->position[0] : light->position[0] - data->xyz[0]; + Ly = (light->position[3] == 0.0f) ? light->position[1] : light->position[1] - data->xyz[1]; + Lz = (light->position[3] == 0.0f) ? light->position[2] : light->position[2] - data->xyz[2]; + vec3f_length(Lx, Ly, Lz, D); + + { + /* Normalize L - scoping ensures Llen is temporary */ + const float Llen = 1.0f / D; + Lx *= Llen; + Ly *= Llen; + Lz *= Llen; + } + + Hx = (Lx + Vx); + Hy = (Ly + Vy); + Hz = (Lz + Vz); + vec3f_normalize(Hx, Hy, Hz); + + const float LdotN = vec3_dot_limited( + &Lx, &Ly, &Lz, + Nx, Ny, Nz + ); + + const float NdotH = vec3_dot_limited( + Nx, Ny, Nz, + &Hx, &Hy, &Hz + ); + + const float att = ( + light->position[3] == 0.0f) ?
1.0f : + 1.0f / (light->constant_attenuation + (light->linear_attenuation * D) + (light->quadratic_attenuation * D * D) + ); + + const float spot = 1.0f; + + LIGHT_COMPONENT(0); + LIGHT_COMPONENT(1); + LIGHT_COMPONENT(2); + } + + vertex->bgra[R8IDX] = (GLubyte)(fminf(final[0] * 255.0f, 255.0f)); + vertex->bgra[G8IDX] = (GLubyte)(fminf(final[1] * 255.0f, 255.0f)); + vertex->bgra[B8IDX] = (GLubyte)(fminf(final[2] * 255.0f, 255.0f)); + vertex->bgra[A8IDX] = (GLubyte)(fminf(final[3] * 255.0f, 255.0f)); + } +} + diff --git a/GL/matrix.c b/GL/matrix.c index 75b1e56..5d3a629 100644 --- a/GL/matrix.c +++ b/GL/matrix.c @@ -476,84 +476,57 @@ void APIENTRY glDepthRange(GLclampf n, GLclampf f) { DEPTH_RANGE_MULTIPLIER_H = (n + f) / 2.0f; } +#include "sh4_math.h" + /* Vector Cross Product - Used by glhLookAtf2 */ -static inline void vec3f_cross(const GLfloat* v1, const GLfloat* v2, GLfloat* result) { - result[0] = v1[1] * v2[2] - v1[2] * v2[1]; - result[1] = v1[2] * v2[0] - v1[0] * v2[2]; - result[2] = v1[0] * v2[1] - v1[1] * v2[0]; +static inline void vec3f_cross(GLfloat* v1, GLfloat* v2, GLfloat* result) { + result[0] = (v1[1] * v2[2]) - (v1[2] * v2[1]); + result[1] = (v1[2] * v2[0]) - (v1[0] * v2[2]); + result[2] = (v1[0] * v2[1]) - (v1[1] * v2[0]); } -/* glhLookAtf2 adapted from http://www.opengl.org/wiki/GluLookAt_code */ -void glhLookAtf2(const GLfloat* eyePosition3D, - const GLfloat* center3D, - const GLfloat* upVector3D) { - /* Look-At Matrix */ - static Matrix4x4 MatrixLookAt __attribute__((aligned(32))) = { - 1.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 1.0f - }; +static inline void vec3f_normalize_sh4(float *v){ + float length, ilength; - GLfloat forward[3]; - GLfloat side[3]; - GLfloat up[3]; - - vec3f_sub_normalize(center3D[0], center3D[1], center3D[2], - eyePosition3D[0], eyePosition3D[1], eyePosition3D[2], - forward[0], forward[1], forward[2]); - - //Side = forward x up - vec3f_cross(forward, upVector3D, side); - 
vec3f_normalize(side[0], side[1], side[2]); - - //Recompute up as: up = side x forward - vec3f_cross(side, forward, up); - - MatrixLookAt[M0] = side[0]; - MatrixLookAt[M4] = side[1]; - MatrixLookAt[M8] = side[2]; - MatrixLookAt[M12] = 0; - - MatrixLookAt[M1] = up[0]; - MatrixLookAt[M5] = up[1]; - MatrixLookAt[M9] = up[2]; - MatrixLookAt[M13] = 0; - - MatrixLookAt[M2] = -forward[0]; - MatrixLookAt[M6] = -forward[1]; - MatrixLookAt[M10] = -forward[2]; - MatrixLookAt[M14] = 0; - - MatrixLookAt[M3] = MatrixLookAt[11] = MatrixLookAt[15] = 0; - MatrixLookAt[M15] = 1; - - static Matrix4x4 trn __attribute__((aligned(32))) = { - 1.0f, 0.0f, 0.0f, 0.0f, - 0.0f, 1.0f, 0.0f, 0.0f, - 0.0f, 0.0f, 1.0f, 0.0f, - 0.0f, 0.0f, 0.0f, 1.0f - }; - - trn[M12] = -eyePosition3D[0]; - trn[M13] = -eyePosition3D[1]; - trn[M14] = -eyePosition3D[2]; - - // Does not modify internal Modelview matrix - upload_matrix(&MatrixLookAt); - multiply_matrix(&trn); - multiply_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF))); - download_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF))); + ilength = MATH_fsrra(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]); + length = MATH_Invert(ilength); + if (length) + { + v[0] *= ilength; + v[1] *= ilength; + v[2] *= ilength; + } } void gluLookAt(GLfloat eyex, GLfloat eyey, GLfloat eyez, GLfloat centerx, GLfloat centery, GLfloat centerz, GLfloat upx, GLfloat upy, GLfloat upz) { - GLfloat eye [] = { eyex, eyey, eyez }; - GLfloat point [] = { centerx, centery, centerz }; - GLfloat up [] = { upx, upy, upz }; - glhLookAtf2(eye, point, up); + GLfloat m [16]; + GLfloat f [3]; + GLfloat u [3]; + GLfloat s [3]; + + f[0] = centerx - eyex; + f[1] = centery - eyey; + f[2] = centerz - eyez; + + u[0] = upx; + u[1] = upy; + u[2] = upz; + + vec3f_normalize_sh4(f); + vec3f_cross(f, u, s); + vec3f_normalize_sh4(s); + vec3f_cross(s, f, u); + + m[0] = s[0]; m[4] = s[1]; m[8] = s[2]; m[12] = 0.0f; + m[1] = u[0]; m[5] = u[1]; m[9] = u[2]; m[13] = 0.0f; + m[2] = -f[0]; m[6] = -f[1]; m[10] = 
-f[2]; m[14] = 0.0f; + m[3] = 0.0f; m[7] = 0.0f; m[11] = 0.0f; m[15] = 1.0f; + + glMultMatrixf(m); + glTranslatef(-eyex, -eyey, -eyez); } void _glApplyRenderMatrix() { diff --git a/GL/perfctr.c b/GL/perfctr.c new file mode 100644 index 0000000..4b6b3bc --- /dev/null +++ b/GL/perfctr.c @@ -0,0 +1,247 @@ +// ---- perfctr.c - SH7091 Performance Counter Module Code ---- +// +// This file is part of the DreamHAL project, a hardware abstraction library +// primarily intended for use on the SH7091 found in hardware such as the SEGA +// Dreamcast game console. +// +// The performance counter module is hereby released into the public domain in +// the hope that it may prove useful. Now go profile some code and hit 60 fps! :) +// +// --Moopthehedgehog + +// See perfctr.h for more of my notes and documentation on these counters. +#include "perfctr.h" +#include "cygprofile.h" +#if CYG_FUNC_TRACE_ENABLED + +static unsigned char pmcr_enabled = 0; + +// +// Initialize performance counters. It's just a clear -> enable. +// It's good practice to clear a counter before starting it for the first time. +// +// Also: Disabling and re-enabling the counters doesn't reset them; the clearing +// needs to happen while a counter is disabled to reset it. +// +// You can disable and re-enable with a different mode without explicitly +// clearing and have it keep going, continuing from where it left off. +// + +__attribute__((no_instrument_function)) void PMCR_Init(int which, unsigned short mode, unsigned char count_type) // Will do nothing if perfcounter is already running! 
+{ + // Don't do anything if being asked to enable an already-enabled counter + if( (which == 1) && ((!pmcr_enabled) || (pmcr_enabled == 2)) ) + { + // counter 1 + PMCR_Enable(1, mode, count_type, PMCR_RESET_COUNTER); + } + else if( (which == 2) && ((!pmcr_enabled) || (pmcr_enabled == 1)) ) + { + // counter 2 + PMCR_Enable(2, mode, count_type, PMCR_RESET_COUNTER); + } + else if( (which == 3) && (!pmcr_enabled) ) + { + // Both + PMCR_Enable(3, mode, count_type, PMCR_RESET_COUNTER); + } +} + +// Enable "undocumented" performance counters (well, they were undocumented at one point. They're documented now!) +__attribute__((no_instrument_function)) void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_count) // Will do nothing if perfcounter is already running! +{ + // Don't do anything if count_type or reset_count are invalid + if((count_type | reset_count) > 1) + { + return; + } + + // Build config from parameters + unsigned short pmcr_ctrl = PMCR_RUN_COUNTER | (reset_count << PMCR_RESET_COUNTER_SHIFT) | (count_type << PMCR_CLOCK_TYPE_SHIFT) | mode; + + // Don't do anything if being asked to enable an already-enabled counter + if( (which == 1) && ((!pmcr_enabled) || (pmcr_enabled == 2)) ) + { + // counter 1 + *((volatile unsigned short*)PMCR1_CTRL_REG) = pmcr_ctrl; + + pmcr_enabled += 1; + } + else if( (which == 2) && ((!pmcr_enabled) || (pmcr_enabled == 1)) ) + { + // counter 2 + *((volatile unsigned short*)PMCR2_CTRL_REG) = pmcr_ctrl; + + pmcr_enabled += 2; + } + else if( (which == 3) && (!pmcr_enabled) ) + { + // Both + *((volatile unsigned short*)PMCR1_CTRL_REG) = pmcr_ctrl; + *((volatile unsigned short*)PMCR2_CTRL_REG) = pmcr_ctrl; + + pmcr_enabled = 3; + } +} + +// For reference: +// #define PMCTR1H_REG 0xFF100004 +// #define PMCTR1L_REG 0xFF100008 + +// #define PMCTR2H_REG 0xFF10000C +// #define PMCTR2L_REG 0xFF100010 + +static const unsigned int pmcr1_regh = PMCTR1H_REG; +static const unsigned int pmcr1_regl = 
PMCTR1L_REG; + +static const unsigned int pmcr2_regh = PMCTR2H_REG; +static const unsigned int pmcr2_regl = PMCTR2L_REG; + +// Sorry, can only read one counter at a time! +// out_array should be an array consisting of 2x unsigned ints. +__attribute__((no_instrument_function)) void PMCR_Read(int which, volatile unsigned int *out_array) +{ + // if pmcr is not enabled, this function will just return 0 + + // little endian (big endian would need to flip [0] and [1]) + + // Note: These reads really do need to be done in assembly: unfortunately it + // appears that using C causes GCC to insert a branch right smack in between + // the high and low reads of perf counter 2 (with a nop, so it's literally + // delaying the reads by several cycles!), which is totally insane. Doing it + // the assembly way ensures that nothing ridiculous like that happens. It's + // also portable between versions of GCC that do put the nonsensical branch in. + // + // One thing that would be nice is if SH4 had the movi20s instruction to make + // absolute addresses in 3 cycles, but only the SH2A has that... :( + if( (which == 1) && (pmcr_enabled & 0x1) ) + { + // counter 1 +// out_array[1] = *((volatile unsigned int*)PMCTR1H_REG) & 0xffff; +// out_array[0] = *((volatile unsigned int*)PMCTR1L_REG); + asm volatile("mov.l %[reg1h],r1\n\t" // load counter address (high) + "mov.l %[reg1l],r2\n\t" // load counter address (low) + "mov.l @r1,r1\n\t" // read counter (high) + "mov.l @r2,r2\n\t" // read counter (low) + "extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff + "mov.l r1,%[outh]\n\t" // get data to memory + "mov.l r2,%[outl]\n\t" // get data to memory + : [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0]) + : [reg1h] "m" (pmcr1_regh), [reg1l] "m" (pmcr1_regl) // SH4 can't mov an immediate longword into a register... 
+ : "r1", "r2" + ); + } + else if( (which == 2) && (pmcr_enabled & 0x2) ) + { + // counter 2 +// out_array[1] = *((volatile unsigned int*)PMCTR2H_REG) & 0xffff; +// out_array[0] = *((volatile unsigned int*)PMCTR2L_REG); + asm volatile("mov.l %[reg2h],r1\n\t" // load counter address (high) + "mov.l %[reg2l],r2\n\t" // load counter address (low) + "mov.l @r1,r1\n\t" // read counter (high) + "mov.l @r2,r2\n\t" // read counter (low) + "extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff + "mov.l r1,%[outh]\n\t" // get data to memory + "mov.l r2,%[outl]\n\t" // get data to memory + : [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0]) + : [reg2h] "m" (pmcr2_regh), [reg2l] "m" (pmcr2_regl) // SH4 can't mov an immediate longword into a register... + : "r1", "r2" + ); + } + else if(!pmcr_enabled) + { + out_array[1] = 0; + out_array[0] = 0; + } + else // Invalid + { + out_array[1] = 0xffff; + out_array[0] = 0xffffffff; + } +} + +// Reset counter to 0 and start it again +// NOTE: It does not appear to be possible to clear a counter while it is running. +__attribute__((no_instrument_function)) void PMCR_Restart(int which, unsigned short mode, unsigned char count_type) +{ + if( (which == 1) && (pmcr_enabled & 0x1) ) + { + // counter 1 + PMCR_Stop(1); + PMCR_Enable(1, mode, count_type, PMCR_RESET_COUNTER); + } + else if( (which == 2) && (pmcr_enabled & 0x2) ) + { + // counter 2 + PMCR_Stop(2); + PMCR_Enable(2, mode, count_type, PMCR_RESET_COUNTER); + } + else if( (which == 3) && (pmcr_enabled == 3) ) + { + // Both + PMCR_Stop(3); + PMCR_Enable(3, mode, count_type, PMCR_RESET_COUNTER); + } +} + +// Clearing only works when the counter is disabled. Otherwise, stopping the +// counter via setting the 0x2000 bit holds the data in the data registers, +// whereas disabling without setting that bit reads back as all 0 (but doesn't +// clear the counters for next start). 
This function just stops a running +// counter and does nothing if the counter is already stopped or disabled, as +// clearing is handled by PMCR_Enable(). +__attribute__((no_instrument_function)) void PMCR_Stop(int which) +{ + if( (which == 1) && (pmcr_enabled & 0x1) ) + { + // counter 1 + *((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_STOP_COUNTER; + + pmcr_enabled &= 0x2; + } + else if( (which == 2) && (pmcr_enabled & 0x2) ) + { + // counter 2 + *((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_STOP_COUNTER; + + pmcr_enabled &= 0x1; + } + else if( (which == 3) && (pmcr_enabled == 3) ) + { + // Both + *((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_STOP_COUNTER; + *((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_STOP_COUNTER; + + pmcr_enabled = 0; + } +} + +// Note that disabling does NOT clear the counter. +// It may appear that way because reading a disabled counter returns 0, but re- +// enabling without first clearing will simply continue where it left off. +__attribute__((no_instrument_function)) void PMCR_Disable(int which) +{ + if(which == 1) + { + // counter 1 + *((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_DISABLE_COUNTER; + + pmcr_enabled &= 0x2; + } + else if(which == 2) + { + // counter 2 + *((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_DISABLE_COUNTER; + + pmcr_enabled &= 0x1; + } + else if(which == 3) + { + // Both + *((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_DISABLE_COUNTER; + *((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_DISABLE_COUNTER; + + pmcr_enabled = 0; + } +} +#endif diff --git a/GL/perfctr.h b/GL/perfctr.h new file mode 100644 index 0000000..3cd7467 --- /dev/null +++ b/GL/perfctr.h @@ -0,0 +1,316 @@ +// ---- perfctr.h - SH7091 Performance Counter Module Header ---- +// +// This file is part of the DreamHAL project, a hardware abstraction library +// primarily intended for use on the SH7091 found in hardware such as the SEGA +// Dreamcast game console. 
+// +// The performance counter module is hereby released into the public domain in +// the hope that it may prove useful. Now go profile some code and hit 60 fps! :) +// +// --Moopthehedgehog +// + +#ifndef __PERFCTR_H__ +#define __PERFCTR_H__ + +// +// -- General SH4 Performance Counter Notes -- +// +// There are 2 performance counters that can measure elapsed time. They are each +// 48-bit counters. They are part of the so-called "ASE" subsystem, which you can +// read about in chapter 13 of the "SuperH™ (SH) 32-bit RISC series SH-4, ST40 +// system architecture, volume 1: system": +// https://www.st.com/content/ccc/resource/technical/document/user_manual/36/75/05/ac/e8/7e/42/2d/CD00147163.pdf/files/CD00147163.pdf/jcr:content/translations/en.CD00147163.pdf +// +// They can count cycles, so that's 199.5MHz (not 200MHz!!) a.k.a. roughly 5 ns +// increments. At 5 ns increments, a 48-bit cycle counter can run continuously +// for 16.33 days. It's actually 16 days, 7 hours, 55 minutes, and 2 seconds, +// depending on how close the bus clock is to 99.75MHz. There is also a second +// mode that counts cycles according to a ratio between the CPU frequency and +// the system bus clock, and it increments the counter by 12 every bus cycle. +// This second mode is detailed in the description for PMCR_CLOCK_TYPE in this +// file, and it is recommended for use when the CPU frequency is not a runtime +// constant. +// +// Side note: The counters don't have an overflow interrupt or overflow bit. +// (I did actually run one to 48-bit overflow in elapsed time mode using the +// ratio method to check this. They don't appear to sign-extend the upper 16 +// bits in elapsed time mode, either.) +// +// The two counters are functionally identical. I would recommend using the +// PMCR_Init() function to start one (or both) up the first time. 
+// +// -- Configuration Address Info -- +// +// Addresses for these counters can be easily seen here, in lxdream's source code: +// https://github.com/lutris/lxdream/blob/master/src/sh4/sh4mmio.h +// +// They are also on display in the Linux kernel, but at the time of writing appear +// to be set incorrectly (the clock mode at bit 0x100 is never set or cleared, +// for example, so they're at the mercy of whatever the hardware defaults are): +// http://git.lpclinux.com/cgit/linux-2.6.28.2-lpc313x/plain/arch/sh/oprofile/op_model_sh7750.c +// https://github.com/torvalds/linux/blob/master/arch/sh/kernel/cpu/sh4/perf_event.c +// ...It also appears as though they may not be handling bus ratio mode correctly, +// which appears to be the default mode on the Dreamcast in all my tests. +// +// You can also find these addresses by ripping a copy of Virtua Fighter 3 that +// you own for Dreamcast and looking at the raw byte code (or a raw disassembly) +// of its main program binary. It would appear as though they were timing a loop +// with the low half of perf counter 1 in elapsed time mode. Definitely seems +// like a good thing to do when targeting 60fps! Shenmue Disc 4 also uses the +// same configuration, but what's being timed is not as clear. +// +// Another place you can actually find both control addresses 0xFF00008x and all +// data addresses 0xFF10000x is in binaries of ancient, freely available versions +// of CodeScape. Literally all you need to do is open an SH7750-related DLL in a +// hex editor and do a search to find the control register addresses, and the +// data addresses are equally plain to see in any relevant performance profiling +// firmware. There's no effort or decryption required to find them whatsoever; +// all you need is an old trial version and a hex editor. 
+// +// However, something even better than all of that is if you search for "SH4 +// 0xFF000084" (without quotes) online you'll find an old forum where some logs +// were posted of the terminal/command prompt output from some STMicro JTAG tool, +// which not only has the address registers but also clearly characterizes their +// size as 16-bit: +// https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2 +// +// -- Event Mode Info -- +// +// Specific information on each counter mode can be found in the document titled +// "SuperH™ Family E10A-USB Emulator: Additional Document for User’s Manual: +// Supplementary Information on Using the SH7750R Renesas Microcomputer Development Environment System" +// which is available on Renesas's website, in the "Documents" section of the +// E10A-USB product page: +// https://www.renesas.com/us/en/products/software-tools/tools/emulator/e10a-usb.html +// At the time of writing (12/2019), the E10A-USB adapter is still available +// for purchase, and it is priced around $1200 (USD). +// +// Appendix C of the "ST40 Micro Toolset Manual" also has these modes documented: +// https://www.st.com/content/ccc/resource/technical/document/user_manual/c5/98/11/89/50/68/41/66/CD17379953.pdf/files/CD17379953.pdf/jcr:content/translations/en.CD17379953.pdf +// +// See here for the hexadecimal values corresponding to each mode (pg. 370): +// http://www.macmadigan.com/BusaECU/Renesas%20documents/Hitachi_codescape_CS40_light_userguides.pdf +// You can also find the same "Counter Description Table" in user's guide PDFs +// bundled in ancient demo versions of CodeScape 3 from 2000 (e.g. +// CSDemo_272.exe), which can still be found in the Internet Archive. +// http://web.archive.org/web/*/http://codescape.com/dl/CSDemo/* +// +// See here for a support document on Lauterbach's SH2, SH3, and SH4 debugger, +// which contains units for each mode (e.g. 
which measure time and which just +// count): https://www.lauterbach.com/frames.html?home.html (It's in Downloads +// -> Trace32 Help System -> it's the file called "SH2, SH3 and SH4 Debugger" +// with the filename debugger_sh4.pdf). +// + +// +// --- Performance Counter Registers --- +// + +// These registers are 16 bits only and configure the performance counters +#define PMCR1_CTRL_REG 0xFF000084 +#define PMCR2_CTRL_REG 0xFF000088 + +// These registers are 32-bits each and hold the high and low parts of each counter +#define PMCTR1H_REG 0xFF100004 +#define PMCTR1L_REG 0xFF100008 + +#define PMCTR2H_REG 0xFF10000C +#define PMCTR2L_REG 0xFF100010 + +// +// --- Performance Counter Configuration Flags --- +// + +// These bits' functions are currently unknown, but they may simply be reserved. +// It's possible that there's a [maybe expired?] patent that details the +// configuration registers, though I haven't been able to find one. Places to +// check would be Google Patents and the Japanese Patent Office--maybe someone +// else can find something? +// +// Some notes: +// Writing 1 to all of these bits reads back as 0, so it looks like they aren't +// config bits. It's possible they are write-only like the stop bit, though, +// or that they're just reserved-write-0-only. It appears that they are always +// written with zeros in software that uses them, so that's confirmed safe to do. +// +// Also, after running counter 1 to overflow, it appears there's no overflow bit +// (maybe the designers thought 48-bits would be so much to count to that they +// didn't bother implementing one?). The upper 16-bits of the counter high +// register are also not sign-extension bits. They may be a hidden config area, +// but probably not because big endian mode would swap the byte order. 
+#define PMCR_UNKNOWN_BIT_0040 0x0040 +#define PMCR_UNKNOWN_BIT_0080 0x0080 +#define PMCR_UNKNOWN_BIT_0200 0x0200 +#define PMCR_UNKNOWN_BIT_0400 0x0400 +#define PMCR_UNKNOWN_BIT_0800 0x0800 +#define PMCR_UNKNOWN_BIT_1000 0x1000 + +// PMCR_MODE_CLEAR_INVERTED just clears the event mode if it's inverted with +// '~', and event modes are listed below. +#define PMCR_MODE_CLEAR_INVERTED 0x003f + +// PMCR_CLOCK_TYPE sets the counters to count clock cycles or CPU/bus ratio mode +// cycles (where T = C x B / 24 and T is time, C is count, and B is time +// of one bus cycle). Note: B = 1/99753008 or so, but it may vary, as mine is +// actually 1/99749010-ish; the target frequency is probably meant to be 99.75MHz. +// +// See the ST40 or Renesas SH7750R documents described in the above "Event Mode +// Info" section for more details about that formula. +// +// Set PMCR_CLOCK_TYPE to 0 for CPU cycle counting, where 1 count = 1 cycle, or +// set it to 1 to use the above formula. Renesas documentation recommends using +// the ratio version (set the bit to 1) when user programs alter CPU clock +// frequencies. This header has some definitions later on to help with this. +#define PMCR_CLOCK_TYPE 0x0100 +#define PMCR_CLOCK_TYPE_SHIFT 8 + +// PMCR_STOP_COUNTER is write-only, as it always reads back as 0. It does what +// the name suggests: when this bit is written to, the counter stops. However, +// if written to while the counter is disabled or stopped, the counter's high +// and low registers are reset to 0. +// +// Using PMCR_STOP_COUNTER to stop the counter has the effect of holding the +// data in the data registers while stopped, unlike PMCR_DISABLE_COUNTER, and +// this bit needs to be written to again (e.g. on next start) in order to +// actually clear the counter data for another run. If not explicitly cleared, +// the counter will continue from where it left off before being stopped. 
+#define PMCR_STOP_COUNTER 0x2000 +#define PMCR_RESET_COUNTER_SHIFT 13 + +// Bits 0xC000 both need to be set to 1 for the counters to actually begin +// counting. I have seen that the Linux kernel actually separates them out into +// two separate labelled bits (PMEN and PMST) for some reason, however they do +// not appear to do anything separately. Perhaps this is a two-bit mode where +// 1-1 is run, 1-0 and 0-1 are ???, and 0-0 is off. +#define PMCR_RUN_COUNTER 0xC000 +#define PMCR_RUN_SHIFT 14 +// Interestingly, the output here writes 0x6000 to the counter config registers, +// which would be the "PMST" bit and the "RESET" bit: +// https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2 + +// To disable a counter, just write 0 to its config register. This will not +// reset the counter to 0, as that requires an explicit clear via setting the +// PMCR_STOP_COUNTER bit. What's odd is that a disabled counter's data +// registers read back as all 0, but re-enabling it without a clear will +// continue from the last value before disabling. +#define PMCR_DISABLE_COUNTER 0x0000 + +// These definitions merely separate out the two PMCR_RUN_COUNTER bits, and +// they are included here for documentation purposes. + +// PMST may mean PMCR START. It's consistently used to enable the counter. +// I'm just calling it PMST here for lack of a better name, since this is what +// the Linux kernel and lxdream call it. It could also have something to do with +// a mode specific to STMicroelectronics. +#define PMCR_PMST_BIT 0x4000 +#define PMCR_PMST_SHIFT 14 + +// Likewise PMEN may mean PMCR ENABLE +#define PMCR_PMEN_BIT 0x8000 +#define PMCR_PMEN_SHIFT 15 + +// +// --- Performance Counter Event Code Definitions --- +// +// Interestingly enough, it so happens that the SEGA Dreamcast's CPU seems to +// contain the same performance counter functionality as SH4 debug adapters for +// the SH7750R. Awesome! 
+// + +// MODE DEFINITION VALUE MEASUREMENT TYPE & NOTES +#define PMCR_INIT_NO_MODE 0x00 // None; Just here to be complete +#define PMCR_OPERAND_READ_ACCESS_MODE 0x01 // Quantity; With cache +#define PMCR_OPERAND_WRITE_ACCESS_MODE 0x02 // Quantity; With cache +#define PMCR_UTLB_MISS_MODE 0x03 // Quantity +#define PMCR_OPERAND_CACHE_READ_MISS_MODE 0x04 // Quantity +#define PMCR_OPERAND_CACHE_WRITE_MISS_MODE 0x05 // Quantity +#define PMCR_INSTRUCTION_FETCH_MODE 0x06 // Quantity; With cache +#define PMCR_INSTRUCTION_TLB_MISS_MODE 0x07 // Quantity +#define PMCR_INSTRUCTION_CACHE_MISS_MODE 0x08 // Quantity +#define PMCR_ALL_OPERAND_ACCESS_MODE 0x09 // Quantity +#define PMCR_ALL_INSTRUCTION_FETCH_MODE 0x0a // Quantity +#define PMCR_ON_CHIP_RAM_OPERAND_ACCESS_MODE 0x0b // Quantity +// No 0x0c +#define PMCR_ON_CHIP_IO_ACCESS_MODE 0x0d // Quantity +#define PMCR_OPERAND_ACCESS_MODE 0x0e // Quantity; With cache, counts both reads and writes +#define PMCR_OPERAND_CACHE_MISS_MODE 0x0f // Quantity +#define PMCR_BRANCH_ISSUED_MODE 0x10 // Quantity; Not the same as branch taken! 
+#define PMCR_BRANCH_TAKEN_MODE 0x11 // Quantity +#define PMCR_SUBROUTINE_ISSUED_MODE 0x12 // Quantity; Issued a BSR, BSRF, JSR, JSR/N +#define PMCR_INSTRUCTION_ISSUED_MODE 0x13 // Quantity +#define PMCR_PARALLEL_INSTRUCTION_ISSUED_MODE 0x14 // Quantity +#define PMCR_FPU_INSTRUCTION_ISSUED_MODE 0x15 // Quantity +#define PMCR_INTERRUPT_COUNTER_MODE 0x16 // Quantity +#define PMCR_NMI_COUNTER_MODE 0x17 // Quantity +#define PMCR_TRAPA_INSTRUCTION_COUNTER_MODE 0x18 // Quantity +#define PMCR_UBC_A_MATCH_MODE 0x19 // Quantity +#define PMCR_UBC_B_MATCH_MODE 0x1a // Quantity +// No 0x1b-0x20 +#define PMCR_INSTRUCTION_CACHE_FILL_MODE 0x21 // Cycles +#define PMCR_OPERAND_CACHE_FILL_MODE 0x22 // Cycles +#define PMCR_ELAPSED_TIME_MODE 0x23 // Cycles; For 200MHz CPU: 5ns per count in 1 cycle = 1 count mode, or around 417.715ps per count (increments by 12) in CPU/bus ratio mode +#define PMCR_PIPELINE_FREEZE_BY_ICACHE_MISS_MODE 0x24 // Cycles +#define PMCR_PIPELINE_FREEZE_BY_DCACHE_MISS_MODE 0x25 // Cycles +// No 0x26 +#define PMCR_PIPELINE_FREEZE_BY_BRANCH_MODE 0x27 // Cycles +#define PMCR_PIPELINE_FREEZE_BY_CPU_REGISTER_MODE 0x28 // Cycles +#define PMCR_PIPELINE_FREEZE_BY_FPU_MODE 0x29 // Cycles + +// +// --- Performance Counter Support Definitions --- +// + +// This definition can be passed as the init/enable/restart functions' +// count_type parameter to use the 1 cycle = 1 count mode. This is how the +// counter can be made to run for 16.3 days. 
+#define PMCR_COUNT_CPU_CYCLES 0 +// Likewise this uses the CPU/bus ratio method +#define PMCR_COUNT_RATIO_CYCLES 1 + +// These definitions are for the enable function and specify whether to reset +// a counter to 0 or to continue from where it left off +#define PMCR_CONTINUE_COUNTER 0 +#define PMCR_RESET_COUNTER 1 + +// +// --- Performance Counter Miscellaneous Definitions --- +// +// For convenience; assume stock bus clock of 99.75MHz +// (Bus clock is the external CPU clock, not the peripheral bus clock) +// + +#define PMCR_SH4_CPU_FREQUENCY 199500000 +#define PMCR_CPU_CYCLES_MAX_SECONDS 1410902 +#define PMCR_SH4_BUS_FREQUENCY 99750000 +#define PMCR_SH4_BUS_FREQUENCY_SCALED 2394000000 // 99.75MHz x 24 +#define PMCR_BUS_RATIO_MAX_SECONDS 117575 + +// +// --- Performance Counter Functions --- +// +// See perfctr.c file for more details about each function and some more usage notes. +// +// Note: PMCR_Init() and PMCR_Enable() will do nothing if the perf counter is already running! +// + +// Clear counter and enable +void PMCR_Init(int which, unsigned short mode, unsigned char count_type); + +// Enable one or both of these "undocumented" performance counters. 
+void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_counter); + +// Stop, clear, and re-enable a currently running counter with new mode (or same mode) +void PMCR_Restart(int which, unsigned short mode, unsigned char count_type); + +// Read a counter +// out_array is specifically uint32 out_array[2] -- 48-bit value needs a 64-bit storage unit +void PMCR_Read(int which, volatile unsigned int *out_array); + +// Stop counter(s) (without clearing) +void PMCR_Stop(int which); + +// Disable counter(s) (without clearing) +void PMCR_Disable(int which); + +#endif /* __PERFCTR_H__ */ diff --git a/GL/private.h b/GL/private.h index 6987723..344de6b 100644 --- a/GL/private.h +++ b/GL/private.h @@ -6,6 +6,7 @@ #include "../include/gl.h" #include "../containers/aligned_vector.h" #include "../containers/named_array.h" +#include "cygprofile.h" extern void* memcpy4 (void *dest, const void *src, size_t count); @@ -249,6 +250,11 @@ typedef struct { GLint size; } AttribPointer; +typedef struct { + float xyz[3]; + float n[3]; +} EyeSpaceData; + GLboolean _glCheckValidEnum(GLint param, GLint* values, const char* func); GLuint* _glGetEnabledAttributes(); @@ -280,7 +286,7 @@ GLuint _glGetMipmapLevelCount(TextureObject* obj); GLboolean _glIsLightingEnabled(); GLboolean _glIsLightEnabled(GLubyte light); GLboolean _glIsColorMaterialEnabled(); -void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour); +void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count); unsigned char _glIsClippingEnabled(); void _glEnableClipping(unsigned char v); diff --git a/GL/profiler.c b/GL/profiler.c index 968bc9f..c74d84a 100644 --- a/GL/profiler.c +++ b/GL/profiler.c @@ -6,6 +6,8 @@ #include "profiler.h" #include "../containers/aligned_vector.h" +#if PROFILING_COMPILED + #define MAX_PATH 256 typedef struct { @@ -141,3 +143,4 @@ void profiler_print_stats() { fprintf(stderr, "%-60s%-20f%-20f%" 
PRIu64 "\n", result->name, (double)avg, (double)ms, result->total_calls); } } +#endif diff --git a/GL/profiler.h b/GL/profiler.h index acaf8bf..acf07ed 100644 --- a/GL/profiler.h +++ b/GL/profiler.h @@ -7,12 +7,26 @@ typedef struct { uint64_t start_time_in_us; } Profiler; +#define PROFILING_COMPILED 0 +#if PROFILING_COMPILED Profiler* profiler_push(const char* name); -void profiler_checkpoint(const char* name); -void profiler_pop(); +void _profiler_checkpoint(const char* name); +void _profiler_pop(); -void profiler_print_stats(); +void _profiler_print_stats(); -void profiler_enable(); -void profiler_disable(); +void _profiler_enable(); +void _profiler_disable(); + +#else +#define profiler_push(name); +#define profiler_checkpoint(name); +#define profiler_pop(); + +#define profiler_print_stats(); + +#define profiler_enable(); +#define profiler_disable(); + +#endif diff --git a/GL/sh4_math.h b/GL/sh4_math.h new file mode 100644 index 0000000..ad1cd7e --- /dev/null +++ b/GL/sh4_math.h @@ -0,0 +1,1448 @@ +// ---- sh4_math.h - SH7091 Math Module ---- +// +// Version 1.0.3 +// +// This file is part of the DreamHAL project, a hardware abstraction library +// primarily intended for use on the SH7091 found in hardware such as the SEGA +// Dreamcast game console. +// +// This math module is hereby released into the public domain in the hope that it +// may prove useful. Now go hit 60 fps! :) +// +// --Moopthehedgehog +// + +// Notes: +// - GCC 4 users have a different return type for the fsca functions due to an +// internal compiler error regarding complex numbers; no issue under GCC 9.2.0 +// - Using -m4 instead of -m4-single-only completely breaks the matrix and +// vector operations +// - Function inlining must be enabled and not blocked by compiler options such +// as -ffunction-sections, as blocking inlining will result in significant +// performance degradation for the vector and matrix functions employing a +// RETURN_VECTOR_STRUCT return type. 
I have added compiler hints and attributes +// "static inline __attribute__((always_inline))" to mitigate this, so in most +// cases the functions should be inlined regardless. If in doubt, check the +// compiler asm output! +// + +#ifndef __SH4_MATH_H_ +#define __SH4_MATH_H_ + +#define GNUC_FSCA_ERROR_VERSION 4 + +// +// Fast SH4 hardware math functions +// +// +// High-accuracy users beware, the fsrra functions have an error of +/- 2^-21 +// per http://www.shared-ptr.com/sh_insns.html +// + +//============================================================================== +// Definitions +//============================================================================== +// +// Structures, useful definitions, and reference comments +// + +// Front matrix format: +// +// FV0 FV4 FV8 FV12 +// --- --- --- ---- +// [ fr0 fr4 fr8 fr12 ] +// [ fr1 fr5 fr9 fr13 ] +// [ fr2 fr6 fr10 fr14 ] +// [ fr3 fr7 fr11 fr15 ] +// +// Back matrix, XMTRX, is similar, although it has no FVn vector groups: +// +// [ xf0 xf4 xf8 xf12 ] +// [ xf1 xf5 xf9 xf13 ] +// [ xf2 xf6 xf10 xf14 ] +// [ xf3 xf7 xf11 xf15 ] +// + +typedef struct __attribute__((aligned(32))) { + float fr0; + float fr1; + float fr2; + float fr3; + float fr4; + float fr5; + float fr6; + float fr7; + float fr8; + float fr9; + float fr10; + float fr11; + float fr12; + float fr13; + float fr14; + float fr15; +} ALL_FLOATS_STRUCT; + +// Return structs should be defined locally so that GCC optimizes them into +// register usage instead of memory accesses. 
+typedef struct { + float z1; + float z2; + float z3; + float z4; +} RETURN_VECTOR_STRUCT; + +#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION +typedef struct { + float sine; + float cosine; +} RETURN_FSCA_STRUCT; +#endif + +// Identity Matrix +// +// FV0 FV4 FV8 FV12 +// --- --- --- ---- +// [ 1 0 0 0 ] +// [ 0 1 0 0 ] +// [ 0 0 1 0 ] +// [ 0 0 0 1 ] +// + +static const ALL_FLOATS_STRUCT identity_matrix = {1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f}; + +//============================================================================== +// Basic math functions +//============================================================================== +// +// The following functions are available. +// Please see their definitions for other usage info, otherwise they may not +// work for you. +// +/* + // |x| + float MATH_fabs(float x) + + // sqrt(x) + float MATH_fsqrt(float x) + + // a*b+c + float MATH_fmac(float a, float b, float c) + + // a*b-c + float MATH_fmac_Dec(float a, float b, float c) +*/ + +// |x| +// This one works on ARM and x86, too! +static inline __attribute__((always_inline)) float MATH_fabs(float x) +{ + asm volatile ("FABS %[floatx]\n" + : [floatx] "+f" (x) // outputs, "+" means r/w + : // no inputs + : // no clobbers + ); + + return x; +} + +// sqrt(x) +// This one works on ARM and x86, too! +// NOTE: There is a much faster version (MATH_Fast_Sqrt()) in the fsrra section of +// this file. Chances are you probably want that one. 
+static inline __attribute__((always_inline)) float MATH_fsqrt(float x) +{ + asm volatile ("fsqrt %[floatx]\n" + : [floatx] "+f" (x) // outputs, "+" means r/w + : // no inputs + : // no clobbers + ); + + return x; +} + +// a*b+c +static inline __attribute__((always_inline)) float MATH_fmac(float a, float b, float c) +{ + asm volatile ("fmac fr0, %[floatb], %[floatc]\n" + : [floatc] "+f" (c) // outputs, "+" means r/w + : "w" (a), [floatb] "f" (b) // inputs + : // no clobbers + ); + + return c; +} + +// a*b-c +static inline __attribute__((always_inline)) float MATH_fmac_Dec(float a, float b, float c) +{ + asm volatile ("fneg %[floatc]\n\t" + "fmac fr0, %[floatb], %[floatc]\n" + : [floatc] "+&f" (c) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed + : "w" (a), [floatb] "f" (b) // inputs + : // no clobbers + ); + + return c; +} + +//============================================================================== +// Fun with fsrra, which does 1/sqrt(x) in one cycle +//============================================================================== +// +// Error is +/- 2^-21 per http://www.shared-ptr.com/sh_insns.html +// +// The following functions are available. +// Please see their definitions for other usage info, otherwise they may not +// work for you. +// +/* + // 1/x + float MATH_Invert(float x) + + // 1/sqrt(x) + float MATH_fsrra(float x) + + // A faster divide than the 'fdiv' instruction + float MATH_Fast_Divide(float numerator, float denominator) + + // A faster square root than the 'fsqrt' instruction + float MATH_Fast_Sqrt(float x) +*/ + +// 1/x +// (1.0f / sqrt(x) ) ^ 2, so this is only valid for x > 0 (a square is never negative) +// This is about 3x faster than fdiv! 
+static inline __attribute__((always_inline)) float MATH_Invert(float x) +{ + asm volatile ("fsrra %[one_div_sqrt]\n\t" + "fmul %[one_div_sqrt], %[one_div_sqrt]\n" + : [one_div_sqrt] "+f" (x) // outputs, "+" means r/w + : // no inputs + : // no clobbers + ); + + return x; +} + +// 1/sqrt(x) +static inline __attribute__((always_inline)) float MATH_fsrra(float x) +{ + asm volatile ("fsrra %[one_div_sqrt]\n" + : [one_div_sqrt] "+f" (x) // outputs, "+" means r/w + : // no inputs + : // no clobbers + ); + + return x; +} + +// It's faster to do this than to do an fdiv. This takes half as many cycles! +// (~7 vs ~14) Only fdiv can do doubles, however. The denominator must be > 0, since it is computed as fsrra(denominator) squared. +// Of course, not having to divide at all is generally the best way to go. :P +static inline __attribute__((always_inline)) float MATH_Fast_Divide(float numerator, float denominator) +{ + asm volatile ("fsrra %[div_denom]\n\t" + "fmul %[div_denom], %[div_denom]\n\t" + "fmul %[div_numer], %[div_denom]\n" + : [div_denom] "+&f" (denominator) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed + : [div_numer] "f" (numerator) // inputs + : // no clobbers + ); + + return denominator; +} + +// fast sqrt(x) +// Crazy thing: invert(fsrra(x)) is actually about 3x faster than fsqrt. +// Error is +/- 2^-21 per http://www.shared-ptr.com/sh_insns.html +static inline __attribute__((always_inline)) float MATH_Fast_Sqrt(float x) +{ + return MATH_Invert(MATH_fsrra(x)); +} + +//============================================================================== +// Fun with fsca, which does simultaneous sine and cosine in 3 cycles +//============================================================================== +// +// NOTE: GCC 4.7 has a bug that prevents it from working with fsca and complex +// numbers in m4-single-only mode, so GCC 4 users will get a RETURN_FSCA_STRUCT +// instead of a _Complex float. This may be much slower in some instances. 
+// +// VERY IMPORTANT USAGE INFORMATION (sine and cosine functions): +// +// Due to the nature in which the fsca instruction behaves, you MUST do the +// following in your code to get sine and cosine from these functions: +// +// _Complex float sine_cosine = [Call the fsca function here] +// float sine_value = __real__ sine_cosine; +// float cosine_value = __imag__ sine_cosine; +// Your output is now in sine_value and cosine_value. +// +// This is necessary because fsca outputs both sine and cosine simultaneously +// and uses a double register to do so. The fsca functions do not actually +// return a double--they return two floats--and using a complex float here is +// just a bit of hacking the C language to make GCC do something that's legal in +// assembly according to the SH4 calling convention (i.e. multiple return values +// stored in floating point registers FR0-FR3). This is better than using a +// struct of floats for optimization purposes--this will operate at peak +// performance even at -O0, whereas a struct will not be fast at low +// optimization levels due to memory accesses. +// +// Technically you may be able to use the complex return values as a complex +// number if you wanted to, but that's probably not what you're after and they'd +// be flipped anyways (in mathematical convention, sine is the imaginary part). +// + +// Notes: +// - From http://www.shared-ptr.com/sh_insns.html: +// The input angle is specified as a signed fraction in twos complement. The result of sin and cos is a single-precision floating-point number. +// 0x7FFFFFFF to 0x00000001: 360×2^15−360/2^16 to 360/2^16 degrees +// 0x00000000: 0 degree +// 0xFFFFFFFF to 0x80000000: −360/2^16 to −360×2^15 degrees +// - fsca format is 2^16 is 360 degrees, so a value of 1 is actually +// 1/182.044444444 of a degree +// - fsca does a %360 automatically for values over 360 degrees + +// The following functions are available. 
+// Please see their definitions for other usage info, otherwise they may not +// work for you. +// +/* + // For integer input in native fsca units (fastest) + _Complex float MATH_fsca_Int(unsigned int input_int) + + // For integer input in degrees + _Complex float MATH_fsca_Int_Deg(unsigned int input_int) + + // For integer input in radians + _Complex float MATH_fsca_Int_Rad(unsigned int input_int) + + // For float input in native fsca units + _Complex float MATH_fsca_Float(float input_float) + + // For float input in degrees + _Complex float MATH_fsca_Float_Deg(float input_float) + + // For float input in radians + _Complex float MATH_fsca_Float_Rad(float input_float) +*/ + +//------------------------------------------------------------------------------ +#if __GNUC__ <= GNUC_FSCA_ERROR_VERSION +//------------------------------------------------------------------------------ +// +// This set of fsca functions is specifically for old versions of GCC. +// See later for functions for newer versions of GCC. 
+// + +// +// Integer input (faster) +// + +// For int input, input_int is in native fsca units (fastest) +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int(unsigned int input_int) +{ + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For int input, input_int is in degrees +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int_Deg(unsigned int input_int) +{ + // normalize whole number input degrees to fsca format + input_int = ((1527099483ULL * input_int) >> 23); + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For int input, input_int is in radians +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Int_Rad(unsigned int input_int) +{ + // normalize whole number input rads to fsca format + input_int = ((2734261102ULL * input_int) >> 18); + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// +// 
Float input (slower) +// + +// For float input, input_float is in native fsca units +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float(float input_float) +{ + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For float input, input_float is in degrees +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float_Deg(float input_float) +{ + input_float *= 182.044444444f; + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +// For float input, input_float is in radians +static inline __attribute__((always_inline)) RETURN_FSCA_STRUCT MATH_fsca_Float_Rad(float input_float) +{ + input_float *= 10430.3783505f; + + register float __sine __asm__("fr0"); + register float __cosine __asm__("fr1"); + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. 
takes 3 cycles + "fsca FPUL, DR0\n" // 3 cycle simultaneous sine/cosine + : "=w" (__sine), "=f" (__cosine) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + RETURN_FSCA_STRUCT output = {__sine, __cosine}; + return output; +} + +//------------------------------------------------------------------------------ +#else +//------------------------------------------------------------------------------ +// +// This set of fsca functions is specifically for newer versions of GCC. They +// work fine under GCC 9.2.0. +// + +// +// Integer input (faster) +// + +// For int input, input_int is in native fsca units (fastest) +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int(unsigned int input_int) +{ + _Complex float output; + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For int input, input_int is in degrees +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int_Deg(unsigned int input_int) +{ + // normalize whole number input degrees to fsca format + input_int = ((1527099483ULL * input_int) >> 23); + + _Complex float output; + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For int input, input_int is in radians +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Int_Rad(unsigned int input_int) +{ + // normalize whole number input rads to fsca format + input_int = ((2734261102ULL * input_int) >> 18); + + _Complex float output; + + asm volatile ("lds %[input_number], FPUL\n\t" // load int from register (1 cycle) + "fsca 
FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "r" (input_int) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// +// Float input (slower) +// + +// For float input, input_float is in native fsca units +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float(float input_float) +{ + _Complex float output; + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For float input, input_float is in degrees +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float_Deg(float input_float) +{ + input_float *= 182.044444444f; + + _Complex float output; + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. takes 3 cycles + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + return output; +} + +// For float input, input_float is in radians +static inline __attribute__((always_inline)) _Complex float MATH_fsca_Float_Rad(float input_float) +{ + input_float *= 10430.3783505f; + + _Complex float output; + + asm volatile ("ftrc %[input_number], FPUL\n\t" // convert float to int. 
takes 3 cycles + "fsca FPUL, %[out]\n" // 3 cycle simultaneous sine/cosine + : [out] "=d" (output) // outputs + : [input_number] "f" (input_float) // inputs + : "fpul" // clobbers + ); + + return output; +} + +//------------------------------------------------------------------------------ +#endif +//------------------------------------------------------------------------------ + +//============================================================================== +// Hardware vector and matrix operations +//============================================================================== +// +// These functions each have very specific usage instructions. Please be sure to +// read them before use or else they won't seem to work right! +// +// The following functions are available. +// Please see their definitions for important usage info, otherwise they may not +// work for you. +// +/* + // Inner/dot product (4x1 vec . 4x1 vec = scalar) + float MATH_fipr(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4) + + // Cross product with bonus multiply (vec X vec = orthogonal vec, with an extra a*b=c) + RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b) + + // Cross product (vec X vec = orthogonal vec) + RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3) + + // Outer product (vec (X) vec = 4x4 matrix) + void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4) + + // Matrix transform (4x4 matrix * 4x1 vec = 4x1 vec) + RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4) + + // 4x4 Matrix transpose (XMTRX^T) + void MATH_Matrix_Transpose(void) + + // 4x4 Matrix product (XMTRX and one from memory) + void MATH_Matrix_Product(ALL_FLOATS_STRUCT * front_matrix) + + // 4x4 Matrix product (two from memory) + void MATH_Load_Matrix_Product(ALL_FLOATS_STRUCT * 
matrix1, ALL_FLOATS_STRUCT * matrix2) + + // Load 4x4 XMTRX from memory + void MATH_Load_XMTRX(ALL_FLOATS_STRUCT * back_matrix) + + // Store 4x4 XMTRX to memory + ALL_FLOATS_STRUCT * MATH_Store_XMTRX(ALL_FLOATS_STRUCT * destination) + + // Get 4x1 column vector from XMTRX + RETURN_VECTOR_STRUCT MATH_Get_XMTRX_Vector(unsigned int which) + + // Get 2x2 matrix from XMTRX quadrant + RETURN_VECTOR_STRUCT MATH_Get_XMTRX_2x2(unsigned int which) +*/ + +// Inner/dot product: vec . vec = scalar +// _ _ +// | y1 | +// [ x1 x2 x3 x4 ] . | y2 | = scalar +// | y3 | +// |_ y4 _| +// +// SH4 calling convention states we get 8 float arguments. Perfect! +static inline __attribute__((always_inline)) float MATH_fipr(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4) +{ + // FR4-FR11 are the regs that are passed in, aka vectors FV4 and FV8. + // Just need to make sure GCC doesn't modify anything, and these register vars do that job. + + // Temporary variables are necessary per GCC to avoid clobbering: + // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables + + float tx1 = x1; + float tx2 = x2; + float tx3 = x3; + float tx4 = x4; + + float ty1 = y1; + float ty2 = y2; + float ty3 = y3; + float ty4 = y4; + + // vector FV4 + register float __x1 __asm__("fr4") = tx1; + register float __x2 __asm__("fr5") = tx2; + register float __x3 __asm__("fr6") = tx3; + register float __x4 __asm__("fr7") = tx4; + + // vector FV8 + register float __y1 __asm__("fr8") = ty1; + register float __y2 __asm__("fr9") = ty2; + register float __y3 __asm__("fr10") = ty3; + register float __y4 __asm__("fr11") = ty4; + + // take care of all the floats in one fell swoop + asm volatile ("fipr FV4, FV8\n" + : "+f" (__y4) // output (gets written to FR11) + : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__x4), "f" (__y1), "f" (__y2), "f" (__y3) // inputs + : // clobbers + ); + + return __y4; +} + +// Cross product: vec X vec = orthogonal vec +// _ _ _ _ _ _ +// 
//  |  x1  |     |  y1  |     |  z1  |
//  |  x2  |  X  |  y2  |  =  |  z2  |
//  |_ x3 _|     |_ y3 _|     |_ z3 _|
//
// With bonus multiply:
//
//      a     *     b      =      c
//
// IMPORTANT USAGE INFORMATION (cross product):
//
// Return vector struct maps as below to the above diagram:
//
//  typedef struct {
//   float z1;
//   float z2;
//   float z3;
//   float z4; // c is stored in z4, and c = a*b if using 'with mult' version (else c = 0)
//  } RETURN_VECTOR_STRUCT;
//
// For people familiar with the unit vector notation, z1 == 'i', z2 == 'j',
// and z3 == 'k'.
//
// The cross product matrix will also be stored in XMTRX after this, so calling
// MATH_Matrix_Transform() on a vector after using this function will do a cross
// product with the same x1-x3 values and a multiply with the same 'a' value
// as used in this function. In this situation, 'a' will be multiplied with
// the x4 parameter of MATH_Matrix_Transform(). a = 0 if not using the 'with mult'
// version of the cross product function.
//
// For reference, XMTRX will look like this:
//
//  [  0  -x3  x2  0 ]
//  [  x3  0  -x1  0 ]
//  [ -x2  x1  0   0 ]
//  [  0   0   0   a ] (<-- a = 0 if not using 'with mult')
//
// Similarly to how the sine and cosine functions use fsca and return 2 floats,
// the cross product functions actually return 4 floats. The first 3 are the
// cross product output, and the 4th is a*b. The SH4 only multiplies 4x4
// matrices with 4x1 vectors, which is why the output is like that--but it means
// we also get a bonus float multiplication while we do our cross product!
//

// Please do not call this function directly (notice the weird syntax); call
// MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead.
//
// The parameter order is deliberately scrambled so that each value is passed in
// the register named in the register variables below.
//
// NOTE(review): the asm template overwrites registers that are bound only as
// inputs (e.g. "fneg FR9", "fneg FR4", moves into FR5/FR6/FR7/FR10-FR15) while
// the clobber list is empty, and it ends with "frchg" without switching back,
// so the FPU register banks stay swapped on return. GCC's extended-asm rules
// expect inputs to be left unmodified -- confirm this is safe at every call site.
static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product_with_Mult(float x3, float a, float y3, float b, float x2, float x1, float y1, float y2)
{
  // FR4-FR11 are the regs that are passed in, in that order.
  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.

  // Temporary variables are necessary per GCC to avoid clobbering:
  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables

  float tx1 = x1;
  float tx2 = x2;
  float tx3 = x3;
  float ta = a;

  float ty1 = y1;
  float ty2 = y2;
  float ty3 = y3;
  float tb = b;

  register float __x1 __asm__("fr9") = tx1; // need to negate (need to move to fr6, then negate fr9)
  register float __x2 __asm__("fr8") = tx2; // in place for matrix (need to move to fr2 then negate fr2)
  register float __x3 __asm__("fr4") = tx3; // need to negate (move to fr1 first, then negate fr4)
  register float __a __asm__("fr5") = ta;

  register float __y1 __asm__("fr10") = ty1;
  register float __y2 __asm__("fr11") = ty2;
  register float __y3 __asm__("fr6") = ty3;
  register float __b __asm__("fr7") = tb;

  register float __z1 __asm__("fr0") = 0.0f; // z1
  register float __z2 __asm__("fr1") = 0.0f; // z2 (not moving x3 here yet since a double 0 is needed)
  register float __z3 __asm__("fr2") = tx2; // z3 (this handles putting x2 in fr2)
  register float __c __asm__("fr3") = 0.0f; // c

  // This actually does a matrix transform to do the cross product.
  // It's this:
  //                     _    _       _            _
  //  [  0  -x3  x2  0 ] |  y1  |     | -x3y2 + x2y3 |
  //  [  x3  0  -x1  0 ] |  y2  |  =  |  x3y1 - x1y3 |
  //  [ -x2  x1  0   0 ] |  y3  |     | -x2y1 + x1y2 |
  //  [  0   0   0   a ] |_ b  _|     |_      c     _|
  //

  asm volatile (
    // set up back bank's FV0
    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)

    // Save FR12-FR15, which are supposed to be preserved across function calls.
    // This stops them from getting clobbered and saves 4 stack pushes (memory accesses).
    "fmov DR12, XD12\n\t"
    "fmov DR14, XD14\n\t"

    "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1)
    "fmov DR6, XD2\n\t" // fmov 'y3' and 'b' from FR6, FR7 into position (XF2, XF3)

    // pair move zeros for some speed in setting up front bank for matrix
    "fmov DR0, DR10\n\t" // clear FR10, FR11
    "fmov DR0, DR12\n\t" // clear FR12, FR13
    "fschg\n\t" // switch back to single moves
    // prepare front bank for XMTRX
    "fmov FR5, FR15\n\t" // fmov 'a' into position
    "fmov FR0, FR14\n\t" // clear out FR14
    "fmov FR0, FR7\n\t" // clear out FR7
    "fmov FR0, FR5\n\t" // clear out FR5

    "fneg FR2\n\t" // set up 'x2'
    "fmov FR9, FR6\n\t" // set up 'x1'
    "fneg FR9\n\t"
    "fmov FR4, FR1\n\t" // set up 'x3'
    "fneg FR4\n\t"
    // flip banks and matrix multiply
    // (after frchg the matrix just built is addressed as XMTRX, and the vector
    // and saved FR12-FR15 that were parked in the XD registers are the front bank)
    "frchg\n\t"
    "ftrv XMTRX, FV0\n"
    : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0)
    : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__a), "f" (__b) // inputs
    : // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved)
  );

  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c};
  return output;
}

// Please do not call this function directly (notice the weird syntax); call
// MATH_Cross_Product() or MATH_Cross_Product_with_Mult() instead.
//
// x1 is passed twice (as 'x1' and 'x1_2') so that it is available in both
// FR6 and FR9; 'zero' must be 0.0f, as it seeds the zero entries of the matrix
// and of the result.
//
// NOTE(review): as in the 'with mult' variant, the template modifies registers
// bound only as inputs (FR4, FR9, FR10, FR11) plus FR12-FR15, lists only "fr7"
// as clobbered, and leaves the register banks swapped via the final "frchg" --
// confirm this is safe at every call site.
static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT xMATH_do_Cross_Product(float x3, float zero, float x1, float y3, float x2, float x1_2, float y1, float y2)
{
  // FR4-FR11 are the regs that are passed in, in that order.
  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.

  // Temporary variables are necessary per GCC to avoid clobbering:
  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables

  float tx1 = x1;
  float tx2 = x2;
  float tx3 = x3;
  float tx1_2 = x1_2;

  float ty1 = y1;
  float ty2 = y2;
  float ty3 = y3;
  float tzero = zero;

  register float __x1 __asm__("fr6") = tx1; // in place
  register float __x2 __asm__("fr8") = tx2; // in place (fmov to fr2, negate fr2)
  register float __x3 __asm__("fr4") = tx3; // need to negate (fmov to fr1, negate fr4)

  register float __zero __asm__("fr5") = tzero; // in place
  register float __x1_2 __asm__("fr9") = tx1_2; // need to negate

  register float __y1 __asm__("fr10") = ty1;
  register float __y2 __asm__("fr11") = ty2;
  // no __y3 needed in this function

  register float __z1 __asm__("fr0") = tzero; // z1
  register float __z2 __asm__("fr1") = tzero; // z2
  register float __z3 __asm__("fr2") = ty3; // z3
  register float __c __asm__("fr3") = tzero; // c

  // This actually does a matrix transform to do the cross product.
  // It's this:
  //                     _    _       _            _
  //  [  0  -x3  x2  0 ] |  y1  |     | -x3y2 + x2y3 |
  //  [  x3  0  -x1  0 ] |  y2  |  =  |  x3y1 - x1y3 |
  //  [ -x2  x1  0   0 ] |  y3  |     | -x2y1 + x1y2 |
  //  [  0   0   0   0 ] |_ 0  _|     |_      0     _|
  //

  asm volatile (
    // zero out FR7. For some reason, if this is done in C after __z3 is set:
    //    register float __y3 __asm__("fr7") = tzero;
    // then GCC will emit a spurious stack push (pushing FR12). So just zero it here.
    "fmov FR5, FR7\n\t"
    // set up back bank's FV0
    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)

    // Save FR12-FR15, which are supposed to be preserved across function calls.
    // This stops them from getting clobbered and saves 4 stack pushes (memory accesses).
    "fmov DR12, XD12\n\t"
    "fmov DR14, XD14\n\t"

    "fmov DR10, XD0\n\t" // fmov 'y1' and 'y2' from FR10, FR11 into position (XF0, XF1)
    "fmov DR2, XD2\n\t" // fmov 'y3' and '0' from FR2, FR3 into position (XF2, XF3)

    // pair move zeros for some speed in setting up front bank for matrix
    "fmov DR0, DR10\n\t" // clear FR10, FR11
    "fmov DR0, DR12\n\t" // clear FR12, FR13
    "fmov DR0, DR14\n\t" // clear FR14, FR15
    "fschg\n\t" // switch back to single moves
    // prepare front bank for XMTRX
    "fneg FR9\n\t" // set up 'x1'
    "fmov FR8, FR2\n\t" // set up 'x2'
    "fneg FR2\n\t"
    "fmov FR4, FR1\n\t" // set up 'x3'
    "fneg FR4\n\t"
    // flip banks and matrix multiply
    "frchg\n\t"
    "ftrv XMTRX, FV0\n"
    : "+&w" (__z1), "+&f" (__z2), "+&f" (__z3), "+&f" (__c) // output (using FV0)
    : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__y1), "f" (__y2), "f" (__zero), "f" (__x1_2) // inputs
    : "fr7" // clobbers (all of the float regs get clobbered, except for FR12-FR15 which were specially preserved)
  );

  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __c};
  return output;
}

//------------------------------------------------------------------------------
// Functions that wrap the xMATH_do_Cross_Product[_with_Mult]() functions to make
// it easier to organize parameters
//------------------------------------------------------------------------------

// Cross product with a bonus float multiply (c = a * b)
// Only reorders the arguments into the layout the worker function expects.
static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product_with_Mult(float x1, float x2, float x3, float y1, float y2, float y3, float a, float b)
{
  return xMATH_do_Cross_Product_with_Mult(x3, a, y3, b, x2, x1, y1, y2);
}

// Plain cross product; does not use the bonus float multiply (c = 0 and a in the cross product matrix will be 0)
// This is a tiny bit faster than 'with_mult' (about 2 cycles faster)
// x1 is intentionally passed twice and 0.0f supplies the worker's 'zero' argument.
static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Cross_Product(float x1, float x2, float x3, float y1, float y2, float y3)
{
  return xMATH_do_Cross_Product(x3, 0.0f, x1, y3, x2, x1, y1, y2);
}

// Outer product: vec (X) vec = matrix
//   _    _
//  |  x1  |
//  |  x2  |  (X)  [ y1 y2 y3 y4 ] = 4x4 matrix
//  |  x3  |
//  |_ x4 _|
//
// This returns the floats in the back bank (XF0-15), which are inaccessible
// outside of using frchg or paired-move fmov. It's meant to set up a matrix for
// use with other matrix functions. GCC also does not touch the XFn bank.
// This will also wipe out anything stored in the float registers, as it uses the
// whole FPU register file (all 32 of the float registers).
//
// NOTE(review): FR4-FR11 are bound as inputs but are overwritten by the
// template (and then swapped out by the final "frchg"); only FR0-FR3 and
// FR12-FR15 appear in the clobber list -- confirm this is safe at every call site.
static inline __attribute__((always_inline)) void MATH_Outer_Product(float x1, float x2, float x3, float x4, float y1, float y2, float y3, float y4)
{
  // FR4-FR11 are the regs that are passed in, in that order.
  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.

  // Temporary variables are necessary per GCC to avoid clobbering:
  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables

  float tx1 = x1;
  float tx2 = x2;
  float tx3 = x3;
  float tx4 = x4;

  float ty1 = y1;
  float ty2 = y2;
  float ty3 = y3;
  float ty4 = y4;

  // vector FV4
  register float __x1 __asm__("fr4") = tx1;
  register float __x2 __asm__("fr5") = tx2;
  register float __x3 __asm__("fr6") = tx3;
  register float __x4 __asm__("fr7") = tx4;

  // vector FV8
  register float __y1 __asm__("fr8") = ty1;
  register float __y2 __asm__("fr9") = ty2;
  register float __y3 __asm__("fr10") = ty3; // in place already
  register float __y4 __asm__("fr11") = ty4;

  // This actually does a 4x4 matrix multiply to do the outer product.
  // It's this:
  //
  //  [ x1 x1 x1 x1 ] [ y1 0  0  0  ]   [ x1y1 x1y2 x1y3 x1y4 ]
  //  [ x2 x2 x2 x2 ] [ 0  y2 0  0  ] = [ x2y1 x2y2 x2y3 x2y4 ]
  //  [ x3 x3 x3 x3 ] [ 0  0  y3 0  ]   [ x3y1 x3y2 x3y3 x3y4 ]
  //  [ x4 x4 x4 x4 ] [ 0  0  0  y4 ]   [ x4y1 x4y2 x4y3 x4y4 ]
  //

  asm volatile (
    // zero out unoccupied front floats to make a double 0 in DR2
    "fldi0 FR2\n\t"
    "fmov FR2, FR3\n\t"
    "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs)
    // fmov 'x1' and 'x2' from FR4, FR5 into position (XF0,4,8,12, XF1,5,9,13)
    "fmov DR4, XD0\n\t"
    "fmov DR4, XD4\n\t"
    "fmov DR4, XD8\n\t"
    "fmov DR4, XD12\n\t"
    // fmov 'x3' and 'x4' from FR6, FR7 into position (XF2,6,10,14, XF3,7,11,15)
    "fmov DR6, XD2\n\t"
    "fmov DR6, XD6\n\t"
    "fmov DR6, XD10\n\t"
    "fmov DR6, XD14\n\t"
    // set up front floats (y1-y4)
    "fmov DR8, DR0\n\t"
    "fmov DR8, DR4\n\t"
    "fmov DR10, DR14\n\t"
    // finish zeroing out front floats
    "fmov DR2, DR6\n\t"
    "fmov DR2, DR8\n\t"
    "fmov DR2, DR12\n\t"
    "fschg\n\t" // switch back to single-move mode
    // the paired copies above left y2, y1, y4 and y3 in the off-diagonal
    // slots FR1, FR4, FR11 and FR14; zero those
    "fmov FR2, FR1\n\t"
    "fmov FR2, FR4\n\t"
    "fmov FR2, FR11\n\t"
    "fmov FR2, FR14\n\t"
    // finally, matrix multiply 4x4
    "ftrv XMTRX, FV0\n\t"
    "ftrv XMTRX, FV4\n\t"
    "ftrv XMTRX, FV8\n\t"
    "ftrv XMTRX, FV12\n\t"
    // Save output in XF regs
    "frchg\n"
    : // no outputs
    : "f" (__x1), "f" (__x2), "f" (__x3), "f" (__x4), "f" (__y1), "f" (__y2), "f" (__y3), "f" (__y4) // inputs
    : "fr0", "fr1", "fr2", "fr3", "fr12", "fr13", "fr14", "fr15" // clobbers, can't avoid it
  );
  // GCC will restore FR12-FR15 from the stack after this, so we really can't keep the output in the front bank.
}

// Matrix transform: matrix * vector = vector
//                    _    _       _    _
//  [ ----------- ]  |  x1  |     |  z1  |
//  [ ---XMTRX--- ]  |  x2  |  =  |  z2  |
//  [ ----------- ]  |  x3  |     |  z3  |
//  [ ----------- ]  |_ x4 _|     |_ z4 _|
//
// IMPORTANT USAGE INFORMATION (matrix transform):
//
// Return vector struct maps 1:1 to the above diagram:
//
//  typedef struct {
//   float z1;
//   float z2;
//   float z3;
//   float z4;
//  } RETURN_VECTOR_STRUCT;
//
// Similarly to how the sine and cosine functions use fsca and return 2 floats,
// the matrix transform function actually returns 4 floats. The SH4 only multiplies
// 4x4 matrices with 4x1 vectors, which is why the output is like that.
//
// Multiply a matrix stored in the back bank (XMTRX) with an input vector
static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Matrix_Transform(float x1, float x2, float x3, float x4)
{
  // The floats comprising FV4 are the regs that are passed in.
  // Just need to make sure GCC doesn't modify anything, and these register vars do that job.

  // Temporary variables are necessary per GCC to avoid clobbering:
  // https://gcc.gnu.org/onlinedocs/gcc/Local-Register-Variables.html#Local-Register-Variables

  float tx1 = x1;
  float tx2 = x2;
  float tx3 = x3;
  float tx4 = x4;

  // output vector FV0 (seeded with the input vector; ftrv transforms it in place)
  register float __z1 __asm__("fr0") = tx1;
  register float __z2 __asm__("fr1") = tx2;
  register float __z3 __asm__("fr2") = tx3;
  register float __z4 __asm__("fr3") = tx4;

  asm volatile ("ftrv XMTRX, FV0\n\t"
    // have to do this to obey SH4 calling convention--output returned in FV0
    : "+w" (__z1), "+f" (__z2), "+f" (__z3), "+f" (__z4) // outputs, "+" means r/w
    : // no inputs
    : // no clobbers
  );

  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __z4};
  return output;
}

// Matrix Transpose
//
// This does a matrix transpose on the matrix in XMTRX, which swaps rows with
// columns as follows (math notation is [XMTRX]^T):
//
//  [ a b c d ] T   [ a e i m ]
//  [ e f g h ]  =  [ b f j n ]
//  [ i j k l ]     [ c g k o ]
//  [ m n o p ]     [ d h l p ]
//
// PLEASE NOTE: It is faster to avoid the need for a transpose altogether by
// structuring matrices and vectors accordingly.
+static inline __attribute__((always_inline)) void MATH_Matrix_Transpose(void) +{ + asm volatile ("frchg\n\t" // fmov for singles only works on front bank + // FR0, FR5, FR10, and FR15 are already in place + // swap FR1 and FR4 + "flds FR1, FPUL\n\t" + "fmov FR4, FR1\n\t" + "fsts FPUL, FR4\n\t" + // swap FR2 and FR8 + "flds FR2, FPUL\n\t" + "fmov FR8, FR2\n\t" + "fsts FPUL, FR8\n\t" + // swap FR3 and FR12 + "flds FR3, FPUL\n\t" + "fmov FR12, FR3\n\t" + "fsts FPUL, FR12\n\t" + // swap FR6 and FR9 + "flds FR6, FPUL\n\t" + "fmov FR9, FR6\n\t" + "fsts FPUL, FR9\n\t" + // swap FR7 and FR13 + "flds FR7, FPUL\n\t" + "fmov FR13, FR7\n\t" + "fsts FPUL, FR13\n\t" + // swap FR11 and FR14 + "flds FR11, FPUL\n\t" + "fmov FR14, FR11\n\t" + "fsts FPUL, FR14\n\t" + // restore XMTRX to back bank + "frchg\n" + : // no outputs + : // no inputs + : "fpul" // clobbers + ); +} + +// Matrix product: matrix * matrix = matrix +// +// These use the whole dang floating point unit. +// +// [ ----------- ] [ ----------- ] [ ----------- ] +// [ ---Back---- ] [ ---Front--- ] = [ ---XMTRX--- ] +// [ ---Matrix-- ] [ ---Matrix-- ] [ ----------- ] +// [ --(XMTRX)-- ] [ ----------- ] [ ----------- ] +// +// Multiply a matrix stored in the back bank with a matrix loaded from memory +// Output is stored in the back bank (XMTRX) +static inline __attribute__((always_inline)) void MATH_Matrix_Product(ALL_FLOATS_STRUCT * front_matrix) +{ + asm volatile ("pref @%[fmtrx]\n\t" // Prefetching should help a bit + // gotta wait for 6 clocks (30ns) memory access time for pref to work + "mov #32, r1\n\t" + "add %[fmtrx], r1\n\t" // store offset by 32 in r1 + "pref @r1\n\t" // Get a head start prefetching the second half of the 64-byte data + // NOPs are in the MT group, so they are executed in parallel... + // all these nops should equal 2 cycles in this context... 
+ "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "fschg\n\t" // switch fmov to paired moves + "fmov.d @%[fmtrx]+, DR0\n\t" + "fmov.d @%[fmtrx]+, DR2\n\t" + "fmov.d @%[fmtrx]+, DR4\n\t" + "fmov.d @%[fmtrx]+, DR6\n\t" + "fmov.d @%[fmtrx]+, DR8\n\t" + "fmov.d @%[fmtrx]+, DR10\n\t" + "fmov.d @%[fmtrx]+, DR12\n\t" + "fmov.d @%[fmtrx], DR14\n\t" + "fschg\n\t" // switch back to single moves + // matrix multiply 4x4 + "ftrv XMTRX, FV0\n\t" + "ftrv XMTRX, FV4\n\t" + "ftrv XMTRX, FV8\n\t" + "ftrv XMTRX, FV12\n\t" + // Save output in XF regs + "frchg\n" + : [fmtrx] "+r" ((unsigned int)front_matrix) // outputs, "+" means r/w + : // no inputs + : "r1", "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) + ); +} + +// Load two 4x4 matrices and multiply them, storing the output into the back bank (XMTRX) +// +// MATH_Load_Matrix_Product() is slightly faster than doing this: +// MATH_Load_XMTRX(matrix1) +// MATH_Matrix_Product(matrix2) +// as it saves having to do 2 extraneous 'fschg' instructions. 
+// +static inline __attribute__((always_inline)) void MATH_Load_Matrix_Product(ALL_FLOATS_STRUCT * matrix1, ALL_FLOATS_STRUCT * matrix2) +{ + asm volatile ("pref @%[bmtrx]\n\t" // Prefetching should help a bit + // gotta wait for 6 clocks (30ns) memory access time for pref to work + "mov #32, r0\n\t" + "pref @%[fmtrx]\n\t" // prefetch fmtrx now while we wait + "mov r0, r1\n\t" // This is parallel-issue + "add %[bmtrx], r0\n\t" // store offset by 32 in r0 + "pref @r0\n\t" // Get a head start prefetching the second half of the 64-byte data + "add %[fmtrx], r1\n\t" // store offset by 32 in r1 + "pref @r1\n\t" // likewise for other matrix + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) + // back matrix + "fmov.d @%[bmtrx]+, XD0\n\t" + "fmov.d @%[bmtrx]+, XD2\n\t" + "fmov.d @%[bmtrx]+, XD4\n\t" + "fmov.d @%[bmtrx]+, XD6\n\t" + "fmov.d @%[bmtrx]+, XD8\n\t" + "fmov.d @%[bmtrx]+, XD10\n\t" + "fmov.d @%[bmtrx]+, XD12\n\t" + "fmov.d @%[bmtrx], XD14\n\t" + // front matrix + "fmov.d @%[fmtrx]+, DR0\n\t" + "fmov.d @%[fmtrx]+, DR2\n\t" + "fmov.d @%[fmtrx]+, DR4\n\t" + "fmov.d @%[fmtrx]+, DR6\n\t" + "fmov.d @%[fmtrx]+, DR8\n\t" + "fmov.d @%[fmtrx]+, DR10\n\t" + "fmov.d @%[fmtrx]+, DR12\n\t" + "fmov.d @%[fmtrx], DR14\n\t" + "fschg\n\t" // switch back to single moves + // matrix multiply 4x4 + "ftrv XMTRX, FV0\n\t" + "ftrv XMTRX, FV4\n\t" + "ftrv XMTRX, FV8\n\t" + "ftrv XMTRX, FV12\n\t" + // Save output in XF regs + "frchg\n" + : [bmtrx] "+&r" ((unsigned int)matrix1), [fmtrx] "+r" ((unsigned int)matrix2) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed + : // no inputs + : "r0", "r1", "fr0", "fr1", "fr2", "fr3", "fr4", "fr5", "fr6", "fr7", "fr8", "fr9", "fr10", "fr11", "fr12", "fr13", "fr14", "fr15" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) + ); +} + +//------------------------------------------------------------------------------ +// Matrix load and store 
operations +//------------------------------------------------------------------------------ + +// Load a matrix from memory into the back bank (XMTRX) +static inline __attribute__((always_inline)) void MATH_Load_XMTRX(ALL_FLOATS_STRUCT * back_matrix) +{ + asm volatile ("pref @%[bmtrx]\n\t" // Prefetching should help a bit + // gotta wait for 6 clocks (30ns) memory access time for pref to work + "mov #32, r1\n\t" + "add %[bmtrx], r1\n\t" // store offset by 32 in r1 + "pref @r1\n\t" // Get a head start prefetching the second half of the 64-byte data + // NOPs are in the MT group, so they are executed in parallel... + // all these nops should equal 2 cycles in this context... + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) + "fmov.d @%[bmtrx]+, XD0\n\t" + "fmov.d @%[bmtrx]+, XD2\n\t" + "fmov.d @%[bmtrx]+, XD4\n\t" + "fmov.d @%[bmtrx]+, XD6\n\t" + "fmov.d @%[bmtrx]+, XD8\n\t" + "fmov.d @%[bmtrx]+, XD10\n\t" + "fmov.d @%[bmtrx]+, XD12\n\t" + "fmov.d @%[bmtrx], XD14\n\t" + "fschg\n" // switch back to single moves + : [bmtrx] "+r" ((unsigned int)back_matrix) // outputs, "+" means r/w + : // no inputs + : "r1" // clobbers (GCC doesn't know about back bank, so writing to it isn't clobbered) + ); +} + +// Store XMTRX to memory +static inline __attribute__((always_inline)) ALL_FLOATS_STRUCT * MATH_Store_XMTRX(ALL_FLOATS_STRUCT * destination) +{ + char * output = ((char*)destination) + sizeof(ALL_FLOATS_STRUCT) + 8; // ALL_FLOATS_STRUCT should be 64 bytes + + asm volatile ("pref @%[dest_base]\n\t" + // gotta wait for 6 clocks (30ns) memory access time for pref to work + "mov #32, r1\n\t" + "add %[dest_base], r1\n\t" // store offset by 32 in r1 + "pref @r1\n\t" // Get a head start prefetching the second half of the 64-byte data + // NOPs are in the MT group, so they are executed in parallel... + // all these nops should equal 2 cycles in this context... 
+ "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "nop\n\t" + "fschg\n\t" // switch fmov to paired moves (note: only paired moves can access XDn regs) + "fmov.d XD0, @-%[out_mtrx]\n\t" // These do *(--output) = XDn + "fmov.d XD2, @-%[out_mtrx]\n\t" + "fmov.d XD4, @-%[out_mtrx]\n\t" + "fmov.d XD6, @-%[out_mtrx]\n\t" + "fmov.d XD8, @-%[out_mtrx]\n\t" + "fmov.d XD10, @-%[out_mtrx]\n\t" + "fmov.d XD12, @-%[out_mtrx]\n\t" + "fmov.d XD14, @-%[out_mtrx]\n\t" + "fschg\n" // switch back to single moves + : [out_mtrx] "+&r" ((unsigned int)output) // outputs, "+" means r/w, "&" means it's written to before all inputs are consumed + : [dest_base] "r" ((unsigned int)destination) // inputs + : "r1", "memory" // clobbers + ); + + return destination; +} + +// Returns FV0, 4, 8, or 12 from XMTRX +// +// Sorry, it has to be done 4 at a time like this due to calling convention +// limits; under optimal optimization conditions, we only get 4 float registers +// for return values; any more and they get pushed to memory. +// +// IMPORTANT USAGE INFORMATION (get XMTRX vector) +// +// XMTRX format, using the front bank's FVn notation: +// +// FV0 FV4 FV8 FV12 +// --- --- --- ---- +// [ xf0 xf4 xf8 xf12 ] +// [ xf1 xf5 xf9 xf13 ] +// [ xf2 xf6 xf10 xf14 ] +// [ xf3 xf7 xf11 xf15 ] +// +// Return vector maps to XMTRX as below depending on the FVn value passed in: +// +// typedef struct { +// float z1; // will contain xf0, 4, 8 or 12 +// float z2; // will contain xf1, 5, 9, or 13 +// float z3; // will contain xf2, 6, 10, or 14 +// float z4; // will contain xf3, 7, 11, or 15 +// } RETURN_VECTOR_STRUCT; +// +// Valid values of 'which' are 0, 4, 8, or 12, corresponding to FV0, FV4, FV8, +// or FV12, respectively. Other values will return 0 in all four return values. 
// Copies one column vector of XMTRX (selected by 'which' = 0, 4, 8 or 12) into
// FR0-FR3 and returns it; any other selector yields four zeros.
//
// How the branch ladder works: bt.s/bf.s are delayed branches, so the
// instruction written after each one (shown indented by one space inside its
// string) executes whether or not the branch is taken. The compares therefore
// run in delay slots, and every case is entered with fschg switching to paired
// moves and left through a delay slot that holds the *next* label's fschg,
// which switches back to single moves. The exit path (label 1) runs fschg twice
// for the same reason.
static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Get_XMTRX_Vector(unsigned int which)
{
  // Results are pinned to FR0-FR3, the registers the paired moves below fill.
  register float __z1 __asm__("fr0");
  register float __z2 __asm__("fr1");
  register float __z3 __asm__("fr2");
  register float __z4 __asm__("fr3");

  // Note: only paired moves can access XDn regs
  asm volatile ("cmp/eq #0, %[select]\n\t" // if(which == 0), 1 -> T else 0 -> T
    "bt.s 0f\n\t" // do FV0
    " cmp/eq #4, %[select]\n\t" // if(which == 4), 1 -> T else 0 -> T
    "bt.s 4f\n\t" // do FV4
    " cmp/eq #8, %[select]\n\t" // if(which == 8), 1 -> T else 0 -> T
    "bt.s 8f\n\t" // do FV8
    " cmp/eq #12, %[select]\n\t" // if(which == 12), 1 -> T else 0 -> T
    "bf.s 1f\n" // exit if not even FV12 was true, otherwise do FV12
  "12:\n\t"
    " fschg\n\t" // paired moves for FV12 (and exit case)
    "fmov XD14, DR2\n\t"
    "fmov XD12, DR0\n\t"
    "bt.s 2f\n" // done (T is still 1 from the #12 compare)
  "8:\n\t"
    " fschg\n\t" // paired moves for FV8, back to singles for FV12
    "fmov XD10, DR2\n\t"
    "fmov XD8, DR0\n\t"
    "bf.s 2f\n" // done (T is 0: the delay-slot compare that followed the match failed)
  "4:\n\t"
    " fschg\n\t" // paired moves for FV4, back to singles for FV8
    "fmov XD6, DR2\n\t"
    "fmov XD4, DR0\n\t"
    "bf.s 2f\n" // done
  "0:\n\t"
    " fschg\n\t" // paired moves for FV0, back to singles for FV4
    "fmov XD2, DR2\n\t"
    "fmov XD0, DR0\n\t"
    "bf.s 2f\n" // done
  "1:\n\t"
    " fschg\n\t" // back to singles for FV0 and exit case
    "fldi0 FR0\n\t" // FR0-3 get zeroed out, then
    "fmov FR0, FR1\n\t"
    "fmov FR0, FR2\n\t"
    "fmov FR0, FR3\n"
  "2:\n"
    : "=w" (__z1), "=f" (__z2), "=f" (__z3), "=f" (__z4) // outputs
    : [select] "z" (which) // inputs ("z" = r0, the only register cmp/eq #imm accepts)
    : "t" // clobbers
  );

  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __z4};
  return output;
}

// Returns a 2x2 matrix from a quadrant of XMTRX
//
// Sorry, it has to be done 4 at a time like this due to calling convention
// limits; under optimal optimization conditions, we only get 4 float registers
// for return values; any more and they get pushed to memory.
+//
+// IMPORTANT USAGE INFORMATION (get XMTRX 2x2)
+//
+// Each 2x2 quadrant is of the form:
+//
+//  [ a b ]
+//  [ c d ]
+//
+// Return vector maps to the 2x2 matrix as below:
+//
+//  typedef struct {
+//    float z1; // a
+//    float z2; // c
+//    float z3; // b
+//    float z4; // d
+//  } RETURN_VECTOR_STRUCT;
+//
+// (So the function does a 2x2 transpose in storing the values relative to the
+// order stored in XMTRX.)
+//
+// Valid values of 'which' are 1, 2, 3, or 4, corresponding to the following
+// quadrants of XMTRX:
+//
+//        1             2
+//  [ xf0 xf4 ] | [ xf8  xf12 ]
+//  [ xf1 xf5 ] | [ xf9  xf13 ]
+//  --   3   -- | --    4    --
+//  [ xf2 xf6 ] | [ xf10 xf14 ]
+//  [ xf3 xf7 ] | [ xf11 xf15 ]
+//
+// Other input values will return 0 in all four return floats.
+static inline __attribute__((always_inline)) RETURN_VECTOR_STRUCT MATH_Get_XMTRX_2x2(unsigned int which)
+{
+  register float __z1 __asm__("fr0"); // pinned to FR0: receives 'a' of the selected quadrant
+  register float __z2 __asm__("fr1"); // pinned to FR1: receives 'c'
+  register float __z3 __asm__("fr2"); // pinned to FR2: receives 'b'
+  register float __z4 __asm__("fr3"); // pinned to FR3: receives 'd'
+
+  // Note: only paired moves can access XDn regs, so every path below executes fschg exactly twice (into paired mode, then back to single moves)
+  asm volatile ("cmp/eq #1, %[select]\n\t" // if(which == 1), 1 -> T else 0 -> T
+    "bt.s 1f\n\t" // do quadrant 1 (the compare on the next line still runs, in this branch's delay slot)
+    " cmp/eq #2, %[select]\n\t" // if(which == 2), 1 -> T else 0 -> T
+    "bt.s 2f\n\t" // do quadrant 2
+    " cmp/eq #3, %[select]\n\t" // if(which == 3), 1 -> T else 0 -> T
+    "bt.s 3f\n\t" // do quadrant 3
+    " cmp/eq #4, %[select]\n\t" // if(which == 4), 1 -> T else 0 -> T
+    "bf.s 0f\n" // exit if nothing was true, otherwise do quadrant 4
+  "4:\n\t" // reached by fallthrough only; the fschg below is also the delay slot of the bf.s above
+    " fschg\n\t" // paired moves for quadrant 4 (and exit case)
+    "fmov XD14, DR2\n\t" // FR2, FR3 <- xf14, xf15 (b, d)
+    "fmov XD10, DR0\n\t" // FR0, FR1 <- xf10, xf11 (a, c)
+    "bt.s 5f\n" // done: T is still 1 from the #4 compare, so this is always taken; its delay slot is the fschg under 3:
+  "3:\n\t"
+    " fschg\n\t" // paired moves for quadrant 3, back to singles for 4
+    "fmov XD6, DR2\n\t" // FR2, FR3 <- xf6, xf7 (b, d)
+    "fmov XD2, DR0\n\t" // FR0, FR1 <- xf2, xf3 (a, c)
+    "bf.s 5f\n" // done: T is 0 here (the delay-slot compare against #4 failed), so this is always taken
+  "2:\n\t"
+    " fschg\n\t" // paired moves for quadrant 2, back to singles for 3
+    "fmov XD12, DR2\n\t" // FR2, FR3 <- xf12, xf13 (b, d)
+    "fmov XD8, DR0\n\t" // FR0, FR1 <- xf8, xf9 (a, c)
+    "bf.s 5f\n" // done: T is 0 here (the delay-slot compare against #3 failed), so this is always taken
+  "1:\n\t"
+    " fschg\n\t" // paired moves for quadrant 1, back to singles for 2
+    "fmov XD4, DR2\n\t" // FR2, FR3 <- xf4, xf5 (b, d)
+    "fmov XD0, DR0\n\t" // FR0, FR1 <- xf0, xf1 (a, c)
+    "bf.s 5f\n" // done: T is 0 here (the delay-slot compare against #2 failed), so this is always taken
+  "0:\n\t" // invalid 'which': arrives in paired mode because the fschg under 4: ran in the bf.s delay slot
+    " fschg\n\t" // back to singles for quadrant 1 and exit case
+    "fldi0 FR0\n\t" // FR0 = 0.0f, then copied into FR1-FR3 so all four results are zero
+    "fmov FR0, FR1\n\t"
+    "fmov FR0, FR2\n\t"
+    "fmov FR0, FR3\n"
+  "5:\n" // common exit: FPU is back in single-move mode on every path
+    : "=w" (__z1), "=f" (__z2), "=f" (__z3), "=f" (__z4) // outputs (NOTE(review): "w" presumably means FR0 specifically -- confirm against the GCC SH constraint list)
+    : [select] "z" (which) // inputs (NOTE(review): "z" presumably pins 'which' to R0, which cmp/eq #imm needs -- confirm)
+    : "t" // clobbers: the T bit, rewritten by every cmp/eq above
+  );
+
+  RETURN_VECTOR_STRUCT output = {__z1, __z2, __z3, __z4}; // pack FR0-FR3 into the by-value return struct
+  return output;
+}
+
+// It is not possible to return an entire 4x4 matrix in registers, as the only
+// registers allowed for return values are R0-R3 and FR0-FR3. All others are
+// marked caller save, which means they could be restored from stack and clobber
+// anything returned in them.
+//
+// In general, writing the entire required math routine in one asm function is
+// the best way to go for performance reasons anyway, and in that situation one
+// can just throw calling convention to the wind until returning back to C.
+ +#endif /* __SH4_MATH_H_ */ \ No newline at end of file diff --git a/GL/texture.c b/GL/texture.c index 7160e94..bf00983 100644 --- a/GL/texture.c +++ b/GL/texture.c @@ -743,11 +743,11 @@ GLint _cleanInternalFormat(GLint internalFormat) { typedef void (*TextureConversionFunc)(const GLubyte*, GLubyte*); -static inline void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) { +static INLINE_DEBUG void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) { *((GLushort*) dest) = (source[3] & 0xF0) << 8 | (source[0] & 0xF0) << 4 | (source[1] & 0xF0) | (source[2] & 0xF0) >> 4; } -static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) { +static INLINE_DEBUG void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) { /* Noop */ GLubyte* dst = (GLubyte*) dest; dst[0] = source[0]; @@ -756,11 +756,11 @@ static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) { dst[3] = source[3]; } -static inline void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) { +static INLINE_DEBUG void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) { *((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3); } -static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) { +static INLINE_DEBUG void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) { /* Noop */ GLubyte* dst = (GLubyte*) dest; dst[0] = source[0]; @@ -769,24 +769,24 @@ static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) { dst[3] = 255; } -static inline void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) { +static INLINE_DEBUG void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) { *((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3); } -static inline void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) { +static INLINE_DEBUG void _rgba8888_to_a000(const GLubyte* source, 
GLubyte* dest) { *((GLushort*) dest) = ((source[3] & 0b11111000) << 8); } -static inline void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) { +static INLINE_DEBUG void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) { *((GLushort*) dest) = (source[0] & 0b11111000) << 8; } -static inline void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) { +static INLINE_DEBUG void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) { GLushort* src = (GLushort*) source; *((GLushort*) dest) = ((*src & 0x000F) << 12) | *src >> 4; } -static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) { +static INLINE_DEBUG void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) { GLushort src = *((GLushort*) source); GLubyte* dst = (GLubyte*) dest; @@ -796,7 +796,7 @@ static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) { dst[3] = ((src & 0x000F)) * 2; } -static inline void _i8_to_i8(const GLubyte* source, GLubyte* dest) { +static INLINE_DEBUG void _i8_to_i8(const GLubyte* source, GLubyte* dest) { /* For indexes */ GLubyte* dst = (GLubyte*) dest; *dst = *source; diff --git a/containers/aligned_vector.c b/containers/aligned_vector.c index 708b187..1657a60 100644 --- a/containers/aligned_vector.c +++ b/containers/aligned_vector.c @@ -3,6 +3,8 @@ #include #include #include +#include +#include #if defined(__APPLE__) || defined(__WIN32__) /* Linux + Kos define this, OSX does not, so just use malloc there */ @@ -25,7 +27,7 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size) { } -static inline unsigned int round_to_chunk_size(unsigned int val) { +static INLINE_DEBUG unsigned int round_to_chunk_size(unsigned int val) { const unsigned int n = val; const unsigned int m = ALIGNED_VECTOR_CHUNK_SIZE; @@ -107,33 +109,12 @@ void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_co } } -void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) { - #if 0 - 
if(index >= vector->size){ - char msg[60]; - sprintf(msg, "Vector OOB: %d %d wanted %d\n", vector->capacity, vector->size, index); - //aligned_vector_resize(vector, index); - assert_msg(index < vector->size, msg); - } - #endif - assert(index < vector->size); - return &vector->data[index * vector->element_size]; -} - -void* aligned_vector_back(AlignedVector* vector) { - return aligned_vector_at(vector, vector->size - 1); -} - void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) { const unsigned int current = vector->size; aligned_vector_resize(vector, vector->size + additional_count); return aligned_vector_at(vector, current); } -void aligned_vector_clear(AlignedVector* vector) { - vector->size = 0; -} - void aligned_vector_shrink_to_fit(AlignedVector* vector) { if(vector->size == 0) { free(vector->data); diff --git a/containers/aligned_vector.h b/containers/aligned_vector.h index a002ece..d672b47 100644 --- a/containers/aligned_vector.h +++ b/containers/aligned_vector.h @@ -5,6 +5,8 @@ extern "C" { #endif +#include "../GL/cygprofile.h" + typedef struct { unsigned int size; unsigned int capacity; @@ -18,12 +20,27 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size); void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count); void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count); void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count); -void* aligned_vector_at(const AlignedVector* vector, const unsigned int index); +INLINE_ALWAYS void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) { + #if 0 + if(index >= vector->size){ + char msg[60]; + sprintf(msg, "Vector OOB: %d %d wanted %d\n", vector->capacity, vector->size, index); + //aligned_vector_resize(vector, index); + assert_msg(index < vector->size, msg); + } + assert(index < vector->size); /* Check here */ + #endif + return &vector->data[index * 
vector->element_size]; +} void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count); -void aligned_vector_clear(AlignedVector* vector); +INLINE_ALWAYS void aligned_vector_clear(AlignedVector* vector){ + vector->size = 0; +} void aligned_vector_shrink_to_fit(AlignedVector* vector); void aligned_vector_cleanup(AlignedVector* vector); -void* aligned_vector_back(AlignedVector* vector); +INLINE_ALWAYS void* aligned_vector_back(AlignedVector* vector){ + return aligned_vector_at(vector, vector->size - 1); +} #ifdef __cplusplus } diff --git a/containers/named_array.c b/containers/named_array.c index cf06373..6efa7dc 100644 --- a/containers/named_array.c +++ b/containers/named_array.c @@ -44,13 +44,6 @@ void named_array_init(NamedArray* array, unsigned int element_size, unsigned int memset(array->elements, 0, element_size * max_elements); } -char named_array_used(NamedArray* array, unsigned int id) { - unsigned int i = id / 8; - unsigned int j = id % 8; - - unsigned char v = array->used_markers[i] & (unsigned char) (1 << j); - return !!(v); -} void* named_array_alloc(NamedArray* array, unsigned int* new_id) { unsigned int i = 0, j = 0; diff --git a/containers/named_array.h b/containers/named_array.h index a0f6c97..5877059 100644 --- a/containers/named_array.h +++ b/containers/named_array.h @@ -5,6 +5,8 @@ extern "C" { #endif +#include "../GL/cygprofile.h" + typedef struct { unsigned int element_size; unsigned int max_element_count; @@ -14,7 +16,13 @@ typedef struct { } NamedArray; void named_array_init(NamedArray* array, unsigned int element_size, unsigned int max_elements); -char named_array_used(NamedArray* array, unsigned int id); +INLINE_ALWAYS char named_array_used(NamedArray* array, unsigned int id) { + const unsigned int i = id / 8; + const unsigned int j = id % 8; + + unsigned char v = array->used_markers[i] & (unsigned char) (1 << j); + return !!(v); +} void* named_array_alloc(NamedArray* array, unsigned int* new_id); void* 
named_array_reserve(NamedArray* array, unsigned int id);