feat: implement proper changes from profiling

- math
- inlining
This commit is contained in:
Hayden Kowalchuk 2020-02-18 11:48:37 -05:00
parent a2dcfcf997
commit 3a4f09bef2
19 changed files with 2572 additions and 227 deletions

View File

@ -1,9 +1,8 @@
#pragma once
#ifndef CONFIG_H #ifndef CONFIG_H
#define CONFIG_H #define CONFIG_H
/* This figure is derived from the needs of Quake 1 */ /* This figure is derived from the needs of Quake 1 */
#define MAX_TEXTURE_COUNT 1088 #define MAX_TEXTURE_COUNT 1088
#endif // CONFIG_H #endif // CONFIG_H

227
GL/cygprofile.c Normal file
View File

@ -0,0 +1,227 @@
/* Based on the idea from Erich Styger */
/* profiled instrument guided profiling for gldc on hardware */
#include "cygprofile.h"
#include <kos.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include "perfctr.h"
#include "private.h"
#if CYG_FUNC_TRACE_ENABLED
/* Wrapper that forwards to strncat() with the last two arguments swapped:
 * _strcat(dst, n, src) expands to strncat(dst, src, n). */
#define _strcat(x, y, z) strncat(x, z, y)
/* Generic status codes. The ring-buffer helpers in this file return values
 * from this table. The #ifndef guard skips the table when __PE_Error_H has
 * already been defined elsewhere. */
#ifndef __PE_Error_H
#define __PE_Error_H
#define ERR_OK 0 /* OK */
#define ERR_SPEED 1 /* This device does not work in the active speed mode. */
#define ERR_RANGE 2 /* Parameter out of range. */
#define ERR_VALUE 3 /* Parameter of incorrect value. */
#define ERR_OVERFLOW 4 /* Timer overflow. */
#define ERR_MATH 5 /* Overflow during evaluation. */
#define ERR_ENABLED 6 /* Device is enabled. */
#define ERR_DISABLED 7 /* Device is disabled. */
#define ERR_BUSY 8 /* Device is busy. */
#define ERR_NOTAVAIL 9 /* Requested value or method not available. */
#define ERR_RXEMPTY 10 /* No data in receiver. */
#define ERR_TXFULL 11 /* Transmitter is full. */
#define ERR_BUSOFF 12 /* Bus not available. */
#define ERR_OVERRUN 13 /* Overrun error is detected. */
#define ERR_FRAMING 14 /* Framing error is detected. */
#define ERR_PARITY 15 /* Parity error is detected. */
#define ERR_NOISE 16 /* Noise error is detected. */
#define ERR_IDLE 17 /* Idle error is detected. */
#define ERR_FAULT 18 /* Fault error is detected. */
#define ERR_BREAK 19 /* Break char is received during communication. */
#define ERR_CRC 20 /* CRC error is detected. */
#define ERR_ARBITR 21 /* A node lost arbitration. This error occurs if two nodes start transmission at the same time. */
#define ERR_PROTECT 22 /* Protection error is detected. */
#endif /* __PE_Error_H */
#define CYG_RNG_BUF_NOF_ELEMS (8096 * 4)
/*!< Number of slots in CYG_RNG_buffer, the table used to record function calls */
#define CYG_THUMB_MASK 0xFFFFFFFF
/*!< Mask ANDed onto addresses before they are printed. All 32 bits are set, so addresses currently pass through unchanged (no LSB/thumb bit is cleared). */
/* Hash of two uint32_t values; CYG_RNG_Put() reduces the result modulo
 * CYG_RNG_BUF_NOF_ELEMS to choose a slot in CYG_RNG_buffer. */
#define HASH_PAIR(x, y) (((x)*0x1f1f1f1f) ^ (y))
static bool CYG_Enabled = false; /*!< true while __cyg_profile_func_enter() should record calls */
/*!
* One slot of the trace table: an address pair plus the number of times
* that pair has been recorded.
*/
typedef struct
{
void *this_fn; /*!< first address of the pair, as passed to CYG_Store() */
void *call_site; /*!< second address of the pair, as passed to CYG_Store(); CYG_PrintCallTrace() skips slots where this is NULL */
uint32_t counter; /*!< hit count; CYG_Store() starts it at 1 and CYG_RNG_Put() treats 0 as "slot is empty" */
} CYG_RNG_ElementType;
typedef uint32_t CYG_RNG_BufSizeType; /*!< index type for the trace table */
static CYG_RNG_ElementType CYG_RNG_buffer[CYG_RNG_BUF_NOF_ELEMS]; /*!< trace table, indexed by hash in CYG_RNG_Put() and sequentially in CYG_RNG_Get() */
static CYG_RNG_BufSizeType CYG_RNG_outIdx; /*!< read cursor used by CYG_RNG_Get() */
static CYG_RNG_BufSizeType CYG_RNG_inSize; /*!< number of slots CYG_RNG_Get() may still hand out; CYG_Init() sets it to the full table size */
/*!
 * \brief Records one occurrence of an address pair in the trace table.
 *
 * The table is an open-addressed hash table keyed on (call_site, this_fn).
 * A slot whose counter is 0 is empty. Starting at the pair's hash slot, a
 * short linear probe looks for either the slot already holding this pair
 * (its counter is incremented) or the first empty slot (the element is
 * copied in, counter included).
 *
 * Comparing the stored addresses matters: without it, two different pairs
 * that hash to the same slot would silently be merged into one count.
 *
 * \param elem Trace element to record; elem->counter is the initial count
 *             used when the pair is seen for the first time.
 * \return ERR_OK if the pair was stored or counted, ERR_TXFULL if no usable
 *         slot was found within the probe limit (the sample is dropped).
 */
__attribute__((no_instrument_function)) static uint8_t CYG_RNG_Put(CYG_RNG_ElementType *elem) {
    /* Upper bound on probing so the cost of the entry hook stays bounded
     * even when the table is nearly full. */
    enum { CYG_RNG_MAX_PROBES = 32 };

    CYG_RNG_BufSizeType slot = HASH_PAIR((uint32_t)(uintptr_t)elem->call_site,
                                         (uint32_t)(uintptr_t)elem->this_fn) % CYG_RNG_BUF_NOF_ELEMS;
    unsigned int probe;

    for (probe = 0; probe < CYG_RNG_MAX_PROBES; probe++) {
        CYG_RNG_ElementType *entry = &CYG_RNG_buffer[slot];

        if (entry->counter == 0) {
            /* Empty slot: first time this pair is seen. */
            *entry = *elem;
            return ERR_OK;
        }
        if (entry->this_fn == elem->this_fn && entry->call_site == elem->call_site) {
            /* Saturate instead of wrapping: a wrap to 0 would make the
             * slot look empty again. */
            if (entry->counter != UINT32_MAX) {
                entry->counter++;
            }
            return ERR_OK;
        }

        /* Slot belongs to a different pair: try the next one, wrapping. */
        slot++;
        if (slot == CYG_RNG_BUF_NOF_ELEMS) {
            slot = 0;
        }
    }
    return ERR_TXFULL;
}
/*!
 * \brief Copies the slot at the read cursor out of the trace table.
 *
 * On success the remaining-slot count is decremented and the read cursor
 * advances by one, wrapping to 0 at the end of the table.
 *
 * \param elemP Destination for the copied slot; untouched on failure.
 * \return ERR_OK on success, ERR_RXEMPTY when no slots remain to be read.
 */
__attribute__((no_instrument_function)) static uint8_t CYG_RNG_Get(CYG_RNG_ElementType *elemP) {
    if (CYG_RNG_inSize == 0) {
        return ERR_RXEMPTY;
    }

    *elemP = CYG_RNG_buffer[CYG_RNG_outIdx];
    CYG_RNG_inSize--;

    CYG_RNG_outIdx = (CYG_RNG_outIdx + 1 == CYG_RNG_BUF_NOF_ELEMS) ? 0 : CYG_RNG_outIdx + 1;
    return ERR_OK;
}
static uint32_t currentTime[2];
static uint32_t lastTime;
/*!
* \brief Stores a trace element into the ring buffer.
* \param this_fn Address of the caller function.
* \param call_site Return address to the function which called this_fn
* \return Error code, ERR_OK if everything is ok.
*/
__attribute__((no_instrument_function)) static void CYG_Store(void *this_fn, void *call_site) {
CYG_RNG_ElementType elem;
lastTime = currentTime[0];
PMCR_Read(1, (unsigned int *)currentTime);
//elem.isEnter = isEnter;
elem.call_site = call_site;
elem.this_fn = this_fn;
elem.counter = 1; //currentTime[0] - lastTime;
CYG_RNG_Put(&elem);
}
/*!
* \brief Function-entry hook. The compiler inserts a call to it at the start
*        of every instrumented function; it records the call while tracing
*        is enabled.
* \param this_fn Address of the function being entered.
* \param call_site Address in the caller from which this_fn was called.
*
* NOTE(review): the arguments are handed to CYG_Store() in the opposite
* order to its (this_fn, call_site) parameter list, so the stored this_fn
* field holds call_site and vice versa. Confirm whether whatever consumes
* the CYG_PrintCallTrace() output relies on that column order before
* changing it.
*/
__attribute__((no_instrument_function)) void __cyg_profile_func_enter(void *this_fn, void *call_site) {
if (CYG_Enabled) {
CYG_Store(call_site, this_fn);
}
}
/*!
* \brief Function-exit hook. The compiler inserts a call to it at the end of
*        every instrumented function. Intentionally empty: only entries are
*        recorded by this profiler, but the symbol must still exist.
* \param this_fn Address of the function being left (unused).
* \param call_site Address in the caller from which this_fn was called (unused).
*/
__attribute__((no_instrument_function)) void __cyg_profile_func_exit(__attribute__((unused)) void *this_fn, __attribute__((unused)) void *call_site) {
}
/*!
 * \brief Stops tracing and dumps the trace table to the console.
 *
 * Output format:
 *   - one header line: the offset of _etext from BASE_ADDRESS, as 0x%08x
 *   - one line per used slot: "{ 0x<this_fn> 0x<call_site> <count>\r\n"
 *
 * Slots whose call_site field is NULL are skipped. Reading consumes the
 * table through CYG_RNG_Get(), so a second call without re-initialising
 * prints only the header. Tracing stays disabled afterwards.
 *
 * Each record is formatted directly with a literal format string rather
 * than via an intermediate buffer passed to printf() as the format.
 */
__attribute__((no_instrument_function)) void CYG_PrintCallTrace(void) {
    CYG_RNG_BufSizeType i;
    CYG_RNG_ElementType elem;

    CYG_Enabled = false;
    printf("0x%08x\n", ((unsigned int)&_etext) - BASE_ADDRESS);

    CYG_RNG_outIdx = 0;
    for (i = 0; i < CYG_RNG_BUF_NOF_ELEMS; i++) {
        if (CYG_RNG_Get(&elem) != ERR_OK) {
            /* Nothing left to read; later calls would fail the same way. */
            break;
        }
        if (elem.call_site == NULL) {
            continue;
        }
        printf("{ 0x%" PRIXPTR " 0x%" PRIXPTR " %u\r\n",
               (uintptr_t)(elem.this_fn) & CYG_THUMB_MASK,
               (uintptr_t)(elem.call_site) & CYG_THUMB_MASK,
               (unsigned int)elem.counter);
    }
}
/*!
 * \brief Clears the trace table, starts performance counter 1 and enables tracing.
 *
 * Does nothing if tracing is already enabled. The enable flag is raised
 * last, so the entry hook cannot record into the table before it has been
 * cleared and the counter has been started.
 */
__attribute__((no_instrument_function)) void CYG_Init(void) {
    if (CYG_Enabled) {
        return;
    }

    /* Mark every slot as readable so a later dump walks the whole table. */
    CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS;
    CYG_RNG_outIdx = 0;

    currentTime[0] = currentTime[1] = 0;
    lastTime = 0;
    memset(CYG_RNG_buffer, 0, sizeof(CYG_RNG_buffer));

    PMCR_Init(1, PMCR_ELAPSED_TIME_MODE, PMCR_COUNT_CPU_CYCLES);

    CYG_Enabled = true;
}
/*!
 * \brief Disables tracing and wipes the trace table.
 *
 * The read cursor and remaining-slot count are reset to the same values
 * CYG_Init() uses. The performance counter is left as it is.
 */
__attribute__((no_instrument_function)) void CYG_Deinit(void) {
    CYG_Enabled = false;

    CYG_RNG_outIdx = 0;
    CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS;
    memset(CYG_RNG_buffer, 0, sizeof CYG_RNG_buffer);
}
#else
/* Tracing compiled out (CYG_FUNC_TRACE_ENABLED is 0): keep the public API as
 * empty functions so callers build and link without their own #if checks. */
void CYG_PrintCallTrace(void){}
void CYG_Init(void){}
void CYG_Deinit(void){}
#endif

33
GL/cygprofile.h Normal file
View File

@ -0,0 +1,33 @@
#pragma once
#ifndef CYGPROFILE_H_
#define CYGPROFILE_H_
/* Based on the idea from Erich Styger */
/* Instrumentation-guided profiling for GLdc on hardware: the compiler-inserted
 * function-entry hook records calls, and CYG_PrintCallTrace() dumps them. */
/* Function qualifiers used across the library:
 *   NO_INSTRUMENT - inline, and excluded from entry/exit instrumentation.
 *   INLINE_DEBUG  - NO_INSTRUMENT plus always_inline.
 *   INLINE_ALWAYS - static variant of INLINE_DEBUG. */
#define NO_INSTRUMENT inline __attribute__((no_instrument_function))
#define INLINE_DEBUG NO_INSTRUMENT __attribute__((always_inline))
#define INLINE_ALWAYS static NO_INSTRUMENT __attribute__((always_inline))
/* Symbol defined outside this file; CYG_PrintCallTrace() prints its address
 * relative to BASE_ADDRESS. Presumably the linker's end-of-text marker - confirm. */
extern char _etext;
/* Subtracted from &_etext in the trace header line. Presumably the load
 * address of the executable - confirm against the linker script. */
#define BASE_ADDRESS 0x8c010000
#define CYG_FUNC_TRACE_ENABLED (1)
/*!< 1: Trace enabled, 0: trace disabled (the API below becomes empty stubs) */
/*!
* \brief Stop tracing and print the recorded call trace to the terminal.
*/
void CYG_PrintCallTrace(void);
/*!
* \brief Clear the trace table and start tracing. No effect if already tracing.
*/
void CYG_Init(void);
/*!
* \brief Stop tracing and clear the trace table.
*/
void CYG_Deinit(void);
#endif /* CYGPROFILE_H_ */

View File

@ -56,7 +56,7 @@ void _glInitAttributePointers() {
NORMAL_POINTER.size = 3; NORMAL_POINTER.size = 3;
} }
static inline GLuint byte_size(GLenum type) { static INLINE_DEBUG GLuint byte_size(GLenum type) {
switch(type) { switch(type) {
case GL_BYTE: return sizeof(GLbyte); case GL_BYTE: return sizeof(GLbyte);
case GL_UNSIGNED_BYTE: return sizeof(GLubyte); case GL_UNSIGNED_BYTE: return sizeof(GLubyte);
@ -513,7 +513,7 @@ PVRHeader* _glSubmissionTargetHeader(SubmissionTarget* target) {
return aligned_vector_at(&target->output->vector, target->header_offset); return aligned_vector_at(&target->output->vector, target->header_offset);
} }
Vertex* _glSubmissionTargetStart(SubmissionTarget* target) { INLINE_DEBUG Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
assert(target->start_offset < target->output->vector.size); assert(target->start_offset < target->output->vector.size);
return aligned_vector_at(&target->output->vector, target->start_offset); return aligned_vector_at(&target->output->vector, target->start_offset);
} }
@ -1006,6 +1006,7 @@ static void mat_transform_normal3(const float* xyz, const float* xyzOut, const u
static void light(SubmissionTarget* target) { static void light(SubmissionTarget* target) {
#if 0
typedef struct { typedef struct {
float xyz[3]; float xyz[3];
float n[3]; float n[3];
@ -1057,6 +1058,35 @@ static void light(SubmissionTarget* target) {
vertex->bgra[G8IDX] = (GLubyte) (255.0f * fminf(total[1], 1.0f)); vertex->bgra[G8IDX] = (GLubyte) (255.0f * fminf(total[1], 1.0f));
vertex->bgra[B8IDX] = (GLubyte) (255.0f * fminf(total[2], 1.0f)); vertex->bgra[B8IDX] = (GLubyte) (255.0f * fminf(total[2], 1.0f));
} }
#endif
if(!_glIsLightingEnabled()) {
return;
}
static AlignedVector* eye_space_data = NULL;
if(!eye_space_data) {
eye_space_data = (AlignedVector*) malloc(sizeof(AlignedVector));
aligned_vector_init(eye_space_data, sizeof(EyeSpaceData));
}
aligned_vector_resize(eye_space_data, target->count);
/* Perform lighting calculations and manipulate the colour */
Vertex* vertex = _glSubmissionTargetStart(target);
VertexExtra* extra = aligned_vector_at(target->extras, 0);
EyeSpaceData* eye_space = (EyeSpaceData*) eye_space_data->data;
_glMatrixLoadModelView();
mat_transform3(vertex->xyz, eye_space->xyz, target->count, sizeof(Vertex), sizeof(EyeSpaceData));
_glMatrixLoadNormal();
mat_transform_normal3(extra->nxyz, eye_space->n, target->count, sizeof(VertexExtra), sizeof(EyeSpaceData));
EyeSpaceData* ES = aligned_vector_at(eye_space_data, 0);
_glPerformLighting(vertex, ES, target->count);
} }
static void divide(SubmissionTarget* target) { static void divide(SubmissionTarget* target) {

View File

@ -1,5 +1,6 @@
#include <stdio.h> #include <stdio.h>
#include "private.h" #include "private.h"
#include "config.h"
#include "../include/glkos.h" #include "../include/glkos.h"
#include "../include/glext.h" #include "../include/glext.h"
@ -94,62 +95,62 @@ void APIENTRY glFramebufferTexture2DEXT(GLenum target, GLenum attachment, GLenum
ACTIVE_FRAMEBUFFER->texture_id = texture; ACTIVE_FRAMEBUFFER->texture_id = texture;
} }
static inline GLuint A1555(GLuint v) { static INLINE_DEBUG GLuint A1555(GLuint v) {
const GLuint MASK = (1 << 15); const GLuint MASK = (1 << 15);
return (v & MASK) >> 15; return (v & MASK) >> 15;
} }
static inline GLuint R1555(GLuint v) { static INLINE_DEBUG GLuint R1555(GLuint v) {
const GLuint MASK = (31 << 10); const GLuint MASK = (31 << 10);
return (v & MASK) >> 10; return (v & MASK) >> 10;
} }
static inline GLuint G1555(GLuint v) { static INLINE_DEBUG GLuint G1555(GLuint v) {
const GLuint MASK = (31 << 5); const GLuint MASK = (31 << 5);
return (v & MASK) >> 5; return (v & MASK) >> 5;
} }
static inline GLuint B1555(GLuint v) { static INLINE_DEBUG GLuint B1555(GLuint v) {
const GLuint MASK = (31 << 0); const GLuint MASK = (31 << 0);
return (v & MASK) >> 0; return (v & MASK) >> 0;
} }
static inline GLuint A4444(GLuint v) { static INLINE_DEBUG GLuint A4444(GLuint v) {
const GLuint MASK = (0xF << 12); const GLuint MASK = (0xF << 12);
return (v & MASK) >> 12; return (v & MASK) >> 12;
} }
static inline GLuint R4444(GLuint v) { static INLINE_DEBUG GLuint R4444(GLuint v) {
const GLuint MASK = (0xF << 8); const GLuint MASK = (0xF << 8);
return (v & MASK) >> 8; return (v & MASK) >> 8;
} }
static inline GLuint G4444(GLuint v) { static INLINE_DEBUG GLuint G4444(GLuint v) {
const GLuint MASK = (0xF << 4); const GLuint MASK = (0xF << 4);
return (v & MASK) >> 4; return (v & MASK) >> 4;
} }
static inline GLuint B4444(GLuint v) { static INLINE_DEBUG GLuint B4444(GLuint v) {
const GLuint MASK = (0xF << 0); const GLuint MASK = (0xF << 0);
return (v & MASK) >> 0; return (v & MASK) >> 0;
} }
static inline GLuint R565(GLuint v) { static INLINE_DEBUG GLuint R565(GLuint v) {
const GLuint MASK = (31 << 11); const GLuint MASK = (31 << 11);
return (v & MASK) >> 11; return (v & MASK) >> 11;
} }
static inline GLuint G565(GLuint v) { static INLINE_DEBUG GLuint G565(GLuint v) {
const GLuint MASK = (63 << 5); const GLuint MASK = (63 << 5);
return (v & MASK) >> 5; return (v & MASK) >> 5;
} }
static inline GLuint B565(GLuint v) { static INLINE_DEBUG GLuint B565(GLuint v) {
const GLuint MASK = (31 << 0); const GLuint MASK = (31 << 0);
return (v & MASK) >> 0; return (v & MASK) >> 0;
} }
GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) { static NO_INSTRUMENT GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) {
GLushort* s1 = ((GLushort*) src); GLushort* s1 = ((GLushort*) src);
GLushort* s2 = ((GLushort*) src) + 1; GLushort* s2 = ((GLushort*) src) + 1;
GLushort* s3 = ((GLushort*) src) + srcWidth; GLushort* s3 = ((GLushort*) src) + srcWidth;

View File

@ -19,3 +19,7 @@
#include "matrix.c" #include "matrix.c"
#include "state.c" #include "state.c"
#include "texture.c" #include "texture.c"
#include "../containers/stack.c"
#include "../containers/aligned_vector.c"
#include "../containers/named_array.c"

View File

@ -281,98 +281,143 @@ static inline float FPOW(float b, float p) {
return FEXP(FLOG(b) * p); return FEXP(FLOG(b) * p);
} }
void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) __attribute__((optimize("fast-math"))); #define LIGHT_COMPONENT(C) { \
void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) { const GLfloat* acm = &MA[C]; \
LightSource* l = &LIGHTS[light]; const GLfloat* dcm = &MD[C]; \
const GLfloat* scm = &MS[C]; \
struct vec3f L = { const GLfloat* scli = &light->specular[C]; \
l->position[0], const GLfloat* dcli = &light->diffuse[C]; \
l->position[1], const GLfloat* acli = &light->ambient[C]; \
l->position[2] const GLfloat* srm = &MATERIAL.exponent; \
}; const GLfloat fi = (LdotN == 0) ? 0 : 1; \
GLfloat component = (*acm * *acli); \
if(!l->is_directional) { component += (LdotN * *dcm * *dcli); \
L.x -= pos[0]; component += (FPOW((fi * NdotH), *srm) * *scm * *scli); \
L.y -= pos[1]; component *= att; \
L.z -= pos[2]; component *= spot; \
} final[C] += component; \
struct vec3f N = {
normal[0],
normal[1],
normal[2]
};
struct vec3f V = {
pos[0],
pos[1],
pos[2]
};
GLfloat d;
vec3f_length(L.x, L.y, L.z, d);
GLfloat oneOverL = 1.0f / d;
L.x *= oneOverL;
L.y *= oneOverL;
L.z *= oneOverL;
vec3f_normalize(V.x, V.y, V.z);
GLfloat NdotL, VdotN;
vec3f_dot(N.x, N.y, N.z, L.x, L.y, L.z, NdotL);
vec3f_dot(V.x, V.y, V.z, N.x, N.y, N.z, VdotN);
GLfloat VdotR = VdotN - NdotL;
GLfloat specularPower = FPOW(VdotR > 0 ? VdotR : 0, MATERIAL.exponent);
GLboolean colorMaterial = _glIsColorMaterialEnabled();
GLfloat mD [] = {
(colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.diffuse[0],
(colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.diffuse[1],
(colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.diffuse[2],
(colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.diffuse[3]
};
GLfloat mA [] = {
(colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.ambient[0],
(colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.ambient[1],
(colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.ambient[2],
(colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.ambient[3]
};
GLfloat mS [] = {
(colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.specular[0],
(colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.specular[1],
(colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.specular[2],
(colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.specular[3]
};
colour[0] = l->ambient[0] * mA[0];
colour[1] = l->ambient[1] * mA[1];
colour[2] = l->ambient[2] * mA[2];
colour[3] = mD[3];
if(NdotL >= 0) {
colour[0] += (l->diffuse[0] * mD[0] * NdotL + l->specular[0] * mS[0] * specularPower);
colour[1] += (l->diffuse[1] * mD[1] * NdotL + l->specular[1] * mS[1] * specularPower);
colour[2] += (l->diffuse[2] * mD[2] * NdotL + l->specular[2] * mS[2] * specularPower);
}
if(!l->is_directional) {
GLfloat att = (
1.0f / (l->constant_attenuation + (l->linear_attenuation * d) + (l->quadratic_attenuation * d * d))
);
colour[0] *= att;
colour[1] *= att;
colour[2] *= att;
}
if(colour[0] > 1.0f) colour[0] = 1.0f;
if(colour[1] > 1.0f) colour[1] = 1.0f;
if(colour[2] > 1.0f) colour[2] = 1.0f;
if(colour[3] > 1.0f) colour[3] = 1.0f;
} }
/* Dot product of (x1, y1, z1) and (x2, y2, z2), with negative results
 * replaced by zero. Components are passed by pointer. */
static inline float vec3_dot_limited(
    const float* x1, const float* y1, const float* z1,
    const float* x2, const float* y2, const float* z2) {
    float dot;

    vec3f_dot(*x1, *y1, *z1, *x2, *y2, *z2, dot);

    if(dot < 0) {
        return 0;
    }
    return dot;
}
/* Computes a lit colour for each of `count` vertices and writes it back into
 * the vertex's bgra bytes.
 *
 * vertices - vertices to light; bgra is read as the colour-material input
 *            (when GL_COLOR_MATERIAL is enabled) and overwritten with the result.
 * es       - per-vertex position (xyz) and normal (n), walked in step with
 *            `vertices`. Assumed to be in eye space, going by the type name
 *            - TODO confirm against the caller.
 * count    - number of vertices / EyeSpaceData entries to process.
 *
 * For every vertex: start from scene ambient * material ambient + emissive,
 * then add each enabled light's contribution via LIGHT_COMPONENT for the
 * first three colour channels. Alpha is taken from the diffuse material.
 * Each channel is scaled by 255 and capped at 255 before being stored. */
void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count) {
int8_t i;
int32_t j;
const LightSource* light = NULL;
/* Colour-material state is sampled once, outside the vertex loop. */
const GLboolean colorMaterial = _glIsColorMaterialEnabled();
const GLboolean isDiffuseCM = isDiffuseColorMaterial();
const GLboolean isAmbientCM = isAmbientColorMaterial();
const GLboolean isSpecularCM = isSpecularColorMaterial();
/* Scratch for the current vertex colour as floats; static, so this function
 * is not reentrant. */
static GLfloat CM[4];
/* So the DC has 16 floating point registers, that means
* we need to limit the number of floats as much as possible
* to give the compiler a good enough chance to do the right
* thing */
Vertex* vertex = vertices;
const EyeSpaceData* data = es;
static const float ONE_OVER_255 = 1.0f / 255.0f;
for(j = 0; j < count; ++j, ++vertex, ++data) {
/* When GL_COLOR_MATERIAL is on, we need to pull out
* the passed in diffuse and use it */
const GLfloat* MD = MATERIAL.diffuse;
const GLfloat* MA = MATERIAL.ambient;
const GLfloat* MS = MATERIAL.specular;
if(colorMaterial) {
CM[0] = ((GLfloat) vertex->bgra[R8IDX]) * ONE_OVER_255;
CM[1] = ((GLfloat) vertex->bgra[G8IDX]) * ONE_OVER_255;
CM[2] = ((GLfloat) vertex->bgra[B8IDX]) * ONE_OVER_255;
CM[3] = ((GLfloat) vertex->bgra[A8IDX]) * ONE_OVER_255;
MD = (isDiffuseCM) ? CM : MATERIAL.diffuse;
MA = (isAmbientCM) ? CM : MATERIAL.ambient;
MS = (isSpecularCM) ? CM : MATERIAL.specular;
}
float final[4];
/* Initial, non-light related values */
final[0] = (SCENE_AMBIENT[0] * MA[0]) + MATERIAL.emissive[0];
final[1] = (SCENE_AMBIENT[1] * MA[1]) + MATERIAL.emissive[1];
final[2] = (SCENE_AMBIENT[2] * MA[2]) + MATERIAL.emissive[2];
final[3] = MD[3];
/* V: normalised vector from the vertex position towards the origin. */
float Vx, Vy, Vz;
Vx = -data->xyz[0];
Vy = -data->xyz[1];
Vz = -data->xyz[2];
vec3f_normalize(Vx, Vy, Vz);
for(i = 0; i < MAX_LIGHTS; ++i) {
if(!_glIsLightEnabled(i)) continue;
/* Calc light specific parameters */
light = &LIGHTS[i];
float Lx, Ly, Lz, D;
float Hx, Hy, Hz;
const float* Nx = &data->n[0];
const float* Ny = &data->n[1];
const float* Nz = &data->n[2];
/* L: vector from the vertex to the light position; D is its length.
 * NOTE(review): the subtraction is applied even when position[3] == 0,
 * which the attenuation below treats as the "no attenuation" case -
 * confirm whether such lights should use position[] directly as a direction. */
Lx = light->position[0] - data->xyz[0];
Ly = light->position[1] - data->xyz[1];
Lz = light->position[2] - data->xyz[2];
vec3f_length(Lx, Ly, Lz, D);
{
/* Normalize L - scoping ensures Llen is temporary */
const float Llen = 1.0f / D;
Lx *= Llen;
Ly *= Llen;
Lz *= Llen;
}
/* H: normalised sum of L and V (half vector). */
Hx = (Lx + Vx);
Hy = (Ly + Vy);
Hz = (Lz + Vz);
vec3f_normalize(Hx, Hy, Hz);
/* Both dot products are clamped to be non-negative. */
const float LdotN = vec3_dot_limited(
&Lx, &Ly, &Lz,
Nx, Ny, Nz
);
const float NdotH = vec3_dot_limited(
Nx, Ny, Nz,
&Hx, &Hy, &Hz
);
/* Attenuation is 1 when position[3] is 0; otherwise 1 / (kc + kl*D + kq*D*D). */
const float att = (
light->position[3] == 0.0f) ? 1.0f :
1.0f / (light->constant_attenuation + (light->linear_attenuation * D) + (light->quadratic_attenuation * D * D)
);
/* Spot factor is fixed at 1: no spotlight cone is evaluated here. */
const float spot = 1.0f;
/* Accumulate this light into final[0..2]; the macro reads light, MA/MD/MS,
 * LdotN, NdotH, att and spot from this scope. */
LIGHT_COMPONENT(0);
LIGHT_COMPONENT(1);
LIGHT_COMPONENT(2);
}
/* Convert to bytes, capping each channel at 255. */
vertex->bgra[R8IDX] = (GLubyte)(fminf(final[0] * 255.0f, 255.0f));
vertex->bgra[G8IDX] = (GLubyte)(fminf(final[1] * 255.0f, 255.0f));
vertex->bgra[B8IDX] = (GLubyte)(fminf(final[2] * 255.0f, 255.0f));
vertex->bgra[A8IDX] = (GLubyte)(fminf(final[3] * 255.0f, 255.0f));
}
}

View File

@ -476,84 +476,57 @@ void APIENTRY glDepthRange(GLclampf n, GLclampf f) {
DEPTH_RANGE_MULTIPLIER_H = (n + f) / 2.0f; DEPTH_RANGE_MULTIPLIER_H = (n + f) / 2.0f;
} }
#include "sh4_math.h"
/* Vector Cross Product - Used by glhLookAtf2 */ /* Vector Cross Product - Used by glhLookAtf2 */
static inline void vec3f_cross(const GLfloat* v1, const GLfloat* v2, GLfloat* result) { static inline void vec3f_cross(GLfloat* v1, GLfloat* v2, GLfloat* result) {
result[0] = v1[1] * v2[2] - v1[2] * v2[1]; result[0] = (v1[1] * v2[2]) - (v1[2] * v2[1]);
result[1] = v1[2] * v2[0] - v1[0] * v2[2]; result[1] = (v1[2] * v2[0]) - (v1[0] * v2[2]);
result[2] = v1[0] * v2[1] - v1[1] * v2[0]; result[2] = (v1[0] * v2[1]) - (v1[1] * v2[0]);
} }
/* glhLookAtf2 adapted from http://www.opengl.org/wiki/GluLookAt_code */
void glhLookAtf2(const GLfloat* eyePosition3D,
const GLfloat* center3D,
const GLfloat* upVector3D) {
/* Look-At Matrix */ static inline void vec3f_normalize_sh4(float *v){
static Matrix4x4 MatrixLookAt __attribute__((aligned(32))) = { float length, ilength;
1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f,
0.0f, 0.0f, 0.0f, 1.0f
};
GLfloat forward[3]; ilength = MATH_fsrra(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
GLfloat side[3]; length = MATH_Invert(ilength);
GLfloat up[3]; if (length)
{
vec3f_sub_normalize(center3D[0], center3D[1], center3D[2], v[0] *= ilength;
eyePosition3D[0], eyePosition3D[1], eyePosition3D[2], v[1] *= ilength;
forward[0], forward[1], forward[2]); v[2] *= ilength;
}
//Side = forward x up
vec3f_cross(forward, upVector3D, side);
vec3f_normalize(side[0], side[1], side[2]);
//Recompute up as: up = side x forward
vec3f_cross(side, forward, up);
MatrixLookAt[M0] = side[0];
MatrixLookAt[M4] = side[1];
MatrixLookAt[M8] = side[2];
MatrixLookAt[M12] = 0;
MatrixLookAt[M1] = up[0];
MatrixLookAt[M5] = up[1];
MatrixLookAt[M9] = up[2];
MatrixLookAt[M13] = 0;
MatrixLookAt[M2] = -forward[0];
MatrixLookAt[M6] = -forward[1];
MatrixLookAt[M10] = -forward[2];
MatrixLookAt[M14] = 0;
MatrixLookAt[M3] = MatrixLookAt[11] = MatrixLookAt[15] = 0;
MatrixLookAt[M15] = 1;
static Matrix4x4 trn __attribute__((aligned(32))) = {
1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f,
0.0f, 0.0f, 0.0f, 1.0f
};
trn[M12] = -eyePosition3D[0];
trn[M13] = -eyePosition3D[1];
trn[M14] = -eyePosition3D[2];
// Does not modify internal Modelview matrix
upload_matrix(&MatrixLookAt);
multiply_matrix(&trn);
multiply_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)));
download_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)));
} }
void gluLookAt(GLfloat eyex, GLfloat eyey, GLfloat eyez, GLfloat centerx, void gluLookAt(GLfloat eyex, GLfloat eyey, GLfloat eyez, GLfloat centerx,
GLfloat centery, GLfloat centerz, GLfloat upx, GLfloat upy, GLfloat centery, GLfloat centerz, GLfloat upx, GLfloat upy,
GLfloat upz) { GLfloat upz) {
GLfloat eye [] = { eyex, eyey, eyez }; GLfloat m [16];
GLfloat point [] = { centerx, centery, centerz }; GLfloat f [3];
GLfloat up [] = { upx, upy, upz }; GLfloat u [3];
glhLookAtf2(eye, point, up); GLfloat s [3];
f[0] = centerx - eyex;
f[1] = centery - eyey;
f[2] = centerz - eyez;
u[0] = upx;
u[1] = upy;
u[2] = upz;
vec3f_normalize_sh4(f);
vec3f_cross(f, u, s);
vec3f_normalize_sh4(s);
vec3f_cross(s, f, u);
m[0] = s[0]; m[4] = s[1]; m[8] = s[2]; m[12] = 0.0f;
m[1] = u[0]; m[5] = u[1]; m[9] = u[2]; m[13] = 0.0f;
m[2] = -f[0]; m[6] = -f[1]; m[10] = -f[2]; m[14] = 0.0f;
m[3] = 0.0f; m[7] = 0.0f; m[11] = 0.0f; m[15] = 1.0f;
glMultMatrixf(m);
glTranslatef(-eyex, -eyey, -eyez);
} }
void _glApplyRenderMatrix() { void _glApplyRenderMatrix() {

247
GL/perfctr.c Normal file
View File

@ -0,0 +1,247 @@
// ---- perfctr.c - SH7091 Performance Counter Module Code ----
//
// This file is part of the DreamHAL project, a hardware abstraction library
// primarily intended for use on the SH7091 found in hardware such as the SEGA
// Dreamcast game console.
//
// The performance counter module is hereby released into the public domain in
// the hope that it may prove useful. Now go profile some code and hit 60 fps! :)
//
// --Moopthehedgehog
// See perfctr.h for more of my notes and documentation on these counters.
#include "perfctr.h"
#include "cygprofile.h"
#if CYG_FUNC_TRACE_ENABLED
// Which counters this module has started: bit 0 = counter 1, bit 1 = counter 2.
static unsigned char pmcr_enabled = 0;
//
// Initialize performance counters: clear, then enable.
// Clearing a counter before its first start is good practice.
//
// Disabling and re-enabling a counter does not reset it; the clear has to
// happen while the counter is disabled.
//
// Re-enabling with a different mode and no explicit clear keeps counting
// from where the counter left off.
//
__attribute__((no_instrument_function)) void PMCR_Init(int which, unsigned short mode, unsigned char count_type) // Will do nothing if perfcounter is already running!
{
    // A request is honoured only when none of the counters it names is running.
    unsigned char may_start = 0;

    switch(which)
    {
    case 1: // counter 1
        may_start = (!pmcr_enabled) || (pmcr_enabled == 2);
        break;
    case 2: // counter 2
        may_start = (!pmcr_enabled) || (pmcr_enabled == 1);
        break;
    case 3: // Both
        may_start = !pmcr_enabled;
        break;
    default:
        break;
    }

    if(may_start)
    {
        PMCR_Enable(which, mode, count_type, PMCR_RESET_COUNTER);
    }
}
// Start one or both of the (formerly undocumented) performance counters.
//   which       - 1, 2, or 3 for both; other values do nothing
//   mode        - event selection bits, ORed into the control word as-is
//   count_type  - 0 or 1, placed at PMCR_CLOCK_TYPE_SHIFT
//   reset_count - 0 or 1, placed at PMCR_RESET_COUNTER_SHIFT
// Nothing happens if count_type/reset_count are out of range or if a
// requested counter is already running.
__attribute__((no_instrument_function)) void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_count) // Will do nothing if perfcounter is already running!
{
    // Both flags must be 0 or 1
    if((count_type | reset_count) > 1)
    {
        return;
    }

    const unsigned short pmcr_ctrl = PMCR_RUN_COUNTER | (reset_count << PMCR_RESET_COUNTER_SHIFT) | (count_type << PMCR_CLOCK_TYPE_SHIFT) | mode;
    volatile unsigned short * const ctrl1 = (volatile unsigned short*)PMCR1_CTRL_REG;
    volatile unsigned short * const ctrl2 = (volatile unsigned short*)PMCR2_CTRL_REG;

    switch(which)
    {
    case 1: // counter 1, only if it is not already running
        if((!pmcr_enabled) || (pmcr_enabled == 2))
        {
            *ctrl1 = pmcr_ctrl;
            pmcr_enabled += 1;
        }
        break;
    case 2: // counter 2, only if it is not already running
        if((!pmcr_enabled) || (pmcr_enabled == 1))
        {
            *ctrl2 = pmcr_ctrl;
            pmcr_enabled += 2;
        }
        break;
    case 3: // Both, only if neither is running
        if(!pmcr_enabled)
        {
            *ctrl1 = pmcr_ctrl;
            *ctrl2 = pmcr_ctrl;
            pmcr_enabled = 3;
        }
        break;
    default:
        break;
    }
}
// For reference:
// #define PMCTR1H_REG 0xFF100004
// #define PMCTR1L_REG 0xFF100008
// #define PMCTR2H_REG 0xFF10000C
// #define PMCTR2L_REG 0xFF100010
// The register addresses are kept in memory so the asm below can load them
// with mov.l from a memory operand.
static const unsigned int pmcr1_regh = PMCTR1H_REG;
static const unsigned int pmcr1_regl = PMCTR1L_REG;
static const unsigned int pmcr2_regh = PMCTR2H_REG;
static const unsigned int pmcr2_regl = PMCTR2L_REG;
// Read the current value of one performance counter.
//   which     - 1 or 2. Sorry, can only read one counter at a time!
//   out_array - array of 2x unsigned ints: [0] receives the low 32 bits,
//               [1] receives the high word masked to 16 bits.
// Results by case:
//   - requested counter is running: its current value
//   - no counter is running at all: both words are 0
//   - anything else (which is not 1 or 2, or the requested counter is not
//     running while the other one is): [1] = 0xffff, [0] = 0xffffffff
__attribute__((no_instrument_function)) void PMCR_Read(int which, volatile unsigned int *out_array)
{
// if pmcr is not enabled, this function will just return 0
// little endian (big endian would need to flip [0] and [1])
// Note: These reads really do need to be done in assembly: unfortunately it
// appears that using C causes GCC to insert a branch right smack in between
// the high and low reads of perf counter 2 (with a nop, so it's literally
// delaying the reads by several cycles!), which is totally insane. Doing it
// the assembly way ensures that nothing ridiculous like that happens. It's
// also portable between versions of GCC that do put the nonsensical branch in.
//
// One thing that would be nice is if SH4 had the movi20s instruction to make
// absolute addresses in 3 cycles, but only the SH2A has that... :(
if( (which == 1) && (pmcr_enabled & 0x1) )
{
// counter 1
// out_array[1] = *((volatile unsigned int*)PMCTR1H_REG) & 0xffff;
// out_array[0] = *((volatile unsigned int*)PMCTR1L_REG);
asm volatile("mov.l %[reg1h],r1\n\t" // load counter address (high)
"mov.l %[reg1l],r2\n\t" // load counter address (low)
"mov.l @r1,r1\n\t" // read counter (high)
"mov.l @r2,r2\n\t" // read counter (low)
"extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff
"mov.l r1,%[outh]\n\t" // get data to memory
"mov.l r2,%[outl]\n\t" // get data to memory
: [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0])
: [reg1h] "m" (pmcr1_regh), [reg1l] "m" (pmcr1_regl) // SH4 can't mov an immediate longword into a register...
: "r1", "r2"
);
}
else if( (which == 2) && (pmcr_enabled & 0x2) )
{
// counter 2
// out_array[1] = *((volatile unsigned int*)PMCTR2H_REG) & 0xffff;
// out_array[0] = *((volatile unsigned int*)PMCTR2L_REG);
asm volatile("mov.l %[reg2h],r1\n\t" // load counter address (high)
"mov.l %[reg2l],r2\n\t" // load counter address (low)
"mov.l @r1,r1\n\t" // read counter (high)
"mov.l @r2,r2\n\t" // read counter (low)
"extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff
"mov.l r1,%[outh]\n\t" // get data to memory
"mov.l r2,%[outl]\n\t" // get data to memory
: [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0])
: [reg2h] "m" (pmcr2_regh), [reg2l] "m" (pmcr2_regl) // SH4 can't mov an immediate longword into a register...
: "r1", "r2"
);
}
else if(!pmcr_enabled)
{
// No counter has been started: report zero
out_array[1] = 0;
out_array[0] = 0;
}
else // Invalid
{
// All-ones marker in both words (high word limited to 16 bits)
out_array[1] = 0xffff;
out_array[0] = 0xffffffff;
}
}
// Reset a running counter to 0 and start it again with the given settings.
// The counter is stopped first because it does not appear to be possible to
// clear a counter while it is running.
// Does nothing unless every counter named by `which` is currently running.
__attribute__((no_instrument_function)) void PMCR_Restart(int which, unsigned short mode, unsigned char count_type)
{
    unsigned char is_running = 0;

    switch(which)
    {
    case 1: // counter 1
        is_running = (pmcr_enabled & 0x1) != 0;
        break;
    case 2: // counter 2
        is_running = (pmcr_enabled & 0x2) != 0;
        break;
    case 3: // Both
        is_running = (pmcr_enabled == 3);
        break;
    default:
        break;
    }

    if(!is_running)
    {
        return;
    }

    PMCR_Stop(which);
    PMCR_Enable(which, mode, count_type, PMCR_RESET_COUNTER);
}
// Stop a running counter.
//
// Stopping (setting the 0x2000 bit) holds the data in the data registers,
// whereas disabling without that bit makes the counter read back as all 0
// without clearing it for the next start. Clearing only works while the
// counter is disabled and is handled by PMCR_Enable(), so this function
// only stops. It does nothing if the requested counter is not running.
__attribute__((no_instrument_function)) void PMCR_Stop(int which)
{
    volatile unsigned short * const ctrl1 = (volatile unsigned short*)PMCR1_CTRL_REG;
    volatile unsigned short * const ctrl2 = (volatile unsigned short*)PMCR2_CTRL_REG;

    switch(which)
    {
    case 1: // counter 1
        if(pmcr_enabled & 0x1)
        {
            *ctrl1 = PMCR_STOP_COUNTER;
            pmcr_enabled &= 0x2;
        }
        break;
    case 2: // counter 2
        if(pmcr_enabled & 0x2)
        {
            *ctrl2 = PMCR_STOP_COUNTER;
            pmcr_enabled &= 0x1;
        }
        break;
    case 3: // Both, only when both are running
        if(pmcr_enabled == 3)
        {
            *ctrl1 = PMCR_STOP_COUNTER;
            *ctrl2 = PMCR_STOP_COUNTER;
            pmcr_enabled = 0;
        }
        break;
    default:
        break;
    }
}
// Disable one or both counters by writing PMCR_DISABLE_COUNTER to the control
// register(s), regardless of whether they are currently flagged as enabled.
//
// Note that disabling does NOT clear the counter. It may appear that way
// because reading a disabled counter returns 0, but re-enabling without first
// clearing will simply continue where it left off.
//
// which: 1 = counter 1, 2 = counter 2, 3 = both; any other value is ignored
__attribute__((no_instrument_function)) void PMCR_Disable(int which)
{
  volatile unsigned short *const ctrl1 = (volatile unsigned short*)PMCR1_CTRL_REG;
  volatile unsigned short *const ctrl2 = (volatile unsigned short*)PMCR2_CTRL_REG;

  switch(which)
  {
    case 1: // counter 1
      *ctrl1 = PMCR_DISABLE_COUNTER;
      pmcr_enabled &= 0x2; // keep only counter 2's enabled flag
      break;

    case 2: // counter 2
      *ctrl2 = PMCR_DISABLE_COUNTER;
      pmcr_enabled &= 0x1; // keep only counter 1's enabled flag
      break;

    case 3: // Both
      *ctrl1 = PMCR_DISABLE_COUNTER;
      *ctrl2 = PMCR_DISABLE_COUNTER;
      pmcr_enabled = 0;
      break;

    default: // unknown selector: leave everything untouched
      break;
  }
}
#endif

316
GL/perfctr.h Normal file
View File

@ -0,0 +1,316 @@
// ---- perfctr.h - SH7091 Performance Counter Module Header ----
//
// This file is part of the DreamHAL project, a hardware abstraction library
// primarily intended for use on the SH7091 found in hardware such as the SEGA
// Dreamcast game console.
//
// The performance counter module is hereby released into the public domain in
// the hope that it may prove useful. Now go profile some code and hit 60 fps! :)
//
// --Moopthehedgehog
//
#ifndef __PERFCTR_H__
#define __PERFCTR_H__
//
// -- General SH4 Performance Counter Notes --
//
// There are 2 performance counters that can measure elapsed time. They are each
// 48-bit counters. They are part of the so-called "ASE" subsystem, which you can
// read about in chapter 13 of the "SuperH™ (SH) 32-bit RISC series SH-4, ST40
// system architecture, volume 1: system":
// https://www.st.com/content/ccc/resource/technical/document/user_manual/36/75/05/ac/e8/7e/42/2d/CD00147163.pdf/files/CD00147163.pdf/jcr:content/translations/en.CD00147163.pdf
//
// They can count cycles, so that's 199.5MHz (not 200MHz!!) a.k.a. roughly 5 ns
// increments. At 5 ns increments, a 48-bit cycle counter can run continuously
// for 16.33 days. It's actually 16 days, 7 hours, 55 minutes, and 2 seconds,
// depending on how close the bus clock is to 99.75MHz. There is also a second
// mode that counts cycles according to a ratio between the CPU frequency and
// the system bus clock, and it increments the counter by 12 every bus cycle.
// This second mode is detailed in the description for PMCR_CLOCK_TYPE in this
// file, and it is recommended for use when the CPU frequency is not a runtime
// constant.
//
// Side note: The counters don't have an overflow interrupt or overflow bit.
// (I did actually run one to 48-bit overflow in elapsed time mode using the
// ratio method to check this. They don't appear to sign-extend the upper 16
// bits in elapsed time mode, either.)
//
// The two counters are functionally identical. I would recommend using the
// PMCR_Init() function to start one (or both) up the first time.
//
// -- Configuration Address Info --
//
// Addresses for these counters can be easily seen here, in lxdream's source code:
// https://github.com/lutris/lxdream/blob/master/src/sh4/sh4mmio.h
//
// They are also on display in the Linux kernel, but at the time of writing appear
// to be set incorrectly (the clock mode at bit 0x100 is never set or cleared,
// for example, so they're at the mercy of whatever the hardware defaults are):
// http://git.lpclinux.com/cgit/linux-2.6.28.2-lpc313x/plain/arch/sh/oprofile/op_model_sh7750.c
// https://github.com/torvalds/linux/blob/master/arch/sh/kernel/cpu/sh4/perf_event.c
// ...It also appears as though they may not be handling bus ratio mode correctly,
// which appears to be the default mode on the Dreamcast in all my tests.
//
// You can also find these addresses by ripping a copy of Virtua Fighter 3 that
// you own for Dreamcast and looking at the raw byte code (or a raw disassembly)
// of its main program binary. It would appear as though they were timing a loop
// with the low half of perf counter 1 in elapsed time mode. Definitely seems
// like a good thing to do when targeting 60fps! Shenmue Disc 4 also uses the
// same configuration, but what's being timed is not as clear.
//
// Another place you can actually find both control addresses 0xFF00008x and all
// data addresses 0xFF10000x is in binaries of ancient, freely available versions
// of CodeScape. Literally all you need to do is open an SH7750-related DLL in a
// hex editor and do a search to find the control register addresses, and the
// data addresses are equally plain to see in any relevant performance profiling
// firmware. There's no effort or decryption required to find them whatsoever;
// all you need is an old trial version and a hex editor.
//
// However, something even better than all of that is if you search for "SH4
// 0xFF000084" (without quotes) online you'll find an old forum where some logs
// were posted of the terminal/command prompt output from some STMicro JTAG tool,
// which not only has the address registers but also clearly characterizes their
// size as 16-bit:
// https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2
//
// -- Event Mode Info --
//
// Specific information on each counter mode can be found in the document titled
// "SuperH™ Family E10A-USB Emulator: Additional Document for Users Manual:
// Supplementary Information on Using the SH7750R Renesas Microcomputer Development Environment System"
// which is available on Renesas's website, in the "Documents" section of the
// E10A-USB product page:
// https://www.renesas.com/us/en/products/software-tools/tools/emulator/e10a-usb.html
// At the time of writing (12/2019), the E10A-USB adapter is still available
// for purchase, and it is priced around $1200 (USD).
//
// Appendix C of the "ST40 Micro Toolset Manual" also has these modes documented:
// https://www.st.com/content/ccc/resource/technical/document/user_manual/c5/98/11/89/50/68/41/66/CD17379953.pdf/files/CD17379953.pdf/jcr:content/translations/en.CD17379953.pdf
//
// See here for the hexadecimal values corresponding to each mode (pg. 370):
// http://www.macmadigan.com/BusaECU/Renesas%20documents/Hitachi_codescape_CS40_light_userguides.pdf
// You can also find the same "Counter Description Table" in user's guide PDFs
// bundled in ancient demo versions of CodeScape 3 from 2000 (e.g.
// CSDemo_272.exe), which can still be found in the Internet Archive.
// http://web.archive.org/web/*/http://codescape.com/dl/CSDemo/*
//
// See here for a support document on Lauterbach's SH2, SH3, and SH4 debugger,
// which contains units for each mode (e.g. which measure time and which just
// count): https://www.lauterbach.com/frames.html?home.html (It's in Downloads
// -> Trace32 Help System -> it's the file called "SH2, SH3 and SH4 Debugger"
// with the filename debugger_sh4.pdf).
//
//
// --- Performance Counter Registers ---
//
// These registers are 16 bits only and configure the performance counters
#define PMCR1_CTRL_REG 0xFF000084
#define PMCR2_CTRL_REG 0xFF000088
// These registers are 32 bits each and hold the high and low parts of each counter
#define PMCTR1H_REG 0xFF100004
#define PMCTR1L_REG 0xFF100008
#define PMCTR2H_REG 0xFF10000C
#define PMCTR2L_REG 0xFF100010
//
// --- Performance Counter Configuration Flags ---
//
// These bits' functions are currently unknown, but they may simply be reserved.
// It's possible that there's a [maybe expired?] patent that details the
// configuration registers, though I haven't been able to find one. Places to
// check would be Google Patents and the Japanese Patent Office--maybe someone
// else can find something?
//
// Some notes:
// Writing 1 to all of these bits reads back as 0, so it looks like they aren't
// config bits. It's possible they are write-only like the stop bit, though,
// or that they're just reserved-write-0-only. It appears that they are always
// written with zeros in software that uses them, so that's confirmed safe to do.
//
// Also, after running counter 1 to overflow, it appears there's no overflow bit
// (maybe the designers thought 48-bits would be so much to count to that they
// didn't bother implementing one?). The upper 16-bits of the counter high
// register are also not sign-extension bits. They may be a hidden config area,
// but probably not because big endian mode would swap the byte order.
#define PMCR_UNKNOWN_BIT_0040 0x0040
#define PMCR_UNKNOWN_BIT_0080 0x0080
#define PMCR_UNKNOWN_BIT_0200 0x0200
#define PMCR_UNKNOWN_BIT_0400 0x0400
#define PMCR_UNKNOWN_BIT_0800 0x0800
#define PMCR_UNKNOWN_BIT_1000 0x1000
// PMCR_MODE_CLEAR_INVERTED just clears the event mode if it's inverted with
// '~', and event modes are listed below.
#define PMCR_MODE_CLEAR_INVERTED 0x003f
// PMCR_CLOCK_TYPE sets the counters to count clock cycles or CPU/bus ratio mode
// cycles (where T = C x B / 24 and T is time, C is count, and B is time
// of one bus cycle). Note: B = 1/99753008 or so, but it may vary, as mine is
// actually 1/99749010-ish; the target frequency is probably meant to be 99.75MHz.
//
// See the ST40 or Renesas SH7750R documents described in the above "Event Mode
// Info" section for more details about that formula.
//
// Set PMCR_CLOCK_TYPE to 0 for CPU cycle counting, where 1 count = 1 cycle, or
// set it to 1 to use the above formula. Renesas documentation recommends using
// the ratio version (set the bit to 1) when user programs alter CPU clock
// frequencies. This header has some definitions later on to help with this.
#define PMCR_CLOCK_TYPE 0x0100
#define PMCR_CLOCK_TYPE_SHIFT 8
// PMCR_STOP_COUNTER is write-only, as it always reads back as 0. It does what
// the name suggests: when this bit is written to, the counter stops. However,
// if written to while the counter is disabled or stopped, the counter's high
// and low registers are reset to 0.
//
// Using PMCR_STOP_COUNTER to stop the counter has the effect of holding the
// data in the data registers while stopped, unlike PMCR_DISABLE_COUNTER, and
// this bit needs to be written to again (e.g. on next start) in order to
// actually clear the counter data for another run. If not explicitly cleared,
// the counter will continue from where it left off before being stopped.
#define PMCR_STOP_COUNTER 0x2000
#define PMCR_RESET_COUNTER_SHIFT 13
// Bits 0xC000 both need to be set to 1 for the counters to actually begin
// counting. I have seen that the Linux kernel actually separates them out into
// two separate labelled bits (PMEN and PMST) for some reason, however they do
// not appear to do anything separately. Perhaps this is a two-bit mode where
// 1-1 is run, 1-0 and 0-1 are ???, and 0-0 is off.
#define PMCR_RUN_COUNTER 0xC000
#define PMCR_RUN_SHIFT 14
// Interestingly, the output here writes 0x6000 to the counter config registers,
// which would be the "PMST" bit and the "RESET" bit:
// https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2
// To disable a counter, just write 0 to its config register. This will not
// reset the counter to 0, as that requires an explicit clear via setting the
// PMCR_STOP_COUNTER bit. What's odd is that a disabled counter's data
// registers read back as all 0, but re-enabling it without a clear will
// continue from the last value before disabling.
#define PMCR_DISABLE_COUNTER 0x0000
// These definitions merely separate out the two PMCR_RUN_COUNTER bits, and
// they are included here for documentation purposes.
// PMST may mean PMCR START. It's consistently used to enable the counter.
// I'm just calling it PMST here for lack of a better name, since this is what
// the Linux kernel and lxdream call it. It could also have something to do with
// a mode specific to STMicroelectronics.
#define PMCR_PMST_BIT 0x4000
#define PMCR_PMST_SHIFT 14
// Likewise PMEN may mean PMCR ENABLE
#define PMCR_PMEN_BIT 0x8000
#define PMCR_PMEN_SHIFT 15
//
// --- Performance Counter Event Code Definitions ---
//
// Interestingly enough, it so happens that the SEGA Dreamcast's CPU seems to
// contain the same performance counter functionality as SH4 debug adapters for
// the SH7750R. Awesome!
//
// MODE DEFINITION VALUE MEASUREMENT TYPE & NOTES
#define PMCR_INIT_NO_MODE 0x00 // None; Just here to be complete
#define PMCR_OPERAND_READ_ACCESS_MODE 0x01 // Quantity; With cache
#define PMCR_OPERAND_WRITE_ACCESS_MODE 0x02 // Quantity; With cache
#define PMCR_UTLB_MISS_MODE 0x03 // Quantity
#define PMCR_OPERAND_CACHE_READ_MISS_MODE 0x04 // Quantity
#define PMCR_OPERAND_CACHE_WRITE_MISS_MODE 0x05 // Quantity
#define PMCR_INSTRUCTION_FETCH_MODE 0x06 // Quantity; With cache
#define PMCR_INSTRUCTION_TLB_MISS_MODE 0x07 // Quantity
#define PMCR_INSTRUCTION_CACHE_MISS_MODE 0x08 // Quantity
#define PMCR_ALL_OPERAND_ACCESS_MODE 0x09 // Quantity
#define PMCR_ALL_INSTRUCTION_FETCH_MODE 0x0a // Quantity
#define PMCR_ON_CHIP_RAM_OPERAND_ACCESS_MODE 0x0b // Quantity
// No 0x0c
#define PMCR_ON_CHIP_IO_ACCESS_MODE 0x0d // Quantity
#define PMCR_OPERAND_ACCESS_MODE 0x0e // Quantity; With cache, counts both reads and writes
#define PMCR_OPERAND_CACHE_MISS_MODE 0x0f // Quantity
#define PMCR_BRANCH_ISSUED_MODE 0x10 // Quantity; Not the same as branch taken!
#define PMCR_BRANCH_TAKEN_MODE 0x11 // Quantity
#define PMCR_SUBROUTINE_ISSUED_MODE 0x12 // Quantity; Issued a BSR, BSRF, JSR, JSR/N
#define PMCR_INSTRUCTION_ISSUED_MODE 0x13 // Quantity
#define PMCR_PARALLEL_INSTRUCTION_ISSUED_MODE 0x14 // Quantity
#define PMCR_FPU_INSTRUCTION_ISSUED_MODE 0x15 // Quantity
#define PMCR_INTERRUPT_COUNTER_MODE 0x16 // Quantity
#define PMCR_NMI_COUNTER_MODE 0x17 // Quantity
#define PMCR_TRAPA_INSTRUCTION_COUNTER_MODE 0x18 // Quantity
#define PMCR_UBC_A_MATCH_MODE 0x19 // Quantity
#define PMCR_UBC_B_MATCH_MODE 0x1a // Quantity
// No 0x1b-0x20
#define PMCR_INSTRUCTION_CACHE_FILL_MODE 0x21 // Cycles
#define PMCR_OPERAND_CACHE_FILL_MODE 0x22 // Cycles
#define PMCR_ELAPSED_TIME_MODE 0x23 // Cycles; For 200MHz CPU: 5ns per count in 1 cycle = 1 count mode, or around 417.715ps per count (increments by 12) in CPU/bus ratio mode
#define PMCR_PIPELINE_FREEZE_BY_ICACHE_MISS_MODE 0x24 // Cycles
#define PMCR_PIPELINE_FREEZE_BY_DCACHE_MISS_MODE 0x25 // Cycles
// No 0x26
#define PMCR_PIPELINE_FREEZE_BY_BRANCH_MODE 0x27 // Cycles
#define PMCR_PIPELINE_FREEZE_BY_CPU_REGISTER_MODE 0x28 // Cycles
#define PMCR_PIPELINE_FREEZE_BY_FPU_MODE 0x29 // Cycles
//
// --- Performance Counter Support Definitions ---
//
// This definition can be passed as the init/enable/restart functions'
// count_type parameter to use the 1 cycle = 1 count mode. This is how the
// counter can be made to run for 16.3 days.
#define PMCR_COUNT_CPU_CYCLES 0
// Likewise this uses the CPU/bus ratio method
#define PMCR_COUNT_RATIO_CYCLES 1
// These definitions are for the enable function and specify whether to reset
// a counter to 0 or to continue from where it left off
#define PMCR_CONTINUE_COUNTER 0
#define PMCR_RESET_COUNTER 1
//
// --- Performance Counter Miscellaneous Definitions ---
//
// For convenience; assume stock bus clock of 99.75MHz
// (Bus clock is the external CPU clock, not the peripheral bus clock)
//
#define PMCR_SH4_CPU_FREQUENCY 199500000
#define PMCR_CPU_CYCLES_MAX_SECONDS 1410902
#define PMCR_SH4_BUS_FREQUENCY 99750000
#define PMCR_SH4_BUS_FREQUENCY_SCALED 2394000000 // 99.75MHz x 24
#define PMCR_BUS_RATIO_MAX_SECONDS 117575
//
// --- Performance Counter Functions ---
//
// See perfctr.c file for more details about each function and some more usage notes.
//
// Note: PMCR_Init() and PMCR_Enable() will do nothing if the perf counter is already running!
//
// Clear counter and enable
void PMCR_Init(int which, unsigned short mode, unsigned char count_type);
// Enable one or both of these "undocumented" performance counters.
// reset_counter is PMCR_RESET_COUNTER to start from 0 or PMCR_CONTINUE_COUNTER
// to continue from where the counter left off.
void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_counter);
// Stop, clear, and re-enable with new mode (or same mode).
// Only acts on a counter that is currently enabled; which == 3 requires both
// counters to be enabled.
void PMCR_Restart(int which, unsigned short mode, unsigned char count_type);
// Read a counter
// out_array is specifically uint32 out_array[2] -- 48-bit value needs a 64-bit storage unit
// out_array[0] receives the low 32 bits and out_array[1] the high 16 bits.
// Both entries read 0 when no counter is enabled; an invalid request yields
// out_array[1] = 0xffff and out_array[0] = 0xffffffff.
void PMCR_Read(int which, volatile unsigned int *out_array);
// Stop counter(s) (without clearing)
// Only acts on a counter that is currently enabled; which == 3 requires both
// counters to be enabled.
void PMCR_Stop(int which);
// Disable counter(s) (without clearing)
// Acts unconditionally for which == 1, 2, or 3 (both).
void PMCR_Disable(int which);
#endif /* __PERFCTR_H__ */

View File

@ -6,6 +6,7 @@
#include "../include/gl.h" #include "../include/gl.h"
#include "../containers/aligned_vector.h" #include "../containers/aligned_vector.h"
#include "../containers/named_array.h" #include "../containers/named_array.h"
#include "cygprofile.h"
extern void* memcpy4 (void *dest, const void *src, size_t count); extern void* memcpy4 (void *dest, const void *src, size_t count);
@ -249,6 +250,11 @@ typedef struct {
GLint size; GLint size;
} AttribPointer; } AttribPointer;
typedef struct {
float xyz[3];
float n[3];
} EyeSpaceData;
GLboolean _glCheckValidEnum(GLint param, GLint* values, const char* func); GLboolean _glCheckValidEnum(GLint param, GLint* values, const char* func);
GLuint* _glGetEnabledAttributes(); GLuint* _glGetEnabledAttributes();
@ -280,7 +286,7 @@ GLuint _glGetMipmapLevelCount(TextureObject* obj);
GLboolean _glIsLightingEnabled(); GLboolean _glIsLightingEnabled();
GLboolean _glIsLightEnabled(GLubyte light); GLboolean _glIsLightEnabled(GLubyte light);
GLboolean _glIsColorMaterialEnabled(); GLboolean _glIsColorMaterialEnabled();
void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour); void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count);
unsigned char _glIsClippingEnabled(); unsigned char _glIsClippingEnabled();
void _glEnableClipping(unsigned char v); void _glEnableClipping(unsigned char v);

View File

@ -6,6 +6,8 @@
#include "profiler.h" #include "profiler.h"
#include "../containers/aligned_vector.h" #include "../containers/aligned_vector.h"
#if PROFILING_COMPILED
#define MAX_PATH 256 #define MAX_PATH 256
typedef struct { typedef struct {
@ -141,3 +143,4 @@ void profiler_print_stats() {
fprintf(stderr, "%-60s%-20f%-20f%" PRIu64 "\n", result->name, (double)avg, (double)ms, result->total_calls); fprintf(stderr, "%-60s%-20f%-20f%" PRIu64 "\n", result->name, (double)avg, (double)ms, result->total_calls);
} }
} }
#endif

View File

@ -7,12 +7,26 @@ typedef struct {
uint64_t start_time_in_us; uint64_t start_time_in_us;
} Profiler; } Profiler;
#define PROFILING_COMPILED 0
#if PROFILING_COMPILED
Profiler* profiler_push(const char* name); Profiler* profiler_push(const char* name);
void profiler_checkpoint(const char* name); void _profiler_checkpoint(const char* name);
void profiler_pop(); void _profiler_pop();
void profiler_print_stats(); void _profiler_print_stats();
void profiler_enable(); void _profiler_enable();
void profiler_disable(); void _profiler_disable();
#else
#define profiler_push(name);
#define profiler_checkpoint(name);
#define profiler_pop();
#define profiler_print_stats();
#define profiler_enable();
#define profiler_disable();
#endif

1448
GL/sh4_math.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -743,11 +743,11 @@ GLint _cleanInternalFormat(GLint internalFormat) {
typedef void (*TextureConversionFunc)(const GLubyte*, GLubyte*); typedef void (*TextureConversionFunc)(const GLubyte*, GLubyte*);
static inline void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) { static INLINE_DEBUG void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) {
*((GLushort*) dest) = (source[3] & 0xF0) << 8 | (source[0] & 0xF0) << 4 | (source[1] & 0xF0) | (source[2] & 0xF0) >> 4; *((GLushort*) dest) = (source[3] & 0xF0) << 8 | (source[0] & 0xF0) << 4 | (source[1] & 0xF0) | (source[2] & 0xF0) >> 4;
} }
static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) { static INLINE_DEBUG void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
/* Noop */ /* Noop */
GLubyte* dst = (GLubyte*) dest; GLubyte* dst = (GLubyte*) dest;
dst[0] = source[0]; dst[0] = source[0];
@ -756,11 +756,11 @@ static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
dst[3] = source[3]; dst[3] = source[3];
} }
static inline void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) { static INLINE_DEBUG void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) {
*((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3); *((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3);
} }
static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) { static INLINE_DEBUG void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
/* Noop */ /* Noop */
GLubyte* dst = (GLubyte*) dest; GLubyte* dst = (GLubyte*) dest;
dst[0] = source[0]; dst[0] = source[0];
@ -769,24 +769,24 @@ static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
dst[3] = 255; dst[3] = 255;
} }
static inline void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) { static INLINE_DEBUG void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) {
*((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3); *((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3);
} }
static inline void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) { static INLINE_DEBUG void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) {
*((GLushort*) dest) = ((source[3] & 0b11111000) << 8); *((GLushort*) dest) = ((source[3] & 0b11111000) << 8);
} }
static inline void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) { static INLINE_DEBUG void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) {
*((GLushort*) dest) = (source[0] & 0b11111000) << 8; *((GLushort*) dest) = (source[0] & 0b11111000) << 8;
} }
static inline void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) { static INLINE_DEBUG void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) {
GLushort* src = (GLushort*) source; GLushort* src = (GLushort*) source;
*((GLushort*) dest) = ((*src & 0x000F) << 12) | *src >> 4; *((GLushort*) dest) = ((*src & 0x000F) << 12) | *src >> 4;
} }
static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) { static INLINE_DEBUG void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
GLushort src = *((GLushort*) source); GLushort src = *((GLushort*) source);
GLubyte* dst = (GLubyte*) dest; GLubyte* dst = (GLubyte*) dest;
@ -796,7 +796,7 @@ static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
dst[3] = ((src & 0x000F)) * 2; dst[3] = ((src & 0x000F)) * 2;
} }
static inline void _i8_to_i8(const GLubyte* source, GLubyte* dest) { static INLINE_DEBUG void _i8_to_i8(const GLubyte* source, GLubyte* dest) {
/* For indexes */ /* For indexes */
GLubyte* dst = (GLubyte*) dest; GLubyte* dst = (GLubyte*) dest;
*dst = *source; *dst = *source;

View File

@ -3,6 +3,8 @@
#include <math.h> #include <math.h>
#include <assert.h> #include <assert.h>
#include <stdio.h> #include <stdio.h>
#include <dc/sq.h>
#include <kos/string.h>
#if defined(__APPLE__) || defined(__WIN32__) #if defined(__APPLE__) || defined(__WIN32__)
/* Linux + Kos define this, OSX does not, so just use malloc there */ /* Linux + Kos define this, OSX does not, so just use malloc there */
@ -25,7 +27,7 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size) {
} }
static inline unsigned int round_to_chunk_size(unsigned int val) { static INLINE_DEBUG unsigned int round_to_chunk_size(unsigned int val) {
const unsigned int n = val; const unsigned int n = val;
const unsigned int m = ALIGNED_VECTOR_CHUNK_SIZE; const unsigned int m = ALIGNED_VECTOR_CHUNK_SIZE;
@ -107,33 +109,12 @@ void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_co
} }
} }
void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
#if 0
if(index >= vector->size){
char msg[60];
sprintf(msg, "Vector OOB: %d %d wanted %d\n", vector->capacity, vector->size, index);
//aligned_vector_resize(vector, index);
assert_msg(index < vector->size, msg);
}
#endif
assert(index < vector->size);
return &vector->data[index * vector->element_size];
}
void* aligned_vector_back(AlignedVector* vector) {
return aligned_vector_at(vector, vector->size - 1);
}
void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) { void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) {
const unsigned int current = vector->size; const unsigned int current = vector->size;
aligned_vector_resize(vector, vector->size + additional_count); aligned_vector_resize(vector, vector->size + additional_count);
return aligned_vector_at(vector, current); return aligned_vector_at(vector, current);
} }
void aligned_vector_clear(AlignedVector* vector) {
vector->size = 0;
}
void aligned_vector_shrink_to_fit(AlignedVector* vector) { void aligned_vector_shrink_to_fit(AlignedVector* vector) {
if(vector->size == 0) { if(vector->size == 0) {
free(vector->data); free(vector->data);

View File

@ -5,6 +5,8 @@
extern "C" { extern "C" {
#endif #endif
#include "../GL/cygprofile.h"
typedef struct { typedef struct {
unsigned int size; unsigned int size;
unsigned int capacity; unsigned int capacity;
@ -18,12 +20,27 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size);
void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count); void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count); void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count);
void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count); void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
void* aligned_vector_at(const AlignedVector* vector, const unsigned int index); INLINE_ALWAYS void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
#if 0
if(index >= vector->size){
char msg[60];
sprintf(msg, "Vector OOB: %d %d wanted %d\n", vector->capacity, vector->size, index);
//aligned_vector_resize(vector, index);
assert_msg(index < vector->size, msg);
}
assert(index < vector->size); /* Check here */
#endif
return &vector->data[index * vector->element_size];
}
void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count); void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count);
void aligned_vector_clear(AlignedVector* vector); INLINE_ALWAYS void aligned_vector_clear(AlignedVector* vector){
vector->size = 0;
}
void aligned_vector_shrink_to_fit(AlignedVector* vector); void aligned_vector_shrink_to_fit(AlignedVector* vector);
void aligned_vector_cleanup(AlignedVector* vector); void aligned_vector_cleanup(AlignedVector* vector);
void* aligned_vector_back(AlignedVector* vector); INLINE_ALWAYS void* aligned_vector_back(AlignedVector* vector){
return aligned_vector_at(vector, vector->size - 1);
}
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -44,13 +44,6 @@ void named_array_init(NamedArray* array, unsigned int element_size, unsigned int
memset(array->elements, 0, element_size * max_elements); memset(array->elements, 0, element_size * max_elements);
} }
char named_array_used(NamedArray* array, unsigned int id) {
unsigned int i = id / 8;
unsigned int j = id % 8;
unsigned char v = array->used_markers[i] & (unsigned char) (1 << j);
return !!(v);
}
void* named_array_alloc(NamedArray* array, unsigned int* new_id) { void* named_array_alloc(NamedArray* array, unsigned int* new_id) {
unsigned int i = 0, j = 0; unsigned int i = 0, j = 0;

View File

@ -5,6 +5,8 @@
extern "C" { extern "C" {
#endif #endif
#include "../GL/cygprofile.h"
typedef struct { typedef struct {
unsigned int element_size; unsigned int element_size;
unsigned int max_element_count; unsigned int max_element_count;
@ -14,7 +16,13 @@ typedef struct {
} NamedArray; } NamedArray;
void named_array_init(NamedArray* array, unsigned int element_size, unsigned int max_elements); void named_array_init(NamedArray* array, unsigned int element_size, unsigned int max_elements);
char named_array_used(NamedArray* array, unsigned int id); INLINE_ALWAYS char named_array_used(NamedArray* array, unsigned int id) {
const unsigned int i = id / 8;
const unsigned int j = id % 8;
unsigned char v = array->used_markers[i] & (unsigned char) (1 << j);
return !!(v);
}
void* named_array_alloc(NamedArray* array, unsigned int* new_id); void* named_array_alloc(NamedArray* array, unsigned int* new_id);
void* named_array_reserve(NamedArray* array, unsigned int id); void* named_array_reserve(NamedArray* array, unsigned int id);