feat: implement proper changes from profiling

- math
- inlining
This commit is contained in:
Hayden Kowalchuk 2020-02-18 11:48:37 -05:00
parent a2dcfcf997
commit 3a4f09bef2
19 changed files with 2572 additions and 227 deletions

View File

@ -1,9 +1,8 @@
#pragma once
#ifndef CONFIG_H
#define CONFIG_H
/* This figure is derived from the needs of Quake 1 */
#define MAX_TEXTURE_COUNT 1088
#endif // CONFIG_H

227
GL/cygprofile.c Normal file
View File

@ -0,0 +1,227 @@
/* Based on the idea from Erich Styger */
/* profiled instrument guided profiling for gldc on hardware */
#include "cygprofile.h"
#include <kos.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include "perfctr.h"
#include "private.h"
#if CYG_FUNC_TRACE_ENABLED
#define _strcat(x, y, z) strncat(x, z, y)
/* Error codes returned by the trace-buffer helpers in this file.
 * Guarded so that an existing definition of the same set is not redefined. */
#ifndef __PE_Error_H
#define __PE_Error_H
#define ERR_OK 0 /* OK */
#define ERR_SPEED 1 /* This device does not work in the active speed mode. */
#define ERR_RANGE 2 /* Parameter out of range. */
#define ERR_VALUE 3 /* Parameter of incorrect value. */
#define ERR_OVERFLOW 4 /* Timer overflow. */
#define ERR_MATH 5 /* Overflow during evaluation. */
#define ERR_ENABLED 6 /* Device is enabled. */
#define ERR_DISABLED 7 /* Device is disabled. */
#define ERR_BUSY 8 /* Device is busy. */
#define ERR_NOTAVAIL 9 /* Requested value or method not available. */
#define ERR_RXEMPTY 10 /* No data in receiver. */
#define ERR_TXFULL 11 /* Transmitter is full. */
#define ERR_BUSOFF 12 /* Bus not available. */
#define ERR_OVERRUN 13 /* Overrun error is detected. */
#define ERR_FRAMING 14 /* Framing error is detected. */
#define ERR_PARITY 15 /* Parity error is detected. */
#define ERR_NOISE 16 /* Noise error is detected. */
#define ERR_IDLE 17 /* Idle error is detected. */
#define ERR_FAULT 18 /* Fault error is detected. */
#define ERR_BREAK 19 /* Break char is received during communication. */
#define ERR_CRC 20 /* CRC error is detected. */
#define ERR_ARBITR 21 /* A node loses arbitration. This error occurs if two nodes start transmission at the same time. */
#define ERR_PROTECT 22 /* Protection error is detected. */
#endif /* __PE_Error_H */
#define CYG_RNG_BUF_NOF_ELEMS (8096 * 4)
/*!< Number of slots in the trace table which is used to record function calls */
#define CYG_THUMB_MASK 0xFFFFFFFF
/*!< Mask applied to addresses before they are printed. With every bit set it
 * strips nothing. NOTE(review): the name suggests it once cleared an LSB
 * "thumb" bit; presumably left as a no-op for this target - confirm. */
/* Hashing function for two uint32_ts; the result is reduced modulo the table
 * size by the caller to pick a slot. */
#define HASH_PAIR(x, y) (((x)*0x1f1f1f1f) ^ (y))
static bool CYG_Enabled = false; /*!< flag which enables/disables tracing */
/*!
 * One slot of the trace table: a pair of addresses plus a hit counter.
 */
typedef struct
{
//bool isEnter; /*!< TRUE for __cyg_profile_func_enter(), FALSE for __cyg_profile_func_exit() */
void *this_fn; /*!< first address of the recorded pair, as handed to CYG_Store() */
void *call_site; /*!< second address of the recorded pair; NULL entries are skipped when dumping */
uint32_t counter; /* hit count for this pair; 0 marks an unused slot (the isEnter-in-top-bit masking is commented out) */
} CYG_RNG_ElementType;
typedef uint32_t CYG_RNG_BufSizeType; /*!< index type for ring buffer */
static CYG_RNG_ElementType CYG_RNG_buffer[CYG_RNG_BUF_NOF_ELEMS]; /*!< trace table, indexed by hash when writing and sequentially when dumping */
//static CYG_RNG_BufSizeType CYG_RNG_inIdx; /*!< input index */
static CYG_RNG_BufSizeType CYG_RNG_outIdx; /*!< next slot CYG_RNG_Get() will read */
static CYG_RNG_BufSizeType CYG_RNG_inSize; /*!< slots still to be read by CYG_RNG_Get(); set to the full table size by CYG_Init()/CYG_Deinit() */
/*!
 * \brief Records one (this_fn, call_site) pair in the trace table.
 *
 * The table is used as an open-addressed hash table: the pair is hashed to a
 * home slot and, if that slot already belongs to a different pair, following
 * slots are probed linearly (wrapping around) until either the same pair or
 * an unused slot (counter == 0) is found. Without the key comparison two
 * different pairs that hash to the same slot would be merged into one count.
 *
 * \param elem Trace element to record; its counter is the initial hit count
 *             stored when the pair is seen for the first time.
 * \return ERR_OK if the pair was stored or its counter incremented,
 *         ERR_TXFULL if every slot is occupied by other pairs.
 */
__attribute__((no_instrument_function)) static uint8_t CYG_RNG_Put(CYG_RNG_ElementType *elem) {
    const uint32_t home = HASH_PAIR((uint32_t)elem->call_site, (uint32_t)elem->this_fn) % CYG_RNG_BUF_NOF_ELEMS;
    uint32_t slot = home;

    do {
        CYG_RNG_ElementType *entry = &CYG_RNG_buffer[slot];

        if (entry->counter == 0) {
            /* Unused slot: claim it for this pair. */
            *entry = *elem;
            return ERR_OK;
        }
        if (entry->this_fn == elem->this_fn && entry->call_site == elem->call_site) {
            /* Same pair seen again: just count the hit. */
            entry->counter++;
            return ERR_OK;
        }

        /* Slot belongs to another pair: try the next one, wrapping at the end. */
        slot++;
        if (slot == CYG_RNG_BUF_NOF_ELEMS) {
            slot = 0;
        }
    } while (slot != home);

    /* Probed the whole table without finding room. */
    return ERR_TXFULL;
}
/*!
 * \brief Reads the next slot of the trace table in index order.
 * \param elemP Destination for the copied slot; left untouched on failure.
 * \return ERR_OK if a slot was copied, ERR_RXEMPTY if no slots remain.
 */
__attribute__((no_instrument_function)) static uint8_t CYG_RNG_Get(CYG_RNG_ElementType *elemP) {
    if (CYG_RNG_inSize == 0) {
        return ERR_RXEMPTY;
    }

    *elemP = CYG_RNG_buffer[CYG_RNG_outIdx];
    CYG_RNG_inSize--;

    /* Advance the read position, wrapping back to the first slot at the end. */
    if (CYG_RNG_outIdx + 1 == CYG_RNG_BUF_NOF_ELEMS) {
        CYG_RNG_outIdx = 0;
    } else {
        CYG_RNG_outIdx = CYG_RNG_outIdx + 1;
    }
    return ERR_OK;
}
static uint32_t currentTime[2];
static uint32_t lastTime;
/*!
 * \brief Samples performance counter 1 and records one trace element.
 *
 * The previous low counter word is kept in lastTime before currentTime is
 * refreshed; the element itself is stored with a hit count of 1.
 * \param this_fn Value stored in the element's this_fn field.
 * \param call_site Value stored in the element's call_site field.
 */
__attribute__((no_instrument_function)) static void CYG_Store(void *this_fn, void *call_site) {
    CYG_RNG_ElementType record;

    /* Remember the last sample, then read the counter again. */
    lastTime = currentTime[0];
    PMCR_Read(1, (unsigned int *)currentTime);

    record.this_fn = this_fn;
    record.call_site = call_site;
    record.counter = 1; /* a cycle delta (currentTime[0] - lastTime) is not recorded */

    CYG_RNG_Put(&record);
}
/*!
 * \brief Hook called on function entry. The call is inserted by the compiler.
 * \param this_fn Address of the function being entered.
 * \param call_site Address in the caller from which this_fn was called.
 */
__attribute__((no_instrument_function)) void __cyg_profile_func_enter(void *this_fn, void *call_site) {
    if (!CYG_Enabled) {
        return;
    }
    /* NOTE(review): the arguments are passed as (call_site, this_fn) although
     * CYG_Store() declares (this_fn, call_site), so the stored fields are
     * swapped. Kept as-is because the trace consumer may rely on the resulting
     * column order - confirm before changing. */
    CYG_Store(call_site, this_fn);
}
/*!
 * \brief Hook called on function exit. The call is inserted by the compiler.
 *
 * Intentionally empty: only function entries are recorded by this profiler.
 * \param this_fn Address of the function being left (unused).
 * \param call_site Address in the caller from which this_fn was called (unused).
 */
__attribute__((no_instrument_function)) void __cyg_profile_func_exit(__attribute__((unused)) void *this_fn, __attribute__((unused)) void *call_site) {
}
/*!
 * \brief Stops tracing and dumps the trace table to the console.
 *
 * The first line printed is the offset of _etext from BASE_ADDRESS. It is
 * followed by one line per used slot in the form
 * "{ 0x<this_fn> 0x<call_site> <counter>". Slots whose call_site is NULL are
 * skipped. Reading consumes the table: CYG_RNG_Get() counts the remaining
 * slots down, so a second call prints no entries until CYG_Init() is called
 * again.
 */
__attribute__((no_instrument_function)) void CYG_PrintCallTrace(void) {
    CYG_RNG_BufSizeType i;
    CYG_RNG_ElementType elem;

    CYG_Enabled = false;
    printf("0x%08x\n", ((unsigned int)&_etext) - BASE_ADDRESS);

    CYG_RNG_outIdx = 0;
    for (i = 0; i < CYG_RNG_BUF_NOF_ELEMS; i++) {
        if (CYG_RNG_Get(&elem) != ERR_OK) {
            /* Once the reader reports empty it stays empty. */
            break;
        }
        if (elem.call_site == NULL) {
            continue;
        }
        /* Print with a literal format string instead of formatting into a
         * buffer and passing that buffer to printf() as the format. */
        printf("{ 0x%" PRIXPTR " 0x%" PRIXPTR " %u\r\n",
               (uintptr_t)(elem.this_fn) & CYG_THUMB_MASK,
               (uintptr_t)(elem.call_site) & CYG_THUMB_MASK,
               (unsigned int)elem.counter);
    }
}
/*!
 * \brief Clears the trace table, requests performance counter 1 in
 *        elapsed-time / CPU-cycle mode and enables tracing.
 *
 * Does nothing when tracing is already enabled.
 */
__attribute__((no_instrument_function)) void CYG_Init(void) {
    if (CYG_Enabled) {
        return;
    }

    CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS;
    CYG_RNG_outIdx = 0;
    currentTime[0] = currentTime[1] = 0;
    lastTime = 0;
    memset(CYG_RNG_buffer, 0, sizeof(CYG_RNG_buffer));
    PMCR_Init(1, PMCR_ELAPSED_TIME_MODE, PMCR_COUNT_CPU_CYCLES);

    /* Enable last: the entry hook must not record into a table that is still
     * being cleared, nor sample a counter that has not been started yet. */
    CYG_Enabled = true;
}
/*!
 * \brief Disables tracing and wipes the trace table and its read state.
 */
__attribute__((no_instrument_function)) void CYG_Deinit(void) {
    /* Rewind the reader to a full, untouched table. */
    CYG_RNG_outIdx = 0;
    CYG_RNG_inSize = CYG_RNG_BUF_NOF_ELEMS;

    /* Stop recording before the table contents are discarded. */
    CYG_Enabled = false;
    memset(CYG_RNG_buffer, 0, sizeof CYG_RNG_buffer);
}
#else
/* Tracing is compiled out: keep the public entry points as no-ops so callers
 * still link. */
void CYG_PrintCallTrace(void)
{
}

void CYG_Init(void)
{
}

void CYG_Deinit(void)
{
}
#endif

33
GL/cygprofile.h Normal file
View File

@ -0,0 +1,33 @@
#pragma once
#ifndef CYGPROFILE_H_
#define CYGPROFILE_H_
/* Based on the idea from Erich Styger */
/* profiled instrument guided profiling for gldc on hardware */
/* Function qualifier: inline, and excluded from the compiler-inserted
 * __cyg_profile_func_enter/exit hooks. */
#define NO_INSTRUMENT inline __attribute__((no_instrument_function))
/* NO_INSTRUMENT plus forced inlining; does not add `static` itself. */
#define INLINE_DEBUG NO_INSTRUMENT __attribute__((always_inline))
/* Same as INLINE_DEBUG but with internal linkage. */
#define INLINE_ALWAYS static NO_INSTRUMENT __attribute__((always_inline))
/* Symbol whose address is printed relative to BASE_ADDRESS at the top of the
 * trace dump. NOTE(review): presumably the linker-provided end of the text
 * section - confirm against the link script. */
extern char _etext;
/* Address subtracted from &_etext in the trace dump. NOTE(review): presumably
 * the load address of the executable - confirm. */
#define BASE_ADDRESS 0x8c010000
#define CYG_FUNC_TRACE_ENABLED (1)
/*!< 1: Trace enabled, 0: trace disabled */
/*!
* \brief Print the call trace to the terminal.
*/
void CYG_PrintCallTrace(void);
/*!
* \brief Driver Initialization.
*/
void CYG_Init(void);
/*!
* \brief Driver De-Initialization.
*/
void CYG_Deinit(void);
#endif /* CYGPROFILE_H_ */

View File

@ -56,7 +56,7 @@ void _glInitAttributePointers() {
NORMAL_POINTER.size = 3;
}
static inline GLuint byte_size(GLenum type) {
static INLINE_DEBUG GLuint byte_size(GLenum type) {
switch(type) {
case GL_BYTE: return sizeof(GLbyte);
case GL_UNSIGNED_BYTE: return sizeof(GLubyte);
@ -513,7 +513,7 @@ PVRHeader* _glSubmissionTargetHeader(SubmissionTarget* target) {
return aligned_vector_at(&target->output->vector, target->header_offset);
}
Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
INLINE_DEBUG Vertex* _glSubmissionTargetStart(SubmissionTarget* target) {
assert(target->start_offset < target->output->vector.size);
return aligned_vector_at(&target->output->vector, target->start_offset);
}
@ -1006,6 +1006,7 @@ static void mat_transform_normal3(const float* xyz, const float* xyzOut, const u
static void light(SubmissionTarget* target) {
#if 0
typedef struct {
float xyz[3];
float n[3];
@ -1057,6 +1058,35 @@ static void light(SubmissionTarget* target) {
vertex->bgra[G8IDX] = (GLubyte) (255.0f * fminf(total[1], 1.0f));
vertex->bgra[B8IDX] = (GLubyte) (255.0f * fminf(total[2], 1.0f));
}
#endif
if(!_glIsLightingEnabled()) {
return;
}
static AlignedVector* eye_space_data = NULL;
if(!eye_space_data) {
eye_space_data = (AlignedVector*) malloc(sizeof(AlignedVector));
aligned_vector_init(eye_space_data, sizeof(EyeSpaceData));
}
aligned_vector_resize(eye_space_data, target->count);
/* Perform lighting calculations and manipulate the colour */
Vertex* vertex = _glSubmissionTargetStart(target);
VertexExtra* extra = aligned_vector_at(target->extras, 0);
EyeSpaceData* eye_space = (EyeSpaceData*) eye_space_data->data;
_glMatrixLoadModelView();
mat_transform3(vertex->xyz, eye_space->xyz, target->count, sizeof(Vertex), sizeof(EyeSpaceData));
_glMatrixLoadNormal();
mat_transform_normal3(extra->nxyz, eye_space->n, target->count, sizeof(VertexExtra), sizeof(EyeSpaceData));
EyeSpaceData* ES = aligned_vector_at(eye_space_data, 0);
_glPerformLighting(vertex, ES, target->count);
}
static void divide(SubmissionTarget* target) {

View File

@ -1,5 +1,6 @@
#include <stdio.h>
#include "private.h"
#include "config.h"
#include "../include/glkos.h"
#include "../include/glext.h"
@ -94,62 +95,62 @@ void APIENTRY glFramebufferTexture2DEXT(GLenum target, GLenum attachment, GLenum
ACTIVE_FRAMEBUFFER->texture_id = texture;
}
static inline GLuint A1555(GLuint v) {
static INLINE_DEBUG GLuint A1555(GLuint v) {
const GLuint MASK = (1 << 15);
return (v & MASK) >> 15;
}
static inline GLuint R1555(GLuint v) {
static INLINE_DEBUG GLuint R1555(GLuint v) {
const GLuint MASK = (31 << 10);
return (v & MASK) >> 10;
}
static inline GLuint G1555(GLuint v) {
static INLINE_DEBUG GLuint G1555(GLuint v) {
const GLuint MASK = (31 << 5);
return (v & MASK) >> 5;
}
static inline GLuint B1555(GLuint v) {
static INLINE_DEBUG GLuint B1555(GLuint v) {
const GLuint MASK = (31 << 0);
return (v & MASK) >> 0;
}
static inline GLuint A4444(GLuint v) {
static INLINE_DEBUG GLuint A4444(GLuint v) {
const GLuint MASK = (0xF << 12);
return (v & MASK) >> 12;
}
static inline GLuint R4444(GLuint v) {
static INLINE_DEBUG GLuint R4444(GLuint v) {
const GLuint MASK = (0xF << 8);
return (v & MASK) >> 8;
}
static inline GLuint G4444(GLuint v) {
static INLINE_DEBUG GLuint G4444(GLuint v) {
const GLuint MASK = (0xF << 4);
return (v & MASK) >> 4;
}
static inline GLuint B4444(GLuint v) {
static INLINE_DEBUG GLuint B4444(GLuint v) {
const GLuint MASK = (0xF << 0);
return (v & MASK) >> 0;
}
static inline GLuint R565(GLuint v) {
static INLINE_DEBUG GLuint R565(GLuint v) {
const GLuint MASK = (31 << 11);
return (v & MASK) >> 11;
}
static inline GLuint G565(GLuint v) {
static INLINE_DEBUG GLuint G565(GLuint v) {
const GLuint MASK = (63 << 5);
return (v & MASK) >> 5;
}
static inline GLuint B565(GLuint v) {
static INLINE_DEBUG GLuint B565(GLuint v) {
const GLuint MASK = (31 << 0);
return (v & MASK) >> 0;
}
GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) {
static NO_INSTRUMENT GLboolean _glCalculateAverageTexel(const GLubyte* src, const GLuint srcWidth, const GLuint pvrFormat, GLubyte* dest) {
GLushort* s1 = ((GLushort*) src);
GLushort* s2 = ((GLushort*) src) + 1;
GLushort* s3 = ((GLushort*) src) + srcWidth;

View File

@ -19,3 +19,7 @@
#include "matrix.c"
#include "state.c"
#include "texture.c"
#include "../containers/stack.c"
#include "../containers/aligned_vector.c"
#include "../containers/named_array.c"

View File

@ -281,98 +281,143 @@ static inline float FPOW(float b, float p) {
return FEXP(FLOG(b) * p);
}
void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) __attribute__((optimize("fast-math")));
void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour) {
LightSource* l = &LIGHTS[light];
struct vec3f L = {
l->position[0],
l->position[1],
l->position[2]
};
if(!l->is_directional) {
L.x -= pos[0];
L.y -= pos[1];
L.z -= pos[2];
}
struct vec3f N = {
normal[0],
normal[1],
normal[2]
};
struct vec3f V = {
pos[0],
pos[1],
pos[2]
};
GLfloat d;
vec3f_length(L.x, L.y, L.z, d);
GLfloat oneOverL = 1.0f / d;
L.x *= oneOverL;
L.y *= oneOverL;
L.z *= oneOverL;
vec3f_normalize(V.x, V.y, V.z);
GLfloat NdotL, VdotN;
vec3f_dot(N.x, N.y, N.z, L.x, L.y, L.z, NdotL);
vec3f_dot(V.x, V.y, V.z, N.x, N.y, N.z, VdotN);
GLfloat VdotR = VdotN - NdotL;
GLfloat specularPower = FPOW(VdotR > 0 ? VdotR : 0, MATERIAL.exponent);
GLboolean colorMaterial = _glIsColorMaterialEnabled();
GLfloat mD [] = {
(colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.diffuse[0],
(colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.diffuse[1],
(colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.diffuse[2],
(colorMaterial && isDiffuseColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.diffuse[3]
};
GLfloat mA [] = {
(colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.ambient[0],
(colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.ambient[1],
(colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.ambient[2],
(colorMaterial && isAmbientColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.ambient[3]
};
GLfloat mS [] = {
(colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[R8IDX]) / 255.0f : MATERIAL.specular[0],
(colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[G8IDX]) / 255.0f : MATERIAL.specular[1],
(colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[B8IDX]) / 255.0f : MATERIAL.specular[2],
(colorMaterial && isSpecularColorMaterial()) ? ((GLfloat)bgra[A8IDX]) / 255.0f : MATERIAL.specular[3]
};
colour[0] = l->ambient[0] * mA[0];
colour[1] = l->ambient[1] * mA[1];
colour[2] = l->ambient[2] * mA[2];
colour[3] = mD[3];
if(NdotL >= 0) {
colour[0] += (l->diffuse[0] * mD[0] * NdotL + l->specular[0] * mS[0] * specularPower);
colour[1] += (l->diffuse[1] * mD[1] * NdotL + l->specular[1] * mS[1] * specularPower);
colour[2] += (l->diffuse[2] * mD[2] * NdotL + l->specular[2] * mS[2] * specularPower);
}
if(!l->is_directional) {
GLfloat att = (
1.0f / (l->constant_attenuation + (l->linear_attenuation * d) + (l->quadratic_attenuation * d * d))
);
colour[0] *= att;
colour[1] *= att;
colour[2] *= att;
}
if(colour[0] > 1.0f) colour[0] = 1.0f;
if(colour[1] > 1.0f) colour[1] = 1.0f;
if(colour[2] > 1.0f) colour[2] = 1.0f;
if(colour[3] > 1.0f) colour[3] = 1.0f;
/* Adds the current light's contribution for colour channel C to final[C].
 *
 * Must be expanded in a scope that provides: MA, MD, MS (material colour
 * arrays), light (current LightSource), MATERIAL, LdotN, NdotH, att, spot and
 * final. The sum of the ambient, diffuse and specular terms is scaled by att
 * and spot before being accumulated. fi is 0 when LdotN == 0, which forces
 * the base passed to FPOW for the specular term to 0. */
#define LIGHT_COMPONENT(C) { \
const GLfloat* acm = &MA[C]; \
const GLfloat* dcm = &MD[C]; \
const GLfloat* scm = &MS[C]; \
const GLfloat* scli = &light->specular[C]; \
const GLfloat* dcli = &light->diffuse[C]; \
const GLfloat* acli = &light->ambient[C]; \
const GLfloat* srm = &MATERIAL.exponent; \
const GLfloat fi = (LdotN == 0) ? 0 : 1; \
GLfloat component = (*acm * *acli); \
component += (LdotN * *dcm * *dcli); \
component += (FPOW((fi * NdotH), *srm) * *scm * *scli); \
component *= att; \
component *= spot; \
final[C] += component; \
}
/* Dot product of (x1, y1, z1) and (x2, y2, z2), with negative results
 * replaced by 0. Components are passed by pointer. */
static inline float vec3_dot_limited(
        const float* x1, const float* y1, const float* z1,
        const float* x2, const float* y2, const float* z2) {
    float dot;

    vec3f_dot(*x1, *y1, *z1, *x2, *y2, *z2, dot);
    if(dot < 0) {
        return 0;
    }
    return dot;
}
/* Applies per-vertex lighting in place.
 *
 * For each of the `count` vertices the colour is rebuilt from the scene
 * ambient and material emissive terms plus the contribution of every enabled
 * light, then clamped and written back into vertex->bgra.
 *
 * vertices: array of `count` vertices; bgra is read when colour material is
 *           enabled and is always overwritten.
 * es:       array of `count` entries giving, per vertex, the position (xyz)
 *           and normal (n) used for the lighting maths. NOTE(review):
 *           presumably eye-space values, as the type name suggests - confirm
 *           against the caller.
 * count:    number of entries in both arrays. */
void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count) {
int8_t i;
int32_t j;
const LightSource* light = NULL;
/* Colour-material state is sampled once, outside the vertex loop */
const GLboolean colorMaterial = _glIsColorMaterialEnabled();
const GLboolean isDiffuseCM = isDiffuseColorMaterial();
const GLboolean isAmbientCM = isAmbientColorMaterial();
const GLboolean isSpecularCM = isSpecularColorMaterial();
/* Scratch for the vertex colour converted to floats; static storage, so it
 * is shared between calls */
static GLfloat CM[4];
/* So the DC has 16 floating point registers, that means
* we need to limit the number of floats as much as possible
* to give the compiler a good enough chance to do the right
* thing */
Vertex* vertex = vertices;
const EyeSpaceData* data = es;
static const float ONE_OVER_255 = 1.0f / 255.0f;
for(j = 0; j < count; ++j, ++vertex, ++data) {
/* When GL_COLOR_MATERIAL is on, we need to pull out
* the passed in diffuse and use it */
const GLfloat* MD = MATERIAL.diffuse;
const GLfloat* MA = MATERIAL.ambient;
const GLfloat* MS = MATERIAL.specular;
if(colorMaterial) {
CM[0] = ((GLfloat) vertex->bgra[R8IDX]) * ONE_OVER_255;
CM[1] = ((GLfloat) vertex->bgra[G8IDX]) * ONE_OVER_255;
CM[2] = ((GLfloat) vertex->bgra[B8IDX]) * ONE_OVER_255;
CM[3] = ((GLfloat) vertex->bgra[A8IDX]) * ONE_OVER_255;
MD = (isDiffuseCM) ? CM : MATERIAL.diffuse;
MA = (isAmbientCM) ? CM : MATERIAL.ambient;
MS = (isSpecularCM) ? CM : MATERIAL.specular;
}
float final[4];
/* Initial, non-light related values */
final[0] = (SCENE_AMBIENT[0] * MA[0]) + MATERIAL.emissive[0];
final[1] = (SCENE_AMBIENT[1] * MA[1]) + MATERIAL.emissive[1];
final[2] = (SCENE_AMBIENT[2] * MA[2]) + MATERIAL.emissive[2];
/* Alpha comes straight from the diffuse colour; lights do not touch it */
final[3] = MD[3];
/* V: normalised vector from the vertex position towards the origin */
float Vx, Vy, Vz;
Vx = -data->xyz[0];
Vy = -data->xyz[1];
Vz = -data->xyz[2];
vec3f_normalize(Vx, Vy, Vz);
for(i = 0; i < MAX_LIGHTS; ++i) {
if(!_glIsLightEnabled(i)) continue;
/* Calc light specific parameters */
light = &LIGHTS[i];
float Lx, Ly, Lz, D;
float Hx, Hy, Hz;
const float* Nx = &data->n[0];
const float* Ny = &data->n[1];
const float* Nz = &data->n[2];
/* L: vector from the vertex to the light position; D is its length.
 * NOTE(review): the vertex position is subtracted even when
 * light->position[3] == 0.0f (the case that disables attenuation below) -
 * confirm this is intended for directional lights. */
Lx = light->position[0] - data->xyz[0];
Ly = light->position[1] - data->xyz[1];
Lz = light->position[2] - data->xyz[2];
vec3f_length(Lx, Ly, Lz, D);
{
/* Normalize L - scoping ensures Llen is temporary */
const float Llen = 1.0f / D;
Lx *= Llen;
Ly *= Llen;
Lz *= Llen;
}
/* H: normalised sum of L and V (half vector) */
Hx = (Lx + Vx);
Hy = (Ly + Vy);
Hz = (Lz + Vz);
vec3f_normalize(Hx, Hy, Hz);
/* Both dot products are clamped to be non-negative */
const float LdotN = vec3_dot_limited(
&Lx, &Ly, &Lz,
Nx, Ny, Nz
);
const float NdotH = vec3_dot_limited(
Nx, Ny, Nz,
&Hx, &Hy, &Hz
);
/* No attenuation when position[3] is 0; otherwise the usual
 * 1 / (constant + linear * D + quadratic * D^2) */
const float att = (
light->position[3] == 0.0f) ? 1.0f :
1.0f / (light->constant_attenuation + (light->linear_attenuation * D) + (light->quadratic_attenuation * D * D)
);
/* Spot factor is fixed at 1 here, i.e. no spotlight falloff is applied */
const float spot = 1.0f;
LIGHT_COMPONENT(0);
LIGHT_COMPONENT(1);
LIGHT_COMPONENT(2);
}
/* Scale to 0..255 with an upper clamp and store back into the vertex */
vertex->bgra[R8IDX] = (GLubyte)(fminf(final[0] * 255.0f, 255.0f));
vertex->bgra[G8IDX] = (GLubyte)(fminf(final[1] * 255.0f, 255.0f));
vertex->bgra[B8IDX] = (GLubyte)(fminf(final[2] * 255.0f, 255.0f));
vertex->bgra[A8IDX] = (GLubyte)(fminf(final[3] * 255.0f, 255.0f));
}
}

View File

@ -476,84 +476,57 @@ void APIENTRY glDepthRange(GLclampf n, GLclampf f) {
DEPTH_RANGE_MULTIPLIER_H = (n + f) / 2.0f;
}
#include "sh4_math.h"
/* Vector Cross Product - Used by glhLookAtf2 */
static inline void vec3f_cross(const GLfloat* v1, const GLfloat* v2, GLfloat* result) {
result[0] = v1[1] * v2[2] - v1[2] * v2[1];
result[1] = v1[2] * v2[0] - v1[0] * v2[2];
result[2] = v1[0] * v2[1] - v1[1] * v2[0];
static inline void vec3f_cross(GLfloat* v1, GLfloat* v2, GLfloat* result) {
result[0] = (v1[1] * v2[2]) - (v1[2] * v2[1]);
result[1] = (v1[2] * v2[0]) - (v1[0] * v2[2]);
result[2] = (v1[0] * v2[1]) - (v1[1] * v2[0]);
}
/* glhLookAtf2 adapted from http://www.opengl.org/wiki/GluLookAt_code */
void glhLookAtf2(const GLfloat* eyePosition3D,
const GLfloat* center3D,
const GLfloat* upVector3D) {
/* Look-At Matrix */
static Matrix4x4 MatrixLookAt __attribute__((aligned(32))) = {
1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f,
0.0f, 0.0f, 0.0f, 1.0f
};
static inline void vec3f_normalize_sh4(float *v){
float length, ilength;
GLfloat forward[3];
GLfloat side[3];
GLfloat up[3];
vec3f_sub_normalize(center3D[0], center3D[1], center3D[2],
eyePosition3D[0], eyePosition3D[1], eyePosition3D[2],
forward[0], forward[1], forward[2]);
//Side = forward x up
vec3f_cross(forward, upVector3D, side);
vec3f_normalize(side[0], side[1], side[2]);
//Recompute up as: up = side x forward
vec3f_cross(side, forward, up);
MatrixLookAt[M0] = side[0];
MatrixLookAt[M4] = side[1];
MatrixLookAt[M8] = side[2];
MatrixLookAt[M12] = 0;
MatrixLookAt[M1] = up[0];
MatrixLookAt[M5] = up[1];
MatrixLookAt[M9] = up[2];
MatrixLookAt[M13] = 0;
MatrixLookAt[M2] = -forward[0];
MatrixLookAt[M6] = -forward[1];
MatrixLookAt[M10] = -forward[2];
MatrixLookAt[M14] = 0;
MatrixLookAt[M3] = MatrixLookAt[11] = MatrixLookAt[15] = 0;
MatrixLookAt[M15] = 1;
static Matrix4x4 trn __attribute__((aligned(32))) = {
1.0f, 0.0f, 0.0f, 0.0f,
0.0f, 1.0f, 0.0f, 0.0f,
0.0f, 0.0f, 1.0f, 0.0f,
0.0f, 0.0f, 0.0f, 1.0f
};
trn[M12] = -eyePosition3D[0];
trn[M13] = -eyePosition3D[1];
trn[M14] = -eyePosition3D[2];
// Does not modify internal Modelview matrix
upload_matrix(&MatrixLookAt);
multiply_matrix(&trn);
multiply_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)));
download_matrix(stack_top(MATRIX_STACKS + (GL_MODELVIEW & 0xF)));
ilength = MATH_fsrra(v[0]*v[0] + v[1]*v[1] + v[2]*v[2]);
length = MATH_Invert(ilength);
if (length)
{
v[0] *= ilength;
v[1] *= ilength;
v[2] *= ilength;
}
}
void gluLookAt(GLfloat eyex, GLfloat eyey, GLfloat eyez, GLfloat centerx,
GLfloat centery, GLfloat centerz, GLfloat upx, GLfloat upy,
GLfloat upz) {
GLfloat eye [] = { eyex, eyey, eyez };
GLfloat point [] = { centerx, centery, centerz };
GLfloat up [] = { upx, upy, upz };
glhLookAtf2(eye, point, up);
GLfloat m [16];
GLfloat f [3];
GLfloat u [3];
GLfloat s [3];
f[0] = centerx - eyex;
f[1] = centery - eyey;
f[2] = centerz - eyez;
u[0] = upx;
u[1] = upy;
u[2] = upz;
vec3f_normalize_sh4(f);
vec3f_cross(f, u, s);
vec3f_normalize_sh4(s);
vec3f_cross(s, f, u);
m[0] = s[0]; m[4] = s[1]; m[8] = s[2]; m[12] = 0.0f;
m[1] = u[0]; m[5] = u[1]; m[9] = u[2]; m[13] = 0.0f;
m[2] = -f[0]; m[6] = -f[1]; m[10] = -f[2]; m[14] = 0.0f;
m[3] = 0.0f; m[7] = 0.0f; m[11] = 0.0f; m[15] = 1.0f;
glMultMatrixf(m);
glTranslatef(-eyex, -eyey, -eyez);
}
void _glApplyRenderMatrix() {

247
GL/perfctr.c Normal file
View File

@ -0,0 +1,247 @@
// ---- perfctr.c - SH7091 Performance Counter Module Code ----
//
// This file is part of the DreamHAL project, a hardware abstraction library
// primarily intended for use on the SH7091 found in hardware such as the SEGA
// Dreamcast game console.
//
// The performance counter module is hereby released into the public domain in
// the hope that it may prove useful. Now go profile some code and hit 60 fps! :)
//
// --Moopthehedgehog
// See perfctr.h for more of my notes and documentation on these counters.
#include "perfctr.h"
#include "cygprofile.h"
#if CYG_FUNC_TRACE_ENABLED
// Which counters this module has started: bit 0 = counter 1, bit 1 = counter 2.
static unsigned char pmcr_enabled = 0;
//
// First-time start of a performance counter: clear it, then enable it.
// Clearing only takes effect while a counter is disabled, and disabling and
// re-enabling does not reset it, so the reset is requested as part of the
// enable. A counter can also be disabled and re-enabled in a different mode
// without clearing, in which case it continues from where it left off.
//
// which: 1 or 2 for a single counter, 3 for both. Does nothing if the
// requested counter is already running (for 3: if either one is running).
//
__attribute__((no_instrument_function)) void PMCR_Init(int which, unsigned short mode, unsigned char count_type)
{
    int startable;

    // A counter may only be started when it is not already running
    switch(which)
    {
        case 1:
            startable = (!pmcr_enabled) || (pmcr_enabled == 2);
            break;
        case 2:
            startable = (!pmcr_enabled) || (pmcr_enabled == 1);
            break;
        case 3:
            startable = !pmcr_enabled;
            break;
        default:
            startable = 0;
            break;
    }

    if(startable)
    {
        PMCR_Enable(which, mode, count_type, PMCR_RESET_COUNTER);
    }
}
// Enable the (formerly undocumented) performance counters.
// which: 1 or 2 for a single counter, 3 for both.
// count_type and reset_count must each be 0 or 1, otherwise nothing happens.
// Does nothing if the requested counter is already running.
__attribute__((no_instrument_function)) void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_count)
{
    // Reject out-of-range flags before touching any register
    if((count_type | reset_count) > 1)
    {
        return;
    }

    // Control word: run bit, optional reset, clock type and counting mode
    const unsigned short pmcr_ctrl = PMCR_RUN_COUNTER | (reset_count << PMCR_RESET_COUNTER_SHIFT) | (count_type << PMCR_CLOCK_TYPE_SHIFT) | mode;

    switch(which)
    {
        case 1: // counter 1
            if((!pmcr_enabled) || (pmcr_enabled == 2))
            {
                *((volatile unsigned short*)PMCR1_CTRL_REG) = pmcr_ctrl;
                pmcr_enabled += 1;
            }
            break;

        case 2: // counter 2
            if((!pmcr_enabled) || (pmcr_enabled == 1))
            {
                *((volatile unsigned short*)PMCR2_CTRL_REG) = pmcr_ctrl;
                pmcr_enabled += 2;
            }
            break;

        case 3: // both, only when neither is running
            if(!pmcr_enabled)
            {
                *((volatile unsigned short*)PMCR1_CTRL_REG) = pmcr_ctrl;
                *((volatile unsigned short*)PMCR2_CTRL_REG) = pmcr_ctrl;
                pmcr_enabled = 3;
            }
            break;

        default:
            break;
    }
}
// For reference:
// #define PMCTR1H_REG 0xFF100004
// #define PMCTR1L_REG 0xFF100008
// #define PMCTR2H_REG 0xFF10000C
// #define PMCTR2L_REG 0xFF100010
// The register addresses are kept in memory so the inline assembly below can
// load them with mov.l.
static const unsigned int pmcr1_regh = PMCTR1H_REG;
static const unsigned int pmcr1_regl = PMCTR1L_REG;
static const unsigned int pmcr2_regh = PMCTR2H_REG;
static const unsigned int pmcr2_regl = PMCTR2L_REG;
// Sorry, can only read one counter at a time!
// out_array should be an array consisting of 2x unsigned ints.
//
// which: 1 or 2 selects the counter to read.
// On return out_array[0] holds the low 32 bits of the counter and
// out_array[1] the high word masked to 16 bits.
// If no counter is running at all, both words are set to 0. If `which` is not
// 1 or 2, or the selected counter is not running while the other one is, the
// words are set to 0xffffffff / 0xffff to signal an invalid request.
__attribute__((no_instrument_function)) void PMCR_Read(int which, volatile unsigned int *out_array)
{
// if pmcr is not enabled, this function will just return 0
// little endian (big endian would need to flip [0] and [1])
// Note: These reads really do need to be done in assembly: unfortunately it
// appears that using C causes GCC to insert a branch right smack in between
// the high and low reads of perf counter 2 (with a nop, so it's literally
// delaying the reads by several cycles!), which is totally insane. Doing it
// the assembly way ensures that nothing ridiculous like that happens. It's
// also portable between versions of GCC that do put the nonsensical branch in.
//
// One thing that would be nice is if SH4 had the movi20s instruction to make
// absolute addresses in 3 cycles, but only the SH2A has that... :(
if( (which == 1) && (pmcr_enabled & 0x1) )
{
// counter 1
// out_array[1] = *((volatile unsigned int*)PMCTR1H_REG) & 0xffff;
// out_array[0] = *((volatile unsigned int*)PMCTR1L_REG);
asm volatile("mov.l %[reg1h],r1\n\t" // load counter address (high)
"mov.l %[reg1l],r2\n\t" // load counter address (low)
"mov.l @r1,r1\n\t" // read counter (high)
"mov.l @r2,r2\n\t" // read counter (low)
"extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff
"mov.l r1,%[outh]\n\t" // get data to memory
"mov.l r2,%[outl]\n\t" // get data to memory
: [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0])
: [reg1h] "m" (pmcr1_regh), [reg1l] "m" (pmcr1_regl) // SH4 can't mov an immediate longword into a register...
: "r1", "r2"
);
}
else if( (which == 2) && (pmcr_enabled & 0x2) )
{
// counter 2
// out_array[1] = *((volatile unsigned int*)PMCTR2H_REG) & 0xffff;
// out_array[0] = *((volatile unsigned int*)PMCTR2L_REG);
asm volatile("mov.l %[reg2h],r1\n\t" // load counter address (high)
"mov.l %[reg2l],r2\n\t" // load counter address (low)
"mov.l @r1,r1\n\t" // read counter (high)
"mov.l @r2,r2\n\t" // read counter (low)
"extu.w r1,r1\n\t" // zero-extend high, aka high & 0xffff
"mov.l r1,%[outh]\n\t" // get data to memory
"mov.l r2,%[outl]\n\t" // get data to memory
: [outh] "=m" (out_array[1]), [outl] "=m" (out_array[0])
: [reg2h] "m" (pmcr2_regh), [reg2l] "m" (pmcr2_regl) // SH4 can't mov an immediate longword into a register...
: "r1", "r2"
);
}
else if(!pmcr_enabled)
{
// Nothing is running: report a zero count
out_array[1] = 0;
out_array[0] = 0;
}
else // Invalid
{
// A counter is running, but not the one asked for (or `which` is not 1 or 2)
out_array[1] = 0xffff;
out_array[0] = 0xffffffff;
}
}
// Stop a running counter, then start it again from 0.
// NOTE: It does not appear to be possible to clear a counter while it is
// running, hence the stop before the re-enable with reset.
// which: 1 or 2 for a single counter, 3 for both (only if both are running).
// Does nothing if the requested counter is not running.
__attribute__((no_instrument_function)) void PMCR_Restart(int which, unsigned short mode, unsigned char count_type)
{
    int running;

    switch(which)
    {
        case 1:
            running = (pmcr_enabled & 0x1) != 0;
            break;
        case 2:
            running = (pmcr_enabled & 0x2) != 0;
            break;
        case 3:
            running = (pmcr_enabled == 3);
            break;
        default:
            running = 0;
            break;
    }

    if(running)
    {
        PMCR_Stop(which);
        PMCR_Enable(which, mode, count_type, PMCR_RESET_COUNTER);
    }
}
// Stop a running counter while keeping its value readable.
//
// Stopping (the 0x2000 bit) holds the data in the data registers, whereas
// disabling without that bit reads back as all 0 (without clearing the counter
// for the next start). Clearing itself only works on a disabled counter and is
// handled by PMCR_Enable(), so this function only stops.
// which: 1 or 2 for a single counter, 3 for both (only if both are running).
// Does nothing if the requested counter is already stopped or disabled.
__attribute__((no_instrument_function)) void PMCR_Stop(int which)
{
    switch(which)
    {
        case 1: // counter 1
            if(pmcr_enabled & 0x1)
            {
                *((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_STOP_COUNTER;
                pmcr_enabled &= 0x2;
            }
            break;

        case 2: // counter 2
            if(pmcr_enabled & 0x2)
            {
                *((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_STOP_COUNTER;
                pmcr_enabled &= 0x1;
            }
            break;

        case 3: // both
            if(pmcr_enabled == 3)
            {
                *((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_STOP_COUNTER;
                *((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_STOP_COUNTER;
                pmcr_enabled = 0;
            }
            break;

        default:
            break;
    }
}
// Disable a counter unconditionally (no check whether it is running).
// Note that disabling does NOT clear the counter: reading a disabled counter
// returns 0, but re-enabling without first clearing simply continues where it
// left off.
// which: 1 or 2 for a single counter, 3 for both; other values do nothing.
__attribute__((no_instrument_function)) void PMCR_Disable(int which)
{
    switch(which)
    {
        case 1: // counter 1
            *((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_DISABLE_COUNTER;
            pmcr_enabled &= 0x2;
            break;

        case 2: // counter 2
            *((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_DISABLE_COUNTER;
            pmcr_enabled &= 0x1;
            break;

        case 3: // both
            *((volatile unsigned short*)PMCR1_CTRL_REG) = PMCR_DISABLE_COUNTER;
            *((volatile unsigned short*)PMCR2_CTRL_REG) = PMCR_DISABLE_COUNTER;
            pmcr_enabled = 0;
            break;

        default:
            break;
    }
}
#endif

316
GL/perfctr.h Normal file
View File

@ -0,0 +1,316 @@
// ---- perfctr.h - SH7091 Performance Counter Module Header ----
//
// This file is part of the DreamHAL project, a hardware abstraction library
// primarily intended for use on the SH7091 found in hardware such as the SEGA
// Dreamcast game console.
//
// The performance counter module is hereby released into the public domain in
// the hope that it may prove useful. Now go profile some code and hit 60 fps! :)
//
// --Moopthehedgehog
//
#ifndef __PERFCTR_H__
#define __PERFCTR_H__
//
// -- General SH4 Performance Counter Notes --
//
// There are 2 performance counters that can measure elapsed time. They are each
// 48-bit counters. They are part of the so-called "ASE" subsystem, which you can
// read about in chapter 13 of the "SuperH™ (SH) 32-bit RISC series SH-4, ST40
// system architecture, volume 1: system":
// https://www.st.com/content/ccc/resource/technical/document/user_manual/36/75/05/ac/e8/7e/42/2d/CD00147163.pdf/files/CD00147163.pdf/jcr:content/translations/en.CD00147163.pdf
//
// They can count cycles, so that's 199.5MHz (not 200MHz!!) a.k.a. roughly 5 ns
// increments. At 5 ns increments, a 48-bit cycle counter can run continuously
// for 16.33 days. It's actually 16 days, 7 hours, 55 minutes, and 2 seconds,
// depending on how close the bus clock is to 99.75MHz. There is also a second
// mode that counts cycles according to a ratio between the CPU frequency and
// the system bus clock, and it increments the counter by 12 every bus cycle.
// This second mode is detailed in the description for PMCR_CLOCK_TYPE in this
// file, and it is recommended for use when the CPU frequency is not a runtime
// constant.
//
// Side note: The counters don't have an overflow interrupt or overflow bit.
// (I did actually run one to 48-bit overflow in elapsed time mode using the
// ratio method to check this. They don't appear to sign-extend the upper 16
// bits in elapsed time mode, either.)
//
// The two counters are functionally identical. I would recommend using the
// PMCR_Init() function to start one (or both) up the first time.
//
// -- Configuration Address Info --
//
// Addresses for these counters can be easily seen here, in lxdream's source code:
// https://github.com/lutris/lxdream/blob/master/src/sh4/sh4mmio.h
//
// They are also on display in the Linux kernel, but at the time of writing appear
// to be set incorrectly (the clock mode at bit 0x100 is never set or cleared,
// for example, so they're at the mercy of whatever the hardware defaults are):
// http://git.lpclinux.com/cgit/linux-2.6.28.2-lpc313x/plain/arch/sh/oprofile/op_model_sh7750.c
// https://github.com/torvalds/linux/blob/master/arch/sh/kernel/cpu/sh4/perf_event.c
// ...It also appears as though they may not be handling bus ratio mode correctly,
// which appears to be the default mode on the Dreamcast in all my tests.
//
// You can also find these addresses by ripping a copy of Virtua Fighter 3 that
// you own for Dreamcast and looking at the raw byte code (or a raw disassembly)
// of its main program binary. It would appear as though they were timing a loop
// with the low half of perf counter 1 in elapsed time mode. Definitely seems
// like a good thing to do when targeting 60fps! Shenmue Disc 4 also uses the
// same configuration, but what's being timed is not as clear.
//
// Another place you can actually find both control addresses 0xFF00008x and all
// data addresses 0xFF10000x is in binaries of ancient, freely available versions
// of CodeScape. Literally all you need to do is open an SH7750-related DLL in a
// hex editor and do a search to find the control register addresses, and the
// data addresses are equally plain to see in any relevant performance profiling
// firmware. There's no effort or decryption required to find them whatsoever;
// all you need is an old trial version and a hex editor.
//
// However, something even better than all of that is if you search for "SH4
// 0xFF000084" (without quotes) online you'll find an old forum where some logs
// were posted of the terminal/command prompt output from some STMicro JTAG tool,
// which not only has the address registers but also clearly characterizes their
// size as 16-bit:
// https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2
//
// -- Event Mode Info --
//
// Specific information on each counter mode can be found in the document titled
// "SuperH™ Family E10A-USB Emulator: Additional Document for Users Manual:
// Supplementary Information on Using the SH7750R Renesas Microcomputer Development Environment System"
// which is available on Renesas's website, in the "Documents" section of the
// E10A-USB product page:
// https://www.renesas.com/us/en/products/software-tools/tools/emulator/e10a-usb.html
// At the time of writing (12/2019), the E10A-USB adapter is still available
// for purchase, and it is priced around $1200 (USD).
//
// Appendix C of the "ST40 Micro Toolset Manual" also has these modes documented:
// https://www.st.com/content/ccc/resource/technical/document/user_manual/c5/98/11/89/50/68/41/66/CD17379953.pdf/files/CD17379953.pdf/jcr:content/translations/en.CD17379953.pdf
//
// See here for the hexadecimal values corresponding to each mode (pg. 370):
// http://www.macmadigan.com/BusaECU/Renesas%20documents/Hitachi_codescape_CS40_light_userguides.pdf
// You can also find the same "Counter Description Table" in user's guide PDFs
// bundled in ancient demo versions of CodeScape 3 from 2000 (e.g.
// CSDemo_272.exe), which can still be found in the Internet Archive.
// http://web.archive.org/web/*/http://codescape.com/dl/CSDemo/*
//
// See here for a support document on Lauterbach's SH2, SH3, and SH4 debugger,
// which contains units for each mode (e.g. which measure time and which just
// count): https://www.lauterbach.com/frames.html?home.html (It's in Downloads
// -> Trace32 Help System -> it's the file called "SH2, SH3 and SH4 Debugger"
// with the filename debugger_sh4.pdf).
//
//
// --- Performance Counter Registers ---
//
// These registers are 16 bits only and configure the performance counters
#define PMCR1_CTRL_REG 0xFF000084
#define PMCR2_CTRL_REG 0xFF000088
// These registers are 32 bits each and hold the high and low parts of each counter
#define PMCTR1H_REG 0xFF100004
#define PMCTR1L_REG 0xFF100008
#define PMCTR2H_REG 0xFF10000C
#define PMCTR2L_REG 0xFF100010
//
// --- Performance Counter Configuration Flags ---
//
// These bits' functions are currently unknown, but they may simply be reserved.
// It's possible that there's a [maybe expired?] patent that details the
// configuration registers, though I haven't been able to find one. Places to
// check would be Google Patents and the Japanese Patent Office--maybe someone
// else can find something?
//
// Some notes:
// Writing 1 to all of these bits reads back as 0, so it looks like they aren't
// config bits. It's possible they are write-only like the stop bit, though,
// or that they're just reserved-write-0-only. It appears that they are always
// written with zeros in software that uses them, so that's confirmed safe to do.
//
// Also, after running counter 1 to overflow, it appears there's no overflow bit
// (maybe the designers thought 48-bits would be so much to count to that they
// didn't bother implementing one?). The upper 16-bits of the counter high
// register are also not sign-extension bits. They may be a hidden config area,
// but probably not because big endian mode would swap the byte order.
#define PMCR_UNKNOWN_BIT_0040 0x0040
#define PMCR_UNKNOWN_BIT_0080 0x0080
#define PMCR_UNKNOWN_BIT_0200 0x0200
#define PMCR_UNKNOWN_BIT_0400 0x0400
#define PMCR_UNKNOWN_BIT_0800 0x0800
#define PMCR_UNKNOWN_BIT_1000 0x1000
// PMCR_MODE_CLEAR_INVERTED just clears the event mode if it's inverted with
// '~', and event modes are listed below.
#define PMCR_MODE_CLEAR_INVERTED 0x003f
// PMCR_CLOCK_TYPE sets the counters to count clock cycles or CPU/bus ratio mode
// cycles (where T = C x B / 24 and T is time, C is count, and B is time
// of one bus cycle). Note: B = 1/99753008 or so, but it may vary, as mine is
// actually 1/99749010-ish; the target frequency is probably meant to be 99.75MHz.
//
// See the ST40 or Renesas SH7750R documents described in the above "Event Mode
// Info" section for more details about that formula.
//
// Set PMCR_CLOCK_TYPE to 0 for CPU cycle counting, where 1 count = 1 cycle, or
// set it to 1 to use the above formula. Renesas documentation recommends using
// the ratio version (set the bit to 1) when user programs alter CPU clock
// frequencies. This header has some definitions later on to help with this.
#define PMCR_CLOCK_TYPE 0x0100
#define PMCR_CLOCK_TYPE_SHIFT 8
// PMCR_STOP_COUNTER is write-only, as it always reads back as 0. It does what
// the name suggests: when this bit is written to, the counter stops. However,
// if written to while the counter is disabled or stopped, the counter's high
// and low registers are reset to 0.
//
// Using PMCR_STOP_COUNTER to stop the counter has the effect of holding the
// data in the data registers while stopped, unlike PMCR_DISABLE_COUNTER, and
// this bit needs to be written to again (e.g. on next start) in order to
// actually clear the counter data for another run. If not explicitly cleared,
// the counter will continue from where it left off before being stopped.
#define PMCR_STOP_COUNTER 0x2000
#define PMCR_RESET_COUNTER_SHIFT 13
// Bits 0xC000 both need to be set to 1 for the counters to actually begin
// counting. I have seen that the Linux kernel actually separates them out into
// two separate labelled bits (PMEN and PMST) for some reason, however they do
// not appear to do anything separately. Perhaps this is a two-bit mode where
// 1-1 is run, 1-0 and 0-1 are ???, and 0-0 is off.
#define PMCR_RUN_COUNTER 0xC000
#define PMCR_RUN_SHIFT 14
// Interestingly, the output here writes 0x6000 to the counter config registers,
// which would be the "PMST" bit and the "RESET" bit:
// https://www.multimediaforum.de/threads/36260834-alice-hsn-3800tw-usb-jtag-ft4232h/page2
// To disable a counter, just write 0 to its config register. This will not
// reset the counter to 0, as that requires an explicit clear via setting the
// PMCR_STOP_COUNTER bit. What's odd is that a disabled counter's data
// registers read back as all 0, but re-enabling it without a clear will
// continue from the last value before disabling.
#define PMCR_DISABLE_COUNTER 0x0000
// These definitions merely separate out the two PMCR_RUN_COUNTER bits, and
// they are included here for documentation purposes.
// PMST may mean PMCR START. It's consistently used to enable the counter.
// I'm just calling it PMST here for lack of a better name, since this is what
// the Linux kernel and lxdream call it. It could also have something to do with
// a mode specific to STMicroelectronics.
#define PMCR_PMST_BIT 0x4000
#define PMCR_PMST_SHIFT 14
// Likewise PMEN may mean PMCR ENABLE
#define PMCR_PMEN_BIT 0x8000
#define PMCR_PMEN_SHIFT 15
//
// --- Performance Counter Event Code Definitions ---
//
// Interestingly enough, it so happens that the SEGA Dreamcast's CPU seems to
// contain the same performance counter functionality as SH4 debug adapters for
// the SH7750R. Awesome!
//
// MODE DEFINITION VALUE MEASUREMENT TYPE & NOTES
#define PMCR_INIT_NO_MODE 0x00 // None; Just here to be complete
#define PMCR_OPERAND_READ_ACCESS_MODE 0x01 // Quantity; With cache
#define PMCR_OPERAND_WRITE_ACCESS_MODE 0x02 // Quantity; With cache
#define PMCR_UTLB_MISS_MODE 0x03 // Quantity
#define PMCR_OPERAND_CACHE_READ_MISS_MODE 0x04 // Quantity
#define PMCR_OPERAND_CACHE_WRITE_MISS_MODE 0x05 // Quantity
#define PMCR_INSTRUCTION_FETCH_MODE 0x06 // Quantity; With cache
#define PMCR_INSTRUCTION_TLB_MISS_MODE 0x07 // Quantity
#define PMCR_INSTRUCTION_CACHE_MISS_MODE 0x08 // Quantity
#define PMCR_ALL_OPERAND_ACCESS_MODE 0x09 // Quantity
#define PMCR_ALL_INSTRUCTION_FETCH_MODE 0x0a // Quantity
#define PMCR_ON_CHIP_RAM_OPERAND_ACCESS_MODE 0x0b // Quantity
// No 0x0c
#define PMCR_ON_CHIP_IO_ACCESS_MODE 0x0d // Quantity
#define PMCR_OPERAND_ACCESS_MODE 0x0e // Quantity; With cache, counts both reads and writes
#define PMCR_OPERAND_CACHE_MISS_MODE 0x0f // Quantity
#define PMCR_BRANCH_ISSUED_MODE 0x10 // Quantity; Not the same as branch taken!
#define PMCR_BRANCH_TAKEN_MODE 0x11 // Quantity
#define PMCR_SUBROUTINE_ISSUED_MODE 0x12 // Quantity; Issued a BSR, BSRF, JSR, JSR/N
#define PMCR_INSTRUCTION_ISSUED_MODE 0x13 // Quantity
#define PMCR_PARALLEL_INSTRUCTION_ISSUED_MODE 0x14 // Quantity
#define PMCR_FPU_INSTRUCTION_ISSUED_MODE 0x15 // Quantity
#define PMCR_INTERRUPT_COUNTER_MODE 0x16 // Quantity
#define PMCR_NMI_COUNTER_MODE 0x17 // Quantity
#define PMCR_TRAPA_INSTRUCTION_COUNTER_MODE 0x18 // Quantity
#define PMCR_UBC_A_MATCH_MODE 0x19 // Quantity
#define PMCR_UBC_B_MATCH_MODE 0x1a // Quantity
// No 0x1b-0x20
#define PMCR_INSTRUCTION_CACHE_FILL_MODE 0x21 // Cycles
#define PMCR_OPERAND_CACHE_FILL_MODE 0x22 // Cycles
#define PMCR_ELAPSED_TIME_MODE 0x23 // Cycles; For 200MHz CPU: 5ns per count in 1 cycle = 1 count mode, or around 417.715ps per count (increments by 12) in CPU/bus ratio mode
#define PMCR_PIPELINE_FREEZE_BY_ICACHE_MISS_MODE 0x24 // Cycles
#define PMCR_PIPELINE_FREEZE_BY_DCACHE_MISS_MODE 0x25 // Cycles
// No 0x26
#define PMCR_PIPELINE_FREEZE_BY_BRANCH_MODE 0x27 // Cycles
#define PMCR_PIPELINE_FREEZE_BY_CPU_REGISTER_MODE 0x28 // Cycles
#define PMCR_PIPELINE_FREEZE_BY_FPU_MODE 0x29 // Cycles
//
// --- Performance Counter Support Definitions ---
//
// This definition can be passed as the init/enable/restart functions'
// count_type parameter to use the 1 cycle = 1 count mode. This is how the
// counter can be made to run for 16.3 days.
#define PMCR_COUNT_CPU_CYCLES 0
// Likewise this uses the CPU/bus ratio method
#define PMCR_COUNT_RATIO_CYCLES 1
// These definitions are for the enable function and specify whether to reset
// a counter to 0 or to continue from where it left off
#define PMCR_CONTINUE_COUNTER 0
#define PMCR_RESET_COUNTER 1
//
// --- Performance Counter Miscellaneous Definitions ---
//
// For convenience; assume stock bus clock of 99.75MHz
// (Bus clock is the external CPU clock, not the peripheral bus clock)
//
#define PMCR_SH4_CPU_FREQUENCY 199500000
#define PMCR_CPU_CYCLES_MAX_SECONDS 1410902
#define PMCR_SH4_BUS_FREQUENCY 99750000
#define PMCR_SH4_BUS_FREQUENCY_SCALED 2394000000 // 99.75MHz x 24
#define PMCR_BUS_RATIO_MAX_SECONDS 117575
//
// --- Performance Counter Functions ---
//
// See perfctr.c file for more details about each function and some more usage notes.
//
// Note: PMCR_Init() and PMCR_Enable() will do nothing if the perf counter is already running!
//
// Clear counter and enable
// which: counter selector. PMCR_Stop()/PMCR_Disable() accept 1 = counter 1,
//   2 = counter 2, 3 = both; presumably the same here (confirm in perfctr.c).
// mode: presumably one of the PMCR_*_MODE event codes defined in this header.
// count_type: PMCR_COUNT_CPU_CYCLES or PMCR_COUNT_RATIO_CYCLES.
void PMCR_Init(int which, unsigned short mode, unsigned char count_type);
// Enable one or both of these "undocumented" performance counters.
// which, mode, count_type: as for PMCR_Init().
// reset_counter: PMCR_RESET_COUNTER to start from 0, or PMCR_CONTINUE_COUNTER
//   to continue from where the counter left off.
void PMCR_Enable(int which, unsigned short mode, unsigned char count_type, unsigned char reset_counter);
// Disable, clear, and re-enable with new mode (or same mode)
// which, mode, count_type: as for PMCR_Init().
void PMCR_Restart(int which, unsigned short mode, unsigned char count_type);
// Read a counter
// out_array is specifically uint32 out_array[2] -- 48-bit value needs a 64-bit storage unit
// NOTE(review): which half (high/low) lands in out_array[0] is not visible
// from this header -- check perfctr.c before combining the two words.
void PMCR_Read(int which, volatile unsigned int *out_array);
// Stop counter(s) (without clearing)
// which: 1 = counter 1, 2 = counter 2, 3 = both.
void PMCR_Stop(int which);
// Disable counter(s) (without clearing)
// which: 1 = counter 1, 2 = counter 2, 3 = both; other values do nothing.
void PMCR_Disable(int which);
#endif /* __PERFCTR_H__ */

View File

@ -6,6 +6,7 @@
#include "../include/gl.h"
#include "../containers/aligned_vector.h"
#include "../containers/named_array.h"
#include "cygprofile.h"
extern void* memcpy4 (void *dest, const void *src, size_t count);
@ -249,6 +250,11 @@ typedef struct {
GLint size;
} AttribPointer;
/* Input record consumed by _glPerformLighting() alongside the Vertex array.
 * NOTE(review): the name suggests both vectors are expressed in eye space --
 * confirm against the code that fills this struct. */
typedef struct {
float xyz[3]; /* presumably the vertex position (x, y, z) -- TODO confirm */
float n[3]; /* presumably the vertex normal -- TODO confirm */
} EyeSpaceData;
GLboolean _glCheckValidEnum(GLint param, GLint* values, const char* func);
GLuint* _glGetEnabledAttributes();
@ -280,7 +286,7 @@ GLuint _glGetMipmapLevelCount(TextureObject* obj);
GLboolean _glIsLightingEnabled();
GLboolean _glIsLightEnabled(GLubyte light);
GLboolean _glIsColorMaterialEnabled();
void _glCalculateLightingContribution(const GLint light, const GLfloat* pos, const GLfloat* normal, uint8_t* bgra, GLfloat* colour);
void _glPerformLighting(Vertex* vertices, const EyeSpaceData* es, const int32_t count);
unsigned char _glIsClippingEnabled();
void _glEnableClipping(unsigned char v);

View File

@ -6,6 +6,8 @@
#include "profiler.h"
#include "../containers/aligned_vector.h"
#if PROFILING_COMPILED
#define MAX_PATH 256
typedef struct {
@ -141,3 +143,4 @@ void profiler_print_stats() {
fprintf(stderr, "%-60s%-20f%-20f%" PRIu64 "\n", result->name, (double)avg, (double)ms, result->total_calls);
}
}
#endif

View File

@ -7,12 +7,26 @@ typedef struct {
uint64_t start_time_in_us;
} Profiler;
#define PROFILING_COMPILED 0
#if PROFILING_COMPILED
Profiler* profiler_push(const char* name);
void profiler_checkpoint(const char* name);
void profiler_pop();
void _profiler_checkpoint(const char* name);
void _profiler_pop();
void profiler_print_stats();
void _profiler_print_stats();
void profiler_enable();
void profiler_disable();
void _profiler_enable();
void _profiler_disable();
#else
/* Profiling is compiled out: every profiler call becomes a no-op expression.
 * The definitions deliberately carry no trailing semicolon, so that
 * `profiler_pop();` is exactly one statement and remains valid as the body of
 * an unbraced if/else. The `name` argument is accepted and discarded without
 * being evaluated. */
#define profiler_push(name) ((void)0)
#define profiler_checkpoint(name) ((void)0)
#define profiler_pop() ((void)0)
#define profiler_print_stats() ((void)0)
#define profiler_enable() ((void)0)
#define profiler_disable() ((void)0)
#endif

1448
GL/sh4_math.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -743,11 +743,11 @@ GLint _cleanInternalFormat(GLint internalFormat) {
typedef void (*TextureConversionFunc)(const GLubyte*, GLubyte*);
static inline void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) {
static INLINE_DEBUG void _rgba8888_to_argb4444(const GLubyte* source, GLubyte* dest) {
*((GLushort*) dest) = (source[3] & 0xF0) << 8 | (source[0] & 0xF0) << 4 | (source[1] & 0xF0) | (source[2] & 0xF0) >> 4;
}
static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
static INLINE_DEBUG void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
/* Noop */
GLubyte* dst = (GLubyte*) dest;
dst[0] = source[0];
@ -756,11 +756,11 @@ static inline void _rgba8888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
dst[3] = source[3];
}
static inline void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) {
static INLINE_DEBUG void _rgba8888_to_rgb565(const GLubyte* source, GLubyte* dest) {
*((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3);
}
static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
static INLINE_DEBUG void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
/* Noop */
GLubyte* dst = (GLubyte*) dest;
dst[0] = source[0];
@ -769,24 +769,24 @@ static inline void _rgb888_to_rgba8888(const GLubyte* source, GLubyte* dest) {
dst[3] = 255;
}
static inline void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) {
static INLINE_DEBUG void _rgb888_to_rgb565(const GLubyte* source, GLubyte* dest) {
*((GLushort*) dest) = ((source[0] & 0b11111000) << 8) | ((source[1] & 0b11111100) << 3) | (source[2] >> 3);
}
static inline void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) {
static INLINE_DEBUG void _rgba8888_to_a000(const GLubyte* source, GLubyte* dest) {
*((GLushort*) dest) = ((source[3] & 0b11111000) << 8);
}
static inline void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) {
static INLINE_DEBUG void _r8_to_rgb565(const GLubyte* source, GLubyte* dest) {
*((GLushort*) dest) = (source[0] & 0b11111000) << 8;
}
static inline void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) {
static INLINE_DEBUG void _rgba4444_to_argb4444(const GLubyte* source, GLubyte* dest) {
GLushort* src = (GLushort*) source;
*((GLushort*) dest) = ((*src & 0x000F) << 12) | *src >> 4;
}
static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
static INLINE_DEBUG void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
GLushort src = *((GLushort*) source);
GLubyte* dst = (GLubyte*) dest;
@ -796,7 +796,7 @@ static inline void _rgba4444_to_rgba8888(const GLubyte* source, GLubyte* dest) {
dst[3] = ((src & 0x000F)) * 2;
}
static inline void _i8_to_i8(const GLubyte* source, GLubyte* dest) {
static INLINE_DEBUG void _i8_to_i8(const GLubyte* source, GLubyte* dest) {
/* For indexes */
GLubyte* dst = (GLubyte*) dest;
*dst = *source;

View File

@ -3,6 +3,8 @@
#include <math.h>
#include <assert.h>
#include <stdio.h>
#include <dc/sq.h>
#include <kos/string.h>
#if defined(__APPLE__) || defined(__WIN32__)
/* Linux + Kos define this, OSX does not, so just use malloc there */
@ -25,7 +27,7 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size) {
}
static inline unsigned int round_to_chunk_size(unsigned int val) {
static INLINE_DEBUG unsigned int round_to_chunk_size(unsigned int val) {
const unsigned int n = val;
const unsigned int m = ALIGNED_VECTOR_CHUNK_SIZE;
@ -107,33 +109,12 @@ void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_co
}
}
/* Returns the address of element `index` inside the vector's storage.
 * The index is checked against vector->size with assert(), so an
 * out-of-range index aborts unless assertions are compiled out (NDEBUG). */
void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
    assert(index < vector->size);

    /* Elements are packed back to back, element_size bytes apart. */
    const unsigned int byte_offset = index * vector->element_size;
    return vector->data + byte_offset;
}
/* Returns the address of the last element (index size - 1), looked up through
 * aligned_vector_at(). size is unsigned, so on an empty vector the index wraps
 * around instead of going negative. */
void* aligned_vector_back(AlignedVector* vector) {
    const unsigned int last_index = vector->size - 1;
    return aligned_vector_at(vector, last_index);
}
/* Grows the vector by `additional_count` elements via aligned_vector_resize()
 * and returns the address of the first newly added element, i.e. the element
 * at the index that was the size before the call. */
void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) {
    const unsigned int first_new_index = vector->size;
    const unsigned int grown_size = first_new_index + additional_count;

    aligned_vector_resize(vector, grown_size);
    return aligned_vector_at(vector, first_new_index);
}
/* Empties the vector by resetting its element count to zero.
 * Only `size` is written; no other field of the vector is touched here. */
void aligned_vector_clear(AlignedVector* vector)
{
    vector->size = 0;
}
void aligned_vector_shrink_to_fit(AlignedVector* vector) {
if(vector->size == 0) {
free(vector->data);

View File

@ -5,6 +5,8 @@
extern "C" {
#endif
#include "../GL/cygprofile.h"
typedef struct {
unsigned int size;
unsigned int capacity;
@ -18,12 +20,27 @@ void aligned_vector_init(AlignedVector* vector, unsigned int element_size);
void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count);
void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
void* aligned_vector_at(const AlignedVector* vector, const unsigned int index);
/* Returns the address of element `index` inside the vector's storage.
 * No bounds check is performed: both checks below sit inside `#if 0`, so an
 * out-of-range index yields an out-of-range pointer. */
INLINE_ALWAYS void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
#if 0
    /* Disabled diagnostics; switch to `#if 1` to re-enable them. */
    if(index >= vector->size){
        char msg[60];
        sprintf(msg, "Vector OOB: %d %d wanted %d\n", vector->capacity, vector->size, index);
        //aligned_vector_resize(vector, index);
        assert_msg(index < vector->size, msg);
    }
    assert(index < vector->size); /* Check here */
#endif
    /* Elements are packed back to back, element_size bytes apart. */
    return vector->data + (index * vector->element_size);
}
void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count);
void aligned_vector_clear(AlignedVector* vector);
/* Empties the vector by resetting its element count to zero.
 * Only `size` is written; no other field of the vector is touched here. */
INLINE_ALWAYS void aligned_vector_clear(AlignedVector* vector)
{
    vector->size = 0;
}
void aligned_vector_shrink_to_fit(AlignedVector* vector);
void aligned_vector_cleanup(AlignedVector* vector);
void* aligned_vector_back(AlignedVector* vector);
/* Returns the address of the last element (index size - 1), looked up through
 * aligned_vector_at(). size is unsigned, so on an empty vector the index wraps
 * around instead of going negative. */
INLINE_ALWAYS void* aligned_vector_back(AlignedVector* vector)
{
    const unsigned int last_index = vector->size - 1;
    return aligned_vector_at(vector, last_index);
}
#ifdef __cplusplus
}

View File

@ -44,13 +44,6 @@ void named_array_init(NamedArray* array, unsigned int element_size, unsigned int
memset(array->elements, 0, element_size * max_elements);
}
/* Reports whether slot `id` is marked as used.
 * Returns 1 when the marker bit for `id` is set, 0 otherwise. Bit (id % 8) of
 * used_markers[id / 8] is tested; `id` is not range-checked here. */
char named_array_used(NamedArray* array, unsigned int id) {
    const unsigned int marker_index = id >> 3;
    const unsigned char bit_mask = (unsigned char) (1 << (id & 7));

    return (array->used_markers[marker_index] & bit_mask) ? 1 : 0;
}
void* named_array_alloc(NamedArray* array, unsigned int* new_id) {
unsigned int i = 0, j = 0;

View File

@ -5,6 +5,8 @@
extern "C" {
#endif
#include "../GL/cygprofile.h"
typedef struct {
unsigned int element_size;
unsigned int max_element_count;
@ -14,7 +16,13 @@ typedef struct {
} NamedArray;
void named_array_init(NamedArray* array, unsigned int element_size, unsigned int max_elements);
char named_array_used(NamedArray* array, unsigned int id);
/* Reports whether slot `id` is marked as used.
 * Returns 1 when the marker bit for `id` is set, 0 otherwise. Bit (id % 8) of
 * used_markers[id / 8] is tested; `id` is not range-checked here. */
INLINE_ALWAYS char named_array_used(NamedArray* array, unsigned int id) {
    const unsigned int marker_index = id >> 3;
    const unsigned char bit_mask = (unsigned char) (1 << (id & 7));

    return (array->used_markers[marker_index] & bit_mask) ? 1 : 0;
}
void* named_array_alloc(NamedArray* array, unsigned int* new_id);
void* named_array_reserve(NamedArray* array, unsigned int id);