Optimisations
This commit is contained in:
parent
faf24ac61d
commit
ea25251944
|
@ -42,7 +42,6 @@ set(
|
|||
GL/immediate.c
|
||||
GL/lighting.c
|
||||
GL/matrix.c
|
||||
GL/profiler.c
|
||||
GL/state.c
|
||||
GL/texture.c
|
||||
GL/util.c
|
||||
|
@ -143,8 +142,11 @@ gen_sample(zclip_triangle samples/zclip_triangle/main.c)
|
|||
gen_sample(zclip_trianglestrip samples/zclip_trianglestrip/main.c)
|
||||
gen_sample(scissor samples/scissor/main.c)
|
||||
gen_sample(polymark samples/polymark/main.c)
|
||||
gen_sample(quadmark samples/quadmark/main.c)
|
||||
|
||||
|
||||
if(PLATFORM_DREAMCAST)
|
||||
gen_sample(trimark samples/trimark/main.c)
|
||||
gen_sample(quadmark samples/quadmark/main.c samples/profiler.c)
|
||||
else()
|
||||
gen_sample(quadmark samples/quadmark/main.c)
|
||||
endif()
|
||||
|
|
|
@ -11,7 +11,6 @@
|
|||
#define PVR_PACK_COLOR(a, r, g, b) {}
|
||||
#endif
|
||||
|
||||
#include "profiler.h"
|
||||
#include "private.h"
|
||||
#include "../containers/aligned_vector.h"
|
||||
|
||||
|
|
113
GL/draw.c
113
GL/draw.c
|
@ -6,7 +6,6 @@
|
|||
#include <assert.h>
|
||||
|
||||
#include "private.h"
|
||||
#include "profiler.h"
|
||||
#include "platform.h"
|
||||
|
||||
static AttribPointer VERTEX_POINTER;
|
||||
|
@ -255,14 +254,14 @@ static void _readVertexData3fARGB(const GLubyte* in, GLubyte* output) {
|
|||
output[A8IDX] = 1.0f;
|
||||
}
|
||||
|
||||
/* Reads a three-component unsigned-byte colour from `input` and stores it in
 * the output's R/G/B channel slots. The source has no alpha component, so the
 * alpha channel is set to fully opaque. */
static void _readVertexData3ubARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) {
    output[R8IDX] = input[0];
    output[G8IDX] = input[1];
    output[B8IDX] = input[2];
    /* `output` is a byte channel: assigning 1.0f would truncate to 1, which is
     * almost fully transparent rather than opaque. */
    output[A8IDX] = 255;
}
|
||||
|
||||
static void _readVertexData4ubRevARGB(const GLubyte* input, GLubyte* output) {
|
||||
static void _readVertexData4ubRevARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) {
|
||||
argbcpy(output, input);
|
||||
}
|
||||
|
||||
|
@ -275,12 +274,16 @@ static void _readVertexData4fRevARGB(const GLubyte* in, GLubyte* output) {
|
|||
output[3] = (GLubyte) clamp(input[3] * 255.0f, 0, 255);
|
||||
}
|
||||
|
||||
/* Fills the three floats at `out` with the vector (0, 0, -1).
 * `input` is ignored; the parameter exists so the function matches the
 * signature of the other read/fill callbacks in this file. */
static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) {
    _GL_UNUSED(input);

    typedef struct {
        float x, y, z;
    } V;

    /* Storage class first: placing `static` after `const` is obsolescent C */
    static const V NegZ = {0.0f, 0.0f, -1.0f};

    /* One struct assignment writes all three components */
    *((V*) out) = NegZ;
}
|
||||
|
||||
static void _fillWhiteARGB(const GLubyte* input, GLubyte* output) {
|
||||
|
@ -290,9 +293,7 @@ static void _fillWhiteARGB(const GLubyte* input, GLubyte* output) {
|
|||
|
||||
/* Zeroes the two floats at `out`. `input` is ignored. */
static void _fillZero2f(const GLubyte* input, GLubyte* out) {
    _GL_UNUSED(input);

    /* memset takes (dest, value, length); with the last two swapped the call
     * writes zero bytes and leaves the output uninitialised. */
    memset(out, 0, sizeof(float) * 2);
}
|
||||
|
||||
static void _readVertexData3usARGB(const GLubyte* input, GLubyte* output) {
|
||||
|
@ -615,7 +616,7 @@ ReadNormalFunc calcReadNormalFunc() {
|
|||
}
|
||||
}
|
||||
|
||||
GL_FORCE_INLINE void _readPositionData(const GLuint first, const GLuint count, Vertex* output) {
|
||||
GL_FORCE_INLINE void _readPositionData(const GLuint first, const GLuint count, const Vertex* output) {
|
||||
const GLsizei vstride = (VERTEX_POINTER.stride) ? VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type);
|
||||
const void* vptr = ((GLubyte*) VERTEX_POINTER.ptr + (first * vstride));
|
||||
|
||||
|
@ -629,7 +630,7 @@ GL_FORCE_INLINE void _readPositionData(const GLuint first, const GLuint count, V
|
|||
}
|
||||
}
|
||||
|
||||
GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, Vertex* output) {
|
||||
GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, const Vertex* output) {
|
||||
const GLsizei uvstride = (UV_POINTER.stride) ? UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type);
|
||||
const void* uvptr = ((GLubyte*) UV_POINTER.ptr + (first * uvstride));
|
||||
|
||||
|
@ -643,7 +644,7 @@ GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, Vertex*
|
|||
}
|
||||
}
|
||||
|
||||
GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, VertexExtra* extra) {
|
||||
GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, const VertexExtra* extra) {
|
||||
const GLsizei ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type);
|
||||
const void* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride));
|
||||
|
||||
|
@ -657,7 +658,7 @@ GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, VertexE
|
|||
}
|
||||
}
|
||||
|
||||
GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, VertexExtra* extra) {
|
||||
GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, const VertexExtra* extra) {
|
||||
const GLsizei nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type);
|
||||
const void* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride));
|
||||
|
||||
|
@ -688,13 +689,13 @@ GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, Ver
|
|||
}
|
||||
}
|
||||
|
||||
GL_FORCE_INLINE void _readDiffuseData(const GLuint first, const GLuint count, Vertex* output) {
|
||||
GL_FORCE_INLINE void _readDiffuseData(const GLuint first, const GLuint count, const Vertex* output) {
|
||||
const GLuint size = (DIFFUSE_POINTER.size == GL_BGRA) ? 4 : DIFFUSE_POINTER.size;
|
||||
const GLuint cstride = (DIFFUSE_POINTER.stride) ? DIFFUSE_POINTER.stride : size * byte_size(DIFFUSE_POINTER.type);
|
||||
const GLubyte* cptr = ((GLubyte*) DIFFUSE_POINTER.ptr) + (first * cstride);
|
||||
|
||||
ReadDiffuseFunc func = calcReadDiffuseFunc();
|
||||
GLubyte* out = output[0].bgra;
|
||||
GLubyte* out = (GLubyte*) output[0].bgra;
|
||||
|
||||
ITERATE(count) {
|
||||
func(cptr, out);
|
||||
|
@ -764,48 +765,56 @@ static void generateElements(
|
|||
}
|
||||
}
|
||||
|
||||
static const uint32_t FAST_PATH_BYTE_SIZE = (sizeof(GLfloat) * 3) + (sizeof(GLfloat) * 2) + (sizeof(GLubyte) * 4);
|
||||
|
||||
/* Fast path for non-indexed draws: copies position, uv and colour for each
 * vertex in a single FAST_PATH_BYTE_SIZE copy starting at the vertex's xyz
 * field, then reads normals and ST coordinates through the regular readers.
 *
 * target - submission target whose vertex and extras storage is filled
 * first  - index of the first client vertex to read
 * count  - number of vertices to generate
 * type   - not used by this path */
static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first, const GLuint count, const GLenum type) {
    Vertex* it = _glSubmissionTargetStart(target);

    /* Start at vertex `first`, as the normal/ST reads below do; starting at
     * the base pointer would pair vertex 0's position with vertex `first`'s
     * normal whenever first != 0. */
    const GLubyte* pos = ((const GLubyte*) VERTEX_POINTER.ptr) + (first * VERTEX_POINTER.stride);

    ITERATE(count) {
        it->flags = GPU_CMD_VERTEX;
        /* Copy the pos, uv and color directly in one go */
        MEMCPY4(it->xyz, pos, FAST_PATH_BYTE_SIZE);
        it++;
        pos += VERTEX_POINTER.stride;
    }

    VertexExtra* ve = aligned_vector_at(target->extras, 0);

    _readNormalData(first, count, ve);
    _readSTData(first, count, ve);
}
|
||||
|
||||
/* General path for non-indexed draws: runs the per-attribute readers over
 * `count` vertices starting at client vertex `first`, tags every generated
 * vertex as GPU_CMD_VERTEX, then fills the extras with normals and ST
 * coordinates. `type` is not used here. */
static void generateArrays(SubmissionTarget* target, const GLsizei first, const GLuint count, const GLenum type) {
    Vertex* vertices = _glSubmissionTargetStart(target);

    _readPositionData(first, count, vertices);
    _readDiffuseData(first, count, vertices);
    _readUVData(first, count, vertices);

    Vertex* tagged = _glSubmissionTargetStart(target);
    for(GLuint n = 0; n < count; ++n) {
        tagged[n].flags = GPU_CMD_VERTEX;
    }

    VertexExtra* extras = aligned_vector_at(target->extras, 0);
    _readNormalData(first, count, extras);
    _readSTData(first, count, extras);
}
|
||||
|
||||
static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei first, const GLuint count,
|
||||
const GLubyte* indices, const GLenum type) {
|
||||
/* Read from the client buffers and generate an array of ClipVertices */
|
||||
TRACE();
|
||||
|
||||
static const uint32_t FAST_PATH_BYTE_SIZE = (sizeof(GLfloat) * 3) + (sizeof(GLfloat) * 2) + (sizeof(GLubyte) * 4);
|
||||
|
||||
if(!indices) {
|
||||
Vertex* start = _glSubmissionTargetStart(target);
|
||||
|
||||
if(FAST_PATH_ENABLED) {
|
||||
/* Copy the pos, uv and color directly in one go */
|
||||
const GLubyte* pos = VERTEX_POINTER.ptr;
|
||||
Vertex* it = start;
|
||||
ITERATE(count) {
|
||||
it->flags = GPU_CMD_VERTEX;
|
||||
MEMCPY4(it->xyz, pos, FAST_PATH_BYTE_SIZE);
|
||||
it++;
|
||||
pos += VERTEX_POINTER.stride;
|
||||
}
|
||||
} else {
|
||||
_readPositionData(first, count, start);
|
||||
_readDiffuseData(first, count, start);
|
||||
_readUVData(first, count, start);
|
||||
|
||||
Vertex* it = _glSubmissionTargetStart(target);
|
||||
|
||||
ITERATE(count) {
|
||||
it->flags = GPU_CMD_VERTEX;
|
||||
++it;
|
||||
}
|
||||
}
|
||||
|
||||
VertexExtra* ve = aligned_vector_at(target->extras, 0);
|
||||
|
||||
_readNormalData(first, count, ve);
|
||||
_readSTData(first, count, ve);
|
||||
|
||||
if(indices) {
|
||||
generateElements(target, first, count, indices, type);
|
||||
} else if(FAST_PATH_ENABLED) {
|
||||
generateArraysFastPath(target, first, count, type);
|
||||
} else {
|
||||
generateElements(
|
||||
target, first, count, indices, type
|
||||
);
|
||||
generateArrays(target, first, count, type);
|
||||
}
|
||||
|
||||
Vertex* it = _glSubmissionTargetStart(target);
|
||||
|
|
11
GL/flush.c
11
GL/flush.c
|
@ -1,7 +1,6 @@
|
|||
|
||||
#include "../containers/aligned_vector.h"
|
||||
#include "private.h"
|
||||
#include "profiler.h"
|
||||
|
||||
static PolyList OP_LIST;
|
||||
static PolyList PT_LIST;
|
||||
|
@ -93,8 +92,6 @@ void APIENTRY glKosSwapBuffers() {
|
|||
|
||||
TRACE();
|
||||
|
||||
profiler_push(__func__);
|
||||
|
||||
SceneBegin();
|
||||
SceneListBegin(GPU_LIST_OP_POLY);
|
||||
SceneListSubmit(OP_LIST.vector.data, OP_LIST.vector.size);
|
||||
|
@ -114,12 +111,4 @@ void APIENTRY glKosSwapBuffers() {
|
|||
aligned_vector_clear(&TR_LIST.vector);
|
||||
|
||||
_glApplyScissor(true);
|
||||
|
||||
profiler_checkpoint("scene");
|
||||
profiler_pop();
|
||||
|
||||
if(frame_count++ > 100) {
|
||||
profiler_print_stats();
|
||||
frame_count = 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,7 +10,6 @@
|
|||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "profiler.h"
|
||||
#include "private.h"
|
||||
|
||||
static GLboolean IMMEDIATE_MODE_ACTIVE = GL_FALSE;
|
||||
|
@ -266,8 +265,6 @@ void APIENTRY glNormal3fv(const GLfloat* v) {
|
|||
}
|
||||
|
||||
void APIENTRY glEnd() {
|
||||
profiler_push(__func__);
|
||||
|
||||
IMMEDIATE_MODE_ACTIVE = GL_FALSE;
|
||||
|
||||
/* Resizing could have invalidated these pointers */
|
||||
|
@ -333,9 +330,6 @@ void APIENTRY glEnd() {
|
|||
*nattr = nptr;
|
||||
*uattr = uvptr;
|
||||
*sattr = stptr;
|
||||
|
||||
profiler_checkpoint("restore");
|
||||
profiler_pop();
|
||||
}
|
||||
|
||||
void APIENTRY glRectf(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2) {
|
||||
|
|
|
@ -185,7 +185,7 @@ typedef struct {
|
|||
|
||||
|
||||
#define argbcpy(dst, src) \
|
||||
*((GLuint*) dst) = *((GLuint*) src) \
|
||||
*((GLuint*) dst) = *((const GLuint*) src) \
|
||||
|
||||
|
||||
typedef struct {
|
||||
|
|
145
GL/profiler.c
145
GL/profiler.c
|
@ -1,145 +0,0 @@
|
|||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#include "profiler.h"
|
||||
#include "../containers/aligned_vector.h"
|
||||
|
||||
#if PROFILING_COMPILED
|
||||
|
||||
#define MAX_PATH 256
|
||||
|
||||
typedef struct {
|
||||
char name[MAX_PATH];
|
||||
|
||||
uint64_t total_time_us;
|
||||
uint64_t total_calls;
|
||||
} ProfilerResult;
|
||||
|
||||
typedef struct {
|
||||
AlignedVector stack;
|
||||
AlignedVector results;
|
||||
uint64_t start_time_in_us;
|
||||
} RootProfiler;
|
||||
|
||||
|
||||
static RootProfiler* root = NULL;
|
||||
|
||||
static char PROFILER_ENABLED = 0;
|
||||
|
||||
void profiler_enable() {
|
||||
PROFILER_ENABLED = 1;
|
||||
}
|
||||
|
||||
void profiler_disable() {
|
||||
PROFILER_ENABLED = 0;
|
||||
}
|
||||
|
||||
static ProfilerResult* profiler_get_or_create_result(const char* name) {
|
||||
if(!PROFILER_ENABLED) return NULL;
|
||||
|
||||
uint16_t i = 0;
|
||||
for(; i < root->results.size; ++i) {
|
||||
ProfilerResult* result = aligned_vector_at(&root->results, i);
|
||||
if(strcmp(result->name, name) == 0) {
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
ProfilerResult newResult;
|
||||
strcpy(newResult.name, name);
|
||||
newResult.total_calls = 0;
|
||||
newResult.total_time_us = 0;
|
||||
aligned_vector_push_back(&root->results, &newResult, 1);
|
||||
return aligned_vector_back(&root->results);
|
||||
}
|
||||
|
||||
static uint64_t current_time_in_us() {
|
||||
return timer_us_gettime64();
|
||||
}
|
||||
|
||||
static void profiler_generate_path(const char* suffix, char* path) {
|
||||
uint16_t i = 0;
|
||||
for(; i < root->stack.size; ++i) {
|
||||
Profiler* prof = aligned_vector_at(&root->stack, i);
|
||||
strcat(path, prof->name);
|
||||
|
||||
if(i != root->stack.size - 1) {
|
||||
strcat(path, ".");
|
||||
}
|
||||
}
|
||||
|
||||
if(strlen(suffix)) {
|
||||
strcat(path, ":");
|
||||
strcat(path, suffix);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Profiler* profiler_push(const char* name) {
|
||||
if(!PROFILER_ENABLED) return NULL;
|
||||
|
||||
if(!root) {
|
||||
root = (RootProfiler*) malloc(sizeof(RootProfiler));
|
||||
aligned_vector_init(
|
||||
&root->stack,
|
||||
sizeof(Profiler)
|
||||
);
|
||||
|
||||
aligned_vector_init(
|
||||
&root->results,
|
||||
sizeof(ProfilerResult)
|
||||
);
|
||||
|
||||
aligned_vector_reserve(&root->stack, 32);
|
||||
aligned_vector_reserve(&root->results, 64);
|
||||
}
|
||||
|
||||
Profiler profiler;
|
||||
strncpy(profiler.name, name, 64);
|
||||
profiler.start_time_in_us = current_time_in_us();
|
||||
|
||||
aligned_vector_push_back(&root->stack, &profiler, 1);
|
||||
return aligned_vector_back(&root->stack);
|
||||
}
|
||||
|
||||
void profiler_checkpoint(const char* name) {
|
||||
if(!PROFILER_ENABLED) return;
|
||||
|
||||
Profiler* prof = aligned_vector_back(&root->stack);
|
||||
|
||||
char path[MAX_PATH];
|
||||
path[0] = '\0';
|
||||
|
||||
profiler_generate_path(name, path);
|
||||
|
||||
uint64_t now = current_time_in_us();
|
||||
uint64_t diff = now - prof->start_time_in_us;
|
||||
prof->start_time_in_us = now;
|
||||
|
||||
ProfilerResult* result = profiler_get_or_create_result(path);
|
||||
result->total_calls++;
|
||||
result->total_time_us += diff;
|
||||
}
|
||||
|
||||
void profiler_pop() {
|
||||
if(!PROFILER_ENABLED) return;
|
||||
|
||||
aligned_vector_resize(&root->stack, root->stack.size - 1);
|
||||
}
|
||||
|
||||
void profiler_print_stats() {
|
||||
if(!PROFILER_ENABLED) return;
|
||||
|
||||
fprintf(stderr, "%-60s%-20s%-20s%-20s\n", "Path", "Average", "Total", "Calls");
|
||||
|
||||
uint16_t i = 0;
|
||||
for(; i < root->results.size; ++i) {
|
||||
ProfilerResult* result = aligned_vector_at(&root->results, i);
|
||||
float ms = ((float) result->total_time_us) / 1000.0f;
|
||||
float avg = ms / (float) result->total_calls;
|
||||
|
||||
fprintf(stderr, "%-60s%-20f%-20f%" PRIu64 "\n", result->name, (double)avg, (double)ms, result->total_calls);
|
||||
}
|
||||
}
|
||||
#endif
|
|
@ -1,32 +0,0 @@
|
|||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
typedef struct {
|
||||
char name[64];
|
||||
uint64_t start_time_in_us;
|
||||
} Profiler;
|
||||
|
||||
#define PROFILING_COMPILED 0
|
||||
|
||||
#if PROFILING_COMPILED

/* Names match the definitions in profiler.c and the call sites, which use the
 * un-prefixed spelling. */
Profiler* profiler_push(const char* name);
void profiler_checkpoint(const char* name);
void profiler_pop();

void profiler_print_stats();

void profiler_enable();
void profiler_disable();

#else

/* No-op stand-ins. Each expands to an expression, so `profiler_pop();` stays a
 * single statement and remains safe inside an unbraced if/else. */
#define profiler_push(name) ((void) 0)
#define profiler_checkpoint(name) ((void) 0)
#define profiler_pop() ((void) 0)

#define profiler_print_stats() ((void) 0)

#define profiler_enable() ((void) 0)
#define profiler_disable() ((void) 0)

#endif
|
|
@ -37,13 +37,13 @@ static inline unsigned int round_to_chunk_size(unsigned int val) {
|
|||
return ((n + m - 1) / m) * m;
|
||||
}
|
||||
|
||||
void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) {
|
||||
void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) {
|
||||
if(element_count == 0) {
|
||||
return;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if(element_count <= vector->capacity) {
|
||||
return;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
unsigned int original_byte_size = vector->size * vector->element_size;
|
||||
|
@ -63,6 +63,8 @@ void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) {
|
|||
}
|
||||
|
||||
vector->capacity = element_count;
|
||||
|
||||
return vector->data + original_byte_size;
|
||||
}
|
||||
|
||||
void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count) {
|
||||
|
@ -84,6 +86,8 @@ void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned
|
|||
}
|
||||
|
||||
void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count) {
|
||||
void* ret = NULL;
|
||||
|
||||
unsigned int previousCount = vector->size;
|
||||
|
||||
/* Don't change memory when resizing downwards, just change the size */
|
||||
|
@ -93,13 +97,15 @@ void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_co
|
|||
}
|
||||
|
||||
if(vector->capacity < element_count) {
|
||||
aligned_vector_reserve(vector, element_count);
|
||||
ret = aligned_vector_reserve(vector, element_count);
|
||||
vector->size = element_count;
|
||||
} else if(previousCount < element_count) {
|
||||
vector->size = element_count;
|
||||
ret = aligned_vector_at(vector, previousCount);
|
||||
}
|
||||
|
||||
vector->size = element_count;
|
||||
|
||||
if(previousCount < vector->size) {
|
||||
return aligned_vector_at(vector, previousCount);
|
||||
return ret;
|
||||
} else {
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -17,7 +17,7 @@ typedef struct {
|
|||
#define ALIGNED_VECTOR_CHUNK_SIZE 256u
|
||||
|
||||
void aligned_vector_init(AlignedVector* vector, unsigned int element_size);
|
||||
void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
|
||||
void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count);
|
||||
void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count);
|
||||
void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count);
|
||||
static inline void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) {
|
||||
|
|
449
samples/profiler.c
Normal file
449
samples/profiler.c
Normal file
|
@ -0,0 +1,449 @@
|
|||
#include <stdbool.h>
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <dirent.h>
|
||||
|
||||
#include <kos/thread.h>
|
||||
#include <dc/fs_dcload.h>
|
||||
|
||||
static char OUTPUT_FILENAME[128];
|
||||
static kthread_t* THREAD;
|
||||
static volatile bool PROFILER_RUNNING = false;
|
||||
static volatile bool PROFILER_RECORDING = false;
|
||||
|
||||
#define BASE_ADDRESS 0x8c010000
|
||||
#define BUCKET_SIZE 10000
|
||||
|
||||
#define INTERVAL_IN_MS 10
|
||||
|
||||
/* Simple hash table of samples. An array of Samples
|
||||
* but, each sample in that array can be the head of
|
||||
* a linked list of other samples */
|
||||
typedef struct Arc {
|
||||
uint32_t pc;
|
||||
uint32_t pr; // Caller return address
|
||||
uint32_t count;
|
||||
struct Arc* next;
|
||||
} Arc;
|
||||
|
||||
static Arc ARCS[BUCKET_SIZE];
|
||||
|
||||
/* Hashing function for two uint32_ts */
|
||||
#define HASH_PAIR(x, y) ((x * 0x1f1f1f1f) ^ y)
|
||||
|
||||
#define BUFFER_SIZE (1024 * 8) // 8K buffer
|
||||
|
||||
const static size_t MAX_ARC_COUNT = BUFFER_SIZE / sizeof(Arc);
|
||||
static size_t ARC_COUNT = 0;
|
||||
|
||||
static bool WRITE_TO_STDOUT = false;
|
||||
|
||||
static bool write_samples(const char* path);
|
||||
static bool write_samples_to_stdout();
|
||||
static void clear_samples();
|
||||
|
||||
static Arc* new_arc(uint32_t PC, uint32_t PR) {
|
||||
Arc* s = (Arc*) malloc(sizeof(Arc));
|
||||
s->count = 1;
|
||||
s->pc = PC;
|
||||
s->pr = PR;
|
||||
s->next = NULL;
|
||||
|
||||
++ARC_COUNT;
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
static void record_thread(uint32_t PC, uint32_t PR) {
|
||||
uint32_t bucket = HASH_PAIR(PC, PR) % BUCKET_SIZE;
|
||||
|
||||
Arc* s = &ARCS[bucket];
|
||||
|
||||
if(s->pc) {
|
||||
/* Initialized sample in this bucket,
|
||||
* does it match though? */
|
||||
while(s->pc != PC || s->pr != PR) {
|
||||
if(s->next) {
|
||||
s = s->next;
|
||||
} else {
|
||||
s->next = new_arc(PC, PR);
|
||||
return; // We're done
|
||||
}
|
||||
}
|
||||
|
||||
s->count++;
|
||||
} else {
|
||||
/* Initialize this sample */
|
||||
s->count = 1;
|
||||
s->pc = PC;
|
||||
s->pr = PR;
|
||||
s->next = NULL;
|
||||
++ARC_COUNT;
|
||||
}
|
||||
}
|
||||
|
||||
/* Per-thread callback handed to thd_each. Records the saved program counter
 * and return address of the thread labelled "[kernel]" (the main thread, which
 * is the only one sampled for now); every other thread is ignored.
 * Always returns 0. */
static int thd_each_cb(kthread_t* thd, void* data) {
    (void) data;

    if(strcmp(thd->label, "[kernel]") == 0) {
        /* While this code runs in the profiling thread, the PCs stored for
         * the other threads are current. Obviously the PC changes between
         * iterations, so this is not a true snapshot in time across threads. */
        record_thread(thd->context.pc, thd->context.pr);
    }

    return 0;
}
|
||||
|
||||
static void record_samples() {
|
||||
/* Go through all the active threads and increase
|
||||
* the sample count for the PC for each of them */
|
||||
|
||||
size_t initial = ARC_COUNT;
|
||||
|
||||
/* Note: This is a function added to kallistios-nitro that's
|
||||
* not yet available upstream */
|
||||
thd_each(&thd_each_cb, NULL);
|
||||
|
||||
if(ARC_COUNT >= MAX_ARC_COUNT) {
|
||||
/* TIME TO FLUSH! */
|
||||
if(!write_samples(OUTPUT_FILENAME)) {
|
||||
fprintf(stderr, "Error writing samples\n");
|
||||
}
|
||||
}
|
||||
|
||||
/* We log when the number of PCs recorded hits a certain increment */
|
||||
if((initial != ARC_COUNT) && ((ARC_COUNT % 1000) == 0)) {
|
||||
printf("-- %d arcs recorded...\n", ARC_COUNT);
|
||||
}
|
||||
}
|
||||
|
||||
/* Declared in KOS in fs_dcload.c */
|
||||
int fs_dcload_detected();
|
||||
extern int dcload_type;
|
||||
|
||||
|
||||
#define GMON_COOKIE "gmon"
|
||||
#define GMON_VERSION 1
|
||||
|
||||
typedef struct {
|
||||
char cookie[4]; // 'g','m','o','n'
|
||||
int32_t version; // 1
|
||||
char spare[3 * 4]; // Padding
|
||||
} GmonHeader;
|
||||
|
||||
typedef struct {
|
||||
uint32_t low_pc;
|
||||
uint32_t high_pc;
|
||||
uint32_t hist_size;
|
||||
uint32_t prof_rate;
|
||||
char dimen[15]; /* phys. dim., usually "seconds" */
|
||||
char dimen_abbrev; /* usually 's' for "seconds" */
|
||||
} GmonHistHeader;
|
||||
|
||||
typedef struct {
|
||||
unsigned char tag; // GMON_TAG_TIME_HIST = 0, GMON_TAG_CG_ARC = 1, GMON_TAG_BB_COUNT = 2
|
||||
size_t ncounts; // Number of address/count pairs in this sequence
|
||||
} GmonBBHeader;
|
||||
|
||||
typedef struct {
|
||||
uint32_t from_pc; /* address within caller's body */
|
||||
uint32_t self_pc; /* address within callee's body */
|
||||
uint32_t count; /* number of arc traversals */
|
||||
} GmonArc;
|
||||
|
||||
static bool init_sample_file(const char* path) {
|
||||
printf("Detecting dcload... ");
|
||||
|
||||
if(!fs_dcload_detected() && dcload_type != DCLOAD_TYPE_NONE) {
|
||||
printf("[Not Found]\n");
|
||||
WRITE_TO_STDOUT = true;
|
||||
return false;
|
||||
} else {
|
||||
printf("[Found]\n");
|
||||
}
|
||||
|
||||
FILE* out = fopen(path, "w");
|
||||
if(!out) {
|
||||
WRITE_TO_STDOUT = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Write the GMON header */
|
||||
|
||||
GmonHeader header;
|
||||
memcpy(&header.cookie[0], GMON_COOKIE, sizeof(header.cookie));
|
||||
header.version = 1;
|
||||
memset(header.spare, '\0', sizeof(header.spare));
|
||||
|
||||
fwrite(&header, sizeof(header), 1, out);
|
||||
|
||||
fclose(out);
|
||||
return true;
|
||||
}
|
||||
|
||||
#define ROUNDDOWN(x,y) (((x)/(y))*(y))
|
||||
#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y))
|
||||
|
||||
static bool write_samples(const char* path) {
|
||||
/* Appends the samples to the output file in gmon format
|
||||
*
|
||||
* We iterate the data twice, first generating arcs, then generating
|
||||
* basic block counts. While we do that though we calculate the data
|
||||
* for the histogram so we don't need a third iteration */
|
||||
|
||||
if(WRITE_TO_STDOUT) {
|
||||
write_samples_to_stdout();
|
||||
return true;
|
||||
}
|
||||
|
||||
extern char _etext;
|
||||
|
||||
const uint32_t HISTFRACTION = 8;
|
||||
|
||||
/* We know the lowest address, it's the same for all DC games */
|
||||
uint32_t lowest_address = ROUNDDOWN(BASE_ADDRESS, HISTFRACTION);
|
||||
|
||||
/* We need to calculate the highest address though */
|
||||
uint32_t highest_address = ROUNDUP((uint32_t) &_etext, HISTFRACTION);
|
||||
|
||||
/* Histogram data */
|
||||
const int BIN_COUNT = ((highest_address - lowest_address) / HISTFRACTION);
|
||||
uint16_t* bins = (uint16_t*) malloc(BIN_COUNT * sizeof(uint16_t));
|
||||
memset(bins, 0, sizeof(uint16_t) * BIN_COUNT);
|
||||
|
||||
FILE* out = fopen(path, "r+"); /* Append, as init_sample_file would have created the file */
|
||||
if(!out) {
|
||||
fprintf(stderr, "-- Error writing samples to output file\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Seek to the end of the file
|
||||
fseek(out, 0, SEEK_END);
|
||||
|
||||
printf("-- Writing %d arcs\n", ARC_COUNT);
|
||||
|
||||
uint8_t tag = 1;
|
||||
|
||||
#ifndef NDEBUG
|
||||
size_t written = 0;
|
||||
#endif
|
||||
|
||||
/* Write arcs */
|
||||
Arc* root = ARCS;
|
||||
for(int i = 0; i < BUCKET_SIZE; ++i) {
|
||||
if(root->pc) {
|
||||
GmonArc arc;
|
||||
arc.from_pc = root->pr;
|
||||
arc.self_pc = root->pc;
|
||||
arc.count = root->count;
|
||||
|
||||
/* Write the root sample if it has a program counter */
|
||||
fwrite(&tag, sizeof(tag), 1, out);
|
||||
fwrite(&arc, sizeof(GmonArc), 1, out);
|
||||
|
||||
#ifndef NDEBUG
|
||||
++written;
|
||||
#endif
|
||||
|
||||
/* If there's a next pointer, traverse the list */
|
||||
Arc* s = root->next;
|
||||
while(s) {
|
||||
arc.from_pc = s->pr;
|
||||
arc.self_pc = s->pc;
|
||||
arc.count = s->count;
|
||||
|
||||
/* Write the root sample if it has a program counter */
|
||||
fwrite(&tag, sizeof(tag), 1, out);
|
||||
fwrite(&arc, sizeof(GmonArc), 1, out);
|
||||
|
||||
#ifndef NDEBUG
|
||||
++written;
|
||||
#endif
|
||||
|
||||
s = s->next;
|
||||
}
|
||||
}
|
||||
|
||||
root++;
|
||||
}
|
||||
|
||||
uint32_t histogram_range = highest_address - lowest_address;
|
||||
uint32_t bin_size = histogram_range / BIN_COUNT;
|
||||
|
||||
root = ARCS;
|
||||
for(int i = 0; i < BUCKET_SIZE; ++i) {
|
||||
if(root->pc) {
|
||||
printf("Incrementing %d for %x. ", (root->pc - lowest_address) / bin_size, (unsigned int) root->pc);
|
||||
bins[(root->pc - lowest_address) / bin_size]++;
|
||||
printf("Now: %d\n", (int) bins[(root->pc - lowest_address) / bin_size]);
|
||||
|
||||
/* If there's a next pointer, traverse the list */
|
||||
Arc* s = root->next;
|
||||
while(s) {
|
||||
assert(s->pc);
|
||||
bins[(s->pc - lowest_address) / bin_size]++;
|
||||
s = s->next;
|
||||
}
|
||||
}
|
||||
|
||||
root++;
|
||||
}
|
||||
|
||||
|
||||
/* Write histogram now that we have all the information we need */
|
||||
GmonHistHeader hist_header;
|
||||
hist_header.low_pc = lowest_address;
|
||||
hist_header.high_pc = highest_address;
|
||||
hist_header.hist_size = BIN_COUNT;
|
||||
hist_header.prof_rate = INTERVAL_IN_MS;
|
||||
strcpy(hist_header.dimen, "seconds");
|
||||
hist_header.dimen_abbrev = 's';
|
||||
|
||||
unsigned char hist_tag = 0;
|
||||
fwrite(&hist_tag, sizeof(hist_tag), 1, out);
|
||||
fwrite(&hist_header, sizeof(hist_header), 1, out);
|
||||
fwrite(bins, sizeof(uint16_t), BIN_COUNT, out);
|
||||
|
||||
fclose(out);
|
||||
free(bins);
|
||||
|
||||
/* We should have written all the recorded samples */
|
||||
assert(written == ARC_COUNT);
|
||||
|
||||
clear_samples();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool write_samples_to_stdout() {
|
||||
/* Write samples to stdout as a CSV file
|
||||
* for processing */
|
||||
|
||||
printf("--------------\n");
|
||||
printf("\"PC\", \"PR\", \"COUNT\"\n");
|
||||
|
||||
Arc* root = ARCS;
|
||||
for(int i = 0; i < BUCKET_SIZE; ++i) {
|
||||
Arc* s = root;
|
||||
while(s->next) {
|
||||
printf("\"%x\", \"%x\", \"%d\"\n", (unsigned int) s->pc, (unsigned int) s->pr, (unsigned int) s->count);
|
||||
s = s->next;
|
||||
}
|
||||
|
||||
root++;
|
||||
}
|
||||
|
||||
printf("--------------\n");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
static void* run(void* args) {
|
||||
printf("-- Entered profiler thread!\n");
|
||||
|
||||
while(PROFILER_RUNNING){
|
||||
if(PROFILER_RECORDING) {
|
||||
record_samples();
|
||||
usleep(INTERVAL_IN_MS * 1000); //usleep takes milliseconds
|
||||
}
|
||||
}
|
||||
|
||||
printf("-- Profiler thread finished!\n");
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
void profiler_init(const char* output) {
|
||||
/* Store the filename */
|
||||
strncpy(OUTPUT_FILENAME, output, sizeof(OUTPUT_FILENAME));
|
||||
|
||||
/* Initialize the file */
|
||||
printf("Creating samples file...\n");
|
||||
if(!init_sample_file(OUTPUT_FILENAME)) {
|
||||
printf("Read-only filesytem. Writing samples to stdout\n");
|
||||
}
|
||||
|
||||
printf("Creating profiler thread...\n");
|
||||
// Initialize the samples to zero
|
||||
memset(ARCS, 0, sizeof(ARCS));
|
||||
|
||||
PROFILER_RUNNING = true;
|
||||
THREAD = thd_create(0, run, NULL);
|
||||
|
||||
/* Lower priority is... er, higher */
|
||||
thd_set_prio(THREAD, PRIO_DEFAULT / 2);
|
||||
|
||||
printf("Thread started.\n");
|
||||
}
|
||||
|
||||
void profiler_start() {
|
||||
assert(PROFILER_RUNNING);
|
||||
|
||||
if(PROFILER_RECORDING) {
|
||||
return;
|
||||
}
|
||||
|
||||
PROFILER_RECORDING = true;
|
||||
printf("Starting profiling...\n");
|
||||
}
|
||||
|
||||
static void clear_samples() {
|
||||
/* Free the samples we've collected to start again */
|
||||
Arc* root = ARCS;
|
||||
for(int i = 0; i < BUCKET_SIZE; ++i) {
|
||||
Arc* s = root;
|
||||
Arc* next = s->next;
|
||||
|
||||
// While we have a next pointer
|
||||
while(next) {
|
||||
s = next; // Point S at it
|
||||
next = s->next; // Store the new next pointer
|
||||
free(s); // Free S
|
||||
}
|
||||
|
||||
// We've wiped the chain so we can now clear the root
|
||||
// which is statically allocated
|
||||
root->next = NULL;
|
||||
root++;
|
||||
}
|
||||
|
||||
// Wipe the lot
|
||||
memset(ARCS, 0, sizeof(ARCS));
|
||||
ARC_COUNT = 0;
|
||||
}
|
||||
|
||||
bool profiler_stop() {
|
||||
if(!PROFILER_RECORDING) {
|
||||
return false;
|
||||
}
|
||||
|
||||
printf("Stopping profiling...\n");
|
||||
|
||||
PROFILER_RECORDING = false;
|
||||
if(!write_samples(OUTPUT_FILENAME)) {
|
||||
printf("ERROR WRITING SAMPLES (RO filesystem?)! Outputting to stdout\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void profiler_clean_up() {
|
||||
profiler_stop(); // Make sure everything is stopped
|
||||
|
||||
PROFILER_RUNNING = false;
|
||||
thd_join(THREAD, NULL);
|
||||
}
|
18
samples/profiler.h
Normal file
18
samples/profiler.h
Normal file
|
@ -0,0 +1,18 @@
|
|||
#pragma once
|
||||
|
||||
/*
|
||||
* The Dreamcast doesn't have any kind of profiling support from GCC
|
||||
* so this is a cumbersome sampling profiler that runs in a background thread
|
||||
*/
|
||||
#include <stdbool.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Sets up the profiler; `output` is the path of the sample file to write */
void profiler_init(const char* output);

/* Starts recording samples */
void profiler_start();

/* Stops recording and flushes the samples. The definition returns bool, so
 * the prototype must as well; callers may ignore the result. */
bool profiler_stop();

/* Stops recording and shuts the profiler thread down */
void profiler_clean_up();

#ifdef __cplusplus
}
#endif
|
|
@ -9,11 +9,13 @@
|
|||
|
||||
#ifdef __DREAMCAST__
|
||||
#include <kos.h>
|
||||
#include "../profiler.h"
|
||||
#endif
|
||||
|
||||
#include <GL/gl.h>
|
||||
#include <GL/glkos.h>
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <time.h>
|
||||
|
@ -162,14 +164,23 @@ void check_switch() {
|
|||
}
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
#ifndef NDEBUG
|
||||
#ifdef __DREAMCAST__
|
||||
profiler_init("/pc/gmon.out");
|
||||
profiler_start();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
setup();
|
||||
|
||||
/* Start off with something obscene */
|
||||
switch_tests(200000 / 60);
|
||||
start = time(NULL);
|
||||
|
||||
uint32_t iterations = 2000;
|
||||
|
||||
for(;;) {
|
||||
if(check_start())
|
||||
if(check_start() || iterations-- == 0)
|
||||
break;
|
||||
|
||||
printf(" \r");
|
||||
|
@ -180,7 +191,12 @@ int main(int argc, char **argv) {
|
|||
|
||||
stats();
|
||||
|
||||
#ifdef __DREAMCAST__
|
||||
#ifndef NDEBUG
|
||||
profiler_stop();
|
||||
profiler_clean_up();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user