From ea25251944659c928336b36d5ddc9dc4ab95019c Mon Sep 17 00:00:00 2001 From: Luke Benstead Date: Tue, 20 Apr 2021 16:08:58 +0100 Subject: [PATCH] Optimisations --- CMakeLists.txt | 6 +- GL/clip.c | 1 - GL/draw.c | 113 ++++----- GL/flush.c | 11 - GL/immediate.c | 6 - GL/private.h | 2 +- GL/profiler.c | 145 ------------ GL/profiler.h | 32 --- containers/aligned_vector.c | 20 +- containers/aligned_vector.h | 2 +- samples/profiler.c | 449 ++++++++++++++++++++++++++++++++++++ samples/profiler.h | 18 ++ samples/quadmark/main.c | 22 +- 13 files changed, 566 insertions(+), 261 deletions(-) delete mode 100644 GL/profiler.c delete mode 100644 GL/profiler.h create mode 100644 samples/profiler.c create mode 100644 samples/profiler.h diff --git a/CMakeLists.txt b/CMakeLists.txt index b181ed2..d39a310 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,7 +42,6 @@ set( GL/immediate.c GL/lighting.c GL/matrix.c - GL/profiler.c GL/state.c GL/texture.c GL/util.c @@ -143,8 +142,11 @@ gen_sample(zclip_triangle samples/zclip_triangle/main.c) gen_sample(zclip_trianglestrip samples/zclip_trianglestrip/main.c) gen_sample(scissor samples/scissor/main.c) gen_sample(polymark samples/polymark/main.c) -gen_sample(quadmark samples/quadmark/main.c) + if(PLATFORM_DREAMCAST) gen_sample(trimark samples/trimark/main.c) + gen_sample(quadmark samples/quadmark/main.c samples/profiler.c) +else() + gen_sample(quadmark samples/quadmark/main.c) endif() diff --git a/GL/clip.c b/GL/clip.c index 259cac1..4573afe 100644 --- a/GL/clip.c +++ b/GL/clip.c @@ -11,7 +11,6 @@ #define PVR_PACK_COLOR(a, r, g, b) {} #endif -#include "profiler.h" #include "private.h" #include "../containers/aligned_vector.h" diff --git a/GL/draw.c b/GL/draw.c index ee94b10..ab3ade0 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -6,7 +6,6 @@ #include #include "private.h" -#include "profiler.h" #include "platform.h" static AttribPointer VERTEX_POINTER; @@ -255,14 +254,14 @@ static void _readVertexData3fARGB(const GLubyte* in, GLubyte* output) { output[A8IDX] = 1.0f; } -static void _readVertexData3ubARGB(const GLubyte* input, GLubyte* output) { +static void _readVertexData3ubARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) { output[R8IDX] = input[0]; output[G8IDX] = input[1]; output[B8IDX] = input[2]; output[A8IDX] = 1.0f; } -static void _readVertexData4ubRevARGB(const GLubyte* input, GLubyte* output) { +static void _readVertexData4ubRevARGB(const GLubyte* __restrict__ input, GLubyte* __restrict__ output) { argbcpy(output, input); } @@ -275,12 +274,16 @@ static void _readVertexData4fRevARGB(const GLubyte* in, GLubyte* output) { output[3] = (GLubyte) clamp(input[3] * 255.0f, 0, 255); } -static void _fillWithNegZVE(const GLubyte* input, GLubyte* out) { +static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restrict__ out) { _GL_UNUSED(input); - float* output = (float*) out; - output[0] = output[1] = 0.0f; - output[2] = -1.0f; + typedef struct { + float x, y, z; + } V; + + const static V NegZ = {0.0f, 0.0f, -1.0f}; + + *((V*) out) = NegZ; } static void _fillWhiteARGB(const GLubyte* input, GLubyte* output) { @@ -290,9 +293,7 @@ static void _fillWhiteARGB(const GLubyte* input, GLubyte* output) { static void _fillZero2f(const GLubyte* input, GLubyte* out) { _GL_UNUSED(input); - - float* output = (float*) out; - output[0] = output[1] = 0.0f; + memset(out, sizeof(float) * 2, 0); } static void _readVertexData3usARGB(const GLubyte* input, GLubyte* output) { @@ -615,7 +616,7 @@ ReadNormalFunc calcReadNormalFunc() { } } -GL_FORCE_INLINE void _readPositionData(const GLuint first, const GLuint count, Vertex* output) { +GL_FORCE_INLINE void _readPositionData(const GLuint first, const GLuint count, const Vertex* output) { const GLsizei vstride = (VERTEX_POINTER.stride) ? VERTEX_POINTER.stride : VERTEX_POINTER.size * byte_size(VERTEX_POINTER.type); const void* vptr = ((GLubyte*) VERTEX_POINTER.ptr + (first * vstride)); @@ -629,7 +630,7 @@ GL_FORCE_INLINE void _readPositionData(const GLuint first, const GLuint count, V } } -GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, Vertex* output) { +GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, const Vertex* output) { const GLsizei uvstride = (UV_POINTER.stride) ? UV_POINTER.stride : UV_POINTER.size * byte_size(UV_POINTER.type); const void* uvptr = ((GLubyte*) UV_POINTER.ptr + (first * uvstride)); @@ -643,7 +644,7 @@ GL_FORCE_INLINE void _readUVData(const GLuint first, const GLuint count, Vertex* } } -GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, VertexExtra* extra) { +GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, const VertexExtra* extra) { const GLsizei ststride = (ST_POINTER.stride) ? ST_POINTER.stride : ST_POINTER.size * byte_size(ST_POINTER.type); const void* stptr = ((GLubyte*) ST_POINTER.ptr + (first * ststride)); @@ -657,7 +658,7 @@ GL_FORCE_INLINE void _readSTData(const GLuint first, const GLuint count, VertexE } } -GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, VertexExtra* extra) { +GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, const VertexExtra* extra) { const GLsizei nstride = (NORMAL_POINTER.stride) ? NORMAL_POINTER.stride : NORMAL_POINTER.size * byte_size(NORMAL_POINTER.type); const void* nptr = ((GLubyte*) NORMAL_POINTER.ptr + (first * nstride)); @@ -688,13 +689,13 @@ GL_FORCE_INLINE void _readNormalData(const GLuint first, const GLuint count, Ver } } -GL_FORCE_INLINE void _readDiffuseData(const GLuint first, const GLuint count, Vertex* output) { +GL_FORCE_INLINE void _readDiffuseData(const GLuint first, const GLuint count, const Vertex* output) { const GLuint size = (DIFFUSE_POINTER.size == GL_BGRA) ? 4 : DIFFUSE_POINTER.size; const GLuint cstride = (DIFFUSE_POINTER.stride) ? DIFFUSE_POINTER.stride : size * byte_size(DIFFUSE_POINTER.type); const GLubyte* cptr = ((GLubyte*) DIFFUSE_POINTER.ptr) + (first * cstride); ReadDiffuseFunc func = calcReadDiffuseFunc(); - GLubyte* out = output[0].bgra; + GLubyte* out = (GLubyte*) output[0].bgra; ITERATE(count) { func(cptr, out); @@ -764,48 +765,56 @@ static void generateElements( } } +static const uint32_t FAST_PATH_BYTE_SIZE = (sizeof(GLfloat) * 3) + (sizeof(GLfloat) * 2) + (sizeof(GLubyte) * 4); + +static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first, const GLuint count, const GLenum type) { + Vertex* start = _glSubmissionTargetStart(target); + /* Copy the pos, uv and color directly in one go */ + const GLubyte* pos = VERTEX_POINTER.ptr; + Vertex* it = start; + ITERATE(count) { + it->flags = GPU_CMD_VERTEX; + MEMCPY4(it->xyz, pos, FAST_PATH_BYTE_SIZE); + it++; + pos += VERTEX_POINTER.stride; + } + + VertexExtra* ve = aligned_vector_at(target->extras, 0); + + _readNormalData(first, count, ve); + _readSTData(first, count, ve); +} + +static void generateArrays(SubmissionTarget* target, const GLsizei first, const GLuint count, const GLenum type) { + Vertex* start = _glSubmissionTargetStart(target); + _readPositionData(first, count, start); + _readDiffuseData(first, count, start); + _readUVData(first, count, start); + + Vertex* it = _glSubmissionTargetStart(target); + + ITERATE(count) { + it->flags = GPU_CMD_VERTEX; + ++it; + } + + VertexExtra* ve = aligned_vector_at(target->extras, 0); + + _readNormalData(first, count, ve); + _readSTData(first, count, ve); +} + static void generate(SubmissionTarget* target, const GLenum mode, const GLsizei first, const GLuint count, const GLubyte* indices, const GLenum type) { /* Read from the client buffers and generate an array of ClipVertices */ TRACE(); - static const uint32_t FAST_PATH_BYTE_SIZE = (sizeof(GLfloat) * 3) + (sizeof(GLfloat) * 2) + (sizeof(GLubyte) * 4); - - if(!indices) { - Vertex* start = _glSubmissionTargetStart(target); - - if(FAST_PATH_ENABLED) { - /* Copy the pos, uv and color directly in one go */ - const GLubyte* pos = VERTEX_POINTER.ptr; - Vertex* it = start; - ITERATE(count) { - it->flags = GPU_CMD_VERTEX; - MEMCPY4(it->xyz, pos, FAST_PATH_BYTE_SIZE); - it++; - pos += VERTEX_POINTER.stride; - } - } else { - _readPositionData(first, count, start); - _readDiffuseData(first, count, start); - _readUVData(first, count, start); - - Vertex* it = _glSubmissionTargetStart(target); - - ITERATE(count) { - it->flags = GPU_CMD_VERTEX; - ++it; - } - } - - VertexExtra* ve = aligned_vector_at(target->extras, 0); - - _readNormalData(first, count, ve); - _readSTData(first, count, ve); - + if(indices) { + generateElements(target, first, count, indices, type); + } else if(FAST_PATH_ENABLED) { + generateArraysFastPath(target, first, count, type); } else { - generateElements( - target, first, count, indices, type - ); + generateArrays(target, first, count, type); } Vertex* it = _glSubmissionTargetStart(target); diff --git a/GL/flush.c b/GL/flush.c index 197dee9..a2cc7a3 100644 --- a/GL/flush.c +++ b/GL/flush.c @@ -1,7 +1,6 @@ #include "../containers/aligned_vector.h" #include "private.h" -#include "profiler.h" static PolyList OP_LIST; static PolyList PT_LIST; @@ -93,8 +92,6 @@ void APIENTRY glKosSwapBuffers() { TRACE(); - profiler_push(__func__); - SceneBegin(); SceneListBegin(GPU_LIST_OP_POLY); SceneListSubmit(OP_LIST.vector.data, OP_LIST.vector.size); @@ -114,12 +111,4 @@ void APIENTRY glKosSwapBuffers() { aligned_vector_clear(&TR_LIST.vector); _glApplyScissor(true); - - profiler_checkpoint("scene"); - profiler_pop(); - - if(frame_count++ > 100) { - profiler_print_stats(); - frame_count = 0; - } } diff --git a/GL/immediate.c b/GL/immediate.c index e6c14fe..cb152b6 100644 --- a/GL/immediate.c +++ b/GL/immediate.c @@ -10,7 +10,6 @@ #include #include -#include "profiler.h" #include "private.h" static GLboolean IMMEDIATE_MODE_ACTIVE = GL_FALSE; @@ -266,8 +265,6 @@ void APIENTRY glNormal3fv(const GLfloat* v) { } void APIENTRY glEnd() { - profiler_push(__func__); - IMMEDIATE_MODE_ACTIVE = GL_FALSE; /* Resizing could have invalidated these pointers */ @@ -333,9 +330,6 @@ void APIENTRY glEnd() { *nattr = nptr; *uattr = uvptr; *sattr = stptr; - - profiler_checkpoint("restore"); - profiler_pop(); } void APIENTRY glRectf(GLfloat x1, GLfloat y1, GLfloat x2, GLfloat y2) { diff --git a/GL/private.h b/GL/private.h index 1f2c0d1..f9cd113 100644 --- a/GL/private.h +++ b/GL/private.h @@ -185,7 +185,7 @@ typedef struct { #define argbcpy(dst, src) \ - *((GLuint*) dst) = *((GLuint*) src) \ + *((GLuint*) dst) = *((const GLuint*) src) \ typedef struct { diff --git a/GL/profiler.c b/GL/profiler.c deleted file mode 100644 index 6a27981..0000000 --- a/GL/profiler.c +++ /dev/null @@ -1,145 +0,0 @@ -#include -#include -#include - -#include "profiler.h" -#include "../containers/aligned_vector.h" - -#if PROFILING_COMPILED - -#define MAX_PATH 256 - -typedef struct { - char name[MAX_PATH]; - - uint64_t total_time_us; - uint64_t total_calls; -} ProfilerResult; - -typedef struct { - AlignedVector stack; - AlignedVector results; - uint64_t start_time_in_us; -} RootProfiler; - - -static RootProfiler* root = NULL; - -static char PROFILER_ENABLED = 0; - -void profiler_enable() { - PROFILER_ENABLED = 1; -} - -void profiler_disable() { - PROFILER_ENABLED = 0; -} - -static ProfilerResult* profiler_get_or_create_result(const char* name) { - if(!PROFILER_ENABLED) return NULL; - - uint16_t i = 0; - for(; i < root->results.size; ++i) { - ProfilerResult* result = aligned_vector_at(&root->results, i); - if(strcmp(result->name, name) == 0) { - return result; - } - } - - ProfilerResult newResult; - strcpy(newResult.name, name); - newResult.total_calls = 0; - newResult.total_time_us = 0; - aligned_vector_push_back(&root->results, &newResult, 1); - return aligned_vector_back(&root->results); -} - -static uint64_t current_time_in_us() { - return timer_us_gettime64(); -} - -static void profiler_generate_path(const char* suffix, char* path) { - uint16_t i = 0; - for(; i < root->stack.size; ++i) { - Profiler* prof = aligned_vector_at(&root->stack, i); - strcat(path, prof->name); - - if(i != root->stack.size - 1) { - strcat(path, "."); - } - } - - if(strlen(suffix)) { - strcat(path, ":"); - strcat(path, suffix); - } -} - - -Profiler* profiler_push(const char* name) { - if(!PROFILER_ENABLED) return NULL; - - if(!root) { - root = (RootProfiler*) malloc(sizeof(RootProfiler)); - aligned_vector_init( - &root->stack, - sizeof(Profiler) - ); - - aligned_vector_init( - &root->results, - sizeof(ProfilerResult) - ); - - aligned_vector_reserve(&root->stack, 32); - aligned_vector_reserve(&root->results, 64); - } - - Profiler profiler; - strncpy(profiler.name, name, 64); - profiler.start_time_in_us = current_time_in_us(); - - aligned_vector_push_back(&root->stack, &profiler, 1); - return aligned_vector_back(&root->stack); -} - -void profiler_checkpoint(const char* name) { - if(!PROFILER_ENABLED) return; - - Profiler* prof = aligned_vector_back(&root->stack); - - char path[MAX_PATH]; - path[0] = '\0'; - - profiler_generate_path(name, path); - - uint64_t now = current_time_in_us(); - uint64_t diff = now - prof->start_time_in_us; - prof->start_time_in_us = now; - - ProfilerResult* result = profiler_get_or_create_result(path); - result->total_calls++; - result->total_time_us += diff; -} - -void profiler_pop() { - if(!PROFILER_ENABLED) return; - - aligned_vector_resize(&root->stack, root->stack.size - 1); -} - -void profiler_print_stats() { - if(!PROFILER_ENABLED) return; - - fprintf(stderr, "%-60s%-20s%-20s%-20s\n", "Path", "Average", "Total", "Calls"); - - uint16_t i = 0; - for(; i < root->results.size; ++i) { - ProfilerResult* result = aligned_vector_at(&root->results, i); - float ms = ((float) result->total_time_us) / 1000.0f; - float avg = ms / (float) result->total_calls; - - fprintf(stderr, "%-60s%-20f%-20f%" PRIu64 "\n", result->name, (double)avg, (double)ms, result->total_calls); - } -} -#endif diff --git a/GL/profiler.h b/GL/profiler.h deleted file mode 100644 index acf07ed..0000000 --- a/GL/profiler.h +++ /dev/null @@ -1,32 +0,0 @@ -#pragma once - -#include - -typedef struct { - char name[64]; - uint64_t start_time_in_us; -} Profiler; - -#define PROFILING_COMPILED 0 - -#if PROFILING_COMPILED -Profiler* profiler_push(const char* name); -void _profiler_checkpoint(const char* name); -void _profiler_pop(); - -void _profiler_print_stats(); - -void _profiler_enable(); -void _profiler_disable(); - -#else -#define profiler_push(name); -#define profiler_checkpoint(name); -#define profiler_pop(); - -#define profiler_print_stats(); - -#define profiler_enable(); -#define profiler_disable(); - -#endif diff --git a/containers/aligned_vector.c b/containers/aligned_vector.c index c390260..21b6058 100644 --- a/containers/aligned_vector.c +++ b/containers/aligned_vector.c @@ -37,13 +37,13 @@ static inline unsigned int round_to_chunk_size(unsigned int val) { return ((n + m - 1) / m) * m; } -void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) { +void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) { if(element_count == 0) { - return; + return NULL; } if(element_count <= vector->capacity) { - return; + return NULL; } unsigned int original_byte_size = vector->size * vector->element_size; @@ -63,6 +63,8 @@ void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) { } vector->capacity = element_count; + + return vector->data + original_byte_size; } void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count) { @@ -84,6 +86,8 @@ void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned } void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count) { + void* ret = NULL; + unsigned int previousCount = vector->size; /* Don't change memory when resizing downwards, just change the size */ @@ -93,13 +97,15 @@ void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_co } if(vector->capacity < element_count) { - aligned_vector_reserve(vector, element_count); + ret = aligned_vector_reserve(vector, element_count); + vector->size = element_count; + } else if(previousCount < element_count) { + vector->size = element_count; + ret = aligned_vector_at(vector, previousCount); } - vector->size = element_count; - if(previousCount < vector->size) { - return aligned_vector_at(vector, previousCount); + return ret; } else { return NULL; } diff --git a/containers/aligned_vector.h b/containers/aligned_vector.h index b64d089..53128b8 100644 --- a/containers/aligned_vector.h +++ b/containers/aligned_vector.h @@ -17,7 +17,7 @@ typedef struct { #define ALIGNED_VECTOR_CHUNK_SIZE 256u void aligned_vector_init(AlignedVector* vector, unsigned int element_size); -void aligned_vector_reserve(AlignedVector* vector, unsigned int element_count); +void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count); void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count); void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count); static inline void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) { diff --git a/samples/profiler.c b/samples/profiler.c new file mode 100644 index 0000000..357eb91 --- /dev/null +++ b/samples/profiler.c @@ -0,0 +1,449 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static char OUTPUT_FILENAME[128]; +static kthread_t* THREAD; +static volatile bool PROFILER_RUNNING = false; +static volatile bool PROFILER_RECORDING = false; + +#define BASE_ADDRESS 0x8c010000 +#define BUCKET_SIZE 10000 + +#define INTERVAL_IN_MS 10 + +/* Simple hash table of samples. An array of Samples + * but, each sample in that array can be the head of + * a linked list of other samples */ +typedef struct Arc { + uint32_t pc; + uint32_t pr; // Caller return address + uint32_t count; + struct Arc* next; +} Arc; + +static Arc ARCS[BUCKET_SIZE]; + +/* Hashing function for two uint32_ts */ +#define HASH_PAIR(x, y) ((x * 0x1f1f1f1f) ^ y) + +#define BUFFER_SIZE (1024 * 8) // 8K buffer + +const static size_t MAX_ARC_COUNT = BUFFER_SIZE / sizeof(Arc); +static size_t ARC_COUNT = 0; + +static bool WRITE_TO_STDOUT = false; + +static bool write_samples(const char* path); +static bool write_samples_to_stdout(); +static void clear_samples(); + +static Arc* new_arc(uint32_t PC, uint32_t PR) { + Arc* s = (Arc*) malloc(sizeof(Arc)); + s->count = 1; + s->pc = PC; + s->pr = PR; + s->next = NULL; + + ++ARC_COUNT; + + return s; +} + +static void record_thread(uint32_t PC, uint32_t PR) { + uint32_t bucket = HASH_PAIR(PC, PR) % BUCKET_SIZE; + + Arc* s = &ARCS[bucket]; + + if(s->pc) { + /* Initialized sample in this bucket, + * does it match though? */ + while(s->pc != PC || s->pr != PR) { + if(s->next) { + s = s->next; + } else { + s->next = new_arc(PC, PR); + return; // We're done + } + } + + s->count++; + } else { + /* Initialize this sample */ + s->count = 1; + s->pc = PC; + s->pr = PR; + s->next = NULL; + ++ARC_COUNT; + } +} + +static int thd_each_cb(kthread_t* thd, void* data) { + (void) data; + + + /* Only record the main thread (for now) */ + if(strcmp(thd->label, "[kernel]") != 0) { + return 0; + } + + /* The idea is that if this code right here is running in the profiling + * thread, then all the PCs from the other threads are + * current. Obviouly thought between iterations the + * PC will change so it's not like this is a true snapshot + * in time across threads */ + uint32_t PC = thd->context.pc; + uint32_t PR = thd->context.pr; + record_thread(PC, PR); + return 0; +} + +static void record_samples() { + /* Go through all the active threads and increase + * the sample count for the PC for each of them */ + + size_t initial = ARC_COUNT; + + /* Note: This is a function added to kallistios-nitro that's + * not yet available upstream */ + thd_each(&thd_each_cb, NULL); + + if(ARC_COUNT >= MAX_ARC_COUNT) { + /* TIME TO FLUSH! */ + if(!write_samples(OUTPUT_FILENAME)) { + fprintf(stderr, "Error writing samples\n"); + } + } + + /* We log when the number of PCs recorded hits a certain increment */ + if((initial != ARC_COUNT) && ((ARC_COUNT % 1000) == 0)) { + printf("-- %d arcs recorded...\n", ARC_COUNT); + } +} + +/* Declared in KOS in fs_dcload.c */ +int fs_dcload_detected(); +extern int dcload_type; + + +#define GMON_COOKIE "gmon" +#define GMON_VERSION 1 + +typedef struct { + char cookie[4]; // 'g','m','o','n' + int32_t version; // 1 + char spare[3 * 4]; // Padding +} GmonHeader; + +typedef struct { + uint32_t low_pc; + uint32_t high_pc; + uint32_t hist_size; + uint32_t prof_rate; + char dimen[15]; /* phys. dim., usually "seconds" */ + char dimen_abbrev; /* usually 's' for "seconds" */ +} GmonHistHeader; + +typedef struct { + unsigned char tag; // GMON_TAG_TIME_HIST = 0, GMON_TAG_CG_ARC = 1, GMON_TAG_BB_COUNT = 2 + size_t ncounts; // Number of address/count pairs in this sequence +} GmonBBHeader; + +typedef struct { + uint32_t from_pc; /* address within caller's body */ + uint32_t self_pc; /* address within callee's body */ + uint32_t count; /* number of arc traversals */ +} GmonArc; + +static bool init_sample_file(const char* path) { + printf("Detecting dcload... "); + + if(!fs_dcload_detected() && dcload_type != DCLOAD_TYPE_NONE) { + printf("[Not Found]\n"); + WRITE_TO_STDOUT = true; + return false; + } else { + printf("[Found]\n"); + } + + FILE* out = fopen(path, "w"); + if(!out) { + WRITE_TO_STDOUT = true; + return false; + } + + /* Write the GMON header */ + + GmonHeader header; + memcpy(&header.cookie[0], GMON_COOKIE, sizeof(header.cookie)); + header.version = 1; + memset(header.spare, '\0', sizeof(header.spare)); + + fwrite(&header, sizeof(header), 1, out); + + fclose(out); + return true; +} + +#define ROUNDDOWN(x,y) (((x)/(y))*(y)) +#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y)) + +static bool write_samples(const char* path) { + /* Appends the samples to the output file in gmon format + * + * We iterate the data twice, first generating arcs, then generating + * basic block counts. While we do that though we calculate the data + * for the histogram so we don't need a third iteration */ + + if(WRITE_TO_STDOUT) { + write_samples_to_stdout(); + return true; + } + + extern char _etext; + + const uint32_t HISTFRACTION = 8; + + /* We know the lowest address, it's the same for all DC games */ + uint32_t lowest_address = ROUNDDOWN(BASE_ADDRESS, HISTFRACTION); + + /* We need to calculate the highest address though */ + uint32_t highest_address = ROUNDUP((uint32_t) &_etext, HISTFRACTION); + + /* Histogram data */ + const int BIN_COUNT = ((highest_address - lowest_address) / HISTFRACTION); + uint16_t* bins = (uint16_t*) malloc(BIN_COUNT * sizeof(uint16_t)); + memset(bins, 0, sizeof(uint16_t) * BIN_COUNT); + + FILE* out = fopen(path, "r+"); /* Append, as init_sample_file would have created the file */ + if(!out) { + fprintf(stderr, "-- Error writing samples to output file\n"); + return false; + } + + // Seek to the end of the file + fseek(out, 0, SEEK_END); + + printf("-- Writing %d arcs\n", ARC_COUNT); + + uint8_t tag = 1; + +#ifndef NDEBUG + size_t written = 0; +#endif + + /* Write arcs */ + Arc* root = ARCS; + for(int i = 0; i < BUCKET_SIZE; ++i) { + if(root->pc) { + GmonArc arc; + arc.from_pc = root->pr; + arc.self_pc = root->pc; + arc.count = root->count; + + /* Write the root sample if it has a program counter */ + fwrite(&tag, sizeof(tag), 1, out); + fwrite(&arc, sizeof(GmonArc), 1, out); + +#ifndef NDEBUG + ++written; +#endif + + /* If there's a next pointer, traverse the list */ + Arc* s = root->next; + while(s) { + arc.from_pc = s->pr; + arc.self_pc = s->pc; + arc.count = s->count; + + /* Write the root sample if it has a program counter */ + fwrite(&tag, sizeof(tag), 1, out); + fwrite(&arc, sizeof(GmonArc), 1, out); + +#ifndef NDEBUG + ++written; +#endif + + s = s->next; + } + } + + root++; + } + + uint32_t histogram_range = highest_address - lowest_address; + uint32_t bin_size = histogram_range / BIN_COUNT; + + root = ARCS; + for(int i = 0; i < BUCKET_SIZE; ++i) { + if(root->pc) { + printf("Incrementing %d for %x. ", (root->pc - lowest_address) / bin_size, (unsigned int) root->pc); + bins[(root->pc - lowest_address) / bin_size]++; + printf("Now: %d\n", (int) bins[(root->pc - lowest_address) / bin_size]); + + /* If there's a next pointer, traverse the list */ + Arc* s = root->next; + while(s) { + assert(s->pc); + bins[(s->pc - lowest_address) / bin_size]++; + s = s->next; + } + } + + root++; + } + + + /* Write histogram now that we have all the information we need */ + GmonHistHeader hist_header; + hist_header.low_pc = lowest_address; + hist_header.high_pc = highest_address; + hist_header.hist_size = BIN_COUNT; + hist_header.prof_rate = INTERVAL_IN_MS; + strcpy(hist_header.dimen, "seconds"); + hist_header.dimen_abbrev = 's'; + + unsigned char hist_tag = 0; + fwrite(&hist_tag, sizeof(hist_tag), 1, out); + fwrite(&hist_header, sizeof(hist_header), 1, out); + fwrite(bins, sizeof(uint16_t), BIN_COUNT, out); + + fclose(out); + free(bins); + + /* We should have written all the recorded samples */ + assert(written == ARC_COUNT); + + clear_samples(); + + return true; +} + +static bool write_samples_to_stdout() { + /* Write samples to stdout as a CSV file + * for processing */ + + printf("--------------\n"); + printf("\"PC\", \"PR\", \"COUNT\"\n"); + + Arc* root = ARCS; + for(int i = 0; i < BUCKET_SIZE; ++i) { + Arc* s = root; + while(s->next) { + printf("\"%x\", \"%x\", \"%d\"\n", (unsigned int) s->pc, (unsigned int) s->pr, (unsigned int) s->count); + s = s->next; + } + + root++; + } + + printf("--------------\n"); + + return true; +} + + +static void* run(void* args) { + printf("-- Entered profiler thread!\n"); + + while(PROFILER_RUNNING){ + if(PROFILER_RECORDING) { + record_samples(); + usleep(INTERVAL_IN_MS * 1000); //usleep takes milliseconds + } + } + + printf("-- Profiler thread finished!\n"); + + return NULL; +} + +void profiler_init(const char* output) { + /* Store the filename */ + strncpy(OUTPUT_FILENAME, output, sizeof(OUTPUT_FILENAME)); + + /* Initialize the file */ + printf("Creating samples file...\n"); + if(!init_sample_file(OUTPUT_FILENAME)) { + printf("Read-only filesytem. Writing samples to stdout\n"); + } + + printf("Creating profiler thread...\n"); + // Initialize the samples to zero + memset(ARCS, 0, sizeof(ARCS)); + + PROFILER_RUNNING = true; + THREAD = thd_create(0, run, NULL); + + /* Lower priority is... er, higher */ + thd_set_prio(THREAD, PRIO_DEFAULT / 2); + + printf("Thread started.\n"); +} + +void profiler_start() { + assert(PROFILER_RUNNING); + + if(PROFILER_RECORDING) { + return; + } + + PROFILER_RECORDING = true; + printf("Starting profiling...\n"); +} + +static void clear_samples() { + /* Free the samples we've collected to start again */ + Arc* root = ARCS; + for(int i = 0; i < BUCKET_SIZE; ++i) { + Arc* s = root; + Arc* next = s->next; + + // While we have a next pointer + while(next) { + s = next; // Point S at it + next = s->next; // Store the new next pointer + free(s); // Free S + } + + // We've wiped the chain so we can now clear the root + // which is statically allocated + root->next = NULL; + root++; + } + + // Wipe the lot + memset(ARCS, 0, sizeof(ARCS)); + ARC_COUNT = 0; +} + +bool profiler_stop() { + if(!PROFILER_RECORDING) { + return false; + } + + printf("Stopping profiling...\n"); + + PROFILER_RECORDING = false; + if(!write_samples(OUTPUT_FILENAME)) { + printf("ERROR WRITING SAMPLES (RO filesystem?)! Outputting to stdout\n"); + return false; + } + + + return true; +} + +void profiler_clean_up() { + profiler_stop(); // Make sure everything is stopped + + PROFILER_RUNNING = false; + thd_join(THREAD, NULL); +} diff --git a/samples/profiler.h b/samples/profiler.h new file mode 100644 index 0000000..d2a7435 --- /dev/null +++ b/samples/profiler.h @@ -0,0 +1,18 @@ +#pragma once + +/* + * The Dreamcast doesn't have any kind of profiling support from GCC + * so this is a cumbersome sampling profiler that runs in a background thread + */ +#ifdef __cplusplus +extern "C" { +#endif + +void profiler_init(const char* output); +void profiler_start(); +void profiler_stop(); +void profiler_clean_up(); + +#ifdef __cplusplus +} +#endif diff --git a/samples/quadmark/main.c b/samples/quadmark/main.c index dfe5c17..eca1261 100644 --- a/samples/quadmark/main.c +++ b/samples/quadmark/main.c @@ -9,11 +9,13 @@ #ifdef __DREAMCAST__ #include +#include "../profiler.h" #endif #include #include +#include #include #include #include @@ -162,14 +164,23 @@ void check_switch() { } int main(int argc, char **argv) { +#ifndef NDEBUG +#ifdef __DREAMCAST__ + profiler_init("/pc/gmon.out"); + profiler_start(); +#endif +#endif + setup(); /* Start off with something obscene */ switch_tests(200000 / 60); start = time(NULL); + uint32_t iterations = 2000; + for(;;) { - if(check_start()) + if(check_start() || iterations-- == 0) break; printf(" \r"); @@ -180,7 +191,12 @@ int main(int argc, char **argv) { stats(); +#ifdef __DREAMCAST__ +#ifndef NDEBUG + profiler_stop(); + profiler_clean_up(); +#endif +#endif + return 0; } - -