From 360b8e2d39a95513d8f8355fb25c0083a4ab04b7 Mon Sep 17 00:00:00 2001
From: lerabot
Date: Wed, 23 Aug 2023 08:43:45 -0400
Subject: [PATCH] Added a glKosCopyTexture

---
 CMakeLists.txt                 |   2 +-
 GL/texture.c                   |  15 ++
 include/GL/glkos.h             |   2 +
 samples/dreamroq/dreamroqlib.c |   2 +-
 samples/dreamroq/main.c        |  11 +-
 samples/dreamroq/profiler.c    | 452 +++++++++++++++++++++++++++++++++
 samples/dreamroq/profiler.h    |  18 ++
 7 files changed, 499 insertions(+), 3 deletions(-)
 create mode 100644 samples/dreamroq/profiler.c
 create mode 100644 samples/dreamroq/profiler.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index eefc7e9..f915fb8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -180,7 +180,7 @@ gen_sample(polymark samples/polymark/main.c)
 gen_sample(cubes samples/cubes/main.cpp)
 
 gen_sample(zclip_test tests/zclip/main.cpp)
-gen_sample(dreamroq samples/dreamroq/main.c samples/dreamroq/dreamroqlib.c samples/dreamroq/libdcmc/snd_stream.c samples/dreamroq/libdcmc/snddrv.c samples/dreamroq/libdcmc/timer.c)
+gen_sample(dreamroq samples/dreamroq/main.c samples/dreamroq/dreamroqlib.c samples/dreamroq/libdcmc/snd_stream.c samples/dreamroq/libdcmc/snddrv.c samples/dreamroq/libdcmc/timer.c samples/dreamroq/profiler.c)
 
 if(PLATFORM_DREAMCAST)
     gen_sample(trimark samples/trimark/main.c)
diff --git a/GL/texture.c b/GL/texture.c
index 016be45..e75fe94 100644
--- a/GL/texture.c
+++ b/GL/texture.c
@@ -1934,3 +1934,18 @@ GLAPI void APIENTRY glGetTexImage(GLenum tex, GLint lod, GLenum format, GLenum t
     _GL_UNUSED(type);
     _GL_UNUSED(img);
 }
+
+/* KOS extension: raw-copies `bytes` bytes from `data` into the storage of
+ * the texture bound to the active texture unit. No format conversion and
+ * no size check is performed, so the caller must make sure that `bytes`
+ * fits in that storage. The call is ignored when no texture is bound, the
+ * bound texture has no storage, or the source is NULL or empty. */
+GLAPI void APIENTRY glKosCopyTexture(void* data, GLuint bytes) {
+    TextureObject* active = TEXTURE_UNITS[ACTIVE_TEXTURE];
+
+    if(!active || !active->data || !data || !bytes) {
+        return;
+    }
+
+    FASTCPY(active->data, data, bytes);
+}
diff --git a/include/GL/glkos.h
b/include/GL/glkos.h index e2ef11a..a040b1b 100644 --- a/include/GL/glkos.h +++ b/include/GL/glkos.h @@ -108,6 +108,8 @@ GLAPI void APIENTRY glKosSwapBuffers(); * by default textures use shared palette 0. */ +GLAPI void APIENTRY glKosCopyTexture(void *src, GLuint bytes); + #define GL_SHARED_TEXTURE_PALETTE_0_KOS 0xEEFC #define GL_SHARED_TEXTURE_PALETTE_1_KOS 0xEEFD diff --git a/samples/dreamroq/dreamroqlib.c b/samples/dreamroq/dreamroqlib.c index 4679fce..43e4c78 100644 --- a/samples/dreamroq/dreamroqlib.c +++ b/samples/dreamroq/dreamroqlib.c @@ -53,7 +53,7 @@ typedef struct int mb_count; int current_frame; - unsigned short *frame[2]; + unsigned short *frame[2] __attribute__(( aligned(32))); int stride; int texture_height; diff --git a/samples/dreamroq/main.c b/samples/dreamroq/main.c index 3744066..70d6ea3 100644 --- a/samples/dreamroq/main.c +++ b/samples/dreamroq/main.c @@ -52,6 +52,8 @@ #include #include +#include "profiler.h" + #ifdef __DREAMCAST__ extern uint8 romdisk[]; KOS_INIT_ROMDISK(romdisk); @@ -305,7 +307,8 @@ static int renderGLdc_cb(unsigned short *buf, int width, int height, int stride /* send the video frame as a texture over to video RAM */ //pvr_txr_load(buf, textures[current_frame], stride * texture_height * 2); glBindTexture(GL_TEXTURE_2D, frameTexture[current_frame]); - glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, 512, 512, 0, GL_RGB, GL_UNSIGNED_SHORT_5_6_5, buf); + //glTexImage2D(GL_TEXTURE_2D, 0, GL_RGB, 512, 512, 0, GL_RGB, GL_UNSIGNED_SHORT_5_6_5, buf); + glKosCopyTexture(buf, 512 * 512 * 2); // Calculate the elapsed time since the last frame @@ -420,6 +423,9 @@ int main() { int status = 0; + profiler_init("/pc/out.gmon"); + profiler_start(); + glKosInit(); printf("dreamroq_play(C) Multimedia Mike Melanson & Josh PH3NOM Pearson 2011\n"); @@ -481,6 +487,9 @@ int main() printf("Freed PVR memory\n"); } + profiler_stop(); + profiler_clean_up(); + printf("Exiting main()\n"); return 0; } diff --git a/samples/dreamroq/profiler.c 
b/samples/dreamroq/profiler.c
new file mode 100644
index 0000000..c44c3c9
--- /dev/null
+++ b/samples/dreamroq/profiler.c
@@ -0,0 +1,452 @@
+/*
+ * Sampling profiler for the Dreamcast (see profiler.h).
+ *
+ * A background thread periodically samples the PC and PR (caller return
+ * address) of the main thread, counts each distinct (PC, PR) pair in a hash
+ * table and appends the result to a gmon-format file. When that file can't
+ * be created the samples are printed to stdout as CSV instead.
+ */
+#include "profiler.h"
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <kos/thread.h>
+#include <dc/fs_dcload.h>
+
+#define BASE_ADDRESS 0x8c010000 /* Lowest code address, same for all DC games */
+#define BUCKET_SIZE 10000       /* Number of buckets in the hash table */
+#define INTERVAL_IN_MS 10       /* Time between two samples */
+#define BUFFER_SIZE (1024 * 8)  /* 8K of arcs are buffered before flushing */
+#define HISTFRACTION 8          /* Bytes of code covered by one histogram bin */
+
+#define GMON_COOKIE "gmon"
+#define GMON_VERSION 1
+#define GMON_TAG_TIME_HIST 0
+#define GMON_TAG_CG_ARC 1
+
+#define ROUNDDOWN(x,y) (((x)/(y))*(y))
+#define ROUNDUP(x,y) ((((x)+(y)-1)/(y))*(y))
+
+/* Simple hash table of samples: an array of Arcs where each entry can be
+ * the head of a linked list of heap-allocated Arcs that share its bucket.
+ * An entry whose pc is zero is unused. */
+typedef struct Arc {
+    uint32_t pc;
+    uint32_t pr; /* Caller return address */
+    uint32_t count;
+    struct Arc* next;
+} Arc;
+
+typedef struct {
+    char cookie[4];    /* 'g','m','o','n' */
+    int32_t version;   /* GMON_VERSION */
+    char spare[3 * 4]; /* Padding */
+} GmonHeader;
+
+typedef struct {
+    uint32_t low_pc;
+    uint32_t high_pc;
+    uint32_t hist_size;
+    uint32_t prof_rate; /* samples per second */
+    char dimen[15];     /* phys. dim., usually "seconds" */
+    char dimen_abbrev;  /* usually 's' for "seconds" */
+} GmonHistHeader;
+
+typedef struct {
+    uint32_t from_pc; /* address within caller's body */
+    uint32_t self_pc; /* address within callee's body */
+    uint32_t count;   /* number of arc traversals */
+} GmonArc;
+
+/* Declared in KOS in fs_dcload.c */
+int fs_dcload_detected();
+extern int dcload_type;
+
+/* Its address is used as the end of the profiled code */
+extern char _etext;
+
+static char OUTPUT_FILENAME[128];
+static kthread_t* THREAD;
+static volatile bool PROFILER_RUNNING = false;
+static volatile bool PROFILER_RECORDING = false;
+static bool WRITE_TO_STDOUT = false;
+
+static Arc ARCS[BUCKET_SIZE];
+static size_t ARC_COUNT = 0;
+static const size_t MAX_ARC_COUNT = BUFFER_SIZE / sizeof(Arc);
+
+/* Hashing function for two uint32_ts */
+static uint32_t hash_pair(uint32_t x, uint32_t y) {
+    return (x * 0x1f1f1f1f) ^ y;
+}
+
+/* Frees the samples we've collected to start again */
+static void clear_samples(void) {
+    for(int i = 0; i < BUCKET_SIZE; ++i) {
+        /* Only the chained arcs are heap-allocated, not the bucket itself */
+        Arc* s = ARCS[i].next;
+        while(s) {
+            Arc* next = s->next;
+            free(s);
+            s = next;
+        }
+    }
+
+    memset(ARCS, 0, sizeof(ARCS));
+    ARC_COUNT = 0;
+}
+
+/* Allocates a chained arc with a count of one; NULL when out of memory */
+static Arc* new_arc(uint32_t PC, uint32_t PR) {
+    Arc* s = malloc(sizeof *s);
+    if(!s) {
+        return NULL;
+    }
+
+    s->count = 1;
+    s->pc = PC;
+    s->pr = PR;
+    s->next = NULL;
+    ++ARC_COUNT;
+    return s;
+}
+
+/* Counts one sample of the (PC, PR) pair */
+static void record_thread(uint32_t PC, uint32_t PR) {
+    Arc* s = &ARCS[hash_pair(PC, PR) % BUCKET_SIZE];
+
+    if(!s->pc) {
+        /* Unused bucket: initialize it with this sample */
+        s->count = 1;
+        s->pc = PC;
+        s->pr = PR;
+        s->next = NULL;
+        ++ARC_COUNT;
+        return;
+    }
+
+    /* Walk the chain until we find the matching sample */
+    while(s->pc != PC || s->pr != PR) {
+        if(!s->next) {
+            /* First time we see this pair; it's dropped if malloc fails */
+            s->next = new_arc(PC, PR);
+            return;
+        }
+
+        s = s->next;
+    }
+
+    s->count++;
+}
+
+/* thd_each() callback: samples the thread if it's the one being profiled */
+static int thd_each_cb(kthread_t* thd, void* data) {
+    (void) data;
+
+    /* Only record the main thread (for now) */
+    if(strcmp(thd->label, "[kernel]") != 0) {
+        return 0;
+    }
+
+    /* The idea is that if this code right here is running in the profiling
+     * thread, then all the PCs from the other threads are current.
+     * Obviously though, between iterations the PC will change so it's not
+     * like this is a true snapshot in time across threads */
+    int old = irq_disable();
+    uint32_t PC = thd->context.pc;
+    uint32_t PR = thd->context.pr;
+    irq_restore(old);
+
+    record_thread(PC, PR);
+    return 0;
+}
+
+/* Writes the samples to stdout as a CSV file for processing */
+static bool write_samples_to_stdout(void) {
+    printf("--------------\n");
+    printf("\"PC\", \"PR\", \"COUNT\"\n");
+
+    for(int i = 0; i < BUCKET_SIZE; ++i) {
+        const Arc* s = &ARCS[i];
+        if(!s->pc) {
+            continue;
+        }
+
+        /* Print the bucket itself and then everything chained to it */
+        for(; s; s = s->next) {
+            printf("\"%x\", \"%x\", \"%u\"\n", (unsigned int) s->pc, (unsigned int) s->pr, (unsigned int) s->count);
+        }
+    }
+
+    printf("--------------\n");
+    return true;
+}
+
+/* Creates the output file and writes the gmon header to it. Returns false,
+ * and switches the profiler to stdout output, when that isn't possible */
+static bool init_sample_file(const char* path) {
+    printf("Detecting dcload... ");
+
+    /* NOTE(review): this only reports "not found" when the detection fails
+     * while dcload_type is set; confirm that's the intended condition */
+    if(!fs_dcload_detected() && dcload_type != DCLOAD_TYPE_NONE) {
+        printf("[Not Found]\n");
+        WRITE_TO_STDOUT = true;
+        return false;
+    }
+
+    printf("[Found]\n");
+
+    FILE* out = fopen(path, "wb");
+    if(!out) {
+        WRITE_TO_STDOUT = true;
+        return false;
+    }
+
+    GmonHeader header;
+    memset(&header, 0, sizeof(header));
+    memcpy(header.cookie, GMON_COOKIE, sizeof(header.cookie));
+    header.version = GMON_VERSION;
+
+    bool ok = fwrite(&header, sizeof(header), 1, out) == 1;
+    ok = (fclose(out) == 0) && ok;
+    if(!ok) {
+        WRITE_TO_STDOUT = true;
+    }
+
+    return ok;
+}
+
+/* Appends one call-graph arc record to the file; false on a write error */
+static bool write_arc(FILE* out, const Arc* s) {
+    const uint8_t tag = GMON_TAG_CG_ARC;
+    const GmonArc arc = {s->pr, s->pc, s->count};
+
+    return fwrite(&tag, sizeof(tag), 1, out) == 1 &&
+           fwrite(&arc, sizeof(arc), 1, out) == 1;
+}
+
+/* Adds the samples of an arc to the histogram bin that covers its PC */
+static void add_to_histogram(uint16_t* bins, uint32_t bin_count, uint32_t low_pc, const Arc* s) {
+    if(s->pc < low_pc) {
+        return; /* Below the profiled range */
+    }
+
+    const uint32_t bin = (s->pc - low_pc) / HISTFRACTION;
+    if(bin >= bin_count) {
+        return; /* Above the profiled range */
+    }
+
+    /* The bins are 16 bit, so saturate rather than wrap around */
+    const uint32_t room = UINT16_MAX - bins[bin];
+    bins[bin] = (s->count > room) ? UINT16_MAX : (uint16_t) (bins[bin] + s->count);
+}
+
+/* Appends the samples to the output file in gmon format: one arc record per
+ * sample followed by a histogram record, then clears the samples. Returns
+ * false, keeping the samples, if the file can't be written */
+static bool write_samples(const char* path) {
+    if(WRITE_TO_STDOUT) {
+        write_samples_to_stdout();
+        clear_samples(); /* Otherwise they'd be printed again next time */
+        return true;
+    }
+
+    /* We know the lowest address, it's the same for all DC games.
+     * The highest one is the end of this program's code */
+    const uint32_t lowest_address = ROUNDDOWN(BASE_ADDRESS, HISTFRACTION);
+    const uint32_t highest_address = ROUNDUP((uint32_t) &_etext, HISTFRACTION);
+    const uint32_t bin_count = (highest_address - lowest_address) / HISTFRACTION;
+
+    uint16_t* bins = calloc(bin_count, sizeof *bins);
+    if(!bins) {
+        fprintf(stderr, "-- Error allocating the histogram\n");
+        return false;
+    }
+
+    /* Append, as init_sample_file would have created the file */
+    FILE* out = fopen(path, "r+b");
+    if(!out) {
+        fprintf(stderr, "-- Error writing samples to output file\n");
+        free(bins);
+        return false;
+    }
+
+    bool ok = fseek(out, 0, SEEK_END) == 0;
+
+    printf("-- Writing %u arcs\n", (unsigned int) ARC_COUNT);
+
+    size_t written = 0;
+    for(int i = 0; ok && i < BUCKET_SIZE; ++i) {
+        const Arc* s = &ARCS[i];
+        if(!s->pc) {
+            continue;
+        }
+
+        for(; ok && s; s = s->next) {
+            ok = write_arc(out, s);
+            add_to_histogram(bins, bin_count, lowest_address, s);
+            ++written;
+        }
+    }
+
+    if(ok) {
+        GmonHistHeader hist_header;
+        memset(&hist_header, 0, sizeof(hist_header));
+        hist_header.low_pc = lowest_address;
+        hist_header.high_pc = highest_address;
+        hist_header.hist_size = bin_count;
+        hist_header.prof_rate = 1000 / INTERVAL_IN_MS; /* gmon wants Hz */
+        memcpy(hist_header.dimen, "seconds", sizeof("seconds"));
+        hist_header.dimen_abbrev = 's';
+
+        const uint8_t hist_tag = GMON_TAG_TIME_HIST;
+        ok = fwrite(&hist_tag, sizeof(hist_tag), 1, out) == 1 &&
+             fwrite(&hist_header, sizeof(hist_header), 1, out) == 1 &&
+             fwrite(bins, sizeof *bins, bin_count, out) == bin_count;
+    }
+
+    ok = (fclose(out) == 0) && ok;
+    free(bins);
+
+    if(!ok) {
+        fprintf(stderr, "-- Error writing samples to output file\n");
+        return false;
+    }
+
+    /* We should have written all the recorded samples */
+    assert(written == ARC_COUNT);
+    (void) written;
+
+    clear_samples();
+
+    return true;
+}
+
+/* Takes one sample and flushes the table to the output once it's full */
+static void record_samples(void) {
+    const size_t initial = ARC_COUNT;
+
+    /* Go through all the active threads and increase the sample count for
+     * the PC of the one we profile.
+     * Note: thd_each is a function added to kallistios-nitro that's not
+     * yet available upstream */
+    thd_each(&thd_each_cb, NULL);
+
+    /* We log when the number of PCs recorded hits a certain increment. This
+     * has to happen before the flush, which resets ARC_COUNT */
+    if((initial != ARC_COUNT) && ((ARC_COUNT % 1000) == 0)) {
+        printf("-- %u arcs recorded...\n", (unsigned int) ARC_COUNT);
+    }
+
+    if(ARC_COUNT >= MAX_ARC_COUNT) {
+        /* TIME TO FLUSH! */
+        if(!write_samples(OUTPUT_FILENAME)) {
+            fprintf(stderr, "Error writing samples\n");
+        }
+    }
+}
+
+/* Body of the profiler thread: samples every INTERVAL_IN_MS while recording */
+static void* run(void* args) {
+    (void) args;
+
+    printf("-- Entered profiler thread!\n");
+
+    while(PROFILER_RUNNING) {
+        if(PROFILER_RECORDING) {
+            record_samples();
+        }
+
+        /* Sleep even when we aren't recording, otherwise this loop would
+         * spin without ever yielding. usleep takes microseconds */
+        usleep(INTERVAL_IN_MS * 1000);
+    }
+
+    printf("-- Profiler thread finished!\n");
+
+    return NULL;
+}
+
+/* Creates the output file and starts the (idle) profiler thread */
+void profiler_init(const char* output) {
+    /* Store the filename, truncating it if it's too long */
+    snprintf(OUTPUT_FILENAME, sizeof(OUTPUT_FILENAME), "%s", output ? output : "");
+
+    /* Initialize the file */
+    printf("Creating samples file...\n");
+    if(!init_sample_file(OUTPUT_FILENAME)) {
+        printf("Read-only filesytem. Writing samples to stdout\n");
+    }
+
+    printf("Creating profiler thread...\n");
+    // Initialize the samples to zero
+    memset(ARCS, 0, sizeof(ARCS));
+    ARC_COUNT = 0;
+
+    PROFILER_RUNNING = true;
+    THREAD = thd_create(0, run, NULL);
+    if(!THREAD) {
+        PROFILER_RUNNING = false;
+        fprintf(stderr, "Unable to create the profiler thread\n");
+        return;
+    }
+
+    /* Lower priority is... er, higher */
+    thd_set_prio(THREAD, PRIO_DEFAULT / 2);
+
+    printf("Thread started.\n");
+}
+
+/* Starts recording samples; does nothing if already recording */
+void profiler_start(void) {
+    if(!PROFILER_RUNNING) {
+        fprintf(stderr, "profiler_start called without a profiler thread\n");
+        return;
+    }
+
+    if(PROFILER_RECORDING) {
+        return;
+    }
+
+    PROFILER_RECORDING = true;
+    printf("Starting profiling...\n");
+}
+
+/* Stops recording and writes the samples; false if idle or on error */
+bool profiler_stop(void) {
+    if(!PROFILER_RECORDING) {
+        return false;
+    }
+
+    printf("Stopping profiling...\n");
+
+    /* NOTE(review): the profiler thread may still be inside record_samples()
+     * at this point; nothing synchronizes it with the flush below */
+    PROFILER_RECORDING = false;
+    if(!write_samples(OUTPUT_FILENAME)) {
+        printf("ERROR WRITING SAMPLES (RO filesystem?)! Outputting to stdout\n");
+        write_samples_to_stdout();
+        clear_samples();
+        return false;
+    }
+
+    return true;
+}
+
+/* Stops profiling, then ends and joins the profiler thread */
+void profiler_clean_up(void) {
+    profiler_stop(); // Make sure everything is stopped
+
+    PROFILER_RUNNING = false;
+    if(THREAD) {
+        thd_join(THREAD, NULL);
+        THREAD = NULL;
+    }
+}
diff --git a/samples/dreamroq/profiler.h b/samples/dreamroq/profiler.h
new file mode 100644
index 0000000..d2a7435
--- /dev/null
+++ b/samples/dreamroq/profiler.h
@@ -0,0 +1,18 @@
+#pragma once
+
+/*
+ * The Dreamcast doesn't have any kind of profiling support from GCC
+ * so this is a cumbersome sampling profiler that runs in a background thread
+ */
+#include <stdbool.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+void profiler_init(const char* output); /* Creates the file and the thread */
+void profiler_start(void);              /* Starts recording samples */
+bool profiler_stop(void);               /* Writes them; false if idle or on error */
+void profiler_clean_up(void);           /* Stops, then joins the thread */
+#ifdef __cplusplus
+}
+#endif