diff --git a/GL/draw.c b/GL/draw.c
index 3c1db65..60013ab 100644
--- a/GL/draw.c
+++ b/GL/draw.c
@@ -442,7 +442,7 @@ GL_FORCE_INLINE void transformNormalToEyeSpace(GLfloat* normal) {
     mat_trans_normal3(normal[0], normal[1], normal[2]);
 }
 
-PolyHeader *_glSubmissionTargetHeader(SubmissionTarget* target) {
+GL_FORCE_INLINE PolyHeader *_glSubmissionTargetHeader(SubmissionTarget* target) {
     assert(target->header_offset < target->output->vector.size);
     return aligned_vector_at(&target->output->vector, target->header_offset);
 }
@@ -456,7 +456,7 @@ Vertex* _glSubmissionTargetEnd(SubmissionTarget* target) {
     return _glSubmissionTargetStart(target) + target->count;
 }
 
-static inline void genTriangles(Vertex* output, GLuint count) {
+GL_FORCE_INLINE void genTriangles(Vertex* output, GLuint count) {
     Vertex* it = output + 2;
 
     GLuint i;
@@ -466,17 +466,22 @@ static inline void genTriangles(Vertex* output, GLuint count) {
     }
 }
 
-static inline void genQuads(Vertex* output, GLuint count) {
+GL_FORCE_INLINE void genQuads(Vertex* output, GLuint count) {
+    Vertex* pen = output + 2;
     Vertex* final = output + 3;
-    GLuint i;
-    for(i = 0; i < count; i += 4) {
-        swapVertex((final - 1), final);
+    GLuint i = count >> 2;
+    while(i--) {
+        __asm__("pref @%0" : : "r"(pen + 4));
+
+        swapVertex(pen, final);
         final->flags = GPU_CMD_VERTEX_EOL;
+
+        pen += 4;
         final += 4;
     }
 }
 
-static void genTriangleStrip(Vertex* output, GLuint count) {
+GL_FORCE_INLINE void genTriangleStrip(Vertex* output, GLuint count) {
     output[count - 1].flags = GPU_CMD_VERTEX_EOL;
 }
 
@@ -634,6 +639,8 @@ static void _readPositionData(ReadDiffuseFunc func, const GLuint first, const GL
     uint32_t* flags;
 
     ITERATE(count) {
+        __asm__("pref @%0" : : "r"(vptr + vstride));
+
         func(vptr, out);
         vptr += vstride;
 
@@ -653,6 +660,8 @@ static void _readUVData(ReadUVFunc func, const GLuint first, const GLuint count,
     GLubyte* out = (GLubyte*) output[0].uv;
 
     ITERATE(count) {
+        __asm__("pref @%0" : : "r"(uvptr + uvstride));
+
         func(uvptr, out);
         uvptr += uvstride;
         out += sizeof(Vertex);
@@ -666,6 +675,8 @@ static void _readSTData(ReadUVFunc func, const GLuint first, const GLuint count,
     GLubyte* out = (GLubyte*) extra[0].st;
 
     ITERATE(count) {
+        __asm__("pref @%0" : : "r"(stptr + ststride));
+
         func(stptr, out);
         stptr += ststride;
         out += sizeof(VertexExtra);
@@ -714,6 +725,8 @@ static void _readDiffuseData(ReadDiffuseFunc func, const GLuint first, const GLu
     GLubyte* out = (GLubyte*) output[0].bgra;
 
     ITERATE(count) {
+        __asm__("pref @%0" : : "r"(cptr + cstride));
+
         func(cptr, out);
         cptr += cstride;
         out += sizeof(Vertex);
@@ -874,6 +887,8 @@ static void generateElementsFastPath(
     }
 }
 
+#define likely(x) __builtin_expect(!!(x), 1)
+
 static void generateArraysFastPath(SubmissionTarget* target, const GLsizei first, const GLuint count) {
     Vertex* start = _glSubmissionTargetStart(target);
 
@@ -1290,7 +1305,6 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL
     }
 
 
-    divide(target);
     push(_glSubmissionTargetHeader(target), GL_FALSE, target->output, 0);
 
     /*
diff --git a/GL/flush.c b/GL/flush.c
index e3d8dc0..23d5c1f 100644
--- a/GL/flush.c
+++ b/GL/flush.c
@@ -42,10 +42,10 @@ void APIENTRY glKosInitConfig(GLdcConfig* config) {
     config->autosort_enabled = GL_FALSE;
     config->fsaa_enabled = GL_FALSE;
 
-    config->initial_op_capacity = 1024;
-    config->initial_pt_capacity = 512;
-    config->initial_tr_capacity = 1024;
-    config->initial_immediate_capacity = 1024;
+    config->initial_op_capacity = 1024 * 3;
+    config->initial_pt_capacity = 512 * 3;
+    config->initial_tr_capacity = 1024 * 3;
+    config->initial_immediate_capacity = 1024 * 3;
     config->internal_palette_format = GL_RGBA4;
 }
 
@@ -86,20 +86,77 @@ void APIENTRY glKosInit() {
     glKosInitEx(&config);
 }
 
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+/* True when `flags` is one of the two vertex command words
+ * (GPU_CMD_VERTEX or GPU_CMD_VERTEX_EOL). The parameter is an integer so
+ * the command word is compared bit-for-bit rather than after conversion
+ * to float. */
+GL_FORCE_INLINE bool glIsVertex(const uint32_t flags) {
+    return flags == GPU_CMD_VERTEX_EOL || flags == GPU_CMD_VERTEX;
+}
+
+
+/* Applies the perspective divide and viewport mapping in place.
+ *
+ * src: first entry of a list buffer, walked as an array of Vertex.
+ * n:   number of entries to visit.
+ *
+ * Entries whose flags are not a vertex command are left untouched. */
+GL_FORCE_INLINE void glPerspectiveDivide(void* src, uint32_t n) {
+    TRACE();
+
+    /* Perform perspective divide on each vertex */
+    Vertex* vertex = (Vertex*) src;
+
+    const float h = GetVideoMode()->height;
+
+    while(n--) {
+        /* Issue `pref` on the following entry while this one is processed */
+        __asm__("pref @%0" : : "r"(vertex + 1));
+
+        if(likely(glIsVertex(vertex->flags))) {
+            const float f = MATH_Fast_Invert(vertex->w);
+
+            /* Convert to NDC and apply viewport */
+            vertex->xyz[0] = MATH_fmac(
+                VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth
+            );
+
+            /* The mapped Y is subtracted from the video mode height */
+            vertex->xyz[1] = h - MATH_fmac(
+                VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight
+            );
+
+            /* Apply depth range */
+            vertex->xyz[2] = MAX(
+                1.0f - MATH_fmac(vertex->xyz[2] * f, 0.5f, 0.5f),
+                PVR_MIN_Z
+            );
+        }
+
+        ++vertex;
+    }
+}
+
 void APIENTRY glKosSwapBuffers() {
     TRACE();
 
     SceneBegin();
 
     SceneListBegin(GPU_LIST_OP_POLY);
+    glPerspectiveDivide(OP_LIST.vector.data, OP_LIST.vector.size);
     SceneListSubmit(OP_LIST.vector.data, OP_LIST.vector.size);
     SceneListFinish();
 
     SceneListBegin(GPU_LIST_PT_POLY);
+    glPerspectiveDivide(PT_LIST.vector.data, PT_LIST.vector.size);
     SceneListSubmit(PT_LIST.vector.data, PT_LIST.vector.size);
     SceneListFinish();
 
     SceneListBegin(GPU_LIST_TR_POLY);
+    glPerspectiveDivide(TR_LIST.vector.data, TR_LIST.vector.size);
     SceneListSubmit(TR_LIST.vector.data, TR_LIST.vector.size);
     SceneListFinish();
     SceneFinish();
diff --git a/GL/private.h b/GL/private.h
index c099360..ded524f 100644
--- a/GL/private.h
+++ b/GL/private.h
@@ -255,7 +255,6 @@ typedef struct {
     AlignedVector* extras;
 } SubmissionTarget;
 
-PolyHeader* _glSubmissionTargetHeader(SubmissionTarget* target);
 Vertex* _glSubmissionTargetStart(SubmissionTarget* target);
 Vertex* _glSubmissionTargetEnd(SubmissionTarget* target);
 
diff --git a/samples/quadmark/main.c b/samples/quadmark/main.c
index eca1261..26ea433 100644
--- a/samples/quadmark/main.c
+++ b/samples/quadmark/main.c
@@ -74,6 +74,8 @@ void setup() {
     glOrtho(0, 640, 0, 480, -100, 100);
     glMatrixMode(GL_PROJECTION);
     glLoadIdentity();
+
+    glDisable(GL_NEARZ_CLIPPING_KOS);
 }
 
 void do_frame() {