diff --git a/CMakeLists.txt b/CMakeLists.txt index b26026a..f9aadb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ string(TOUPPER ${BACKEND} BACKEND_UPPER) add_definitions(-DBACKEND_${BACKEND_UPPER}) set(CMAKE_C_STANDARD 99) +set(CMAKE_CXX_STANDARD 11) include_directories(include) @@ -30,17 +31,29 @@ else() if(COMPILER_HAS_FSRRA) set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -mfsrra") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfsrra") + + set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -mfsrra") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -mfsrra") endif() if(COMPILER_HAS_FSCA) set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -mfsca") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -mfsca") + + set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -mfsca") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -mfsca") endif() set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -ffp-contract=fast -ffast-math") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -ffast-math") + + set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -ffp-contract=fast -ffast-math") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -ffast-math") endif() set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -O3 -fexpensive-optimizations -fomit-frame-pointer -finline-functions") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++14 -fwhole-program -O3 -g0 -s -fomit-frame-pointer -fstrict-aliasing") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -std=c++14 -O3 -g0 -s -fomit-frame-pointer -fstrict-aliasing") + +set(CMAKE_C_FLAGS_RELWITHDEBINFO "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 -fexpensive-optimizations -fomit-frame-pointer -finline-functions") +set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -std=c++14 -O3 -fomit-frame-pointer -fstrict-aliasing") set(CMAKE_C_FLAGS_DEBUG 
"${CMAKE_C_FLAGS_DEBUG} -O0 -g -Wall -Wextra") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g -Wall -Wextra") @@ -178,6 +191,7 @@ gen_sample(scissor samples/scissor/main.c) gen_sample(polymark samples/polymark/main.c) gen_sample(cubes samples/cubes/main.cpp) +gen_sample(zclip_test tests/zclip/main.cpp) if(PLATFORM_DREAMCAST) gen_sample(trimark samples/trimark/main.c) diff --git a/GL/draw.c b/GL/draw.c index aea1f34..3548bd1 100644 --- a/GL/draw.c +++ b/GL/draw.c @@ -239,7 +239,7 @@ static void _fillWithNegZVE(const GLubyte* __restrict__ input, GLubyte* __restri float x, y, z; } V; - const static V NegZ = {0.0f, 0.0f, -1.0f}; + static const V NegZ = {0.0f, 0.0f, -1.0f}; *((V*) out) = NegZ; } @@ -391,12 +391,12 @@ GL_FORCE_INLINE void transformNormalToEyeSpace(GLfloat* normal) { } GL_FORCE_INLINE PolyHeader *_glSubmissionTargetHeader(SubmissionTarget* target) { - gl_assert(target->header_offset < target->output->vector.size); + gl_assert(target->header_offset < aligned_vector_size(&target->output->vector)); return aligned_vector_at(&target->output->vector, target->header_offset); } GL_INLINE_DEBUG Vertex* _glSubmissionTargetStart(SubmissionTarget* target) { - gl_assert(target->start_offset < target->output->vector.size); + gl_assert(target->start_offset < aligned_vector_size(&target->output->vector)); return aligned_vector_at(&target->output->vector, target->start_offset); } @@ -1210,15 +1210,14 @@ GL_FORCE_INLINE void submitVertices(GLenum mode, GLsizei first, GLuint count, GL return; } - GLboolean header_required = (target->output->vector.size == 0) || _glGPUStateIsDirty(); - - // We don't handle this any further, so just make sure we never pass it down */ gl_assert(mode != GL_POLYGON); target->output = _glActivePolyList(); + GLboolean header_required = (aligned_vector_header(&target->output->vector)->size == 0) || _glGPUStateIsDirty(); + target->count = (mode == GL_TRIANGLE_FAN) ? 
((count - 2) * 3) : count; - target->header_offset = target->output->vector.size; + target->header_offset = aligned_vector_header(&target->output->vector)->size; target->start_offset = target->header_offset + (header_required); gl_assert(target->count); diff --git a/GL/draw_fastpath.inc b/GL/draw_fastpath.inc index d945d50..d483b9d 100644 --- a/GL/draw_fastpath.inc +++ b/GL/draw_fastpath.inc @@ -5,75 +5,123 @@ MAKE_FUNC(POLYMODE) { - const Vertex* const start = _glSubmissionTargetStart(target); - const VertexExtra* const ve_start = aligned_vector_at(target->extras, 0); - - const GLuint vstride = ATTRIB_POINTERS.vertex.stride; - GLuint uvstride = ATTRIB_POINTERS.uv.stride; - GLuint ststride = ATTRIB_POINTERS.st.stride; - GLuint dstride = ATTRIB_POINTERS.colour.stride; - GLuint nstride = ATTRIB_POINTERS.normal.stride; - - const GLubyte* pos = (ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG) ? ATTRIB_POINTERS.vertex.ptr + (first * vstride) : NULL; - const GLubyte* uv = (ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) ? ATTRIB_POINTERS.uv.ptr + (first * uvstride) : NULL; - const GLubyte* col = (ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) ? ATTRIB_POINTERS.colour.ptr + (first * dstride) : NULL; - const GLubyte* st = (ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) ? ATTRIB_POINTERS.st.ptr + (first * ststride) : NULL; - const GLubyte* n = (ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) ? ATTRIB_POINTERS.normal.ptr + (first * nstride) : NULL; - - const float w = 1.0f; - - if(!pos) { + static const float w = 1.0f; + if(!(ENABLED_VERTEX_ATTRIBUTES & VERTEX_ENABLED_FLAG)) { /* If we don't have vertices, do nothing */ return; } - if(!col) { - col = (GLubyte*) &U4ONE; - dstride = 0; - } + /* This is the best value we have. PROCESS_VERTEX_FLAGS needs to operate on quads and tris and so + this needs to be divisible by 4 and 3. Even though we should be able to go much higher than this + and still be cache-local, trial and error says otherwise... 
*/ - if(!uv) { - uv = (GLubyte*) &F2ZERO; - uvstride = 0; - } +#define BATCH_SIZE 60 - if(!st) { - st = (GLubyte*) &F2ZERO; - ststride = 0; - } + GLuint min = 0; + GLuint stride; + const GLubyte* ptr; + Vertex* it; + VertexExtra* ve; - if(!n) { - n = (GLubyte*) &F3Z; - nstride = 0; - } - VertexExtra* ve = (VertexExtra*) ve_start; - Vertex* it = (Vertex*) start; + for(min = 0; min < count; min += BATCH_SIZE) { + const Vertex* start = ((Vertex*) _glSubmissionTargetStart(target)) + min; + const int_fast32_t loop = ((min + BATCH_SIZE) > count) ? count - min : BATCH_SIZE; + const int offset = (first + min); - for(int_fast32_t i = 0; i < count; ++i) { - TransformVertex((const float*) pos, &w, it->xyz, &it->w); - pos += vstride; - PREFETCH(pos); + stride = ATTRIB_POINTERS.vertex.stride; + ptr = ATTRIB_POINTERS.vertex.ptr + (offset * stride); + it = (Vertex*) start; - *((Float2*) it->uv) = *((Float2*) uv); - uv += uvstride; - PREFETCH(uv); + PREFETCH(ptr); + for(int_fast32_t i = 0; i < loop; ++i, ++it) { + PREFETCH(ptr + stride); + TransformVertex((const float*) ptr, &w, it->xyz, &it->w); + PROCESS_VERTEX_FLAGS(it, min + i); + ptr += stride; + } - *((uint32_t*) it->bgra) = *((uint32_t*) col); - col += dstride; - PREFETCH(col); + stride = ATTRIB_POINTERS.uv.stride; + ptr = (ENABLED_VERTEX_ATTRIBUTES & UV_ENABLED_FLAG) ? ATTRIB_POINTERS.uv.ptr + ((first + min) * stride) : NULL; + it = (Vertex*) start; - *((Float2*) ve->st) = *((Float2*) st); - st += ststride; - PREFETCH(st); + if(ptr) { + PREFETCH(ptr); + for(int_fast32_t i = 0; i < loop; ++i, ++it) { + PREFETCH(ptr + stride); + it->uv[0] = ((float*) ptr)[0]; + it->uv[1] = ((float*) ptr)[1]; + ptr += stride; + } + } else { + for(int_fast32_t i = 0; i < loop; ++i, ++it) { + it->uv[0] = 0; + it->uv[1] = 0; + } + } - *((Float3*) ve->nxyz) = *((Float3*) n); - n += nstride; - PREFETCH(n); + stride = ATTRIB_POINTERS.colour.stride; + ptr = (ENABLED_VERTEX_ATTRIBUTES & DIFFUSE_ENABLED_FLAG) ? 
ATTRIB_POINTERS.colour.ptr + (offset * stride) : NULL; + it = (Vertex*) start; - PROCESS_VERTEX_FLAGS(it, i); + if(ptr) { + PREFETCH(ptr); + for(int_fast32_t i = 0; i < loop; ++i, ++it) { + PREFETCH(ptr + stride); + it->bgra[0] = ptr[0]; + it->bgra[1] = ptr[1]; + it->bgra[2] = ptr[2]; + it->bgra[3] = ptr[3]; + ptr += stride; + } + } else { + for(int_fast32_t i = 0; i < loop; ++i, ++it) { + *((uint32_t*) it->bgra) = ~0; + } + } - ++it; - ++ve; + start = aligned_vector_at(target->extras, min); + + stride = ATTRIB_POINTERS.st.stride; + ptr = (ENABLED_VERTEX_ATTRIBUTES & ST_ENABLED_FLAG) ? ATTRIB_POINTERS.st.ptr + (offset * stride) : NULL; + ve = (VertexExtra*) start; + + if(ptr) { + PREFETCH(ptr); + + for(int_fast32_t i = 0; i < loop; ++i, ++ve) { + PREFETCH(ptr + stride); + ve->st[0] = ((float*) ptr)[0]; + ve->st[1] = ((float*) ptr)[1]; + ptr += stride; + } + } else { + for(int_fast32_t i = 0; i < loop; ++i, ++ve) { + ve->st[0] = 0; + ve->st[1] = 0; + } + } + + stride = ATTRIB_POINTERS.normal.stride; + ptr = (ENABLED_VERTEX_ATTRIBUTES & NORMAL_ENABLED_FLAG) ? 
ATTRIB_POINTERS.normal.ptr + (offset * stride) : NULL; + ve = (VertexExtra*) start; + + if(ptr) { + PREFETCH(ptr); + + for(int_fast32_t i = 0; i < loop; ++i, ++ve) { + PREFETCH(ptr + stride); + ve->nxyz[0] = ((float*) ptr)[0]; + ve->nxyz[1] = ((float*) ptr)[1]; + ve->nxyz[2] = ((float*) ptr)[2]; + ptr += stride; + } + } else { + for(int_fast32_t i = 0; i < loop; ++i, ++ve) { + ve->nxyz[0] = 0; + ve->nxyz[1] = 0; + ve->nxyz[2] = 0; + } + } } } diff --git a/GL/flush.c b/GL/flush.c index 11cf3f4..cf85a4e 100644 --- a/GL/flush.c +++ b/GL/flush.c @@ -93,21 +93,21 @@ void APIENTRY glKosSwapBuffers() { TRACE(); SceneBegin(); - if(OP_LIST.vector.size > 2) { + if(aligned_vector_header(&OP_LIST.vector)->size > 2) { SceneListBegin(GPU_LIST_OP_POLY); - SceneListSubmit(OP_LIST.vector.data, OP_LIST.vector.size); + SceneListSubmit((Vertex*) aligned_vector_front(&OP_LIST.vector), aligned_vector_size(&OP_LIST.vector)); SceneListFinish(); } - if(PT_LIST.vector.size > 2) { + if(aligned_vector_header(&PT_LIST.vector)->size > 2) { SceneListBegin(GPU_LIST_PT_POLY); - SceneListSubmit(PT_LIST.vector.data, PT_LIST.vector.size); + SceneListSubmit((Vertex*) aligned_vector_front(&PT_LIST.vector), aligned_vector_size(&PT_LIST.vector)); SceneListFinish(); } - if(TR_LIST.vector.size > 2) { + if(aligned_vector_header(&TR_LIST.vector)->size > 2) { SceneListBegin(GPU_LIST_TR_POLY); - SceneListSubmit(TR_LIST.vector.data, TR_LIST.vector.size); + SceneListSubmit((Vertex*) aligned_vector_front(&TR_LIST.vector), aligned_vector_size(&TR_LIST.vector)); SceneListFinish(); } SceneFinish(); diff --git a/GL/immediate.c b/GL/immediate.c index c0e2adc..69dd7a4 100644 --- a/GL/immediate.c +++ b/GL/immediate.c @@ -50,7 +50,7 @@ void _glInitImmediateMode(GLuint initial_size) { aligned_vector_init(&VERTICES, sizeof(IMVertex)); aligned_vector_reserve(&VERTICES, initial_size); - IM_ATTRIBS.vertex.ptr = VERTICES.data; + IM_ATTRIBS.vertex.ptr = aligned_vector_front(&VERTICES); IM_ATTRIBS.vertex.size = 3; 
IM_ATTRIBS.vertex.type = GL_FLOAT; IM_ATTRIBS.vertex.stride = sizeof(IMVertex); @@ -161,12 +161,11 @@ void APIENTRY glColor3fv(const GLfloat* v) { void APIENTRY glVertex3f(GLfloat x, GLfloat y, GLfloat z) { IM_ENABLED_VERTEX_ATTRIBUTES |= VERTEX_ENABLED_FLAG; - unsigned int cap = VERTICES.capacity; + uint32_t cap = aligned_vector_capacity(&VERTICES); IMVertex* vert = aligned_vector_extend(&VERTICES, 1); - - if(cap != VERTICES.capacity) { + if(cap != aligned_vector_capacity(&VERTICES)) { /* Resizing could've invalidated the pointers */ - IM_ATTRIBS.vertex.ptr = VERTICES.data; + IM_ATTRIBS.vertex.ptr = aligned_vector_front(&VERTICES); IM_ATTRIBS.uv.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 3); IM_ATTRIBS.st.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 5); IM_ATTRIBS.colour.ptr = IM_ATTRIBS.vertex.ptr + (sizeof(GLfloat) * 7); @@ -281,7 +280,7 @@ void APIENTRY glEnd() { FAST_PATH_ENABLED = GL_TRUE; #endif - glDrawArrays(ACTIVE_POLYGON_MODE, 0, VERTICES.size); + glDrawArrays(ACTIVE_POLYGON_MODE, 0, aligned_vector_header(&VERTICES)->size); ATTRIB_POINTERS = stashed_attrib_pointers; diff --git a/GL/platform.h b/GL/platform.h index 1348d6f..4e1e65b 100644 --- a/GL/platform.h +++ b/GL/platform.h @@ -5,6 +5,7 @@ #include #include "gl_assert.h" +#include "types.h" #define MEMSET(dst, v, size) memset((dst), (v), (size)) @@ -260,7 +261,7 @@ typedef float Matrix4x4[16]; void SceneBegin(); void SceneListBegin(GPUList list); -void SceneListSubmit(void* src, int n); +void SceneListSubmit(Vertex* v2, int n); void SceneListFinish(); void SceneFinish(); diff --git a/GL/platforms/sh4.c b/GL/platforms/sh4.c index 0d2a35c..addc6fe 100644 --- a/GL/platforms/sh4.c +++ b/GL/platforms/sh4.c @@ -9,11 +9,7 @@ #define likely(x) __builtin_expect(!!(x), 1) #define unlikely(x) __builtin_expect(!!(x), 0) -#define SQ_BASE_ADDRESS (uint32_t *)(void *) \ - (0xe0000000 | (((uint32_t)0x10000000) & 0x03ffffe0)) - - -static volatile uint32_t* PVR_LMMODE0 = (uint32_t*) 0xA05F6884; +#define 
SQ_BASE_ADDRESS (void*) 0xe0000000 GL_FORCE_INLINE bool glIsVertex(const float flags) { @@ -54,377 +50,382 @@ GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex, const float h) { const float f = _glFastInvert(vertex->w); /* Convert to NDC and apply viewport */ - vertex->xyz[0] = __builtin_fmaf( - VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth - ); - - vertex->xyz[1] = h - __builtin_fmaf( - VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight - ); + vertex->xyz[0] = (vertex->xyz[0] * f * 320) + 320; + vertex->xyz[1] = (vertex->xyz[1] * f * -240) + 240; /* Orthographic projections need to use invZ otherwise we lose the depth information. As w == 1, and clip-space range is -w to +w we add 1.0 to the Z to bring it into range. We add a little extra to avoid a divide by zero. */ - - vertex->xyz[2] = (vertex->w == 1.0f) ? _glFastInvert(1.0001f + vertex->xyz[2]) : f; + if(vertex->w == 1.0f) { + vertex->xyz[2] = _glFastInvert(1.0001f + vertex->xyz[2]); + } else { + vertex->xyz[2] = f; + } } -GL_FORCE_INLINE void _glSubmitHeaderOrVertex(uint32_t* d, const Vertex* v) { -#ifndef NDEBUG - gl_assert(!isnan(v->xyz[2])); - gl_assert(!isnan(v->w)); -#endif -#if CLIP_DEBUG - printf("Submitting: %x (%x)\n", v, v->flags); -#endif +volatile uint32_t *sq = SQ_BASE_ADDRESS; - uint32_t *s = (uint32_t*) v; - d[0] = *(s++); - d[1] = *(s++); - d[2] = *(s++); - d[3] = *(s++); - d[4] = *(s++); - d[5] = *(s++); - d[6] = *(s++); - d[7] = *(s++); - __asm__("pref @%0" : : "r"(d)); - d += 8; +static inline void _glFlushBuffer() {} +static inline void _glPushHeaderOrVertex(Vertex* v) { + uint32_t* s = (uint32_t*) v; + sq[0] = *(s++); + sq[1] = *(s++); + sq[2] = *(s++); + sq[3] = *(s++); + sq[4] = *(s++); + sq[5] = *(s++); + sq[6] = *(s++); + sq[7] = *(s++); + __asm__("pref @%0" : : "r"(sq)); + sq += 8; } -static struct __attribute__((aligned(32))) { - Vertex* v; - int visible; -} triangle[3]; - -static int tri_count = 0; -static int strip_count = 0; - -static inline 
void interpolateColour(const uint32_t* a, const uint32_t* b, const float t, uint32_t* out) { - const static uint32_t MASK1 = 0x00FF00FF; - const static uint32_t MASK2 = 0xFF00FF00; - - const uint32_t f2 = 256 * t; - const uint32_t f1 = 256 - f2; - - *out = (((((*a & MASK1) * f1) + ((*b & MASK1) * f2)) >> 8) & MASK1) | - (((((*a & MASK2) * f1) + ((*b & MASK2) * f2)) >> 8) & MASK2); -} - -static inline void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) { - /* Clipping time! */ +static inline void _glClipEdge(const Vertex* const v1, const Vertex* const v2, Vertex* vout) { + const static float o = 0.003921569f; // 1 / 255 const float d0 = v1->w + v1->xyz[2]; const float d1 = v2->w + v2->xyz[2]; - const float sign = ((2.0f * (d1 < d0)) - 1.0f); - const float epsilon = -0.00001f * sign; - const float n = (d0 - d1); - const float r = (1.f / sqrtf(n * n)) * sign; - float t = fmaf(r, d0, epsilon); + const float t = (fabs(d0) * (1.0f / sqrtf((d1 - d0) * (d1 - d0)))) + 0.000001f; + const float invt = 1.0f - t; - vout->xyz[0] = fmaf(v2->xyz[0] - v1->xyz[0], t, v1->xyz[0]); - vout->xyz[1] = fmaf(v2->xyz[1] - v1->xyz[1], t, v1->xyz[1]); - vout->xyz[2] = fmaf(v2->xyz[2] - v1->xyz[2], t, v1->xyz[2]); - vout->w = fmaf(v2->w - v1->w, t, v1->w); + vout->xyz[0] = invt * v1->xyz[0] + t * v2->xyz[0]; + vout->xyz[1] = invt * v1->xyz[1] + t * v2->xyz[1]; + vout->xyz[2] = invt * v1->xyz[2] + t * v2->xyz[2]; - vout->uv[0] = fmaf(v2->uv[0] - v1->uv[0], t, v1->uv[0]); - vout->uv[1] = fmaf(v2->uv[1] - v1->uv[1], t, v1->uv[1]); + vout->uv[0] = invt * v1->uv[0] + t * v2->uv[0]; + vout->uv[1] = invt * v1->uv[1] + t * v2->uv[1]; - interpolateColour((uint32_t*) v1->bgra, (uint32_t*) v2->bgra, t, (uint32_t*) vout->bgra); -} + vout->w = invt * v1->w + t * v2->w; -GL_FORCE_INLINE void ClearTriangle() { - tri_count = 0; -} + const float m = 255 * t; + const float n = 255 - m; -static inline void ShiftTriangle() { - if(!tri_count) { - return; - } - - tri_count--; - triangle[0] = 
triangle[1]; - triangle[1] = triangle[2]; - -#ifndef NDEBUG - triangle[2].v = NULL; - triangle[2].visible = false; -#endif -} - - -static inline void ShiftRotateTriangle() { - if(!tri_count) { - return; - } - - if(triangle[0].v < triangle[1].v) { - triangle[0] = triangle[2]; - } else { - triangle[1] = triangle[2]; - } - - tri_count--; + vout->bgra[0] = (v1->bgra[0] * n + v2->bgra[0] * m) * o; + vout->bgra[1] = (v1->bgra[1] * n + v2->bgra[1] * m) * o; + vout->bgra[2] = (v1->bgra[2] * n + v2->bgra[2] * m) * o; + vout->bgra[3] = (v1->bgra[3] * n + v2->bgra[3] * m) * o; } #define SPAN_SORT_CFG 0x005F8030 +static volatile uint32_t* PVR_LMMODE0 = (uint32_t*) 0xA05F6884; +static volatile uint32_t *PVR_LMMODE1 = (uint32_t*) 0xA05F6888; +static volatile uint32_t *QACR = (uint32_t*) 0xFF000038; + +void SceneListSubmit(Vertex* v2, int n) { + /* You need at least a header, and 3 vertices to render anything */ + if(n < 4) { + return; + } -void SceneListSubmit(void* src, int n) { const float h = GetVideoMode()->height; PVR_SET(SPAN_SORT_CFG, 0x0); //Set PVR DMA registers - volatile int *pvrdmacfg = (int*)0xA05F6888; - pvrdmacfg[0] = 1; - pvrdmacfg[1] = 0; + *PVR_LMMODE0 = 0; + *PVR_LMMODE1 = 0; //Set QACR registers - volatile int *qacr = (int*)0xFF000038; - qacr[1] = qacr[0] = 0x11; - - uint32_t *d = SQ_BASE_ADDRESS; - - Vertex __attribute__((aligned(32))) tmp; - - /* Perform perspective divide on each vertex */ - Vertex* vertex = (Vertex*) src; - - if(!_glNearZClippingEnabled()) { - /* Prep store queues */ - - while(n--) { - if(glIsVertex(vertex->flags)) { - _glPerspectiveDivideVertex(vertex, h); - } - - _glSubmitHeaderOrVertex(d, vertex); - ++vertex; - } - - return; - } - - tri_count = 0; - strip_count = 0; + QACR[1] = QACR[0] = 0x11; #if CLIP_DEBUG - printf("----\n"); -#endif + Vertex* vertex = (Vertex*) src; + for(int i = 0; i < n; ++i) { + fprintf(stderr, "{%f, %f, %f, %f}, // %x (%x)\n", vertex[i].xyz[0], vertex[i].xyz[1], vertex[i].xyz[2], vertex[i].w, vertex[i].flags, 
&vertex[i]); + } - for(int i = 0; i < n; ++i, ++vertex) { - PREFETCH(vertex + 1); - PREFETCH(vertex + 2); - /* Wait until we fill the triangle */ - if(tri_count < 3) { - if(glIsVertex(vertex->flags)) { - ++strip_count; - triangle[tri_count].v = vertex; - triangle[tri_count].visible = vertex->xyz[2] >= -vertex->w; - if(++tri_count < 3) { + fprintf(stderr, "----\n"); +#endif + uint8_t visible_mask = 0; + uint8_t counter = 0; + + sq = SQ_BASE_ADDRESS; + + for(int i = 0; i < n; ++i, ++v2) { + PREFETCH(v2 + 1); + switch(v2->flags) { + case GPU_CMD_VERTEX_EOL: + if(counter < 2) { continue; } - } else { - /* We hit a header */ - tri_count = 0; - strip_count = 0; - _glSubmitHeaderOrVertex(d, vertex); + + counter = 0; + break; + case GPU_CMD_VERTEX: + ++counter; + if(counter < 3) { + continue; + } + break; + default: + _glPushHeaderOrVertex(v2); + counter = 0; continue; - } - } + }; -#if CLIP_DEBUG - printf("SC: %d\n", strip_count); -#endif + Vertex* const v0 = v2 - 2; + Vertex* const v1 = v2 - 1; - /* If we got here, then triangle contains 3 vertices */ - int visible_mask = triangle[0].visible | (triangle[1].visible << 1) | (triangle[2].visible << 2); - - /* Clipping time! - - There are 6 distinct possibilities when clipping a triangle. 3 of them result - in another triangle, 3 of them result in a quadrilateral. - - Assuming you iterate the edges of the triangle in order, and create a new *visible* - vertex when you cross the plane, and discard vertices behind the plane, then the only - difference between the two cases is that the final two vertices that need submitting have - to be reversed. - - Unfortunately we have to copy vertices here, because if we persp-divide a vertex it may - be used in a subsequent triangle in the strip and would end up being double divided. 
- */ - -#define SUBMIT_QUEUED() \ - if(strip_count > 3) { \ - tmp = *(vertex - 2); \ - /* If we had triangles ahead of this one, submit and finalize */ \ - _glPerspectiveDivideVertex(&tmp, h); \ - _glSubmitHeaderOrVertex(d, &tmp); \ - tmp = *(vertex - 1); \ - tmp.flags = GPU_CMD_VERTEX_EOL; \ - _glPerspectiveDivideVertex(&tmp, h); \ - _glSubmitHeaderOrVertex(d, &tmp); \ - } - - bool is_last_in_strip = glIsLastVertex(vertex->flags); + visible_mask = ( + (v0->xyz[2] > -v0->w) << 0 | + (v1->xyz[2] > -v1->w) << 1 | + (v2->xyz[2] > -v2->w) << 2 | + (counter == 0) << 3 + ); switch(visible_mask) { - case 1: { - SUBMIT_QUEUED(); - /* 0, 0a, 2a */ - tmp = *triangle[0].v; - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); + case 15: /* All visible, but final vertex in strip */ + { + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); - _glClipEdge(triangle[0].v, triangle[1].v, &tmp); - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); + _glPerspectiveDivideVertex(v1, h); + _glPushHeaderOrVertex(v1); - _glClipEdge(triangle[2].v, triangle[0].v, &tmp); - tmp.flags = GPU_CMD_VERTEX_EOL; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - } break; - case 2: { - SUBMIT_QUEUED(); - /* 0a, 1, 1a */ - _glClipEdge(triangle[0].v, triangle[1].v, &tmp); - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - tmp = *triangle[1].v; - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - _glClipEdge(triangle[1].v, triangle[2].v, &tmp); - tmp.flags = GPU_CMD_VERTEX_EOL; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - } break; - case 3: { - SUBMIT_QUEUED(); - /* 0, 1, 2a, 1a */ - tmp = *triangle[0].v; - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - tmp = 
*triangle[1].v; - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - _glClipEdge(triangle[2].v, triangle[0].v, &tmp); - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - _glClipEdge(triangle[1].v, triangle[2].v, &tmp); - tmp.flags = GPU_CMD_VERTEX_EOL; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - } break; - case 4: { - SUBMIT_QUEUED(); - /* 1a, 2, 2a */ - _glClipEdge(triangle[1].v, triangle[2].v, &tmp); - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - tmp = *triangle[2].v; - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - _glClipEdge(triangle[2].v, triangle[0].v, &tmp); - tmp.flags = GPU_CMD_VERTEX_EOL; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - } break; - case 5: { - SUBMIT_QUEUED(); - /* 0, 0a, 2, 1a */ - tmp = *triangle[0].v; - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - _glClipEdge(triangle[0].v, triangle[1].v, &tmp); - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - tmp = *triangle[2].v; - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - _glClipEdge(triangle[1].v, triangle[2].v, &tmp); - tmp.flags = GPU_CMD_VERTEX_EOL; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - } break; - case 6: { - SUBMIT_QUEUED(); - /* 0a, 1, 2a, 2 */ - _glClipEdge(triangle[0].v, triangle[1].v, &tmp); - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - tmp = *triangle[1].v; - tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - _glClipEdge(triangle[2].v, triangle[0].v, &tmp); - 
tmp.flags = GPU_CMD_VERTEX; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - - tmp = *triangle[2].v; - tmp.flags = GPU_CMD_VERTEX_EOL; - _glPerspectiveDivideVertex(&tmp, h); - _glSubmitHeaderOrVertex(d, &tmp); - } break; - case 7: { - /* All the vertices are visible! We divide and submit v0, then shift */ - _glPerspectiveDivideVertex(vertex - 2, h); - _glSubmitHeaderOrVertex(d, vertex - 2); - - if(is_last_in_strip) { - _glPerspectiveDivideVertex(vertex - 1, h); - _glSubmitHeaderOrVertex(d, vertex - 1); - _glPerspectiveDivideVertex(vertex, h); - _glSubmitHeaderOrVertex(d, vertex); - tri_count = 0; - strip_count = 0; - } - - ShiftRotateTriangle(); - continue; - } break; - case 0: - default: + _glPerspectiveDivideVertex(v2, h); + _glPushHeaderOrVertex(v2); + } break; - } + case 7: + /* All visible, push the first vertex and move on */ + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + break; + case 9: + /* First vertex was visible, last in strip */ + { + Vertex __attribute__((aligned(32))) scratch[2]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; - /* If this was the last in the strip, we don't need to - submit anything else, we just wipe the tri_count */ - if(is_last_in_strip) { - tri_count = 0; - strip_count = 0; - } else { - ShiftRotateTriangle(); - strip_count = 2; + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v2, v0, b); + b->flags = GPU_CMD_VERTEX_EOL; + + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + } + break; + case 1: + /* First vertex was visible, but not last in strip */ + { + Vertex __attribute__((aligned(32))) scratch[2]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v2, v0, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(v0, h); + 
_glPushHeaderOrVertex(v0); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + _glPushHeaderOrVertex(b); + } + break; + case 10: + case 2: + /* Second vertex was visible. In this case we need to create a triangle and produce + two new vertices: 1-2, and 2-3. */ + { + Vertex __attribute__((aligned(32))) scratch[2]; + Vertex* a = &scratch[0]; + Vertex* c = &scratch[1]; + + memcpy_vertex(c, v1); + + _glClipEdge(v0, c, a); + a->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glClipEdge(c, v2, a); + a->flags = v2->flags; + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + } + break; + case 11: + case 3: /* First and second vertex were visible */ + { + Vertex __attribute__((aligned(32))) scratch[3]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + + memcpy_vertex(c, v1); + + _glClipEdge(v2, v0, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + + _glClipEdge(v1, v2, a); + a->flags = v2->flags; + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(c); + _glPushHeaderOrVertex(a); + } + break; + case 12: + case 4: + /* Third vertex was visible. 
*/ + { + Vertex __attribute__((aligned(32))) scratch[3]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + + memcpy_vertex(c, v2); + + _glClipEdge(v2, v0, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v1, v2, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + if(counter % 2 == 1) { + _glPushHeaderOrVertex(a); + } + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + } + break; + case 13: + { + Vertex __attribute__((aligned(32))) scratch[3]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + + memcpy_vertex(c, v2); + c->flags = GPU_CMD_VERTEX; + + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v1, v2, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + + c->flags = GPU_CMD_VERTEX_EOL; + _glPushHeaderOrVertex(c); + } + break; + case 5: /* First and third vertex were visible */ + { + Vertex __attribute__((aligned(32))) scratch[3]; + Vertex* a = &scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + + memcpy_vertex(c, v2); + c->flags = GPU_CMD_VERTEX; + + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v1, v2, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(v0, h); + _glPushHeaderOrVertex(v0); + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + _glPushHeaderOrVertex(c); + } + break; + case 14: + case 6: /* Second and third vertex were visible */ + { + Vertex __attribute__((aligned(32))) scratch[4]; + Vertex* a = 
&scratch[0]; + Vertex* b = &scratch[1]; + Vertex* c = &scratch[2]; + Vertex* d = &scratch[3]; + + memcpy_vertex(c, v1); + memcpy_vertex(d, v2); + + _glClipEdge(v0, v1, a); + a->flags = GPU_CMD_VERTEX; + + _glClipEdge(v2, v0, b); + b->flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(a, h); + _glPushHeaderOrVertex(a); + + _glPerspectiveDivideVertex(c, h); + _glPushHeaderOrVertex(c); + + _glPerspectiveDivideVertex(b, h); + _glPushHeaderOrVertex(b); + _glPushHeaderOrVertex(c); + + _glPerspectiveDivideVertex(d, h); + _glPushHeaderOrVertex(d); + } + break; + default: + break; } } + + _glFlushBuffer(); } void SceneListFinish() { diff --git a/GL/platforms/software.c b/GL/platforms/software.c index bd527c9..6b5a9ee 100644 --- a/GL/platforms/software.c +++ b/GL/platforms/software.c @@ -255,7 +255,7 @@ GL_FORCE_INLINE void ShiftRotateTriangle() { tri_count--; } -void SceneListSubmit(void* src, int n) { +void SceneListSubmit(Vertex* src, int n) { /* Perform perspective divide on each vertex */ Vertex* vertex = (Vertex*) src; diff --git a/GL/private.h b/GL/private.h index f309571..f945ccb 100644 --- a/GL/private.h +++ b/GL/private.h @@ -233,11 +233,41 @@ GL_FORCE_INLINE float clamp(float d, float min, float max) { return (d < min) ? min : (d > max) ? 
max : d; } +GL_FORCE_INLINE void memcpy_vertex(Vertex *dest, const Vertex *src) { +#ifdef __DREAMCAST__ + _Complex float double_scratch; + + asm volatile ( + "fschg\n\t" + "clrs\n\t" + ".align 2\n\t" + "fmov.d @%[in]+, %[scratch]\n\t" + "fmov.d %[scratch], @%[out]\n\t" + "fmov.d @%[in]+, %[scratch]\n\t" + "add #8, %[out]\n\t" + "fmov.d %[scratch], @%[out]\n\t" + "fmov.d @%[in]+, %[scratch]\n\t" + "add #8, %[out]\n\t" + "fmov.d %[scratch], @%[out]\n\t" + "fmov.d @%[in], %[scratch]\n\t" + "add #8, %[out]\n\t" + "fmov.d %[scratch], @%[out]\n\t" + "fschg\n" + : [in] "+&r" ((uint32_t) src), [scratch] "=&d" (double_scratch), [out] "+&r" ((uint32_t) dest) + : + : "t", "memory" // clobbers + ); +#else + *dest = *src; +#endif +} + #define swapVertex(a, b) \ do { \ - Vertex c = *a; \ - *a = *b; \ - *b = c; \ + Vertex __attribute__((aligned(32))) c; \ + memcpy_vertex(&c, a); \ + memcpy_vertex(a, b); \ + memcpy_vertex(b, &c); \ } while(0) /* ClipVertex doesn't have room for these, so we need to parse them diff --git a/GL/state.c b/GL/state.c index 618eefd..e1bfe14 100644 --- a/GL/state.c +++ b/GL/state.c @@ -403,8 +403,8 @@ GLAPI void APIENTRY glEnable(GLenum cap) { } break; case GL_CULL_FACE: { - if(GPUState.cull_face != GL_TRUE) { - GPUState.cull_face = GL_TRUE; + if(GPUState.culling_enabled != GL_TRUE) { + GPUState.culling_enabled = GL_TRUE; GPUState.is_dirty = GL_TRUE; } @@ -507,8 +507,8 @@ GLAPI void APIENTRY glDisable(GLenum cap) { } break; case GL_CULL_FACE: { - if(GPUState.cull_face != GL_FALSE) { - GPUState.cull_face = GL_FALSE; + if(GPUState.culling_enabled != GL_FALSE) { + GPUState.culling_enabled = GL_FALSE; GPUState.is_dirty = GL_TRUE; } diff --git a/containers/aligned_vector.c b/containers/aligned_vector.c index 442e0b9..15729ae 100644 --- a/containers/aligned_vector.c +++ b/containers/aligned_vector.c @@ -12,36 +12,44 @@ #include "aligned_vector.h" -extern inline void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count); -extern inline 
void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count); -extern inline void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count); -extern inline void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count); +extern inline void* aligned_vector_resize(AlignedVector* vector, const uint32_t element_count); +extern inline void* aligned_vector_extend(AlignedVector* vector, const uint32_t additional_count); +extern inline void* aligned_vector_reserve(AlignedVector* vector, uint32_t element_count); +extern inline void* aligned_vector_push_back(AlignedVector* vector, const void* objs, uint32_t count); -void aligned_vector_init(AlignedVector* vector, unsigned int element_size) { - vector->size = vector->capacity = 0; - vector->element_size = element_size; - vector->data = NULL; +void aligned_vector_init(AlignedVector* vector, uint32_t element_size) { + /* Now initialize the header*/ + AlignedVectorHeader* const hdr = &vector->hdr; + hdr->size = 0; + hdr->capacity = ALIGNED_VECTOR_CHUNK_SIZE; + hdr->element_size = element_size; - /* Reserve some initial capacity */ - aligned_vector_reserve(vector, ALIGNED_VECTOR_CHUNK_SIZE); + /* Reserve some initial capacity. 
This will do the allocation but not set up the header */ + void* ptr = aligned_vector_reserve(vector, ALIGNED_VECTOR_CHUNK_SIZE); + assert(ptr); + (void) ptr; } void aligned_vector_shrink_to_fit(AlignedVector* vector) { - if(vector->size == 0) { + AlignedVectorHeader* const hdr = &vector->hdr; + if(hdr->size == 0) { + uint32_t element_size = hdr->element_size; free(vector->data); - vector->data = NULL; - vector->capacity = 0; + + /* Reallocate the header */ + vector->data = memalign(0x20, sizeof(AlignedVectorHeader)); + hdr->size = hdr->capacity = 0; + hdr->element_size = element_size; } else { - unsigned int new_byte_size = vector->size * vector->element_size; - unsigned char* original_data = vector->data; + uint32_t new_byte_size = (hdr->size * hdr->element_size); + uint8_t* original_data = vector->data; vector->data = (unsigned char*) memalign(0x20, new_byte_size); if(original_data) { FASTCPY(vector->data, original_data, new_byte_size); free(original_data); } - - vector->capacity = vector->size; + hdr->capacity = hdr->size; } } diff --git a/containers/aligned_vector.h b/containers/aligned_vector.h index 3400fe1..5109b8b 100644 --- a/containers/aligned_vector.h +++ b/containers/aligned_vector.h @@ -4,6 +4,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -12,6 +13,7 @@ extern "C" { #if defined(__APPLE__) || defined(__WIN32__) /* Linux + Kos define this, OSX does not, so just use malloc there */ static inline void* memalign(size_t alignment, size_t size) { + (void) alignment; return malloc(size); } #else @@ -65,10 +67,14 @@ AV_FORCE_INLINE void *AV_MEMCPY4(void *dest, const void *src, size_t len) #endif typedef struct { - uint8_t* __attribute__((aligned(32))) data; uint32_t size; uint32_t capacity; uint32_t element_size; +} __attribute__((aligned(32))) AlignedVectorHeader; + +typedef struct { + AlignedVectorHeader hdr; + uint8_t* data; } AlignedVector; #define ALIGNED_VECTOR_CHUNK_SIZE 256u @@ -78,90 +84,129 @@ typedef struct { ((((v) + 
ALIGNED_VECTOR_CHUNK_SIZE - 1) / ALIGNED_VECTOR_CHUNK_SIZE) * ALIGNED_VECTOR_CHUNK_SIZE) -void aligned_vector_init(AlignedVector* vector, unsigned int element_size); +void aligned_vector_init(AlignedVector* vector, uint32_t element_size); -AV_FORCE_INLINE void* aligned_vector_reserve(AlignedVector* vector, unsigned int element_count) { - if(element_count <= vector->capacity) { - return NULL; +AV_FORCE_INLINE void* aligned_vector_at(const AlignedVector* vector, const uint32_t index) { + const AlignedVectorHeader* hdr = &vector->hdr; + assert(index < hdr->size); + return vector->data + (index * hdr->element_size); +} + +AV_FORCE_INLINE void* aligned_vector_reserve(AlignedVector* vector, uint32_t element_count) { + AlignedVectorHeader* hdr = &vector->hdr; + + if(element_count < hdr->capacity) { + return aligned_vector_at(vector, element_count); } - unsigned int original_byte_size = vector->size * vector->element_size; + uint32_t original_byte_size = (hdr->size * hdr->element_size); /* We overallocate so that we don't make small allocations during push backs */ element_count = ROUND_TO_CHUNK_SIZE(element_count); - unsigned int new_byte_size = element_count * vector->element_size; - unsigned char* original_data = vector->data; + uint32_t new_byte_size = (element_count * hdr->element_size); + uint8_t* original_data = vector->data; - vector->data = (unsigned char*) memalign(0x20, new_byte_size); + vector->data = (uint8_t*) memalign(0x20, new_byte_size); assert(vector->data); - if(original_data) { - AV_MEMCPY4(vector->data, original_data, original_byte_size); - free(original_data); - } - - vector->capacity = element_count; + AV_MEMCPY4(vector->data, original_data, original_byte_size); + free(original_data); + hdr->capacity = element_count; return vector->data + original_byte_size; } -AV_FORCE_INLINE void* aligned_vector_at(const AlignedVector* vector, const unsigned int index) { - assert(index < vector->size); - return &vector->data[index * vector->element_size]; 
+AV_FORCE_INLINE AlignedVectorHeader* aligned_vector_header(const AlignedVector* vector) { + return (AlignedVectorHeader*) &vector->hdr; } -AV_FORCE_INLINE void* aligned_vector_resize(AlignedVector* vector, const unsigned int element_count) { +AV_FORCE_INLINE uint32_t aligned_vector_size(const AlignedVector* vector) { + const AlignedVectorHeader* hdr = &vector->hdr; + return hdr->size; +} + +AV_FORCE_INLINE uint32_t aligned_vector_capacity(const AlignedVector* vector) { + const AlignedVectorHeader* hdr = &vector->hdr; + return hdr->capacity; +} + +AV_FORCE_INLINE void* aligned_vector_front(const AlignedVector* vector) { + return vector->data; +} + +/* Resizes the array and returns a pointer to the first new element (if upsizing) or NULL (if downsizing) */ +AV_FORCE_INLINE void* aligned_vector_resize(AlignedVector* vector, const uint32_t element_count) { void* ret = NULL; - unsigned int previousCount = vector->size; - - if(vector->capacity < element_count) { + AlignedVectorHeader* hdr = &vector->hdr; + uint32_t previous_count = hdr->size; + if(hdr->capacity <= element_count) { /* If we didn't have capacity, increase capacity (slow) */ - vector->size = element_count; - ret = aligned_vector_reserve(vector, element_count); - } else if(previousCount < element_count) { + + aligned_vector_reserve(vector, element_count); + hdr->size = element_count; + + ret = aligned_vector_at(vector, previous_count); + + assert(hdr->size == element_count); + assert(hdr->size <= hdr->capacity); + } else if(previous_count < element_count) { /* So we grew, but had the capacity, just get a pointer to * where we were */ - vector->size = element_count; - ret = aligned_vector_at(vector, previousCount); - } else { - vector->size = element_count; + hdr->size = element_count; + assert(hdr->size < hdr->capacity); + ret = aligned_vector_at(vector, previous_count); + } else if(hdr->size != element_count) { + hdr->size = element_count; + assert(hdr->size < hdr->capacity); } return ret; } 
-AV_FORCE_INLINE void* aligned_vector_push_back(AlignedVector* vector, const void* objs, unsigned int count) { +AV_FORCE_INLINE void* aligned_vector_push_back(AlignedVector* vector, const void* objs, uint32_t count) { /* Resize enough room */ + AlignedVectorHeader* hdr = &vector->hdr; + assert(count); - assert(vector->element_size); + assert(hdr->element_size); - unsigned int initial_size = vector->size; - aligned_vector_resize(vector, vector->size + count); +#ifndef NDEBUG + uint32_t element_size = hdr->element_size; + uint32_t initial_size = hdr->size; +#endif - assert(vector->size == initial_size + count); - - unsigned char* dest = vector->data + (vector->element_size * initial_size); + uint8_t* dest = (uint8_t*) aligned_vector_resize(vector, hdr->size + count); + assert(dest); /* Copy the objects in */ - AV_MEMCPY4(dest, objs, vector->element_size * count); + AV_MEMCPY4(dest, objs, hdr->element_size * count); + assert(hdr->element_size == element_size); + assert(hdr->size == initial_size + count); return dest; } -AV_FORCE_INLINE void* aligned_vector_extend(AlignedVector* vector, const unsigned int additional_count) { - return aligned_vector_resize(vector, vector->size + additional_count); +AV_FORCE_INLINE void* aligned_vector_extend(AlignedVector* vector, const uint32_t additional_count) { + AlignedVectorHeader* hdr = &vector->hdr; + void* ret = aligned_vector_resize(vector, hdr->size + additional_count); + assert(ret); // Should always return something + return ret; } AV_FORCE_INLINE void aligned_vector_clear(AlignedVector* vector){ - vector->size = 0; + AlignedVectorHeader* hdr = &vector->hdr; + hdr->size = 0; } + void aligned_vector_shrink_to_fit(AlignedVector* vector); void aligned_vector_cleanup(AlignedVector* vector); -static inline void* aligned_vector_back(AlignedVector* vector){ - return aligned_vector_at(vector, vector->size - 1); + +AV_FORCE_INLINE void* aligned_vector_back(AlignedVector* vector){ + AlignedVectorHeader* hdr = &vector->hdr; + return 
aligned_vector_at(vector, hdr->size - 1); } #ifdef __cplusplus diff --git a/samples/cubes/main.cpp b/samples/cubes/main.cpp index eaeeaea..59a9f1b 100644 --- a/samples/cubes/main.cpp +++ b/samples/cubes/main.cpp @@ -1,18 +1,19 @@ + #include #include #include #include -#include "GL/gl.h" -#include "GL/glu.h" #ifdef __DREAMCAST__ #include -#include "GL/glext.h" -#include "GL/glkos.h" - float avgfps = -1; #endif +#include "GL/gl.h" +#include "GL/glkos.h" +#include "GL/glu.h" +#include "GL/glext.h" + #define PI 3.14159265358979323846264338327950288f #define RAD_TO_DEG 57.295779513082320876798154814105f #define MAX_CUBES 350 @@ -251,9 +252,7 @@ float rnd(float Min, float Max) void initialize() { debugLog("Initialize video output"); -#ifdef __DREAMCAST__ glKosInit(); -#endif glClearDepth(1.0); glDepthFunc(GL_LEQUAL); @@ -280,7 +279,7 @@ void initialize() glLoadIdentity(); // Set up colors (each face has a different color) - for (int i = 0; i < 6; i++) + for (int i = 0; i < 6; i++) { faceColors[i * 4] = colors[i]; faceColors[i * 4 + 1] = colors[i]; diff --git a/samples/loadbmp.c b/samples/loadbmp.c index cc8d7b7..65bd571 100644 --- a/samples/loadbmp.c +++ b/samples/loadbmp.c @@ -23,7 +23,11 @@ int ImageLoad(char *filename, Image *image) { } // seek through the bmp header, up to the width/height: - fseek(file, 18, SEEK_CUR); + fseek(file, 10, SEEK_CUR); + + uint32_t offset; + fread(&offset, 4, 1, file); + fseek(file, 4, SEEK_CUR); // read the width if ((i = fread(&sizeX, 4, 1, file)) != 1) { @@ -65,7 +69,7 @@ int ImageLoad(char *filename, Image *image) { } // seek past the rest of the bitmap header. - fseek(file, 24, SEEK_CUR); + fseek(file, offset, SEEK_SET); // read the data. 
image->data = (char *) malloc(size); diff --git a/samples/nehe10/romdisk/world.txt b/samples/nehe10/romdisk/world.txt index a3368bb..8f645ac 100644 --- a/samples/nehe10/romdisk/world.txt +++ b/samples/nehe10/romdisk/world.txt @@ -157,4 +157,4 @@ NUMPOLLIES 36 2.0 0.0 -0.5 0.0 0.0 3.0 1.0 -0.5 1.0 1.0 2.0 1.0 -0.5 0.0 1.0 -2.0 0.0 -0.5 0.0 0.0 \ No newline at end of file +2.0 0.0 -0.5 0.0 0.0 diff --git a/samples/quadmark/main.c b/samples/quadmark/main.c index 4da3046..e1bdcc9 100644 --- a/samples/quadmark/main.c +++ b/samples/quadmark/main.c @@ -68,21 +68,16 @@ int check_start() { } void setup() { - //PVR needs to warm up for a frame, or results will be low - glKosInit(); + GLdcConfig cfg; + glKosInitConfig(&cfg); + cfg.initial_immediate_capacity = 14000; + glKosInitEx(&cfg); + glMatrixMode(GL_MODELVIEW); glLoadIdentity(); glOrtho(0, 640, 0, 480, -100, 100); glMatrixMode(GL_PROJECTION); glLoadIdentity(); - - glDisable(GL_NEARZ_CLIPPING_KOS); - -#ifdef __DREAMCAST__ - pvr_wait_ready(); - pvr_scene_begin(); - pvr_scene_finish(); -#endif } void do_frame() { @@ -116,6 +111,8 @@ time_t begin; void switch_tests(int ppf) { printf("Beginning new test: %d polys per frame (%d per second at 60fps)\n", ppf * 2, ppf * 2 * 60); + fflush(stdout); + avgfps = -1; polycnt = ppf; } @@ -128,7 +125,6 @@ void check_switch() { if(now >= (begin + 5)) { begin = time(NULL); printf(" Average Frame Rate: ~%f fps (%d pps)\n", avgfps, (int)(polycnt * avgfps * 2)); - switch(phase) { case PHASE_HALVE: @@ -169,19 +165,24 @@ void check_switch() { case PHASE_FINAL: break; } + + fflush(stdout); } } +#define PROFILE 0 + int main(int argc, char **argv) { -#ifndef NDEBUG -#ifdef __DREAMCAST__ +#if PROFILE profiler_init("/pc/gmon.out"); - profiler_start(); -#endif #endif setup(); +#if PROFILE + profiler_start(); +#endif + /* Start off with something obscene */ switch_tests(200000 / 60); begin = time(NULL); @@ -200,11 +201,9 @@ int main(int argc, char **argv) { stats(); -#ifdef __DREAMCAST__ -#ifndef 
NDEBUG +#if PROFILE profiler_stop(); profiler_clean_up(); -#endif #endif return 0; diff --git a/samples/zclip_triangle/main.c b/samples/zclip_triangle/main.c index 17f4887..1cae41a 100644 --- a/samples/zclip_triangle/main.c +++ b/samples/zclip_triangle/main.c @@ -28,6 +28,8 @@ void InitGL(int Width, int Height) // We call this right after our OpenG glMatrixMode(GL_MODELVIEW); glLoadIdentity(); + + glEnable(GL_CULL_FACE); } /* The function called when our window is resized (which shouldn't happen, because we're fullscreen) */ @@ -86,12 +88,13 @@ void DrawGLScene() rotation = (rotation > 360.0f) ? rotation - 360.0f : rotation; glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Clear The Screen And The Depth Buffer + glClearColor(0.5f, 0.5f, 0.5f, 0.5f); glLoadIdentity(); // Reset The View glDisable(GL_CULL_FACE); glPushMatrix(); - glTranslatef(0.0f, -1.0f, movement); + glTranslatef(0.0f, -1.0f, -movement); glRotatef(rotation, 0.0f, 1.0f, 0.0f); glBegin(GL_TRIANGLES); diff --git a/tests/zclip/main.cpp b/tests/zclip/main.cpp new file mode 100644 index 0000000..7b7e316 --- /dev/null +++ b/tests/zclip/main.cpp @@ -0,0 +1,637 @@ + +#include +#include +#include +#include +#include +#include + +#define SQ_BASE_ADDRESS 0 +#define SPAN_SORT_CFG 0 +#define PVR_SET(x, y) (void)(x); (void)(y) + +struct Vertex { + uint32_t flags; + float xyz[3]; + float uv[2]; + float w; + uint8_t bgra[4]; +}; + +struct { + float hwidth; + float x_plus_hwidth; + float hheight; + float y_plus_hheight; +} VIEWPORT = {320, 320, 240, 240}; + + +struct VideoMode { + float height; +}; + +static VideoMode* GetVideoMode() { + static VideoMode mode = {320.0f}; + return &mode; +} + +enum GPUCommand { + GPU_CMD_POLYHDR = 0x80840000, + GPU_CMD_VERTEX = 0xe0000000, + GPU_CMD_VERTEX_EOL = 0xf0000000, + GPU_CMD_USERCLIP = 0x20000000, + GPU_CMD_MODIFIER = 0x80000000, + GPU_CMD_SPRITE = 0xA0000000 +}; + +static std::vector sent; + +static inline void interpolateColour(const uint32_t* a, const uint32_t* b, 
const float t, uint32_t* out) { + const static uint32_t MASK1 = 0x00FF00FF; /* byte lanes 0 and 2 */ + const static uint32_t MASK2 = 0xFF00FF00; /* byte lanes 1 and 3 */ + + const uint32_t f2 = 256 * t; /* 8-bit fixed-point weight applied to b */ + const uint32_t f1 = 256 - f2; /* weight applied to a */ + + *out = (((((*a & MASK1) * f1) + ((*b & MASK1) * f2)) >> 8) & MASK1) | /* lanes 0 and 2: scale in place, then shift back down */ + (((((*a >> 8) & MASK1) * f1) + (((*b >> 8) & MASK1) * f2)) & MASK2); /* lanes 1 and 3: shift down BEFORE scaling, because (x & MASK2) * 256 does not fit in 32 bits and the top lane was lost */ +} + +static inline void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) { + /* Clipping time! d0 and d1 are w + z at each end of the edge; vout is interpolated from v1 towards v2 by t */ + const float d0 = v1->w + v1->xyz[2]; + const float d1 = v2->w + v2->xyz[2]; + const float sign = ((2.0f * (d1 < d0)) - 1.0f); + const float epsilon = -0.00001f * sign; + const float n = (d0 - d1); + const float r = (1.f / sqrtf(n * n)) * sign; + float t = fmaf(r, d0, epsilon); + + vout->xyz[0] = fmaf(v2->xyz[0] - v1->xyz[0], t, v1->xyz[0]); + vout->xyz[1] = fmaf(v2->xyz[1] - v1->xyz[1], t, v1->xyz[1]); + vout->xyz[2] = fmaf(v2->xyz[2] - v1->xyz[2], t, v1->xyz[2]); + vout->w = fmaf(v2->w - v1->w, t, v1->w); + + vout->uv[0] = fmaf(v2->uv[0] - v1->uv[0], t, v1->uv[0]); + vout->uv[1] = fmaf(v2->uv[1] - v1->uv[1], t, v1->uv[1]); + + interpolateColour((uint32_t*) v1->bgra, (uint32_t*) v2->bgra, t, (uint32_t*) vout->bgra); +} + +bool glIsVertex(const uint32_t flags) { + return flags == GPU_CMD_VERTEX_EOL || flags == GPU_CMD_VERTEX; +} + +bool glIsLastVertex(const uint32_t flags) { + return flags == GPU_CMD_VERTEX_EOL; +} + +void _glSubmitHeaderOrVertex(volatile uint32_t*, Vertex* vtx) { + sent.push_back(*vtx); +} + +float _glFastInvert(float x) { + return (1.f / __builtin_sqrtf(x * x)); /* i.e. 1 / |x| */ +} + +void _glPerspectiveDivideVertex(Vertex* vertex, const float h) { + const float f = _glFastInvert(vertex->w); + + /* Convert to NDC and apply viewport */ + vertex->xyz[0] = __builtin_fmaf( + VIEWPORT.hwidth, vertex->xyz[0] * f, VIEWPORT.x_plus_hwidth + ); + + vertex->xyz[1] = h - __builtin_fmaf( + VIEWPORT.hheight, vertex->xyz[1] * f, VIEWPORT.y_plus_hheight + ); + + /* Orthographic projections need to use invZ otherwise we lose + the depth information. 
As w == 1, and clip-space range is -w to +w + we add 1.0 to the Z to bring it into range. We add a little extra to + avoid a divide by zero. + */ + + vertex->xyz[2] = (vertex->w == 1.0f) ? _glFastInvert(1.0001f + vertex->xyz[2]) : f; +} + + +void memcpy_vertex(Vertex* dst, Vertex* src) { + *dst = *src; +} + +/* Zclipping is so difficult to get right, that this sample tests all the cases of clipping and makes sure that things work as expected */ + +#ifdef __DREAMCAST__ +static volatile int *pvrdmacfg = (int*)0xA05F6888; +static volatile int *qacr = (int*)0xFF000038; +#else +static int pvrdmacfg[2]; +static int qacr[2]; +#endif + +void SceneListSubmit(void* src, int n) { + /* You need at least a header, and 3 vertices to render anything */ + if(n < 4) { + return; + } + + const float h = GetVideoMode()->height; + + PVR_SET(SPAN_SORT_CFG, 0x0); + + //Set PVR DMA registers + pvrdmacfg[0] = 1; + pvrdmacfg[1] = 1; + + //Set QACR registers + qacr[1] = qacr[0] = 0x11; + + volatile uint32_t *d = SQ_BASE_ADDRESS; + + int8_t queue_head = 0; + int8_t queue_tail = 0; + + /* The queue never holds more than 3 vertices at once (one triangle; the extra vertices + * that clipping produces go into the scratch vertices a and b). The array has one more slot so the ring buffer doesn't + * trip over itself (e.g. 
if tail == head we can guarantee it's empty, not full) */ + Vertex __attribute__((aligned(32))) queue[4]; + const int queue_capacity = sizeof(queue) / sizeof(Vertex); + + Vertex* vertex = (Vertex*) src; + uint32_t visible_mask = 0; + +#if CLIP_DEBUG + for(int i = 0; i < n; ++i) { + fprintf(stderr, "{%f, %f, %f, %f}, // %x (%x)\n", vertex[i].xyz[0], vertex[i].xyz[1], vertex[i].xyz[2], vertex[i].w, vertex[i].flags, &vertex[i]); + } + + fprintf(stderr, "----\n"); +#endif + while(n--) { + bool last_vertex = false; + memcpy_vertex(queue + queue_tail, vertex); + ++vertex; + switch(queue[queue_tail].flags) { + case GPU_CMD_POLYHDR: + _glSubmitHeaderOrVertex(d, &queue[queue_tail]); + break; + case GPU_CMD_VERTEX_EOL: + last_vertex = true; + case GPU_CMD_VERTEX: + visible_mask = (visible_mask >> 1) | (queue[queue_tail].xyz[2] >= -queue[queue_tail].w) << 2; + assert(visible_mask < 15); + queue_tail = (queue_tail + 1) % queue_capacity; + default: + break; + } + + int counter = (queue_tail - queue_head + queue_capacity) % queue_capacity; + if(counter < 3) { + continue; + } + +#if CLIP_DEBUG + fprintf(stderr, "%d\n", visible_mask); +#endif + Vertex __attribute__((aligned(32))) a, b; // Scratch vertices + switch(visible_mask) { + case 0: + break; + case 7: + /* All visible, push the first vertex and move on */ + _glPerspectiveDivideVertex(&queue[queue_head], h); + _glSubmitHeaderOrVertex(d, &queue[queue_head]); + + if(last_vertex) { + /* If this was the last vertex in the strip, we need to flush the queue and then + restart it again */ + + int v1 = (queue_head + 1) % queue_capacity; + int v2 = (queue_head + 2) % queue_capacity; + + _glPerspectiveDivideVertex(&queue[v1], h); + _glSubmitHeaderOrVertex(d, &queue[v1]); + + _glPerspectiveDivideVertex(&queue[v2], h); + _glSubmitHeaderOrVertex(d, &queue[v2]); + } + break; + case 1: + /* First vertex was visible */ + { + Vertex* v0 = &queue[queue_head]; + Vertex* v1 = &queue[(queue_head + 1) % queue_capacity]; + Vertex* v2 = 
&queue[(queue_head + 2) % queue_capacity]; + + _glClipEdge(v0, v1, &a); + _glClipEdge(v2, v0, &b); + a.flags = GPU_CMD_VERTEX; + + /* If v2 was the last in the strip, then b should be. If it wasn't + we'll create a degenerate triangle by adding b twice in a row so that the + strip processing will continue correctly after crossing the plane so it can + cross back*/ + b.flags = v2->flags; + + _glPerspectiveDivideVertex(v0, h); + _glPerspectiveDivideVertex(&a, h); + _glPerspectiveDivideVertex(&b, h); + + _glSubmitHeaderOrVertex(d, v0); + _glSubmitHeaderOrVertex(d, &a); + _glSubmitHeaderOrVertex(d, &b); + _glSubmitHeaderOrVertex(d, &b); + } + break; + case 2: + /* Second vertex was visible. In this case we need to create a triangle and produce + two new vertices: 1-2, and 2-3. v1 is copied so the divide below leaves the queued vertex in clip space for the next triangle. */ + { + const Vertex* v0 = &queue[queue_head]; + Vertex __attribute__((aligned(32))) v1 = queue[(queue_head + 1) % queue_capacity]; + const Vertex* v2 = &queue[(queue_head + 2) % queue_capacity]; + + _glClipEdge(v0, &v1, &a); + _glClipEdge(&v1, v2, &b); + a.flags = GPU_CMD_VERTEX; + b.flags = v2->flags; + + _glPerspectiveDivideVertex(&v1, h); /* the visible vertex is v1; v0 is behind the plane and must not be emitted */ + _glPerspectiveDivideVertex(&a, h); + _glPerspectiveDivideVertex(&b, h); + + _glSubmitHeaderOrVertex(d, &a); + _glSubmitHeaderOrVertex(d, &v1); + _glSubmitHeaderOrVertex(d, &b); + } + break; + case 3: /* First and second vertex were visible */ + { + Vertex* v0 = &queue[queue_head]; + Vertex __attribute__((aligned(32))) v1 = queue[(queue_head + 1) % queue_capacity]; + Vertex* v2 = &queue[(queue_head + 2) % queue_capacity]; + + _glClipEdge(&v1, v2, &a); + _glClipEdge(v2, v0, &b); + + a.flags = v2->flags; + b.flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(v0, h); + _glPerspectiveDivideVertex(&v1, h); + _glPerspectiveDivideVertex(&a, h); + _glPerspectiveDivideVertex(&b, h); + + _glSubmitHeaderOrVertex(d, v0); + _glSubmitHeaderOrVertex(d, &v1); + _glSubmitHeaderOrVertex(d, &b); + _glSubmitHeaderOrVertex(d, &v1); + _glSubmitHeaderOrVertex(d, &a); + } + break; + case 4: + /* Third 
vertex was visible. */ + { + Vertex* v0 = &queue[queue_head]; + Vertex* v1 = &queue[(queue_head + 1) % queue_capacity]; + Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity]; + + _glClipEdge(&v2, v0, &a); + _glClipEdge(v1, &v2, &b); + a.flags = GPU_CMD_VERTEX; + b.flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(&v2, h); + _glPerspectiveDivideVertex(&a, h); + _glPerspectiveDivideVertex(&b, h); + + _glSubmitHeaderOrVertex(d, &a); + _glSubmitHeaderOrVertex(d, &a); + _glSubmitHeaderOrVertex(d, &b); + _glSubmitHeaderOrVertex(d, &v2); + } + break; + case 5: /* First and third vertex were visible */ + { + Vertex* v0 = &queue[queue_head]; + Vertex* v1 = &queue[(queue_head + 1) % queue_capacity]; + Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity]; + + _glClipEdge(v0, v1, &a); + _glClipEdge(v1, &v2, &b); + a.flags = GPU_CMD_VERTEX; + b.flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(v0, h); + _glPerspectiveDivideVertex(&v2, h); + _glPerspectiveDivideVertex(&a, h); + _glPerspectiveDivideVertex(&b, h); + + _glSubmitHeaderOrVertex(d, v0); + _glSubmitHeaderOrVertex(d, &a); + uint32_t v2_flags = v2.flags; + v2.flags = GPU_CMD_VERTEX; + _glSubmitHeaderOrVertex(d, &v2); + v2.flags = v2_flags; + _glSubmitHeaderOrVertex(d, &b); + _glSubmitHeaderOrVertex(d, &v2); + } + break; + case 6: /* Second and third vertex were visible */ + { + Vertex* v0 = &queue[queue_head]; + Vertex __attribute__((aligned(32))) v1 = queue[(queue_head + 1) % queue_capacity]; + Vertex __attribute__((aligned(32))) v2 = queue[(queue_head + 2) % queue_capacity]; + + _glClipEdge(v0, &v1, &a); + _glClipEdge(&v2, v0, &b); + + a.flags = GPU_CMD_VERTEX; + b.flags = GPU_CMD_VERTEX; + + _glPerspectiveDivideVertex(&v1, h); + _glPerspectiveDivideVertex(&v2, h); + _glPerspectiveDivideVertex(&a, h); + _glPerspectiveDivideVertex(&b, h); + + _glSubmitHeaderOrVertex(d, &a); + _glSubmitHeaderOrVertex(d, &v1); + _glSubmitHeaderOrVertex(d, &b); + 
_glSubmitHeaderOrVertex(d, &v1); + _glSubmitHeaderOrVertex(d, &v2); + } + break; + default: + break; + } + + if(last_vertex) { + visible_mask = queue_head = queue_tail = 0; + } else { + queue_head = (queue_head + 1) % queue_capacity; + } + } +} + + +struct VertexTmpl { + VertexTmpl(float x, float y, float z, float w): + x(x), y(y), z(z), w(w) {} + + float x, y, z, w; +}; + +std::vector make_vertices(const std::vector& verts) { + std::vector result; + Vertex r; + + r.flags = GPU_CMD_POLYHDR; + result.push_back(r); + + for(auto& v: verts) { + r.flags = GPU_CMD_VERTEX; + r.xyz[0] = v.x; + r.xyz[1] = v.y; + r.xyz[2] = v.z; + r.uv[0] = 0.0f; + r.uv[1] = 0.0f; + r.w = v.w; + + result.push_back(r); + } + + result.back().flags = GPU_CMD_VERTEX_EOL; + return result; +} + +template +void check_equal(const T& lhs, const U& rhs) { + if(lhs != rhs) { + throw std::runtime_error("Assertion failed"); + } +} + +template<> +void check_equal(const Vertex& lhs, const Vertex& rhs) { + if(lhs.xyz[0] != rhs.xyz[0] || + lhs.xyz[1] != rhs.xyz[1] || + lhs.xyz[2] != rhs.xyz[2] || + lhs.w != rhs.w) { + throw std::runtime_error("Assertion failed"); + } +} + + +bool test_clip_case_001() { + /* The first vertex is visible only */ + sent.clear(); + + auto data = make_vertices({ + {0.000000, -2.414213, 3.080808, 5.000000}, + {-4.526650, -2.414213, -7.121212, -5.000000}, + {4.526650, -2.414213, -7.121212, -5.000000} + }); + + SceneListSubmit(&data[0], data.size()); + + check_equal(sent.size(), 5); + check_equal(sent[0].flags, GPU_CMD_POLYHDR); + check_equal(sent[1].flags, GPU_CMD_VERTEX); + check_equal(sent[2].flags, GPU_CMD_VERTEX); + + // Because we're sending a single triangle, we end up sending a + // degenerate final vert. 
But if we were sending more than one triangle + // this would be GPU_CMD_VERTEX twice + check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL); + check_equal(sent[4].flags, GPU_CMD_VERTEX_EOL); + check_equal(sent[3], sent[4]); + return true; +} + +bool test_clip_case_010() { + /* The second vertex is visible only (visible_mask == 2) */ + sent.clear(); + + auto data = make_vertices({ + {-4.526650, -2.414213, -7.121212, -5.000000}, + {0.000000, -2.414213, 3.080808, 5.000000}, + {4.526650, -2.414213, -7.121212, -5.000000} + }); + + SceneListSubmit(&data[0], data.size()); + + check_equal(sent.size(), 4); + check_equal(sent[0].flags, GPU_CMD_POLYHDR); + check_equal(sent[1].flags, GPU_CMD_VERTEX); + check_equal(sent[2].flags, GPU_CMD_VERTEX); + check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL); + return true; +} + +bool test_clip_case_100() { + /* The third vertex is visible only (visible_mask == 4) */ + sent.clear(); + + auto data = make_vertices({ + {-4.526650, -2.414213, -7.121212, -5.000000}, + {4.526650, -2.414213, -7.121212, -5.000000}, + {0.000000, -2.414213, 3.080808, 5.000000} + }); + + SceneListSubmit(&data[0], data.size()); + + check_equal(sent.size(), 5); + check_equal(sent[0].flags, GPU_CMD_POLYHDR); + check_equal(sent[1].flags, GPU_CMD_VERTEX); + check_equal(sent[2].flags, GPU_CMD_VERTEX); + + // Because we're sending a single triangle, we end up sending a + // degenerate final vert. 
But if we were sending more than one triangle + // this would be GPU_CMD_VERTEX twice + check_equal(sent[3].flags, GPU_CMD_VERTEX); + check_equal(sent[4].flags, GPU_CMD_VERTEX_EOL); + check_equal(sent[1], sent[2]); + return true; +} + +bool test_clip_case_110() { + /* 2nd and 3rd visible */ + sent.clear(); + + auto data = make_vertices({ + {0.0, -2.414213, -7.121212, -5.000000}, + {-4.526650, -2.414213, 3.080808, 5.000000}, + {4.526650, -2.414213, 3.080808, 5.000000} + }); + + SceneListSubmit(&data[0], data.size()); + + check_equal(sent.size(), 6); + check_equal(sent[0].flags, GPU_CMD_POLYHDR); + check_equal(sent[1].flags, GPU_CMD_VERTEX); + check_equal(sent[2].flags, GPU_CMD_VERTEX); + check_equal(sent[3].flags, GPU_CMD_VERTEX); + check_equal(sent[4].flags, GPU_CMD_VERTEX); + check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL); + check_equal(sent[2], sent[4]); + return true; +} + +bool test_clip_case_011() { + /* 1st and 2nd visible */ + sent.clear(); + + auto data = make_vertices({ + {-4.526650, -2.414213, 3.080808, 5.000000}, + {4.526650, -2.414213, 3.080808, 5.000000}, + {0.0, -2.414213, -7.121212, -5.000000} + }); + + SceneListSubmit(&data[0], data.size()); + + check_equal(sent.size(), 6); + check_equal(sent[0].flags, GPU_CMD_POLYHDR); + check_equal(sent[1].flags, GPU_CMD_VERTEX); + check_equal(sent[2].flags, GPU_CMD_VERTEX); + check_equal(sent[3].flags, GPU_CMD_VERTEX); + check_equal(sent[4].flags, GPU_CMD_VERTEX); + check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL); + check_equal(sent[2], sent[4]); + return true; +} + +bool test_clip_case_101() { + /* 1st and 3rd visible */ + sent.clear(); + + auto data = make_vertices({ + {-4.526650, -2.414213, 3.080808, 5.000000}, + {0.0, -2.414213, -7.121212, -5.000000}, + {4.526650, -2.414213, 3.080808, 5.000000}, + }); + + SceneListSubmit(&data[0], data.size()); + + check_equal(sent.size(), 6); + check_equal(sent[0].flags, GPU_CMD_POLYHDR); + check_equal(sent[1].flags, GPU_CMD_VERTEX); + check_equal(sent[2].flags, 
GPU_CMD_VERTEX); + check_equal(sent[3].flags, GPU_CMD_VERTEX); + check_equal(sent[4].flags, GPU_CMD_VERTEX); + check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL); + check_equal(sent[3], sent[5]); + return true; +} + +bool test_clip_case_111() { + /* All three vertices visible (visible_mask == 7), so nothing is clipped */ + sent.clear(); + + auto data = make_vertices({ + {-4.526650, -2.414213, 3.080808, 5.000000}, + {0.0, -2.414213, -7.121212, 8.000000}, + {4.526650, -2.414213, 3.080808, 5.000000}, + }); + + SceneListSubmit(&data[0], data.size()); + + check_equal(sent.size(), 4); + check_equal(sent[0].flags, GPU_CMD_POLYHDR); + check_equal(sent[1].flags, GPU_CMD_VERTEX); + check_equal(sent[2].flags, GPU_CMD_VERTEX); + check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL); + return true; +} + + +bool test_start_behind() { + /* Triangle behind the plane, but the strip continues in front */ + sent.clear(); + + auto data = make_vertices({ + {-3.021717, -2.414213, -10.155344, -9.935254}, + {5.915236, -2.414213, -9.354721, -9.136231}, + {-5.915236, -2.414213, -0.264096, -0.063767}, + {3.021717, -2.414213, 0.536527, 0.735255}, + {-7.361995, -2.414213, 4.681529, 4.871976}, + {1.574958, -2.414213, 5.482152, 5.670999}, + }); + + SceneListSubmit(&data[0], data.size()); /* smoke test: the contents of sent are not checked */ + + return true; +} + +bool test_longer_strip() { + sent.clear(); + + auto data = make_vertices({ + {-4.384623, -2.414213, -5.699644, -5.488456}, + {4.667572, -2.414213, -5.621354, -5.410322}, + {-4.667572, -2.414213, 4.319152, 4.510323}, + {4.384623, -2.414213, 4.397442, 4.588456}, + {-4.809045, -2.414213, 9.328549, 9.509711}, + {4.243149, -2.414213, 9.406840, 9.587846}, + }); + + SceneListSubmit(&data[0], data.size()); /* smoke test: the contents of sent are not checked */ + + return true; +} + +int main(int argc, char* argv[]) { + // test_clip_case_000(); + test_clip_case_001(); + test_clip_case_010(); + test_clip_case_100(); + test_clip_case_110(); + test_clip_case_011(); + test_clip_case_101(); + test_clip_case_111(); + + test_start_behind(); + test_longer_strip(); + + return 0; +}