WIP: Restructure clipping to be much MUCH faster in the visible case

This currently only works with single triangles; anything longer crashes
because subsequent vertices in the strip are not yet queued correctly.
This commit is contained in:
Luke Benstead 2023-04-19 20:57:44 +01:00
parent 25d215dad3
commit c5ce81a38d
5 changed files with 916 additions and 276 deletions

View File

@ -172,6 +172,7 @@ gen_sample(scissor samples/scissor/main.c)
gen_sample(polymark samples/polymark/main.c)
gen_sample(cubes samples/cubes/main.cpp)
gen_sample(zclip_test tests/zclip/main.cpp)
if(PLATFORM_DREAMCAST)
gen_sample(trimark samples/trimark/main.c)

View File

@ -71,7 +71,7 @@ GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
vertex->xyz[2] = (vertex->w == 1.0f) ? _glFastInvert(1.0001f + vertex->xyz[2]) : f;
}
GL_FORCE_INLINE void _glSubmitHeaderOrVertex(uint32_t* d, const Vertex* v) {
GL_FORCE_INLINE void _glSubmitHeaderOrVertex(volatile uint32_t* d, const Vertex* v) {
#ifndef NDEBUG
gl_assert(!isnan(v->xyz[2]));
gl_assert(!isnan(v->w));
@ -94,13 +94,6 @@ GL_FORCE_INLINE void _glSubmitHeaderOrVertex(uint32_t* d, const Vertex* v) {
d += 8;
}
static struct __attribute__((aligned(32))) {
Vertex* v;
int visible;
} triangle[3];
static int tri_count = 0;
static int strip_count = 0;
static inline void interpolateColour(const uint32_t* a, const uint32_t* b, const float t, uint32_t* out) {
const static uint32_t MASK1 = 0x00FF00FF;
@ -134,296 +127,284 @@ static inline void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout)
interpolateColour((uint32_t*) v1->bgra, (uint32_t*) v2->bgra, t, (uint32_t*) vout->bgra);
}
GL_FORCE_INLINE void ClearTriangle() {
tri_count = 0;
}
static inline void ShiftTriangle() {
if(!tri_count) {
return;
}
tri_count--;
triangle[0] = triangle[1];
triangle[1] = triangle[2];
#ifndef NDEBUG
triangle[2].v = NULL;
triangle[2].visible = false;
#endif
}
static inline void ShiftRotateTriangle() {
if(!tri_count) {
return;
}
if(triangle[0].v < triangle[1].v) {
triangle[0] = triangle[2];
} else {
triangle[1] = triangle[2];
}
tri_count--;
}
#define SPAN_SORT_CFG 0x005F8030
static volatile int *pvrdmacfg = (int*)0xA05F6888;
static volatile int *qacr = (int*)0xFF000038;
void SceneListSubmit(void* src, int n) {
/* You need at least a header, and 3 vertices to render anything */
if(n < 4) {
return;
}
const float h = GetVideoMode()->height;
PVR_SET(SPAN_SORT_CFG, 0x0);
//Set PVR DMA registers
volatile int *pvrdmacfg = (int*)0xA05F6888;
pvrdmacfg[0] = 1;
pvrdmacfg[1] = 0;
//Set QACR registers
volatile int *qacr = (int*)0xFF000038;
qacr[1] = qacr[0] = 0x11;
uint32_t *d = SQ_BASE_ADDRESS;
volatile uint32_t *d = SQ_BASE_ADDRESS;
Vertex __attribute__((aligned(32))) tmp;
int8_t queue_head = 0;
int8_t queue_tail = 0;
Vertex __attribute__((aligned(32))) queue[3];
const int queue_capacity = sizeof(queue) / sizeof(Vertex);
/* Perform perspective divide on each vertex */
Vertex* vertex = (Vertex*) src;
uint32_t visible_mask = 0;
if(!_glNearZClippingEnabled()) {
/* Prep store queues */
while(n--) {
if(glIsVertex(vertex->flags)) {
_glPerspectiveDivideVertex(vertex, h);
}
_glSubmitHeaderOrVertex(d, vertex);
++vertex;
}
return;
for(int i = 0; i < n; ++i) {
Vertex* v = vertex + i;
fprintf(stderr, "{%f, %f, %f, %f},\n", v->xyz[0], v->xyz[1], v->xyz[2], v->w);
}
tri_count = 0;
strip_count = 0;
/* Assume first entry is a header */
_glSubmitHeaderOrVertex(d, vertex++);
#if CLIP_DEBUG
printf("----\n");
#endif
/* Push first 2 vertices of the strip */
memcpy_vertex(&queue[0], vertex++);
memcpy_vertex(&queue[1], vertex++);
visible_mask = ((queue[0].xyz[2] >= -queue[0].w) << 1) | ((queue[1].xyz[2] >= -queue[1].w) << 2);
queue_tail = 2;
n -= 3;
for(int i = 0; i < n; ++i, ++vertex) {
PREFETCH(vertex + 1);
PREFETCH(vertex + 2);
/* Wait until we fill the triangle */
if(tri_count < 3) {
if(glIsVertex(vertex->flags)) {
++strip_count;
triangle[tri_count].v = vertex;
triangle[tri_count].visible = vertex->xyz[2] >= -vertex->w;
if(++tri_count < 3) {
continue;
}
} else {
/* We hit a header */
tri_count = 0;
strip_count = 0;
_glSubmitHeaderOrVertex(d, vertex);
continue;
}
}
#if CLIP_DEBUG
printf("SC: %d\n", strip_count);
#endif
/* If we got here, then triangle contains 3 vertices */
int visible_mask = triangle[0].visible | (triangle[1].visible << 1) | (triangle[2].visible << 2);
/* Clipping time!
There are 6 distinct possibilities when clipping a triangle. 3 of them result
in another triangle, 3 of them result in a quadrilateral.
Assuming you iterate the edges of the triangle in order, and create a new *visible*
vertex when you cross the plane, and discard vertices behind the plane, then the only
difference between the two cases is that the final two vertices that need submitting have
to be reversed.
Unfortunately we have to copy vertices here, because if we persp-divide a vertex it may
be used in a subsequent triangle in the strip and would end up being double divided.
*/
#define SUBMIT_QUEUED() \
if(strip_count > 3) { \
tmp = *(vertex - 2); \
/* If we had triangles ahead of this one, submit and finalize */ \
_glPerspectiveDivideVertex(&tmp, h); \
_glSubmitHeaderOrVertex(d, &tmp); \
tmp = *(vertex - 1); \
tmp.flags = GPU_CMD_VERTEX_EOL; \
_glPerspectiveDivideVertex(&tmp, h); \
_glSubmitHeaderOrVertex(d, &tmp); \
}
bool is_last_in_strip = glIsLastVertex(vertex->flags);
while(n--) {
Vertex* self = &queue[queue_tail];
memcpy_vertex(self, vertex++);
visible_mask = (visible_mask >> 1) | ((self->xyz[2] >= -self->w) << 2); // Push new vertex
queue_tail = (queue_tail + 1) % queue_capacity;
switch(visible_mask) {
case 1: {
SUBMIT_QUEUED();
/* 0, 0a, 2a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 2: {
SUBMIT_QUEUED();
/* 0a, 1, 1a */
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 3: {
SUBMIT_QUEUED();
/* 0, 1, 2a, 1a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 4: {
SUBMIT_QUEUED();
/* 1a, 2, 2a */
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 5: {
SUBMIT_QUEUED();
/* 0, 0a, 2, 1a */
tmp = *triangle[0].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(triangle[1].v, triangle[2].v, &tmp);
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 6: {
SUBMIT_QUEUED();
/* 0a, 1, 2a, 2 */
_glClipEdge(triangle[0].v, triangle[1].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
tmp = *triangle[1].v;
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
_glClipEdge(triangle[2].v, triangle[0].v, &tmp);
tmp.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
tmp = *triangle[2].v;
tmp.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&tmp, h);
_glSubmitHeaderOrVertex(d, &tmp);
} break;
case 7: {
/* All the vertices are visible! We divide and submit v0, then shift */
_glPerspectiveDivideVertex(vertex - 2, h);
_glSubmitHeaderOrVertex(d, vertex - 2);
if(is_last_in_strip) {
_glPerspectiveDivideVertex(vertex - 1, h);
_glSubmitHeaderOrVertex(d, vertex - 1);
_glPerspectiveDivideVertex(vertex, h);
_glSubmitHeaderOrVertex(d, vertex);
tri_count = 0;
strip_count = 0;
}
ShiftRotateTriangle();
continue;
} break;
case 0:
default:
queue_head = (queue_head + 1) % queue_capacity;
continue;
break;
case 7:
/* All visible, push the first vertex and move on */
_glPerspectiveDivideVertex(&queue[queue_head], h);
_glSubmitHeaderOrVertex(d, &queue[queue_head]);
queue_head = (queue_head + 1) % queue_capacity;
if(glIsLastVertex(self->flags)) {
/* If this was the last vertex in the strip, we clear the
* triangle out */
while(queue_head != queue_tail) {
_glPerspectiveDivideVertex(&queue[queue_head], h);
_glSubmitHeaderOrVertex(d, &queue[queue_head]);
queue_head = (queue_head + 1) % queue_capacity;
}
visible_mask = 0;
}
break;
case 1:
/* First vertex was visible */
{
Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
Vertex* v0 = &queue[queue_head];
Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
_glClipEdge(v0, v1, &a);
_glClipEdge(v2, v0, &b);
a.flags = GPU_CMD_VERTEX;
/* If v2 was the last in the strip, then b should be. If it wasn't
we'll create a degenerate triangle by adding b twice in a row so that the
strip processing will continue correctly after crossing the plane so it can
cross back*/
b.flags = v2->flags;
_glPerspectiveDivideVertex(v0, h);
_glSubmitHeaderOrVertex(d, v0);
_glPerspectiveDivideVertex(&a, h);
_glSubmitHeaderOrVertex(d, &a);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, &b);
_glSubmitHeaderOrVertex(d, &b);
/* But skip the vertices that are already there */
queue_head = (queue_head + 3) % queue_capacity;
visible_mask = 0;
}
break;
case 2:
/* Second vertex was visible. In self case we need to create a triangle and produce
two new vertices: 1-2, and 2-3. */
{
Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
Vertex* v0 = &queue[queue_head];
Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
_glClipEdge(v0, v1, &a);
_glClipEdge(v1, v2, &b);
a.flags = GPU_CMD_VERTEX;
b.flags = GPU_CMD_VERTEX_EOL;
_glPerspectiveDivideVertex(&a, h);
_glSubmitHeaderOrVertex(d, &a);
_glPerspectiveDivideVertex(v1, h);
_glSubmitHeaderOrVertex(d, v1);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, &b);
/* But skip the vertices that are already there */
queue_head = (queue_head + 3) % queue_capacity;
visible_mask = 0;
}
break;
case 3: /* First and second vertex were visible */
{
Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
Vertex* v0 = &queue[queue_head];
Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
_glClipEdge(v1, v2, &a);
_glClipEdge(v2, v0, &b);
a.flags = v2->flags;
b.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glSubmitHeaderOrVertex(d, v0);
_glPerspectiveDivideVertex(v1, h);
_glSubmitHeaderOrVertex(d, v1);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, &b);
_glSubmitHeaderOrVertex(d, v1);
_glPerspectiveDivideVertex(&a, h);
_glSubmitHeaderOrVertex(d, &a);
/* But skip the vertices that are already there */
queue_head = (queue_head + 3) % queue_capacity;
visible_mask = 0;
}
break;
case 4:
/* Third vertex was visible. */
{
Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
Vertex* v0 = &queue[queue_head];
Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
Vertex v2 = queue[(queue_head + 2) % queue_capacity];
_glClipEdge(&v2, v0, &a);
_glClipEdge(v1, &v2, &b);
a.flags = GPU_CMD_VERTEX;
b.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&a, h);
_glSubmitHeaderOrVertex(d, &a);
_glSubmitHeaderOrVertex(d, &a);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, &b);
_glPerspectiveDivideVertex(&v2, h);
_glSubmitHeaderOrVertex(d, &v2);
queue_head = (queue_head + 3) % queue_capacity;
visible_mask = 0;
}
break;
case 5: /* First and third vertex were visible */
{
Vertex __attribute__((aligned(32))) a, b, c; // Scratch vertices
Vertex* v0 = &queue[queue_head];
Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
_glClipEdge(v0, v1, &a);
_glClipEdge(v1, v2, &b);
a.flags = GPU_CMD_VERTEX;
b.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v0, h);
_glSubmitHeaderOrVertex(d, v0);
_glPerspectiveDivideVertex(&a, h);
_glSubmitHeaderOrVertex(d, &a);
uint32_t v2_flags = v2->flags;
v2->flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(v2, h);
_glSubmitHeaderOrVertex(d, v2);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, &b);
v2->flags = v2_flags;
_glSubmitHeaderOrVertex(d, v2);
queue_head = (queue_head + 3) % queue_capacity;
visible_mask = 0;
}
break;
case 6: /* Second and third vertex were visible */
{
Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
Vertex* v0 = &queue[queue_head];
Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];
_glClipEdge(v0, v1, &a);
_glClipEdge(v2, v0, &b);
a.flags = GPU_CMD_VERTEX;
b.flags = GPU_CMD_VERTEX;
_glPerspectiveDivideVertex(&a, h);
_glSubmitHeaderOrVertex(d, &a);
_glPerspectiveDivideVertex(v1, h);
_glSubmitHeaderOrVertex(d, v1);
_glPerspectiveDivideVertex(&b, h);
_glSubmitHeaderOrVertex(d, &b);
_glSubmitHeaderOrVertex(d, v1);
_glPerspectiveDivideVertex(v2, h);
_glSubmitHeaderOrVertex(d, v2);
queue_head = (queue_head + 3) % queue_capacity;
visible_mask = 0;
}
break;
default:
break;
}
/* If this was the last in the strip, we don't need to
submit anything else, we just wipe the tri_count */
if(is_last_in_strip) {
tri_count = 0;
strip_count = 0;
} else {
ShiftRotateTriangle();
strip_count = 2;
/* Submit the beginning of the next strip (2 verts, maybe a header) */
int8_t v = 0;
while(v < 2 && n > 1) {
if(!glIsVertex(vertex->flags)) {
_glSubmitHeaderOrVertex(d, vertex);
} else {
memcpy_vertex(&queue[queue_tail], vertex++);
visible_mask = (visible_mask >> 1) | ((queue[queue_tail].xyz[2] >= -queue[queue_tail].w) << 2); // Push new vertex
queue_tail = (queue_tail + 1) % queue_capacity;
++v;
}
--n;
}
}
}

View File

@ -221,23 +221,55 @@ typedef struct {
} _glvec4;
#define vec2cpy(dst, src) \
*((_glvec2*) dst) = *((_glvec2*) src)
*((uint64_t*) dst) = *((uint64_t*) src);
#define vec3cpy(dst, src) \
*((_glvec3*) dst) = *((_glvec3*) src)
*((uint64_t*) dst) = *((uint64_t*) src); \
dst[2] = src[2];
#define vec4cpy(dst, src) \
*((_glvec4*) dst) = *((_glvec4*) src)
*((uint64_t*) dst) = *((uint64_t*) src); \
*((uint64_t*) dst + 2) = *((uint64_t*) src + 2);
/* Restricts d to the closed interval [min, max].
 * Values below min yield min, values above max yield max, and anything else
 * (including a value that compares neither below nor above) is returned
 * unchanged. The lower bound is tested first. */
GL_FORCE_INLINE float clamp(float d, float min, float max) {
    if(d < min) {
        return min;
    }

    if(d > max) {
        return max;
    }

    return d;
}
/* Copies one Vertex from src to dest.
 *
 * On __DREAMCAST__ builds the copy is done with inline assembly: four
 * fmov.d loads from [in], each followed by an fmov.d store to [out], with
 * [out] advanced by 8 between stores - i.e. 4 x 8 = 32 bytes are moved.
 * NOTE(review): this presumably relies on sizeof(Vertex) == 32 - confirm.
 * NOTE(review): the fschg at the start and end is presumably there to
 * switch the FPU into paired (64-bit) transfer mode and back again - confirm
 * against the SH-4 manual, along with the purpose of clrs and the .align
 * directive placed inside the instruction stream.
 * NOTE(review): the in/out operands are written as casts
 * ((uint32_t) src / (uint32_t) dest); confirm the toolchain accepts a cast
 * as a read-write asm operand.
 *
 * On every other platform this is a plain structure assignment. */
GL_FORCE_INLINE void memcpy_vertex(Vertex *dest, const Vertex *src) {
#ifdef __DREAMCAST__
/* Scratch operand bound to the "d" register constraint for the fmov.d pairs */
_Complex float double_scratch;
asm volatile (
"fschg\n\t"
"clrs\n"
".align 2\n"
"fmov.d @%[in]+, %[scratch]\n\t"
"fmov.d %[scratch], @%[out]\n\t"
"fmov.d @%[in]+, %[scratch]\n\t"
"add #8, %[out]\n\t"
"fmov.d %[scratch], @%[out]\n\t"
"fmov.d @%[in]+, %[scratch]\n\t"
"add #8, %[out]\n\t"
"fmov.d %[scratch], @%[out]\n\t"
"fmov.d @%[in], %[scratch]\n\t"
"add #8, %[out]\n\t"
"fmov.d %[scratch], @%[out]\n\t"
"fschg\n"
: [in] "+&r" ((uint32_t) src), [scratch] "=&d" (double_scratch), [out] "+&r" ((uint32_t) dest)
:
: "t", "memory" // clobbers
);
#else
*dest = *src;
#endif
}
#define swapVertex(a, b) \
do { \
Vertex c = *a; \
*a = *b; \
*b = c; \
Vertex __attribute__((aligned(32))) c; \
memcpy_vertex(&c, a); \
memcpy_vertex(a, b); \
memcpy_vertex(b, &c); \
} while(0)
/* ClipVertex doesn't have room for these, so we need to parse them

View File

@ -86,12 +86,13 @@ void DrawGLScene()
rotation = (rotation > 360.0f) ? rotation - 360.0f : rotation;
glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); // Clear The Screen And The Depth Buffer
glClearColor(0.5f, 0.5f, 0.5f, 0.5f);
glLoadIdentity(); // Reset The View
glDisable(GL_CULL_FACE);
glPushMatrix();
glTranslatef(0.0f, -1.0f, movement);
glTranslatef(0.0f, -1.0f, -movement);
glRotatef(rotation, 0.0f, 1.0f, 0.0f);
glBegin(GL_TRIANGLES);

625
tests/zclip/main.cpp Normal file
View File

@ -0,0 +1,625 @@
#include <cstdint>
#include <vector>
#include <cstdio>
#include <cmath>
#include <stdexcept>
/* Stand-ins for the hardware definitions that SceneListSubmit() refers to,
 * so the clipper can be built without the platform headers. */
#define SQ_BASE_ADDRESS 0
#define SPAN_SORT_CFG 0

/* Evaluates and discards both arguments. The do/while(0) wrapper makes the
 * expansion a single statement, so the macro stays correct when used as the
 * body of an unbraced if/else. */
#define PVR_SET(x, y) do { (void)(x); (void)(y); } while(0)
/* One record of the list handed to SceneListSubmit(): either a header or a
 * strip vertex, distinguished by `flags` (see glIsVertex()).
 * NOTE(review): the fields add up to 4 + 12 + 8 + 4 + 4 = 32 bytes assuming
 * 4-byte floats and no padding; the field order presumably mirrors the
 * library's Vertex - confirm before reordering anything. */
struct Vertex {
/* GPU_CMD_* value: header, vertex, or last vertex of a strip */
uint32_t flags;
/* Position; z is compared against -w for the near-plane visibility test and
 * all three components are overwritten by _glPerspectiveDivideVertex() */
float xyz[3];
/* Texture coordinates, interpolated by _glClipEdge() */
float uv[2];
/* Homogeneous w */
float w;
/* Four 8-bit colour channels; _glClipEdge() blends them as one 32-bit word */
uint8_t bgra[4];
};
/* Viewport constants read by _glPerspectiveDivideVertex(): hwidth / hheight
 * scale the divided x / y and x_plus_hwidth / y_plus_hheight are added as
 * offsets. The values below are fixed for this test. */
struct {
float hwidth;
float x_plus_hwidth;
float hheight;
float y_plus_hheight;
} VIEWPORT = {320, 320, 240, 240};
/* Minimal video mode description; only the height is needed because
 * SceneListSubmit() passes it to _glPerspectiveDivideVertex() for the
 * vertical flip. */
struct VideoMode {
float height;
};
/* Returns the single, function-local video mode used by this test.
 * NOTE(review): the height is 320 while VIEWPORT uses a half-height of 240;
 * confirm whether 480 was intended. */
static VideoMode* GetVideoMode() {
    static VideoMode current_mode = {320.0f};

    VideoMode* const active = &current_mode;
    return active;
}
/* Values stored in Vertex::flags. Only POLYHDR, VERTEX and VERTEX_EOL are
 * used by the code in this file; the remaining constants are declared but
 * not referenced here. */
enum GPUCommand {
/* Header record; make_vertices() puts one at the front of every list */
GPU_CMD_POLYHDR = 0x80840000,
/* Ordinary strip vertex (accepted by glIsVertex()) */
GPU_CMD_VERTEX = 0xe0000000,
/* Final vertex of a strip (accepted by glIsVertex() and glIsLastVertex()) */
GPU_CMD_VERTEX_EOL = 0xf0000000,
GPU_CMD_USERCLIP = 0x20000000,
GPU_CMD_MODIFIER = 0x80000000,
GPU_CMD_SPRITE = 0xA0000000
};
/* Capture buffer: _glSubmitHeaderOrVertex() appends a copy of every record
 * it is given, and each test clears it before calling SceneListSubmit(). */
static std::vector<Vertex> sent;
/* Linearly blends two packed colours, each holding four 8-bit channels in
 * one 32-bit word.
 *
 * a, b: the two source colours.
 * t:    blend factor; 0 selects *a, 1 selects *b. It is clamped to [0, 1]
 *       (NaN is treated as 0) and quantised to steps of 1/256.
 * out:  receives the blended colour.
 *
 * Channels are processed two at a time. The pair selected by MASK2 is shifted
 * down by 8 bits *before* it is multiplied: multiplying 0xFF000000 by a
 * weight of up to 256 does not fit in 32 bits, which previously corrupted
 * the top channel. */
static inline void interpolateColour(const uint32_t* a, const uint32_t* b, const float t, uint32_t* out) {
    const static uint32_t MASK1 = 0x00FF00FF;
    const static uint32_t MASK2 = 0xFF00FF00;

    /* Keep the weight in range so neither the float->unsigned conversion nor
     * the 256 - f2 subtraction can wrap */
    const float clamped = (t > 0.0f) ? ((t < 1.0f) ? t : 1.0f) : 0.0f;

    const uint32_t f2 = (uint32_t) (256 * clamped);
    const uint32_t f1 = 256 - f2;

    const uint32_t low_pair = ((((*a & MASK1) * f1) + ((*b & MASK1) * f2)) >> 8) & MASK1;
    const uint32_t high_pair = ((((*a & MASK2) >> 8) * f1) + (((*b & MASK2) >> 8) * f2)) & MASK2;

    *out = low_pair | high_pair;
}
/* Computes the point where the edge v1 -> v2 crosses the near plane
 * (z + w == 0) and stores the interpolated position, w, texture coordinates
 * and colour in vout. vout->flags is left untouched.
 *
 * The caller is expected to pass one vertex on each side of the plane: if
 * both have the same z + w the reciprocal below divides by zero. */
static inline void _glClipEdge(const Vertex* v1, const Vertex* v2, Vertex* vout) {
    /* Signed distances of both end points from the plane */
    const float dist_start = v1->w + v1->xyz[2];
    const float dist_end = v2->w + v2->xyz[2];

    /* +1 when the edge runs from the larger distance to the smaller, else -1 */
    const float direction = ((2.0f * (dist_end < dist_start)) - 1.0f);

    /* Small bias applied to the interpolation factor, opposite in sign to
     * `direction` */
    const float bias = -0.00001f * direction;

    const float delta = (dist_start - dist_end);
    const float inv_delta = (1.f / sqrtf(delta * delta)) * direction;

    float t = fmaf(inv_delta, dist_start, bias);

    /* from + (to - from) * t, evaluated as a single fused multiply-add */
    auto lerp = [t](const float from, const float to) -> float {
        return fmaf(to - from, t, from);
    };

    vout->xyz[0] = lerp(v1->xyz[0], v2->xyz[0]);
    vout->xyz[1] = lerp(v1->xyz[1], v2->xyz[1]);
    vout->xyz[2] = lerp(v1->xyz[2], v2->xyz[2]);
    vout->w = lerp(v1->w, v2->w);

    vout->uv[0] = lerp(v1->uv[0], v2->uv[0]);
    vout->uv[1] = lerp(v1->uv[1], v2->uv[1]);

    interpolateColour((uint32_t*) v1->bgra, (uint32_t*) v2->bgra, t, (uint32_t*) vout->bgra);
}
/* True when flags marks a strip vertex (ordinary or end-of-strip); false for
 * anything else, e.g. a header. */
bool glIsVertex(const uint32_t flags) {
    if(flags == GPU_CMD_VERTEX_EOL) {
        return true;
    }

    return flags == GPU_CMD_VERTEX;
}
/* True only for the end-of-strip vertex command. */
bool glIsLastVertex(const uint32_t flags) {
    const bool ends_strip = (flags == GPU_CMD_VERTEX_EOL);
    return ends_strip;
}
void _glSubmitHeaderOrVertex(volatile uint32_t*, Vertex* vtx) {
sent.push_back(*vtx);
}
/* Returns 1 / sqrt(x * x), i.e. the reciprocal of the magnitude of x.
 * The result is therefore never negative, whatever the sign of x. */
float _glFastInvert(float x) {
    const float magnitude = __builtin_sqrtf(x * x);
    return (1.f / magnitude);
}
/* Projects a vertex in place: x and y are scaled by the reciprocal of w and
 * mapped through VIEWPORT, with y additionally flipped against h. xyz[2] is
 * replaced by a reciprocal depth value. vertex->w itself is not modified. */
void _glPerspectiveDivideVertex(Vertex* vertex, const float h) {
    /* Shared by the x/y mapping and the perspective depth */
    const float inv_w = _glFastInvert(vertex->w);

    /* Convert to NDC and apply viewport */
    const float mapped_x = __builtin_fmaf(
        VIEWPORT.hwidth, vertex->xyz[0] * inv_w, VIEWPORT.x_plus_hwidth
    );
    const float mapped_y = __builtin_fmaf(
        VIEWPORT.hheight, vertex->xyz[1] * inv_w, VIEWPORT.y_plus_hheight
    );

    vertex->xyz[0] = mapped_x;
    vertex->xyz[1] = h - mapped_y;

    /* Orthographic projections (w == 1) would lose their depth information
       if 1/w were used, so the reciprocal of z is stored instead. Clip-space
       z ranges from -w to +w, hence the offset of 1.0 to bring it into range,
       plus a little extra to avoid a divide by zero. */
    if(vertex->w == 1.0f) {
        vertex->xyz[2] = _glFastInvert(1.0001f + vertex->xyz[2]);
    } else {
        vertex->xyz[2] = inv_w;
    }
}
void memcpy_vertex(Vertex* dst, Vertex* src) {
*dst = *src;
}
/* Z-clipping is so difficult to get right that this sample tests all the cases of clipping and makes sure that things work as expected. */
/* SceneListSubmit() writes to the PVR DMA and QACR registers. On Dreamcast
 * builds these pointers refer to the fixed register addresses below; on any
 * other host they are ordinary arrays, so the writes have no effect beyond
 * this file. */
#ifdef __DREAMCAST__
static volatile int *pvrdmacfg = (int*)0xA05F6888;
static volatile int *qacr = (int*)0xFF000038;
#else
static int pvrdmacfg[2];
static int qacr[2];
#endif
/* Near-plane clipper for triangle strips (host-side copy under test).
 *
 * src: n consecutive Vertex records - a header followed by the vertices of
 *      a strip, the last of which is flagged GPU_CMD_VERTEX_EOL. Further
 *      header + strip groups may follow.
 * n:   number of records; fewer than 4 (a header plus one triangle) is
 *      ignored.
 *
 * Everything that would go to the hardware is passed to
 * _glSubmitHeaderOrVertex(), which in this file appends it to `sent`.
 *
 * Vertices are copied into a small ring buffer (queue_head = oldest,
 * queue_tail = next free slot). visible_mask holds one bit per vertex of the
 * current triangle: bit 0 is the oldest, bit 2 the newest, and a bit is set
 * when z >= -w. Each new vertex shifts the mask right and enters at bit 2.
 *
 *  - mask 7: the oldest vertex is emitted and the window slides by one; on
 *    the last vertex of the strip the remaining two are flushed as well.
 *  - mask 0: nothing is emitted and the window slides by one.
 *  - masks 1-6: the triangle is clipped against the plane, the result is
 *    emitted (with a repeated vertex where needed to keep the strip winding),
 *    and the queue is emptied.
 *
 * Whenever the queue has been emptied, the first two vertices of whatever
 * follows are loaded, and any header met on the way is forwarded.
 *
 * Compared with the previous revision of this function:
 *  - the reload step now advances past a header instead of submitting the
 *    same header repeatedly and then treating it as a vertex;
 *  - the reload step only runs when the queue is empty, so a fully visible
 *    strip longer than one triangle no longer skips vertices;
 *  - an invisible triangle that ends a strip now empties the queue so the
 *    next strip starts cleanly.
 *
 * NOTE(review): a clipped triangle still discards all three queued
 * vertices, so a strip that continues *after* a clipped triangle restarts
 * from the next two input vertices rather than sharing the clipped edge.
 * NOTE(review): this copy now differs from the library routine it was taken
 * from - port the same changes there or re-sync. */
void SceneListSubmit(void* src, int n) {
    /* You need at least a header, and 3 vertices to render anything */
    if(n < 4) {
        return;
    }

    const float h = GetVideoMode()->height;

    PVR_SET(SPAN_SORT_CFG, 0x0);

    //Set PVR DMA registers
    pvrdmacfg[0] = 1;
    pvrdmacfg[1] = 0;

    //Set QACR registers
    qacr[1] = qacr[0] = 0x11;

    volatile uint32_t *d = SQ_BASE_ADDRESS;

    int8_t queue_head = 0;
    int8_t queue_tail = 0;

    Vertex __attribute__((aligned(32))) queue[5];
    const int queue_capacity = sizeof(queue) / sizeof(Vertex);

    Vertex* vertex = (Vertex*) src;
    uint32_t visible_mask = 0;

    /* Assume first entry is a header */
    _glSubmitHeaderOrVertex(d, vertex++);

    /* Push first 2 vertices of the strip */
    memcpy_vertex(&queue[0], vertex++);
    memcpy_vertex(&queue[1], vertex++);
    visible_mask = ((queue[0].xyz[2] >= -queue[0].w) << 1) | ((queue[1].xyz[2] >= -queue[1].w) << 2);
    queue_tail = 2;
    n -= 3;

    while(n--) {
        Vertex* self = &queue[queue_tail];
        memcpy_vertex(self, vertex++);
        visible_mask = (visible_mask >> 1) | ((self->xyz[2] >= -self->w) << 2); // Push new vertex
        queue_tail = (queue_tail + 1) % queue_capacity;

        switch(visible_mask) {
            case 0:
                /* Nothing visible. If the strip ends here the two vertices
                 * still queued belong to nothing that follows, so drop them;
                 * otherwise just slide the window along by one. */
                if(glIsLastVertex(self->flags)) {
                    queue_head = queue_tail;
                    visible_mask = 0;
                } else {
                    queue_head = (queue_head + 1) % queue_capacity;
                }
            break;
            case 7:
                /* All visible, push the first vertex and move on */
                _glPerspectiveDivideVertex(&queue[queue_head], h);
                _glSubmitHeaderOrVertex(d, &queue[queue_head]);
                queue_head = (queue_head + 1) % queue_capacity;

                if(glIsLastVertex(self->flags)) {
                    /* If this was the last vertex in the strip, we clear the
                     * triangle out */
                    while(queue_head != queue_tail) {
                        _glPerspectiveDivideVertex(&queue[queue_head], h);
                        _glSubmitHeaderOrVertex(d, &queue[queue_head]);
                        queue_head = (queue_head + 1) % queue_capacity;
                    }

                    visible_mask = 0;
                }
            break;
            case 1:
                /* First vertex was visible */
                {
                    Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
                    Vertex* v0 = &queue[queue_head];
                    Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
                    Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(v0, v1, &a);
                    _glClipEdge(v2, v0, &b);
                    a.flags = GPU_CMD_VERTEX;

                    /* If v2 was the last in the strip, then b should be. If it wasn't
                       we'll create a degenerate triangle by adding b twice in a row so that the
                       strip processing will continue correctly after crossing the plane so it can
                       cross back*/
                    b.flags = v2->flags;

                    _glPerspectiveDivideVertex(v0, h);
                    _glSubmitHeaderOrVertex(d, v0);

                    _glPerspectiveDivideVertex(&a, h);
                    _glSubmitHeaderOrVertex(d, &a);

                    _glPerspectiveDivideVertex(&b, h);
                    _glSubmitHeaderOrVertex(d, &b);
                    _glSubmitHeaderOrVertex(d, &b);

                    /* But skip the vertices that are already there */
                    queue_head = (queue_head + 3) % queue_capacity;
                    visible_mask = 0;
                }
            break;
            case 2:
                /* Second vertex was visible. In this case we need to create a triangle and produce
                   two new vertices: 1-2, and 2-3. */
                {
                    Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
                    Vertex* v0 = &queue[queue_head];
                    Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
                    Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(v0, v1, &a);
                    _glClipEdge(v1, v2, &b);
                    a.flags = GPU_CMD_VERTEX;
                    b.flags = GPU_CMD_VERTEX_EOL;

                    _glPerspectiveDivideVertex(&a, h);
                    _glSubmitHeaderOrVertex(d, &a);

                    _glPerspectiveDivideVertex(v1, h);
                    _glSubmitHeaderOrVertex(d, v1);

                    _glPerspectiveDivideVertex(&b, h);
                    _glSubmitHeaderOrVertex(d, &b);

                    /* But skip the vertices that are already there */
                    queue_head = (queue_head + 3) % queue_capacity;
                    visible_mask = 0;
                }
            break;
            case 3: /* First and second vertex were visible */
                {
                    Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
                    Vertex* v0 = &queue[queue_head];
                    Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
                    Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(v1, v2, &a);
                    _glClipEdge(v2, v0, &b);

                    a.flags = v2->flags;
                    b.flags = GPU_CMD_VERTEX;

                    _glPerspectiveDivideVertex(v0, h);
                    _glSubmitHeaderOrVertex(d, v0);

                    _glPerspectiveDivideVertex(v1, h);
                    _glSubmitHeaderOrVertex(d, v1);

                    _glPerspectiveDivideVertex(&b, h);
                    _glSubmitHeaderOrVertex(d, &b);

                    /* v1 again (already divided) to keep the strip winding */
                    _glSubmitHeaderOrVertex(d, v1);

                    _glPerspectiveDivideVertex(&a, h);
                    _glSubmitHeaderOrVertex(d, &a);

                    /* But skip the vertices that are already there */
                    queue_head = (queue_head + 3) % queue_capacity;
                    visible_mask = 0;
                }
            break;
            case 4:
                /* Third vertex was visible. */
                {
                    Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
                    Vertex* v0 = &queue[queue_head];
                    Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
                    /* v2 is taken by value here, so the divide below works on a copy */
                    Vertex v2 = queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(&v2, v0, &a);
                    _glClipEdge(v1, &v2, &b);
                    a.flags = GPU_CMD_VERTEX;
                    b.flags = GPU_CMD_VERTEX;

                    /* a is sent twice: a degenerate lead-in to the new triangle */
                    _glPerspectiveDivideVertex(&a, h);
                    _glSubmitHeaderOrVertex(d, &a);
                    _glSubmitHeaderOrVertex(d, &a);

                    _glPerspectiveDivideVertex(&b, h);
                    _glSubmitHeaderOrVertex(d, &b);

                    _glPerspectiveDivideVertex(&v2, h);
                    _glSubmitHeaderOrVertex(d, &v2);

                    queue_head = (queue_head + 3) % queue_capacity;
                    visible_mask = 0;
                }
            break;
            case 5: /* First and third vertex were visible */
                {
                    Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
                    Vertex* v0 = &queue[queue_head];
                    Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
                    Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(v0, v1, &a);
                    _glClipEdge(v1, v2, &b);
                    a.flags = GPU_CMD_VERTEX;
                    b.flags = GPU_CMD_VERTEX;

                    _glPerspectiveDivideVertex(v0, h);
                    _glSubmitHeaderOrVertex(d, v0);

                    _glPerspectiveDivideVertex(&a, h);
                    _glSubmitHeaderOrVertex(d, &a);

                    /* v2 goes out twice: first as an ordinary vertex, then
                     * again with its original flags (which may end the strip) */
                    uint32_t v2_flags = v2->flags;
                    v2->flags = GPU_CMD_VERTEX;
                    _glPerspectiveDivideVertex(v2, h);
                    _glSubmitHeaderOrVertex(d, v2);

                    _glPerspectiveDivideVertex(&b, h);
                    _glSubmitHeaderOrVertex(d, &b);

                    v2->flags = v2_flags;
                    _glSubmitHeaderOrVertex(d, v2);

                    queue_head = (queue_head + 3) % queue_capacity;
                    visible_mask = 0;
                }
            break;
            case 6: /* Second and third vertex were visible */
                {
                    Vertex __attribute__((aligned(32))) a, b; // Scratch vertices
                    Vertex* v0 = &queue[queue_head];
                    Vertex* v1 = &queue[(queue_head + 1) % queue_capacity];
                    Vertex* v2 = &queue[(queue_head + 2) % queue_capacity];

                    _glClipEdge(v0, v1, &a);
                    _glClipEdge(v2, v0, &b);
                    a.flags = GPU_CMD_VERTEX;
                    b.flags = GPU_CMD_VERTEX;

                    _glPerspectiveDivideVertex(&a, h);
                    _glSubmitHeaderOrVertex(d, &a);

                    _glPerspectiveDivideVertex(v1, h);
                    _glSubmitHeaderOrVertex(d, v1);

                    _glPerspectiveDivideVertex(&b, h);
                    _glSubmitHeaderOrVertex(d, &b);

                    /* v1 again (already divided) to keep the strip winding */
                    _glSubmitHeaderOrVertex(d, v1);

                    _glPerspectiveDivideVertex(v2, h);
                    _glSubmitHeaderOrVertex(d, v2);

                    queue_head = (queue_head + 3) % queue_capacity;
                    visible_mask = 0;
                }
            break;
            default:
                break;
        }

        /* Only reload once the queue has been emptied; while a strip is
         * still running the two most recent vertices must stay queued. */
        if(queue_head != queue_tail) {
            continue;
        }

        /* Submit the beginning of the next strip (2 verts, maybe a header).
         * One record is always left for the main loop, hence n > 1. */
        int8_t v = 0;
        while(v < 2 && n > 1) {
            if(!glIsVertex(vertex->flags)) {
                /* Forward the header and step over it */
                _glSubmitHeaderOrVertex(d, vertex++);
            } else {
                memcpy_vertex(&queue[queue_tail], vertex++);
                visible_mask = (visible_mask >> 1) | ((queue[queue_tail].xyz[2] >= -queue[queue_tail].w) << 2); // Push new vertex
                queue_tail = (queue_tail + 1) % queue_capacity;
                ++v;
            }
            --n;
        }
    }
}
/* Compact clip-space position (x, y, z, w) used to write test geometry as
 * brace-initialised lists; make_vertices() expands each one into a Vertex. */
struct VertexTmpl {
    float x, y, z, w;

    VertexTmpl(float x_, float y_, float z_, float w_):
        x(x_),
        y(y_),
        z(z_),
        w(w_) {
    }
};
std::vector<Vertex> make_vertices(const std::vector<VertexTmpl>& verts) {
std::vector<Vertex> result;
Vertex r;
r.flags = GPU_CMD_POLYHDR;
result.push_back(r);
for(auto& v: verts) {
r.flags = GPU_CMD_VERTEX;
r.xyz[0] = v.x;
r.xyz[1] = v.y;
r.xyz[2] = v.z;
r.uv[0] = 0.0f;
r.uv[1] = 0.0f;
r.w = v.w;
result.push_back(r);
}
result.back().flags = GPU_CMD_VERTEX_EOL;
return result;
}
/* Generic test assertion: does nothing when the two values compare equal
 * (using operator!=) and throws std::runtime_error otherwise. */
template<typename T, typename U>
void check_equal(const T& lhs, const U& rhs) {
    const bool differs = (lhs != rhs);

    if(!differs) {
        return;
    }

    throw std::runtime_error("Assertion failed");
}
/* Vertex overload of the test assertion: two vertices count as equal when
 * their x, y, z and w match exactly. Flags, texture coordinates and colour
 * are deliberately not compared. Throws std::runtime_error on a mismatch. */
template<>
void check_equal(const Vertex& lhs, const Vertex& rhs) {
    bool mismatch = false;

    for(int axis = 0; axis < 3 && !mismatch; ++axis) {
        mismatch = (lhs.xyz[axis] != rhs.xyz[axis]);
    }

    if(!mismatch) {
        mismatch = (lhs.w != rhs.w);
    }

    if(mismatch) {
        throw std::runtime_error("Assertion failed");
    }
}
bool test_clip_case_001() {
/* The first vertex is visible only */
sent.clear();
auto data = make_vertices({
{0.000000, -2.414213, 3.080808, 5.000000},
{-4.526650, -2.414213, -7.121212, -5.000000},
{4.526650, -2.414213, -7.121212, -5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 5);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
// Because we're sending a single triangle, we end up sending a
// degenerate final vert. But if we were sending more than one triangle
// this would be GPU_CMD_VERTEX twice
check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[4].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[3], sent[4]);
return true;
}
bool test_clip_case_010() {
/* The third vertex is visible only */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, -7.121212, -5.000000},
{0.000000, -2.414213, 3.080808, 5.000000},
{4.526650, -2.414213, -7.121212, -5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 4);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX_EOL);
return true;
}
bool test_clip_case_100() {
/* The third vertex is visible only */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, -7.121212, -5.000000},
{4.526650, -2.414213, -7.121212, -5.000000},
{0.000000, -2.414213, 3.080808, 5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 5);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
// Because we're sending a single triangle, we end up sending a
// degenerate final vert. But if we were sending more than one triangle
// this would be GPU_CMD_VERTEX twice
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[1], sent[2]);
return true;
}
bool test_clip_case_110() {
/* 2nd and 3rd visible */
sent.clear();
auto data = make_vertices({
{0.0, -2.414213, -7.121212, -5.000000},
{-4.526650, -2.414213, 3.080808, 5.000000},
{4.526650, -2.414213, 3.080808, 5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 6);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX);
check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[2], sent[4]);
return true;
}
bool test_clip_case_011() {
/* 1st and 2nd visible */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, 3.080808, 5.000000},
{4.526650, -2.414213, 3.080808, 5.000000},
{0.0, -2.414213, -7.121212, -5.000000}
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 6);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX);
check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[2], sent[4]);
return true;
}
bool test_clip_case_101() {
/* 1st and 3rd visible */
sent.clear();
auto data = make_vertices({
{-4.526650, -2.414213, 3.080808, 5.000000},
{0.0, -2.414213, -7.121212, -5.000000},
{4.526650, -2.414213, 3.080808, 5.000000},
});
SceneListSubmit(&data[0], data.size());
check_equal(sent.size(), 6);
check_equal(sent[0].flags, GPU_CMD_POLYHDR);
check_equal(sent[1].flags, GPU_CMD_VERTEX);
check_equal(sent[2].flags, GPU_CMD_VERTEX);
check_equal(sent[3].flags, GPU_CMD_VERTEX);
check_equal(sent[4].flags, GPU_CMD_VERTEX);
check_equal(sent[5].flags, GPU_CMD_VERTEX_EOL);
check_equal(sent[3], sent[5]);
return true;
}
bool test_start_behind() {
/* Triangle behind the plane, but the strip continues in front */
sent.clear();
auto data = make_vertices({
{-3.021717, -2.414213, -10.155344, -9.935254},
{5.915236, -2.414213, -9.354721, -9.136231},
{-5.915236, -2.414213, -0.264096, -0.063767},
{3.021717, -2.414213, 0.536527, 0.735255},
{-7.361995, -2.414213, 4.681529, 4.871976},
{1.574958, -2.414213, 5.482152, 5.670999},
});
SceneListSubmit(&data[0], data.size());
return true;
}
/* Runs every clipping test in order. A failing check throws
 * std::runtime_error, which is not caught here, so the process terminates
 * abnormally on the first failure; otherwise the exit status is 0.
 * The command-line arguments are unused. */
int main(int argc, char* argv[]) {
    (void) argc;
    (void) argv;

    typedef bool (*TestCase)();

    /* TODO: the all-hidden (000) and all-visible (111) cases have no test yet */
    const TestCase cases[] = {
        test_clip_case_001,
        test_clip_case_010,
        test_clip_case_100,
        test_clip_case_110,
        test_clip_case_011,
        test_clip_case_101,
        test_start_behind
    };

    for(TestCase run: cases) {
        run();
    }

    return 0;
}