Dreamcast: Make TnL slightly more efficient

2025-01-22 09:01:57 -05:00 · 2024-10-16 07:10:23 +11:00 · 2024-10-16 07:10:23 +11:00 · a970aea405
commit a970aea405
parent 5a72b13822
3 changed files with 29 additions and 38 deletions
--- a/misc/dreamcast/VertexTransform.S
+++ b/misc/dreamcast/VertexTransform.S
@ -171,17 +171,12 @@ _DrawTexturedQuads:
 ! CLIPFLAGS TESTING
    and     #15,FLG
    cmp/eq   #0,FLG      ! T = CLIPFLAGS == 0 (all points invisible)
-    bt/s    .T_NONE_VISIBLE ! if T goto NONE_VISIBLE
-    nop
-    bra     .T_SOME_VISIBLE
+    bf/s    .T_LOOP_END  ! if !T goto LOOP_END
    nop

-.T_NONE_VISIBLE:
-    bra .T_LOOP_END ! jump to loop end after executing instruction in delay slot
+! No points visible case
    add #-128, DST       ! DST -= 4 * sizeof(VERTEX), move back to prior quad, so that this invisible quad gets overwritten in next iteration

-.T_SOME_VISIBLE:
-
 .T_LOOP_END:
    dt CNT               ! count--; T = count == 0
    bf .T_TRANSFORM_QUAD ! if !T then goto T_TRANSFORM_QUAD
@ -219,17 +214,12 @@ _DrawColouredQuads:
 ! CLIPFLAGS TESTING
    and     #15,FLG
    cmp/eq   #0,FLG      ! T = CLIPFLAGS == 0 (all points invisible)
-    bt/s    .C_NONE_VISIBLE ! if T goto NONE_VISIBLE
-    nop
-    bra     .C_SOME_VISIBLE
+    bf/s    .C_LOOP_END  ! if !T goto LOOP_END
    nop

-.C_NONE_VISIBLE:
-    bra .C_LOOP_END ! jump to loop end after executing instruction in delay slot
+! No points visible case
    add #-128, DST       ! DST -= 4 * sizeof(VERTEX), move back to prior quad, so that this invisible quad gets overwritten in next iteration

-.C_SOME_VISIBLE:
-
 .C_LOOP_END:
    dt CNT               ! count--; T = count == 0
    bf .C_TRANSFORM_QUAD ! if !T then goto C_TRANSFORM_QUAD
--- a/third_party/gldc/src/gldc.h
+++ b/third_party/gldc/src/gldc.h
@ -9,13 +9,9 @@ typedef struct {
    /* Same 32 byte layout as pvr_vertex_t */
    uint32_t flags;
    float x, y, z;
-    float u, v;
+    uint32_t u, v; // really floats, but stored as uint for better load/store codegen
    uint32_t bgra;
-
-    /* In the pvr_vertex_t structure, this next 4 bytes is oargb
-     * but we're not using that for now, so having W here makes the code
-     * simpler */
-    float w;
+    float w; // this slot is oargb in pvr_vertex_t; repurposed to hold W since oargb is unused
 } __attribute__ ((aligned (32))) Vertex;

 typedef struct {
--- a/third_party/gldc/src/sh4.c
+++ b/third_party/gldc/src/sh4.c
@ -21,21 +21,26 @@ static GL_FORCE_INLINE float _glFastInvert(float x) {
    return sh4_fsrra(x * x);
 }

-#define PushVertex(vtx) \
-	_glPerspectiveDivideVertex(vtx); \
-	_glPushHeaderOrVertex(vtx);
+static GL_FORCE_INLINE void PushVertex(Vertex* v) {
+    volatile Vertex* dst = (Vertex*)(sq);
+    float f = _glFastInvert(v->w);
+    // Convert to NDC (viewport already applied)
+    float x = v->x * f;
+    float y = v->y * f;

-static GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex) {
-    const float f = _glFastInvert(vertex->w);
-
-    /* Convert to NDC (viewport already applied) */
-    vertex->x = vertex->x * f;
-    vertex->y = vertex->y * f;
-    vertex->z = f;
+    dst->flags = v->flags;
+    dst->x = x;
+    dst->y = y;
+    dst->z = f;
+    dst->u = v->u;
+    dst->v = v->v;
+    dst->bgra = v->bgra;
+    __asm__("pref @%0" : : "r"(dst));
+    dst++;
 }

-static inline void _glPushHeaderOrVertex(Vertex* v)  {
-    uint32_t* s = (uint32_t*) v;
+static inline void PushCommand(Vertex* v)  {
+    uint32_t* s = (uint32_t*)v;
    sq[0] = *(s++);
    sq[1] = *(s++);
    sq[2] = *(s++);
@ -304,7 +309,7 @@ void SceneListSubmit(Vertex* v3, int n) {
        case PVR_CMD_VERTEX:
            continue;
        default:
-            _glPushHeaderOrVertex(v3);
+            PushCommand(v3);
            continue;
        };