Dreamcast: Make TnL slightly more efficient

This commit is contained in:
UnknownShadow200 2024-10-16 07:10:23 +11:00
parent 5a72b13822
commit a970aea405
3 changed files with 29 additions and 38 deletions

View file

@ -171,17 +171,12 @@ _DrawTexturedQuads:
! CLIPFLAGS TESTING ! CLIPFLAGS TESTING
and #15,FLG and #15,FLG
cmp/eq #0,FLG ! T = CLIPFLAGS == 0 (all points invisible) cmp/eq #0,FLG ! T = CLIPFLAGS == 0 (all points invisible)
bt/s .T_NONE_VISIBLE ! if T goto NONE_VISIBLE bf/s .T_LOOP_END ! if !T goto LOOP_END
nop
bra .T_SOME_VISIBLE
nop nop
.T_NONE_VISIBLE: ! No points visible case
bra .T_LOOP_END ! jump to loop end after executing instruction in delay slot
add #-128, DST ! DST -= 4 * sizeof(VERTEX), move back to prior quad, so that this invisible quad gets overwritten in next iteration add #-128, DST ! DST -= 4 * sizeof(VERTEX), move back to prior quad, so that this invisible quad gets overwritten in next iteration
.T_SOME_VISIBLE:
.T_LOOP_END: .T_LOOP_END:
dt CNT ! count--; T = count == 0 dt CNT ! count--; T = count == 0
bf .T_TRANSFORM_QUAD ! if !T then goto T_TRANSFORM_QUAD bf .T_TRANSFORM_QUAD ! if !T then goto T_TRANSFORM_QUAD
@ -219,17 +214,12 @@ _DrawColouredQuads:
! CLIPFLAGS TESTING ! CLIPFLAGS TESTING
and #15,FLG and #15,FLG
cmp/eq #0,FLG ! T = CLIPFLAGS == 0 (all points invisible) cmp/eq #0,FLG ! T = CLIPFLAGS == 0 (all points invisible)
bt/s .C_NONE_VISIBLE ! if T goto NONE_VISIBLE bf/s .C_LOOP_END ! if !T goto LOOP_END
nop
bra .C_SOME_VISIBLE
nop nop
.C_NONE_VISIBLE: ! No points visible case
bra .C_LOOP_END ! jump to loop end after executing instruction in delay slot
add #-128, DST ! dst -= 4 * sizeof(VERTEX), move back to 1 vertex before start of quad add #-128, DST ! dst -= 4 * sizeof(VERTEX), move back to 1 vertex before start of quad
.C_SOME_VISIBLE:
.C_LOOP_END: .C_LOOP_END:
dt CNT ! count--; T = count == 0 dt CNT ! count--; T = count == 0
bf .C_TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD bf .C_TRANSFORM_QUAD ! if !T then goto TRANSFORM_QUAD

View file

@ -9,13 +9,9 @@ typedef struct {
/* Same 32 byte layout as pvr_vertex_t */ /* Same 32 byte layout as pvr_vertex_t */
uint32_t flags; uint32_t flags;
float x, y, z; float x, y, z;
float u, v; uint32_t u, v; // really floats, but stored as uint for better load/store codegen
uint32_t bgra; uint32_t bgra;
float w; // actually oargb, but repurposed since unused
/* In the pvr_vertex_t structure, this next 4 bytes is oargb
* but we're not using that for now, so having W here makes the code
* simpler */
float w;
} __attribute__ ((aligned (32))) Vertex; } __attribute__ ((aligned (32))) Vertex;
typedef struct { typedef struct {

View file

@ -21,21 +21,26 @@ static GL_FORCE_INLINE float _glFastInvert(float x) {
return sh4_fsrra(x * x); return sh4_fsrra(x * x);
} }
#define PushVertex(vtx) \ static GL_FORCE_INLINE void PushVertex(Vertex* v) {
_glPerspectiveDivideVertex(vtx); \ volatile Vertex* dst = (Vertex*)(sq);
_glPushHeaderOrVertex(vtx); float f = _glFastInvert(v->w);
// Convert to NDC (viewport already applied)
float x = v->x * f;
float y = v->y * f;
static GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex) { dst->flags = v->flags;
const float f = _glFastInvert(vertex->w); dst->x = x;
dst->y = y;
/* Convert to NDC (viewport already applied) */ dst->z = f;
vertex->x = vertex->x * f; dst->u = v->u;
vertex->y = vertex->y * f; dst->v = v->v;
vertex->z = f; dst->bgra = v->bgra;
__asm__("pref @%0" : : "r"(dst));
dst++;
} }
static inline void _glPushHeaderOrVertex(Vertex* v) { static inline void PushCommand(Vertex* v) {
uint32_t* s = (uint32_t*) v; uint32_t* s = (uint32_t*)v;
sq[0] = *(s++); sq[0] = *(s++);
sq[1] = *(s++); sq[1] = *(s++);
sq[2] = *(s++); sq[2] = *(s++);
@ -304,7 +309,7 @@ void SceneListSubmit(Vertex* v3, int n) {
case PVR_CMD_VERTEX: case PVR_CMD_VERTEX:
continue; continue;
default: default:
_glPushHeaderOrVertex(v3); PushCommand(v3);
continue; continue;
}; };