Dreamcast: Make TnL slightly more efficient

This commit is contained in:
UnknownShadow200 2024-10-16 07:10:23 +11:00
parent 5a72b13822
commit a970aea405
3 changed files with 29 additions and 38 deletions

View file

@ -170,17 +170,12 @@ _DrawTexturedQuads:
! CLIPFLAGS TESTING
and #15,FLG
cmp/eq #0,FLG ! T = CLIPFLAGS == 0 (all points invisible)
bt/s .T_NONE_VISIBLE ! if T goto NONE_VISIBLE
nop
bra .T_SOME_VISIBLE
cmp/eq #0,FLG ! T = CLIPFLAGS == 0 (all points invisible)
bf/s .T_LOOP_END ! if !T goto LOOP_END
nop
.T_NONE_VISIBLE:
bra .T_LOOP_END ! jump to loop end after executing instruction in delay slot
add #-128, DST ! DST -= 4 * sizeof(VERTEX), move back to prior quad, so that this invisible quad gets overwritten in next iteration
.T_SOME_VISIBLE:
! No points visible case
add #-128, DST ! DST -= 4 * sizeof(VERTEX), move back to prior quad, so that this invisible quad gets overwritten in next iteration
.T_LOOP_END:
dt CNT ! count--; T = count == 0
@ -218,17 +213,12 @@ _DrawColouredQuads:
! CLIPFLAGS TESTING
and #15,FLG
cmp/eq #0,FLG ! T = CLIPFLAGS == 0 (all points invisible)
bt/s .C_NONE_VISIBLE ! if T goto NONE_VISIBLE
nop
bra .C_SOME_VISIBLE
cmp/eq #0,FLG ! T = CLIPFLAGS == 0 (all points invisible)
bf/s .C_LOOP_END ! if !T goto LOOP_END
nop
.C_NONE_VISIBLE:
bra .C_LOOP_END ! jump to loop end after executing instruction in delay slot
add #-128, DST ! dst -= 4 * sizeof(VERTEX), move back to 1 vertex before start of quad
.C_SOME_VISIBLE:
! No points visible case
add #-128, DST ! dst -= 4 * sizeof(VERTEX), move back to 1 vertex before start of quad
.C_LOOP_END:
dt CNT ! count--; T = count == 0
@ -237,4 +227,4 @@ _DrawColouredQuads:
TransformEnd
.size _DrawColouredQuads, .-_DrawColouredQuads
.type _DrawColouredQuads, %function
.type _DrawColouredQuads, %function

View file

@ -9,13 +9,9 @@ typedef struct {
/* Same 32 byte layout as pvr_vertex_t */
uint32_t flags;
float x, y, z;
float u, v;
uint32_t u, v; // really floats, but stored as uint for better load/store codegen
uint32_t bgra;
/* In the pvr_vertex_t structure, this next 4 bytes is oargb
* but we're not using that for now, so having W here makes the code
* simpler */
float w;
float w; // actually oargb, but repurposed since unused
} __attribute__ ((aligned (32))) Vertex;
typedef struct {

View file

@ -21,21 +21,26 @@ static GL_FORCE_INLINE float _glFastInvert(float x) {
return sh4_fsrra(x * x);
}
#define PushVertex(vtx) \
_glPerspectiveDivideVertex(vtx); \
_glPushHeaderOrVertex(vtx);
static GL_FORCE_INLINE void PushVertex(Vertex* v) {
volatile Vertex* dst = (Vertex*)(sq);
float f = _glFastInvert(v->w);
// Convert to NDC (viewport already applied)
float x = v->x * f;
float y = v->y * f;
static GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex) {
const float f = _glFastInvert(vertex->w);
/* Convert to NDC (viewport already applied) */
vertex->x = vertex->x * f;
vertex->y = vertex->y * f;
vertex->z = f;
dst->flags = v->flags;
dst->x = x;
dst->y = y;
dst->z = f;
dst->u = v->u;
dst->v = v->v;
dst->bgra = v->bgra;
__asm__("pref @%0" : : "r"(dst));
dst++;
}
static inline void _glPushHeaderOrVertex(Vertex* v) {
uint32_t* s = (uint32_t*) v;
static inline void PushCommand(Vertex* v) {
uint32_t* s = (uint32_t*)v;
sq[0] = *(s++);
sq[1] = *(s++);
sq[2] = *(s++);
@ -304,7 +309,7 @@ void SceneListSubmit(Vertex* v3, int n) {
case PVR_CMD_VERTEX:
continue;
default:
_glPushHeaderOrVertex(v3);
PushCommand(v3);
continue;
};