Dreamcast: Slightly optimise performance by avoiding separate viewport transform

This commit is contained in:
UnknownShadow200 2024-08-03 10:41:06 +10:00
parent 967d52ea6a
commit 11fb4a2b65
6 changed files with 27 additions and 143 deletions

View file

@ -6,16 +6,6 @@
! r13 = cur vertex
! r14 = next vertex (prefetch)
!fr12 = VIEWPORT_HWIDTH
!fr13 = VIEWPORT_HHEIGHT
!fr14 = VIEWPORT_X_PLUS_HWIDTH
!fr15 = VIEWPORT_Y_PLUS_HHEIGHT
#define F_HW fr12
#define F_HH fr13
#define F_XP fr14
#define F_YP fr15
#define R_VTX r10
#define R_EOL r11
#define REG_CMD_VTX r10
@ -51,42 +41,8 @@
add #32,r8 ! EX, SQ += 32
.endm
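! Applies the perspective divide and viewport transform to a vertex in place
! (x and y are scaled by 1/w and mapped to the viewport, z is replaced by 1/w)
! CLOBBERS: fr0, fr4, fr5
! INPUTS: R (vertex)
! OUTPUTS: none (R is restored to the start of the vertex)
! TODO optimise greatly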
.macro ViewportTransform R
! INVERSE W CALCULATION
add #28, \R ! EX, \R = &vertex->w
fmov.s @\R,fr0 ! LS, fr0 = vertex->w
fmul fr0,fr0 ! FE, fr0 = fr0 * fr0
add #-24, \R ! EX, \R = &vertex->x
fsrra fr0 ! FE, fr0 = 1 / sqrt(fr0) -> 1 / vertex->w
! TRANSFORM X
fmov.s @\R,fr4 ! LS, fr4 = vertex->x
fmov F_XP,fr5 ! LS, fr5 = VIEWPORT_X_PLUS_HWIDTH
fmul F_HW,fr4 ! FE, fr4 = VIEWPORT_HWIDTH * vertex->x
fmac fr0,fr4,fr5 ! FE, fr5 = fr0 * fr4 + fr5 -- (X * F * hwidth) + x_plus_hwidth
fmov.s fr5,@\R ! LS, vertex->x = fr5
add #4, \R ! EX, \R = &vertex->y
! TRANSFORM Y
fmov.s @\R,fr4 ! LS, fr4 = vertex->y
fmov F_YP,fr5 ! LS, fr5 = VIEWPORT_Y_PLUS_HHEIGHT
fmul F_HH,fr4 ! FE, fr4 = VIEWPORT_HHEIGHT * vertex->y
fmac fr0,fr4,fr5 ! FE, fr5 = fr0 * fr4 + fr5 -- (Y * F * hheight) + y_plus_hheight
fmov.s fr5,@\R ! LS, vertex->y = fr5
add #4, \R ! EX, \R = &vertex->z
! ASSIGN Z
fmov.s fr0,@\R ! LS, vertex->z = fr0
add #-12, \R ! EX, \R -= 12 (back to start of vertex)
.endm
! Transforms then pushes a vertex to the store queue
! note: Vertices are assumed to have already had the viewport transform applied
! CLOBBERS: r2, fr0, fr4, fr5
! INPUTS: R (vertex), r8 (SQ global)
! OUTPUTS: R, r8 altered
@ -109,24 +65,20 @@
! TRANSFORM X
fmov.s @\R,fr4 ! LS, fr4 = SRC->x
fmov F_XP,fr5 ! LS, fr5 = VIEWPORT_X_PLUS_HWIDTH
fmul F_HW,fr4 ! FE, fr4 = VIEWPORT_HWIDTH * SRC->x
fmul fr0,fr4 ! FE, fr4 = invW * SRC->x
mov.l @(20,\R),r2 ! LS, tmp = SRC->bgra
mov.l r2,@(20,r8) ! LS, SRC->bgra = tmp
fmac fr0,fr4,fr5 ! FE, fr5 = invW * fr4 + fr5 -- (X * F * hwidth) + x_plus_hwidth
add #4, \R ! EX, SRC += 4
fmov.s fr5,@r8 ! LS, DST->x = fr5
fmov.s fr4,@r8 ! LS, DST->x = fr4
! TRANSFORM Y
fmov.s @\R,fr4 ! LS, fr4 = SRC->y
fmov F_YP,fr5 ! LS, fr5 = VIEWPORT_Y_PLUS_HHEIGHT
fmul F_HH,fr4 ! FE, fr4 = VIEWPORT_HHEIGHT * SRC->y
add #8, r8 ! EX, DST += 8
fmul fr0,fr4 ! FE, fr4 = invW * SRC->y
fmov.s fr0,@r8 ! LS, DST->z = invW
fmac fr0,fr4,fr5 ! FE, fr5 = invW * fr4 + fr5 -- (Y * F * hheight) + y_plus_hheight
add #-4, r8 ! EX, DST -= 4
add #-8, \R ! EX, src -= 8 (back to start of vertex)
fmov.s fr5,@r8 ! LS, DST->y = fr5
fmov.s fr4,@r8 ! LS, DST->y = fr4
add #-8,r8 ! EX, DST -= 8 (back to start of vertex)
pref @r8 ! LS, Trigger SQ
@ -561,17 +513,6 @@ _ProcessVertexList:
mov.l r13,@-r15
mov.l r14,@-r15
sts.l pr,@-r15
! STORE FPU REGISTERS
fmov.s F_HW,@-r15
fmov.s F_HH,@-r15
fmov.s F_XP,@-r15
fmov.s F_YP,@-r15
! VIEWPORT SETUP
mov.l .VP_1,r0 ! LS, &vp
fmov.s @r0+,F_HW ! LS, vp.HWIDTH
fmov.s @r0+,F_HH ! LS, vp.HHEIGHT
fmov.s @r0+,F_XP ! LS, vp.X_PLUS_HWIDTH
fmov.s @r0+,F_YP ! LS, vp.Y_PLUS_HHEIGHT
! REGISTER SETUP
mov r4,r14
mov r4,r13
@ -583,33 +524,11 @@ _ProcessVertexList:
bra SUBMIT_LOOP
add #-64,r15
! Handles a non-vertex command
! Submits a PowerVR GPU command
DO_CMD:
mov r13,r4 ! r4 = CUR
mov r1,r0 ! r0 = MASK
cmp/eq #35,r0 ! T = MASK == 0x23
bt.s 9f ! if (T) goto 9;
nop
! PowerVR GPU command
PushVertex REG_V0
bra NEXT_ITER
nop
! Viewport update command
9:
add #4,r4
mov.l .VP_1,r2
! Load VIEWPORT registers
fmov.s @r4+,F_HW ! VIEWPORT_HWIDTH = src->x
fmov.s @r4+,F_HH ! VIEWPORT_HHEIGHT = src->y
fmov.s @r4+,F_XP ! VIEWPORT_X_PLUS_HWIDTH = src->z
add #16,r2
fmov.s @r4+,F_YP ! VIEWPORT_Y_PLUS_HHEIGHT = src->u
! And store to vp global
fmov.s F_YP,@-r2
fmov.s F_XP,@-r2
fmov.s F_HH,@-r2
bra NEXT_ITER
fmov.s F_HW,@-r2
SUBMIT_LOOP:
mov.l @r13,r0 ! FLAGS = CUR->flags
@ -645,18 +564,6 @@ NEXT_ITER:
mov r14,r13 ! CUR = NEXT
add #64,r15
! VIEWPORT SAVE
mov.l .VP_1,r0
add #16,r0
fmov.s F_YP,@-r0
fmov.s F_XP,@-r0
fmov.s F_HH,@-r0
fmov.s F_HW,@-r0
! RESTORE FPU REGISTERS
fmov.s @r15+,F_YP
fmov.s @r15+,F_XP
fmov.s @r15+,F_HH
fmov.s @r15+,F_HW
! RESTORE CPU REGISTERS
lds.l @r15+,pr
mov.l @r15+,r14

View file

@ -48,11 +48,6 @@
!fr5 = y
!fr6 = z
!fr7 = w
!fr8 = VIEWPORT_HWIDTH
!fr9 = VIEWPORT_HHEIGHT
!fr10 = VIEWPORT_X_PLUS_HWIDTH
!fr11 = VIEWPORT_Y_PLUS_HHEIGHT
!fv4 = XYZW

View file

@ -448,11 +448,21 @@ static matrix_t __attribute__((aligned(32))) _proj, _view;
static float textureOffsetX, textureOffsetY;
static int textureOffset;
static float vp_scaleX, vp_scaleY, vp_offsetX, vp_offsetY;
static matrix_t __attribute__((aligned(32))) mat_vp;
/* Stores 'matrix' as the current projection or view matrix (depending on 'type'),
 * then rebuilds the hardware transform with the viewport folded in.
 *
 * The viewport scale/offset (vp_scaleX/Y, vp_offsetX/Y) is written into mat_vp,
 * which is loaded first; the projection and view matrices are then applied on
 * top of it. Transformed vertices therefore only need the perspective divide
 * afterwards, not a separate viewport transform.
 *
 * NOTE(review): the vp_* values are sampled here, so a viewport change made
 * after this call presumably only takes effect on the next Gfx_LoadMatrix call
 * - confirm callers always reload a matrix after changing the viewport. */
void Gfx_LoadMatrix(MatrixType type, const struct Matrix* matrix) {
	if (type == MATRIX_PROJ) memcpy(&_proj, matrix, sizeof(struct Matrix));
	if (type == MATRIX_VIEW) memcpy(&_view, matrix, sizeof(struct Matrix));

	/* Identity matrix with viewport scale on the diagonal and offset in row 3 */
	memcpy(&mat_vp, &Matrix_Identity, sizeof(struct Matrix));
	mat_vp[0][0] = vp_scaleX;
	mat_vp[1][1] = vp_scaleY;
	mat_vp[3][0] = vp_offsetX;
	mat_vp[3][1] = vp_offsetY;

	/* mat_load replaces the current matrix, so loading _proj beforehand */
	/* would be immediately overwritten - load the viewport matrix directly */
	mat_load(&mat_vp);
	mat_apply(&_proj);
	mat_apply(&_view);
}
@ -645,13 +655,10 @@ static void PushCommand(void* cmd) {
}
/* Records the viewport rectangle (x, y, w, h) as scale/offset terms:
 *   scale  = (w / 2, -h / 2)
 *   offset = (x + w / 2, y + h / 2)
 * The terms are both queued as a 0x23 viewport command and kept in the
 * vp_scale / vp_offset statics.
 * NOTE(review): only flags, x, y, z and u of the queued command are assigned. */
void Gfx_SetViewport(int x, int y, int w, int h) {
	const float halfW = w * 0.5f;
	const float halfH = h * 0.5f;
	Vertex cmd;

	cmd.flags = PVR_CMD_USERCLIP | 0x23;
	cmd.x = halfW;      /* hwidth */
	cmd.y = -halfH;     /* hheight (negated) */
	cmd.z = x + halfW;  /* x_plus_hwidth */
	cmd.u = y + halfH;  /* y_plus_hheight */
	PushCommand(&cmd);

	vp_scaleX  = halfW;
	vp_scaleY  = -halfH;
	vp_offsetX = x + halfW;
	vp_offsetY = y + halfH;
}
void Gfx_SetScissor(int x, int y, int w, int h) {

View file

@ -44,15 +44,6 @@ typedef struct {
#define GL_FORCE_INLINE static __attribute__((always_inline)) inline
/* Precomputed terms for mapping post-divide coordinates to the viewport:
 *   screen_x = ndc_x * hwidth  + x_plus_hwidth
 *   screen_y = ndc_y * hheight + y_plus_hheight
 * NOTE(review): Gfx_SetViewport supplies h * -0.5f for hheight, so the
 * value may be negative - confirm against the caller. */
typedef struct {
float hwidth; /* width * 0.5f */
float hheight; /* height * 0.5f */
float x_plus_hwidth; /* x + width * 0.5f */
float y_plus_hheight; /* y + height * 0.5f */
} Viewport;
extern Viewport VIEWPORTS[3];
typedef struct {
//0
GLuint index;

View file

@ -8,7 +8,6 @@
#define SQ_BASE_ADDRESS (void*) 0xe0000000
#define PREFETCH(addr) __builtin_prefetch((addr))
Viewport vp;
GL_FORCE_INLINE float _glFastInvert(float x) {
return MATH_fsrra(x * x);
@ -17,10 +16,10 @@ GL_FORCE_INLINE float _glFastInvert(float x) {
/* Performs the perspective divide on a transformed vertex, in place.
 *
 * x and y are multiplied by f = _glFastInvert(w), and z is replaced by f.
 * The viewport scale/offset is expected to already be part of the matrix used
 * to transform the vertex, so it must not be applied again here. */
GL_FORCE_INLINE void _glPerspectiveDivideVertex(Vertex* vertex) {
	/* Computed once and reused for x, y and z */
	const float f = _glFastInvert(vertex->w);

	/* Convert to NDC (viewport already applied) */
	vertex->x = vertex->x * f;
	vertex->y = vertex->y * f;
	vertex->z = f;
}
@ -377,21 +376,8 @@ static void SubmitClipped(Vertex* v0, Vertex* v1, Vertex* v2, Vertex* v3, uint8_
}
}
/* Dispatches a non-vertex list entry.
 * Entries whose low flag byte is 0x23 carry viewport parameters in
 * x, y, z and u, which are copied into the global 'vp'.
 * Every other entry is forwarded to _glPushHeaderOrVertex. */
static __attribute__((noinline)) void HandleCommand(Vertex* v) {
	const int isViewportCmd = (v->flags & 0xFF) == 0x23;

	if (isViewportCmd) {
		vp.hwidth         = v->x;
		vp.hheight        = v->y;
		vp.x_plus_hwidth  = v->z;
		vp.y_plus_hheight = v->u;
	} else {
		_glPushHeaderOrVertex(v);
	}
}
extern void ProcessVertexList(Vertex* v3, int n, void* sq_addr);
void SceneListSubmit(Vertex* v3, int n, int type) {
vp = VIEWPORTS[type];
PVR_SET(SPAN_SORT_CFG, 0x0);
//Set PVR DMA registers
@ -414,7 +400,7 @@ void SceneListSubmit(Vertex* v3, int n, int type) {
case PVR_CMD_VERTEX:
continue;
default:
HandleCommand(v3);
_glPushHeaderOrVertex(v3);
continue;
};
@ -460,5 +446,4 @@ void SceneListSubmit(Vertex* v3, int n, int type) {
}
_glFlushBuffer();
VIEWPORTS[type] = vp;
}

View file

@ -26,7 +26,6 @@ GLboolean AUTOSORT_ENABLED;
AlignedVector OP_LIST;
AlignedVector PT_LIST;
AlignedVector TR_LIST;
Viewport VIEWPORTS[3];
void glKosInit() {
_glInitTextures();