Rework (S)VGA renderer to be internally chunky + attempt to optimise for speed

2025-01-23 17:52:01 -05:00 · 2023-12-06 15:25:17 +13:00 · 2023-12-06 15:25:17 +13:00 · 44d70ab943
commit 44d70ab943
parent cf36889bd2
1 changed file with 88 additions and 83 deletions
--- a/src/video/vid_svga_render.c
+++ b/src/video/vid_svga_render.c
@@ -454,11 +454,10 @@ svga_render_indexed_gfx(svga_t *svga, bool highres, bool combine8bits)
    int       x;
    uint32_t  addr;
    uint32_t *p;
-    uint8_t   edat[4];
    uint32_t  changed_offset;

-    const bool    blinked     = svga->blink & 0x10;
-    const bool    attrblink   = ((svga->attrregs[0x10] & 0x08) != 0);
+    const bool blinked   = svga->blink & 0x10;
+    const bool attrblink = ((svga->attrregs[0x10] & 0x08) != 0);

    /*
       The following is likely how it works on an IBM VGA - that is, it works with its BIOS.
@@ -466,33 +465,50 @@ svga_render_indexed_gfx(svga_t *svga, bool highres, bool combine8bits)
       - S3 Trio: mode 13h (320x200x8), incbypow2 given as 2 treated as 0
       - ET4000/W32i: mode 2Eh (640x480x8), incevery given as 2 treated as 1
     */
-    const bool    forcepacked = combine8bits && (svga->force_old_addr || svga->packed_chain4);
+    const bool forcepacked = combine8bits && (svga->force_old_addr || svga->packed_chain4);

    /*
       SVGA cards with a high-resolution 8bpp mode may actually bypass the VGA shifter logic.
       - HT-216 (+ other Video7 chipsets?) has 0x3C4.0xC8 bit 4 which, when set to 1, loads
         bytes directly, bypassing the shifters.
     */
-    const bool    highres8bpp = combine8bits && highres;
+    const bool highres8bpp = combine8bits && highres;

-    const bool    dwordload   = ((svga->seqregs[0x01] & 0x10) != 0);
-    const bool    wordload    = ((svga->seqregs[0x01] & 0x04) != 0) && !dwordload;
-    const bool    wordincr    = ((svga->crtc[0x17] & 0x08) != 0);
-    const bool    dwordincr   = ((svga->crtc[0x14] & 0x20) != 0) && !wordincr;
-    const bool    dwordshift  = ((svga->crtc[0x14] & 0x40) != 0);
-    const bool    wordshift   = ((svga->crtc[0x17] & 0x40) == 0) && !dwordshift;
+    const bool     dwordload  = ((svga->seqregs[0x01] & 0x10) != 0);
+    const bool     wordload   = ((svga->seqregs[0x01] & 0x04) != 0) && !dwordload;
+    const bool     wordincr   = ((svga->crtc[0x17] & 0x08) != 0);
+    const bool     dwordincr  = ((svga->crtc[0x14] & 0x20) != 0) && !wordincr;
+    const bool     dwordshift = ((svga->crtc[0x14] & 0x40) != 0);
+    const bool     wordshift  = ((svga->crtc[0x17] & 0x40) == 0) && !dwordshift;
    const uint32_t incbypow2  = forcepacked ? 0 : (dwordshift ? 2 : wordshift ? 1 : 0);
-    const uint32_t incevery   = forcepacked ? 1 : (dwordincr  ? 4 : wordincr  ? 2 : 1);
-    const uint32_t loadevery  = forcepacked ? 1 : (dwordload  ? 4 : wordload  ? 2 : 1);
+    const uint32_t incevery   = forcepacked ? 1 : (dwordincr ? 4 : wordincr ? 2 : 1);
+    const uint32_t loadevery  = forcepacked ? 1 : (dwordload ? 4 : wordload ? 2 : 1);

-    const bool    shift4bit   = ((svga->gdcreg[0x05] & 0x40) == 0x40 ) || highres8bpp;
-    const bool    shift2bit   = ((svga->gdcreg[0x05] & 0x60) == 0x20 ) && !shift4bit;
+    const bool shift4bit = ((svga->gdcreg[0x05] & 0x40) == 0x40) || highres8bpp;
+    const bool shift2bit = ((svga->gdcreg[0x05] & 0x60) == 0x20) && !shift4bit;

-    const int     dwshift     = highres ? 0 : 1;
-    const int     dotwidth    = 1 << dwshift;
-    const int     charwidth   = dotwidth * (combine8bits ? 4 : 8);
-    const uint8_t blinkmask   = (attrblink ? 0x8 : 0x0);
-    const uint8_t blinkval    = (attrblink && blinked ? 0x8 : 0x0);
+    const int      dwshift   = highres ? 0 : 1;
+    const int      dotwidth  = 1 << dwshift;
+    const int      charwidth = dotwidth * (combine8bits ? 4 : 8);
+    const uint32_t planemask = 0x11111111 * (uint32_t) (svga->plane_mask);
+    const uint32_t blinkmask = (attrblink ? 0x88888888 : 0x0);
+    const uint32_t blinkval  = (attrblink && blinked ? 0x88888888 : 0x0);
+
+    /*
+       This is actually an 8x 3-bit lookup table,
+       preshifted by 2 bits to allow shifting by multiples of 4 bits.
+
+       Anyway, when we perform a planar-to-chunky conversion,
+       we keep the pixel values in a scrambled order.
+       This lookup table unscrambles them.
+
+       WARNING: Octal values are used here!
+     */
+    const uint32_t shift_values = (shift4bit
+                                       ? ((067452301) << 2)
+                                       : shift2bit
+                                       ? ((026370415) << 2)
+                                       : ((002461357) << 2));

    if ((svga->displine + svga->y_add) < 0)
        return;
@@ -502,8 +518,7 @@ svga_render_indexed_gfx(svga_t *svga, bool highres, bool combine8bits)
    else
        changed_offset = svga->remap_func(svga, svga->ma) >> 12;

-    if (!(svga->changedvram[changed_offset] || svga->changedvram[changed_offset + 1] ||
-        svga->fullchange))
+    if (!(svga->changedvram[changed_offset] || svga->changedvram[changed_offset + 1] || svga->fullchange))
        return;
    p = &svga->monitor->target_buffer->line[svga->displine + svga->y_add][svga->x_add];

@@ -513,6 +528,7 @@ svga_render_indexed_gfx(svga_t *svga, bool highres, bool combine8bits)

    uint32_t incr_counter = 0;
    uint32_t load_counter = 0;
+    uint32_t edat         = 0;
    for (x = 0; x <= (svga->hdisp + svga->scrollcache); x += charwidth) {
        if (load_counter == 0) {
            /* Find our address */
@@ -529,7 +545,7 @@ svga_render_indexed_gfx(svga_t *svga, bool highres, bool combine8bits)
                        if (svga->ma & (4 << 15))
                            addr |= 0x4;
                    } else {
-                        if (svga->ma & (4<<13))
+                        if (svga->ma & (4 << 13))
                            addr |= 0x4;
                    }
                } else {
@@ -548,42 +564,31 @@ svga_render_indexed_gfx(svga_t *svga, bool highres, bool combine8bits)
            addr &= svga->vram_display_mask;

            /* Load VRAM */
-            *(uint32_t *)&edat[0] = *(uint32_t *)&svga->vram[addr];
+            edat = *(uint32_t *) &svga->vram[addr];

-            if (shift4bit) {
-                /*
-                   Remap VGA 4bpp-chunky data into fully planar data
-                   Plane 3 LSbit is aligned with MSbit
-                 */
-                uint8_t tmpdat[4] = {0, 0, 0, 0};
-                for (int j = 0; j < 4; j++) {
-                    for (int i = 0; i < 8; i++) {
-                        tmpdat[j] <<= 1;
-                        tmpdat[j] |= (edat[i>>1] >> (((0x1&~i)<<2)+j)) & 0x1;
-                    }
+            /*
+               EGA and VGA actually use 4bpp planar as their native format.
+               But 4bpp chunky is generally easier to deal with on a modern CPU.
+               shift4bit is the native format for this renderer (4bpp chunky).
+             */
+            if (!shift4bit) {
+                if (shift2bit) {
+                    /* Group 2x 2bpp values into 4bpp values */
+                    edat = (edat & 0xCCCC3333) | ((edat << 14) & 0x33330000) | ((edat >> 14) & 0x0000CCCC);
+                } else {
+                    /* Group 4x 1bpp values into 4bpp values */
+                    edat = (edat & 0xAA55AA55) | ((edat << 7) & 0x55005500) | ((edat >> 7) & 0x00AA00AA);
+                    edat = (edat & 0xCCCC3333) | ((edat << 14) & 0x33330000) | ((edat >> 14) & 0x0000CCCC);
                }
-                *(uint32_t *) (&edat[0]) = *(uint32_t *) (&tmpdat[0]);
-            }
-
-            if (shift2bit) {
-                // Remap CGA 2bpp-chunky data into fully planar data
-                uint8_t dat0 = egaremap2bpp[edat[1]] | (egaremap2bpp[edat[0]] << 4);
-                uint8_t dat1 = egaremap2bpp[edat[1] >> 1] | (egaremap2bpp[edat[0] >> 1] << 4);
-                uint8_t dat2 = egaremap2bpp[edat[3]] | (egaremap2bpp[edat[2]] << 4);
-                uint8_t dat3 = egaremap2bpp[edat[3] >> 1] | (egaremap2bpp[edat[2] >> 1] << 4);
-                edat[0]      = dat0;
-                edat[1]      = dat1;
-                edat[2]      = dat2;
-                edat[3]      = dat3;
            }
        } else {
            /*
               According to the 82C451 VGA clone chipset datasheet, all 4 planes chain in a ring.
               So, rotate them all around.
+               Planar version: edat = (edat >> 8) | (edat << 24);
+               Here's the chunky version...
             */
-            *(uint32_t *)&edat[0]
-                = ((*(uint32_t *)&edat[0]) >> 8)
-                | ((*(uint32_t *)&edat[0]) << 24);
+            edat = ((edat >> 1) & 0x77777777) | ((edat << 3) & 0x88888888);
        }
        load_counter += 1;
        if (load_counter >= loadevery)
@@ -593,50 +598,50 @@ svga_render_indexed_gfx(svga_t *svga, bool highres, bool combine8bits)
        if (incr_counter >= incevery) {
            incr_counter = 0;
            svga->ma += 4;
-            // DISCREPANCY TODO FIXME 2/4bpp used vram_mask, 8bpp used vram_display_mask --GM
+            /* DISCREPANCY TODO FIXME 2/4bpp used vram_mask, 8bpp used vram_display_mask --GM */
            svga->ma &= svga->vram_display_mask;
        }

+        uint32_t current_shift = shift_values;
+        uint32_t out_edat      = edat;
        /*
-           Now that we've converted it all to planar, convert it (back?) to chunky!
+           Apply blink
+           FIXME: Confirm blink behaviour on real hardware
+
+           The VGA 4bpp graphics blink logic was a pain to work out.
+
+           If plane 3 is enabled in the attribute controller, then:
+           - if bit 3 is 0, then we force the output of it to be 1.
+           - if bit 3 is 1, then the output blinks.
+           This can be tested with Lotus 1-2-3 release 2.3 with the WYSIWYG addon.
+
+           If plane 3 is disabled in the attribute controller, then the output blinks.
+           This can be tested with QBASIC SCREEN 10 - anything using color #2 should
+           blink and nothing else.
+
+           If you can simplify the following and have it still work, give yourself a medal.
         */
+        out_edat = ((out_edat & planemask & ~blinkmask) | ((out_edat | ~planemask) & blinkmask & blinkval)) ^ blinkmask;
+
        for (int i = 0; i < 8; i += 2) {
-            const int inshift = 6 - i;
-            uint8_t dat
-                = (edatlookup[(edat[0] >> inshift) & 3][(edat[1] >> inshift) & 3])
-                | (edatlookup[(edat[2] >> inshift) & 3][(edat[3] >> inshift) & 3] << 2);
-
-            /* FIXME: Confirm blink behaviour on real hardware
-
-               The VGA 4bpp graphics blink logic was a pain to work out.
-
-               If plane 3 is enabled in the attribute controller, then:
-               - if bit 3 is 0, then we force the output of it to be 1.
-               - if bit 3 is 1, then the output blinks.
-               This can be tested with Lotus 1-2-3 release 2.3 with the WYSIWYG addon.
-
-               If plane 3 is disabled in the attribute controller, then the output blinks.
-               This can be tested with QBASIC SCREEN 10 - anything using color #2 should
-               blink and nothing else.
-
-               If you can simplify the following and have it still work, give yourself a medal.
+            /*
+               c0 denotes the first 4bpp pixel shifted, while c1 denotes the second.
+               For 8bpp modes, the first 4bpp pixel is the upper 4 bits.
             */
-            uint32_t c0 = (dat >> 4) & 0xF;
-            uint32_t c1 = dat & 0xF;
-            c0 = ((c0 & svga->plane_mask & ~blinkmask) |
-                 ((c0 | ~svga->plane_mask) & blinkmask & blinkval)) ^ blinkmask;
-            c1 = ((c1 & svga->plane_mask & ~blinkmask) |
-                 ((c1 | ~svga->plane_mask) & blinkmask & blinkval)) ^ blinkmask;
+            uint32_t c0 = (out_edat >> (current_shift & 0x1C)) & 0xF;
+            current_shift >>= 3;
+            uint32_t c1 = (out_edat >> (current_shift & 0x1C)) & 0xF;
+            current_shift >>= 3;

            if (combine8bits) {
-                uint32_t ccombined = (c0 << 4) | c1;
-                uint32_t p0 = svga->map8[ccombined];
-                const int outoffs = (i >> 1) << dwshift;
+                uint32_t  ccombined = (c0 << 4) | c1;
+                uint32_t  p0        = svga->map8[ccombined];
+                const int outoffs   = (i >> 1) << dwshift;
                for (int subx = 0; subx < dotwidth; subx++)
                    p[outoffs + subx] = p0;
            } else {
-                uint32_t p0 = svga->pallook[svga->egapal[c0]];
-                uint32_t p1 = svga->pallook[svga->egapal[c1]];
+                uint32_t  p0      = svga->pallook[svga->egapal[c0]];
+                uint32_t  p1      = svga->pallook[svga->egapal[c1]];
                const int outoffs = i << dwshift;
                for (int subx = 0; subx < dotwidth; subx++)
                    p[outoffs + subx] = p0;