LibGfx/PNGWriter: Compute which predictor to use first, store data then

Before, we would compute and store the output of each predictor,
then pick the best one, and then copy its data.

Now, we compute the output of each predictor but only compute its
score and do not store the predicted data. We then pick the best
one, and do a second pass that re-computes the output of the best
predictor, and stores it.

Previously we computed the output of each of the 5 predictors exactly
once. Now we compute the output of all 5 predictors, and then the output
of the chosen one a second time. In exchange, we only write each output row once
instead of 5 times. (We also have to read the input row twice instead of
once, but the second time round it'll come from the L1 or L2 cache.)
Making the simplifying assumption that each predictor takes the same
time to compute, this increases compute to 6/5ths, and reduces memory
bandwidth to 3/6ths, i.e. half. (Before: 1 input row read, 5 output row
writes; after: 2 input row reads, 1 output row write.)

Produces exactly the same output, but is faster:

    image -o sunset_retro.png sunset_retro.bmp --png-compression-level 0
       34.8 ms ± 0.9 ms ->  22.7 ms ± 0.8 ms (34.7% faster)

    image -o sunset_retro.png sunset_retro.bmp --png-compression-level 1
       64.2 ms ± 4.9 ms ->  50.5 ms ± 0.5 ms (21.3% faster)

    image -o sunset_retro.png sunset_retro.bmp --png-compression-level 2
      190.3 ms ± 1.6 ms -> 179.0 ms ± 2.8 ms (5.9% faster)

    image -o sunset_retro.png sunset_retro.bmp --png-compression-level 3
      646.5 ms ± 4.7 ms -> 635.3 ms ± 4.4 ms (1.7% faster)

Compression level 2 is the default, so about a 6% speedup in practice.

`sips` still needs 49.9 ms ± 3.0 ms to convert sunset_retro.bmp to
sunset_retro.png at its default compression level 1.
We used to take 1.29x as long as sips, now we take 1.01x as long,
while producing a smaller output :^)

(For other, larger, input files sips is still faster and produces
smaller output.)
This commit is contained in:
Nico Weber 2024-08-10 13:30:11 -04:00
parent ae57f6cad6
commit 32855d2c49

View file

@ -182,7 +182,6 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
struct Filter {
PNG::FilterType type;
ByteBuffer buffer {};
AK::SIMD::i32x4 sum { 0, 0, 0, 0 };
AK::SIMD::u8x4 predict(AK::SIMD::u8x4 pixel, AK::SIMD::u8x4 pixel_x_minus_1, AK::SIMD::u8x4 pixel_y_minus_1, AK::SIMD::u8x4 pixel_xy_minus_1)
@ -206,15 +205,9 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
VERIFY_NOT_REACHED();
}
ErrorOr<void> append(AK::SIMD::u8x4 simd)
void append(AK::SIMD::u8x4 simd)
{
TRY(buffer.try_append(simd[0]));
TRY(buffer.try_append(simd[1]));
TRY(buffer.try_append(simd[2]));
if constexpr (include_alpha)
TRY(buffer.try_append(simd[3]));
sum += AK::SIMD::simd_cast<AK::SIMD::i32x4>(AK::SIMD::simd_cast<AK::SIMD::i8x4>(simd));
return {};
}
i32 sum_of_signed_values() const
@ -227,19 +220,10 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
};
Filter none_filter { .type = PNG::FilterType::None };
TRY(none_filter.buffer.try_ensure_capacity(sizeof(Pixel) * bitmap.width()));
Filter sub_filter { .type = PNG::FilterType::Sub };
TRY(sub_filter.buffer.try_ensure_capacity(sizeof(Pixel) * bitmap.width()));
Filter up_filter { .type = PNG::FilterType::Up };
TRY(up_filter.buffer.try_ensure_capacity(sizeof(Pixel) * bitmap.width()));
Filter average_filter { .type = PNG::FilterType::Average };
TRY(average_filter.buffer.try_ensure_capacity(sizeof(ARGB32) * bitmap.width()));
Filter paeth_filter { .type = PNG::FilterType::Paeth };
TRY(paeth_filter.buffer.try_ensure_capacity(sizeof(ARGB32) * bitmap.width()));
auto pixel_x_minus_1 = Pixel::gfx_to_png(dummy_scanline[0]);
auto pixel_xy_minus_1 = Pixel::gfx_to_png(dummy_scanline[0]);
@ -248,18 +232,16 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
auto pixel = Pixel::gfx_to_png(scanline[x]);
auto pixel_y_minus_1 = Pixel::gfx_to_png(scanline_minus_1[x]);
TRY(none_filter.append(none_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1)));
TRY(sub_filter.append(sub_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1)));
TRY(up_filter.append(up_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1)));
TRY(average_filter.append(average_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1)));
TRY(paeth_filter.append(paeth_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1)));
none_filter.append(none_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1));
sub_filter.append(sub_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1));
up_filter.append(up_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1));
average_filter.append(average_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1));
paeth_filter.append(paeth_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1));
pixel_x_minus_1 = pixel;
pixel_xy_minus_1 = pixel_y_minus_1;
}
scanline_minus_1 = scanline;
// 12.8 Filter selection: https://www.w3.org/TR/PNG/#12Filter-selection
// For best compression of truecolour and greyscale images, the recommended approach
// is adaptive filtering in which a filter is chosen for each scanline.
@ -276,8 +258,31 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
if (abs(best_filter.sum_of_signed_values()) > abs(paeth_filter.sum_of_signed_values()))
best_filter = paeth_filter;
ByteBuffer buffer {};
TRY(buffer.try_ensure_capacity(sizeof(ARGB32) * bitmap.width()));
pixel_x_minus_1 = Pixel::gfx_to_png(dummy_scanline[0]);
pixel_xy_minus_1 = Pixel::gfx_to_png(dummy_scanline[0]);
for (int x = 0; x < bitmap.width(); ++x) {
auto pixel = Pixel::gfx_to_png(scanline[x]);
auto pixel_y_minus_1 = Pixel::gfx_to_png(scanline_minus_1[x]);
auto predicted_pixel = best_filter.predict(pixel, pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1);
TRY(buffer.try_append(predicted_pixel[0]));
TRY(buffer.try_append(predicted_pixel[1]));
TRY(buffer.try_append(predicted_pixel[2]));
if constexpr (include_alpha)
TRY(buffer.try_append(predicted_pixel[3]));
pixel_x_minus_1 = pixel;
pixel_xy_minus_1 = pixel_y_minus_1;
}
TRY(uncompressed_block_data.try_append(to_underlying(best_filter.type)));
TRY(uncompressed_block_data.try_append(best_filter.buffer));
TRY(uncompressed_block_data.try_append(buffer));
scanline_minus_1 = scanline;
}
TRY(png_chunk.compress_and_add(uncompressed_block_data, compression_level));