LibGfx/PNGWriter: Use SIMD for PNG score calculation

Produces exactly the same output, but does so a bit faster.

The relative speedup is bigger at lower (weaker) compression levels:

    image -o sunset_retro.png sunset_retro.bmp --png-compression-level 0
       56.8 ms ±  1.5 ms ->  34.8 ms ± 0.9 ms (38.7% faster)

    image -o sunset_retro.png sunset_retro.bmp --png-compression-level 1
       84.6 ms ±  1.7 ms ->  64.2 ms ± 4.9 ms (24.1% faster)

    image -o sunset_retro.png sunset_retro.bmp --png-compression-level 2
      212.1 ms ±  2.5 ms -> 190.3 ms ± 1.6 ms (10.3% faster)

    image -o sunset_retro.png sunset_retro.bmp --png-compression-level 3
      671.4 ms ± 12.3 ms -> 646.5 ms ± 4.7 ms (3.7% faster)

Compression level 2 is the default, so about a 10% speedup in practice.

For comparison, `sips` needs 49.9 ms ± 3.0 ms to convert
sunset_retro.bmp to sunset_retro.png, and judging from the output file
size, it uses something similar to our compression level 1.
At compression level 1, we used to take 1.7x as long as sips; now we take 1.29x as long.
This commit is contained in:
Nico Weber 2024-08-05 23:14:59 -04:00
parent 34a4d16776
commit 781a39e613

View file

@ -183,12 +183,11 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
struct Filter {
PNG::FilterType type;
ByteBuffer buffer {};
int sum = 0;
AK::SIMD::i32x4 sum { 0, 0, 0, 0 };
ErrorOr<void> append(u8 byte)
{
TRY(buffer.try_append(byte));
sum += static_cast<i8>(byte);
return {};
}
@ -199,8 +198,17 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
TRY(append(simd[2]));
if constexpr (include_alpha)
TRY(append(simd[3]));
sum += AK::SIMD::to_i32x4(AK::SIMD::to_i8x4(simd));
return {};
}
// Collapses the per-lane running totals in `sum` into the single score used to compare filters.
i32 sum_of_signed_values() const
{
    // The SIMD append adds all four lanes into `sum`, but lane 3 is only written to the
    // buffer when include_alpha is set, so it only contributes to the score in that case.
    i32 const color_total = sum[0] + sum[1] + sum[2];
    if constexpr (include_alpha)
        return color_total + sum[3];
    else
        return color_total;
}
};
Filter none_filter { .type = PNG::FilterType::None };
@ -251,13 +259,13 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
// compute the output scanline using all five filters, and select the filter that gives the smallest sum of absolute values of outputs.
// (Consider the output bytes as signed differences for this test.)
Filter& best_filter = none_filter;
if (abs(best_filter.sum) > abs(sub_filter.sum))
if (abs(best_filter.sum_of_signed_values()) > abs(sub_filter.sum_of_signed_values()))
best_filter = sub_filter;
if (abs(best_filter.sum) > abs(up_filter.sum))
if (abs(best_filter.sum_of_signed_values()) > abs(up_filter.sum_of_signed_values()))
best_filter = up_filter;
if (abs(best_filter.sum) > abs(average_filter.sum))
if (abs(best_filter.sum_of_signed_values()) > abs(average_filter.sum_of_signed_values()))
best_filter = average_filter;
if (abs(best_filter.sum) > abs(paeth_filter.sum))
if (abs(best_filter.sum_of_signed_values()) > abs(paeth_filter.sum_of_signed_values()))
best_filter = paeth_filter;
TRY(uncompressed_block_data.try_append(to_underlying(best_filter.type)));