AK: Add simd_cast<T> and replace to_TxN with it

This commit is contained in:
Hendiadyoin1 2024-08-06 14:35:38 +02:00 committed by Nico Weber
parent e0242abf93
commit 832b5ff603
7 changed files with 36 additions and 70 deletions

View file

@@ -80,6 +80,13 @@ constexpr static size_t vector_length = sizeof(V) / sizeof(ElementOf<V>);
static_assert(vector_length<i8x4> == 4);
static_assert(vector_length<f32x4> == 4);
template<SIMDVector T, SIMDVector U>
requires(vector_length<T> == vector_length<U>)
ALWAYS_INLINE constexpr T simd_cast(U v)
{
return __builtin_convertvector(v, T);
}
namespace Detail {
template<typename T>
struct IndexVectorFor;

View file

@@ -29,44 +29,6 @@ ALWAYS_INLINE static constexpr u32x4 expand4(u32 u)
return u32x4 { u, u, u, u };
}
// Casting
template<typename TSrc>
ALWAYS_INLINE static u8x4 to_u8x4(TSrc v)
{
return __builtin_convertvector(v, u8x4);
}
template<typename TSrc>
ALWAYS_INLINE static u16x4 to_u16x4(TSrc v)
{
return __builtin_convertvector(v, u16x4);
}
template<typename TSrc>
ALWAYS_INLINE static u32x4 to_u32x4(TSrc v)
{
return __builtin_convertvector(v, u32x4);
}
template<typename TSrc>
ALWAYS_INLINE static i8x4 to_i8x4(TSrc v)
{
return __builtin_convertvector(v, i8x4);
}
template<typename TSrc>
ALWAYS_INLINE static i32x4 to_i32x4(TSrc v)
{
return __builtin_convertvector(v, i32x4);
}
template<typename TSrc>
ALWAYS_INLINE static f32x4 to_f32x4(TSrc v)
{
return __builtin_convertvector(v, f32x4);
}
// Masking
ALWAYS_INLINE static i32 maskbits(i32x4 mask)

View file

@@ -18,7 +18,7 @@ namespace AK::SIMD {
ALWAYS_INLINE static f32x4 truncate_int_range(f32x4 v)
{
return to_f32x4(to_i32x4(v));
return simd_cast<f32x4>(simd_cast<i32x4>(v));
}
ALWAYS_INLINE static f32x4 floor_int_range(f32x4 v)

View file

@@ -192,7 +192,7 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
TRY(buffer.try_append(simd[2]));
if constexpr (include_alpha)
TRY(buffer.try_append(simd[3]));
sum += AK::SIMD::to_i32x4(AK::SIMD::to_i8x4(simd));
sum += AK::SIMD::simd_cast<AK::SIMD::i32x4>(AK::SIMD::simd_cast<AK::SIMD::i8x4>(simd));
return {};
}
@@ -234,8 +234,8 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
TRY(up_filter.append(pixel - pixel_y_minus_1));
// The sum Orig(a) + Orig(b) shall be performed without overflow (using at least nine-bit arithmetic).
auto sum = AK::SIMD::to_u16x4(pixel_x_minus_1) + AK::SIMD::to_u16x4(pixel_y_minus_1);
auto average = AK::SIMD::to_u8x4(sum / 2);
auto sum = AK::SIMD::simd_cast<AK::SIMD::u16x4>(pixel_x_minus_1) + AK::SIMD::simd_cast<AK::SIMD::u16x4>(pixel_y_minus_1);
auto average = AK::SIMD::simd_cast<AK::SIMD::u8x4>(sum / 2);
TRY(average_filter.append(pixel - average));
TRY(paeth_filter.append(pixel - PNG::paeth_predictor(pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1)));

View file

@@ -46,9 +46,8 @@ using AK::SIMD::i32x4;
using AK::SIMD::load4_masked;
using AK::SIMD::maskbits;
using AK::SIMD::maskcount;
using AK::SIMD::simd_cast;
using AK::SIMD::store4_masked;
using AK::SIMD::to_f32x4;
using AK::SIMD::to_u32x4;
using AK::SIMD::u32x4;
static constexpr int subpixel_factor = 1 << SUBPIXEL_BITS;
@@ -84,10 +83,10 @@ static GPU::ColorType to_argb32(FloatVector4 const& color)
ALWAYS_INLINE static u32x4 to_argb32(Vector4<f32x4> const& color)
{
auto clamped = color.clamped(expand4(0.0f), expand4(1.0f));
auto r = to_u32x4(clamped.x() * 255);
auto g = to_u32x4(clamped.y() * 255);
auto b = to_u32x4(clamped.z() * 255);
auto a = to_u32x4(clamped.w() * 255);
auto r = simd_cast<u32x4>(clamped.x() * 255);
auto g = simd_cast<u32x4>(clamped.y() * 255);
auto b = simd_cast<u32x4>(clamped.z() * 255);
auto a = simd_cast<u32x4>(clamped.w() * 255);
return a << 24 | r << 16 | g << 8 | b;
}
@@ -96,10 +95,10 @@ static Vector4<f32x4> to_vec4(u32x4 bgra)
{
auto constexpr one_over_255 = expand4(1.0f / 255);
return {
to_f32x4((bgra >> 16) & 0xff) * one_over_255,
to_f32x4((bgra >> 8) & 0xff) * one_over_255,
to_f32x4(bgra & 0xff) * one_over_255,
to_f32x4((bgra >> 24) & 0xff) * one_over_255,
simd_cast<f32x4>((bgra >> 16) & 0xff) * one_over_255,
simd_cast<f32x4>((bgra >> 8) & 0xff) * one_over_255,
simd_cast<f32x4>(bgra & 0xff) * one_over_255,
simd_cast<f32x4>((bgra >> 24) & 0xff) * one_over_255,
};
}
@@ -779,9 +778,9 @@ void Device::rasterize_triangle(Triangle& triangle)
quad.mask = test_point4(edge_values);
quad.barycentrics = {
to_f32x4(edge_values.x()),
to_f32x4(edge_values.y()),
to_f32x4(edge_values.z()),
simd_cast<f32x4>(edge_values.x()),
simd_cast<f32x4>(edge_values.y()),
simd_cast<f32x4>(edge_values.z()),
};
},
[&](auto& quad) {

View file

@@ -120,7 +120,7 @@ ALWAYS_INLINE static AK::SIMD::f32x4 log2_approximate(AK::SIMD::f32x4 v)
} u { v };
// Extract just the exponent minus 1, giving a lower integral bound for log2.
auto log = AK::SIMD::to_f32x4(((u.int_val >> 23) & 255) - 128);
auto log = AK::SIMD::simd_cast<AK::SIMD::f32x4>(((u.int_val >> 23) & 255) - 128);
// Replace the exponent with 0, giving a value between 1 and 2.
u.int_val &= ~(255 << 23);
@@ -134,8 +134,8 @@ ALWAYS_INLINE static AK::SIMD::f32x4 log2_approximate(AK::SIMD::f32x4 v)
ALWAYS_INLINE static Vector2<AK::SIMD::f32x4> to_vec2_f32x4(Vector2<AK::SIMD::i32x4> const& v)
{
return {
AK::SIMD::to_f32x4(v.x()),
AK::SIMD::to_f32x4(v.y()),
AK::SIMD::simd_cast<AK::SIMD::f32x4>(v.x()),
AK::SIMD::simd_cast<AK::SIMD::f32x4>(v.y()),
};
}

View file

@@ -23,9 +23,7 @@ using AK::SIMD::expand4;
using AK::SIMD::floor_int_range;
using AK::SIMD::frac_int_range;
using AK::SIMD::maskbits;
using AK::SIMD::to_f32x4;
using AK::SIMD::to_i32x4;
using AK::SIMD::to_u32x4;
using AK::SIMD::simd_cast;
static f32x4 wrap_repeat(f32x4 value)
{
@@ -47,7 +45,7 @@ static f32x4 wrap_mirrored_repeat(f32x4 value, f32x4 num_texels)
{
f32x4 integer = floor_int_range(value);
f32x4 frac = value - integer;
auto is_odd = to_i32x4(integer) & 1;
auto is_odd = simd_cast<i32x4>(integer) & 1;
return wrap_clamp_to_edge(is_odd ? 1 - frac : frac, num_texels);
}
@@ -141,12 +139,12 @@ Vector4<AK::SIMD::f32x4> Sampler::sample_2d(Vector2<AK::SIMD::f32x4> const& uv)
auto lambda_xy = log2_approximate(scale_factor) * .5f + texture_lod_bias;
auto level = clamp(lambda_xy, min_level, max_level);
auto lower_level_texel = sample_2d_lod(uv, to_u32x4(level), m_config.texture_min_filter);
auto lower_level_texel = sample_2d_lod(uv, simd_cast<u32x4>(level), m_config.texture_min_filter);
if (m_config.mipmap_filter == GPU::MipMapFilter::Nearest)
return lower_level_texel;
auto higher_level_texel = sample_2d_lod(uv, to_u32x4(min(level + 1.f, max_level)), m_config.texture_min_filter);
auto higher_level_texel = sample_2d_lod(uv, simd_cast<u32x4>(min(level + 1.f, max_level)), m_config.texture_min_filter);
return mix(lower_level_texel, higher_level_texel, frac_int_range(level));
}
@@ -168,8 +166,8 @@ Vector4<AK::SIMD::f32x4> Sampler::sample_2d_lod(Vector2<AK::SIMD::f32x4> const&
image.height_at_level(level[3]),
};
auto f_width = to_f32x4(width);
auto f_height = to_f32x4(height);
auto f_width = simd_cast<f32x4>(width);
auto f_height = simd_cast<f32x4>(height);
u32x4 width_mask = width - 1;
u32x4 height_mask = height - 1;
@@ -178,8 +176,8 @@ Vector4<AK::SIMD::f32x4> Sampler::sample_2d_lod(Vector2<AK::SIMD::f32x4> const&
f32x4 v = wrap(uv.y(), m_config.texture_wrap_v, f_height) * f_height;
if (filter == GPU::TextureFilter::Nearest) {
u32x4 i = to_u32x4(u);
u32x4 j = to_u32x4(v);
u32x4 i = simd_cast<u32x4>(u);
u32x4 j = simd_cast<u32x4>(v);
i = image.width_is_power_of_two() ? i & width_mask : i % width;
j = image.height_is_power_of_two() ? j & height_mask : j % height;
@@ -193,9 +191,9 @@ Vector4<AK::SIMD::f32x4> Sampler::sample_2d_lod(Vector2<AK::SIMD::f32x4> const&
f32x4 const floored_u = floor_int_range(u);
f32x4 const floored_v = floor_int_range(v);
u32x4 i0 = to_u32x4(floored_u);
u32x4 i0 = simd_cast<u32x4>(floored_u);
u32x4 i1 = i0 + 1;
u32x4 j0 = to_u32x4(floored_v);
u32x4 j0 = simd_cast<u32x4>(floored_v);
u32x4 j1 = j0 + 1;
if (m_config.texture_wrap_u == GPU::TextureWrapMode::Repeat) {