AK: Add simd_cast<T> and replace to_TxN with it

This commit is contained in:
Hendiadyoin1 2024-08-06 14:35:38 +02:00 committed by Nico Weber
parent e0242abf93
commit 832b5ff603
7 changed files with 36 additions and 70 deletions

View file

@@ -80,6 +80,13 @@ constexpr static size_t vector_length = sizeof(V) / sizeof(ElementOf<V>);
static_assert(vector_length<i8x4> == 4);
static_assert(vector_length<f32x4> == 4);
template<SIMDVector T, SIMDVector U>
requires(vector_length<T> == vector_length<U>)
ALWAYS_INLINE constexpr T simd_cast(U v)
{
return __builtin_convertvector(v, T);
}
namespace Detail {
template<typename T>
struct IndexVectorFor;

View file

@@ -29,44 +29,6 @@ ALWAYS_INLINE static constexpr u32x4 expand4(u32 u)
return u32x4 { u, u, u, u };
}
// Casting
template<typename TSrc>
ALWAYS_INLINE static u8x4 to_u8x4(TSrc v)
{
return __builtin_convertvector(v, u8x4);
}
template<typename TSrc>
ALWAYS_INLINE static u16x4 to_u16x4(TSrc v)
{
return __builtin_convertvector(v, u16x4);
}
template<typename TSrc>
ALWAYS_INLINE static u32x4 to_u32x4(TSrc v)
{
return __builtin_convertvector(v, u32x4);
}
template<typename TSrc>
ALWAYS_INLINE static i8x4 to_i8x4(TSrc v)
{
return __builtin_convertvector(v, i8x4);
}
template<typename TSrc>
ALWAYS_INLINE static i32x4 to_i32x4(TSrc v)
{
return __builtin_convertvector(v, i32x4);
}
template<typename TSrc>
ALWAYS_INLINE static f32x4 to_f32x4(TSrc v)
{
return __builtin_convertvector(v, f32x4);
}
// Masking
ALWAYS_INLINE static i32 maskbits(i32x4 mask)

View file

@@ -18,7 +18,7 @@ namespace AK::SIMD {
ALWAYS_INLINE static f32x4 truncate_int_range(f32x4 v)
{
return to_f32x4(to_i32x4(v));
return simd_cast<f32x4>(simd_cast<i32x4>(v));
}
ALWAYS_INLINE static f32x4 floor_int_range(f32x4 v)

View file

@@ -192,7 +192,7 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
TRY(buffer.try_append(simd[2]));
if constexpr (include_alpha)
TRY(buffer.try_append(simd[3]));
sum += AK::SIMD::to_i32x4(AK::SIMD::to_i8x4(simd));
sum += AK::SIMD::simd_cast<AK::SIMD::i32x4>(AK::SIMD::simd_cast<AK::SIMD::i8x4>(simd));
return {};
}
@@ -234,8 +234,8 @@ ErrorOr<void> PNGWriter::add_IDAT_chunk(Gfx::Bitmap const& bitmap, Compress::Zli
TRY(up_filter.append(pixel - pixel_y_minus_1));
// The sum Orig(a) + Orig(b) shall be performed without overflow (using at least nine-bit arithmetic).
auto sum = AK::SIMD::to_u16x4(pixel_x_minus_1) + AK::SIMD::to_u16x4(pixel_y_minus_1);
auto average = AK::SIMD::to_u8x4(sum / 2);
auto sum = AK::SIMD::simd_cast<AK::SIMD::u16x4>(pixel_x_minus_1) + AK::SIMD::simd_cast<AK::SIMD::u16x4>(pixel_y_minus_1);
auto average = AK::SIMD::simd_cast<AK::SIMD::u8x4>(sum / 2);
TRY(average_filter.append(pixel - average));
TRY(paeth_filter.append(pixel - PNG::paeth_predictor(pixel_x_minus_1, pixel_y_minus_1, pixel_xy_minus_1)));

View file

@@ -46,9 +46,8 @@ using AK::SIMD::i32x4;
using AK::SIMD::load4_masked;
using AK::SIMD::maskbits;
using AK::SIMD::maskcount;
using AK::SIMD::simd_cast;
using AK::SIMD::store4_masked;
using AK::SIMD::to_f32x4;
using AK::SIMD::to_u32x4;
using AK::SIMD::u32x4;
static constexpr int subpixel_factor = 1 << SUBPIXEL_BITS;
@@ -84,10 +83,10 @@ static GPU::ColorType to_argb32(FloatVector4 const& color)
ALWAYS_INLINE static u32x4 to_argb32(Vector4<f32x4> const& color)
{
auto clamped = color.clamped(expand4(0.0f), expand4(1.0f));
auto r = to_u32x4(clamped.x() * 255);
auto g = to_u32x4(clamped.y() * 255);
auto b = to_u32x4(clamped.z() * 255);
auto a = to_u32x4(clamped.w() * 255);
auto r = simd_cast<u32x4>(clamped.x() * 255);
auto g = simd_cast<u32x4>(clamped.y() * 255);
auto b = simd_cast<u32x4>(clamped.z() * 255);
auto a = simd_cast<u32x4>(clamped.w() * 255);
return a << 24 | r << 16 | g << 8 | b;
}
@@ -96,10 +95,10 @@ static Vector4<f32x4> to_vec4(u32x4 bgra)
{
auto constexpr one_over_255 = expand4(1.0f / 255);
return {
to_f32x4((bgra >> 16) & 0xff) * one_over_255,
to_f32x4((bgra >> 8) & 0xff) * one_over_255,
to_f32x4(bgra & 0xff) * one_over_255,
to_f32x4((bgra >> 24) & 0xff) * one_over_255,
simd_cast<f32x4>((bgra >> 16) & 0xff) * one_over_255,
simd_cast<f32x4>((bgra >> 8) & 0xff) * one_over_255,
simd_cast<f32x4>(bgra & 0xff) * one_over_255,
simd_cast<f32x4>((bgra >> 24) & 0xff) * one_over_255,
};
}
@@ -779,9 +778,9 @@ void Device::rasterize_triangle(Triangle& triangle)
quad.mask = test_point4(edge_values);
quad.barycentrics = {
to_f32x4(edge_values.x()),
to_f32x4(edge_values.y()),
to_f32x4(edge_values.z()),
simd_cast<f32x4>(edge_values.x()),
simd_cast<f32x4>(edge_values.y()),
simd_cast<f32x4>(edge_values.z()),
};
},
[&](auto& quad) {

View file

@@ -120,7 +120,7 @@ ALWAYS_INLINE static AK::SIMD::f32x4 log2_approximate(AK::SIMD::f32x4 v)
} u { v };
// Extract just the exponent minus 1, giving a lower integral bound for log2.
auto log = AK::SIMD::to_f32x4(((u.int_val >> 23) & 255) - 128);
auto log = AK::SIMD::simd_cast<AK::SIMD::f32x4>(((u.int_val >> 23) & 255) - 128);
// Replace the exponent with 0, giving a value between 1 and 2.
u.int_val &= ~(255 << 23);
@@ -134,8 +134,8 @@ ALWAYS_INLINE static AK::SIMD::f32x4 log2_approximate(AK::SIMD::f32x4 v)
ALWAYS_INLINE static Vector2<AK::SIMD::f32x4> to_vec2_f32x4(Vector2<AK::SIMD::i32x4> const& v)
{
return {
AK::SIMD::to_f32x4(v.x()),
AK::SIMD::to_f32x4(v.y()),
AK::SIMD::simd_cast<AK::SIMD::f32x4>(v.x()),
AK::SIMD::simd_cast<AK::SIMD::f32x4>(v.y()),
};
}

View file

@@ -23,9 +23,7 @@ using AK::SIMD::expand4;
using AK::SIMD::floor_int_range;
using AK::SIMD::frac_int_range;
using AK::SIMD::maskbits;
using AK::SIMD::to_f32x4;
using AK::SIMD::to_i32x4;
using AK::SIMD::to_u32x4;
using AK::SIMD::simd_cast;
static f32x4 wrap_repeat(f32x4 value)
{
@@ -47,7 +45,7 @@ static f32x4 wrap_mirrored_repeat(f32x4 value, f32x4 num_texels)
{
f32x4 integer = floor_int_range(value);
f32x4 frac = value - integer;
auto is_odd = to_i32x4(integer) & 1;
auto is_odd = simd_cast<i32x4>(integer) & 1;
return wrap_clamp_to_edge(is_odd ? 1 - frac : frac, num_texels);
}
@@ -141,12 +139,12 @@ Vector4<AK::SIMD::f32x4> Sampler::sample_2d(Vector2<AK::SIMD::f32x4> const& uv)
auto lambda_xy = log2_approximate(scale_factor) * .5f + texture_lod_bias;
auto level = clamp(lambda_xy, min_level, max_level);
auto lower_level_texel = sample_2d_lod(uv, to_u32x4(level), m_config.texture_min_filter);
auto lower_level_texel = sample_2d_lod(uv, simd_cast<u32x4>(level), m_config.texture_min_filter);
if (m_config.mipmap_filter == GPU::MipMapFilter::Nearest)
return lower_level_texel;
auto higher_level_texel = sample_2d_lod(uv, to_u32x4(min(level + 1.f, max_level)), m_config.texture_min_filter);
auto higher_level_texel = sample_2d_lod(uv, simd_cast<u32x4>(min(level + 1.f, max_level)), m_config.texture_min_filter);
return mix(lower_level_texel, higher_level_texel, frac_int_range(level));
}
@@ -168,8 +166,8 @@ Vector4<AK::SIMD::f32x4> Sampler::sample_2d_lod(Vector2<AK::SIMD::f32x4> const&
image.height_at_level(level[3]),
};
auto f_width = to_f32x4(width);
auto f_height = to_f32x4(height);
auto f_width = simd_cast<f32x4>(width);
auto f_height = simd_cast<f32x4>(height);
u32x4 width_mask = width - 1;
u32x4 height_mask = height - 1;
@@ -178,8 +176,8 @@ Vector4<AK::SIMD::f32x4> Sampler::sample_2d_lod(Vector2<AK::SIMD::f32x4> const&
f32x4 v = wrap(uv.y(), m_config.texture_wrap_v, f_height) * f_height;
if (filter == GPU::TextureFilter::Nearest) {
u32x4 i = to_u32x4(u);
u32x4 j = to_u32x4(v);
u32x4 i = simd_cast<u32x4>(u);
u32x4 j = simd_cast<u32x4>(v);
i = image.width_is_power_of_two() ? i & width_mask : i % width;
j = image.height_is_power_of_two() ? j & height_mask : j % height;
@@ -193,9 +191,9 @@ Vector4<AK::SIMD::f32x4> Sampler::sample_2d_lod(Vector2<AK::SIMD::f32x4> const&
f32x4 const floored_u = floor_int_range(u);
f32x4 const floored_v = floor_int_range(v);
u32x4 i0 = to_u32x4(floored_u);
u32x4 i0 = simd_cast<u32x4>(floored_u);
u32x4 i1 = i0 + 1;
u32x4 j0 = to_u32x4(floored_v);
u32x4 j0 = simd_cast<u32x4>(floored_v);
u32x4 j1 = j0 + 1;
if (m_config.texture_wrap_u == GPU::TextureWrapMode::Repeat) {