LibCrypto: Implement SHA-256 by using x86 intrinsics

Co-Authored-By: Hendiadyoin1 <leon.a@serenityos.org>
Marek Knápek 2024-04-14 09:09:19 +02:00 committed by Jelle Raaijmakers
parent e01d78d8b6
commit 37c66c1520
2 changed files with 101 additions and 15 deletions


@@ -5,9 +5,17 @@
 * SPDX-License-Identifier: BSD-2-Clause
 */
#include <AK/Platform.h>
#include <AK/Types.h>
#include <LibCrypto/Hash/SHA2.h>
#if (ARCH(I386) || ARCH(X86_64)) && !defined(KERNEL)
#    include <AK/SIMD.h>
#    include <AK/SIMDExtras.h>
#    include <cpuid.h>
#    pragma GCC diagnostic ignored "-Wpsabi"
#endif
namespace Crypto::Hash {
constexpr static auto ROTRIGHT(u32 a, size_t b) { return (a >> b) | (a << (32 - b)); }
constexpr static auto CH(u32 x, u32 y, u32 z) { return (x & y) ^ (z & ~x); }
@@ -25,9 +33,12 @@ constexpr static auto EP1(u64 x) { return ROTRIGHT(x, 14) ^ ROTRIGHT(x, 18) ^ ROTRIGHT(x, 41); }
constexpr static auto SIGN0(u64 x) { return ROTRIGHT(x, 1) ^ ROTRIGHT(x, 8) ^ (x >> 7); }
constexpr static auto SIGN1(u64 x) { return ROTRIGHT(x, 19) ^ ROTRIGHT(x, 61) ^ (x >> 6); }
-inline void SHA256::transform(u8 const* data)
+static void SHA256_transform_impl_base(u32 (&state)[8], u8 const (&data)[64])
{
-    u32 m[64];
+    constexpr static auto BlockSize = 64;
+    constexpr static auto Rounds = 64;
+    u32 m[BlockSize];
    size_t i = 0;
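    // Message schedule: words 0-15 come straight from the block, words 16-63 are derived from earlier words.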
    for (size_t j = 0; i < 16; ++i, j += 4) {
@@ -38,10 +49,10 @@ inline void SHA256::transform(u8 const* data)
        m[i] = SIGN1(m[i - 2]) + m[i - 7] + SIGN0(m[i - 15]) + m[i - 16];
    }
-    auto a = m_state[0], b = m_state[1],
-         c = m_state[2], d = m_state[3],
-         e = m_state[4], f = m_state[5],
-         g = m_state[6], h = m_state[7];
+    auto a = state[0], b = state[1],
+         c = state[2], d = state[3],
+         e = state[4], f = state[5],
+         g = state[6], h = state[7];
    for (i = 0; i < Rounds; ++i) {
        auto temp0 = h + EP1(e) + CH(e, f, g) + SHA256Constants::RoundConstants[i] + m[i];
@@ -56,14 +67,89 @@ inline void SHA256::transform(u8 const* data)
        a = temp0 + temp1;
    }
-    m_state[0] += a;
-    m_state[1] += b;
-    m_state[2] += c;
-    m_state[3] += d;
-    m_state[4] += e;
-    m_state[5] += f;
-    m_state[6] += g;
-    m_state[7] += h;
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+    state[5] += f;
+    state[6] += g;
+    state[7] += h;
}
#if (ARCH(I386) || ARCH(X86_64)) && !defined(KERNEL)
// Note: The SHA extension was introduced with
// Intel Goldmont (SSE4.2), Ice Lake (AVX512), Rocket Lake (AVX512), and AMD Zen (AVX2),
// so it's safe to assume that if we have SHA, we also have at least SSE4.2.
// https://en.wikipedia.org/wiki/Intel_SHA_extensions
[[gnu::target("sha,sse4.2")]] static void SHA256_transform_impl_sha(u32 (&state)[8], u8 const (&data)[64])
{
    using AK::SIMD::i32x4, AK::SIMD::u32x4;
    u32x4 states[2] {};
    states[0] = AK::SIMD::load_unaligned<u32x4>(&state[0]);
    states[1] = AK::SIMD::load_unaligned<u32x4>(&state[4]);
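    // Shuffle the linear a..h state into the lane order sha256rnds2 expects:
    // states[0] = { f, e, b, a } ("ABEF"), states[1] = { h, g, d, c } ("CDGH").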
    auto tmp = u32x4 { states[0][1], states[0][0], states[0][3], states[0][2] };
    states[1] = u32x4 { states[1][3], states[1][2], states[1][1], states[1][0] };
    states[0] = u32x4 { states[1][2], states[1][3], tmp[0], tmp[1] };
    states[1] = u32x4 { states[1][0], states[1][1], tmp[2], tmp[3] };
    u32x4 msgs[4] {};
    u32x4 old[2] { states[0], states[1] };
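    // The 64 rounds are done in 16 iterations of four rounds each.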
    for (int i = 0; i != 16; ++i) {
        u32x4 msg {};
        if (i < 4) {
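            // Rounds 0-15: load the message block and byte-swap each word, since SHA-256 treats input as big-endian u32s.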
            msgs[i] = AK::SIMD::load_unaligned<u32x4>(&data[i * 16]);
            msgs[i] = AK::SIMD::elementwise_byte_reverse(msgs[i]);
            tmp = AK::SIMD::load_unaligned<u32x4>(&SHA256Constants::RoundConstants[i * 4]);
            msg = msgs[i] + tmp;
        } else {
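            // Rounds 16-63: extend the message schedule. sha256msg1 adds the sigma0 term,
            // the manual shuffle-add supplies w[t-7], and sha256msg2 applies the sigma1 term.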
            msgs[(i + 0) % 4] = bit_cast<u32x4>(__builtin_ia32_sha256msg1(bit_cast<i32x4>(msgs[(i + 0) % 4]), bit_cast<i32x4>(msgs[(i + 1) % 4])));
            tmp = u32x4 { msgs[(i + 2) % 4][1], msgs[(i + 2) % 4][2], msgs[(i + 2) % 4][3], msgs[(i + 3) % 4][0] };
            msgs[(i + 0) % 4] += tmp;
            msgs[(i + 0) % 4] = bit_cast<u32x4>(__builtin_ia32_sha256msg2(bit_cast<i32x4>(msgs[(i + 0) % 4]), bit_cast<i32x4>(msgs[(i + 3) % 4])));
            tmp = AK::SIMD::load_unaligned<u32x4>(&SHA256Constants::RoundConstants[i * 4]);
            msg = msgs[(i + 0) % 4] + tmp;
        }
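        // sha256rnds2 performs two rounds using the low two dwords of msg; shifting the high pair down feeds the next two rounds.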
        states[1] = bit_cast<u32x4>(__builtin_ia32_sha256rnds2(bit_cast<i32x4>(states[1]), bit_cast<i32x4>(states[0]), bit_cast<i32x4>(msg)));
        msg = u32x4 { msg[2], msg[3], 0, 0 };
        states[0] = bit_cast<u32x4>(__builtin_ia32_sha256rnds2(bit_cast<i32x4>(states[0]), bit_cast<i32x4>(states[1]), bit_cast<i32x4>(msg)));
    }
    states[0] += old[0];
    states[1] += old[1];
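    // Undo the ABEF/CDGH shuffle so the state is stored back in linear a..h order.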
    tmp = u32x4 { states[0][3], states[0][2], states[0][1], states[0][0] };
    states[1] = u32x4 { states[1][1], states[1][0], states[1][3], states[1][2] };
    states[0] = u32x4 { tmp[0], tmp[1], states[1][2], states[1][3] };
    states[1] = u32x4 { tmp[2], tmp[3], states[1][0], states[1][1] };
    AK::SIMD::store_unaligned(&state[0], states[0]);
    AK::SIMD::store_unaligned(&state[4], states[1]);
}
// FIXME: We need a custom resolver as Clang and GCC either refuse or silently ignore the `sha` target
// for function multiversioning
[[gnu::ifunc("resolve_SHA256_transform_impl")]] static void SHA256_transform_impl(u32 (&state)[8], u8 const (&data)[64]);
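// Note: With ifunc, the dynamic loader calls the resolver below once, before the first
// call, and binds SHA256_transform_impl to whichever implementation it returns.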
namespace {
extern "C" [[gnu::used]] decltype(&SHA256_transform_impl) resolve_SHA256_transform_impl()
{
    // FIXME: Use __builtin_cpu_supports("sha") when compilers support it
    constexpr u32 cpuid_sha_ebx = 1 << 29;
    u32 eax, ebx, ecx, edx;
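    // CPUID leaf 7, subleaf 0: EBX bit 29 reports support for the SHA extensions.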
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    if (ebx & cpuid_sha_ebx)
        return SHA256_transform_impl_sha;
    // FIXME: Investigate if more target clones (avx) make sense
    return SHA256_transform_impl_base;
}
}
#else
#    define SHA256_transform_impl SHA256_transform_impl_base
#endif
inline void SHA256::transform(u8 const (&data)[BlockSize])
{
    SHA256_transform_impl(m_state, data);
}
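// Illustration (not part of this commit): transform() is only the per-block compression
// step; callers stream data through the public hash interface. A minimal sketch, assuming
// LibCrypto's usual update()/digest() API:
//
//     Crypto::Hash::SHA256 sha;
//     u8 const message[] = { 'a', 'b', 'c' };
//     sha.update(message, sizeof(message)); // buffered; transform() runs per completed 64-byte block
//     auto digest = sha.digest();           // pads and compresses the final block(s)
//     // digest.data now holds the 32-byte SHA-256 of "abc"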
template<size_t BlockSize, typename Callback>


@@ -115,7 +115,7 @@ public:
    }
private:
-    inline void transform(u8 const*);
+    inline void transform(u8 const (&data)[BlockSize]);
    u8 m_data_buffer[BlockSize] {};
    size_t m_data_length { 0 };