LibCrypto: Implement SHA-256 by using x86 intrinsics

Co-Authored-By: Hendiadyoin1 <leon.a@serenityos.org>
Marek Knápek 2024-04-14 09:09:19 +02:00 committed by Jelle Raaijmakers
parent e01d78d8b6
commit 37c66c1520
2 changed files with 101 additions and 15 deletions


@@ -5,9 +5,17 @@
 * SPDX-License-Identifier: BSD-2-Clause
 */
#include <AK/Platform.h>
#include <AK/Types.h>
#include <LibCrypto/Hash/SHA2.h>
#if (ARCH(I386) || ARCH(X86_64)) && !defined(KERNEL)
#    include <AK/SIMD.h>
#    include <AK/SIMDExtras.h>
#    include <cpuid.h>
#    pragma GCC diagnostic ignored "-Wpsabi"
#endif
namespace Crypto::Hash {
constexpr static auto ROTRIGHT(u32 a, size_t b) { return (a >> b) | (a << (32 - b)); }
constexpr static auto CH(u32 x, u32 y, u32 z) { return (x & y) ^ (z & ~x); }
@@ -25,9 +33,12 @@ constexpr static auto EP1(u64 x) { return ROTRIGHT(x, 14) ^ ROTRIGHT(x, 18) ^ ROTRIGHT(x, 41); }
constexpr static auto SIGN0(u64 x) { return ROTRIGHT(x, 1) ^ ROTRIGHT(x, 8) ^ (x >> 7); }
constexpr static auto SIGN1(u64 x) { return ROTRIGHT(x, 19) ^ ROTRIGHT(x, 61) ^ (x >> 6); }
-inline void SHA256::transform(u8 const* data)
+static void SHA256_transform_impl_base(u32 (&state)[8], u8 const (&data)[64])
{
-    u32 m[64];
+    constexpr static auto BlockSize = 64;
+    constexpr static auto Rounds = 64;
+    u32 m[BlockSize];
    size_t i = 0;
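    // Message schedule: words 0-15 come straight from the block, words 16-63 are derived from earlier words.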
    for (size_t j = 0; i < 16; ++i, j += 4) {
@@ -38,10 +49,10 @@ inline void SHA256::transform(u8 const* data)
        m[i] = SIGN1(m[i - 2]) + m[i - 7] + SIGN0(m[i - 15]) + m[i - 16];
    }
-    auto a = m_state[0], b = m_state[1],
-         c = m_state[2], d = m_state[3],
-         e = m_state[4], f = m_state[5],
-         g = m_state[6], h = m_state[7];
+    auto a = state[0], b = state[1],
+         c = state[2], d = state[3],
+         e = state[4], f = state[5],
+         g = state[6], h = state[7];
    for (i = 0; i < Rounds; ++i) {
        auto temp0 = h + EP1(e) + CH(e, f, g) + SHA256Constants::RoundConstants[i] + m[i];
@@ -56,14 +67,89 @@ inline void SHA256::transform(u8 const* data)
        a = temp0 + temp1;
    }
-    m_state[0] += a;
-    m_state[1] += b;
-    m_state[2] += c;
-    m_state[3] += d;
-    m_state[4] += e;
-    m_state[5] += f;
-    m_state[6] += g;
-    m_state[7] += h;
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+    state[5] += f;
+    state[6] += g;
+    state[7] += h;
}
#if (ARCH(I386) || ARCH(X86_64)) && !defined(KERNEL)
// Note: The SHA extension was introduced with
// Intel Goldmont (SSE4.2), Ice Lake (AVX512), Rocket Lake (AVX512), and AMD Zen (AVX2),
// so it's safe to assume that if we have SHA, we also have at least SSE4.2.
// https://en.wikipedia.org/wiki/Intel_SHA_extensions
[[gnu::target("sha,sse4.2")]] static void SHA256_transform_impl_sha(u32 (&state)[8], u8 const (&data)[64])
{
    using AK::SIMD::i32x4, AK::SIMD::u32x4;
    u32x4 states[2] {};
    states[0] = AK::SIMD::load_unaligned<u32x4>(&state[0]);
    states[1] = AK::SIMD::load_unaligned<u32x4>(&state[4]);
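    // Shuffle the linear a..h state into the lane order sha256rnds2 expects:
    // states[0] = { f, e, b, a } ("ABEF"), states[1] = { h, g, d, c } ("CDGH").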
    auto tmp = u32x4 { states[0][1], states[0][0], states[0][3], states[0][2] };
    states[1] = u32x4 { states[1][3], states[1][2], states[1][1], states[1][0] };
    states[0] = u32x4 { states[1][2], states[1][3], tmp[0], tmp[1] };
    states[1] = u32x4 { states[1][0], states[1][1], tmp[2], tmp[3] };
    u32x4 msgs[4] {};
    u32x4 old[2] { states[0], states[1] };
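    // The 64 rounds are done in 16 iterations of four rounds each.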
    for (int i = 0; i != 16; ++i) {
        u32x4 msg {};
        if (i < 4) {
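            // Rounds 0-15: load the message block and byte-swap each word, since SHA-256 treats input as big-endian u32s.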
            msgs[i] = AK::SIMD::load_unaligned<u32x4>(&data[i * 16]);
            msgs[i] = AK::SIMD::elementwise_byte_reverse(msgs[i]);
            tmp = AK::SIMD::load_unaligned<u32x4>(&SHA256Constants::RoundConstants[i * 4]);
            msg = msgs[i] + tmp;
        } else {
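            // Rounds 16-63: extend the message schedule. sha256msg1 adds the sigma0 term,
            // the manual shuffle-add supplies w[t-7], and sha256msg2 applies the sigma1 term.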
            msgs[(i + 0) % 4] = bit_cast<u32x4>(__builtin_ia32_sha256msg1(bit_cast<i32x4>(msgs[(i + 0) % 4]), bit_cast<i32x4>(msgs[(i + 1) % 4])));
            tmp = u32x4 { msgs[(i + 2) % 4][1], msgs[(i + 2) % 4][2], msgs[(i + 2) % 4][3], msgs[(i + 3) % 4][0] };
            msgs[(i + 0) % 4] += tmp;
            msgs[(i + 0) % 4] = bit_cast<u32x4>(__builtin_ia32_sha256msg2(bit_cast<i32x4>(msgs[(i + 0) % 4]), bit_cast<i32x4>(msgs[(i + 3) % 4])));
            tmp = AK::SIMD::load_unaligned<u32x4>(&SHA256Constants::RoundConstants[i * 4]);
            msg = msgs[(i + 0) % 4] + tmp;
        }
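        // sha256rnds2 performs two rounds using the low two dwords of msg; shifting the high pair down feeds the next two rounds.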
        states[1] = bit_cast<u32x4>(__builtin_ia32_sha256rnds2(bit_cast<i32x4>(states[1]), bit_cast<i32x4>(states[0]), bit_cast<i32x4>(msg)));
        msg = u32x4 { msg[2], msg[3], 0, 0 };
        states[0] = bit_cast<u32x4>(__builtin_ia32_sha256rnds2(bit_cast<i32x4>(states[0]), bit_cast<i32x4>(states[1]), bit_cast<i32x4>(msg)));
    }
    states[0] += old[0];
    states[1] += old[1];
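    // Undo the ABEF/CDGH shuffle so the state is stored back in linear a..h order.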
    tmp = u32x4 { states[0][3], states[0][2], states[0][1], states[0][0] };
    states[1] = u32x4 { states[1][1], states[1][0], states[1][3], states[1][2] };
    states[0] = u32x4 { tmp[0], tmp[1], states[1][2], states[1][3] };
    states[1] = u32x4 { tmp[2], tmp[3], states[1][0], states[1][1] };
    AK::SIMD::store_unaligned(&state[0], states[0]);
    AK::SIMD::store_unaligned(&state[4], states[1]);
}
// FIXME: We need a custom resolver as Clang and GCC either refuse or silently ignore the `sha` target
// for function multiversioning
[[gnu::ifunc("resolve_SHA256_transform_impl")]] static void SHA256_transform_impl(u32 (&state)[8], u8 const (&data)[64]);
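// Note: With ifunc, the dynamic loader calls the resolver below once, before the first
// call, and binds SHA256_transform_impl to whichever implementation it returns.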
namespace {
extern "C" [[gnu::used]] decltype(&SHA256_transform_impl) resolve_SHA256_transform_impl()
{
    // FIXME: Use __builtin_cpu_supports("sha") when compilers support it
    constexpr u32 cpuid_sha_ebx = 1 << 29;
    u32 eax, ebx, ecx, edx;
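    // CPUID leaf 7, subleaf 0: EBX bit 29 reports support for the SHA extensions.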
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
    if (ebx & cpuid_sha_ebx)
        return SHA256_transform_impl_sha;
    // FIXME: Investigate if more target clones (avx) make sense
    return SHA256_transform_impl_base;
}
}
#else
#    define SHA256_transform_impl SHA256_transform_impl_base
#endif
inline void SHA256::transform(u8 const (&data)[BlockSize])
{
    SHA256_transform_impl(m_state, data);
}
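// Illustration (not part of this commit): transform() is only the per-block compression
// step; callers stream data through the public hash interface. A minimal sketch, assuming
// LibCrypto's usual update()/digest() API:
//
//     Crypto::Hash::SHA256 sha;
//     u8 const message[] = { 'a', 'b', 'c' };
//     sha.update(message, sizeof(message)); // buffered; transform() runs per completed 64-byte block
//     auto digest = sha.digest();           // pads and compresses the final block(s)
//     // digest.data now holds the 32-byte SHA-256 of "abc"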
template<size_t BlockSize, typename Callback>


@@ -115,7 +115,7 @@ public:
    }
private:
-    inline void transform(u8 const*);
+    inline void transform(u8 const (&data)[BlockSize]);
    u8 m_data_buffer[BlockSize] {};
    size_t m_data_length { 0 };