mirror of
https://github.com/SerenityOS/serenity.git
synced 2025-01-23 09:51:57 -05:00
LibCrypto: Implement SHA-256 by using x86 intrinsics
Co-Authored-By: Hendiadyoin1 <leon.a@serenityos.org>
This commit is contained in:
parent
e01d78d8b6
commit
37c66c1520
2 changed files with 101 additions and 15 deletions
|
@ -5,9 +5,17 @@
|
|||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/Platform.h>
|
||||
#include <AK/Types.h>
|
||||
#include <LibCrypto/Hash/SHA2.h>
|
||||
|
||||
#if (ARCH(I386) || ARCH(X86_64)) && !defined(KERNEL)
|
||||
# include <AK/SIMD.h>
|
||||
# include <AK/SIMDExtras.h>
|
||||
# include <cpuid.h>
|
||||
# pragma GCC diagnostic ignored "-Wpsabi"
|
||||
#endif
|
||||
|
||||
namespace Crypto::Hash {
|
||||
// Rotate the 32-bit value `a` right by `b` bits.
// The shift counts are reduced mod 32: the original `a << (32 - b)` performs a
// shift by 32 when b == 0, which is undefined behavior for a 32-bit operand.
// For every in-range count (1..31) the result is identical to before.
constexpr static auto ROTRIGHT(u32 a, size_t b) { return (a >> (b % 32)) | (a << ((32 - b) % 32)); }
|
||||
// CH(x, y, z): the FIPS 180-4 "choose" function — for each bit, select y where
// x is set and z where x is clear. `z ^ (x & (y ^ z))` is the classic
// branch-free equivalent of `(x & y) ^ (~x & z)`.
constexpr static auto CH(u32 x, u32 y, u32 z) { return z ^ (x & (y ^ z)); }
|
||||
|
@ -25,9 +33,12 @@ constexpr static auto EP1(u64 x) { return ROTRIGHT(x, 14) ^ ROTRIGHT(x, 18) ^ RO
|
|||
// sigma0 for the 64-bit (SHA-384/512) message schedule: rotr(x,1) ^ rotr(x,8) ^ (x >> 7), per FIPS 180-4.
// NOTE(review): relies on a u64 overload of ROTRIGHT (not fully visible in this hunk) — the u32 one above would truncate; confirm.
constexpr static auto SIGN0(u64 x) { return ROTRIGHT(x, 1) ^ ROTRIGHT(x, 8) ^ (x >> 7); }
|
||||
// sigma1 for the 64-bit (SHA-384/512) message schedule: rotr(x,19) ^ rotr(x,61) ^ (x >> 6), per FIPS 180-4.
// NOTE(review): relies on a u64 overload of ROTRIGHT (not fully visible in this hunk) — confirm.
constexpr static auto SIGN1(u64 x) { return ROTRIGHT(x, 19) ^ ROTRIGHT(x, 61) ^ (x >> 6); }
|
||||
|
||||
inline void SHA256::transform(u8 const* data)
|
||||
static void SHA256_transform_impl_base(u32 (&state)[8], u8 const (&data)[64])
|
||||
{
|
||||
u32 m[64];
|
||||
constexpr static auto BlockSize = 64;
|
||||
constexpr static auto Rounds = 64;
|
||||
|
||||
u32 m[BlockSize];
|
||||
|
||||
size_t i = 0;
|
||||
for (size_t j = 0; i < 16; ++i, j += 4) {
|
||||
|
@ -38,10 +49,10 @@ inline void SHA256::transform(u8 const* data)
|
|||
m[i] = SIGN1(m[i - 2]) + m[i - 7] + SIGN0(m[i - 15]) + m[i - 16];
|
||||
}
|
||||
|
||||
auto a = m_state[0], b = m_state[1],
|
||||
c = m_state[2], d = m_state[3],
|
||||
e = m_state[4], f = m_state[5],
|
||||
g = m_state[6], h = m_state[7];
|
||||
auto a = state[0], b = state[1],
|
||||
c = state[2], d = state[3],
|
||||
e = state[4], f = state[5],
|
||||
g = state[6], h = state[7];
|
||||
|
||||
for (i = 0; i < Rounds; ++i) {
|
||||
auto temp0 = h + EP1(e) + CH(e, f, g) + SHA256Constants::RoundConstants[i] + m[i];
|
||||
|
@ -56,14 +67,89 @@ inline void SHA256::transform(u8 const* data)
|
|||
a = temp0 + temp1;
|
||||
}
|
||||
|
||||
m_state[0] += a;
|
||||
m_state[1] += b;
|
||||
m_state[2] += c;
|
||||
m_state[3] += d;
|
||||
m_state[4] += e;
|
||||
m_state[5] += f;
|
||||
m_state[6] += g;
|
||||
m_state[7] += h;
|
||||
state[0] += a;
|
||||
state[1] += b;
|
||||
state[2] += c;
|
||||
state[3] += d;
|
||||
state[4] += e;
|
||||
state[5] += f;
|
||||
state[6] += g;
|
||||
state[7] += h;
|
||||
}
|
||||
|
||||
#if (ARCH(I386) || ARCH(X86_64)) && !defined(KERNEL)
|
||||
// Note: The SHA extension was introduced with
//       Intel Goldmont (SSE4.2), Ice Lake (AVX512), Rocket Lake (AVX512), and AMD Zen (AVX2)
//       So it's safe to assume that if we have SHA we have at least SSE4.2
//       ~https://en.wikipedia.org/wiki/Intel_SHA_extensions
// Compresses one 64-byte block into `state` using the x86 SHA-NI instructions
// (sha256msg1/sha256msg2 for message-schedule expansion, sha256rnds2 for rounds).
[[gnu::target("sha,sse4.2")]] static void SHA256_transform_impl_sha(u32 (&state)[8], u8 const (&data)[64])
{
    using AK::SIMD::i32x4, AK::SIMD::u32x4;

    // Load the eight state words and shuffle them into the (ABEF, CDGH)
    // lane ordering that sha256rnds2 operates on.
    u32x4 states[2] {};
    states[0] = AK::SIMD::load_unaligned<u32x4>(&state[0]);
    states[1] = AK::SIMD::load_unaligned<u32x4>(&state[4]);
    auto tmp = u32x4 { states[0][1], states[0][0], states[0][3], states[0][2] };
    states[1] = u32x4 { states[1][3], states[1][2], states[1][1], states[1][0] };
    states[0] = u32x4 { states[1][2], states[1][3], tmp[0], tmp[1] };
    states[1] = u32x4 { states[1][0], states[1][1], tmp[2], tmp[3] };

    u32x4 msgs[4] {};
    // Keep the incoming state: SHA-256 adds it back after the rounds (feed-forward).
    u32x4 old[2] { states[0], states[1] };
    // 16 iterations, 4 rounds each (two sha256rnds2 calls, two rounds per call) = 64 rounds.
    for (int i = 0; i != 16; ++i) {
        u32x4 msg {};
        if (i < 4) {
            // The first 16 message words come straight from the input block,
            // byte-reversed from big-endian, with the round constants pre-added.
            msgs[i] = AK::SIMD::load_unaligned<u32x4>(&data[i * 16]);
            msgs[i] = AK::SIMD::elementwise_byte_reverse(msgs[i]);
            tmp = AK::SIMD::load_unaligned<u32x4>(&SHA256Constants::RoundConstants[i * 4]);
            msg = msgs[i] + tmp;
        } else {
            // Message-schedule expansion, four words at a time:
            // W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16].
            // sha256msg1 handles the sigma0/W[t-16] part, the manual `tmp` add
            // supplies W[t-7], and sha256msg2 finishes with sigma1.
            msgs[(i + 0) % 4] = bit_cast<u32x4>(__builtin_ia32_sha256msg1(bit_cast<i32x4>(msgs[(i + 0) % 4]), bit_cast<i32x4>(msgs[(i + 1) % 4])));
            tmp = u32x4 { msgs[(i + 2) % 4][1], msgs[(i + 2) % 4][2], msgs[(i + 2) % 4][3], msgs[(i + 3) % 4][0] };
            msgs[(i + 0) % 4] += tmp;
            msgs[(i + 0) % 4] = bit_cast<u32x4>(__builtin_ia32_sha256msg2(bit_cast<i32x4>(msgs[(i + 0) % 4]), bit_cast<i32x4>(msgs[(i + 3) % 4])));
            tmp = AK::SIMD::load_unaligned<u32x4>(&SHA256Constants::RoundConstants[i * 4]);
            msg = msgs[(i + 0) % 4] + tmp;
        }
        // Two rounds using the low two lanes of `msg`, then two more using the
        // high lanes shifted down into position.
        states[1] = bit_cast<u32x4>(__builtin_ia32_sha256rnds2(bit_cast<i32x4>(states[1]), bit_cast<i32x4>(states[0]), bit_cast<i32x4>(msg)));
        msg = u32x4 { msg[2], msg[3], 0, 0 };
        states[0] = bit_cast<u32x4>(__builtin_ia32_sha256rnds2(bit_cast<i32x4>(states[0]), bit_cast<i32x4>(states[1]), bit_cast<i32x4>(msg)));
    }
    // Feed-forward: add the saved pre-block state.
    states[0] += old[0];
    states[1] += old[1];

    // Undo the (ABEF, CDGH) shuffle and store the updated state words.
    tmp = u32x4 { states[0][3], states[0][2], states[0][1], states[0][0] };
    states[1] = u32x4 { states[1][1], states[1][0], states[1][3], states[1][2] };
    states[0] = u32x4 { tmp[0], tmp[1], states[1][2], states[1][3] };
    states[1] = u32x4 { tmp[2], tmp[3], states[1][0], states[1][1] };
    AK::SIMD::store_unaligned(&state[0], states[0]);
    AK::SIMD::store_unaligned(&state[4], states[1]);
}
|
||||
// FIXME: We need a custom resolver as Clang and GCC either refuse or silently ignore the `sha` target
|
||||
// for function multiversioning
|
||||
[[gnu::ifunc("resolve_SHA256_transform_impl")]] static void SHA256_transform_impl(u32 (&state)[8], u8 const (&data)[64]);
|
||||
namespace {
|
||||
extern "C" [[gnu::used]] decltype(&SHA256_transform_impl) resolve_SHA256_transform_impl()
|
||||
{
|
||||
// FIXME: Use __builtin_cpu_supports("sha") when compilers support it
|
||||
constexpr u32 cpuid_sha_ebx = 1 << 29;
|
||||
u32 eax, ebx, ecx, edx;
|
||||
__cpuid_count(7, 0, eax, ebx, ecx, edx);
|
||||
if (ebx & cpuid_sha_ebx)
|
||||
return SHA256_transform_impl_sha;
|
||||
|
||||
// FIXME: Investigate if more target clones (avx) make sense
|
||||
|
||||
return SHA256_transform_impl_base;
|
||||
}
|
||||
}
|
||||
#else
|
||||
# define SHA256_transform_impl SHA256_transform_impl_base
|
||||
#endif
|
||||
|
||||
// Compresses one 64-byte message block into m_state by forwarding to
// SHA256_transform_impl (ifunc-selected SHA-NI or scalar on x86 userland;
// aliased directly to the scalar implementation elsewhere).
inline void SHA256::transform(u8 const (&data)[BlockSize])
{
    SHA256_transform_impl(m_state, data);
}
|
||||
|
||||
template<size_t BlockSize, typename Callback>
|
||||
|
|
|
@ -115,7 +115,7 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
inline void transform(u8 const*);
|
||||
inline void transform(u8 const (&data)[BlockSize]);
|
||||
|
||||
u8 m_data_buffer[BlockSize] {};
|
||||
size_t m_data_length { 0 };
|
||||
|
|
Loading…
Add table
Reference in a new issue