From 71c29504af27c3fc2ea770e98b3957a97aa9b9b9 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Thu, 18 Jul 2024 11:57:01 -0400 Subject: [PATCH] AK: Support non-native endianness in Utf16View Utf16View currently assumes host endianness. Add support for specifying either big or little endianness (which we mostly just pipe through to simdutf). This will allow using simdutf facilities with LibTextCodec. --- AK/Endian.h | 7 ++ AK/String.cpp | 30 +++++-- AK/Utf16View.cpp | 172 ++++++++++++++++++++++++++--------------- AK/Utf16View.h | 28 ++++--- Tests/AK/TestUtf16.cpp | 70 ++++++++++++++++- 5 files changed, 228 insertions(+), 79 deletions(-) diff --git a/AK/Endian.h b/AK/Endian.h index 2a3aa2ce570..71435e6d34d 100644 --- a/AK/Endian.h +++ b/AK/Endian.h @@ -11,12 +11,19 @@ #include namespace AK { + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ inline constexpr static bool HostIsLittleEndian = true; #else inline constexpr static bool HostIsLittleEndian = false; #endif +enum class Endianness { + Host, + Big, + Little, +}; + template ALWAYS_INLINE constexpr T convert_between_host_and_little_endian(T value) { diff --git a/AK/String.cpp b/AK/String.cpp index a7ea72d8ea8..ef679ea6931 100644 --- a/AK/String.cpp +++ b/AK/String.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -51,15 +52,30 @@ ErrorOr String::from_utf16(Utf16View const& utf16) String result; - auto utf8_length = simdutf::utf8_length_from_utf16( - reinterpret_cast(utf16.data()), - utf16.length_in_code_units()); + auto utf8_length = [&]() { + switch (utf16.endianness()) { + case Endianness::Host: + return simdutf::utf8_length_from_utf16(utf16.char_data(), utf16.length_in_code_units()); + case Endianness::Big: + return simdutf::utf8_length_from_utf16be(utf16.char_data(), utf16.length_in_code_units()); + case Endianness::Little: + return simdutf::utf8_length_from_utf16le(utf16.char_data(), utf16.length_in_code_units()); + } + VERIFY_NOT_REACHED(); + }(); TRY(result.replace_with_new_string(utf8_length, [&](Bytes buffer) -> ErrorOr { - [[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8( - reinterpret_cast(utf16.data()), - utf16.length_in_code_units(), - reinterpret_cast(buffer.data())); + [[maybe_unused]] auto result = [&]() { + switch (utf16.endianness()) { + case Endianness::Host: + return simdutf::convert_utf16_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast(buffer.data())); + case Endianness::Big: + return simdutf::convert_utf16be_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast(buffer.data())); + case Endianness::Little: + return simdutf::convert_utf16le_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast(buffer.data())); + } + VERIFY_NOT_REACHED(); + }(); ASSERT(result == buffer.size()); return {}; diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index a6e569fbb45..324e042e438 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -25,71 +25,102 @@ static constexpr u16 low_surrogate_max = 0xdfff; static constexpr u32 replacement_code_point = 0xfffd; static constexpr u32 first_supplementary_plane_code_point = 0x10000; +static constexpr u16 host_code_unit(u16 code_unit, Endianness endianness) +{ + switch (endianness) { + case Endianness::Host: + return code_unit; + case Endianness::Big: + return convert_between_host_and_big_endian(code_unit); + case Endianness::Little: + return convert_between_host_and_little_endian(code_unit); + } + VERIFY_NOT_REACHED(); +} + template UtfViewType> -static ErrorOr to_utf16_slow(UtfViewType const& view) +static ErrorOr to_utf16_slow(UtfViewType const& view, Endianness endianness) { Utf16Data utf16_data; TRY(utf16_data.try_ensure_capacity(view.length())); for (auto code_point : view) - TRY(code_point_to_utf16(utf16_data, code_point)); + TRY(code_point_to_utf16(utf16_data, code_point, endianness)); return utf16_data; } -ErrorOr utf8_to_utf16(StringView utf8_view) +ErrorOr utf8_to_utf16(StringView utf8_view, Endianness endianness) { - return utf8_to_utf16(Utf8View { utf8_view }); + return utf8_to_utf16(Utf8View { utf8_view }, endianness); } -ErrorOr utf8_to_utf16(Utf8View const& utf8_view) +ErrorOr utf8_to_utf16(Utf8View const& utf8_view, Endianness endianness) { // All callers want to allow lonely surrogates, which simdutf does not permit. if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]] - return to_utf16_slow(utf8_view); + return to_utf16_slow(utf8_view, endianness); + + auto const* data = reinterpret_cast(utf8_view.bytes()); + auto length = utf8_view.byte_length(); Utf16Data utf16_data; + TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(data, length))); - TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8( - reinterpret_cast(utf8_view.bytes()), - utf8_view.byte_length()))); - - [[maybe_unused]] auto result = simdutf::convert_utf8_to_utf16( - reinterpret_cast(utf8_view.bytes()), - utf8_view.byte_length(), - reinterpret_cast(utf16_data.data())); + [[maybe_unused]] auto result = [&]() { + switch (endianness) { + case Endianness::Host: + return simdutf::convert_utf8_to_utf16(data, length, reinterpret_cast(utf16_data.data())); + case Endianness::Big: + return simdutf::convert_utf8_to_utf16be(data, length, reinterpret_cast(utf16_data.data())); + case Endianness::Little: + return simdutf::convert_utf8_to_utf16le(data, length, reinterpret_cast(utf16_data.data())); + } + VERIFY_NOT_REACHED(); + }(); ASSERT(result == utf16_data.size()); return utf16_data; } -ErrorOr utf32_to_utf16(Utf32View const& utf32_view) +ErrorOr utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness) { + auto const* data = reinterpret_cast(utf32_view.code_points()); + auto length = utf32_view.length(); + Utf16Data utf16_data; + TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32(data, length))); - TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32( - reinterpret_cast(utf32_view.code_points()), - utf32_view.length()))); - - [[maybe_unused]] auto result = simdutf::convert_utf32_to_utf16( - reinterpret_cast(utf32_view.code_points()), - utf32_view.length(), - reinterpret_cast(utf16_data.data())); + [[maybe_unused]] auto result = [&]() { + switch (endianness) { + case Endianness::Host: + return simdutf::convert_utf32_to_utf16(data, length, reinterpret_cast(utf16_data.data())); + case Endianness::Big: + return simdutf::convert_utf32_to_utf16be(data, length, reinterpret_cast(utf16_data.data())); + case Endianness::Little: + return simdutf::convert_utf32_to_utf16le(data, length, reinterpret_cast(utf16_data.data())); + } + VERIFY_NOT_REACHED(); + }(); ASSERT(result == utf16_data.size()); return utf16_data; } -ErrorOr code_point_to_utf16(Utf16Data& string, u32 code_point) +ErrorOr code_point_to_utf16(Utf16Data& string, u32 code_point, Endianness endianness) { VERIFY(is_unicode(code_point)); if (code_point < first_supplementary_plane_code_point) { - TRY(string.try_append(static_cast(code_point))); + TRY(string.try_append(host_code_unit(static_cast(code_point), endianness))); } else { code_point -= first_supplementary_plane_code_point; - TRY(string.try_append(static_cast(high_surrogate_min | (code_point >> 10)))); - TRY(string.try_append(static_cast(low_surrogate_min | (code_point & 0x3ff)))); + + auto code_unit = static_cast(high_surrogate_min | (code_point >> 10)); + TRY(string.try_append(host_code_unit(code_unit, endianness))); + + code_unit = static_cast(low_surrogate_min | (code_point & 0x3ff)); + TRY(string.try_append(host_code_unit(code_unit, endianness))); } return {}; @@ -125,19 +156,12 @@ ErrorOr Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_unit StringBuilder builder; - for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) { - if (is_high_surrogate(*ptr)) { - auto const* next = ptr + 1; + for (size_t i = 0; i < length_in_code_units(); ++i) { + auto code_point = code_point_at(i); + TRY(builder.try_append_code_point(code_point)); - if ((next < end_ptr()) && is_low_surrogate(*next)) { - auto code_point = decode_surrogate_pair(*ptr, *next); - TRY(builder.try_append_code_point(code_point)); - ++ptr; - continue; - } - } - - TRY(builder.try_append_code_point(static_cast(*ptr))); + if (code_point >= first_supplementary_plane_code_point) + ++i; } return builder.to_string_without_validation(); @@ -153,7 +177,7 @@ size_t Utf16View::length_in_code_points() const u16 Utf16View::code_unit_at(size_t index) const { VERIFY(index < length_in_code_units()); - return m_code_units[index]; + return host_code_unit(m_code_units[index], m_endianness); } u32 Utf16View::code_point_at(size_t index) const @@ -264,14 +288,32 @@ bool Utf16View::starts_with(Utf16View const& needle) const bool Utf16View::validate() const { - return simdutf::validate_utf16(reinterpret_cast(m_code_units.data()), m_code_units.size()); + switch (m_endianness) { + case Endianness::Host: + return simdutf::validate_utf16(char_data(), length_in_code_units()); + case Endianness::Big: + return simdutf::validate_utf16be(char_data(), length_in_code_units()); + case Endianness::Little: + return simdutf::validate_utf16le(char_data(), length_in_code_units()); + } + VERIFY_NOT_REACHED(); } bool Utf16View::validate(size_t& valid_code_units) const { - auto result = simdutf::validate_utf16_with_errors(reinterpret_cast(m_code_units.data()), m_code_units.size()); - valid_code_units = result.count; + auto result = [&]() { + switch (m_endianness) { + case Endianness::Host: + return simdutf::validate_utf16_with_errors(char_data(), length_in_code_units()); + case Endianness::Big: + return simdutf::validate_utf16be_with_errors(char_data(), length_in_code_units()); + case Endianness::Little: + return simdutf::validate_utf16le_with_errors(char_data(), length_in_code_units()); + } + VERIFY_NOT_REACHED(); + }(); + valid_code_units = result.count; return result.error == simdutf::SUCCESS; } @@ -280,8 +322,16 @@ size_t Utf16View::calculate_length_in_code_points() const // FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement // for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can // remove this branch. - if (validate()) [[likely]] - return simdutf::count_utf16(reinterpret_cast(m_code_units.data()), m_code_units.size()); + if (validate()) [[likely]] { + switch (m_endianness) { + case Endianness::Host: + return simdutf::count_utf16(char_data(), length_in_code_units()); + case Endianness::Big: + return simdutf::count_utf16be(char_data(), length_in_code_units()); + case Endianness::Little: + return simdutf::count_utf16le(char_data(), length_in_code_units()); + } + } size_t code_points = 0; for ([[maybe_unused]] auto code_point : *this) @@ -339,30 +389,28 @@ u32 Utf16CodePointIterator::operator*() const // W2 as its 10 low-order bits. // 5) Add 0x10000 to U' to obtain the character value U. Terminate. - if (Utf16View::is_high_surrogate(*m_ptr)) { - if ((m_remaining_code_units > 1) && Utf16View::is_low_surrogate(*(m_ptr + 1))) - return Utf16View::decode_surrogate_pair(*m_ptr, *(m_ptr + 1)); - return replacement_code_point; - } else if (Utf16View::is_low_surrogate(*m_ptr)) { + auto code_unit = host_code_unit(*m_ptr, m_endianness); + + if (Utf16View::is_high_surrogate(code_unit)) { + if (m_remaining_code_units > 1) { + auto next_code_unit = host_code_unit(*(m_ptr + 1), m_endianness); + + if (Utf16View::is_low_surrogate(next_code_unit)) + return Utf16View::decode_surrogate_pair(code_unit, next_code_unit); + } + return replacement_code_point; } - return static_cast(*m_ptr); + if (Utf16View::is_low_surrogate(code_unit)) + return replacement_code_point; + + return static_cast(code_unit); } size_t Utf16CodePointIterator::length_in_code_units() const { - VERIFY(m_remaining_code_units > 0); - - if (Utf16View::is_high_surrogate(*m_ptr)) { - if ((m_remaining_code_units > 1) && Utf16View::is_low_surrogate(*(m_ptr + 1))) - return 2; - } - - // If this return is reached, either the encoded code point is a valid single code unit, or that - // code point is invalid (e.g. began with a low surrogate, or a low surrogate did not follow a - // high surrogate). In the latter case, a single replacement code unit will be used. - return 1; + return *(*this) < first_supplementary_plane_code_point ? 1 : 2; } } diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 8ed9b825824..e0bd8d79aaf 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -7,6 +7,7 @@ #pragma once #include +#include #include #include #include @@ -20,10 +21,10 @@ namespace AK { using Utf16Data = Vector; -ErrorOr utf8_to_utf16(StringView); -ErrorOr utf8_to_utf16(Utf8View const&); -ErrorOr utf32_to_utf16(Utf32View const&); -ErrorOr code_point_to_utf16(Utf16Data&, u32); +ErrorOr utf8_to_utf16(StringView, Endianness = Endianness::Host); +ErrorOr utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host); +ErrorOr utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host); +ErrorOr code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host); class Utf16View; @@ -45,14 +46,16 @@ public: size_t length_in_code_units() const; private: - Utf16CodePointIterator(u16 const* ptr, size_t length) + Utf16CodePointIterator(u16 const* ptr, size_t length, Endianness endianness) : m_ptr(ptr) , m_remaining_code_units(length) + , m_endianness(endianness) { } u16 const* m_ptr { nullptr }; size_t m_remaining_code_units { 0 }; + Endianness m_endianness { Endianness::Host }; }; class Utf16View { @@ -66,16 +69,18 @@ public: Utf16View() = default; ~Utf16View() = default; - explicit Utf16View(ReadonlySpan code_units) + explicit Utf16View(ReadonlySpan code_units, Endianness endianness = Endianness::Host) : m_code_units(code_units) + , m_endianness(endianness) { } template - Utf16View(char16_t const (&code_units)[Size]) + Utf16View(char16_t const (&code_units)[Size], Endianness endianness = Endianness::Host) : m_code_units( reinterpret_cast(&code_units[0]), code_units[Size - 1] == u'\0' ? Size - 1 : Size) + , m_endianness(endianness) { } @@ -94,10 +99,14 @@ public: size_t length_in_code_units() const { return m_code_units.size(); } size_t length_in_code_points() const; - Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; } - Utf16CodePointIterator end() const { return { end_ptr(), 0 }; } + Endianness endianness() const { return m_endianness; } + + Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size(), m_endianness }; } + Utf16CodePointIterator end() const { return { end_ptr(), 0, m_endianness }; } u16 const* data() const { return m_code_units.data(); } + char16_t const* char_data() const { return reinterpret_cast(data()); } + u16 code_unit_at(size_t index) const; u32 code_point_at(size_t index) const; @@ -126,6 +135,7 @@ private: ReadonlySpan m_code_units; mutable Optional m_length_in_code_points; + Endianness m_endianness { Endianness::Host }; }; } diff --git a/Tests/AK/TestUtf16.cpp b/Tests/AK/TestUtf16.cpp index 79aba4edeb7..607b6df7295 100644 --- a/Tests/AK/TestUtf16.cpp +++ b/Tests/AK/TestUtf16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -336,3 +336,71 @@ TEST_CASE(starts_with) EXPECT(!emoji.starts_with(u"a")); EXPECT(!emoji.starts_with(u"🙃")); } + +TEST_CASE(big_endian) +{ + auto string = MUST(AK::utf8_to_utf16("säk😀"sv, AK::Endianness::Big)); + Utf16View view { string, AK::Endianness::Big }; + { + EXPECT(view.validate()); + EXPECT_EQ(MUST(view.to_utf8()), "säk😀"sv); + + EXPECT_EQ(view.length_in_code_units(), 5u); + EXPECT_EQ(view.length_in_code_points(), 4u); + + EXPECT_EQ(view.code_unit_at(0), 0x73u); + EXPECT_EQ(view.code_unit_at(1), 0xe4u); + EXPECT_EQ(view.code_unit_at(2), 0x6bu); + EXPECT_EQ(view.code_unit_at(3), 0xd83d); + EXPECT_EQ(view.code_unit_at(4), 0xde00u); + + EXPECT_EQ(view.code_point_at(0), 0x73u); + EXPECT_EQ(view.code_point_at(1), 0xe4u); + EXPECT_EQ(view.code_point_at(2), 0x6bu); + EXPECT_EQ(view.code_point_at(3), 0x1f600u); + EXPECT_EQ(view.code_point_at(4), 0xde00u); + } + { + Utf16Data data; + MUST(code_point_to_utf16(data, 's', AK::Endianness::Big)); + MUST(code_point_to_utf16(data, 0xe4, AK::Endianness::Big)); + MUST(code_point_to_utf16(data, 'k', AK::Endianness::Big)); + MUST(code_point_to_utf16(data, 0x1f600, AK::Endianness::Big)); + EXPECT_EQ(data, to_array({ 0x7300, 0xe400, 0x6b00, 0x3dd8, 0x00de })); + EXPECT_EQ(data, string); + } +} + +TEST_CASE(little_endian) +{ + auto string = MUST(AK::utf8_to_utf16("säk😀"sv, AK::Endianness::Little)); + Utf16View view { string, AK::Endianness::Little }; + { + EXPECT(view.validate()); + EXPECT_EQ(MUST(view.to_utf8()), "säk😀"sv); + + EXPECT_EQ(view.length_in_code_units(), 5u); + EXPECT_EQ(view.length_in_code_points(), 4u); + + EXPECT_EQ(view.code_unit_at(0), 0x73u); + EXPECT_EQ(view.code_unit_at(1), 0xe4u); + EXPECT_EQ(view.code_unit_at(2), 0x6bu); + EXPECT_EQ(view.code_unit_at(3), 0xd83d); + EXPECT_EQ(view.code_unit_at(4), 0xde00u); + + EXPECT_EQ(view.code_point_at(0), 0x73u); + EXPECT_EQ(view.code_point_at(1), 0xe4u); + EXPECT_EQ(view.code_point_at(2), 0x6bu); + EXPECT_EQ(view.code_point_at(3), 0x1f600u); + EXPECT_EQ(view.code_point_at(4), 0xde00u); + } + { + Utf16Data data; + MUST(code_point_to_utf16(data, 's', AK::Endianness::Little)); + MUST(code_point_to_utf16(data, 0xe4, AK::Endianness::Little)); + MUST(code_point_to_utf16(data, 'k', AK::Endianness::Little)); + MUST(code_point_to_utf16(data, 0x1f600, AK::Endianness::Little)); + EXPECT_EQ(data, to_array({ 0x73, 0xe4, 0x6b, 0xd83d, 0xde00 })); + EXPECT_EQ(data, string); + } +}