diff --git a/AK/Endian.h b/AK/Endian.h index 2a3aa2ce570..71435e6d34d 100644 --- a/AK/Endian.h +++ b/AK/Endian.h @@ -11,12 +11,19 @@ #include namespace AK { + #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ inline constexpr static bool HostIsLittleEndian = true; #else inline constexpr static bool HostIsLittleEndian = false; #endif +enum class Endianness { + Host, + Big, + Little, +}; + template ALWAYS_INLINE constexpr T convert_between_host_and_little_endian(T value) { diff --git a/AK/String.cpp b/AK/String.cpp index a7ea72d8ea8..ef679ea6931 100644 --- a/AK/String.cpp +++ b/AK/String.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -51,15 +52,30 @@ ErrorOr String::from_utf16(Utf16View const& utf16) String result; - auto utf8_length = simdutf::utf8_length_from_utf16( - reinterpret_cast(utf16.data()), - utf16.length_in_code_units()); + auto utf8_length = [&]() { + switch (utf16.endianness()) { + case Endianness::Host: + return simdutf::utf8_length_from_utf16(utf16.char_data(), utf16.length_in_code_units()); + case Endianness::Big: + return simdutf::utf8_length_from_utf16be(utf16.char_data(), utf16.length_in_code_units()); + case Endianness::Little: + return simdutf::utf8_length_from_utf16le(utf16.char_data(), utf16.length_in_code_units()); + } + VERIFY_NOT_REACHED(); + }(); TRY(result.replace_with_new_string(utf8_length, [&](Bytes buffer) -> ErrorOr { - [[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8( - reinterpret_cast(utf16.data()), - utf16.length_in_code_units(), - reinterpret_cast(buffer.data())); + [[maybe_unused]] auto result = [&]() { + switch (utf16.endianness()) { + case Endianness::Host: + return simdutf::convert_utf16_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast(buffer.data())); + case Endianness::Big: + return simdutf::convert_utf16be_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast(buffer.data())); + case Endianness::Little: + return simdutf::convert_utf16le_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast(buffer.data())); + } + VERIFY_NOT_REACHED(); + }(); ASSERT(result == buffer.size()); return {}; diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index a6e569fbb45..324e042e438 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -25,71 +25,102 @@ static constexpr u16 low_surrogate_max = 0xdfff; static constexpr u32 replacement_code_point = 0xfffd; static constexpr u32 first_supplementary_plane_code_point = 0x10000; +static constexpr u16 host_code_unit(u16 code_unit, Endianness endianness) +{ + switch (endianness) { + case Endianness::Host: + return code_unit; + case Endianness::Big: + return convert_between_host_and_big_endian(code_unit); + case Endianness::Little: + return convert_between_host_and_little_endian(code_unit); + } + VERIFY_NOT_REACHED(); +} + template UtfViewType> -static ErrorOr to_utf16_slow(UtfViewType const& view) +static ErrorOr to_utf16_slow(UtfViewType const& view, Endianness endianness) { Utf16Data utf16_data; TRY(utf16_data.try_ensure_capacity(view.length())); for (auto code_point : view) - TRY(code_point_to_utf16(utf16_data, code_point)); + TRY(code_point_to_utf16(utf16_data, code_point, endianness)); return utf16_data; } -ErrorOr utf8_to_utf16(StringView utf8_view) +ErrorOr utf8_to_utf16(StringView utf8_view, Endianness endianness) { - return utf8_to_utf16(Utf8View { utf8_view }); + return utf8_to_utf16(Utf8View { utf8_view }, endianness); } -ErrorOr utf8_to_utf16(Utf8View const& utf8_view) +ErrorOr utf8_to_utf16(Utf8View const& utf8_view, Endianness endianness) { // All callers want to allow lonely surrogates, which simdutf does not permit. if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]] - return to_utf16_slow(utf8_view); + return to_utf16_slow(utf8_view, endianness); + + auto const* data = reinterpret_cast(utf8_view.bytes()); + auto length = utf8_view.byte_length(); Utf16Data utf16_data; + TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(data, length))); - TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8( - reinterpret_cast(utf8_view.bytes()), - utf8_view.byte_length()))); - - [[maybe_unused]] auto result = simdutf::convert_utf8_to_utf16( - reinterpret_cast(utf8_view.bytes()), - utf8_view.byte_length(), - reinterpret_cast(utf16_data.data())); + [[maybe_unused]] auto result = [&]() { + switch (endianness) { + case Endianness::Host: + return simdutf::convert_utf8_to_utf16(data, length, reinterpret_cast(utf16_data.data())); + case Endianness::Big: + return simdutf::convert_utf8_to_utf16be(data, length, reinterpret_cast(utf16_data.data())); + case Endianness::Little: + return simdutf::convert_utf8_to_utf16le(data, length, reinterpret_cast(utf16_data.data())); + } + VERIFY_NOT_REACHED(); + }(); ASSERT(result == utf16_data.size()); return utf16_data; } -ErrorOr utf32_to_utf16(Utf32View const& utf32_view) +ErrorOr utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness) { + auto const* data = reinterpret_cast(utf32_view.code_points()); + auto length = utf32_view.length(); + Utf16Data utf16_data; + TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32(data, length))); - TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32( - reinterpret_cast(utf32_view.code_points()), - utf32_view.length()))); - - [[maybe_unused]] auto result = simdutf::convert_utf32_to_utf16( - reinterpret_cast(utf32_view.code_points()), - utf32_view.length(), - reinterpret_cast(utf16_data.data())); + [[maybe_unused]] auto result = [&]() { + switch (endianness) { + case Endianness::Host: + return simdutf::convert_utf32_to_utf16(data, length, reinterpret_cast(utf16_data.data())); + case Endianness::Big: + return simdutf::convert_utf32_to_utf16be(data, length, reinterpret_cast(utf16_data.data())); + case Endianness::Little: + return simdutf::convert_utf32_to_utf16le(data, length, reinterpret_cast(utf16_data.data())); + } + VERIFY_NOT_REACHED(); + }(); ASSERT(result == utf16_data.size()); return utf16_data; } -ErrorOr code_point_to_utf16(Utf16Data& string, u32 code_point) +ErrorOr code_point_to_utf16(Utf16Data& string, u32 code_point, Endianness endianness) { VERIFY(is_unicode(code_point)); if (code_point < first_supplementary_plane_code_point) { - TRY(string.try_append(static_cast(code_point))); + TRY(string.try_append(host_code_unit(static_cast(code_point), endianness))); } else { code_point -= first_supplementary_plane_code_point; - TRY(string.try_append(static_cast(high_surrogate_min | (code_point >> 10)))); - TRY(string.try_append(static_cast(low_surrogate_min | (code_point & 0x3ff)))); + + auto code_unit = static_cast(high_surrogate_min | (code_point >> 10)); + TRY(string.try_append(host_code_unit(code_unit, endianness))); + + code_unit = static_cast(low_surrogate_min | (code_point & 0x3ff)); + TRY(string.try_append(host_code_unit(code_unit, endianness))); } return {}; @@ -125,19 +156,12 @@ ErrorOr Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_unit StringBuilder builder; - for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) { - if (is_high_surrogate(*ptr)) { - auto const* next = ptr + 1; + for (size_t i = 0; i < length_in_code_units(); ++i) { + auto code_point = code_point_at(i); + TRY(builder.try_append_code_point(code_point)); - if ((next < end_ptr()) && is_low_surrogate(*next)) { - auto code_point = decode_surrogate_pair(*ptr, *next); - TRY(builder.try_append_code_point(code_point)); - ++ptr; - continue; - } - } - - TRY(builder.try_append_code_point(static_cast(*ptr))); + if (code_point >= first_supplementary_plane_code_point) + ++i; } return builder.to_string_without_validation(); @@ -153,7 +177,7 @@ size_t Utf16View::length_in_code_points() const u16 Utf16View::code_unit_at(size_t index) const { VERIFY(index < length_in_code_units()); - return m_code_units[index]; + return host_code_unit(m_code_units[index], m_endianness); } u32 Utf16View::code_point_at(size_t index) const @@ -264,14 +288,32 @@ bool Utf16View::starts_with(Utf16View const& needle) const bool Utf16View::validate() const { - return simdutf::validate_utf16(reinterpret_cast(m_code_units.data()), m_code_units.size()); + switch (m_endianness) { + case Endianness::Host: + return simdutf::validate_utf16(char_data(), length_in_code_units()); + case Endianness::Big: + return simdutf::validate_utf16be(char_data(), length_in_code_units()); + case Endianness::Little: + return simdutf::validate_utf16le(char_data(), length_in_code_units()); + } + VERIFY_NOT_REACHED(); } bool Utf16View::validate(size_t& valid_code_units) const { - auto result = simdutf::validate_utf16_with_errors(reinterpret_cast(m_code_units.data()), m_code_units.size()); - valid_code_units = result.count; + auto result = [&]() { + switch (m_endianness) { + case Endianness::Host: + return simdutf::validate_utf16_with_errors(char_data(), length_in_code_units()); + case Endianness::Big: + return simdutf::validate_utf16be_with_errors(char_data(), length_in_code_units()); + case Endianness::Little: + return simdutf::validate_utf16le_with_errors(char_data(), length_in_code_units()); + } + VERIFY_NOT_REACHED(); + }(); + valid_code_units = result.count; return result.error == simdutf::SUCCESS; } @@ -280,8 +322,16 @@ size_t Utf16View::calculate_length_in_code_points() const // FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement // for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can // remove this branch. - if (validate()) [[likely]] - return simdutf::count_utf16(reinterpret_cast(m_code_units.data()), m_code_units.size()); + if (validate()) [[likely]] { + switch (m_endianness) { + case Endianness::Host: + return simdutf::count_utf16(char_data(), length_in_code_units()); + case Endianness::Big: + return simdutf::count_utf16be(char_data(), length_in_code_units()); + case Endianness::Little: + return simdutf::count_utf16le(char_data(), length_in_code_units()); + } + } size_t code_points = 0; for ([[maybe_unused]] auto code_point : *this) @@ -339,30 +389,28 @@ u32 Utf16CodePointIterator::operator*() const // W2 as its 10 low-order bits. // 5) Add 0x10000 to U' to obtain the character value U. Terminate. - if (Utf16View::is_high_surrogate(*m_ptr)) { - if ((m_remaining_code_units > 1) && Utf16View::is_low_surrogate(*(m_ptr + 1))) - return Utf16View::decode_surrogate_pair(*m_ptr, *(m_ptr + 1)); - return replacement_code_point; - } else if (Utf16View::is_low_surrogate(*m_ptr)) { + auto code_unit = host_code_unit(*m_ptr, m_endianness); + + if (Utf16View::is_high_surrogate(code_unit)) { + if (m_remaining_code_units > 1) { + auto next_code_unit = host_code_unit(*(m_ptr + 1), m_endianness); + + if (Utf16View::is_low_surrogate(next_code_unit)) + return Utf16View::decode_surrogate_pair(code_unit, next_code_unit); + } + return replacement_code_point; } - return static_cast(*m_ptr); + if (Utf16View::is_low_surrogate(code_unit)) + return replacement_code_point; + + return static_cast(code_unit); } size_t Utf16CodePointIterator::length_in_code_units() const { - VERIFY(m_remaining_code_units > 0); - - if (Utf16View::is_high_surrogate(*m_ptr)) { - if ((m_remaining_code_units > 1) && Utf16View::is_low_surrogate(*(m_ptr + 1))) - return 2; - } - - // If this return is reached, either the encoded code point is a valid single code unit, or that - // code point is invalid (e.g. began with a low surrogate, or a low surrogate did not follow a - // high surrogate). In the latter case, a single replacement code unit will be used. - return 1; + return *(*this) < first_supplementary_plane_code_point ? 1 : 2; } } diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 8ed9b825824..e0bd8d79aaf 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -7,6 +7,7 @@ #pragma once #include +#include #include #include #include @@ -20,10 +21,10 @@ namespace AK { using Utf16Data = Vector; -ErrorOr utf8_to_utf16(StringView); -ErrorOr utf8_to_utf16(Utf8View const&); -ErrorOr utf32_to_utf16(Utf32View const&); -ErrorOr code_point_to_utf16(Utf16Data&, u32); +ErrorOr utf8_to_utf16(StringView, Endianness = Endianness::Host); +ErrorOr utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host); +ErrorOr utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host); +ErrorOr code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host); class Utf16View; @@ -45,14 +46,16 @@ public: size_t length_in_code_units() const; private: - Utf16CodePointIterator(u16 const* ptr, size_t length) + Utf16CodePointIterator(u16 const* ptr, size_t length, Endianness endianness) : m_ptr(ptr) , m_remaining_code_units(length) + , m_endianness(endianness) { } u16 const* m_ptr { nullptr }; size_t m_remaining_code_units { 0 }; + Endianness m_endianness { Endianness::Host }; }; class Utf16View { @@ -66,16 +69,18 @@ public: Utf16View() = default; ~Utf16View() = default; - explicit Utf16View(ReadonlySpan code_units) + explicit Utf16View(ReadonlySpan code_units, Endianness endianness = Endianness::Host) : m_code_units(code_units) + , m_endianness(endianness) { } template - Utf16View(char16_t const (&code_units)[Size]) + Utf16View(char16_t const (&code_units)[Size], Endianness endianness = Endianness::Host) : m_code_units( reinterpret_cast(&code_units[0]), code_units[Size - 1] == u'\0' ? Size - 1 : Size) + , m_endianness(endianness) { } @@ -94,10 +99,14 @@ public: size_t length_in_code_units() const { return m_code_units.size(); } size_t length_in_code_points() const; - Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; } - Utf16CodePointIterator end() const { return { end_ptr(), 0 }; } + Endianness endianness() const { return m_endianness; } + + Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size(), m_endianness }; } + Utf16CodePointIterator end() const { return { end_ptr(), 0, m_endianness }; } u16 const* data() const { return m_code_units.data(); } + char16_t const* char_data() const { return reinterpret_cast(data()); } + u16 code_unit_at(size_t index) const; u32 code_point_at(size_t index) const; @@ -126,6 +135,7 @@ private: ReadonlySpan m_code_units; mutable Optional m_length_in_code_points; + Endianness m_endianness { Endianness::Host }; }; } diff --git a/Tests/AK/TestUtf16.cpp b/Tests/AK/TestUtf16.cpp index 79aba4edeb7..607b6df7295 100644 --- a/Tests/AK/TestUtf16.cpp +++ b/Tests/AK/TestUtf16.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -336,3 +336,71 @@ TEST_CASE(starts_with) EXPECT(!emoji.starts_with(u"a")); EXPECT(!emoji.starts_with(u"🙃")); } + +TEST_CASE(big_endian) +{ + auto string = MUST(AK::utf8_to_utf16("säk😀"sv, AK::Endianness::Big)); + Utf16View view { string, AK::Endianness::Big }; + { + EXPECT(view.validate()); + EXPECT_EQ(MUST(view.to_utf8()), "säk😀"sv); + + EXPECT_EQ(view.length_in_code_units(), 5u); + EXPECT_EQ(view.length_in_code_points(), 4u); + + EXPECT_EQ(view.code_unit_at(0), 0x73u); + EXPECT_EQ(view.code_unit_at(1), 0xe4u); + EXPECT_EQ(view.code_unit_at(2), 0x6bu); + EXPECT_EQ(view.code_unit_at(3), 0xd83d); + EXPECT_EQ(view.code_unit_at(4), 0xde00u); + + EXPECT_EQ(view.code_point_at(0), 0x73u); + EXPECT_EQ(view.code_point_at(1), 0xe4u); + EXPECT_EQ(view.code_point_at(2), 0x6bu); + EXPECT_EQ(view.code_point_at(3), 0x1f600u); + EXPECT_EQ(view.code_point_at(4), 0xde00u); + } + { + Utf16Data data; + MUST(code_point_to_utf16(data, 's', AK::Endianness::Big)); + MUST(code_point_to_utf16(data, 0xe4, AK::Endianness::Big)); + MUST(code_point_to_utf16(data, 'k', AK::Endianness::Big)); + MUST(code_point_to_utf16(data, 0x1f600, AK::Endianness::Big)); + EXPECT_EQ(data, to_array({ 0x7300, 0xe400, 0x6b00, 0x3dd8, 0x00de })); + EXPECT_EQ(data, string); + } +} + +TEST_CASE(little_endian) +{ + auto string = MUST(AK::utf8_to_utf16("säk😀"sv, AK::Endianness::Little)); + Utf16View view { string, AK::Endianness::Little }; + { + EXPECT(view.validate()); + EXPECT_EQ(MUST(view.to_utf8()), "säk😀"sv); + + EXPECT_EQ(view.length_in_code_units(), 5u); + EXPECT_EQ(view.length_in_code_points(), 4u); + + EXPECT_EQ(view.code_unit_at(0), 0x73u); + EXPECT_EQ(view.code_unit_at(1), 0xe4u); + EXPECT_EQ(view.code_unit_at(2), 0x6bu); + EXPECT_EQ(view.code_unit_at(3), 0xd83d); + EXPECT_EQ(view.code_unit_at(4), 0xde00u); + + EXPECT_EQ(view.code_point_at(0), 0x73u); + EXPECT_EQ(view.code_point_at(1), 0xe4u); + EXPECT_EQ(view.code_point_at(2), 0x6bu); + EXPECT_EQ(view.code_point_at(3), 0x1f600u); + EXPECT_EQ(view.code_point_at(4), 0xde00u); + } + { + Utf16Data data; + MUST(code_point_to_utf16(data, 's', AK::Endianness::Little)); + MUST(code_point_to_utf16(data, 0xe4, AK::Endianness::Little)); + MUST(code_point_to_utf16(data, 'k', AK::Endianness::Little)); + MUST(code_point_to_utf16(data, 0x1f600, AK::Endianness::Little)); + EXPECT_EQ(data, to_array({ 0x73, 0xe4, 0x6b, 0xd83d, 0xde00 })); + EXPECT_EQ(data, string); + } +}