From 4cd250dc2d4764afc436747df2df45284106f03b Mon Sep 17 00:00:00 2001
From: BenJilks
Date: Mon, 5 Aug 2024 20:23:59 +0100
Subject: [PATCH] LibTextCodec: Implement `big5` encoder

Implements the `big5` encoder, as specified by
https://encoding.spec.whatwg.org/#big5-encoder

(cherry picked from commit 34c8c559c112796af0f99b48a7b88cb26633a764)
---
 Tests/LibTextCodec/TestTextEncoders.cpp     | 18 ++++++
 Userland/Libraries/LibTextCodec/Encoder.cpp | 69 +++++++++++++++++++++
 Userland/Libraries/LibTextCodec/Encoder.h   |  5 ++
 3 files changed, 92 insertions(+)

diff --git a/Tests/LibTextCodec/TestTextEncoders.cpp b/Tests/LibTextCodec/TestTextEncoders.cpp
index 64c0c9a96a4..1ee6adf20da 100644
--- a/Tests/LibTextCodec/TestTextEncoders.cpp
+++ b/Tests/LibTextCodec/TestTextEncoders.cpp
@@ -61,3 +61,21 @@ TEST_CASE(test_euc_kr_encoder)
     EXPECT(processed_bytes[2] == 0xF0);
     EXPECT(processed_bytes[3] == 0xD8);
 }
+
+TEST_CASE(test_big5_encoder)
+{
+    TextCodec::Big5Encoder encoder;
+    // U+A7 Section Sign
+    // U+70D7 CJK Unified Ideograph-70D7
+    auto test_string = "\U000000A7\U000070D7"sv;
+
+    Vector<u8> processed_bytes;
+    MUST(encoder.process(Utf8View(test_string), [&](u8 byte) {
+        return processed_bytes.try_append(byte);
+    }));
+    EXPECT(processed_bytes.size() == 4);
+    EXPECT(processed_bytes[0] == 0xA1);
+    EXPECT(processed_bytes[1] == 0xB1);
+    EXPECT(processed_bytes[2] == 0xD2);
+    EXPECT(processed_bytes[3] == 0x71);
+}
diff --git a/Userland/Libraries/LibTextCodec/Encoder.cpp b/Userland/Libraries/LibTextCodec/Encoder.cpp
index 8206bedc617..08304dfe15b 100644
--- a/Userland/Libraries/LibTextCodec/Encoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Encoder.cpp
@@ -14,6 +14,7 @@ namespace TextCodec {
 
 namespace {
 UTF8Encoder s_utf8_encoder;
+Big5Encoder s_big5_encoder;
 EUCJPEncoder s_euc_jp_encoder;
 EUCKREncoder s_euc_kr_encoder;
 }
@@ -22,6 +23,8 @@ Optional<Encoder&> encoder_for_exact_name(StringView encoding)
 {
     if (encoding.equals_ignoring_ascii_case("utf-8"sv))
         return s_utf8_encoder;
+    if (encoding.equals_ignoring_ascii_case("big5"sv))
+        return s_big5_encoder;
     if (encoding.equals_ignoring_ascii_case("euc-jp"sv))
         return s_euc_jp_encoder;
     if (encoding.equals_ignoring_ascii_case("euc-kr"sv))
@@ -138,4 +141,70 @@ ErrorOr<void> EUCKREncoder::process(Utf8View input, Function<ErrorOr<void>(u8)>
     return {};
 }
 
+// https://encoding.spec.whatwg.org/#index-big5-pointer
+static Optional<u32> index_big5_pointer(u32 code_point)
+{
+    // 1. Let index be index Big5 excluding all entries whose pointer is less than (0xA1 - 0x81) × 157.
+    auto start_index = (0xA1 - 0x81) * 157 - s_big5_index_first_pointer;
+
+    // 2. If code point is U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345, return the last pointer
+    //    corresponding to code point in index.
+    if (Array { 0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345 }.contains_slow(code_point)) {
+        for (u32 i = s_big5_index.size() - 1; i >= start_index; --i) {
+            if (s_big5_index[i] == code_point) {
+                return s_big5_index_first_pointer + i;
+            }
+        }
+        return {};
+    }
+
+    // 3. Return the index pointer for code point in index.
+    for (u32 i = start_index; i < s_big5_index.size(); ++i) {
+        if (s_big5_index[i] == code_point) {
+            return s_big5_index_first_pointer + i;
+        }
+    }
+    return {};
+}
+
+// https://encoding.spec.whatwg.org/#big5-encoder
+ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
+{
+    for (u32 item : input) {
+        // 1. If code point is end-of-queue, return finished.
+
+        // 2. If code point is an ASCII code point, return a byte whose value is code point.
+        if (is_ascii(item)) {
+            TRY(on_byte(static_cast<u8>(item)));
+            continue;
+        }
+
+        // 3. Let pointer be the index Big5 pointer for code point.
+        auto pointer = index_big5_pointer(item);
+
+        // 4. If pointer is null, return error with code point.
+        if (!pointer.has_value()) {
+            // TODO: Report error.
+            continue;
+        }
+
+        // 5. Let lead be pointer / 157 + 0x81.
+        auto lead = *pointer / 157 + 0x81;
+
+        // 6. Let trail be pointer % 157.
+        auto trail = *pointer % 157;
+
+        // 7. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x62.
+        auto offset = 0x62;
+        if (trail < 0x3f)
+            offset = 0x40;
+
+        // 8. Return two bytes whose values are lead and trail + offset.
+        TRY(on_byte(static_cast<u8>(lead)));
+        TRY(on_byte(static_cast<u8>(trail + offset)));
+    }
+
+    return {};
+}
+
 }
diff --git a/Userland/Libraries/LibTextCodec/Encoder.h b/Userland/Libraries/LibTextCodec/Encoder.h
index 508c654a910..78529b3dbb5 100644
--- a/Userland/Libraries/LibTextCodec/Encoder.h
+++ b/Userland/Libraries/LibTextCodec/Encoder.h
@@ -34,6 +34,11 @@ public:
     virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
 };
 
+class Big5Encoder final : public Encoder {
+public:
+    virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
+};
+
 Optional<Encoder&> encoder_for_exact_name(StringView encoding);
 Optional<Encoder&> encoder_for(StringView label);
 