LibTextCodec: Implement euc-kr encoder

Implements the `euc-kr` encoder, as specified by
https://encoding.spec.whatwg.org/#euc-kr-encoder
This commit is contained in:
BenJilks 2024-08-05 20:00:47 +01:00 committed by Tim Ledbetter
parent 72d0e3284b
commit 826292536c
Notes: github-actions[bot] 2024-08-08 16:51:31 +00:00
4 changed files with 62 additions and 1 deletions

View file

@ -270,7 +270,7 @@ ErrorOr<int> serenity_main(Main::Arguments arguments)
{ "big5"sv, prepare_table(data.get("big5"sv)->as_array(), GenerateAccessor::Yes) },
{ "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) },
{ "jis0212"sv, prepare_table(data.get("jis0212"sv)->as_array(), GenerateAccessor::Yes) },
{ "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array(), GenerateAccessor::Yes) },
{ "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) },
{ "ibm866"sv, prepare_table(data.get("ibm866"sv)->as_array()) },
{ "iso_8859_2"sv, prepare_table(data.get("iso-8859-2"sv)->as_array()) },
{ "iso_8859_3"sv, prepare_table(data.get("iso-8859-3"sv)->as_array()) },

View file

@ -43,3 +43,21 @@ TEST_CASE(test_euc_jp_encoder)
EXPECT(processed_bytes[3] == 0xA5);
EXPECT(processed_bytes[4] == 0xC4);
}
TEST_CASE(test_euc_kr_encoder)
{
    // Two non-ASCII code points, each expected to encode to one lead/trail byte pair:
    //   U+B29F Hangul Syllable Neulh
    //   U+7C97 CJK Unified Ideograph-7C97
    auto input = "\U0000B29F\U00007C97"sv;
    Utf8View const code_points(input);

    TextCodec::EUCKREncoder euc_kr_encoder;
    Vector<u8> encoded;

    // Collect every byte the encoder emits so the full output can be inspected below.
    MUST(euc_kr_encoder.process(code_points, [&](u8 emitted) {
        return encoded.try_append(emitted);
    }));

    EXPECT(encoded.size() == 4);

    // U+B29F
    EXPECT(encoded[0] == 0x88);
    EXPECT(encoded[1] == 0x6B);

    // U+7C97
    EXPECT(encoded[2] == 0xF0);
    EXPECT(encoded[3] == 0xD8);
}

View file

@ -15,6 +15,7 @@ namespace TextCodec {
namespace {
// File-local encoder instances; encoder_for_exact_name() returns references to these.
UTF8Encoder s_utf8_encoder;
EUCJPEncoder s_euc_jp_encoder;
EUCKREncoder s_euc_kr_encoder;
}
Optional<Encoder&> encoder_for_exact_name(StringView encoding)
@ -23,6 +24,8 @@ Optional<Encoder&> encoder_for_exact_name(StringView encoding)
return s_utf8_encoder;
if (encoding.equals_ignoring_ascii_case("euc-jp"sv))
return s_euc_jp_encoder;
if (encoding.equals_ignoring_ascii_case("euc-kr"sv))
return s_euc_kr_encoder;
dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
return {};
}
@ -100,4 +103,39 @@ ErrorOr<void> EUCJPEncoder::process(Utf8View input, Function<ErrorOr<void>(u8)>
return {};
}
// https://encoding.spec.whatwg.org/#euc-kr-encoder
// Walks the code points of `input` and passes the euc-kr bytes for each one to `on_byte`.
// An error returned by `on_byte` stops processing and is returned to the caller.
// Code points that have no entry in index EUC-KR currently produce no output (see TODO below).
ErrorOr<void> EUCKREncoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
{
    for (u32 code_point : input) {
        // 1. If code point is end-of-queue, return finished.
        //    (Handled by the loop running out of input.)

        // 2. If code point is an ASCII code point, return a byte whose value is code point.
        if (is_ascii(code_point)) {
            TRY(on_byte(static_cast<u8>(code_point)));
            continue;
        }

        // 3. Let pointer be the index pointer for code point in index EUC-KR.
        auto const pointer = code_point_euc_kr_index(code_point);

        // 4. If pointer is null, return error with code point.
        if (pointer.has_value()) {
            auto const index = pointer.value();

            // 5. Let lead be pointer / 190 + 0x81.
            u8 const lead = static_cast<u8>(index / 190 + 0x81);

            // 6. Let trail be pointer % 190 + 0x41.
            u8 const trail = static_cast<u8>(index % 190 + 0x41);

            // 7. Return two bytes whose values are lead and trail.
            TRY(on_byte(lead));
            TRY(on_byte(trail));
        } else {
            // TODO: Report error.
        }
    }

    return {};
}
}

View file

@ -29,6 +29,11 @@ public:
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
};
// Encoder for the "euc-kr" encoding.
// https://encoding.spec.whatwg.org/#euc-kr-encoder
class EUCKREncoder final : public Encoder {
public:
// Encodes the given UTF-8 view, invoking on_byte once per output byte.
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
};
Optional<Encoder&> encoder_for_exact_name(StringView encoding);
Optional<Encoder&> encoder_for(StringView label);