mirror of
https://github.com/SerenityOS/serenity.git
synced 2025-01-22 09:21:57 -05:00
LibTextCodec: Implement big5 encoder
Implements the `big5` encoder, as specified by https://encoding.spec.whatwg.org/#big5-encoder (cherry picked from commit 34c8c559c112796af0f99b48a7b88cb26633a764)
This commit is contained in:
parent
399dc388d6
commit
4cd250dc2d
3 changed files with 92 additions and 0 deletions
|
@ -61,3 +61,21 @@ TEST_CASE(test_euc_kr_encoder)
|
|||
EXPECT(processed_bytes[2] == 0xF0);
|
||||
EXPECT(processed_bytes[3] == 0xD8);
|
||||
}
|
||||
|
||||
// Feeds one two-code-point string through Big5Encoder and checks the four bytes it emits.
TEST_CASE(test_big5_encoder)
{
    // U+A7 Section Sign                  -> expected 0xA1 0xB1
    // U+70D7 CJK Unified Ideograph-70D7  -> expected 0xD2 0x71
    auto const input = "\U000000A7\U000070D7"sv;

    TextCodec::Big5Encoder encoder;
    Vector<u8> processed_bytes;

    // Collect every byte the encoder produces, in order.
    MUST(encoder.process(Utf8View(input), [&processed_bytes](u8 byte) -> ErrorOr<void> {
        return processed_bytes.try_append(byte);
    }));

    EXPECT(processed_bytes.size() == 4);
    EXPECT(processed_bytes[0] == 0xA1);
    EXPECT(processed_bytes[1] == 0xB1);
    EXPECT(processed_bytes[2] == 0xD2);
    EXPECT(processed_bytes[3] == 0x71);
}
|
||||
|
|
|
@ -14,6 +14,7 @@ namespace TextCodec {
|
|||
|
||||
// File-local encoder instances, one per supported encoding.
// encoder_for_exact_name() hands out references to these objects.
namespace {
|
||||
UTF8Encoder s_utf8_encoder;
|
||||
Big5Encoder s_big5_encoder;
|
||||
EUCJPEncoder s_euc_jp_encoder;
|
||||
EUCKREncoder s_euc_kr_encoder;
|
||||
}
|
||||
|
@ -22,6 +23,8 @@ Optional<Encoder&> encoder_for_exact_name(StringView encoding)
|
|||
{
|
||||
if (encoding.equals_ignoring_ascii_case("utf-8"sv))
|
||||
return s_utf8_encoder;
|
||||
if (encoding.equals_ignoring_ascii_case("big5"sv))
|
||||
return s_big5_encoder;
|
||||
if (encoding.equals_ignoring_ascii_case("euc-jp"sv))
|
||||
return s_euc_jp_encoder;
|
||||
if (encoding.equals_ignoring_ascii_case("euc-kr"sv))
|
||||
|
@ -138,4 +141,70 @@ ErrorOr<void> EUCKREncoder::process(Utf8View input, Function<ErrorOr<void>(u8)>
|
|||
return {};
|
||||
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#index-big5-pointer
|
||||
static Optional<u32> index_big5_pointer(u32 code_point)
|
||||
{
|
||||
// 1. Let index be index Big5 excluding all entries whose pointer is less than (0xA1 - 0x81) × 157.
|
||||
auto start_index = (0xA1 - 0x81) * 157 - s_big5_index_first_pointer;
|
||||
|
||||
// 2. If code point is U+2550, U+255E, U+2561, U+256A, U+5341, or U+5345, return the last pointer
|
||||
// corresponding to code point in index.
|
||||
if (Array<u32, 6> { 0x2550, 0x255E, 0x2561, 0x256A, 0x5341, 0x5345 }.contains_slow(code_point)) {
|
||||
for (u32 i = s_big5_index.size() - 1; i >= start_index; --i) {
|
||||
if (s_big5_index[i] == code_point) {
|
||||
return s_big5_index_first_pointer + i;
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
// 3. Return the index pointer for code point in index.
|
||||
for (u32 i = start_index; i < s_big5_index.size(); ++i) {
|
||||
if (s_big5_index[i] == code_point) {
|
||||
return s_big5_index_first_pointer + i;
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
// https://encoding.spec.whatwg.org/#big5-encoder
|
||||
ErrorOr<void> Big5Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte)
|
||||
{
|
||||
for (u32 item : input) {
|
||||
// 1. If code point is end-of-queue, return finished.
|
||||
|
||||
// 2. If code point is an ASCII code point, return a byte whose value is code point.
|
||||
if (is_ascii(item)) {
|
||||
TRY(on_byte(static_cast<u8>(item)));
|
||||
continue;
|
||||
}
|
||||
|
||||
// 3. Let pointer be the index Big5 pointer for code point.
|
||||
auto pointer = index_big5_pointer(item);
|
||||
|
||||
// 4. If pointer is null, return error with code point.
|
||||
if (!pointer.has_value()) {
|
||||
// TODO: Report error.
|
||||
continue;
|
||||
}
|
||||
|
||||
// 5. Let lead be pointer / 157 + 0x81.
|
||||
auto lead = *pointer / 157 + 0x81;
|
||||
|
||||
// 6. Let trail be pointer % 157.
|
||||
auto trail = *pointer % 157;
|
||||
|
||||
// 7. Let offset be 0x40 if trail is less than 0x3F, otherwise 0x62.
|
||||
auto offset = 0x62;
|
||||
if (trail < 0x3f)
|
||||
offset = 0x40;
|
||||
|
||||
// 8. Return two bytes whose values are lead and trail + offset.
|
||||
TRY(on_byte(static_cast<u8>(lead)));
|
||||
TRY(on_byte(static_cast<u8>(trail + offset)));
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -34,6 +34,11 @@ public:
|
|||
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
|
||||
};
|
||||
|
||||
// Encoder for the `big5` encoding, see https://encoding.spec.whatwg.org/#big5-encoder
class Big5Encoder final : public Encoder {
|
||||
public:
|
||||
// Encodes the given UTF-8 view and passes each resulting byte to on_byte.
virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte) override;
|
||||
};
|
||||
|
||||
Optional<Encoder&> encoder_for_exact_name(StringView encoding);
|
||||
Optional<Encoder&> encoder_for(StringView label);
|
||||
|
||||
|
|
Loading…
Reference in a new issue