mirror of
https://github.com/SerenityOS/serenity.git
synced 2025-01-23 09:51:57 -05:00
d325403cb5
This patch changes get_standardized_encoding to use an Optional<String> return type instead of just returning the null string when unable to match the provided encoding to one of the canonical encoding names. This is part of an effort to move away from using null strings towards explicitly using Optional<String> to indicate that the String may not have a value.
343 lines
14 KiB
C++
343 lines
14 KiB
C++
/*
|
|
* Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#include <AK/String.h>
|
|
#include <AK/StringBuilder.h>
|
|
#include <LibTextCodec/Decoder.h>
|
|
|
|
namespace TextCodec {
|
|
|
|
namespace {
|
|
// Returns the shared Latin1Decoder, allocating it on the first call.
// The instance lives on the heap and is never deleted.
Latin1Decoder& latin1_decoder()
{
    static auto* s_instance = new Latin1Decoder;
    return *s_instance;
}
|
|
|
|
// Returns the shared UTF8Decoder, allocating it on the first call.
// The instance lives on the heap and is never deleted.
UTF8Decoder& utf8_decoder()
{
    static auto* s_instance = new UTF8Decoder;
    return *s_instance;
}
|
|
|
|
// Returns the shared UTF16BEDecoder, allocating it on the first call.
// The instance lives on the heap and is never deleted.
UTF16BEDecoder& utf16be_decoder()
{
    static auto* s_instance = new UTF16BEDecoder;
    return *s_instance;
}
|
|
|
|
// Returns the shared Latin2Decoder, allocating it on the first call.
// The instance lives on the heap and is never deleted.
Latin2Decoder& latin2_decoder()
{
    static auto* s_instance = new Latin2Decoder;
    return *s_instance;
}
|
|
|
|
// Returns the shared HebrewDecoder, allocating it on the first call.
// The instance lives on the heap and is never deleted.
HebrewDecoder& hebrew_decoder()
{
    static auto* s_instance = new HebrewDecoder;
    return *s_instance;
}
|
|
|
|
// Returns the shared CyrillicDecoder, allocating it on the first call.
// The instance lives on the heap and is never deleted.
CyrillicDecoder& cyrillic_decoder()
{
    static auto* s_instance = new CyrillicDecoder;
    return *s_instance;
}
|
|
|
|
}
|
|
|
|
// Finds the decoder responsible for the given encoding label.
// The label is first canonicalized with get_standardized_encoding(). If the label
// is unknown, or no decoder is implemented for the canonical encoding, a debug
// message is logged and nullptr is returned.
Decoder* decoder_for(const String& a_encoding)
{
    auto standardized = get_standardized_encoding(a_encoding);
    if (standardized.has_value()) {
        const auto& canonical_name = standardized.value();

        if (canonical_name.equals_ignoring_case("windows-1252"))
            return &latin1_decoder();
        if (canonical_name.equals_ignoring_case("utf-8"))
            return &utf8_decoder();
        if (canonical_name.equals_ignoring_case("utf-16be"))
            return &utf16be_decoder();
        if (canonical_name.equals_ignoring_case("iso-8859-2"))
            return &latin2_decoder();
        if (canonical_name.equals_ignoring_case("windows-1255"))
            return &hebrew_decoder();
        if (canonical_name.equals_ignoring_case("windows-1251"))
            return &cyrillic_decoder();
    }

    dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
    return nullptr;
}
|
|
|
|
// https://encoding.spec.whatwg.org/#concept-encoding-get
|
|
Optional<String> get_standardized_encoding(const String& encoding)
|
|
{
|
|
String trimmed_lowercase_encoding = encoding.trim_whitespace().to_lowercase();
|
|
|
|
if (trimmed_lowercase_encoding.is_one_of("unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "utf-8", "utf8", "x-unicode20utf8"))
|
|
return "UTF-8";
|
|
if (trimmed_lowercase_encoding.is_one_of("866", "cp866", "csibm866", "ibm866"))
|
|
return "IBM866";
|
|
if (trimmed_lowercase_encoding.is_one_of("csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2", "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2"))
|
|
return "ISO-8859-2";
|
|
if (trimmed_lowercase_encoding.is_one_of("csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3", "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3"))
|
|
return "ISO-8859-3";
|
|
if (trimmed_lowercase_encoding.is_one_of("csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4", "iso88594", "iso_8859-4", "iso_8859-4:1989", "l4", "latin4"))
|
|
return "ISO-8859-4";
|
|
if (trimmed_lowercase_encoding.is_one_of("csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988"))
|
|
return "ISO-8859-5";
|
|
if (trimmed_lowercase_encoding.is_one_of("arabic", "asmo-708", "csiso88596e", "csiso88596i", "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987"))
|
|
return "ISO-8859-6";
|
|
if (trimmed_lowercase_encoding.is_one_of("csisolatingreek", "ecma-118", "elot_928", "greek", "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek"))
|
|
return "ISO-8859-7";
|
|
if (trimmed_lowercase_encoding.is_one_of("csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8", "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", "iso_8859-8:1988", "visual"))
|
|
return "ISO-8859-8";
|
|
if (trimmed_lowercase_encoding.is_one_of("csiso88598i", "iso-8859-8-i", "logical"))
|
|
return "ISO-8859-8-I";
|
|
if (trimmed_lowercase_encoding.is_one_of("csisolatin6", "iso8859-10", "iso-ir-157", "iso8859-10", "iso885910", "l6", "latin6"))
|
|
return "ISO-8859-10";
|
|
if (trimmed_lowercase_encoding.is_one_of("iso-8859-13", "iso8859-13", "iso885913"))
|
|
return "ISO-8859-13";
|
|
if (trimmed_lowercase_encoding.is_one_of("iso-8859-14", "iso8859-14", "iso885914"))
|
|
return "ISO-8859-14";
|
|
if (trimmed_lowercase_encoding.is_one_of("csisolatin9", "iso-8859-15", "iso8859-15", "iso885915", "iso_8859-15", "l9"))
|
|
return "ISO-8859-15";
|
|
if (trimmed_lowercase_encoding == "iso-8859-16")
|
|
return "ISO-8859-16";
|
|
if (trimmed_lowercase_encoding.is_one_of("cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"))
|
|
return "KOI8-R";
|
|
if (trimmed_lowercase_encoding.is_one_of("koi8-ru", "koi8-u"))
|
|
return "KOI8-U";
|
|
if (trimmed_lowercase_encoding.is_one_of("csmacintosh", "mac", "macintosh", "x-mac-roman"))
|
|
return "macintosh";
|
|
if (trimmed_lowercase_encoding.is_one_of("dos-874", "iso-8859-11", "iso8859-11", "iso885911", "tis-620", "windows-874"))
|
|
return "windows-874";
|
|
if (trimmed_lowercase_encoding.is_one_of("cp1250", "windows-1250", "x-cp1250"))
|
|
return "windows-1250";
|
|
if (trimmed_lowercase_encoding.is_one_of("cp1251", "windows-1251", "x-cp1251"))
|
|
return "windows-1251";
|
|
if (trimmed_lowercase_encoding.is_one_of("ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252"))
|
|
return "windows-1252";
|
|
if (trimmed_lowercase_encoding.is_one_of("cp1253", "windows-1253", "x-cp1253"))
|
|
return "windows-1253";
|
|
if (trimmed_lowercase_encoding.is_one_of("cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", "iso-8859-9", "iso-88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254"))
|
|
return "windows-1254";
|
|
if (trimmed_lowercase_encoding.is_one_of("cp1255", "windows-1255", "x-cp1255"))
|
|
return "windows-1255";
|
|
if (trimmed_lowercase_encoding.is_one_of("cp1256", "windows-1256", "x-cp1256"))
|
|
return "windows-1256";
|
|
if (trimmed_lowercase_encoding.is_one_of("cp1257", "windows-1257", "x-cp1257"))
|
|
return "windows-1257";
|
|
if (trimmed_lowercase_encoding.is_one_of("cp1258", "windows-1258", "x-cp1258"))
|
|
return "windows-1258";
|
|
if (trimmed_lowercase_encoding.is_one_of("x-mac-cyrillic", "x-mac-ukrainian"))
|
|
return "x-mac-cyrillic";
|
|
if (trimmed_lowercase_encoding.is_one_of("chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"))
|
|
return "GBK";
|
|
if (trimmed_lowercase_encoding == "gb18030")
|
|
return "gb18030";
|
|
if (trimmed_lowercase_encoding.is_one_of("big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"))
|
|
return "Big5";
|
|
if (trimmed_lowercase_encoding.is_one_of("cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"))
|
|
return "EUC-JP";
|
|
if (trimmed_lowercase_encoding.is_one_of("csiso2022jp", "iso-2022-jp"))
|
|
return "ISO-2022-JP";
|
|
if (trimmed_lowercase_encoding.is_one_of("csshiftjis", "ms932", "ms_kanji", "shift-jis", "shift_jis", "sjis", "windows-31j", "x-sjis"))
|
|
return "Shift_JIS";
|
|
if (trimmed_lowercase_encoding.is_one_of("cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949"))
|
|
return "EUC-KR";
|
|
if (trimmed_lowercase_encoding.is_one_of("csiso2022kr", "hz-gb-2312", "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-kr", "replacement"))
|
|
return "replacement";
|
|
if (trimmed_lowercase_encoding.is_one_of("unicodefffe", "utf-16be"))
|
|
return "UTF-16BE";
|
|
if (trimmed_lowercase_encoding.is_one_of("csunicode", "iso-10646-ucs-2", "ucs-2", "unicode", "unicodefeff", "utf-16", "utf-16le"))
|
|
return "UTF-16LE";
|
|
if (trimmed_lowercase_encoding == "x-user-defined")
|
|
return "x-user-defined";
|
|
|
|
dbgln("TextCodec: Unrecognized encoding: {}", encoding);
|
|
return {};
|
|
}
|
|
|
|
bool is_standardized_encoding(const String& encoding)
|
|
{
|
|
auto standardized_encoding = get_standardized_encoding(encoding);
|
|
return standardized_encoding.has_value() && encoding.equals_ignoring_case(standardized_encoding.value());
|
|
}
|
|
|
|
// The input is treated as already being UTF-8, so it is copied into a String unchanged.
String UTF8Decoder::to_utf8(const StringView& input)
{
    return String(input);
}
|
|
|
|
// Decodes big-endian UTF-16 bytes into a UTF-8 String.
// - Bytes are read as unsigned values, so a byte >= 0x80 cannot sign-extend and
//   corrupt the other half of the code unit.
// - A high surrogate followed by a low surrogate is combined into one code point.
// - A surrogate without its partner is replaced by U+FFFD.
// - A trailing odd byte that cannot form a full code unit is ignored.
String UTF16BEDecoder::to_utf8(const StringView& input)
{
    StringBuilder builder(input.length() / 2);
    size_t utf16_length = input.length() - (input.length() % 2);

    // Assembles the 16-bit code unit stored at byte offsets [offset, offset + 1].
    auto code_unit_at = [&](size_t offset) -> u16 {
        auto high_byte = static_cast<u8>(input[offset]);
        auto low_byte = static_cast<u8>(input[offset + 1]);
        return static_cast<u16>((high_byte << 8) | low_byte);
    };

    for (size_t i = 0; i < utf16_length; i += 2) {
        u32 code_point = code_unit_at(i);

        if (code_point >= 0xD800 && code_point <= 0xDBFF) {
            // High surrogate: only valid when immediately followed by a low surrogate.
            bool has_next_unit = i + 2 < utf16_length;
            u16 next_unit = has_next_unit ? code_unit_at(i + 2) : 0;
            if (has_next_unit && next_unit >= 0xDC00 && next_unit <= 0xDFFF) {
                code_point = 0x10000 + ((code_point - 0xD800) << 10) + (next_unit - 0xDC00);
                i += 2; // The low surrogate has been consumed as well.
            } else {
                code_point = 0xFFFD;
            }
        } else if (code_point >= 0xDC00 && code_point <= 0xDFFF) {
            // Low surrogate with no preceding high surrogate.
            code_point = 0xFFFD;
        }

        builder.append_code_point(code_point);
    }
    return builder.to_string();
}
|
|
|
|
// Converts Latin-1 bytes to UTF-8.
// Each byte value is used directly as a code point, because Latin-1 coincides
// with the first 256 Unicode code points; only the UTF-8 encoding step is needed.
String Latin1Decoder::to_utf8(const StringView& input)
{
    StringBuilder utf8_output(input.length());
    for (u8 byte : input)
        utf8_output.append_code_point(byte);
    return utf8_output.to_string();
}
|
|
|
|
namespace {
|
|
// Translates one Latin-2 byte into a code point value.
// Bytes listed below are remapped; every other byte is returned with its
// numeric value unchanged.
u32 convert_latin2_to_utf8(u8 in)
{
    switch (in) {
    // 0xA0 - 0xAF
    case 0xA1: return 0x104;
    case 0xA2: return 0x2D8;
    case 0xA3: return 0x141;
    case 0xA5: return 0x13D;
    case 0xA6: return 0x15A;
    case 0xA9: return 0x160;
    case 0xAA: return 0x15E;
    case 0xAB: return 0x164;
    case 0xAC: return 0x179;
    case 0xAE: return 0x17D;
    case 0xAF: return 0x17B;

    // 0xB0 - 0xBF
    case 0xB1: return 0x105;
    case 0xB2: return 0x2DB;
    case 0xB3: return 0x142;
    case 0xB5: return 0x13E;
    case 0xB6: return 0x15B;
    case 0xB7: return 0x2C7;
    case 0xB9: return 0x161;
    case 0xBA: return 0x15F;
    case 0xBB: return 0x165;
    case 0xBC: return 0x17A;
    case 0xBD: return 0x2DD;
    case 0xBE: return 0x17E;
    case 0xBF: return 0x17C;

    // 0xC0 - 0xCF
    case 0xC0: return 0x154;
    case 0xC3: return 0x102;
    case 0xC5: return 0x139;
    case 0xC6: return 0x106;
    case 0xC8: return 0x10C;
    case 0xCA: return 0x118;
    case 0xCC: return 0x11A;
    case 0xCF: return 0x10E;

    // 0xD0 - 0xDF
    case 0xD0: return 0x110;
    case 0xD1: return 0x143;
    case 0xD2: return 0x147;
    case 0xD5: return 0x150;
    case 0xD8: return 0x158;
    case 0xD9: return 0x16E;
    case 0xDB: return 0x170;
    case 0xDE: return 0x162;

    // 0xE0 - 0xEF
    case 0xE0: return 0x155;
    case 0xE3: return 0x103;
    case 0xE5: return 0x13A;
    case 0xE6: return 0x107;
    case 0xE8: return 0x10D;
    case 0xEA: return 0x119;
    case 0xEC: return 0x11B;
    case 0xEF: return 0x10F;

    // 0xF0 - 0xFF
    case 0xF0: return 0x111;
    case 0xF1: return 0x144;
    case 0xF2: return 0x148;
    case 0xF5: return 0x151;
    case 0xF8: return 0x159;
    case 0xF9: return 0x16F;
    case 0xFB: return 0x171;
    case 0xFE: return 0x163;
    case 0xFF: return 0x2D9;

    default:
        return in;
    }
}
|
|
}
|
|
|
|
// Converts Latin-2 bytes to UTF-8 by translating each byte with
// convert_latin2_to_utf8() and appending the resulting code point.
String Latin2Decoder::to_utf8(const StringView& input)
{
    StringBuilder utf8_output(input.length());
    for (size_t index = 0; index < input.length(); ++index)
        utf8_output.append_code_point(convert_latin2_to_utf8(input[index]));
    return utf8_output.to_string();
}
|
|
|
|
// Converts bytes to UTF-8 using the decoder's 128-entry table for the upper half
// of the byte range. Bytes below 0x80 are appended as-is; bytes from 0x80 upward
// are looked up in the table (0xFFFD entries yield the replacement character).
String HebrewDecoder::to_utf8(const StringView& input)
{
    // Entry N holds the code point for byte value 0x80 + N.
    static constexpr Array<u32, 128> upper_half_code_points = {
        0x20AC, 0xFFFD, 0x201A, 0x192, 0x201E, 0x2026, 0x2020, 0x2021, 0x2C6, 0x2030, 0xFFFD, 0x2039, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
        0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x2DC, 0x2122, 0xFFFD, 0x203A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
        0xA0, 0xA1, 0xA2, 0xA3, 0x20AA, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xD7, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
        0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xF7, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
        0x5B0, 0x5B1, 0x5B2, 0x5B3, 0x5B4, 0x5B5, 0x5B6, 0x5B7, 0x5B8, 0x5B9, 0x5BA, 0x5BB, 0x5BC, 0x5BD, 0x5BE, 0x5BF,
        0x5C0, 0x5C1, 0x5C2, 0x5C3, 0x5F0, 0x5F1, 0x5F2, 0x5F3, 0x5F4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
        0x5D0, 0x5D1, 0x5D2, 0x5D3, 0x5D4, 0x5D5, 0x5D6, 0x5D7, 0x5D8, 0x5D9, 0x5DA, 0x5DB, 0x5DC, 0x5DD, 0x5DE, 0x5DF,
        0x5E0, 0x5E1, 0x5E2, 0x5E3, 0x5E4, 0x5E5, 0x5E6, 0x5E7, 0x5E8, 0x5E9, 0x5EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
    };

    StringBuilder utf8_output(input.length());
    for (size_t index = 0; index < input.length(); ++index) {
        unsigned char byte = input[index];
        if (byte >= 0x80) {
            utf8_output.append_code_point(upper_half_code_points[byte - 0x80]);
            continue;
        }
        // Lower half is a superset of ASCII and needs no translation.
        utf8_output.append(byte);
    }
    return utf8_output.to_string();
}
|
|
|
|
// Converts bytes to UTF-8 using the decoder's 128-entry table for the upper half
// of the byte range. Bytes below 0x80 are appended as-is; bytes from 0x80 upward
// are looked up in the table (a 0xFFFD entry yields the replacement character).
String CyrillicDecoder::to_utf8(const StringView& input)
{
    // Entry N holds the code point for byte value 0x80 + N.
    static constexpr Array<u32, 128> upper_half_code_points = {
        0x402, 0x403, 0x201A, 0x453, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x2030, 0x409, 0x2039, 0x40A, 0x40C, 0x40B, 0x40F,
        0x452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0x459, 0x203A, 0x45A, 0x45C, 0x45B, 0x45F,
        0xA0, 0x40E, 0x45E, 0x408, 0xA4, 0x490, 0xA6, 0xA7, 0x401, 0xA9, 0x404, 0xAB, 0xAC, 0xAD, 0xAE, 0x407,
        0xB0, 0xB1, 0x406, 0x456, 0x491, 0xB5, 0xB6, 0xB7, 0x451, 0x2116, 0x454, 0xBB, 0x458, 0x405, 0x455, 0x457,
        0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41A, 0x41B, 0x41C, 0x41D, 0x41E, 0x41F,
        0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, 0x428, 0x429, 0x42A, 0x42B, 0x42C, 0x42D, 0x42E, 0x42F,
        0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43A, 0x43B, 0x43C, 0x43D, 0x43E, 0x43F,
        0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x44A, 0x44B, 0x44C, 0x44D, 0x44E, 0x44F
    };

    StringBuilder utf8_output(input.length());
    for (size_t index = 0; index < input.length(); ++index) {
        unsigned char byte = input[index];
        if (byte >= 0x80) {
            utf8_output.append_code_point(upper_half_code_points[byte - 0x80]);
            continue;
        }
        // Lower half is a superset of ASCII and needs no translation.
        utf8_output.append(byte);
    }
    return utf8_output.to_string();
}
|
|
|
|
}
|