serenity/Userland/Libraries/LibTextCodec/Decoder.cpp
Max Wipfli d325403cb5 LibTextCodec: Use Optional<String> for get_standardized_encoding
This patch changes get_standardized_encoding to use an Optional<String>
return type instead of just returning the null string when unable to
match the provided encoding to one of the canonical encoding names.

This is part of an effort to move away from using null strings towards
explicitly using Optional<String> to indicate that the String may not
have a value.
2021-05-18 21:02:07 +02:00

343 lines
14 KiB
C++

/*
* Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <LibTextCodec/Decoder.h>
namespace TextCodec {
namespace {

// Hands out the single shared instance of DecoderType.
// The instance is heap-allocated the first time it is requested and is never
// deleted, so the returned reference remains usable for the rest of the program.
template<typename DecoderType>
DecoderType& shared_decoder()
{
    static DecoderType* s_instance = nullptr;
    if (s_instance == nullptr)
        s_instance = new DecoderType;
    return *s_instance;
}

// One accessor per decoder implementation; each forwards to the shared instance of its type.
Latin1Decoder& latin1_decoder() { return shared_decoder<Latin1Decoder>(); }
UTF8Decoder& utf8_decoder() { return shared_decoder<UTF8Decoder>(); }
UTF16BEDecoder& utf16be_decoder() { return shared_decoder<UTF16BEDecoder>(); }
Latin2Decoder& latin2_decoder() { return shared_decoder<Latin2Decoder>(); }
HebrewDecoder& hebrew_decoder() { return shared_decoder<HebrewDecoder>(); }
CyrillicDecoder& cyrillic_decoder() { return shared_decoder<CyrillicDecoder>(); }

}
// Looks up the decoder responsible for the given encoding label.
// The label is first resolved to its canonical name via get_standardized_encoding();
// if that fails, or no decoder exists for the canonical name, a debug message is
// logged and nullptr is returned. The returned decoder is a shared instance and
// must not be deleted by the caller.
Decoder* decoder_for(const String& a_encoding)
{
    Decoder* decoder = nullptr;

    auto standardized_name = get_standardized_encoding(a_encoding);
    if (standardized_name.has_value()) {
        const String& name = standardized_name.value();
        if (name.equals_ignoring_case("windows-1252"))
            decoder = &latin1_decoder();
        else if (name.equals_ignoring_case("utf-8"))
            decoder = &utf8_decoder();
        else if (name.equals_ignoring_case("utf-16be"))
            decoder = &utf16be_decoder();
        else if (name.equals_ignoring_case("iso-8859-2"))
            decoder = &latin2_decoder();
        else if (name.equals_ignoring_case("windows-1255"))
            decoder = &hebrew_decoder();
        else if (name.equals_ignoring_case("windows-1251"))
            decoder = &cyrillic_decoder();
    }

    if (decoder == nullptr)
        dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
    return decoder;
}
// https://encoding.spec.whatwg.org/#concept-encoding-get
// Resolves an encoding label to the canonical encoding name it belongs to.
// Matching is done on the label with surrounding whitespace trimmed and all
// characters lowercased, so "  UTF8 " and "utf8" resolve identically.
// Returns the canonical name on success; otherwise logs a debug message and
// returns an empty Optional.
Optional<String> get_standardized_encoding(const String& encoding)
{
    String trimmed_lowercase_encoding = encoding.trim_whitespace().to_lowercase();

    if (trimmed_lowercase_encoding.is_one_of("unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "utf-8", "utf8", "x-unicode20utf8"))
        return "UTF-8";
    if (trimmed_lowercase_encoding.is_one_of("866", "cp866", "csibm866", "ibm866"))
        return "IBM866";
    if (trimmed_lowercase_encoding.is_one_of("csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2", "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2"))
        return "ISO-8859-2";
    if (trimmed_lowercase_encoding.is_one_of("csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3", "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3"))
        return "ISO-8859-3";
    if (trimmed_lowercase_encoding.is_one_of("csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4", "iso88594", "iso_8859-4", "iso_8859-4:1989", "l4", "latin4"))
        return "ISO-8859-4";
    if (trimmed_lowercase_encoding.is_one_of("csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988"))
        return "ISO-8859-5";
    if (trimmed_lowercase_encoding.is_one_of("arabic", "asmo-708", "csiso88596e", "csiso88596i", "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987"))
        return "ISO-8859-6";
    if (trimmed_lowercase_encoding.is_one_of("csisolatingreek", "ecma-118", "elot_928", "greek", "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek"))
        return "ISO-8859-7";
    if (trimmed_lowercase_encoding.is_one_of("csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8", "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", "iso_8859-8:1988", "visual"))
        return "ISO-8859-8";
    if (trimmed_lowercase_encoding.is_one_of("csiso88598i", "iso-8859-8-i", "logical"))
        return "ISO-8859-8-I";
    // The hyphenated label "iso-8859-10" must be listed alongside "iso8859-10".
    if (trimmed_lowercase_encoding.is_one_of("csisolatin6", "iso-8859-10", "iso-ir-157", "iso8859-10", "iso885910", "l6", "latin6"))
        return "ISO-8859-10";
    if (trimmed_lowercase_encoding.is_one_of("iso-8859-13", "iso8859-13", "iso885913"))
        return "ISO-8859-13";
    if (trimmed_lowercase_encoding.is_one_of("iso-8859-14", "iso8859-14", "iso885914"))
        return "ISO-8859-14";
    if (trimmed_lowercase_encoding.is_one_of("csisolatin9", "iso-8859-15", "iso8859-15", "iso885915", "iso_8859-15", "l9"))
        return "ISO-8859-15";
    if (trimmed_lowercase_encoding == "iso-8859-16")
        return "ISO-8859-16";
    if (trimmed_lowercase_encoding.is_one_of("cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"))
        return "KOI8-R";
    if (trimmed_lowercase_encoding.is_one_of("koi8-ru", "koi8-u"))
        return "KOI8-U";
    if (trimmed_lowercase_encoding.is_one_of("csmacintosh", "mac", "macintosh", "x-mac-roman"))
        return "macintosh";
    if (trimmed_lowercase_encoding.is_one_of("dos-874", "iso-8859-11", "iso8859-11", "iso885911", "tis-620", "windows-874"))
        return "windows-874";
    if (trimmed_lowercase_encoding.is_one_of("cp1250", "windows-1250", "x-cp1250"))
        return "windows-1250";
    if (trimmed_lowercase_encoding.is_one_of("cp1251", "windows-1251", "x-cp1251"))
        return "windows-1251";
    if (trimmed_lowercase_encoding.is_one_of("ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252"))
        return "windows-1252";
    if (trimmed_lowercase_encoding.is_one_of("cp1253", "windows-1253", "x-cp1253"))
        return "windows-1253";
    // ISO-8859-9 labels resolve to windows-1254; the unhyphenated forms are "iso8859-9" and "iso88599".
    if (trimmed_lowercase_encoding.is_one_of("cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", "iso8859-9", "iso88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254"))
        return "windows-1254";
    if (trimmed_lowercase_encoding.is_one_of("cp1255", "windows-1255", "x-cp1255"))
        return "windows-1255";
    if (trimmed_lowercase_encoding.is_one_of("cp1256", "windows-1256", "x-cp1256"))
        return "windows-1256";
    if (trimmed_lowercase_encoding.is_one_of("cp1257", "windows-1257", "x-cp1257"))
        return "windows-1257";
    if (trimmed_lowercase_encoding.is_one_of("cp1258", "windows-1258", "x-cp1258"))
        return "windows-1258";
    if (trimmed_lowercase_encoding.is_one_of("x-mac-cyrillic", "x-mac-ukrainian"))
        return "x-mac-cyrillic";
    if (trimmed_lowercase_encoding.is_one_of("chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"))
        return "GBK";
    if (trimmed_lowercase_encoding == "gb18030")
        return "gb18030";
    if (trimmed_lowercase_encoding.is_one_of("big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"))
        return "Big5";
    if (trimmed_lowercase_encoding.is_one_of("cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"))
        return "EUC-JP";
    if (trimmed_lowercase_encoding.is_one_of("csiso2022jp", "iso-2022-jp"))
        return "ISO-2022-JP";
    if (trimmed_lowercase_encoding.is_one_of("csshiftjis", "ms932", "ms_kanji", "shift-jis", "shift_jis", "sjis", "windows-31j", "x-sjis"))
        return "Shift_JIS";
    if (trimmed_lowercase_encoding.is_one_of("cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949"))
        return "EUC-KR";
    if (trimmed_lowercase_encoding.is_one_of("csiso2022kr", "hz-gb-2312", "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-kr", "replacement"))
        return "replacement";
    if (trimmed_lowercase_encoding.is_one_of("unicodefffe", "utf-16be"))
        return "UTF-16BE";
    if (trimmed_lowercase_encoding.is_one_of("csunicode", "iso-10646-ucs-2", "ucs-2", "unicode", "unicodefeff", "utf-16", "utf-16le"))
        return "UTF-16LE";
    if (trimmed_lowercase_encoding == "x-user-defined")
        return "x-user-defined";

    dbgln("TextCodec: Unrecognized encoding: {}", encoding);
    return {};
}
// Reports whether `encoding` already is a canonical encoding name.
// True only when the label resolves via get_standardized_encoding() and the
// given string (as passed in, without trimming) equals the resolved name,
// ignoring case.
bool is_standardized_encoding(const String& encoding)
{
    auto canonical_name = get_standardized_encoding(encoding);
    if (!canonical_name.has_value())
        return false;
    return encoding.equals_ignoring_case(canonical_name.value());
}
// The input is already UTF-8, so it is copied into a String unchanged.
// No validation of the byte sequence is performed here.
String UTF8Decoder::to_utf8(const StringView& input)
{
    return String(input);
}
// Converts big-endian UTF-16 bytes to a UTF-8 String.
// - Bytes are combined into 16-bit code units as unsigned values.
// - A lead surrogate followed by a trail surrogate is combined into a single
//   supplementary code point.
// - An unpaired surrogate is replaced with U+FFFD.
// - A trailing odd byte (incomplete code unit) is ignored.
String UTF16BEDecoder::to_utf8(const StringView& input)
{
    StringBuilder builder(input.length() / 2);
    size_t utf16_length = input.length() - (input.length() % 2);

    // Bytes must be treated as unsigned: StringView yields plain `char`, and a
    // negative value would sign-extend and corrupt the other half of the code unit.
    auto code_unit_at = [&](size_t offset) -> u16 {
        auto high_byte = static_cast<u8>(input[offset]);
        auto low_byte = static_cast<u8>(input[offset + 1]);
        return static_cast<u16>((high_byte << 8) | low_byte);
    };

    for (size_t i = 0; i < utf16_length; i += 2) {
        u32 code_point = code_unit_at(i);

        if (code_point >= 0xD800 && code_point <= 0xDBFF) {
            // Lead surrogate: only valid when immediately followed by a trail surrogate.
            u32 trail = 0;
            if (i + 2 < utf16_length)
                trail = code_unit_at(i + 2);
            if (trail >= 0xDC00 && trail <= 0xDFFF) {
                code_point = 0x10000 + ((code_point - 0xD800) << 10) + (trail - 0xDC00);
                i += 2; // The trail surrogate has been consumed as well.
            } else {
                code_point = 0xFFFD;
            }
        } else if (code_point >= 0xDC00 && code_point <= 0xDFFF) {
            // Trail surrogate without a preceding lead surrogate.
            code_point = 0xFFFD;
        }

        builder.append_code_point(code_point);
    }
    return builder.to_string();
}
// Converts Latin-1 bytes to a UTF-8 String.
// Each byte value is used directly as a code point (Latin-1 coincides with the
// first 256 Unicode code points), so the only work is UTF-8 encoding.
String Latin1Decoder::to_utf8(const StringView& input)
{
    StringBuilder builder(input.length());
    // Iterate as unsigned so bytes >= 0x80 are not seen as negative values.
    for (unsigned char byte : input)
        builder.append_code_point(byte);
    return builder.to_string();
}
namespace {

// Translates one Latin-2 byte into the code point it represents.
// Note that, despite the function name, the result is a code point value and
// not UTF-8 bytes. Bytes below 0xA0 are returned unchanged; bytes from 0xA0
// upwards are looked up in a table (many of those entries are also identity).
u32 convert_latin2_to_utf8(u8 in)
{
    constexpr u8 first_table_byte = 0xA0;

    // Indexed by (byte - 0xA0); one row per 16 byte values.
    static constexpr u32 code_points[0x100 - 0xA0] = {
        0xA0, 0x104, 0x2D8, 0x141, 0xA4, 0x13D, 0x15A, 0xA7, 0xA8, 0x160, 0x15E, 0x164, 0x179, 0xAD, 0x17D, 0x17B,
        0xB0, 0x105, 0x2DB, 0x142, 0xB4, 0x13E, 0x15B, 0x2C7, 0xB8, 0x161, 0x15F, 0x165, 0x17A, 0x2DD, 0x17E, 0x17C,
        0x154, 0xC1, 0xC2, 0x102, 0xC4, 0x139, 0x106, 0xC7, 0x10C, 0xC9, 0x118, 0xCB, 0x11A, 0xCD, 0xCE, 0x10E,
        0x110, 0x143, 0x147, 0xD3, 0xD4, 0x150, 0xD6, 0xD7, 0x158, 0x16E, 0xDA, 0x170, 0xDC, 0xDD, 0x162, 0xDF,
        0x155, 0xE1, 0xE2, 0x103, 0xE4, 0x13A, 0x107, 0xE7, 0x10D, 0xE9, 0x119, 0xEB, 0x11B, 0xED, 0xEE, 0x10F,
        0x111, 0x144, 0x148, 0xF3, 0xF4, 0x151, 0xF6, 0xF7, 0x159, 0x16F, 0xFA, 0x171, 0xFC, 0xFD, 0x163, 0x2D9
    };

    if (in < first_table_byte)
        return in;
    return code_points[in - first_table_byte];
}

}
// Converts Latin-2 bytes to a UTF-8 String by mapping every byte to its code
// point with convert_latin2_to_utf8() and appending that code point.
String Latin2Decoder::to_utf8(const StringView& input)
{
    StringBuilder builder(input.length());
    for (size_t i = 0; i < input.length(); ++i) {
        // Reinterpret the char as an unsigned byte before the table lookup.
        auto byte = static_cast<u8>(input[i]);
        u32 code_point = convert_latin2_to_utf8(byte);
        builder.append_code_point(code_point);
    }
    return builder.to_string();
}
// Converts bytes in the Hebrew single-byte encoding handled by this decoder to
// a UTF-8 String. Bytes below 0x80 are passed through as ASCII; bytes from 0x80
// upwards are translated through a 128-entry code point table.
String HebrewDecoder::to_utf8(const StringView& input)
{
    // Indexed by (byte - 0x80). Entries of 0xFFFD (U+FFFD REPLACEMENT CHARACTER)
    // presumably stand for bytes the encoding leaves unassigned.
    static constexpr Array<u32, 128> high_half_code_points = {
        0x20AC, 0xFFFD, 0x201A, 0x192, 0x201E, 0x2026, 0x2020, 0x2021, 0x2C6, 0x2030, 0xFFFD, 0x2039, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
        0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x2DC, 0x2122, 0xFFFD, 0x203A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
        0xA0, 0xA1, 0xA2, 0xA3, 0x20AA, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xD7, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
        0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xF7, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
        0x5B0, 0x5B1, 0x5B2, 0x5B3, 0x5B4, 0x5B5, 0x5B6, 0x5B7, 0x5B8, 0x5B9, 0x5BA, 0x5BB, 0x5BC, 0x5BD, 0x5BE, 0x5BF,
        0x5C0, 0x5C1, 0x5C2, 0x5C3, 0x5F0, 0x5F1, 0x5F2, 0x5F3, 0x5F4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
        0x5D0, 0x5D1, 0x5D2, 0x5D3, 0x5D4, 0x5D5, 0x5D6, 0x5D7, 0x5D8, 0x5D9, 0x5DA, 0x5DB, 0x5DC, 0x5DD, 0x5DE, 0x5DF,
        0x5E0, 0x5E1, 0x5E2, 0x5E3, 0x5E4, 0x5E5, 0x5E6, 0x5E7, 0x5E8, 0x5E9, 0x5EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
    };

    StringBuilder builder(input.length());
    for (size_t i = 0; i < input.length(); ++i) {
        auto byte = static_cast<u8>(input[i]);
        if (byte >= 0x80) {
            builder.append_code_point(high_half_code_points[byte - 0x80]);
            continue;
        }
        // Superset of ASCII: the low half needs no translation.
        builder.append(static_cast<char>(byte));
    }
    return builder.to_string();
}
// Converts bytes in the Cyrillic single-byte encoding handled by this decoder
// to a UTF-8 String. Bytes below 0x80 are passed through as ASCII; bytes from
// 0x80 upwards are translated through a 128-entry code point table.
String CyrillicDecoder::to_utf8(const StringView& input)
{
    // Indexed by (byte - 0x80). The single 0xFFFD entry (U+FFFD REPLACEMENT
    // CHARACTER) presumably stands for a byte the encoding leaves unassigned.
    static constexpr Array<u32, 128> high_half_code_points = {
        0x402, 0x403, 0x201A, 0x453, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x2030, 0x409, 0x2039, 0x40A, 0x40C, 0x40B, 0x40F,
        0x452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0x459, 0x203A, 0x45A, 0x45C, 0x45B, 0x45F,
        0xA0, 0x40E, 0x45E, 0x408, 0xA4, 0x490, 0xA6, 0xA7, 0x401, 0xA9, 0x404, 0xAB, 0xAC, 0xAD, 0xAE, 0x407,
        0xB0, 0xB1, 0x406, 0x456, 0x491, 0xB5, 0xB6, 0xB7, 0x451, 0x2116, 0x454, 0xBB, 0x458, 0x405, 0x455, 0x457,
        0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41A, 0x41B, 0x41C, 0x41D, 0x41E, 0x41F,
        0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, 0x428, 0x429, 0x42A, 0x42B, 0x42C, 0x42D, 0x42E, 0x42F,
        0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43A, 0x43B, 0x43C, 0x43D, 0x43E, 0x43F,
        0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x44A, 0x44B, 0x44C, 0x44D, 0x44E, 0x44F
    };

    StringBuilder builder(input.length());
    for (size_t i = 0; i < input.length(); ++i) {
        auto byte = static_cast<u8>(input[i]);
        if (byte >= 0x80) {
            builder.append_code_point(high_half_code_points[byte - 0x80]);
            continue;
        }
        // Superset of ASCII: the low half needs no translation.
        builder.append(static_cast<char>(byte));
    }
    return builder.to_string();
}
}