HTMLEncodingDetection: Use mime type in encoding sniffing

Also added proper spec comments. Fixes at least one WPT test that was failing previously: https://wpt.live/encoding/single-byte-decoder.window.html?document (cherry picked from commit c1a14f66adf4b5e55a0e2a78068749e7d8b3ed98)
2025-01-22 17:31:58 -05:00 · 2024-10-10 02:04:07 +02:00 · 2024-10-10 02:04:07 +02:00 · fcfe89e21d
commit fcfe89e21d
parent 9e773e4977
3 changed files with 67 additions and 28 deletions
--- a/Userland/Libraries/LibWeb/DOM/DocumentLoading.cpp
+++ b/Userland/Libraries/LibWeb/DOM/DocumentLoading.cpp
@ -216,14 +216,14 @@ static WebIDL::ExceptionOr<JS::NonnullGCPtr<DOM::Document>> load_xml_document(HT
    if (auto maybe_encoding = type.parameters().get("charset"sv); maybe_encoding.has_value())
        content_encoding = maybe_encoding.value();

-    auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value(), content_encoding = move(content_encoding)](ByteBuffer data) {
+    auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value(), content_encoding = move(content_encoding), mime = type](ByteBuffer data) {
        Optional<TextCodec::Decoder&> decoder;
        // The actual HTTP headers and other metadata, not the headers as mutated or implied by the algorithms given in this specification,
        // are the ones that must be used when determining the character encoding according to the rules given in the above specifications.
        if (content_encoding.has_value())
            decoder = TextCodec::decoder_for(*content_encoding);
        if (!decoder.has_value()) {
-            auto encoding = HTML::run_encoding_sniffing_algorithm(document, data);
+            auto encoding = HTML::run_encoding_sniffing_algorithm(document, data, mime);
            decoder = TextCodec::decoder_for(encoding);
        }
        VERIFY(decoder.has_value());
@ -296,8 +296,8 @@ static WebIDL::ExceptionOr<JS::NonnullGCPtr<DOM::Document>> load_text_document(H
    //    document's relevant global object to have the parser to process the implied EOF character, which eventually causes a
    //    load event to be fired.
    // FIXME: Parse as we receive the document data, instead of waiting for the whole document to be fetched first.
-    auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value()](ByteBuffer data) {
-        auto encoding = run_encoding_sniffing_algorithm(document, data);
+    auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value(), mime = type](ByteBuffer data) {
+        auto encoding = run_encoding_sniffing_algorithm(document, data, mime);
        dbgln_if(HTML_PARSER_DEBUG, "The encoding sniffing algorithm returned encoding '{}'", encoding);

        auto parser = HTML::HTMLParser::create_for_scripting(document);
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
@ -336,42 +336,80 @@ Optional<ByteString> run_prescan_byte_stream_algorithm(DOM::Document& document,
    return {};
 }

-// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
-ByteString run_encoding_sniffing_algorithm(DOM::Document& document, ByteBuffer const& input)
+// https://encoding.spec.whatwg.org/#bom-sniff
+Optional<ByteString> run_bom_sniff(ByteBuffer const& input)
 {
-    if (input.size() >= 2) {
-        if (input[0] == 0xFE && input[1] == 0xFF) {
-            return "UTF-16BE";
-        } else if (input[0] == 0xFF && input[1] == 0xFE) {
-            return "UTF-16LE";
-        } else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
+    if (input.size() >= 2) {
+        // 1. Let BOM be the result of peeking 3 bytes from ioQueue, converted to a byte sequence. NOTE: Peeking yields fewer bytes when the input is shorter, so a 2-byte input can still carry a UTF-16 BOM; only the UTF-8 row needs all 3 bytes.
+        // 2. For each of the rows in the table below, starting with the first one and going down, if BOM starts with the bytes given in the first column, then return the encoding given in the cell in the second column of that row. Otherwise, return null.
+        // Byte order mark  Encoding
+        // 0xEF 0xBB 0xBF   UTF-8
+        // 0xFE 0xFF        UTF-16BE
+        // 0xFF 0xFE        UTF-16LE
+        if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
            return "UTF-8";
        }
+        if (input[0] == 0xFE && input[1] == 0xFF) {
+            return "UTF-16BE";
+        }
+        if (input[0] == 0xFF && input[1] == 0xFE) {
+            return "UTF-16LE";
+        }
+    }
+    return {};
+}
+
+// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
+ByteString run_encoding_sniffing_algorithm(DOM::Document& document, ByteBuffer const& input, Optional<MimeSniff::MimeType> maybe_mime_type)
+{
+    // 1. If the result of BOM sniffing is an encoding, return that encoding with confidence certain.
+    // FIXME: There is no concept of encoding confidence (certain/tentative) yet, so only the encoding name is returned.
+    auto bom = run_bom_sniff(input);
+    if (bom.has_value())
+        return bom.value();
+    // 2. FIXME: If the user has explicitly instructed the user agent to override the document's character encoding with a specific encoding,
+    //    optionally return that encoding with the confidence certain.

+    // 3. FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or at any later step in this algorithm.
+    //    For instance, a user agent might wait 500ms or 1024 bytes, whichever came first. In general preparsing the source to find the encoding improves performance,
+    //    as it reduces the need to throw away the data structures used when parsing upon finding the encoding information. However, if the user agent delays too long
+    //    to obtain data to determine the encoding, then the cost of the delay could outweigh any performance improvements from the preparse.

+    // 4. If the transport layer specifies a character encoding, and it is supported, return that encoding with the confidence certain.
+    if (maybe_mime_type.has_value()) {
+        // FIXME: This is awkward because legacy_extract_an_encoding cannot fail: we pass "invalid" as the fallback encoding and treat getting it back as "no usable transport-layer encoding".
+        auto maybe_transport_encoding = Fetch::Infrastructure::legacy_extract_an_encoding(maybe_mime_type, "invalid"sv);
+        if (maybe_transport_encoding != "invalid"sv)
+            return maybe_transport_encoding;
    }

-    // FIXME: If the user has explicitly instructed the user agent to override the document's character
-    //        encoding with a specific encoding.
-    // FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
-    //        at any later step in this algorithm.
-    // FIXME: If the transport layer specifies a character encoding, and it is supported.
+    // 5. Optionally prescan the byte stream to determine its encoding, with the end condition being when the user agent decides that scanning further bytes would not
+    //    be efficient. User agents are encouraged to only prescan the first 1024 bytes. User agents may decide that scanning any bytes is not efficient, in which case
+    //    these substeps are entirely skipped.
+    //    The aforementioned algorithm returns either a character encoding or failure. If it returns a character encoding, then return the same encoding, with confidence tentative.
+    auto prescan = run_prescan_byte_stream_algorithm(document, input);
+    if (prescan.has_value())
+        return prescan.value();

-    auto optional_encoding = run_prescan_byte_stream_algorithm(document, input);
-    if (optional_encoding.has_value()) {
-        return optional_encoding.value();
-    }
+    // 6. FIXME: If the HTML parser for which this algorithm is being run is associated with a Document d whose container document is non-null, then:
+    //    1. Let parentDocument be d's container document.
+    //    2. If parentDocument's origin is same origin with d's origin and parentDocument's character encoding is not UTF-16BE/LE, then return parentDocument's character
+    //       encoding, with the confidence tentative.

-    // FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
-    //        is non-null and a child browsing context.
-    // FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
-    //        when it was last visited.
+    // 7. FIXME: Otherwise, if the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page when it was last visited, then return
+    //    that encoding, with the confidence tentative.

+    // 8. FIXME: The user agent may attempt to autodetect the character encoding from applying frequency analysis or other algorithms to the data stream. Such algorithms
+    //    may use information about the resource other than the resource's contents, including the address of the resource. If autodetection succeeds in determining a
+    //    character encoding, and that encoding is a supported encoding, then return that encoding, with the confidence tentative. [UNIVCHARDET] NOTE: Until then, input that is not valid UTF-8 falls back to windows-1252 below.
    if (!Utf8View(StringView(input)).validate()) {
        // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
        return "windows-1252";
    }

-    // NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
-    //       "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
+    // 9. Otherwise, return an implementation-defined or user-specified default character encoding, with the confidence tentative.
+    //    In controlled environments or in environments where the encoding of documents can be prescribed (for example, for user agents intended for dedicated use in new
+    //    networks), the comprehensive UTF-8 encoding is suggested.
    return "UTF-8";
 }

--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.h
@ -19,6 +19,7 @@ bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& positi
 Optional<StringView> extract_character_encoding_from_meta_element(ByteString const&);
 JS::GCPtr<DOM::Attr> prescan_get_attribute(DOM::Document&, ByteBuffer const& input, size_t& position);
 Optional<ByteString> run_prescan_byte_stream_algorithm(DOM::Document&, ByteBuffer const& input);
-ByteString run_encoding_sniffing_algorithm(DOM::Document&, ByteBuffer const& input);
+Optional<ByteString> run_bom_sniff(ByteBuffer const& input);
+ByteString run_encoding_sniffing_algorithm(DOM::Document&, ByteBuffer const& input, Optional<MimeSniff::MimeType> maybe_mime_type = {});

 }