From 5d1469114996d2b9bd242d3459ceb9798f700d87 Mon Sep 17 00:00:00 2001 From: Kemal Zebari Date: Thu, 7 Dec 2023 21:27:25 -0800 Subject: [PATCH] LibWeb: Add rules for distinguishing if a resource is text or binary Resolves a FIXME in MimeSniff::Resource allowing us to determine the computed MIME type given supplied types that are used in older versions of Apache that need special handling. --- Tests/LibWeb/TestMimeSniff.cpp | 62 ++++++++++++++++--- .../Libraries/LibWeb/MimeSniff/Resource.cpp | 50 +++++++++++++-- .../Libraries/LibWeb/MimeSniff/Resource.h | 3 + 3 files changed, 102 insertions(+), 13 deletions(-) diff --git a/Tests/LibWeb/TestMimeSniff.cpp b/Tests/LibWeb/TestMimeSniff.cpp index cc451693613..5331e8f80e1 100644 --- a/Tests/LibWeb/TestMimeSniff.cpp +++ b/Tests/LibWeb/TestMimeSniff.cpp @@ -31,11 +31,13 @@ TEST_CASE(determine_computed_mime_type_given_no_sniff_is_unset) auto computed_mime_type = MUST(Web::MimeSniff::Resource::sniff("\x00"sv.bytes(), Web::MimeSniff::SniffingConfiguration { .supplied_type = supplied_type })); EXPECT_EQ("application/octet-stream"sv, MUST(computed_mime_type.serialized())); +} - // Make sure we cover the XML code path in the mime type sniffing algorithm. 
+TEST_CASE(determine_computed_mime_type_given_xml_mime_type_as_supplied_type) +{ auto xml_mime_type = "application/rss+xml"sv; - supplied_type = MUST(Web::MimeSniff::MimeType::parse(xml_mime_type)).release_value(); - computed_mime_type = MUST(Web::MimeSniff::Resource::sniff("\x00"sv.bytes(), Web::MimeSniff::SniffingConfiguration { .supplied_type = supplied_type })); + auto supplied_type = MUST(Web::MimeSniff::MimeType::parse(xml_mime_type)).release_value(); + auto computed_mime_type = MUST(Web::MimeSniff::Resource::sniff("\x00"sv.bytes(), Web::MimeSniff::SniffingConfiguration { .supplied_type = supplied_type })); EXPECT_EQ(xml_mime_type, MUST(computed_mime_type.serialized())); } @@ -60,6 +62,53 @@ static void set_audio_or_video_type_mappings(HashMap>& mime_type_to_headers_map) +{ + mime_type_to_headers_map.set("text/plain"sv, { + "\xFE\xFF\x00\x00"sv, + "\xFF\xFE\x00\x00"sv, + "\xEF\xBB\xBF\x00"sv, + "Hello world!"sv, + }); +} + +TEST_CASE(determine_computed_mime_type_given_supplied_type_that_is_an_apache_bug_mime_type) +{ + Vector apache_bug_mime_types = { + "text/plain"sv, + "text/plain; charset=ISO-8859-1"sv, + "text/plain; charset=iso-8859-1"sv, + "text/plain; charset=UTF-8"sv + }; + + // Cover all Apache bug MIME types: each one, supplied over HTTP with purely textual content, must be computed as "text/plain". + for (auto const& apache_bug_mime_type : apache_bug_mime_types) { + auto supplied_type = MUST(Web::MimeSniff::MimeType::parse(apache_bug_mime_type)).release_value(); + auto computed_mime_type = MUST(Web::MimeSniff::Resource::sniff("Hello world!"sv.bytes(), + Web::MimeSniff::SniffingConfiguration { .scheme = "http"sv, .supplied_type = supplied_type })); + + EXPECT_EQ("text/plain"sv, MUST(computed_mime_type.serialized())); + } + + // Cover all code paths in "rules for distinguishing if a resource is text or binary": a lone 0x00 header must be computed as "application/octet-stream", while BOM-prefixed headers (even when followed by 0x00 bytes) and plain text must be computed as "text/plain". 
+ HashMap> mime_type_to_headers_map; + mime_type_to_headers_map.set("application/octet-stream"sv, { "\x00"sv }); + + set_text_plain_type_mappings(mime_type_to_headers_map); + + auto supplied_type = MUST(Web::MimeSniff::MimeType::create("text"_string, "plain"_string)); + for (auto const& mime_type_to_headers : mime_type_to_headers_map) { + auto mime_type = mime_type_to_headers.key; + + for (auto const& header : mime_type_to_headers.value) { + auto computed_mime_type = MUST(Web::MimeSniff::Resource::sniff(header.bytes(), + Web::MimeSniff::SniffingConfiguration { .scheme = "http"sv, .supplied_type = supplied_type })); + + EXPECT_EQ(mime_type, MUST(computed_mime_type.serialized())); + } + } +} + TEST_CASE(determine_computed_mime_type_in_both_none_and_browsing_sniffing_context) { HashMap> mime_type_to_headers_map; @@ -87,13 +136,8 @@ TEST_CASE(determine_computed_mime_type_in_both_none_and_browsing_sniffing_contex mime_type_to_headers_map.set("text/xml"sv, { " Resource::supplied_mime_type_detection_algorithm(StringView scheme // NOTE: Non-standard but this algorithm expects the caller to handle step 2.1.1. if (supplied_type.has_value()) { if (Fetch::Infrastructure::is_http_or_https_scheme(scheme)) { + // NOTE: The spec expects a space between the semicolon and the start of the charset parameter. However, we will lose this + // space because MimeType::parse() ignores any spaces found there. static Array constexpr apache_bug_mime_types = { "text/plain"sv, - "text/plain; charset=ISO-8859-1"sv, - "text/plain; charset=iso-8859-1"sv, - "text/plain; charset=UTF-8"sv + "text/plain;charset=ISO-8859-1"sv, + "text/plain;charset=iso-8859-1"sv, + "text/plain;charset=UTF-8"sv }; auto serialized_supplied_type = TRY(supplied_type->serialized()); @@ -517,7 +519,7 @@ ErrorOr Resource::mime_type_sniffing_algorithm() // 3. If the check-for-apache-bug flag is set, execute the rules for distinguishing // if a resource is text or binary and abort these steps. 
if (m_check_for_apache_bug_flag) { - // FIXME: Execute the rules for distinguishing if a resource is text or binary and abort these steps. + TRY(rules_for_distinguishing_if_a_resource_is_text_or_binary()); return {}; } @@ -564,6 +566,46 @@ ErrorOr Resource::mime_type_sniffing_algorithm() return {}; } +// https://mimesniff.spec.whatwg.org/#sniffing-a-mislabeled-binary-resource +ErrorOr Resource::rules_for_distinguishing_if_a_resource_is_text_or_binary() +{ + // 1. Let length be the number of bytes in the resource header. + auto length = m_resource_header.size(); + + // 2. If length is greater than or equal to 2 and the first 2 bytes of the + // resource header are equal to 0xFE 0xFF (UTF-16BE BOM) or 0xFF 0xFE (UTF-16LE BOM), the computed MIME type is "text/plain". + // Abort these steps. + auto resource_header_span = m_resource_header.span(); + auto utf_16_be_bom = "\xFE\xFF"sv.bytes(); + auto utf_16_le_bom = "\xFF\xFE"sv.bytes(); + if (length >= 2 + && (resource_header_span.starts_with(utf_16_be_bom) + || resource_header_span.starts_with(utf_16_le_bom))) { + m_computed_mime_type = TRY(MimeType::create("text"_string, "plain"_string)); + return {}; + } + + // 3. If length is greater than or equal to 3 and the first 3 bytes of the resource header are equal to 0xEF 0xBB 0xBF (UTF-8 BOM), + // the computed MIME type is "text/plain". + // Abort these steps. + auto utf_8_bom = "\xEF\xBB\xBF"sv.bytes(); + if (length >= 3 && resource_header_span.starts_with(utf_8_bom)) { + m_computed_mime_type = TRY(MimeType::create("text"_string, "plain"_string)); + return {}; + } + + // 4. If the resource header contains no binary data bytes, the computed MIME type is "text/plain". + // Abort these steps. + if (!any_of(resource_header(), is_binary_data_byte)) { + m_computed_mime_type = TRY(MimeType::create("text"_string, "plain"_string)); + return {}; + } + + // 5. The computed MIME type is "application/octet-stream". + // NOTE: Nothing is assigned here because the computed MIME type already defaults to "application/octet-stream". 
+ return {}; +} + // https://mimesniff.spec.whatwg.org/#context-specific-sniffing-algorithm ErrorOr Resource::context_specific_sniffing_algorithm(SniffingContext sniffing_context) { diff --git a/Userland/Libraries/LibWeb/MimeSniff/Resource.h b/Userland/Libraries/LibWeb/MimeSniff/Resource.h index 3c3a367d8c4..c8a605ac354 100644 --- a/Userland/Libraries/LibWeb/MimeSniff/Resource.h +++ b/Userland/Libraries/LibWeb/MimeSniff/Resource.h @@ -42,6 +42,9 @@ private: void read_the_resource_header(ReadonlyBytes data); ErrorOr supplied_mime_type_detection_algorithm(StringView scheme, Optional supplied_type); ErrorOr mime_type_sniffing_algorithm(); + + ErrorOr rules_for_distinguishing_if_a_resource_is_text_or_binary(); + ErrorOr context_specific_sniffing_algorithm(SniffingContext sniffing_context); ErrorOr rules_for_sniffing_images_specifically(); ErrorOr rules_for_sniffing_audio_or_video_specifically();