LibWeb: Add rules for distinguishing if a resource is text or binary

Resolves a FIXME in MimeSniff::Resource, allowing us to determine the
computed MIME type when the supplied type is one of those sent by older
versions of Apache, which need special handling.
This commit is contained in:
Kemal Zebari 2023-12-07 21:27:25 -08:00 committed by Andrew Kaster
parent 0b7148e2a6
commit 5d14691149
3 changed files with 102 additions and 13 deletions

View file

@ -31,11 +31,13 @@ TEST_CASE(determine_computed_mime_type_given_no_sniff_is_unset)
auto computed_mime_type = MUST(Web::MimeSniff::Resource::sniff("\x00"sv.bytes(), Web::MimeSniff::SniffingConfiguration { .supplied_type = supplied_type }));
EXPECT_EQ("application/octet-stream"sv, MUST(computed_mime_type.serialized()));
}
// Make sure we cover the XML code path in the mime type sniffing algorithm.
// An XML MIME type passed as the supplied type is expected to come back unchanged as the
// computed MIME type, even though the sniffed data is a single 0x00 byte.
TEST_CASE(determine_computed_mime_type_given_xml_mime_type_as_supplied_type)
{
    auto xml_mime_type = "application/rss+xml"sv;

    // Both locals are declared at their first (and only) use in this test case.
    auto supplied_type = MUST(Web::MimeSniff::MimeType::parse(xml_mime_type)).release_value();
    auto computed_mime_type = MUST(Web::MimeSniff::Resource::sniff("\x00"sv.bytes(), Web::MimeSniff::SniffingConfiguration { .supplied_type = supplied_type }));

    EXPECT_EQ(xml_mime_type, MUST(computed_mime_type.serialized()));
}
@ -60,6 +62,53 @@ static void set_audio_or_video_type_mappings(HashMap<StringView, Vector<StringVi
mime_type_to_headers_map.set("audio/wave"sv, { "RIFF\x00\x00\x00\x00WAVE"sv });
}
// Registers, under the "text/plain" key, the resource headers that the tests expect to be
// sniffed as plain text.
static void set_text_plain_type_mappings(HashMap<StringView, Vector<StringView>>& mime_type_to_headers_map)
{
    // The first three headers start with a byte order mark and are padded with 0x00 bytes,
    // presumably so that a BOM match is shown to take priority over the binary-byte scan.
    Vector<StringView> text_plain_headers = {
        "\xFE\xFF\x00\x00"sv, // UTF-16BE BOM
        "\xFF\xFE\x00\x00"sv, // UTF-16LE BOM
        "\xEF\xBB\xBF\x00"sv, // UTF-8 BOM
        "Hello world!"sv,     // Plain text without a BOM
    };

    mime_type_to_headers_map.set("text/plain"sv, text_plain_headers);
}
// Exercises sniffing over HTTP when the supplied type is one of the "Apache bug" MIME types,
// which routes the resource through the rules for distinguishing if a resource is text or binary.
TEST_CASE(determine_computed_mime_type_given_supplied_type_that_is_an_apache_bug_mime_type)
{
    // Sniffs `header` over the "http" scheme with the given supplied type and hands back the
    // serialized computed MIME type.
    auto sniff_over_http = [](StringView header, Web::MimeSniff::MimeType const& supplied_type) {
        auto computed = MUST(Web::MimeSniff::Resource::sniff(header.bytes(),
            Web::MimeSniff::SniffingConfiguration { .scheme = "http"sv, .supplied_type = supplied_type }));
        return MUST(computed.serialized());
    };

    // Cover all Apache bug MIME types.
    Vector<StringView> supplied_type_strings = {
        "text/plain"sv,
        "text/plain; charset=ISO-8859-1"sv,
        "text/plain; charset=iso-8859-1"sv,
        "text/plain; charset=UTF-8"sv
    };

    for (auto const& supplied_type_string : supplied_type_strings) {
        auto parsed_supplied_type = MUST(Web::MimeSniff::MimeType::parse(supplied_type_string)).release_value();
        EXPECT_EQ("text/plain"sv, sniff_over_http("Hello world!"sv, parsed_supplied_type));
    }

    // Cover all code paths in "rules for distinguishing if a resource is text or binary".
    auto text_plain_supplied_type = MUST(Web::MimeSniff::MimeType::create("text"_string, "plain"_string));

    // Maps each expected computed MIME type to the headers that should produce it.
    HashMap<StringView, Vector<StringView>> expected_type_to_headers;
    expected_type_to_headers.set("application/octet-stream"sv, { "\x00"sv });
    set_text_plain_type_mappings(expected_type_to_headers);

    for (auto const& entry : expected_type_to_headers) {
        auto expected_mime_type = entry.key;

        for (auto const& header : entry.value)
            EXPECT_EQ(expected_mime_type, sniff_over_http(header, text_plain_supplied_type));
    }
}
TEST_CASE(determine_computed_mime_type_in_both_none_and_browsing_sniffing_context)
{
HashMap<StringView, Vector<StringView>> mime_type_to_headers_map;
@ -87,13 +136,8 @@ TEST_CASE(determine_computed_mime_type_in_both_none_and_browsing_sniffing_contex
mime_type_to_headers_map.set("text/xml"sv, { "<?xml"sv });
mime_type_to_headers_map.set("application/pdf"sv, { "%PDF-"sv });
mime_type_to_headers_map.set("application/postscript"sv, { "%!PS-Adobe-"sv });
mime_type_to_headers_map.set("text/plain"sv, {
"\xFE\xFF\x00\x00"sv,
"\xFF\xFE\x00\x00"sv,
"\xEF\xBB\xBF\x00"sv,
"Hello world!"sv,
});
set_text_plain_type_mappings(mime_type_to_headers_map);
set_image_type_mappings(mime_type_to_headers_map);
set_audio_or_video_type_mappings(mime_type_to_headers_map);

View file

@ -446,11 +446,13 @@ ErrorOr<void> Resource::supplied_mime_type_detection_algorithm(StringView scheme
// NOTE: Non-standard but this algorithm expects the caller to handle step 2.1.1.
if (supplied_type.has_value()) {
if (Fetch::Infrastructure::is_http_or_https_scheme(scheme)) {
// NOTE: The spec expects a space between the semicolon and the start of the charset parameter. However, we will lose this
// space because MimeType::parse() ignores any spaces found there.
static Array<StringView, 4> constexpr apache_bug_mime_types = {
"text/plain"sv,
"text/plain; charset=ISO-8859-1"sv,
"text/plain; charset=iso-8859-1"sv,
"text/plain; charset=UTF-8"sv
"text/plain;charset=ISO-8859-1"sv,
"text/plain;charset=iso-8859-1"sv,
"text/plain;charset=UTF-8"sv
};
auto serialized_supplied_type = TRY(supplied_type->serialized());
@ -517,7 +519,7 @@ ErrorOr<void> Resource::mime_type_sniffing_algorithm()
// 3. If the check-for-apache-bug flag is set, execute the rules for distinguishing
// if a resource is text or binary and abort these steps.
if (m_check_for_apache_bug_flag) {
// FIXME: Execute the rules for distinguishing if a resource is text or binary and abort these steps.
TRY(rules_for_distinguishing_if_a_resource_is_text_or_binary());
return {};
}
@ -564,6 +566,46 @@ ErrorOr<void> Resource::mime_type_sniffing_algorithm()
return {};
}
// https://mimesniff.spec.whatwg.org/#sniffing-a-mislabeled-binary-resource
// Sets the computed MIME type to "text/plain" when the resource header starts with a UTF-16 or
// UTF-8 byte order mark, or contains no binary data bytes. Otherwise the computed MIME type is
// left untouched.
ErrorOr<void> Resource::rules_for_distinguishing_if_a_resource_is_text_or_binary()
{
    // Every "text" outcome below stores the same computed MIME type.
    auto mark_as_text_plain = [this]() -> ErrorOr<void> {
        m_computed_mime_type = TRY(MimeType::create("text"_string, "plain"_string));
        return {};
    };

    // 1. Let length be the number of bytes in the resource header.
    auto length = m_resource_header.size();

    auto header_bytes = m_resource_header.span();

    // True if the resource header is at least as long as `byte_pattern` and begins with it.
    auto header_begins_with = [&](StringView byte_pattern) {
        return length >= byte_pattern.length() && header_bytes.starts_with(byte_pattern.bytes());
    };

    // 2. If length is greater than or equal to 2 and the first 2 bytes of the
    //    resource header are equal to 0xFE 0xFF (UTF-16BE BOM) or 0xFF 0xFE (UTF-16LE BOM), the computed MIME type is "text/plain".
    //    Abort these steps.
    if (header_begins_with("\xFE\xFF"sv) || header_begins_with("\xFF\xFE"sv))
        return mark_as_text_plain();

    // 3. If length is greater than or equal to 3 and the first 3 bytes of the resource header are equal to 0xEF 0xBB 0xBF (UTF-8 BOM),
    //    the computed MIME type is "text/plain".
    //    Abort these steps.
    if (header_begins_with("\xEF\xBB\xBF"sv))
        return mark_as_text_plain();

    // 4. If the resource header contains no binary data bytes, the computed MIME type is "text/plain".
    //    Abort these steps.
    bool const header_has_binary_data = any_of(resource_header(), is_binary_data_byte);
    if (!header_has_binary_data)
        return mark_as_text_plain();

    // 5. The computed MIME type is "application/octet-stream".
    // NOTE: This is the default MIME type of the computed MIME type.
    return {};
}
// https://mimesniff.spec.whatwg.org/#context-specific-sniffing-algorithm
ErrorOr<void> Resource::context_specific_sniffing_algorithm(SniffingContext sniffing_context)
{

View file

@ -42,6 +42,9 @@ private:
void read_the_resource_header(ReadonlyBytes data);
ErrorOr<void> supplied_mime_type_detection_algorithm(StringView scheme, Optional<MimeType> supplied_type);
ErrorOr<void> mime_type_sniffing_algorithm();
// https://mimesniff.spec.whatwg.org/#sniffing-a-mislabeled-binary-resource
// Run by mime_type_sniffing_algorithm() when the check-for-apache-bug flag is set.
ErrorOr<void> rules_for_distinguishing_if_a_resource_is_text_or_binary();
ErrorOr<void> context_specific_sniffing_algorithm(SniffingContext sniffing_context);
ErrorOr<void> rules_for_sniffing_images_specifically();
ErrorOr<void> rules_for_sniffing_audio_or_video_specifically();