serenity/AK/URL.cpp
Andreas Kling 6c51ba27a2 AK: Make URL percent encoding faster by exploiting ASCII knowledge
Once we know that a code point must be a valid ASCII character,
we now cast it to `char` and avoid the expensive generic
StringView::contains(u32 code_point) checks.

This dramatically speeds up URL parsing.
2023-12-30 13:49:50 +01:00

583 lines
21 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2018-2020, Andreas Kling <kling@serenityos.org>
* Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Base64.h>
#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/LexicalPath.h>
#include <AK/StringBuilder.h>
#include <AK/URL.h>
#include <AK/URLParser.h>
#include <AK/Utf8View.h>
namespace AK {
// FIXME: It could make sense to force users of URL to use URLParser::basic_parse() explicitly instead of using a constructor.
// Builds a URL by delegating to URLParser::basic_parse(). When URL_PARSER_DEBUG is enabled,
// the outcome of the parse (the serialized URL, or the fact that it is invalid) is logged.
URL::URL(StringView string)
    : URL(URLParser::basic_parse(string))
{
    if constexpr (URL_PARSER_DEBUG) {
        if (!m_valid) {
            dbgln("URL constructor: Parsed URL to be invalid.");
            return;
        }
        dbgln("URL constructor: Parsed URL to be '{}'.", serialize());
    }
}
// Parses `relative_url` with this URL as the base URL.
// Returns a default-constructed URL when this URL is not valid.
URL URL::complete_url(StringView relative_url) const
{
    if (is_valid())
        return URLParser::basic_parse(relative_url, *this);
    return {};
}
// Returns the percent-decoded username; any error from String::from_byte_string() is passed to the caller.
ErrorOr<String> URL::username() const
{
    auto decoded_username = percent_decode(m_username);
    return String::from_byte_string(decoded_username);
}
// Returns the percent-decoded password; any error from String::from_byte_string() is passed to the caller.
ErrorOr<String> URL::password() const
{
    auto decoded_password = percent_decode(m_password);
    return String::from_byte_string(decoded_password);
}
// Returns the percent-decoded path segment at `index`.
// `index` must be smaller than path_segment_count(); this is enforced with VERIFY.
ByteString URL::path_segment_at_index(size_t index) const
{
    VERIFY(index < path_segment_count());
    auto const& encoded_segment = m_paths[index];
    return percent_decode(encoded_segment);
}
// Returns the percent-decoded last path segment, or an empty ByteString when the URL is
// invalid or has no path segments.
ByteString URL::basename() const
{
    if (!m_valid || m_paths.is_empty())
        return {};
    return percent_decode(m_paths.last());
}
// Replaces the scheme and refreshes the cached validity flag.
void URL::set_scheme(String scheme)
{
    // Take ownership of the caller's string instead of copying it.
    m_scheme = move(scheme);

    // Validity depends on the scheme, so it has to be recomputed.
    m_valid = compute_validity();
}
// https://url.spec.whatwg.org/#set-the-username
ErrorOr<void> URL::set_username(StringView username)
{
    // To set the username given a url and username, set url's username to the result of running
    // UTF-8 percent-encode on username using the userinfo percent-encode set.
    auto encoded_username = percent_encode(username, PercentEncodeSet::Userinfo);
    m_username = TRY(String::from_byte_string(encoded_username));

    m_valid = compute_validity();
    return {};
}
// https://url.spec.whatwg.org/#set-the-password
ErrorOr<void> URL::set_password(StringView password)
{
    // To set the password given a url and password, set url's password to the result of running
    // UTF-8 percent-encode on password using the userinfo percent-encode set.
    auto encoded_password = percent_encode(password, PercentEncodeSet::Userinfo);
    m_password = TRY(String::from_byte_string(encoded_password));

    m_valid = compute_validity();
    return {};
}
// Replaces the host and refreshes the cached validity flag.
void URL::set_host(Host host)
{
    // Take ownership of the caller's host instead of copying it.
    m_host = move(host);

    // Validity depends on the host (see compute_validity()), so it has to be recomputed.
    m_valid = compute_validity();
}
// https://url.spec.whatwg.org/#concept-host-serializer
// Serializes this URL's host by delegating to URLParser::serialize_host().
ErrorOr<String> URL::serialized_host() const
{
    return URLParser::serialize_host(m_host);
}
// Sets the port. A port equal to the default port of the current scheme is stored as "no port".
// NOTE: The validity flag is only refreshed when a non-default port is stored.
void URL::set_port(Optional<u16> port)
{
    bool const is_default_port = port == default_port_for_scheme(m_scheme);
    if (!is_default_port) {
        m_port = move(port);
        m_valid = compute_validity();
        return;
    }

    m_port = {};
}
// Replaces all path segments with `paths`, percent-encoding each one with the path
// percent-encode set, and refreshes the cached validity flag.
void URL::set_paths(Vector<ByteString> const& paths)
{
    m_paths.clear_with_capacity();
    // Reserve up front so that the unchecked appends below cannot overflow the buffer.
    m_paths.ensure_capacity(paths.size());

    for (auto const& raw_segment : paths) {
        auto encoded_segment = percent_encode(raw_segment, PercentEncodeSet::Path);
        m_paths.unchecked_append(String::from_byte_string(encoded_segment).release_value_but_fixme_should_propagate_errors());
    }

    m_valid = compute_validity();
}
// Appends one path segment, percent-encoded with the path percent-encode set.
// NOTE(review): Unlike set_paths(), this does not refresh m_valid - confirm that this is intended.
void URL::append_path(StringView path)
{
    auto encoded_segment = percent_encode(path, PercentEncodeSet::Path);
    m_paths.append(String::from_byte_string(encoded_segment).release_value_but_fixme_should_propagate_errors());
}
// https://url.spec.whatwg.org/#cannot-have-a-username-password-port
bool URL::cannot_have_a_username_or_password_or_port() const
{
// A URL cannot have a username/password/port if its host is null or the empty string, or its scheme is "file".
return m_host.has<Empty>() || m_host == String {} || m_scheme == "file"sv;
}
// FIXME: This is by no means complete.
// NOTE: This relies on some assumptions about how the spec-defined URL parser works that may turn out to be wrong.
// Heuristic validity check based on the scheme, the path segments and the host.
bool URL::compute_validity() const
{
    // A URL without a scheme is never valid.
    if (m_scheme.is_empty())
        return false;

    if (m_cannot_be_a_base_url) {
        // Such URLs must carry exactly one, non-empty path segment.
        if (m_paths.size() != 1 || m_paths[0].is_empty())
            return false;
    } else {
        // "about" and "mailto" URLs are only accepted in the cannot-be-a-base-URL form.
        if (m_scheme.is_one_of("about", "mailto"))
            return false;
        // NOTE: Maybe it is allowed to have a zero-segment path.
        if (m_paths.is_empty())
            return false;
    }

    // NOTE: A file URL's host should be the empty string for localhost, not null.
    bool const is_file_url_with_null_host = m_scheme == "file" && m_host.has<Empty>();
    return !is_file_url_with_null_host;
}
// https://url.spec.whatwg.org/#default-port
// Returns the default port for `scheme`, or an empty Optional when the scheme has none.
Optional<u16> URL::default_port_for_scheme(StringView scheme)
{
    // Spec defined mappings with port:
    if (scheme == "ftp")
        return 21;
    if (scheme.is_one_of("http", "ws"))
        return 80;
    if (scheme.is_one_of("https", "wss"))
        return 443;

    // NOTE: not in spec, but we support these too
    if (scheme == "gemini")
        return 1965;
    if (scheme == "irc")
        return 6667;
    if (scheme == "ircs")
        return 6697;

    return {};
}
// Builds a "file" URL from an absolute filesystem path, with an optional fragment and hostname.
// Returns a default-constructed URL when `path` is not absolute. The hostname "localhost" is
// stored as the empty host.
URL URL::create_with_file_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
{
    LexicalPath lexical_path(path);
    if (!lexical_path.is_absolute())
        return {};

    URL file_url;
    file_url.set_scheme("file"_string);

    if (hostname == "localhost")
        file_url.set_host(String {});
    else
        file_url.set_host(String::from_byte_string(hostname).release_value_but_fixme_should_propagate_errors());

    file_url.set_paths(lexical_path.parts());
    // LexicalPath::parts() carries no marker for a trailing slash, so re-add it explicitly.
    if (path.ends_with('/'))
        file_url.append_slash();

    if (!fragment.is_empty())
        file_url.set_fragment(String::from_byte_string(fragment).release_value_but_fixme_should_propagate_errors());

    return file_url;
}
// Builds a "help" URL from a path, with an optional fragment and hostname.
// The hostname "localhost" is stored as the empty host. No check is made that `path` is absolute.
URL URL::create_with_help_scheme(ByteString const& path, ByteString const& fragment, ByteString const& hostname)
{
    LexicalPath lexical_path(path);

    URL help_url;
    help_url.set_scheme("help"_string);

    if (hostname == "localhost")
        help_url.set_host(String {});
    else
        help_url.set_host(String::from_byte_string(hostname).release_value_but_fixme_should_propagate_errors());

    help_url.set_paths(lexical_path.parts());
    // LexicalPath::parts() carries no marker for a trailing slash, so re-add it explicitly.
    if (path.ends_with('/'))
        help_url.append_slash();

    if (!fragment.is_empty())
        help_url.set_fragment(String::from_byte_string(fragment).release_value_but_fixme_should_propagate_errors());

    return help_url;
}
// Interprets `url_or_path` as a URL first; if that does not yield a valid URL, treats it as a
// filesystem path, canonicalizes it and wraps it in a "file" URL.
URL URL::create_with_url_or_path(ByteString const& url_or_path)
{
    URL parsed_url = url_or_path;
    if (!parsed_url.is_valid()) {
        ByteString canonical_path = LexicalPath::canonicalized_path(url_or_path);
        return URL::create_with_file_scheme(canonical_path);
    }
    return parsed_url;
}
// Builds a "data" URL whose single path segment is "<mime_type>[;base64],<payload>".
// The segment is percent-encoded by set_paths().
URL URL::create_with_data(StringView mime_type, StringView payload, bool is_base64)
{
    URL data_url;
    data_url.set_cannot_be_a_base_url(true);
    data_url.set_scheme("data"_string);

    StringBuilder path_builder;
    path_builder.append(mime_type);
    if (is_base64)
        path_builder.append(";base64"sv);
    path_builder.append(',');
    path_builder.append(payload);

    data_url.set_paths({ path_builder.to_byte_string() });
    return data_url;
}
// https://url.spec.whatwg.org/#special-scheme
// Returns true when `scheme` is one of "ftp", "file", "http", "https", "ws" or "wss".
bool URL::is_special_scheme(StringView scheme)
{
    return scheme.is_one_of("ftp", "file", "http", "https", "ws", "wss");
}
// https://url.spec.whatwg.org/#url-path-serializer
// Serializes the path; each segment is percent-decoded first when `apply_percent_decoding` is Yes.
ByteString URL::serialize_path(ApplyPercentDecoding apply_percent_decoding) const
{
    // 1. If url has an opaque path, then return url's path.
    // FIXME: Reimplement this step once we modernize the URL implementation to meet the spec.
    if (cannot_be_a_base_url())
        return m_paths[0].to_byte_string();

    bool const should_decode = apply_percent_decoding == ApplyPercentDecoding::Yes;

    // 2. Let output be the empty string.
    StringBuilder output;

    // 3. For each segment of url's path: append U+002F (/) followed by segment to output.
    for (auto const& segment : m_paths) {
        output.append('/');
        if (should_decode)
            output.append(percent_decode(segment));
        else
            output.append(segment.to_byte_string());
    }

    // 4. Return output.
    return output.to_byte_string();
}
// https://url.spec.whatwg.org/#concept-url-serializer
// Serializes the whole URL; the fragment is left out when `exclude_fragment` is Yes.
ByteString URL::serialize(ExcludeFragment exclude_fragment) const
{
    bool const host_is_null = m_host.has<Empty>();

    // 1. Let output be url's scheme and U+003A (:) concatenated.
    StringBuilder output;
    output.append(m_scheme);
    output.append(':');

    // 2. If url's host is non-null:
    if (!host_is_null) {
        // 1. Append "//" to output.
        output.append("//"sv);

        // 2. If url includes credentials, then:
        if (includes_credentials()) {
            // 1. Append url's username to output.
            output.append(m_username);

            // 2. If url's password is not the empty string, then append U+003A (:), followed by url's password, to output.
            if (!m_password.is_empty()) {
                output.append(':');
                output.append(m_password);
            }

            // 3. Append U+0040 (@) to output.
            output.append('@');
        }

        // 3. Append url's host, serialized, to output.
        output.append(serialized_host().release_value_but_fixme_should_propagate_errors());

        // 4. If url's port is non-null, append U+003A (:) followed by url's port, serialized, to output.
        if (m_port.has_value())
            output.appendff(":{}", *m_port);
    }

    // 3. If url's host is null, url does not have an opaque path, url's path's size is greater than 1,
    //    and url's path[0] is the empty string, then append U+002F (/) followed by U+002E (.) to output.
    // 4. Append the result of URL path serializing url to output.
    // FIXME: Implement this closer to spec steps.
    if (cannot_be_a_base_url()) {
        output.append(m_paths[0]);
    } else {
        bool const needs_dot_prefix = host_is_null && m_paths.size() > 1 && m_paths[0].is_empty();
        if (needs_dot_prefix)
            output.append("/."sv);
        for (auto const& segment : m_paths) {
            output.append('/');
            output.append(segment);
        }
    }

    // 5. If url's query is non-null, append U+003F (?), followed by url's query, to output.
    if (m_query.has_value()) {
        output.append('?');
        output.append(*m_query);
    }

    // 6. If exclude fragment is false and url's fragment is non-null, then append U+0023 (#), followed by url's fragment, to output.
    bool const include_fragment = exclude_fragment == ExcludeFragment::No;
    if (include_fragment && m_fragment.has_value()) {
        output.append('#');
        output.append(*m_fragment);
    }

    // 7. Return output.
    return output.to_byte_string();
}
// https://url.spec.whatwg.org/#url-rendering
// NOTE: This does e.g. not display credentials.
// FIXME: Parts of the URL other than the host should have their sequences of percent-encoded bytes replaced with code points
//        resulting from percent-decoding those sequences converted to bytes, unless that renders those sequences invisible.
// Serializes scheme, host, port, path, query and fragment. The URL must be valid (enforced with VERIFY).
ByteString URL::serialize_for_display() const
{
    VERIFY(m_valid);

    bool const host_is_null = m_host.has<Empty>();

    StringBuilder display;
    display.append(m_scheme);
    display.append(':');

    // Authority part: host and optional port, without the credentials.
    if (!host_is_null) {
        display.append("//"sv);
        display.append(serialized_host().release_value_but_fixme_should_propagate_errors());
        if (m_port.has_value())
            display.appendff(":{}", *m_port);
    }

    // Path: either the single opaque segment, or every segment prefixed with '/'.
    if (cannot_be_a_base_url()) {
        display.append(m_paths[0]);
    } else {
        bool const needs_dot_prefix = host_is_null && m_paths.size() > 1 && m_paths[0].is_empty();
        if (needs_dot_prefix)
            display.append("/."sv);
        for (auto const& segment : m_paths) {
            display.append('/');
            display.append(segment);
        }
    }

    if (m_query.has_value()) {
        display.append('?');
        display.append(*m_query);
    }

    if (m_fragment.has_value()) {
        display.append('#');
        display.append(*m_fragment);
    }

    return display.to_byte_string();
}
// Returns serialize() converted to a String; any error from String::from_byte_string() is passed to the caller.
ErrorOr<String> URL::to_string() const
{
    auto serialized_url = serialize();
    return String::from_byte_string(serialized_url);
}
// https://html.spec.whatwg.org/multipage/origin.html#ascii-serialisation-of-an-origin
// https://url.spec.whatwg.org/#concept-url-origin
// Serializes this URL's origin, or returns "null" where an opaque origin applies.
// The URL must be valid (enforced with VERIFY).
ByteString URL::serialize_origin() const
{
    VERIFY(m_valid);

    if (m_scheme == "blob"sv) {
        // TODO: 1. If URL's blob URL entry is non-null, then return URL's blob URL entry's environment's origin.

        // 2. Let url be the result of parsing URL's path[0].
        VERIFY(!m_paths.is_empty());
        URL inner_url = m_paths[0];

        // 3. Return a new opaque origin, if url is failure, and url's origin otherwise.
        if (inner_url.is_valid())
            return inner_url.serialize_origin();
        return "null";
    }

    // file: "Unfortunate as it is, this is left as an exercise to the reader. When in doubt, return a new opaque origin."
    if (!m_scheme.is_one_of("ftp"sv, "http"sv, "https"sv, "ws"sv, "wss"sv))
        return "null";

    // Tuple origin: scheme "://" host [":" port]
    StringBuilder origin;
    origin.append(m_scheme);
    origin.append("://"sv);
    origin.append(serialized_host().release_value_but_fixme_should_propagate_errors());
    if (m_port.has_value())
        origin.appendff(":{}", *m_port);
    return origin.to_byte_string();
}
// Compares two URLs by their serialization, optionally ignoring the fragment.
// An object always equals itself; otherwise an invalid URL never equals anything.
bool URL::equals(URL const& other, ExcludeFragment exclude_fragments) const
{
    if (this == &other)
        return true;

    bool const both_valid = m_valid && other.m_valid;
    return both_valid && serialize(exclude_fragments) == other.serialize(exclude_fragments);
}
// https://fetch.spec.whatwg.org/#data-url-processor
// Splits a data: URL into its MIME type and its decoded body.
// Returns an error when the URL contains no comma; errors from decode_base64() and
// String::from_utf8() are propagated via TRY. The scheme must be "data" (enforced with VERIFY).
ErrorOr<URL::DataURL> URL::process_data_url() const
{
// 1. Assert: dataURL's scheme is "data".
VERIFY(scheme() == "data");
// 2. Let input be the result of running the URL serializer on dataURL with exclude fragment set to true.
auto input = serialize(URL::ExcludeFragment::Yes);
// 3. Remove the leading "data:" from input.
input = input.substring("data:"sv.length());
// 4. Let position point at the start of input.
// 5. Let mimeType be the result of collecting a sequence of code points that are not equal to U+002C (,), given position.
// NOTE: mime_type is a view into `input`; without a comma it spans the whole input.
auto position = input.find(',');
auto mime_type = input.substring_view(0, position.value_or(input.length()));
// 6. Strip leading and trailing ASCII whitespace from mimeType.
mime_type = mime_type.trim_whitespace(TrimMode::Both);
// 7. If position is past the end of input, then return failure.
if (!position.has_value())
return Error::from_string_literal("Missing a comma character");
// 8. Advance position by 1.
position = position.value() + 1;
// 9. Let encodedBody be the remainder of input.
auto encoded_body = input.substring_view(position.value());
// 10. Let body be the percent-decoding of encodedBody.
auto body = URL::percent_decode(encoded_body).to_byte_buffer();
// 11. If mimeType ends with U+003B (;), followed by zero or more U+0020 SPACE, followed by an ASCII case-insensitive match for "base64", then:
if (mime_type.ends_with("base64"sv, CaseSensitivity::CaseInsensitive)) {
// Drop the 6 characters of "base64", then any spaces in front of it, and check for the ';'.
auto trimmed_substring_view = mime_type.substring_view(0, mime_type.length() - 6);
trimmed_substring_view = trimmed_substring_view.trim(" "sv, TrimMode::Right);
if (trimmed_substring_view.ends_with(';')) {
// 1. Let stringBody be the isomorphic decode of body.
auto string_body = StringView(body);
// 2. Set body to the forgiving-base64 decode of stringBody.
// FIXME: Check if it's really forgiving.
// 3. If body is failure, then return failure.
body = TRY(decode_base64(string_body));
// 4. Remove the last 6 code points from mimeType.
// 5. Remove trailing U+0020 SPACE code points from mimeType, if any.
// 6. Remove the last U+003B (;) from mimeType.
mime_type = trimmed_substring_view.substring_view(0, trimmed_substring_view.length() - 1);
}
}
// 12. If mimeType starts with ";", then prepend "text/plain" to mimeType.
// NOTE: The builder is declared outside the if statement because mime_type may end up viewing its buffer.
StringBuilder builder;
if (mime_type.starts_with(';')) {
builder.append("text/plain"sv);
builder.append(mime_type);
mime_type = builder.string_view();
}
// FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
// FIXME: 13. Let mimeTypeRecord be the result of parsing mimeType.
auto mime_type_record = mime_type.trim("\n\r\t "sv, TrimMode::Both);
// 14. If mimeTypeRecord is failure, then set mimeTypeRecord to text/plain;charset=US-ASCII.
// NOTE: Since the MIME type is not parsed yet, "failure" is approximated by an empty string.
if (mime_type_record.is_empty())
mime_type_record = "text/plain;charset=US-ASCII"sv;
// 15. Return a new data: URL struct whose MIME type is mimeTypeRecord and body is body.
return URL::DataURL { TRY(String::from_utf8(mime_type_record)), body };
}
// Appends `code_point` to `builder` as the percent-encoded bytes of its UTF-8 encoding
// ("%XX" per byte, upper-case hex). Code points above U+10FFFF are a programming error.
void URL::append_percent_encoded(StringBuilder& builder, u32 code_point)
{
    // One byte: 0xxxxxxx
    if (code_point <= 0x7f) {
        builder.appendff("%{:02X}", code_point);
        return;
    }

    // Two bytes: 110xxxxx 10xxxxxx
    if (code_point <= 0x07ff) {
        auto const first = ((code_point >> 6) & 0x1f) | 0xc0;
        auto const second = (code_point & 0x3f) | 0x80;
        builder.appendff("%{:02X}%{:02X}", first, second);
        return;
    }

    // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    if (code_point <= 0xffff) {
        auto const first = ((code_point >> 12) & 0x0f) | 0xe0;
        auto const second = ((code_point >> 6) & 0x3f) | 0x80;
        auto const third = (code_point & 0x3f) | 0x80;
        builder.appendff("%{:02X}%{:02X}%{:02X}", first, second, third);
        return;
    }

    // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (code_point <= 0x10ffff) {
        auto const first = ((code_point >> 18) & 0x07) | 0xf0;
        auto const second = ((code_point >> 12) & 0x3f) | 0x80;
        auto const third = ((code_point >> 6) & 0x3f) | 0x80;
        auto const fourth = (code_point & 0x3f) | 0x80;
        builder.appendff("%{:02X}%{:02X}%{:02X}%{:02X}", first, second, third, fourth);
        return;
    }

    VERIFY_NOT_REACHED();
}
// https://url.spec.whatwg.org/#c0-control-percent-encode-set
// Returns whether `code_point` belongs to the given percent-encode set. Every set other than
// C0Control and EncodeURI is defined as a parent set plus a few extra ASCII characters.
bool URL::code_point_is_in_percent_encode_set(u32 code_point, URL::PercentEncodeSet set)
{
    auto in_parent_set = [&](URL::PercentEncodeSet parent_set) {
        return code_point_is_in_percent_encode_set(code_point, parent_set);
    };

    // NOTE: This must only be evaluated after a parent set check has failed. Every parent chain ends in
    //       the C0Control set, so by then the code point is known to be an ASCII character in the range
    //       0x20..0x7E and can safely be cast to char. That avoids the slower generic code point search.
    auto is_any_of = [&](StringView ascii_characters) {
        return ascii_characters.contains(static_cast<char>(code_point));
    };

    switch (set) {
    case URL::PercentEncodeSet::C0Control:
        return code_point < 0x20 || code_point > 0x7E;
    case URL::PercentEncodeSet::Fragment:
        return in_parent_set(URL::PercentEncodeSet::C0Control) || is_any_of(" \"<>`"sv);
    case URL::PercentEncodeSet::Query:
        return in_parent_set(URL::PercentEncodeSet::C0Control) || is_any_of(" \"#<>"sv);
    case URL::PercentEncodeSet::SpecialQuery:
        return in_parent_set(URL::PercentEncodeSet::Query) || code_point == '\'';
    case URL::PercentEncodeSet::Path:
        return in_parent_set(URL::PercentEncodeSet::Query) || is_any_of("?`{}"sv);
    case URL::PercentEncodeSet::Userinfo:
        return in_parent_set(URL::PercentEncodeSet::Path) || is_any_of("/:;=@[\\]^|"sv);
    case URL::PercentEncodeSet::Component:
        return in_parent_set(URL::PercentEncodeSet::Userinfo) || is_any_of("$%&+,"sv);
    case URL::PercentEncodeSet::ApplicationXWWWFormUrlencoded:
        return in_parent_set(URL::PercentEncodeSet::Component) || is_any_of("!'()~"sv);
    case URL::PercentEncodeSet::EncodeURI:
        // NOTE: This is the same percent encode set that JS encodeURI() uses.
        // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURI
        if (code_point > 0x7E)
            return true;
        if (is_ascii_alphanumeric(code_point))
            return false;
        return !is_any_of(";,/?:@&=+$-_.!~*'()#"sv);
    default:
        VERIFY_NOT_REACHED();
    }
}
// Appends `code_point` to `builder`, percent-encoded if it is in `set` and verbatim otherwise.
void URL::append_percent_encoded_if_necessary(StringBuilder& builder, u32 code_point, URL::PercentEncodeSet set)
{
    if (!code_point_is_in_percent_encode_set(code_point, set)) {
        builder.append_code_point(code_point);
        return;
    }
    append_percent_encoded(builder, code_point);
}
// Percent-encodes `input` code point by code point using `set`.
// When `space_as_plus` is Yes, a space is emitted as '+' instead of going through the set check.
ByteString URL::percent_encode(StringView input, URL::PercentEncodeSet set, SpaceAsPlus space_as_plus)
{
    bool const encode_space_as_plus = space_as_plus == SpaceAsPlus::Yes;

    StringBuilder encoded;
    for (auto code_point : Utf8View(input)) {
        if (encode_space_as_plus && code_point == ' ') {
            encoded.append('+');
            continue;
        }
        append_percent_encoded_if_necessary(encoded, code_point, set);
    }
    return encoded.to_byte_string();
}
// https://url.spec.whatwg.org/#percent-decode
// Replaces every "%XX" sequence (XX being two ASCII hex digits) in `input` with the byte it
// denotes and copies all other bytes through unchanged. A '%' that is not followed by two hex
// digits is copied verbatim.
//
// The input is processed byte by byte, as the specification describes. Walking it as UTF-8 code
// points instead would re-encode the input, so bytes that are not valid UTF-8 would be altered
// whenever the input contains a '%' - while the fast path below returns them untouched.
ByteString URL::percent_decode(StringView input)
{
    // Fast path: nothing to decode.
    if (!input.contains('%'))
        return input;

    StringBuilder builder;
    auto const length = input.length();
    for (size_t i = 0; i < length; ++i) {
        char const ch = input[i];

        // Go through u8 so that bytes >= 0x80 are not sign-extended when widened for the hex digit check.
        bool const is_escape_sequence = ch == '%'
            && i + 2 < length
            && is_ascii_hex_digit(static_cast<u8>(input[i + 1]))
            && is_ascii_hex_digit(static_cast<u8>(input[i + 2]));

        if (!is_escape_sequence) {
            builder.append(ch);
            continue;
        }

        auto const high_nibble = parse_ascii_hex_digit(static_cast<u8>(input[i + 1]));
        auto const low_nibble = parse_ascii_hex_digit(static_cast<u8>(input[i + 2]));
        auto const byte = static_cast<u8>((high_nibble << 4) | low_nibble);
        builder.append(static_cast<char>(byte));

        // Skip the two hex digits that were just consumed.
        i += 2;
    }
    return builder.to_byte_string();
}
}