2021-05-25 22:13:15 +02:00
/*
* Copyright ( c ) 2021 , Max Wipfli < mail @ maxwipfli . ch >
*
* SPDX - License - Identifier : BSD - 2 - Clause
*/
2021-06-01 21:18:08 +02:00
# include <AK/CharacterTypes.h>
2021-05-25 22:13:15 +02:00
# include <AK/Debug.h>
2022-12-04 18:02:33 +00:00
# include <AK/DeprecatedString.h>
2021-05-25 22:13:15 +02:00
# include <AK/Optional.h>
# include <AK/SourceLocation.h>
# include <AK/StringBuilder.h>
# include <AK/StringUtils.h>
# include <AK/URLParser.h>
# include <AK/Utf8View.h>
namespace AK {
2021-06-03 12:43:08 +02:00
// NOTE: This is similar to the LibC macro EOF = -1.
// Sentinel stored in the parser's current code point once the input iterator is done.
// 0xFFFFFFFF is above U+10FFFF, the largest Unicode code point.
constexpr u32 end_of_file = 0xFFFFFFFF ;
2022-09-12 18:32:52 +02:00
// https://url.spec.whatwg.org/#url-code-points
// Returns true if code_point is a URL code point, i.e. one of:
//  * an ASCII alphanumeric,
//  * one of ! $ & ' ( ) * + , - . / : ; = ? @ _ ~
//  * a code point in the range U+00A0 to U+10FFFD (inclusive) that is neither a surrogate nor a noncharacter.
// Values outside the Unicode range (including the end_of_file sentinel) are never URL code points.
static bool is_url_code_point(u32 code_point)
{
    // ASCII: only alphanumerics and the explicitly listed punctuation qualify.
    if (code_point < 0x80) {
        // The narrowing cast is lossless because code_point is known to be below 0x80 here.
        return is_ascii_alphanumeric(code_point) || "!$&'()*+,-./:;=?@_~"sv.contains(static_cast<char>(code_point));
    }

    // U+0080 to U+009F (C1 controls) and everything above U+10FFFD are excluded.
    if (code_point < 0xA0 || code_point > 0x10FFFD)
        return false;

    // Surrogates: U+D800 to U+DFFF.
    if (code_point >= 0xD800 && code_point <= 0xDFFF)
        return false;

    // Noncharacters: U+FDD0 to U+FDEF, plus the last two code points of every plane (U+xxFFFE and U+xxFFFF).
    if (code_point >= 0xFDD0 && code_point <= 0xFDEF)
        return false;
    if ((code_point & 0xFFFE) == 0xFFFE)
        return false;

    return true;
}
2021-06-03 12:03:56 +02:00
// Logs that the parser hit a validation error. Nothing is printed unless URL_PARSER_DEBUG is enabled.
// location defaults to SourceLocation::current(); presumably this captures the caller's position so
// the log line identifies which check failed (confirm against AK/SourceLocation.h).
static void report_validation_error(SourceLocation const& location = SourceLocation::current())
{
    if constexpr (URL_PARSER_DEBUG)
        dbgln("URLParser::parse: Validation error! {}", location);
}
2022-12-04 18:02:33 +00:00
// https://url.spec.whatwg.org/#concept-opaque-host-parser
// Parses the host of a URL whose scheme is not special.
// Reports a validation error and yields an empty Optional if input contains a forbidden host
// code point other than '%'. Otherwise yields input percent-encoded with the C0 control
// percent-encode set.
static Optional<DeprecatedString> parse_opaque_host(StringView input)
{
    // Forbidden host code points, with U+0025 (%) deliberately left out.
    auto const forbidden_set = "\0\t\n\r #/:<>?@[\\]^|"sv;
    for (auto byte : input) {
        if (!forbidden_set.contains(byte))
            continue;
        report_validation_error();
        return {};
    }

    // FIXME: If input contains a code point that is not a URL code point and not U+0025 (%), validation error.
    // FIXME: If input contains a U+0025 (%) and the two code points following it are not ASCII hex digits, validation error.
    return URL::percent_encode(input, URL::PercentEncodeSet::C0Control);
}
2022-12-04 18:02:33 +00:00
// Placeholder for the IPv4 parser: currently hands the input back unchanged as an owned string.
static Optional<DeprecatedString> parse_ipv4_address(StringView input)
{
    // FIXME: Implement the correct IPv4 parser as specified by https://url.spec.whatwg.org/#concept-ipv4-parser.
    DeprecatedString unparsed_host = input;
    return unparsed_host;
}
// https://url.spec.whatwg.org/#concept-host-parser
// NOTE: This is a very bare-bones implementation.
2022-12-04 18:02:33 +00:00
static Optional < DeprecatedString > parse_host ( StringView input , bool is_not_special = false )
2021-05-25 22:13:15 +02:00
{
if ( input . starts_with ( ' [ ' ) ) {
if ( ! input . ends_with ( ' ] ' ) ) {
report_validation_error ( ) ;
return { } ;
}
// FIXME: Return the result of IPv6 parsing input with its leading U+005B ([) and trailing U+005D (]) removed.
TODO ( ) ;
}
if ( is_not_special )
return parse_opaque_host ( input ) ;
VERIFY ( ! input . is_empty ( ) ) ;
// FIXME: Let domain be the result of running UTF-8 decode without BOM on the percent-decoding of input.
auto domain = URL : : percent_decode ( input ) ;
// FIXME: Let asciiDomain be the result of running domain to ASCII on domain.
auto & ascii_domain = domain ;
2022-09-12 16:31:16 +02:00
auto forbidden_host_characters = " \0 \t \n \r #%/:<>?@[ \\ ]^| " sv ;
for ( auto character : forbidden_host_characters ) {
if ( ascii_domain . view ( ) . contains ( character ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
return { } ;
}
}
auto ipv4_host = parse_ipv4_address ( ascii_domain ) ;
return ipv4_host ;
}
2022-09-20 15:38:53 +02:00
// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
2021-11-11 00:55:02 +01:00
constexpr bool starts_with_windows_drive_letter ( StringView input )
2021-05-25 22:13:15 +02:00
{
if ( input . length ( ) < 2 )
return false ;
2022-09-20 15:38:53 +02:00
if ( ! is_ascii_alpha ( input [ 0 ] ) | | ! ( input [ 1 ] = = ' : ' | | input [ 1 ] = = ' | ' ) )
2021-05-25 22:13:15 +02:00
return false ;
if ( input . length ( ) = = 2 )
return true ;
return " / \\ ?# " sv . contains ( input [ 2 ] ) ;
}
2021-11-11 00:55:02 +01:00
// https://url.spec.whatwg.org/#windows-drive-letter
// True when input is exactly two bytes long: an ASCII alpha followed by ':' or '|'.
constexpr bool is_windows_drive_letter(StringView input)
{
    if (input.length() != 2)
        return false;
    if (!is_ascii_alpha(input[0]))
        return false;
    return input[1] == ':' || input[1] == '|';
}
2021-11-11 00:55:02 +01:00
// https://url.spec.whatwg.org/#normalized-windows-drive-letter
// True when input is exactly two bytes long: an ASCII alpha followed by ':'.
constexpr bool is_normalized_windows_drive_letter(StringView input)
{
    if (input.length() != 2)
        return false;
    if (!is_ascii_alpha(input[0]))
        return false;
    return input[1] == ':';
}
2021-11-11 00:55:02 +01:00
// https://url.spec.whatwg.org/#single-dot-path-segment
// True when input is "." or, compared ASCII case-insensitively, "%2e".
constexpr bool is_single_dot_path_segment(StringView input)
{
    if (input == "."sv)
        return true;
    return input.equals_ignoring_ascii_case("%2e"sv);
}
2021-11-11 00:55:02 +01:00
// https://url.spec.whatwg.org/#double-dot-path-segment
// True when input is ".." or, compared ASCII case-insensitively, one of ".%2e", "%2e." or "%2e%2e".
constexpr bool is_double_dot_path_segment(StringView input)
{
    if (input == ".."sv)
        return true;
    if (input.equals_ignoring_ascii_case(".%2e"sv))
        return true;
    if (input.equals_ignoring_ascii_case("%2e."sv))
        return true;
    return input.equals_ignoring_ascii_case("%2e%2e"sv);
}
2022-04-10 00:48:15 +02:00
// https://url.spec.whatwg.org/#string-percent-encode-after-encoding
2023-06-25 14:11:34 +12:00
DeprecatedString URLParser : : percent_encode_after_encoding ( StringView input , URL : : PercentEncodeSet percent_encode_set , bool space_as_plus )
2022-04-10 00:48:15 +02:00
{
// NOTE: This is written somewhat ad-hoc since we don't yet implement the Encoding spec.
StringBuilder output ;
// 3. For each byte of encodeOutput converted to a byte sequence:
for ( auto byte : input ) {
// 1. If spaceAsPlus is true and byte is 0x20 (SP), then append U+002B (+) to output and continue.
if ( space_as_plus & & byte = = ' ' ) {
output . append ( ' + ' ) ;
continue ;
}
// 2. Let isomorph be a code point whose value is byte’ s value.
u32 isomorph = byte ;
// 3. Assert: percentEncodeSet includes all non-ASCII code points.
// 4. If isomorphic is not in percentEncodeSet, then append isomorph to output.
if ( ! URL : : code_point_is_in_percent_encode_set ( isomorph , percent_encode_set ) ) {
output . append_code_point ( isomorph ) ;
}
// 5. Otherwise, percent-encode byte and append the result to output.
else {
output . appendff ( " %{:02X} " , byte ) ;
}
}
// 6. Return output.
2022-12-06 01:12:49 +00:00
return output . to_deprecated_string ( ) ;
2022-04-10 00:48:15 +02:00
}
2021-05-25 22:13:15 +02:00
// https://fetch.spec.whatwg.org/#data-urls
2021-08-05 00:29:06 +08:00
// FIXME: This only loosely follows the spec, as we use the same class for "regular" and data URLs, unlike the spec.
2021-11-11 00:55:02 +01:00
Optional < URL > URLParser : : parse_data_url ( StringView raw_input )
2021-05-25 22:13:15 +02:00
{
dbgln_if ( URL_PARSER_DEBUG , " URLParser::parse_data_url: Parsing '{}'. " , raw_input ) ;
2022-07-11 17:32:29 +00:00
VERIFY ( raw_input . starts_with ( " data: " sv ) ) ;
2021-05-25 22:13:15 +02:00
auto input = raw_input . substring_view ( 5 ) ;
auto comma_offset = input . find ( ' , ' ) ;
if ( ! comma_offset . has_value ( ) )
return { } ;
2022-07-11 17:32:29 +00:00
auto mime_type = StringUtils : : trim ( input . substring_view ( 0 , comma_offset . value ( ) ) , " \t \n \f \r " sv , TrimMode : : Both ) ;
2021-05-25 22:13:15 +02:00
auto encoded_body = input . substring_view ( comma_offset . value ( ) + 1 ) ;
auto body = URL : : percent_decode ( encoded_body ) ;
2021-08-05 00:29:06 +08:00
bool is_base64_encoded = false ;
2022-07-11 17:32:29 +00:00
if ( mime_type . ends_with ( " base64 " sv , CaseSensitivity : : CaseInsensitive ) ) {
2021-08-05 00:29:06 +08:00
auto substring_view = mime_type . substring_view ( 0 , mime_type . length ( ) - 6 ) ;
2022-07-11 17:32:29 +00:00
auto trimmed_substring_view = StringUtils : : trim ( substring_view , " " sv , TrimMode : : Right ) ;
2021-08-05 00:29:06 +08:00
if ( trimmed_substring_view . ends_with ( ' ; ' ) ) {
is_base64_encoded = true ;
mime_type = trimmed_substring_view . substring_view ( 0 , trimmed_substring_view . length ( ) - 1 ) ;
}
2021-05-25 22:13:15 +02:00
}
2021-08-05 00:29:06 +08:00
StringBuilder builder ;
2022-07-11 17:32:29 +00:00
if ( mime_type . starts_with ( " ; " sv ) | | mime_type . is_empty ( ) ) {
builder . append ( " text/plain " sv ) ;
2021-05-25 22:13:15 +02:00
builder . append ( mime_type ) ;
2021-08-05 00:29:06 +08:00
mime_type = builder . string_view ( ) ;
2021-05-25 22:13:15 +02:00
}
2021-08-05 00:29:06 +08:00
// FIXME: Parse the MIME type's components according to https://mimesniff.spec.whatwg.org/#parse-a-mime-type
2022-07-11 17:32:29 +00:00
URL url { StringUtils : : trim ( mime_type , " \n \r \t " sv , TrimMode : : Both ) , move ( body ) , is_base64_encoded } ;
2021-05-25 22:13:15 +02:00
dbgln_if ( URL_PARSER_DEBUG , " URLParser::parse_data_url: Parsed data URL to be '{}'. " , url . serialize ( ) ) ;
return url ;
}
// https://url.spec.whatwg.org/#concept-basic-url-parser
// NOTE: This parser assumes a UTF-8 encoding.
// NOTE: Refrain from using the URL classes setters inside this algorithm. Rather, set the values directly. This bypasses the setters' built-in
// validation, which is strictly unnecessary since we set m_valid=true at the end anyways. Furthermore, this algorithm may be used in the
// future for validation of URLs, which would then lead to infinite recursion.
// The same goes for base_url, because e.g. the port() getter does not always return m_port, and we are interested in the underlying member
// variables' values here, not what the URL class presents to its users.
2023-04-11 14:53:40 +02:00
URL URLParser : : parse ( StringView raw_input , Optional < URL > const & base_url , Optional < URL > url , Optional < State > state_override )
2021-05-25 22:13:15 +02:00
{
dbgln_if ( URL_PARSER_DEBUG , " URLParser::parse: Parsing '{}' " , raw_input ) ;
if ( raw_input . is_empty ( ) )
2023-04-11 14:53:40 +02:00
return base_url . has_value ( ) ? * base_url : URL { } ;
2021-05-25 22:13:15 +02:00
2022-07-11 17:32:29 +00:00
if ( raw_input . starts_with ( " data: " sv ) ) {
2021-05-25 22:13:15 +02:00
auto maybe_url = parse_data_url ( raw_input ) ;
if ( ! maybe_url . has_value ( ) )
return { } ;
return maybe_url . release_value ( ) ;
}
size_t start_index = 0 ;
size_t end_index = raw_input . length ( ) ;
2023-07-03 22:52:08 +12:00
// 1. If url is not given:
2021-09-13 22:34:14 +03:00
if ( ! url . has_value ( ) ) {
2023-07-03 22:52:08 +12:00
// 1. Set url to a new URL.
2021-09-13 22:34:14 +03:00
url = URL ( ) ;
2023-07-03 22:52:08 +12:00
// 2. If input contains any leading or trailing C0 control or space, invalid-URL-unit validation error.
// 3. Remove any leading and trailing C0 control or space from input.
//
// FIXME: We aren't checking exactly for 'trailing C0 control or space' here.
2021-09-13 22:34:14 +03:00
bool has_validation_error = false ;
for ( size_t i = 0 ; i < raw_input . length ( ) ; + + i ) {
i8 ch = raw_input [ i ] ;
if ( 0 < = ch & & ch < = 0x20 ) {
+ + start_index ;
has_validation_error = true ;
} else {
break ;
}
2021-05-25 22:13:15 +02:00
}
2021-09-13 22:34:14 +03:00
for ( ssize_t i = raw_input . length ( ) - 1 ; i > = 0 ; - - i ) {
i8 ch = raw_input [ i ] ;
if ( 0 < = ch & & ch < = 0x20 ) {
- - end_index ;
has_validation_error = true ;
} else {
break ;
}
2021-05-25 22:13:15 +02:00
}
2021-09-13 22:34:14 +03:00
if ( has_validation_error )
report_validation_error ( ) ;
2021-05-25 22:13:15 +02:00
}
if ( start_index > = end_index )
return { } ;
2022-12-04 18:02:33 +00:00
DeprecatedString processed_input = raw_input . substring_view ( start_index , end_index - start_index ) ;
2021-05-25 22:13:15 +02:00
2023-07-03 22:52:08 +12:00
// 2. If input contains any ASCII tab or newline, invalid-URL-unit validation error.
// 3. Remove all ASCII tab or newline from input.
2022-07-11 17:32:29 +00:00
if ( processed_input . contains ( " \t " sv ) | | processed_input . contains ( " \n " sv ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
2022-07-11 17:32:29 +00:00
processed_input = processed_input . replace ( " \t " sv , " " sv , ReplaceMode : : All ) . replace ( " \n " sv , " " sv , ReplaceMode : : All ) ;
2021-05-25 22:13:15 +02:00
}
2023-07-03 22:52:08 +12:00
// 4. Let state be state override if given, or scheme start state otherwise.
2021-09-13 22:34:14 +03:00
State state = state_override . value_or ( State : : SchemeStart ) ;
2023-07-03 22:52:08 +12:00
// FIXME: 5. Set encoding to the result of getting an output encoding from encoding.
// 6. Let buffer be the empty string.
2021-05-25 22:13:15 +02:00
StringBuilder buffer ;
2023-07-03 22:52:08 +12:00
// 7. Let atSignSeen, insideBrackets, and passwordTokenSeen be false.
2021-05-25 22:13:15 +02:00
bool at_sign_seen = false ;
bool inside_brackets = false ;
bool password_token_seen = false ;
Utf8View input ( processed_input ) ;
2023-07-03 22:52:08 +12:00
// 8. Let pointer be a pointer for input.
2021-06-01 09:45:52 +02:00
Utf8CodePointIterator iterator = input . begin ( ) ;
2021-05-25 22:13:15 +02:00
auto get_remaining = [ & input , & iterator ] {
2021-05-30 18:52:24 +02:00
return input . substring_view ( iterator - input . begin ( ) + iterator . underlying_code_point_length_in_bytes ( ) ) . as_string ( ) ;
2021-05-25 22:13:15 +02:00
} ;
2023-07-03 22:52:08 +12:00
// 9. Keep running the following state machine by switching on state. If after a run pointer points to the EOF code point, go to the next step. Otherwise, increase pointer by 1 and continue with the state machine.
2021-05-25 22:13:15 +02:00
// NOTE: "continue" should only be used to prevent incrementing the iterator, as this is done at the end of the loop.
// ++iterator : "increase pointer by 1"
// continue : "decrease pointer by 1"
for ( ; ; ) {
2021-06-03 12:43:08 +02:00
u32 code_point = end_of_file ;
2021-05-25 22:13:15 +02:00
if ( ! iterator . done ( ) )
code_point = * iterator ;
if constexpr ( URL_PARSER_DEBUG ) {
2021-06-03 12:43:08 +02:00
if ( code_point = = end_of_file )
2021-06-03 12:40:04 +02:00
dbgln ( " URLParser::parse: {} state with EOF. " , state_name ( state ) ) ;
else if ( is_ascii_printable ( code_point ) )
dbgln ( " URLParser::parse: {} state with code point U+{:04X} ({:c}). " , state_name ( state ) , code_point , code_point ) ;
2021-05-25 22:13:15 +02:00
else
2021-06-03 12:40:04 +02:00
dbgln ( " URLParser::parse: {} state with code point U+{:04X}. " , state_name ( state ) , code_point ) ;
2021-05-25 22:13:15 +02:00
}
switch ( state ) {
2023-07-03 22:52:08 +12:00
// -> scheme start state, https://url.spec.whatwg.org/#scheme-start-state
2021-05-25 22:13:15 +02:00
case State : : SchemeStart :
2023-07-03 22:52:08 +12:00
// 1. If c is an ASCII alpha, append c, lowercased, to buffer, and set state to scheme state.
2021-05-25 22:13:15 +02:00
if ( is_ascii_alpha ( code_point ) ) {
buffer . append_as_lowercase ( code_point ) ;
state = State : : Scheme ;
2023-07-03 22:52:08 +12:00
}
// FIXME: 2. Otherwise, if state override is not given, set state to no scheme state and decrease pointer by 1.
else {
2021-05-25 22:13:15 +02:00
state = State : : NoScheme ;
continue ;
}
2023-07-03 22:52:08 +12:00
// FIXME: 3. Otherwise, return failure.
2021-05-25 22:13:15 +02:00
break ;
2023-07-03 22:52:08 +12:00
// -> scheme state, https://url.spec.whatwg.org/#scheme-state
2021-05-25 22:13:15 +02:00
case State : : Scheme :
2023-07-03 22:52:08 +12:00
// 1. If c is an ASCII alphanumeric, U+002B (+), U+002D (-), or U+002E (.), append c, lowercased, to buffer.
2021-05-25 22:13:15 +02:00
if ( is_ascii_alphanumeric ( code_point ) | | code_point = = ' + ' | | code_point = = ' - ' | | code_point = = ' . ' ) {
buffer . append_as_lowercase ( code_point ) ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if c is U+003A (:), then:
else if ( code_point = = ' : ' ) {
// FIXME: 1. If state override is given, then:
if ( false ) {
// FIXME: 1. If url’ s scheme is a special scheme and buffer is not a special scheme, then return.
// FIXME: 2. If url’s scheme is not a special scheme and buffer is a special scheme, then return.
// FIXME: 3. If url includes credentials or has a non-null port, and buffer is "file", then return.
// FIXME: 4. If url’ s scheme is "file" and its host is an empty host, then return.
}
// 2. Set url’ s scheme to buffer.
2022-12-06 01:12:49 +00:00
url - > m_scheme = buffer . to_deprecated_string ( ) ;
2023-07-03 22:52:08 +12:00
// FIXME: 3. If state override is given, then:
if ( false ) {
// FIXME: 1. If url’ s port is url’ s scheme’ s default port, then set url’ s port to null.
// FIXME: 2. Return.
}
// 4. Set buffer to the empty string.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
2023-07-03 22:52:08 +12:00
// 5. If url’ s scheme is "file", then:
2021-09-13 22:34:14 +03:00
if ( url - > scheme ( ) = = " file " ) {
2023-07-03 22:52:08 +12:00
// 1. If remaining does not start with "//", special-scheme-missing-following-solidus validation error.
2022-07-11 17:32:29 +00:00
if ( ! get_remaining ( ) . starts_with ( " // " sv ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
}
2023-07-03 22:52:08 +12:00
// 2. Set state to file state.
2021-05-25 22:13:15 +02:00
state = State : : File ;
2023-07-03 22:52:08 +12:00
}
// 6. Otherwise, if url is special, base is non-null, and base’ s scheme is url’ s scheme:
// 7. Otherwise, if url is special, set state to special authority slashes state.
// FIXME: Write this block closer to spec text.
else if ( url - > is_special ( ) ) {
// FIXME: 1. Assert: base is special (and therefore does not have an opaque path).
// 2. Set state to special relative or authority state.
2023-04-11 14:53:40 +02:00
if ( base_url . has_value ( ) & & base_url - > m_scheme = = url - > m_scheme )
2021-05-25 22:13:15 +02:00
state = State : : SpecialRelativeOrAuthority ;
else
state = State : : SpecialAuthoritySlashes ;
2023-07-03 22:52:08 +12:00
}
// 8. Otherwise, if remaining starts with an U+002F (/), set state to path or authority state and increase pointer by 1.
else if ( get_remaining ( ) . starts_with ( " / " sv ) ) {
2021-05-25 22:13:15 +02:00
state = State : : PathOrAuthority ;
+ + iterator ;
2023-07-03 22:52:08 +12:00
}
// 9. Otherwise, set url’ s path to the empty string and set state to opaque path state.
else {
2021-09-13 22:34:14 +03:00
url - > m_cannot_be_a_base_url = true ;
2023-04-09 14:21:00 +01:00
url - > append_slash ( ) ;
2021-05-25 22:13:15 +02:00
state = State : : CannotBeABaseUrlPath ;
}
2023-07-03 22:52:08 +12:00
}
// FIXME: 3. Otherwise, if state override is not given, set buffer to the empty string, state to no scheme state, and start over (from the first code point in input).
else {
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
state = State : : NoScheme ;
iterator = input . begin ( ) ;
continue ;
}
2023-07-03 22:52:08 +12:00
// FIXME: 4. Otherwise, return failure.
2021-05-25 22:13:15 +02:00
break ;
2023-07-03 22:52:08 +12:00
// -> no scheme state, https://url.spec.whatwg.org/#no-scheme-state
2021-05-25 22:13:15 +02:00
case State : : NoScheme :
2023-07-03 22:52:08 +12:00
// 1. If base is null, or base has an opaque path and c is not U+0023 (#), missing-scheme-non-relative-URL validation error, return failure.
2023-04-11 14:53:40 +02:00
if ( ! base_url . has_value ( ) | | ( base_url - > m_cannot_be_a_base_url & & code_point ! = ' # ' ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
return { } ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if base has an opaque path and c is U+0023 (#), set url’ s scheme to base’ s scheme, url’ s path to base’ s path, url’ s query
// to base’ s query,url’ s fragment to the empty string, and set state to fragment state.
else if ( base_url - > m_cannot_be_a_base_url & & code_point = = ' # ' ) {
2021-09-13 22:34:14 +03:00
url - > m_scheme = base_url - > m_scheme ;
url - > m_paths = base_url - > m_paths ;
url - > m_query = base_url - > m_query ;
url - > m_fragment = " " ;
url - > m_cannot_be_a_base_url = true ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if base’ s scheme is not "file", set state to relative state and decrease pointer by 1.
else if ( base_url - > m_scheme ! = " file " ) {
2021-05-25 22:13:15 +02:00
state = State : : Relative ;
continue ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, set state to file state and decrease pointer by 1.
else {
2021-05-25 22:13:15 +02:00
state = State : : File ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> special relative or authority state, https://url.spec.whatwg.org/#special-relative-or-authority-state
2021-05-25 22:13:15 +02:00
case State : : SpecialRelativeOrAuthority :
2023-07-03 22:52:08 +12:00
// 1. If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by 1.
2022-07-11 17:32:29 +00:00
if ( code_point = = ' / ' & & get_remaining ( ) . starts_with ( " / " sv ) ) {
2021-05-25 22:13:15 +02:00
state = State : : SpecialAuthorityIgnoreSlashes ;
+ + iterator ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, special-scheme-missing-following-solidus validation error, set state to relative state and decrease pointer by 1.
else {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
state = State : : Relative ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> path or authority state, https://url.spec.whatwg.org/#path-or-authority-state
2021-05-25 22:13:15 +02:00
case State : : PathOrAuthority :
2023-07-03 22:52:08 +12:00
// 1. If c is U+002F (/), then set state to authority state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' / ' ) {
state = State : : Authority ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, set state to path state, and decrease pointer by 1.
else {
2021-05-25 22:13:15 +02:00
state = State : : Path ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> relative state, https://url.spec.whatwg.org/#relative-state
2021-05-25 22:13:15 +02:00
case State : : Relative :
2023-07-03 22:52:08 +12:00
// FIXME: 1. Assert: base’ s scheme is not "file".
// 2. Set url’ s scheme to base’ s scheme.
2021-09-13 22:34:14 +03:00
url - > m_scheme = base_url - > m_scheme ;
2023-07-03 22:52:08 +12:00
// 3. If c is U+002F (/), then set state to relative slash state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' / ' ) {
state = State : : RelativeSlash ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if url is special and c is U+005C (\), invalid-reverse-solidus validation error, set state to relative slash state.
else if ( url - > is_special ( ) & & code_point = = ' \\ ' ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
state = State : : RelativeSlash ;
2023-07-03 22:52:08 +12:00
}
// 5. Otherwise:
else {
// 1. Set url’ s username to base’ s username, url’ s password to base’ s password, url’ s host to base’ s host, url’ s port to base’ s port, url’ s path to a clone of base’ s path, and url’ s query to base’ s query.
2021-09-13 22:34:14 +03:00
url - > m_username = base_url - > m_username ;
url - > m_password = base_url - > m_password ;
url - > m_host = base_url - > m_host ;
url - > m_port = base_url - > m_port ;
url - > m_paths = base_url - > m_paths ;
url - > m_query = base_url - > m_query ;
2021-05-25 22:13:15 +02:00
2023-07-03 22:52:08 +12:00
// 2. If c is U+003F (?), then set url’ s query to the empty string, and state to query state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' ? ' ) {
2021-09-13 22:34:14 +03:00
url - > m_query = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Query ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if c is U+0023 (#), set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2021-09-13 22:34:14 +03:00
url - > m_fragment = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. Set url’ s query to null.
2021-09-13 22:34:14 +03:00
url - > m_query = { } ;
2023-07-03 22:52:08 +12:00
// 2. Shorten url’ s path.
2021-09-13 22:34:14 +03:00
if ( url - > m_paths . size ( ) )
url - > m_paths . remove ( url - > m_paths . size ( ) - 1 ) ;
2023-07-03 22:52:08 +12:00
// 3. Set state to path state and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
state = State : : Path ;
continue ;
}
}
break ;
2023-07-03 22:52:08 +12:00
// -> relative slash state, https://url.spec.whatwg.org/#relative-slash-state
2021-05-25 22:13:15 +02:00
case State : : RelativeSlash :
2023-07-03 22:52:08 +12:00
// 1. If url is special and c is U+002F (/) or U+005C (\), then:
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) & & ( code_point = = ' / ' | | code_point = = ' \\ ' ) ) {
2023-07-03 22:52:08 +12:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. Set state to special authority ignore slashes state.
2021-05-25 22:13:15 +02:00
state = State : : SpecialAuthorityIgnoreSlashes ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if c is U+002F (/), then set state to authority state.
else if ( code_point = = ' / ' ) {
2021-05-25 22:13:15 +02:00
state = State : : Authority ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, set url’ s username to base’ s username, url’ s password to base’ s password, url’ s host to base’ s host, url’ s port to base’ s port, state to path state, and then, decrease pointer by 1.
else {
2021-09-13 22:34:14 +03:00
url - > m_username = base_url - > m_username ;
url - > m_password = base_url - > m_password ;
url - > m_host = base_url - > m_host ;
url - > m_port = base_url - > m_port ;
2021-05-25 22:13:15 +02:00
state = State : : Path ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> special authority slashes state, https://url.spec.whatwg.org/#special-authority-slashes-state
2021-05-25 22:13:15 +02:00
case State : : SpecialAuthoritySlashes :
2023-07-03 22:52:08 +12:00
// 1. If c is U+002F (/) and remaining starts with U+002F (/), then set state to special authority ignore slashes state and increase pointer by 1.
2022-07-11 17:32:29 +00:00
if ( code_point = = ' / ' & & get_remaining ( ) . starts_with ( " / " sv ) ) {
2021-05-25 22:13:15 +02:00
state = State : : SpecialAuthorityIgnoreSlashes ;
+ + iterator ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, special-scheme-missing-following-solidus validation error, set state to special authority ignore slashes state and decrease pointer by 1.
else {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
state = State : : SpecialAuthorityIgnoreSlashes ;
continue ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> special authority ignore slashes state, https://url.spec.whatwg.org/#special-authority-ignore-slashes-state
2021-05-25 22:13:15 +02:00
case State : : SpecialAuthorityIgnoreSlashes :
2023-07-03 22:52:08 +12:00
// 1. If c is neither U+002F (/) nor U+005C (\), then set state to authority state and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
if ( code_point ! = ' / ' & & code_point ! = ' \\ ' ) {
state = State : : Authority ;
continue ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, special-scheme-missing-following-solidus validation error.
else {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> authority state, https://url.spec.whatwg.org/#authority-state
2021-05-25 22:13:15 +02:00
case State : : Authority :
2023-07-03 22:52:08 +12:00
// 1. If c is U+0040 (@), then:
2021-05-25 22:13:15 +02:00
if ( code_point = = ' @ ' ) {
2023-07-03 22:52:08 +12:00
// 1. Invalid-credentials validation error.
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. If atSignSeen is true, then prepend "%40" to buffer.
2021-05-25 22:13:15 +02:00
if ( at_sign_seen ) {
2022-12-06 01:12:49 +00:00
auto content = buffer . to_deprecated_string ( ) ;
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
2022-07-11 17:32:29 +00:00
buffer . append ( " %40 " sv ) ;
2021-05-25 22:13:15 +02:00
buffer . append ( content ) ;
}
2023-07-03 22:52:08 +12:00
// 3. Set atSignSeen to true.
2021-05-25 22:13:15 +02:00
at_sign_seen = true ;
2023-07-03 22:52:08 +12:00
2021-05-25 22:13:15 +02:00
StringBuilder builder ;
2023-07-03 22:52:08 +12:00
// FIXME: 4. For each codePoint in buffer:
2021-06-08 15:22:02 +02:00
for ( auto c : Utf8View ( builder . string_view ( ) ) ) {
2023-07-03 22:52:08 +12:00
// 1. If codePoint is U+003A (:) and passwordTokenSeen is false, then set passwordTokenSeen to true and continue.
2021-05-25 22:13:15 +02:00
if ( c = = ' : ' & & ! password_token_seen ) {
password_token_seen = true ;
continue ;
}
2023-07-03 22:52:08 +12:00
// 2. Let encodedCodePoints be the result of running UTF-8 percent-encode codePoint using the userinfo percent-encode set.
// NOTE: This is done inside of step 3 and 4 implementation
2021-05-25 22:13:15 +02:00
builder . clear ( ) ;
2023-07-03 22:52:08 +12:00
// 3. If passwordTokenSeen is true, then append encodedCodePoints to url’ s password.
2021-05-25 22:13:15 +02:00
if ( password_token_seen ) {
2021-09-13 22:34:14 +03:00
builder . append ( url - > password ( ) ) ;
2021-05-25 22:13:15 +02:00
URL : : append_percent_encoded_if_necessary ( builder , c , URL : : PercentEncodeSet : : Userinfo ) ;
2023-04-09 14:21:00 +01:00
url - > m_password = builder . string_view ( ) ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, append encodedCodePoints to url’ s username.
else {
2021-09-13 22:34:14 +03:00
builder . append ( url - > username ( ) ) ;
2021-05-25 22:13:15 +02:00
URL : : append_percent_encoded_if_necessary ( builder , c , URL : : PercentEncodeSet : : Userinfo ) ;
2023-04-09 14:21:00 +01:00
url - > m_username = builder . string_view ( ) ;
2021-05-25 22:13:15 +02:00
}
}
2023-07-03 22:52:08 +12:00
// 5. Set buffer to the empty string.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if one of the following is true:
// * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
// * url is special and c is U+005C (\)
else if ( ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' ? ' | | code_point = = ' # ' )
| | ( url - > is_special ( ) & & code_point = = ' \\ ' ) ) {
// then:
// 1. If atSignSeen is true and buffer is the empty string, invalid-credentials validation error, return failure.
2021-05-25 22:13:15 +02:00
if ( at_sign_seen & & buffer . is_empty ( ) ) {
report_validation_error ( ) ;
return { } ;
}
2023-07-03 22:52:08 +12:00
// 2. Decrease pointer by buffer’ s code point length + 1, set buffer to the empty string, and set state to host state.
2021-05-25 22:13:15 +02:00
iterator = input . iterator_at_byte_offset ( iterator - input . begin ( ) - buffer . length ( ) - 1 ) ;
buffer . clear ( ) ;
state = State : : Host ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, append c to buffer.
else {
2021-05-25 22:13:15 +02:00
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> host state, https://url.spec.whatwg.org/#host-state
// -> hostname state, https://url.spec.whatwg.org/#hostname-state
2021-05-25 22:13:15 +02:00
case State : : Host :
case State : : Hostname :
2023-07-03 22:52:08 +12:00
// FIXME: 1. If state override is given and url’ s scheme is "file", then decrease pointer by 1 and set state to file host state.
// 2. Otherwise, if c is U+003A (:) and insideBrackets is false, then:
2021-05-25 22:13:15 +02:00
if ( code_point = = ' : ' & & ! inside_brackets ) {
2023-07-03 22:52:08 +12:00
// 1. If buffer is the empty string, host-missing validation error, return failure.
2021-05-25 22:13:15 +02:00
if ( buffer . is_empty ( ) ) {
report_validation_error ( ) ;
return { } ;
}
2023-07-03 22:52:08 +12:00
// FIXME: 2. If state override is given and state override is hostname state, then return.
// 3. Let host be the result of host parsing buffer with url is not special.
2021-09-13 22:34:14 +03:00
auto host = parse_host ( buffer . string_view ( ) , ! url - > is_special ( ) ) ;
2023-07-03 22:52:08 +12:00
// 4. If host is failure, then return failure.
2021-05-25 22:13:15 +02:00
if ( ! host . has_value ( ) )
return { } ;
2023-07-03 22:52:08 +12:00
// 5. Set url’ s host to host, buffer to the empty string, and state to port state.
2021-09-13 22:34:14 +03:00
url - > m_host = host . release_value ( ) ;
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
state = State : : Port ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if one of the following is true:
// * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
// * url is special and c is U+005C (\)
else if ( ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' ? ' | | code_point = = ' # ' )
| | ( url - > is_special ( ) & & code_point = = ' \\ ' ) ) {
// then decrease pointer by 1, and then:
// NOTE: pointer decrement is done by the continue below
// 1. If url is special and buffer is the empty string, host-missing validation error, return failure.
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) & & buffer . is_empty ( ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
return { } ;
}
2023-07-03 22:52:08 +12:00
// FIXME: 2. Otherwise, if state override is given, buffer is the empty string, and either url includes credentials or url’ s port is non-null, return.
// 3. Let host be the result of host parsing buffer with url is not special.
2021-09-13 22:34:14 +03:00
auto host = parse_host ( buffer . string_view ( ) , ! url - > is_special ( ) ) ;
2023-07-03 22:52:08 +12:00
// 4. If host is failure, then return failure.
2021-05-25 22:13:15 +02:00
if ( ! host . has_value ( ) )
return { } ;
2023-07-03 22:52:08 +12:00
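// 5. Set url’s host to host, buffer to the empty string, and state to path start state.
// NOTE: The code below sets the port state instead. Because the buffer was just cleared and the same code point is re-processed, the port state immediately forwards to the path start state.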
2021-09-13 22:34:14 +03:00
url - > m_host = host . value ( ) ;
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
state = State : : Port ;
2023-07-03 22:52:08 +12:00
// FIXME: 6. If state override is given, then return.
2021-05-25 22:13:15 +02:00
continue ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise:
// FIXME: Implement closer to spec text. From reading it, shouldn't we be appending [ or ] to buffer as well? Step 3. below does not have an 'otherwise'.
//
// 1. If c is U+005B ([), then set insideBrackets to true.
else if ( code_point = = ' [ ' ) {
2021-05-25 22:13:15 +02:00
inside_brackets = true ;
2023-07-03 22:52:08 +12:00
}
// 2. If c is U+005D (]), then set insideBrackets to false.
else if ( code_point = = ' ] ' ) {
2021-05-25 22:13:15 +02:00
inside_brackets = false ;
2023-07-03 22:52:08 +12:00
}
// 3. Append c to buffer.
else {
2021-05-25 22:13:15 +02:00
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> port state, https://url.spec.whatwg.org/#port-state
2021-05-25 22:13:15 +02:00
case State : : Port :
2023-07-03 22:52:08 +12:00
// 1. If c is an ASCII digit, append c to buffer.
2021-05-25 22:13:15 +02:00
if ( is_ascii_digit ( code_point ) ) {
buffer . append_code_point ( code_point ) ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if one of the following is true:
// * c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
// * url is special and c is U+005C (\)
// * FIXME: state override is given
else if ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' ? ' | | code_point = = ' # ' | | ( url - > is_special ( ) & & code_point = = ' \\ ' ) ) {
// then:
// 1. If buffer is not the empty string, then:
2021-05-25 22:13:15 +02:00
if ( ! buffer . is_empty ( ) ) {
2023-07-03 22:52:08 +12:00
// 1. Let port be the mathematical integer value that is represented by buffer in radix-10 using ASCII digits for digits with values 0 through 9.
2021-06-08 15:22:02 +02:00
auto port = buffer . string_view ( ) . to_uint ( ) ;
2023-07-03 22:52:08 +12:00
// 2. If port is greater than 2^16 − 1, port-out-of-range validation error, return failure.
2021-05-25 22:13:15 +02:00
if ( ! port . has_value ( ) | | port . value ( ) > 65535 ) {
report_validation_error ( ) ;
return { } ;
}
2023-07-03 22:52:08 +12:00
// 3. Set url’ s port to null, if port is url’ s scheme’ s default port; otherwise to port.
2021-09-13 22:34:14 +03:00
if ( port . value ( ) = = URL : : default_port_for_scheme ( url - > scheme ( ) ) )
2021-09-13 23:12:16 +03:00
url - > m_port = { } ;
2021-05-25 22:13:15 +02:00
else
2021-09-13 22:34:14 +03:00
url - > m_port = port . value ( ) ;
2023-07-03 22:52:08 +12:00
// 4. Set buffer to the empty string.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
}
2023-07-03 22:52:08 +12:00
// FIXME: 2. If state override is given, then return.
// 3. Set state to path start state and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
state = State : : PathStart ;
continue ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, port-invalid validation error, return failure.
else {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
return { } ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> file state, https://url.spec.whatwg.org/#file-state
2021-05-25 22:13:15 +02:00
case State : : File :
2023-07-03 22:52:08 +12:00
// 1. Set url’ s scheme to "file".
2021-09-13 22:34:14 +03:00
url - > m_scheme = " file " ;
2023-07-03 22:52:08 +12:00
// 2. Set url’ s host to the empty string.
2021-09-13 22:34:14 +03:00
url - > m_host = " " ;
2023-07-03 22:52:08 +12:00
// 3. If c is U+002F (/) or U+005C (\), then:
2021-05-25 22:13:15 +02:00
if ( code_point = = ' / ' | | code_point = = ' \\ ' ) {
2023-07-03 22:52:08 +12:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. Set state to file slash state.
2021-05-25 22:13:15 +02:00
state = State : : FileSlash ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if base is non-null and base’ s scheme is "file":
else if ( base_url . has_value ( ) & & base_url - > m_scheme = = " file " ) {
// 1. Set url’ s host to base’ s host, url’ s path to a clone of base’ s path, and url’ s query to base’ s query.
2021-09-13 22:34:14 +03:00
url - > m_host = base_url - > m_host ;
url - > m_paths = base_url - > m_paths ;
url - > m_query = base_url - > m_query ;
2023-07-03 22:52:08 +12:00
// 2. If c is U+003F (?), then set url’ s query to the empty string and state to query state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' ? ' ) {
2021-09-13 22:34:14 +03:00
url - > m_query = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Query ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if c is U+0023 (#), set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2021-09-13 22:34:14 +03:00
url - > m_fragment = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. Set url’ s query to null.
2021-09-13 22:34:14 +03:00
url - > m_query = { } ;
2023-07-03 22:52:08 +12:00
// 2. If the code point substring from pointer to the end of input does not start with a Windows drive letter, then shorten url’ s path.
2021-05-25 22:13:15 +02:00
auto substring_from_pointer = input . substring_view ( iterator - input . begin ( ) ) . as_string ( ) ;
if ( ! starts_with_windows_drive_letter ( substring_from_pointer ) ) {
2023-04-13 23:29:51 +01:00
if ( ! url - > m_paths . is_empty ( ) & & ! ( url - > scheme ( ) = = " file " & & url - > m_paths . size ( ) = = 1 & & is_normalized_windows_drive_letter ( url - > m_paths [ 0 ] ) ) )
2021-09-13 22:34:14 +03:00
url - > m_paths . remove ( url - > m_paths . size ( ) - 1 ) ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise:
else {
// 1. File-invalid-Windows-drive-letter validation error.
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. Set url’ s path to « ».
2021-09-13 22:34:14 +03:00
url - > m_paths . clear ( ) ;
2021-05-25 22:13:15 +02:00
}
2023-07-03 22:52:08 +12:00
// 4. Set state to path state and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
state = State : : Path ;
continue ;
}
}
2023-07-03 22:52:08 +12:00
// FIXME: 5. Otherwise, set state to path state, and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
break ;
2023-07-03 22:52:08 +12:00
// -> file slash state, https://url.spec.whatwg.org/#file-slash-state
2021-05-25 22:13:15 +02:00
case State : : FileSlash :
2023-07-03 22:52:08 +12:00
// 1. If c is U+002F (/) or U+005C (\), then:
2021-05-25 22:13:15 +02:00
if ( code_point = = ' / ' | | code_point = = ' \\ ' ) {
2023-07-03 22:52:08 +12:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. Set state to file host state.
2021-05-25 22:13:15 +02:00
state = State : : FileHost ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise:
// 1. If base is non-null and base’ s scheme is "file", then:
else if ( base_url . has_value ( ) & & base_url - > m_scheme = = " file " ) {
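// FIXME: 1. Set url’s host to base’s host. The code below instead copies base’s path and drops its last segment.
// NOTE(review): this removal is not guarded by an is_empty() check on the path, unlike the other path-shortening sites in this function. Confirm base’s path cannot be empty here.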
2023-06-18 02:48:06 +03:00
url - > m_paths = base_url - > m_paths ;
url - > m_paths . remove ( url - > m_paths . size ( ) - 1 ) ;
2023-07-03 22:52:08 +12:00
// 2. If the code point substring from pointer to the end of input does not start with a Windows drive letter and base’ s path[0] is a normalized Windows drive letter, then append base’ s path[0] to url’ s path.
2021-05-25 22:13:15 +02:00
auto substring_from_pointer = input . substring_view ( iterator - input . begin ( ) ) . as_string ( ) ;
if ( ! starts_with_windows_drive_letter ( substring_from_pointer ) & & is_normalized_windows_drive_letter ( base_url - > m_paths [ 0 ] ) )
2023-04-09 14:21:00 +01:00
url - > append_path ( base_url - > m_paths [ 0 ] , URL : : ApplyPercentEncoding : : No ) ;
2023-07-03 22:52:08 +12:00
// FIXME: This should be done outside of this file block, see below.
2021-05-25 22:13:15 +02:00
state = State : : Path ;
continue ;
}
2023-07-03 22:52:08 +12:00
// FIXME: 2. Set state to path state, and decrease pointer by 1.
2021-05-25 22:13:15 +02:00
break ;
2023-07-03 22:52:08 +12:00
// -> file host state, https://url.spec.whatwg.org/#file-host-state
2021-05-25 22:13:15 +02:00
case State : : FileHost :
2023-07-03 22:52:08 +12:00
// 1. If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by 1 and then:
// NOTE: decreasing the pointer is done at the bottom of this block.
2021-06-03 12:43:08 +02:00
if ( code_point = = end_of_file | | code_point = = ' / ' | | code_point = = ' \\ ' | | code_point = = ' ? ' | | code_point = = ' # ' ) {
2023-07-03 22:52:08 +12:00
// 1. If state override is not given and buffer is a Windows drive letter, file-invalid-Windows-drive-letter-host validation error, set state to path state.
// FIXME: Check state override.
2021-06-08 15:22:02 +02:00
if ( is_windows_drive_letter ( buffer . string_view ( ) ) ) {
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
state = State : : Path ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if buffer is the empty string, then:
else if ( buffer . is_empty ( ) ) {
// 1. Set url’ s host to the empty string.
2021-09-13 22:34:14 +03:00
url - > m_host = " " ;
2023-07-03 22:52:08 +12:00
// FIXME: 2. If state override is given, then return.
// 3. Set state to path start state.
2021-05-25 22:13:15 +02:00
state = State : : PathStart ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, run these steps:
else {
// 1. Let host be the result of host parsing buffer with url is not special.
// FIXME: It seems we are not passing through url is not special through here
2021-06-08 15:22:02 +02:00
auto host = parse_host ( buffer . string_view ( ) , true ) ;
2023-07-03 22:52:08 +12:00
// 2. If host is failure, then return failure.
2021-05-25 22:13:15 +02:00
if ( ! host . has_value ( ) )
return { } ;
2023-07-03 22:52:08 +12:00
// 3. If host is "localhost", then set host to the empty string.
2021-05-25 22:13:15 +02:00
if ( host . value ( ) = = " localhost " )
host = " " ;
2023-07-03 22:52:08 +12:00
// 4. Set url’ s host to host.
2021-09-13 22:34:14 +03:00
url - > m_host = host . release_value ( ) ;
2023-07-03 22:52:08 +12:00
// FIXME: 5. If state override is given, then return.
// 6. Set buffer to the empty string and state to path start state.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
state = State : : PathStart ;
}
2023-07-03 22:52:08 +12:00
// NOTE: Decrement specified at the top of this 'if' statement.
2021-05-25 22:13:15 +02:00
continue ;
} else {
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> path start state, https://url.spec.whatwg.org/#path-start-state
2021-05-25 22:13:15 +02:00
case State : : PathStart :
2023-07-03 22:52:08 +12:00
// 1. If url is special, then:
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) ) {
2023-07-03 22:52:08 +12:00
// 1. If c is U+005C (\), invalid-reverse-solidus validation error.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' \\ ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. Set state to path state.
2021-05-25 22:13:15 +02:00
state = State : : Path ;
2023-07-03 22:52:08 +12:00
// 3. If c is neither U+002F (/) nor U+005C (\), then decrease pointer by 1.
2021-05-25 22:13:15 +02:00
if ( code_point ! = ' / ' & & code_point ! = ' \\ ' )
continue ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if state override is not given and c is U+003F (?), set url’ s query to the empty string and state to query state.
// FIXME: Check for state override
else if ( code_point = = ' ? ' ) {
2021-09-13 22:34:14 +03:00
url - > m_query = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Query ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if state override is not given and c is U+0023 (#), set url’ s fragment to the empty string and state to fragment state.
// FIXME: Check for state override
else if ( code_point = = ' # ' ) {
2021-09-13 22:34:14 +03:00
url - > m_fragment = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. Set state to path state.
2021-05-25 22:13:15 +02:00
state = State : : Path ;
2023-07-03 22:52:08 +12:00
// 2. If c is not U+002F (/), then decrease pointer by 1.
2021-05-25 22:13:15 +02:00
if ( code_point ! = ' / ' )
continue ;
}
2023-07-03 22:52:08 +12:00
// FIXME: 5. Otherwise, if state override is given and url’ s host is null, append the empty string to url’ s path.
2021-05-25 22:13:15 +02:00
break ;
2023-07-03 22:52:08 +12:00
// -> path state, https://url.spec.whatwg.org/#path-state
2021-05-25 22:13:15 +02:00
case State : : Path :
2023-07-03 22:52:08 +12:00
// 1. If one of the following is true:
// * c is the EOF code point or U+002F (/)
// * url is special and c is U+005C (\)
// * FIXME: state override is not given and c is U+003F (?) or U+0023 (#)
2021-09-13 22:34:14 +03:00
if ( code_point = = end_of_file | | code_point = = ' / ' | | ( url - > is_special ( ) & & code_point = = ' \\ ' ) | | code_point = = ' ? ' | | code_point = = ' # ' ) {
2023-07-03 22:52:08 +12:00
// then:
// 1. If url is special and c is U+005C (\), invalid-reverse-solidus validation error.
2021-09-13 22:34:14 +03:00
if ( url - > is_special ( ) & & code_point = = ' \\ ' )
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// 2. If buffer is a double-dot URL path segment, then:
2021-06-08 15:22:02 +02:00
if ( is_double_dot_path_segment ( buffer . string_view ( ) ) ) {
2023-07-03 22:52:08 +12:00
// FIXME: 1. Shorten url’ s path.
2021-09-13 22:34:14 +03:00
if ( ! url - > m_paths . is_empty ( ) & & ! ( url - > m_scheme = = " file " & & url - > m_paths . size ( ) = = 1 & & is_normalized_windows_drive_letter ( url - > m_paths [ 0 ] ) ) )
url - > m_paths . remove ( url - > m_paths . size ( ) - 1 ) ;
2023-07-03 22:52:08 +12:00
// 2. If neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’ s path.
2021-09-13 22:34:14 +03:00
if ( code_point ! = ' / ' & & ! ( url - > is_special ( ) & & code_point = = ' \\ ' ) )
2023-04-09 14:21:00 +01:00
url - > append_slash ( ) ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if buffer is a single-dot URL path segment and if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’ s path.
else if ( is_single_dot_path_segment ( buffer . string_view ( ) ) & & code_point ! = ' / ' & & ! ( url - > is_special ( ) & & code_point = = ' \\ ' ) ) {
2023-04-09 14:21:00 +01:00
url - > append_slash ( ) ;
2023-07-03 22:52:08 +12:00
}
// 4. Otherwise, if buffer is not a single-dot URL path segment, then:
else if ( ! is_single_dot_path_segment ( buffer . string_view ( ) ) ) {
// 1. If url’ s scheme is "file", url’ s path is empty, and buffer is a Windows drive letter, then replace the second code point in buffer with U+003A (:).
2021-09-13 22:34:14 +03:00
if ( url - > m_scheme = = " file " & & url - > m_paths . is_empty ( ) & & is_windows_drive_letter ( buffer . string_view ( ) ) ) {
2021-06-08 15:22:02 +02:00
auto drive_letter = buffer . string_view ( ) [ 0 ] ;
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
buffer . append ( drive_letter ) ;
buffer . append ( ' : ' ) ;
}
2023-07-03 22:52:08 +12:00
// 2. Append buffer to url’ s path.
// FIXME: It would be nicer (and closer to spec) if URLParser could just directly append the path.
2023-04-09 14:21:00 +01:00
url - > append_path ( buffer . string_view ( ) , URL : : ApplyPercentEncoding : : No ) ;
2021-05-25 22:13:15 +02:00
}
2023-07-03 22:52:08 +12:00
// 5. Set buffer to the empty string.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
2023-07-03 22:52:08 +12:00
// 6. If c is U+003F (?), then set url’ s query to the empty string and state to query state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' ? ' ) {
2021-09-13 22:34:14 +03:00
url - > m_query = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Query ;
2023-07-03 22:52:08 +12:00
}
// 7. If c is U+0023 (#), then set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2021-09-13 22:34:14 +03:00
url - > m_fragment = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
}
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, run these steps
else {
// 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
2021-05-25 22:13:15 +02:00
if ( ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// FIXME: 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
// 3. UTF-8 percent-encode c using the path percent-encode set and append the result to buffer.
2021-05-25 22:13:15 +02:00
URL : : append_percent_encoded_if_necessary ( buffer , code_point , URL : : PercentEncodeSet : : Path ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> opaque path state, https://url.spec.whatwg.org/#cannot-be-a-base-url-path-state
2021-05-25 22:13:15 +02:00
case State : : CannotBeABaseUrlPath :
// NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the path once the opaque path ends (on U+003F (?), U+0023 (#), or EOF).
// NOTE: Verify that the assumptions required for this simplification are correct.
2021-09-13 22:34:14 +03:00
VERIFY ( url - > m_paths . size ( ) = = 1 & & url - > m_paths [ 0 ] . is_empty ( ) ) ;
2023-07-03 22:52:08 +12:00
// 1. If c is U+003F (?), then set url’ s query to the empty string and state to query state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' ? ' ) {
2023-04-09 14:21:00 +01:00
url - > m_paths [ 0 ] = buffer . string_view ( ) ;
2021-09-13 22:34:14 +03:00
url - > m_query = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Query ;
2023-07-03 22:52:08 +12:00
}
// 2. Otherwise, if c is U+0023 (#), then set url’ s fragment to the empty string and state to fragment state.
else if ( code_point = = ' # ' ) {
2021-05-25 22:13:15 +02:00
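// NOTE(review): An earlier note here said the value needs to be percent decoded, but the assignment below stores the buffer as-is, like the '?' and EOF branches. Confirm whether decoding is still intended.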
2023-04-09 14:21:00 +01:00
url - > m_paths [ 0 ] = buffer . string_view ( ) ;
2021-09-13 22:34:14 +03:00
url - > m_fragment = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise:
else {
// 1. If c is not the EOF code point, not a URL code point, and not U+0025 (%), invalid-URL-unit validation error.
2021-06-03 12:43:08 +02:00
if ( code_point ! = end_of_file & & ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
2021-05-25 22:13:15 +02:00
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// FIXME: 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
// 3. If c is not the EOF code point, UTF-8 percent-encode c using the C0 control percent-encode set and append the result to url’ s path.
2021-06-03 12:43:08 +02:00
if ( code_point ! = end_of_file ) {
2021-05-25 22:13:15 +02:00
URL : : append_percent_encoded_if_necessary ( buffer , code_point , URL : : PercentEncodeSet : : C0Control ) ;
} else {
2023-04-09 14:21:00 +01:00
url - > m_paths [ 0 ] = buffer . string_view ( ) ;
2021-05-25 22:13:15 +02:00
}
}
break ;
2023-07-03 22:52:08 +12:00
// -> query state, https://url.spec.whatwg.org/#query-state
2021-05-25 22:13:15 +02:00
case State : : Query :
2023-07-03 22:52:08 +12:00
// FIXME: 1. If encoding is not UTF-8 and one of the following is true:
// * url is not special
// * url’ s scheme is "ws" or "wss"
// then set encoding to UTF-8.
// 2. If one of the following is true:
// * FIXME: state override is not given and c is U+0023 (#)
// * c is the EOF code point
2021-06-03 12:43:08 +02:00
if ( code_point = = end_of_file | | code_point = = ' # ' ) {
2021-09-13 22:34:14 +03:00
VERIFY ( url - > m_query = = " " ) ;
2023-07-03 22:52:08 +12:00
// then:
// 1. Let queryPercentEncodeSet be the special-query percent-encode set if url is special; otherwise the query percent-encode set.
2021-09-13 22:34:14 +03:00
auto query_percent_encode_set = url - > is_special ( ) ? URL : : PercentEncodeSet : : SpecialQuery : URL : : PercentEncodeSet : : Query ;
2023-07-03 22:52:08 +12:00
// 2. Percent-encode after encoding, with encoding, buffer, and queryPercentEncodeSet, and append the result to url’ s query.
2022-04-10 00:48:15 +02:00
url - > m_query = percent_encode_after_encoding ( buffer . string_view ( ) , query_percent_encode_set ) ;
2023-07-03 22:52:08 +12:00
// 3. Set buffer to the empty string.
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
2023-07-03 22:52:08 +12:00
// 4. If c is U+0023 (#), then set url’ s fragment to the empty string and state to fragment state.
2021-05-25 22:13:15 +02:00
if ( code_point = = ' # ' ) {
2021-09-13 22:34:14 +03:00
url - > m_fragment = " " ;
2021-05-25 22:13:15 +02:00
state = State : : Fragment ;
}
2023-07-03 22:52:08 +12:00
}
// 3. Otherwise, if c is not the EOF code point:
else if ( code_point ! = end_of_file ) {
// 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
2021-05-25 22:13:15 +02:00
if ( ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// FIXME: 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
// 3. Append c to buffer.
2021-05-25 22:13:15 +02:00
buffer . append_code_point ( code_point ) ;
}
break ;
2023-07-03 22:52:08 +12:00
// -> fragment state, https://url.spec.whatwg.org/#fragment-state
2021-05-25 22:13:15 +02:00
case State : : Fragment :
// NOTE: This does not follow the spec exactly but rather uses the buffer and only sets the fragment on EOF.
2023-07-03 22:52:08 +12:00
// 1. If c is not the EOF code point, then:
2021-06-03 12:43:08 +02:00
if ( code_point ! = end_of_file ) {
2023-07-03 22:52:08 +12:00
// 1. If c is not a URL code point and not U+0025 (%), invalid-URL-unit validation error.
2021-05-25 22:13:15 +02:00
if ( ! is_url_code_point ( code_point ) & & code_point ! = ' % ' )
report_validation_error ( ) ;
2023-07-03 22:52:08 +12:00
// FIXME: 2. If c is U+0025 (%) and remaining does not start with two ASCII hex digits, validation error.
// FIXME: 3. UTF-8 percent-encode c using the fragment percent-encode set and append the result to url’ s fragment.
2021-05-25 22:13:15 +02:00
buffer . append_code_point ( code_point ) ;
} else {
2023-04-09 14:21:00 +01:00
url - > m_fragment = buffer . string_view ( ) ;
2021-05-25 22:13:15 +02:00
buffer . clear ( ) ;
}
break ;
default :
VERIFY_NOT_REACHED ( ) ;
}
if ( iterator . done ( ) )
break ;
+ + iterator ;
}
2021-09-13 22:34:14 +03:00
url - > m_valid = true ;
dbgln_if ( URL_PARSER_DEBUG , " URLParser::parse: Parsed URL to be '{}'. " , url - > serialize ( ) ) ;
2023-07-03 22:52:08 +12:00
// 10. Return url.
2021-09-13 22:34:14 +03:00
return url . release_value ( ) ;
2021-05-25 22:13:15 +02:00
}
}