2020-01-18 09:38:21 +01:00
/*
* Copyright ( c ) 2018 - 2020 , Andreas Kling < kling @ serenityos . org >
2021-05-23 23:31:16 +02:00
* Copyright ( c ) 2021 , Max Wipfli < mail @ maxwipfli . ch >
2023-07-23 20:10:32 +12:00
* Copyright ( c ) 2023 , Shannon Booth < shannon @ serenityos . org >
2020-01-18 09:38:21 +01:00
*
2021-04-22 01:24:48 -07:00
* SPDX - License - Identifier : BSD - 2 - Clause
2020-01-18 09:38:21 +01:00
*/
2019-08-10 17:27:56 +02:00
# pragma once
2022-12-04 18:02:33 +00:00
# include <AK/DeprecatedString.h>
2023-02-28 21:35:41 +01:00
# include <AK/String.h>
2019-08-10 17:27:56 +02:00
# include <AK/StringView.h>
2021-11-10 11:05:21 +01:00
# include <AK/Vector.h>
2019-08-10 17:27:56 +02:00
2022-09-25 20:54:06 +02:00
// On Linux distros that use mlibc `basename` is defined as a macro that expands to `__mlibc_gnu_basename` or `__mlibc_gnu_basename_c`, so we undefine it.
# if defined(AK_OS_LINUX) && defined(basename)
# undef basename
# endif
2019-08-10 17:27:56 +02:00
namespace AK {
2023-07-23 20:10:32 +12:00
// https://url.spec.whatwg.org/#url-representation
// A URL is a struct that represents a universal identifier. To disambiguate from a valid URL string it can also be referred to as a URL record.
2019-08-10 17:27:56 +02:00
class URL {
2021-05-25 22:13:15 +02:00
friend class URLParser ;
2019-08-10 17:27:56 +02:00
public :
2021-05-25 13:50:03 +02:00
// Selects which set of code points gets percent-encoded; taken by percent_encode() and
// code_point_is_in_percent_encode_set().
// NOTE(review): the names appear to mirror the percent-encode sets of the WHATWG URL Standard
// (EncodeURI presumably corresponding to ECMAScript's encodeURI) - confirm against the implementation.
enum class PercentEncodeSet {
    C0Control,
    Fragment,
    Query,
    SpecialQuery,
    Path,
    Userinfo,
    Component,
    ApplicationXWWWFormUrlencoded,
    EncodeURI,
};
2021-05-25 22:32:39 +02:00
// Yes/No switch accepted by serialize() and equals().
// NOTE(review): presumably Yes means the fragment is left out of the result/comparison - confirm in the implementation.
enum class ExcludeFragment {
    No,
    Yes,
};
2021-01-10 16:29:28 -07:00
URL ( ) = default ;
2021-11-11 00:55:02 +01:00
URL ( StringView ) ;
2022-12-04 18:02:33 +00:00
// Convenience constructor: hands a view of the DeprecatedString to URL(StringView).
URL(DeprecatedString const& string)
    : URL(string.view())
{
}
2023-02-28 21:35:41 +01:00
// Convenience constructor: hands the String's bytes, viewed as a StringView, to URL(StringView).
URL(String const& string)
    : URL(string.bytes_as_string_view())
{
}
2019-08-10 17:27:56 +02:00
2023-07-26 20:49:49 +12:00
// https://url.spec.whatwg.org/#concept-ipv4
// An IPv4 address is a 32-bit unsigned integer that identifies a network address. [RFC791]
// FIXME: It would be nice if this were an AK::IPv4Address
using IPv4Address = u32;

// https://url.spec.whatwg.org/#concept-ipv6
// An IPv6 address is a 128-bit unsigned integer that identifies a network address. For the purposes of this standard
// it is represented as a list of eight 16-bit unsigned integers, also known as IPv6 pieces. [RFC4291]
// FIXME: It would be nice if this were an AK::IPv6Address
using IPv6Address = Array<u16, 8>;

// https://url.spec.whatwg.org/#concept-host
// A host is a domain, an IP address, an opaque host, or an empty host. Typically a host serves as a network address,
// but it is sometimes used as opaque identifier in URLs where a network address is not necessary.
// NOTE(review): the Empty alternative presumably stands for the "null" host mentioned on m_host - confirm.
using Host = Variant<IPv4Address, IPv6Address, String, Empty>;
2021-09-13 22:42:48 +03:00
// Returns the stored m_valid flag; nothing is recomputed here.
bool is_valid() const
{
    return m_valid;
}
2021-05-23 23:31:16 +02:00
2023-04-13 23:06:58 +01:00
// Yes/No switch for percent-decoding. Note that Yes is listed first, unlike the other Yes/No enums in this class.
// NOTE(review): no declaration visible in this header takes this type - check for remaining users.
enum class ApplyPercentDecoding {
    Yes,
    No,
};
2022-12-04 18:02:33 +00:00
// Returns a reference to the stored scheme (m_scheme).
DeprecatedString const& scheme() const
{
    return m_scheme;
}
2023-08-12 16:52:38 +12:00
ErrorOr < String > username ( ) const ;
ErrorOr < String > password ( ) const ;
2023-07-27 21:40:41 +12:00
// Returns a reference to the stored host variant (m_host).
Host const& host() const
{
    return m_host;
}
ErrorOr < String > serialized_host ( ) const ;
2023-08-06 16:43:50 +12:00
DeprecatedString basename ( ) const ;
DeprecatedString query ( ) const ;
2023-08-09 17:23:23 +01:00
// NOTE: fragment() is percent-decoded, raw_fragment() is not.
2023-08-06 16:43:50 +12:00
DeprecatedString fragment ( ) const ;
2023-08-09 17:23:23 +01:00
DeprecatedString raw_fragment ( ) const ;
2021-09-13 23:12:16 +03:00
// Returns a copy of the stored port (m_port); the Optional is empty when no port is set.
Optional<u16> port() const
{
    return m_port;
}
2023-08-06 16:43:50 +12:00
DeprecatedString path_segment_at_index ( size_t index ) const ;
2023-04-13 23:29:51 +01:00
// Number of entries currently held in m_paths.
size_t path_segment_count() const
{
    return m_paths.size();
}
2023-04-13 23:06:58 +01:00
2021-09-13 23:12:16 +03:00
// Returns the stored port if there is one, otherwise whatever default_port_for_scheme() reports for the scheme.
u16 port_or_default() const
{
    // The fallback is looked up unconditionally, exactly as when it is passed straight to value_or().
    u16 const fallback_port = default_port_for_scheme(m_scheme);
    return m_port.value_or(fallback_port);
}
2021-09-13 22:42:48 +03:00
// Returns the stored m_cannot_be_a_base_url flag.
bool cannot_be_a_base_url() const
{
    return m_cannot_be_a_base_url;
}
2023-07-26 20:54:36 +12:00
bool cannot_have_a_username_or_password_or_port ( ) const ;
2019-08-10 17:27:56 +02:00
2021-05-25 22:05:01 +02:00
// True when the stored username or the stored password is non-empty.
bool includes_credentials() const
{
    return !(m_username.is_empty() && m_password.is_empty());
}

// True when is_special_scheme() accepts this URL's scheme.
bool is_special() const
{
    return is_special_scheme(m_scheme);
}
2022-12-04 18:02:33 +00:00
void set_scheme ( DeprecatedString ) ;
2023-08-12 16:52:38 +12:00
ErrorOr < void > set_username ( StringView ) ;
ErrorOr < void > set_password ( StringView ) ;
2023-07-27 21:40:41 +12:00
void set_host ( Host ) ;
2021-09-13 23:12:16 +03:00
void set_port ( Optional < u16 > ) ;
2023-08-06 16:32:44 +12:00
void set_paths ( Vector < DeprecatedString > const & ) ;
void set_query ( StringView ) ;
void set_fragment ( StringView fragment ) ;
2021-06-01 10:58:27 +02:00
// Overwrites the m_cannot_be_a_base_url flag; no other state is touched here.
void set_cannot_be_a_base_url(bool value)
{
    m_cannot_be_a_base_url = value;
}
2023-08-06 16:32:44 +12:00
void append_path ( StringView ) ;
2023-04-09 14:21:00 +01:00
void append_slash ( )
{
// NOTE: To indicate that we want to end the path with a slash, we have to append an empty path segment.
2023-08-06 16:13:08 +12:00
m_paths . append ( " " ) ;
2023-04-09 14:21:00 +01:00
}
2019-10-05 10:14:42 +02:00
2023-08-06 16:43:50 +12:00
DeprecatedString serialize_path ( ) const ;
2022-12-04 18:02:33 +00:00
// Serializes this URL to a string. The ExcludeFragment argument defaults to ExcludeFragment::No.
// NOTE(review): defined outside this header; presumably ExcludeFragment::Yes omits the fragment - confirm.
DeprecatedString serialize(ExcludeFragment = ExcludeFragment::No) const;
// NOTE(review): presumably a serialization intended for showing to users - confirm in the implementation.
DeprecatedString serialize_for_display() const;
2022-12-06 01:12:49 +00:00
// Same result as serialize() called with its default argument.
DeprecatedString to_deprecated_string() const
{
    return serialize();
}
2023-06-17 09:15:40 +02:00
ErrorOr < String > to_string ( ) const ;
2021-05-27 21:38:16 +02:00
2021-09-13 22:18:14 +03:00
// HTML origin
2022-12-04 18:02:33 +00:00
DeprecatedString serialize_origin ( ) const ;
2021-09-13 22:18:14 +03:00
2021-06-01 10:58:27 +02:00
bool equals ( URL const & other , ExcludeFragment = ExcludeFragment : : No ) const ;
2021-05-27 21:38:16 +02:00
2023-02-13 17:42:27 +00:00
URL complete_url ( StringView ) const ;
2019-11-18 22:04:39 +01:00
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took its own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URL.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 19:11:58 +02:00
// Value returned (wrapped in ErrorOr) by process_data_url(): a MIME type plus the body bytes of a 'data:' URL.
struct DataURL {
    String mime_type;
    ByteBuffer body;
};
ErrorOr < DataURL > process_data_url ( ) const ;
2020-04-26 22:48:54 +02:00
2022-12-04 18:02:33 +00:00
static URL create_with_url_or_path ( DeprecatedString const & ) ;
static URL create_with_file_scheme ( DeprecatedString const & path , DeprecatedString const & fragment = { } , DeprecatedString const & hostname = { } ) ;
static URL create_with_help_scheme ( DeprecatedString const & path , DeprecatedString const & fragment = { } , DeprecatedString const & hostname = { } ) ;
AK: Decode data URLs to separate class (and parse like every other URL)
Parsing 'data:' URLs took its own route. It never set standard URL
fields like path, query or fragment (except for scheme) and instead
gave us separate methods called `data_payload()`, `data_mime_type()`,
and `data_payload_is_base64()`.
Because parsing 'data:' didn't use standard fields, running the
following JS code:
new URL('#a', 'data:text/plain,hello').toString()
not only cleared the path as URLParser doesn't check for data from
data_payload() function (making the result be 'data:#a'), but it also
crashes the program because we forbid having an empty MIME type when we
serialize to string.
With this change, 'data:' URLs will be parsed like every other URL.
To decode the 'data:' URL contents, one needs to call process_data_url()
on a URL, which will return a struct containing MIME type with already
decoded data! :^)
2023-07-06 19:11:58 +02:00
static URL create_with_data ( StringView mime_type , StringView payload , bool is_base64 = false ) ;
2021-05-25 22:05:01 +02:00
2021-11-11 00:55:02 +01:00
static u16 default_port_for_scheme ( StringView ) ;
static bool is_special_scheme ( StringView ) ;
2020-04-18 22:02:04 +02:00
2022-04-09 18:34:49 +02:00
// Yes/No switch accepted by percent_encode().
// NOTE(review): presumably Yes encodes a space as '+' - confirm in the implementation.
enum class SpaceAsPlus {
    No,
    Yes,
};
2022-12-04 18:02:33 +00:00
// Percent-encodes `input`. `set` defaults to PercentEncodeSet::Userinfo and the space handling to SpaceAsPlus::No.
// NOTE(review): defined outside this header - exact encoding rules not visible here.
static DeprecatedString percent_encode(StringView input, PercentEncodeSet set = PercentEncodeSet::Userinfo, SpaceAsPlus = SpaceAsPlus::No);
// NOTE(review): presumably the inverse operation, turning percent-escapes in `input` back into bytes - confirm.
static DeprecatedString percent_decode(StringView input);
2021-05-25 13:50:03 +02:00
2021-06-01 11:14:30 +02:00
bool operator = = ( URL const & other ) const { return equals ( other , ExcludeFragment : : No ) ; }
2020-06-01 21:50:07 +02:00
2022-04-10 00:48:15 +02:00
static bool code_point_is_in_percent_encode_set ( u32 code_point , URL : : PercentEncodeSet ) ;
2023-08-12 16:52:38 +12:00
// Returns a reference to m_username exactly as stored.
String const& raw_username() const
{
    return m_username;
}

// Returns a reference to m_password exactly as stored.
String const& raw_password() const
{
    return m_password;
}
2019-08-10 17:27:56 +02:00
private :
2020-04-11 23:07:23 +02:00
bool compute_validity ( ) const ;
2019-08-10 17:27:56 +02:00
2022-04-08 14:20:30 +01:00
// Appends `code_point` to the builder; `set` defaults to PercentEncodeSet::Userinfo.
// NOTE(review): presumably the code point is percent-encoded only when it belongs to `set` - confirm in the implementation.
static void append_percent_encoded_if_necessary(StringBuilder&, u32 code_point, PercentEncodeSet set = PercentEncodeSet::Userinfo);
2021-05-25 13:50:03 +02:00
static void append_percent_encoded ( StringBuilder & , u32 code_point ) ;
2019-08-10 17:27:56 +02:00
bool m_valid { false } ;
2021-05-25 21:32:20 +02:00
2023-07-23 20:10:32 +12:00
// A URL’s scheme is an ASCII string that identifies the type of URL and can be used to dispatch a URL for further processing after parsing. It is initially the empty string.
2022-12-04 18:02:33 +00:00
DeprecatedString m_scheme ;
2023-07-23 20:10:32 +12:00
// A URL’s username is an ASCII string identifying a username. It is initially the empty string.
2023-08-12 16:52:38 +12:00
String m_username ;
2023-07-23 20:10:32 +12:00
// A URL’s password is an ASCII string identifying a password. It is initially the empty string.
2023-08-12 16:52:38 +12:00
String m_password ;
2023-07-23 20:10:32 +12:00
// A URL’s host is null or a host. It is initially null.
2023-07-27 21:40:41 +12:00
Host m_host ;
2023-07-23 20:10:32 +12:00
// A URL’s port is either null or a 16-bit unsigned integer that identifies a networking port. It is initially null.
2021-09-13 23:12:16 +03:00
Optional < u16 > m_port ;
2023-07-23 20:10:32 +12:00
// A URL’s path is either a URL path segment or a list of zero or more URL path segments, usually identifying a location. It is initially « ».
// A URL path segment is an ASCII string. It commonly refers to a directory or a file, but has no predefined meaning.
2022-12-04 18:02:33 +00:00
Vector < DeprecatedString > m_paths ;
2023-07-23 20:10:32 +12:00
// A URL’s query is either null or an ASCII string. It is initially null.
2022-12-04 18:02:33 +00:00
DeprecatedString m_query ;
2023-07-23 20:10:32 +12:00
// A URL’s fragment is either null or an ASCII string that can be used for further processing on the resource the URL’s other components identify. It is initially null.
2022-12-04 18:02:33 +00:00
DeprecatedString m_fragment ;
2021-05-25 21:32:20 +02:00
bool m_cannot_be_a_base_url { false } ;
2019-08-10 17:27:56 +02:00
} ;
2020-10-04 13:29:47 +02:00
// Formatter specialization for URL: the value is rendered through URL::serialize() and then
// formatted by the StringView formatter it derives from.
template<>
struct Formatter<URL> : Formatter<StringView> {
    ErrorOr<void> format(FormatBuilder& builder, URL const& value)
    {
        return Formatter<StringView>::format(builder, value.serialize());
    }
};
2020-06-01 21:50:07 +02:00
// Traits specialization for URL: the hash is the hash of the URL's to_deprecated_string() result.
template<>
struct Traits<URL> : public GenericTraits<URL> {
    static unsigned hash(URL const& url)
    {
        auto const serialized = url.to_deprecated_string();
        return serialized.hash();
    }
};
2020-05-16 18:35:39 +01:00
}