AK+LibWeb: Add {Fly,}String::to_ascii_{upper,lower}_case()

These don't have to worry about the input not being valid UTF-8, and
so can be infallible (and can even return self if no changes are needed).

We use this instead of Infra::to_ascii_{upper,lower}_case in LibWeb.

(cherry picked from commit 073bcfd3866852a4c4bcca2bd131bd65ae53541f)
This commit is contained in:
Andreas Kling 2024-10-14 10:51:15 +02:00 committed by Nico Weber
parent 14b6e5b89c
commit e1ba881587
16 changed files with 147 additions and 13 deletions

View file

@ -173,6 +173,54 @@ ErrorOr<void> Formatter<FlyString>::format(FormatBuilder& builder, FlyString con
return Formatter<StringView>::format(builder, fly_string.bytes_as_string_view());
}
// Returns this FlyString with every ASCII uppercase letter replaced by its lowercase form.
// Bytes that are not ASCII uppercase letters are carried over unchanged.
// If the string contains no ASCII uppercase letters, *this is returned and no new string is built.
FlyString FlyString::to_ascii_lowercase() const
{
    auto const input = bytes();

    // Fast path: look for the first byte that would actually change.
    bool needs_lowercasing = false;
    for (u8 const ch : input) {
        if (AK::is_ascii_upper_alpha(ch)) {
            needs_lowercasing = true;
            break;
        }
    }
    if (!needs_lowercasing)
        return *this;

    // Slow path: rebuild the byte sequence. The capacity is reserved up front for
    // exactly input.size() bytes, which is how many unchecked appends follow.
    Vector<u8> converted;
    converted.ensure_capacity(input.size());
    for (u8 const ch : input) {
        u8 const mapped = AK::is_ascii_upper_alpha(ch) ? static_cast<u8>(AK::to_ascii_lowercase(ch)) : ch;
        converted.unchecked_append(mapped);
    }

    // The source bytes came from an existing string and only ASCII letters were
    // substituted, so the result is constructed without running UTF-8 validation.
    return String::from_utf8_without_validation(converted);
}
// Returns this FlyString with every ASCII lowercase letter replaced by its uppercase form.
// Bytes that are not ASCII lowercase letters are carried over unchanged.
// If the string contains no ASCII lowercase letters, *this is returned and no new string is built.
FlyString FlyString::to_ascii_uppercase() const
{
    auto const input = bytes();

    // Fast path: look for the first byte that would actually change.
    bool needs_uppercasing = false;
    for (u8 const ch : input) {
        if (AK::is_ascii_lower_alpha(ch)) {
            needs_uppercasing = true;
            break;
        }
    }
    if (!needs_uppercasing)
        return *this;

    // Slow path: rebuild the byte sequence. The capacity is reserved up front for
    // exactly input.size() bytes, which is how many unchecked appends follow.
    Vector<u8> converted;
    converted.ensure_capacity(input.size());
    for (u8 const ch : input) {
        u8 const mapped = AK::is_ascii_lower_alpha(ch) ? static_cast<u8>(AK::to_ascii_uppercase(ch)) : ch;
        converted.unchecked_append(mapped);
    }

    // The source bytes came from an existing string and only ASCII letters were
    // substituted, so the result is constructed without running UTF-8 validation.
    return String::from_utf8_without_validation(converted);
}
bool FlyString::equals_ignoring_ascii_case(FlyString const& other) const
{
if (*this == other)

View file

@ -66,6 +66,9 @@ public:
[[nodiscard]] bool equals_ignoring_ascii_case(FlyString const&) const;
[[nodiscard]] bool equals_ignoring_ascii_case(StringView) const;
// Returns this string with its ASCII uppercase letters converted to lowercase; all other bytes are kept as-is.
// Infallible. If there is nothing to convert, the string itself is returned.
[[nodiscard]] FlyString to_ascii_lowercase() const;
// Returns this string with its ASCII lowercase letters converted to uppercase; all other bytes are kept as-is.
// Infallible. If there is nothing to convert, the string itself is returned.
[[nodiscard]] FlyString to_ascii_uppercase() const;
[[nodiscard]] bool starts_with_bytes(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive) const;
[[nodiscard]] bool ends_with_bytes(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive) const;

View file

@ -319,6 +319,59 @@ ErrorOr<String> String::from_byte_string(ByteString const& byte_string)
return String::from_utf8(byte_string.view());
}
// Returns this String with every ASCII uppercase letter replaced by its lowercase form.
// Bytes that are not ASCII uppercase letters are carried over unchanged.
// If the string contains no ASCII uppercase letters, *this is returned and no new string is built.
String String::to_ascii_lowercase() const
{
    auto const input = bytes();

    // Fast path: look for the first byte that would actually change.
    bool needs_lowercasing = false;
    for (u8 const ch : input) {
        if (AK::is_ascii_upper_alpha(ch)) {
            needs_lowercasing = true;
            break;
        }
    }
    if (!needs_lowercasing)
        return *this;

    // Slow path: rebuild the byte sequence. The capacity is reserved up front for
    // exactly input.size() bytes, which is how many unchecked appends follow.
    Vector<u8> converted;
    converted.ensure_capacity(input.size());
    for (u8 const ch : input) {
        u8 const mapped = AK::is_ascii_upper_alpha(ch) ? static_cast<u8>(AK::to_ascii_lowercase(ch)) : ch;
        converted.unchecked_append(mapped);
    }

    // The source bytes came from an existing string and only ASCII letters were
    // substituted, so the result is constructed without running UTF-8 validation.
    return String::from_utf8_without_validation(converted);
}
// Returns this String with every ASCII lowercase letter replaced by its uppercase form.
// Bytes that are not ASCII lowercase letters are carried over unchanged.
// If the string contains no ASCII lowercase letters, *this is returned and no new string is built.
String String::to_ascii_uppercase() const
{
    auto const input = bytes();

    // Fast path: look for the first byte that would actually change.
    bool needs_uppercasing = false;
    for (u8 const ch : input) {
        if (AK::is_ascii_lower_alpha(ch)) {
            needs_uppercasing = true;
            break;
        }
    }
    if (!needs_uppercasing)
        return *this;

    // Slow path: rebuild the byte sequence. The capacity is reserved up front for
    // exactly input.size() bytes, which is how many unchecked appends follow.
    Vector<u8> converted;
    converted.ensure_capacity(input.size());
    for (u8 const ch : input) {
        u8 const mapped = AK::is_ascii_lower_alpha(ch) ? static_cast<u8>(AK::to_ascii_uppercase(ch)) : ch;
        converted.unchecked_append(mapped);
    }

    // The source bytes came from an existing string and only ASCII letters were
    // substituted, so the result is constructed without running UTF-8 validation.
    return String::from_utf8_without_validation(converted);
}
// Compares this string with another String by handing both byte views to
// StringUtils::equals_ignoring_ascii_case and returning its verdict.
bool String::equals_ignoring_ascii_case(String const& other) const
{
    auto const own_view = bytes_as_string_view();
    auto const other_view = other.bytes_as_string_view();
    return StringUtils::equals_ignoring_ascii_case(own_view, other_view);
}
bool String::equals_ignoring_ascii_case(StringView other) const
{
return StringUtils::equals_ignoring_ascii_case(bytes_as_string_view(), other);

View file

@ -95,9 +95,13 @@ public:
ErrorOr<String> to_titlecase(Optional<StringView> const& locale = {}, TrailingCodePointTransformation trailing_code_point_transformation = TrailingCodePointTransformation::Lowercase) const;
ErrorOr<String> to_casefold() const;
// Returns this string with its ASCII uppercase letters converted to lowercase; all other bytes are kept as-is.
// Infallible. If there is nothing to convert, the string itself is returned.
[[nodiscard]] String to_ascii_lowercase() const;
// Returns this string with its ASCII lowercase letters converted to uppercase; all other bytes are kept as-is.
// Infallible. If there is nothing to convert, the string itself is returned.
[[nodiscard]] String to_ascii_uppercase() const;
// Compare this String against another string with caseless matching. Using this method requires linking LibUnicode into your application.
[[nodiscard]] bool equals_ignoring_case(String const&) const;
[[nodiscard]] bool equals_ignoring_ascii_case(String const&) const;
[[nodiscard]] bool equals_ignoring_ascii_case(StringView) const;
[[nodiscard]] bool starts_with(u32 code_point) const;

View file

@ -1407,3 +1407,27 @@ TEST_CASE(ends_with)
EXPECT(emoji.ends_with(0x1F643));
EXPECT(!emoji.ends_with(0x1F600));
}
// Exercises String::to_ascii_lowercase(): plain ASCII conversion, inputs that need no
// conversion, non-ASCII passthrough, and the "return self when unchanged" fast path.
TEST_CASE(to_ascii_lowercase)
{
    EXPECT_EQ("foobar"_string.to_ascii_lowercase(), "foobar"_string);
    EXPECT_EQ("FooBar"_string.to_ascii_lowercase(), "foobar"_string);
    EXPECT_EQ("FOOBAR"_string.to_ascii_lowercase(), "foobar"_string);

    // Inputs with nothing to convert: the empty string and a string without any letters.
    EXPECT_EQ(""_string.to_ascii_lowercase(), ""_string);
    EXPECT_EQ("1234 !?"_string.to_ascii_lowercase(), "1234 !?"_string);

    // Only ASCII letters are converted; the bytes of non-ASCII code points must be left untouched.
    EXPECT_EQ("ÄPFEL"_string.to_ascii_lowercase(), "Äpfel"_string);

    // NOTE: We expect to_ascii_lowercase() to return the same underlying string if no changes are needed.
    auto long_string = "this is a long string that cannot use the short string optimization"_string;
    auto lowercased = long_string.to_ascii_lowercase();
    EXPECT_EQ(long_string.bytes().data(), lowercased.bytes().data());
}
// Exercises String::to_ascii_uppercase(): plain ASCII conversion, inputs that need no
// conversion, non-ASCII passthrough, and the "return self when unchanged" fast path.
TEST_CASE(to_ascii_uppercase)
{
    EXPECT_EQ("foobar"_string.to_ascii_uppercase(), "FOOBAR"_string);
    EXPECT_EQ("FooBar"_string.to_ascii_uppercase(), "FOOBAR"_string);
    EXPECT_EQ("FOOBAR"_string.to_ascii_uppercase(), "FOOBAR"_string);

    // Inputs with nothing to convert: the empty string and a string without any letters.
    EXPECT_EQ(""_string.to_ascii_uppercase(), ""_string);
    EXPECT_EQ("1234 !?"_string.to_ascii_uppercase(), "1234 !?"_string);

    // Only ASCII letters are converted; the bytes of non-ASCII code points must be left untouched
    // (a Unicode-aware uppercase would turn the sharp s into "SS", this one must not).
    EXPECT_EQ("straße"_string.to_ascii_uppercase(), "STRAßE"_string);

    // NOTE: We expect to_ascii_uppercase() to return the same underlying string if no changes are needed.
    auto long_string = "THIS IS A LONG STRING THAT CANNOT USE THE SHORT STRING OPTIMIZATION"_string;
    auto uppercased = long_string.to_ascii_uppercase();
    EXPECT_EQ(long_string.bytes().data(), uppercased.bytes().data());
}

View file

@ -395,7 +395,7 @@ Parser::ParseErrorOr<Selector::SimpleSelector> Parser::parse_pseudo_simple_selec
return Selector::SimpleSelector {
.type = Selector::SimpleSelector::Type::PseudoElement,
// Unknown -webkit- pseudo-elements must be serialized in ASCII lowercase.
.value = Selector::PseudoElement { Selector::PseudoElement::Type::UnknownWebKit, MUST(Infra::to_ascii_lowercase(pseudo_name.to_string())) },
.value = Selector::PseudoElement { Selector::PseudoElement::Type::UnknownWebKit, pseudo_name.to_string().to_ascii_lowercase() },
};
}

View file

@ -3682,7 +3682,7 @@ WebIDL::ExceptionOr<JS::NonnullGCPtr<Attr>> Document::create_attribute(String co
// 2. If this is an HTML document, then set localName to localName in ASCII lowercase.
// 3. Return a new attribute whose local name is localName and node document is this.
return Attr::create(*this, is_html_document() ? MUST(Infra::to_ascii_lowercase(local_name)) : local_name);
return Attr::create(*this, is_html_document() ? local_name.to_ascii_lowercase() : local_name);
}
// https://dom.spec.whatwg.org/#dom-document-createattributens

View file

@ -184,7 +184,7 @@ WebIDL::ExceptionOr<void> Element::set_attribute(FlyString const& name, String c
// 4. If attribute is null, create an attribute whose local name is qualifiedName, value is value, and node document
// is this's node document, then append this attribute to this, and then return.
if (!attribute) {
auto new_attribute = Attr::create(document(), insert_as_lowercase ? MUST(Infra::to_ascii_lowercase(name)) : name, value);
auto new_attribute = Attr::create(document(), insert_as_lowercase ? name.to_ascii_lowercase() : name, value);
m_attributes->append_attribute(new_attribute);
return {};
@ -354,7 +354,7 @@ WebIDL::ExceptionOr<bool> Element::toggle_attribute(FlyString const& name, Optio
// 1. If force is not given or is true, create an attribute whose local name is qualifiedName, value is the empty
// string, and node document is this's node document, then append this attribute to this, and then return true.
if (!force.has_value() || force.value()) {
auto new_attribute = Attr::create(document(), insert_as_lowercase ? MUST(Infra::to_ascii_lowercase(name)) : name.to_string(), String {});
auto new_attribute = Attr::create(document(), insert_as_lowercase ? name.to_ascii_lowercase() : name.to_string(), String {});
m_attributes->append_attribute(new_attribute);
return true;
@ -891,7 +891,7 @@ void Element::make_html_uppercased_qualified_name()
{
// This is allowed by the spec: "User agents could optimize qualified name and HTML-uppercased qualified name by storing them in internal slots."
if (namespace_uri() == Namespace::HTML && document().document_type() == Document::Type::HTML)
m_html_uppercased_qualified_name = MUST(Infra::to_ascii_uppercase(qualified_name()));
m_html_uppercased_qualified_name = qualified_name().to_ascii_uppercase();
else
m_html_uppercased_qualified_name = qualified_name();
}

View file

@ -65,7 +65,7 @@ Vector<FlyString> NamedNodeMap::supported_property_names() const
if (associated_element().namespace_uri() == Namespace::HTML) {
// 1. Let lowercaseName be name, in ASCII lowercase.
// 2. If lowercaseName is not equal to name, remove name from names.
names.remove_all_matching([](auto const& name) { return name != MUST(Infra::to_ascii_lowercase(name)); });
names.remove_all_matching([](auto const& name) { return name != name.to_ascii_lowercase(); });
}
// 3. Return names.

View file

@ -139,7 +139,7 @@ JS::NonnullGCPtr<HTMLCollection> ParentNode::get_elements_by_tag_name(FlyString
// 2. Otherwise, if root's node document is an HTML document, return a HTMLCollection rooted at root, whose filter matches the following descendant elements:
if (root().document().document_type() == Document::Type::HTML) {
FlyString qualified_name_in_ascii_lowercase = MUST(Infra::to_ascii_lowercase(qualified_name));
FlyString qualified_name_in_ascii_lowercase = qualified_name.to_ascii_lowercase();
return HTMLCollection::create(*this, HTMLCollection::Scope::Descendants, [qualified_name, qualified_name_in_ascii_lowercase](Element const& element) {
// - Whose namespace is the HTML namespace and whose qualified name is qualifiedName, in ASCII lowercase.
if (element.namespace_uri() == Namespace::HTML)

View file

@ -292,7 +292,7 @@ WebIDL::ExceptionOr<JS::NonnullGCPtr<Blob>> Blob::slice_blob(Optional<i64> start
}
// 2. Convert every character in relativeContentType to ASCII lowercase.
else {
relative_content_type = TRY_OR_THROW_OOM(vm, Infra::to_ascii_lowercase(content_type.value()));
relative_content_type = content_type.value().to_ascii_lowercase();
}
}

View file

@ -142,7 +142,7 @@ String DataTransfer::get_data(String const& format_argument) const
return {};
// 3. Let format be the first argument, converted to ASCII lowercase.
auto format = MUST(Infra::to_ascii_lowercase(format_argument));
auto format = format_argument.to_ascii_lowercase();
// 4. Let convert-to-URL be false.
[[maybe_unused]] bool convert_to_url = false;

View file

@ -76,7 +76,7 @@ WebIDL::ExceptionOr<JS::GCPtr<DataTransferItem>> DataTransferItemList::add(Strin
// method's first argument.
auto item = m_data_transfer->add_item({
.kind = HTML::DragDataStoreItem::Kind::Text,
.type_string = MUST(Infra::to_ascii_lowercase(type)),
.type_string = type.to_ascii_lowercase(),
.data = MUST(ByteBuffer::copy(data.bytes())),
.file_name = {},
});
@ -100,7 +100,7 @@ JS::GCPtr<DataTransferItem> DataTransferItemList::add(JS::NonnullGCPtr<FileAPI::
// converted to ASCII lowercase, and whose data is the same as the File's data.
auto item = m_data_transfer->add_item({
.kind = HTML::DragDataStoreItem::Kind::File,
.type_string = MUST(Infra::to_ascii_lowercase(file->type())),
.type_string = file->type().to_ascii_lowercase(),
.data = MUST(ByteBuffer::copy(file->raw_bytes())),
.file_name = file->name().to_byte_string(),
});

View file

@ -1498,7 +1498,7 @@ String HTMLInputElement::value_sanitization_algorithm(String const& value) const
// https://html.spec.whatwg.org/multipage/input.html#color-state-(type=color):value-sanitization-algorithm
// If the value of the element is a valid simple color, then set it to the value of the element converted to ASCII lowercase;
if (is_valid_simple_color(value))
return MUST(Infra::to_ascii_lowercase(value));
return value.to_ascii_lowercase();
// otherwise, set it to the string "#000000".
return "#000000"_string;
}

View file

@ -136,7 +136,7 @@ void HTMLLinkElement::attribute_changed(FlyString const& name, Optional<String>
if (name == HTML::AttributeNames::rel) {
m_relationship = 0;
// Keywords are always ASCII case-insensitive, and must be compared as such.
auto lowercased_value = MUST(Infra::to_ascii_lowercase(value.value_or(String {})));
auto lowercased_value = value.value_or(String {}).to_ascii_lowercase();
// To determine which link types apply to a link, a, area, or form element,
// the element's rel attribute must be split on ASCII whitespace.
// The resulting tokens are the keywords for the link types that apply to that element.

View file

@ -3,11 +3,13 @@
* Copyright (c) 2022, networkException <networkexception@serenityos.org>
* Copyright (c) 2023, Kenneth Myhra <kennethmyhra@serenityos.org>
* Copyright (c) 2023, Sam Atkins <atkinssj@serenityos.org>
* Copyright (c) 2024, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/CharacterTypes.h>
#include <AK/FlyString.h>
#include <AK/GenericLexer.h>
#include <AK/String.h>
#include <AK/Utf16View.h>