mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-01-22 09:12:13 -05:00
LibXML: Read code points when parsing names
This commit is contained in:
parent
42b31820a6
commit
453e034801
Notes:
github-actions[bot]
2024-11-06 09:09:03 +00:00
Author: https://github.com/Gingeh Commit: https://github.com/LadybirdBrowser/ladybird/commit/453e0348010 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/2192 Reviewed-by: https://github.com/alimpfard
2 changed files with 26 additions and 6 deletions
|
@ -41,3 +41,9 @@ TEST_CASE(predefined_character_reference)
|
|||
auto const& content = node.children[0]->content.get<XML::Node::Text>();
|
||||
EXPECT_EQ(content.builder.string_view(), "Well hello &, <, >, ', and \"!");
|
||||
}
|
||||
|
||||
// Checks that a document whose attribute name is made of non-ASCII
// code points (中文) is parsed without the parser reporting an error.
TEST_CASE(unicode_name)
|
||||
{
|
||||
// The attribute name below is written with non-ASCII characters; its value is empty.
XML::Parser parser("<div 中文=\"\"></div>"sv);
|
||||
TRY_OR_FAIL(parser.parse());
|
||||
}
|
||||
|
|
|
@ -545,16 +545,30 @@ ErrorOr<Name, ParseError> Parser::parse_name()
|
|||
auto rule = enter_rule();
|
||||
|
||||
// Name ::= NameStartChar (NameChar)*
|
||||
auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv));
|
||||
|
||||
// FIXME: This is a hacky workaround to read code points instead of bytes.
|
||||
// Replace this once we have a unicode-aware lexer.
|
||||
auto start = m_lexer.tell();
|
||||
StringView remaining = m_lexer.input().substring_view(start);
|
||||
Utf8View view { remaining };
|
||||
auto code_points = view.begin();
|
||||
if (code_points.done() || !s_name_start_characters.contains(*code_points)) {
|
||||
if (m_options.treat_errors_as_fatal)
|
||||
return parse_error(m_lexer.current_position(), Expectation { "a NameStartChar"sv });
|
||||
}
|
||||
|
||||
m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
|
||||
++code_points;
|
||||
|
||||
auto accept = accept_rule();
|
||||
|
||||
auto rest = m_lexer.consume_while(s_name_characters);
|
||||
StringBuilder builder;
|
||||
builder.append(start);
|
||||
builder.append(rest);
|
||||
while (!code_points.done() && s_name_characters.contains(*code_points)) {
|
||||
m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
|
||||
++code_points;
|
||||
}
|
||||
|
||||
rollback.disarm();
|
||||
return builder.to_byte_string();
|
||||
return remaining.substring_view(0, m_lexer.tell() - start);
|
||||
}
|
||||
|
||||
// 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl
|
||||
|
|
Loading…
Reference in a new issue