LibXML: Read code points when parsing names

(cherry picked from commit 453e0348010561b4de922ee37e49d324eeedf3ac)
This commit is contained in:
Gingeh 2024-11-06 18:40:14 +11:00 committed by Nico Weber
parent 2717669104
commit b571b4317c
2 changed files with 26 additions and 6 deletions

View file

@ -41,3 +41,9 @@ TEST_CASE(predefined_character_reference)
auto const& content = node.children[0]->content.get<XML::Node::Text>();
EXPECT_EQ(content.builder.string_view(), "Well hello &, <, >, ', and \"!");
}
// Checks that a document whose attribute name consists of non-ASCII
// (multi-byte UTF-8) characters is accepted by the XML parser.
TEST_CASE(unicode_name)
{
    auto const source = "<div 中文=\"\"></div>"sv;
    XML::Parser xml_parser(source);
    // Only the success of parsing is checked; the resulting document is discarded.
    TRY_OR_FAIL(xml_parser.parse());
}

View file

@ -542,16 +542,30 @@ ErrorOr<Name, ParseError> Parser::parse_name()
auto rule = enter_rule();
// Name ::= NameStartChar (NameChar)*
auto start = TRY(expect(s_name_start_characters, "a NameStartChar"sv));
// FIXME: This is a hacky workaround to read code points instead of bytes.
// Replace this once we have a unicode-aware lexer.
auto start = m_lexer.tell();
StringView remaining = m_lexer.input().substring_view(start);
Utf8View view { remaining };
auto code_points = view.begin();
if (code_points.done() || !s_name_start_characters.contains(*code_points)) {
if (m_options.treat_errors_as_fatal)
return parse_error(m_lexer.current_position(), Expectation { "a NameStartChar"sv });
}
m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
++code_points;
auto accept = accept_rule();
auto rest = m_lexer.consume_while(s_name_characters);
StringBuilder builder;
builder.append(start);
builder.append(rest);
while (!code_points.done() && s_name_characters.contains(*code_points)) {
m_lexer.ignore(code_points.underlying_code_point_length_in_bytes());
++code_points;
}
rollback.disarm();
return builder.to_byte_string();
return remaining.substring_view(0, m_lexer.tell() - start);
}
// 2.8.28. doctypedecl, https://www.w3.org/TR/2006/REC-xml11-20060816/#NT-doctypedecl