From df87a9689c11560ef0f9e1b7f9241049a86cbf53 Mon Sep 17 00:00:00 2001 From: Ryan Liptak Date: Fri, 20 Dec 2024 06:05:37 -0800 Subject: [PATCH] LibWeb: Fix numeric character reference at EOF leaking its last digit Previously, if the NumericCharacterReferenceEnd state was reached when current_input_character was None, then the DONT_CONSUME_NEXT_INPUT_CHARACTER macro would restore back before the EOF, and allow the next state (after the SWITCH_TO_RETURN_STATE) to proceed with the last digit of the numeric character reference. For example, with something like `&#1111`, before this commit the output would incorrectly be `ї1` instead of just `ї`. Rather than putting the `if (current_input_character.has_value())` check inside NumericCharacterReferenceEnd directly, it was added to DONT_CONSUME_NEXT_INPUT_CHARACTER, because all usages of the macro benefit from this check, even if the other existing usage sites don't exhibit any bugs without it: - In MarkupDeclarationOpen, if the current_input_character is EOF, then the previous character is always `!`, so restoring and then checking forward for strings like `--`, `DOCTYPE`, etc. won't match and the BogusComment state will run one extra time (once for `!` and once for EOF) with no practical consequences. With the `has_value()` check, BogusComment will only run once with EOF. - In AfterDOCTYPEName, ConsumeNextResult::RanOutOfCharacters can only occur when stopping at the insertion point, and because of how the code is structured, it is guaranteed that current_input_character is either `P` or `S`, so the `has_value()` check is irrelevant. 
--- Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp | 7 ++++--- Tests/LibWeb/TestHTMLTokenizer.cpp | 9 +++++++++ .../wpt-import/html/syntax/parsing/html5lib_tests2.txt | 5 ++--- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp index fa69194299b..50a4f8c00e5 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp @@ -94,9 +94,10 @@ namespace Web::HTML { } \ } while (0) -#define DONT_CONSUME_NEXT_INPUT_CHARACTER \ - do { \ - restore_to(m_prev_utf8_iterator); \ +#define DONT_CONSUME_NEXT_INPUT_CHARACTER \ + do { \ + if (current_input_character.has_value()) \ + restore_to(m_prev_utf8_iterator); \ } while (0) #define ON(code_point) \ diff --git a/Tests/LibWeb/TestHTMLTokenizer.cpp b/Tests/LibWeb/TestHTMLTokenizer.cpp index a859af123eb..3689ba40054 100644 --- a/Tests/LibWeb/TestHTMLTokenizer.cpp +++ b/Tests/LibWeb/TestHTMLTokenizer.cpp @@ -199,6 +199,15 @@ TEST_CASE(character_reference_in_attribute) END_ENUMERATION(); } +TEST_CASE(numeric_character_reference) +{ + auto tokens = run_tokenizer("&#1111"sv); + BEGIN_ENUMERATION(tokens); + EXPECT_CHARACTER_TOKEN(1111); + EXPECT_END_OF_FILE_TOKEN(); + END_ENUMERATION(); +} + TEST_CASE(comment) { auto tokens = run_tokenizer("

"sv); diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/syntax/parsing/html5lib_tests2.txt b/Tests/LibWeb/Text/expected/wpt-import/html/syntax/parsing/html5lib_tests2.txt index e4d23453164..648f7260739 100644 --- a/Tests/LibWeb/Text/expected/wpt-import/html/syntax/parsing/html5lib_tests2.txt +++ b/Tests/LibWeb/Text/expected/wpt-import/html/syntax/parsing/html5lib_tests2.txt @@ -2,8 +2,7 @@ Harness status: OK Found 63 tests -62 Pass -1 Fail +63 Pass Pass html5lib_tests2.html e070301fb578bd639ecbc7ec720fa60222d05826 Pass html5lib_tests2.html aaf24dabcb42470e447d241a40def0d136c12b93 Pass html5lib_tests2.html b6c1142484570bb90c36e454ee193cca17bb618a @@ -27,7 +26,7 @@ Pass html5lib_tests2.html 73b97cd984a62703ec54ec4a876ec32aa5fd3b8c Pass html5lib_tests2.html 2db9616ed62fc2a26056f3395459869cf556974d Pass html5lib_tests2.html b59aa1c714892618eaccd51696658887fcbd2045 Pass html5lib_tests2.html 98818e7fda2506603bd208662613edb40297c2d3 -Fail html5lib_tests2.html e0c43080cf61c0696031bdb097bea4f2a647cfc2 +Pass html5lib_tests2.html e0c43080cf61c0696031bdb097bea4f2a647cfc2 Pass html5lib_tests2.html f7753d80a422c40b5fa04d99e52d8ae83369757a Pass html5lib_tests2.html 7cbd584aef9508a90c98f80040078149a92ec869 Pass html5lib_tests2.html e0f7f130b1e3653dd06f10f3492e4f0bf4cd3cfa