mirror of
https://github.com/SerenityOS/serenity.git
synced 2025-01-24 02:12:09 -05:00
LibRegex: Implement legacy octal escape parsing closer to the spec
The grammar for the ECMA-262 CharacterEscape is:

    CharacterEscape[U, N] ::
        ControlEscape
        c ControlLetter
        0 [lookahead ∉ DecimalDigit]
        HexEscapeSequence
        RegExpUnicodeEscapeSequence[?U]
        [~U]LegacyOctalEscapeSequence
        IdentityEscape[?U, ?N]

It's important to parse the standalone "\0 [lookahead ∉ DecimalDigit]" before parsing LegacyOctalEscapeSequence. Otherwise, all standalone "\0" patterns are parsed as octal escapes, which are disallowed in Unicode mode.

Further, LegacyOctalEscapeSequence should also be parsed while parsing character classes.
This commit is contained in:
parent
83ca8c7e38
commit
6a485f612f
2 changed files with 43 additions and 9 deletions
|
@ -22,6 +22,12 @@ static PosixOptions match_test_api_options(const PosixOptions options)
|
|||
return options;
|
||||
}
|
||||
|
||||
// Combines any number of ECMAScriptFlags values into a single ECMAScriptFlags value
// by OR-ing their regex::FlagsUnderlyingType representations together.
// The requires-clause only admits calls where every deduced Flags type is exactly ECMAScriptFlags.
// NOTE(review): the parameters are forwarding references, so an lvalue argument would presumably
// deduce as a reference type and fail the IsSame constraint - confirm before passing named variables.
template<typename... Flags>
|
||||
static constexpr ECMAScriptFlags combine_flags(Flags&&... flags) requires((IsSame<Flags, ECMAScriptFlags> && ...))
|
||||
{
|
||||
// Fold expression over `|`: each flag is cast to the underlying type first, and the result is cast back to the enum.
return static_cast<ECMAScriptFlags>((static_cast<regex::FlagsUnderlyingType>(flags) | ...));
|
||||
}
|
||||
|
||||
TEST_CASE(regex_options_ecmascript)
|
||||
{
|
||||
ECMAScriptOptions eo;
|
||||
|
@ -543,6 +549,14 @@ TEST_CASE(ECMA262_parse)
|
|||
{ "\\A"sv, regex::Error::InvalidCharacterClass, ECMAScriptFlags::Unicode },
|
||||
{ "[\\A]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
|
||||
{ "[\\A]"sv, regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
|
||||
{ "\\0"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
|
||||
{ "\\0"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
|
||||
{ "\\00"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
|
||||
{ "\\00"sv, regex::Error::InvalidCharacterClass, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
|
||||
{ "[\\0]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
|
||||
{ "[\\0]"sv, regex::Error::NoError, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
|
||||
{ "[\\00]"sv, regex::Error::NoError, ECMAScriptFlags::BrowserExtended },
|
||||
{ "[\\00]"sv, regex::Error::InvalidPattern, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
|
||||
};
|
||||
|
||||
for (auto& test : tests) {
|
||||
|
@ -606,6 +620,12 @@ TEST_CASE(ECMA262_match)
|
|||
"return /xx/"sv, true, ECMAScriptFlags::BrowserExtended
|
||||
}, // #5517, appears to be matching JS expressions that involve regular expressions...
|
||||
{ "a{2,}"sv, "aaaa"sv }, // #5518
|
||||
{ "\\0"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
|
||||
{ "\\0"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
|
||||
{ "\\01"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
|
||||
{ "[\\0]"sv, "\0"sv, true, ECMAScriptFlags::BrowserExtended },
|
||||
{ "[\\0]"sv, "\0"sv, true, combine_flags(ECMAScriptFlags::Unicode, ECMAScriptFlags::BrowserExtended) },
|
||||
{ "[\\01]"sv, "\1"sv, true, ECMAScriptFlags::BrowserExtended },
|
||||
};
|
||||
// clang-format on
|
||||
|
||||
|
|
|
@ -18,6 +18,7 @@ namespace regex {
|
|||
|
||||
static constexpr size_t s_maximum_repetition_count = 1024 * 1024;
|
||||
static constexpr auto s_alphabetic_characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"sv;
|
||||
static constexpr auto s_decimal_characters = "0123456789"sv;
|
||||
|
||||
ALWAYS_INLINE bool Parser::set_error(Error error)
|
||||
{
|
||||
|
@ -1430,6 +1431,17 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
|||
return true;
|
||||
}
|
||||
|
||||
// '\0'
|
||||
if (try_skip("0")) {
|
||||
if (!lookahead_any(s_decimal_characters)) {
|
||||
match_length_minimum += 1;
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
|
||||
return true;
|
||||
}
|
||||
|
||||
back();
|
||||
}
|
||||
|
||||
// LegacyOctalEscapeSequence
|
||||
if (m_should_use_browser_extended_grammar) {
|
||||
if (!unicode) {
|
||||
|
@ -1441,13 +1453,6 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
|||
}
|
||||
}
|
||||
|
||||
// '\0'
|
||||
if (try_skip("0")) {
|
||||
match_length_minimum += 1;
|
||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)0 } });
|
||||
return true;
|
||||
}
|
||||
|
||||
// HexEscape
|
||||
if (try_skip("x")) {
|
||||
if (auto hex_escape = read_digits(ReadDigitsInitialZeroState::Allow, true, 2, 2); hex_escape.has_value()) {
|
||||
|
@ -1797,8 +1802,17 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
|
|||
}
|
||||
|
||||
// '\0'
|
||||
if (try_skip("0"))
|
||||
return { CharClassRangeElement { .code_point = 0, .is_character_class = false } };
|
||||
if (try_skip("0")) {
|
||||
if (!lookahead_any(s_decimal_characters))
|
||||
return { CharClassRangeElement { .code_point = 0, .is_character_class = false } };
|
||||
back();
|
||||
}
|
||||
|
||||
// LegacyOctalEscapeSequence
|
||||
if (m_should_use_browser_extended_grammar && !unicode) {
|
||||
if (auto escape = parse_legacy_octal_escape(); escape.has_value())
|
||||
return { CharClassRangeElement { .code_point = escape.value(), .is_character_class = false } };
|
||||
}
|
||||
|
||||
// HexEscape
|
||||
if (try_skip("x")) {
|
||||
|
|
Loading…
Add table
Reference in a new issue