mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-01-23 09:46:04 -05:00
LibRegex: Allow Unicode escape sequences in capture group names
Unfortunately, this requires a slight divergence in the way the capture group names are stored. Previously, the generated byte code would simply store a view into the regex pattern string, so no string copying was required. Now, the escape sequences are decoded into a new string, and all parsed capture group names are stored in a vector in the parser result structure. The byte code then stores a view into the corresponding string in that vector.
This commit is contained in:
parent
6131c0485e
commit
4f2cbe119b
Notes:
sideshowbarker
2024-07-18 05:28:18 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/4f2cbe119b6 Pull-request: https://github.com/SerenityOS/serenity/pull/9499 Reviewed-by: https://github.com/alimpfard Reviewed-by: https://github.com/davidot ✅
4 changed files with 38 additions and 9 deletions
|
@ -698,6 +698,9 @@ TEST_CASE(ECMA262_unicode_match)
|
|||
{ "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true, ECMAScriptFlags::Unicode },
|
||||
{ "(?<=.{3})f"sv, "abcdef"sv, true, ECMAScriptFlags::Unicode },
|
||||
{ "(?<=.{3})f"sv, "abc😀ef"sv, true, ECMAScriptFlags::Unicode },
|
||||
{ "(?<𝓑𝓻𝓸𝔀𝓷>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
|
||||
{ "(?<\\u{1d4d1}\\u{1d4fb}\\u{1d4f8}\\u{1d500}\\u{1d4f7}>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
|
||||
{ "(?<\\ud835\\udcd1\\ud835\\udcfb\\ud835\\udcf8\\ud835\\udd00\\ud835\\udcf7>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode },
|
||||
};
|
||||
|
||||
for (auto& test : tests) {
|
||||
|
|
|
@ -55,3 +55,18 @@ test("UTF-16", () => {
|
|||
expect("😀😀".match(/\ud83d/g)).toEqual(["\ud83d", "\ud83d"]);
|
||||
expect("😀😀".match(/\ude00/g)).toEqual(["\ude00", "\ude00"]);
|
||||
});
|
||||
|
||||
test("escaped code points", () => {
|
||||
var string = "The quick brown fox jumped over the lazy dog's back";
|
||||
|
||||
var re = /(?<𝓑𝓻𝓸𝔀𝓷>brown)/u;
|
||||
expect(string.match(re).groups.𝓑𝓻𝓸𝔀𝓷).toBe("brown");
|
||||
|
||||
re = /(?<\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}>brown)/u;
|
||||
expect(string.match(re).groups.𝓑𝓻𝓸𝔀𝓷).toBe("brown");
|
||||
expect(string.match(re).groups.𝓑𝓻𝓸𝔀𝓷).toBe("brown");
|
||||
|
||||
re = /(?<\ud835\udcd1\ud835\udcfb\ud835\udcf8\ud835\udd00\ud835\udcf7>brown)/u;
|
||||
expect(string.match(re).groups.𝓑𝓻𝓸𝔀𝓷).toBe("brown");
|
||||
expect(string.match(re).groups.𝓑𝓻𝓸𝔀𝓷).toBe("brown");
|
||||
});
|
||||
|
|
|
@ -195,7 +195,8 @@ Parser::Result Parser::parse(Optional<AllOptions> regex_options)
|
|||
move(m_parser_state.named_capture_groups_count),
|
||||
move(m_parser_state.match_length_minimum),
|
||||
move(m_parser_state.error),
|
||||
move(m_parser_state.error_token)
|
||||
move(m_parser_state.error_token),
|
||||
m_parser_state.named_capture_groups.keys()
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -2009,21 +2010,30 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool
|
|||
[](Empty&) -> bool { VERIFY_NOT_REACHED(); });
|
||||
}
|
||||
|
||||
StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
|
||||
FlyString ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
|
||||
{
|
||||
if (take_starting_angle_bracket && !consume("<"))
|
||||
return {};
|
||||
|
||||
auto start_token = m_parser_state.current_token;
|
||||
size_t offset = 0;
|
||||
while (match(TokenType::Char) || match(TokenType::Dollar)) {
|
||||
StringBuilder builder;
|
||||
while (match(TokenType::Char) || match(TokenType::Dollar) || match(TokenType::LeftCurly) || match(TokenType::RightCurly)) {
|
||||
auto c = m_parser_state.current_token.value();
|
||||
if (c == ">")
|
||||
break;
|
||||
offset += consume().value().length();
|
||||
|
||||
if (try_skip("\\u"sv)) {
|
||||
if (auto code_point = consume_escaped_code_point(true); code_point.has_value()) {
|
||||
builder.append_code_point(*code_point);
|
||||
} else {
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
return {};
|
||||
}
|
||||
} else {
|
||||
builder.append(consume().value());
|
||||
}
|
||||
}
|
||||
|
||||
StringView name { start_token.value().characters_without_null_termination(), offset };
|
||||
FlyString name = builder.build();
|
||||
if (!consume(">") || name.is_empty())
|
||||
set_error(Error::InvalidNameForCaptureGroup);
|
||||
|
||||
|
@ -2146,7 +2156,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
|
|||
|
||||
stack.insert_bytecode_group_capture_left(group_index);
|
||||
stack.extend(move(capture_group_bytecode));
|
||||
stack.insert_bytecode_group_capture_right(group_index, name);
|
||||
stack.insert_bytecode_group_capture_right(group_index, name.view());
|
||||
|
||||
match_length_minimum += length;
|
||||
|
||||
|
|
|
@ -53,6 +53,7 @@ public:
|
|||
size_t match_length_minimum;
|
||||
Error error;
|
||||
Token error_token;
|
||||
Vector<FlyString> capture_groups;
|
||||
};
|
||||
|
||||
explicit Parser(Lexer& lexer)
|
||||
|
@ -218,7 +219,7 @@ private:
|
|||
};
|
||||
StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
|
||||
Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1);
|
||||
StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
|
||||
FlyString read_capture_group_specifier(bool take_starting_angle_bracket = false);
|
||||
|
||||
struct Script {
|
||||
Unicode::Script script {};
|
||||
|
|
Loading…
Add table
Reference in a new issue