mirror of
https://github.com/SerenityOS/serenity.git
synced 2025-01-24 10:22:05 -05:00
LibRegex: Support property escapes of Unicode General Categories
This changes LibRegex to parse a property escape into a Variant that holds either a Unicode Property or a Unicode General Category value. A new bytecode comparison type (CharacterCompareType::GeneralCategory) is added to OpCode_Compare to perform matching based on General Category values.
This commit is contained in:
parent
5de6d3dd90
commit
1e10d6d7ce
5 changed files with 77 additions and 19 deletions
|
@ -661,6 +661,12 @@ TEST_CASE(ECMA262_property_match)
|
||||||
{ "\\p{ASCII_Hex_Digit}", "x", false, ECMAScriptFlags::Unicode },
|
{ "\\p{ASCII_Hex_Digit}", "x", false, ECMAScriptFlags::Unicode },
|
||||||
{ "\\p{Any}", "\xcd\xb8", true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
|
{ "\\p{Any}", "\xcd\xb8", true, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
|
||||||
{ "\\p{Assigned}", "\xcd\xb8", false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
|
{ "\\p{Assigned}", "\xcd\xb8", false, ECMAScriptFlags::Unicode }, // U+0378, which is an unassigned code point.
|
||||||
|
{ "\\p{Lu}", "a", false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "\\p{Lu}", "A", true, ECMAScriptFlags::Unicode },
|
||||||
|
{ "\\p{Lu}", "9", false, ECMAScriptFlags::Unicode },
|
||||||
|
{ "\\p{Cased_Letter}", "a", true, ECMAScriptFlags::Unicode },
|
||||||
|
{ "\\p{Cased_Letter}", "A", true, ECMAScriptFlags::Unicode },
|
||||||
|
{ "\\p{Cased_Letter}", "9", false, ECMAScriptFlags::Unicode },
|
||||||
};
|
};
|
||||||
|
|
||||||
for (auto& test : tests) {
|
for (auto& test : tests) {
|
||||||
|
|
|
@ -537,6 +537,10 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||||
auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++));
|
auto property = static_cast<Unicode::Property>(m_bytecode->at(offset++));
|
||||||
compare_property(input, state, property, current_inversion_state(), inverse_matched);
|
compare_property(input, state, property, current_inversion_state(), inverse_matched);
|
||||||
|
|
||||||
|
} else if (compare_type == CharacterCompareType::GeneralCategory) {
|
||||||
|
auto general_category = static_cast<Unicode::GeneralCategory>(m_bytecode->at(offset++));
|
||||||
|
compare_general_category(input, state, general_category, current_inversion_state(), inverse_matched);
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
warnln("Undefined comparison: {}", (int)compare_type);
|
warnln("Undefined comparison: {}", (int)compare_type);
|
||||||
VERIFY_NOT_REACHED();
|
VERIFY_NOT_REACHED();
|
||||||
|
@ -742,6 +746,22 @@ ALWAYS_INLINE void OpCode_Compare::compare_property(MatchInput const& input, Mat
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Tests the value at the current string position against a Unicode General Category.
//
// - At the end of the input (string_position == view length) nothing is changed.
// - If the value does not have the category, nothing is changed.
// - If it does and `inverse` is set, `inverse_matched` is set to true and the
//   position is left untouched.
// - If it does and `inverse` is not set, the position advances by one.
ALWAYS_INLINE void OpCode_Compare::compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched)
{
    // Nothing left to compare against.
    if (state.string_position == input.view.length())
        return;

    u32 const current_code_point = input.view[state.string_position];

    // A non-matching category leaves both the position and the inversion flag alone.
    if (!Unicode::code_point_has_general_category(current_code_point, general_category))
        return;

    if (inverse) {
        // Under inversion a hit is only recorded, never consumed.
        inverse_matched = true;
        return;
    }

    ++state.string_position;
}
|
||||||
|
|
||||||
String const OpCode_Compare::arguments_string() const
|
String const OpCode_Compare::arguments_string() const
|
||||||
{
|
{
|
||||||
return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
|
return String::formatted("argc={}, args={} ", arguments_count(), arguments_size());
|
||||||
|
|
|
@ -67,6 +67,7 @@ enum class OpCodeId : ByteCodeValueType {
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(Reference) \
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(NamedReference) \
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(Property) \
|
||||||
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(GeneralCategory) \
|
||||||
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
|
__ENUMERATE_CHARACTER_COMPARE_TYPE(RangeExpressionDummy)
|
||||||
|
|
||||||
enum class CharacterCompareType : ByteCodeValueType {
|
enum class CharacterCompareType : ByteCodeValueType {
|
||||||
|
@ -725,6 +726,7 @@ private:
|
||||||
ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
|
ALWAYS_INLINE static void compare_character_class(MatchInput const& input, MatchState& state, CharClass character_class, u32 ch, bool inverse, bool& inverse_matched);
|
||||||
ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
|
ALWAYS_INLINE static void compare_character_range(MatchInput const& input, MatchState& state, u32 from, u32 to, u32 ch, bool inverse, bool& inverse_matched);
|
||||||
ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
|
ALWAYS_INLINE static void compare_property(MatchInput const& input, MatchState& state, Unicode::Property property, bool inverse, bool& inverse_matched);
|
||||||
|
ALWAYS_INLINE static void compare_general_category(MatchInput const& input, MatchState& state, Unicode::GeneralCategory general_category, bool inverse, bool& inverse_matched);
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
|
|
|
@ -1542,13 +1542,19 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
|
||||||
}
|
}
|
||||||
|
|
||||||
if (unicode) {
|
if (unicode) {
|
||||||
Unicode::Property property {};
|
PropertyEscape property {};
|
||||||
bool negated = false;
|
bool negated = false;
|
||||||
|
|
||||||
if (parse_unicode_property_escape(property, negated)) {
|
if (parse_unicode_property_escape(property, negated)) {
|
||||||
if (negated)
|
if (negated)
|
||||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Inverse, 0 } });
|
stack.insert_bytecode_compare_values({ { CharacterCompareType::Inverse, 0 } });
|
||||||
stack.insert_bytecode_compare_values({ { CharacterCompareType::Property, (ByteCodeValueType)(property) } });
|
property.visit(
|
||||||
|
[&](Unicode::Property property) {
|
||||||
|
stack.insert_bytecode_compare_values({ { CharacterCompareType::Property, (ByteCodeValueType)(property) } });
|
||||||
|
},
|
||||||
|
[&](Unicode::GeneralCategory general_category) {
|
||||||
|
stack.insert_bytecode_compare_values({ { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(general_category) } });
|
||||||
|
});
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1695,11 +1701,13 @@ struct CharClassRangeElement {
|
||||||
CharClass character_class;
|
CharClass character_class;
|
||||||
u32 code_point { 0 };
|
u32 code_point { 0 };
|
||||||
Unicode::Property property;
|
Unicode::Property property;
|
||||||
|
Unicode::GeneralCategory general_category;
|
||||||
};
|
};
|
||||||
|
|
||||||
bool is_negated { false };
|
bool is_negated { false };
|
||||||
bool is_character_class { false };
|
bool is_character_class { false };
|
||||||
bool is_property_escape { false };
|
bool is_property { false };
|
||||||
|
bool is_general_category { false };
|
||||||
};
|
};
|
||||||
|
|
||||||
bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
|
bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>& ranges, bool unicode)
|
||||||
|
@ -1784,10 +1792,17 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
|
||||||
if (try_skip("-"))
|
if (try_skip("-"))
|
||||||
return { CharClassRangeElement { .code_point = '-', .is_character_class = false } };
|
return { CharClassRangeElement { .code_point = '-', .is_character_class = false } };
|
||||||
|
|
||||||
Unicode::Property property {};
|
PropertyEscape property {};
|
||||||
bool negated = false;
|
bool negated = false;
|
||||||
if (parse_unicode_property_escape(property, negated))
|
if (parse_unicode_property_escape(property, negated)) {
|
||||||
return { CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property_escape = true } };
|
return property.visit(
|
||||||
|
[&](Unicode::Property property) {
|
||||||
|
return CharClassRangeElement { .property = property, .is_negated = negated, .is_character_class = true, .is_property = true };
|
||||||
|
},
|
||||||
|
[&](Unicode::GeneralCategory general_category) {
|
||||||
|
return CharClassRangeElement { .general_category = general_category, .is_negated = negated, .is_character_class = true, .is_general_category = true };
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (try_skip("d"))
|
if (try_skip("d"))
|
||||||
|
@ -1828,8 +1843,11 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
|
||||||
if (atom.is_character_class) {
|
if (atom.is_character_class) {
|
||||||
if (atom.is_negated)
|
if (atom.is_negated)
|
||||||
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
|
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 });
|
||||||
if (atom.is_property_escape)
|
|
||||||
|
if (atom.is_property)
|
||||||
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) });
|
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) });
|
||||||
|
else if (atom.is_general_category)
|
||||||
|
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) });
|
||||||
else
|
else
|
||||||
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
|
ranges.empend(CompareTypeAndValuePair { CharacterCompareType::CharClass, (ByteCodeValueType)atom.character_class });
|
||||||
} else {
|
} else {
|
||||||
|
@ -1901,7 +1919,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ECMA262Parser::parse_unicode_property_escape(Unicode::Property& property, bool& negated)
|
bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool& negated)
|
||||||
{
|
{
|
||||||
negated = false;
|
negated = false;
|
||||||
|
|
||||||
|
@ -1918,13 +1936,19 @@ bool ECMA262Parser::parse_unicode_property_escape(Unicode::Property& property, b
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!Unicode::is_ecma262_property(*parsed_property)) {
|
property = move(*parsed_property);
|
||||||
set_error(Error::InvalidNameForProperty);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
property = *parsed_property;
|
return property.visit(
|
||||||
return true;
|
[this](Unicode::Property property) {
|
||||||
|
if (!Unicode::is_ecma262_property(property)) {
|
||||||
|
set_error(Error::InvalidNameForProperty);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
},
|
||||||
|
[](Unicode::GeneralCategory) {
|
||||||
|
return true;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
|
StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket)
|
||||||
|
@ -1948,7 +1972,7 @@ StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_
|
||||||
return name;
|
return name;
|
||||||
}
|
}
|
||||||
|
|
||||||
Optional<Unicode::Property> ECMA262Parser::read_unicode_property_escape()
|
Optional<ECMA262Parser::PropertyEscape> ECMA262Parser::read_unicode_property_escape()
|
||||||
{
|
{
|
||||||
consume(TokenType::LeftCurly, Error::InvalidPattern);
|
consume(TokenType::LeftCurly, Error::InvalidPattern);
|
||||||
|
|
||||||
|
@ -1960,10 +1984,14 @@ Optional<Unicode::Property> ECMA262Parser::read_unicode_property_escape()
|
||||||
offset += consume().value().length();
|
offset += consume().value().length();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
StringView property_name { start_token.value().characters_without_null_termination(), offset };
|
||||||
consume(TokenType::RightCurly, Error::InvalidPattern);
|
consume(TokenType::RightCurly, Error::InvalidPattern);
|
||||||
|
|
||||||
StringView property_name { start_token.value().characters_without_null_termination(), offset };
|
if (auto property = Unicode::property_from_string(property_name); property.has_value())
|
||||||
return Unicode::property_from_string(property_name);
|
return { *property };
|
||||||
|
if (auto general_category = Unicode::general_category_from_string(property_name); general_category.has_value())
|
||||||
|
return { *general_category };
|
||||||
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
|
bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_minimum, bool unicode, bool named)
|
||||||
|
|
|
@ -213,7 +213,9 @@ private:
|
||||||
StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
|
StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
|
||||||
Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
|
Optional<unsigned> read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1);
|
||||||
StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
|
StringView read_capture_group_specifier(bool take_starting_angle_bracket = false);
|
||||||
Optional<Unicode::Property> read_unicode_property_escape();
|
|
||||||
|
using PropertyEscape = Variant<Unicode::Property, Unicode::GeneralCategory>;
|
||||||
|
Optional<PropertyEscape> read_unicode_property_escape();
|
||||||
|
|
||||||
bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
|
bool parse_pattern(ByteCode&, size_t&, bool unicode, bool named);
|
||||||
bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
|
bool parse_disjunction(ByteCode&, size_t&, bool unicode, bool named);
|
||||||
|
@ -227,7 +229,7 @@ private:
|
||||||
bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
|
bool parse_capture_group(ByteCode&, size_t&, bool unicode, bool named);
|
||||||
Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
|
Optional<CharClass> parse_character_class_escape(bool& out_inverse, bool expect_backslash = false);
|
||||||
bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
|
bool parse_nonempty_class_ranges(Vector<CompareTypeAndValuePair>&, bool unicode);
|
||||||
bool parse_unicode_property_escape(Unicode::Property& property, bool& negated);
|
bool parse_unicode_property_escape(PropertyEscape& property, bool& negated);
|
||||||
|
|
||||||
// Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
|
// Used only by B.1.4, Regular Expression Patterns (Extended for use in browsers)
|
||||||
bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
|
bool parse_quantifiable_assertion(ByteCode&, size_t&, bool named);
|
||||||
|
|
Loading…
Add table
Reference in a new issue