LibJS: Fix some small remaining issues with parsing unicode escapes

Added a test to ensure the behavior stays the same.
We now throw a specific error on direct usage of an escaped keyword,
to make the failure clearer to the user.
This commit is contained in:
davidot 2021-08-21 11:27:20 +02:00 committed by Linus Groh
parent b012170d69
commit 7bcffd1b6a
6 changed files with 125 additions and 9 deletions

View file

@ -372,11 +372,14 @@ bool Lexer::is_whitespace() const
return false;
}
Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
// UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence
// u Hex4Digits
// u{ CodePoint }
Optional<u32> Lexer::is_identifier_unicode_escape(size_t& identifier_length) const
{
GenericLexer lexer(source().substring_view(m_position - 1));
if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
if (auto code_point_or_error = lexer.consume_escaped_code_point(false); !code_point_or_error.is_error()) {
identifier_length = lexer.tell();
return code_point_or_error.value();
}
@ -384,13 +387,18 @@ Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
return {};
}
// IdentifierStart :: https://tc39.es/ecma262/#prod-IdentifierStart
// UnicodeIDStart
// $
// _
// \ UnicodeEscapeSequence
Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
{
u32 code_point = current_code_point();
identifier_length = 1;
if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
@ -406,13 +414,19 @@ Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
return {};
}
// IdentifierPart :: https://tc39.es/ecma262/#prod-IdentifierPart
// UnicodeIDContinue
// $
// \ UnicodeEscapeSequence
// <ZWNJ>
// <ZWJ>
Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
{
u32 code_point = current_code_point();
identifier_length = 1;
if (code_point == '\\') {
if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value())
code_point = *maybe_code_point;
else
return {};
@ -574,6 +588,7 @@ Token Lexer::next()
token_type = TokenType::TemplateLiteralString;
}
} else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
bool has_escaped_character = false;
// identifier or keyword
StringBuilder builder;
do {
@ -581,6 +596,8 @@ Token Lexer::next()
for (size_t i = 0; i < identifier_length; ++i)
consume();
has_escaped_character |= identifier_length > 1;
code_point = is_identifier_middle(identifier_length);
} while (code_point.has_value());
@ -592,7 +609,7 @@ Token Lexer::next()
if (it == s_keywords.end())
token_type = TokenType::Identifier;
else
token_type = it->value;
token_type = has_escaped_character ? TokenType::EscapedKeyword : it->value;
} else if (is_numeric_literal_start()) {
token_type = TokenType::NumericLiteral;
bool is_invalid_numeric_literal = false;

View file

@ -41,7 +41,7 @@ private:
bool is_eof() const;
bool is_line_terminator() const;
bool is_whitespace() const;
Optional<u32> is_unicode_escape(size_t& identifier_length) const;
Optional<u32> is_identifier_unicode_escape(size_t& identifier_length) const;
Optional<u32> is_identifier_start(size_t& identifier_length) const;
Optional<u32> is_identifier_middle(size_t& identifier_length) const;
bool is_line_comment_start(bool line_has_token_yet) const;

View file

@ -404,6 +404,11 @@ NonnullRefPtr<Statement> Parser::parse_statement(AllowLabelledFunction allow_lab
m_state.current_token = m_state.lexer.force_slash_as_regex();
[[fallthrough]];
default:
if (m_state.current_token.type() == TokenType::EscapedKeyword
&& (m_state.strict_mode
|| (m_state.current_token.value() != "yield"sv && m_state.current_token.value() != "let"sv)))
syntax_error("Keyword must not contain escaped characters");
if (match_identifier_name()) {
auto result = try_parse_labelled_statement(allow_labelled_function);
if (!result.is_null())
@ -545,7 +550,7 @@ RefPtr<Statement> Parser::try_parse_labelled_statement(AllowLabelledFunction all
load_state();
};
if (match(TokenType::Yield) && (m_state.strict_mode || m_state.in_generator_function_context)) {
if (m_state.current_token.value() == "yield"sv && (m_state.strict_mode || m_state.in_generator_function_context)) {
syntax_error("'yield' label not allowed in this context");
return {};
}
@ -604,7 +609,8 @@ RefPtr<MetaProperty> Parser::try_parse_new_target_expression()
consume();
if (!match(TokenType::Identifier))
return {};
if (consume().value() != "target")
// The string 'target' cannot have escapes so we check original value.
if (consume().original_value() != "target"sv)
return {};
state_rollback_guard.disarm();
@ -847,6 +853,9 @@ Parser::PrimaryExpressionParseResult Parser::parse_primary_expression()
if (!m_state.allow_super_property_lookup)
syntax_error("'super' keyword unexpected here");
return { create_ast_node<SuperExpression>({ m_state.current_token.filename(), rule_start.position(), position() }) };
case TokenType::EscapedKeyword:
syntax_error("Keyword must not contain escaped characters");
[[fallthrough]];
case TokenType::Identifier: {
read_as_identifier:;
if (!try_parse_arrow_function_expression_failed_at_position(position())) {
@ -2800,6 +2809,14 @@ bool Parser::match_variable_declaration()
bool Parser::match_identifier() const
{
if (m_state.current_token.type() == TokenType::EscapedKeyword) {
if (m_state.current_token.value() == "let"sv)
return !m_state.strict_mode;
if (m_state.current_token.value() == "yield"sv)
return !m_state.strict_mode && !m_state.in_generator_function_context;
return true;
}
return m_state.current_token.type() == TokenType::Identifier
|| (m_state.current_token.type() == TokenType::Let && !m_state.strict_mode)
|| (m_state.current_token.type() == TokenType::Yield && !m_state.in_generator_function_context && !m_state.strict_mode); // See note in Parser::parse_identifier().
@ -2859,6 +2876,9 @@ Token Parser::consume_identifier()
if (match(TokenType::Identifier))
return consume(TokenType::Identifier);
if (match(TokenType::EscapedKeyword))
return consume(TokenType::EscapedKeyword);
// Note that 'let' is not a reserved keyword, but our lexer considers it such
// As it's pretty nice to have that (for syntax highlighting and such), we'll
// special-case it here instead.
@ -2884,6 +2904,16 @@ Token Parser::consume_identifier_reference()
if (match(TokenType::Identifier))
return consume(TokenType::Identifier);
if (match(TokenType::EscapedKeyword)) {
auto name = m_state.current_token.value();
if (name == "await"sv)
syntax_error("Identifier reference may not be 'await'");
else if (m_state.strict_mode && (name == "let"sv || name == "yield"sv))
syntax_error(String::formatted("'{}' is not allowed as an identifier in strict mode", name));
return consume();
}
// See note in Parser::parse_identifier().
if (match(TokenType::Let)) {
if (m_state.strict_mode)

View file

@ -13,7 +13,74 @@ test("non-ascii escapes", () => {
foo.𝓑𝓻𝓸𝔀𝓷 = 12389;
expect(foo.𝓑𝓻𝓸𝔀𝓷).toBe(12389);
expect(foo.𝓑𝓻\ud835\udcf8𝔀𝓷).toBe(12389);
expect(foo.𝓑𝓻\u{1d4f8}𝔀𝓷).toBe(12389);
expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389);
// A UTF-16 surrogate pair escape is allowed in a string but not in an identifier.
expect("foo.𝓑𝓻\ud835\udcf8𝔀𝓷").toEval();
expect("foo.𝓑𝓻\\ud835\\udcf8𝔀𝓷").not.toEval();
});
describe("escaped keywords", () => {
// We must double escape the slashes here else the strings already convert
// the escaped characters (and string is more lenient).
test("keywords cannot be used in an escaped form", () => {
expect("\\u{69}\\u{66}(true) throw 'Should fail'").not.toEval();
expect("wh\\u{69}le(true) throw 'Should fail'").not.toEval();
expect("l\\u{65}t a = 3;").not.toEval();
expect("function *G(){ yiel\\0064 3; }").not.toEval();
});
test("escaped keywords cannot be used as standalone variables", () => {
expect("var fu\\u{6e}ction = 4").not.toEval();
expect("var \\u0077ith = 4").not.toEval();
});
test("'yield' and 'let' can be escaped as variables", () => {
var l\u{65}t = 3;
var yi\u0065ld = 5;
expect(let).toBe(3);
expect(yield).toBe(5);
});
test("'let' cannot be used in a lexical declaration but 'yield' can", () => {
expect("const l\\u{65}t = 3;").not.toEval();
const yi\u0065ld = 5;
expect(yield).toBe(5);
});
test("escaped 'yield' and 'let' variables are not allowed in strict mode", () => {
expect("function f() { 'use strict'; var l\\u{65}t = 3; }").not.toEval();
expect("function g() { 'use strict'; var yi\u0065ld = 5; }").not.toEval();
});
test("cannot use escaped 'yield' variable or label in generator context", () => {
expect("function *g() { var yi\u0065ld = 5; }").not.toEval();
expect("function *g() { yi\u0065ld: 5; }").not.toEval();
});
test("can use escaped 'let' variable and label in generator context", () => {
expect("function *i() { var \\u{6c}et = 6; }").toEval();
expect("function *j() { \\u{6c}et: 6; }").toEval();
});
test("can use keywords in some contexts", () => {
var obj = {
\u{69}\u{66}: 3,
wh\u{69}le() {
return 4;
},
ca\u0073e: "case",
get true() {
return false;
},
};
expect(obj.\u{69}f).toBe(3);
expect(obj.whi\u{6c}e()).toBe(4);
expect(obj.\u{63}ase).toBe("case");
expect(obj.\u0074r\u{0000075}e).toBeFalse();
});
});

View file

@ -204,6 +204,7 @@ bool Token::is_identifier_name() const
// The standard defines this reversed: Identifiers are IdentifierNames except reserved words
// https://tc39.es/ecma262/#prod-Identifier
return m_type == TokenType::Identifier
|| m_type == TokenType::EscapedKeyword
|| m_type == TokenType::Await
|| m_type == TokenType::BoolLiteral
|| m_type == TokenType::Break

View file

@ -74,6 +74,7 @@ constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
__ENUMERATE_JS_TOKEN(Equals, Operator) \
__ENUMERATE_JS_TOKEN(EqualsEquals, Operator) \
__ENUMERATE_JS_TOKEN(EqualsEqualsEquals, Operator) \
__ENUMERATE_JS_TOKEN(EscapedKeyword, Identifier) \
__ENUMERATE_JS_TOKEN(ExclamationMark, Operator) \
__ENUMERATE_JS_TOKEN(ExclamationMarkEquals, Operator) \
__ENUMERATE_JS_TOKEN(ExclamationMarkEqualsEquals, Operator) \