LibJS: Fix some small remaining issues with parsing unicode escapes

Added a test to ensure the behavior stays the same. We now throw a specific error on direct usage of an escaped keyword, to make the problem clearer to the user.
2025-01-24 18:32:28 -05:00 · 2021-08-21 11:27:20 +02:00 · 2021-08-21 11:27:20 +02:00 · 7bcffd1b6a
commit 7bcffd1b6a
parent b012170d69
6 changed files with 125 additions and 9 deletions
--- a/Userland/Libraries/LibJS/Lexer.cpp
+++ b/Userland/Libraries/LibJS/Lexer.cpp
@ -372,11 +372,14 @@ bool Lexer::is_whitespace() const
    return false;
 }

-Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
+// UnicodeEscapeSequence :: https://tc39.es/ecma262/#prod-UnicodeEscapeSequence
+//          u Hex4Digits
+//          u{ CodePoint }
+Optional<u32> Lexer::is_identifier_unicode_escape(size_t& identifier_length) const
 {
    GenericLexer lexer(source().substring_view(m_position - 1));

-    if (auto code_point_or_error = lexer.consume_escaped_code_point(); !code_point_or_error.is_error()) {
+    if (auto code_point_or_error = lexer.consume_escaped_code_point(false); !code_point_or_error.is_error()) {
        identifier_length = lexer.tell();
        return code_point_or_error.value();
    }
@ -384,13 +387,18 @@ Optional<u32> Lexer::is_unicode_escape(size_t& identifier_length) const
    return {};
 }

+// IdentifierStart :: https://tc39.es/ecma262/#prod-IdentifierStart
+//          UnicodeIDStart
+//          $
+//          _
+//          \ UnicodeEscapeSequence
 Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
 {
    u32 code_point = current_code_point();
    identifier_length = 1;

    if (code_point == '\\') {
-        if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
+        if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value())
            code_point = *maybe_code_point;
        else
            return {};
@ -406,13 +414,19 @@ Optional<u32> Lexer::is_identifier_start(size_t& identifier_length) const
    return {};
 }

+// IdentifierPart :: https://tc39.es/ecma262/#prod-IdentifierPart
+//          UnicodeIDContinue
+//          $
+//          \ UnicodeEscapeSequence
+//          <ZWNJ>
+//          <ZWJ>
 Optional<u32> Lexer::is_identifier_middle(size_t& identifier_length) const
 {
    u32 code_point = current_code_point();
    identifier_length = 1;

    if (code_point == '\\') {
-        if (auto maybe_code_point = is_unicode_escape(identifier_length); maybe_code_point.has_value())
+        if (auto maybe_code_point = is_identifier_unicode_escape(identifier_length); maybe_code_point.has_value())
            code_point = *maybe_code_point;
        else
            return {};
@ -574,6 +588,7 @@ Token Lexer::next()
                token_type = TokenType::TemplateLiteralString;
        }
    } else if (auto code_point = is_identifier_start(identifier_length); code_point.has_value()) {
+        bool has_escaped_character = false;
        // identifier or keyword
        StringBuilder builder;
        do {
@ -581,6 +596,8 @@ Token Lexer::next()
            for (size_t i = 0; i < identifier_length; ++i)
                consume();

+            has_escaped_character |= identifier_length > 1;
+
            code_point = is_identifier_middle(identifier_length);
        } while (code_point.has_value());

@ -592,7 +609,7 @@ Token Lexer::next()
        if (it == s_keywords.end())
            token_type = TokenType::Identifier;
        else
-            token_type = it->value;
+            token_type = has_escaped_character ? TokenType::EscapedKeyword : it->value;
    } else if (is_numeric_literal_start()) {
        token_type = TokenType::NumericLiteral;
        bool is_invalid_numeric_literal = false;
--- a/Userland/Libraries/LibJS/Lexer.h
+++ b/Userland/Libraries/LibJS/Lexer.h
@ -41,7 +41,7 @@ private:
    bool is_eof() const;
    bool is_line_terminator() const;
    bool is_whitespace() const;
-    Optional<u32> is_unicode_escape(size_t& identifier_length) const;
+    Optional<u32> is_identifier_unicode_escape(size_t& identifier_length) const;
    Optional<u32> is_identifier_start(size_t& identifier_length) const;
    Optional<u32> is_identifier_middle(size_t& identifier_length) const;
    bool is_line_comment_start(bool line_has_token_yet) const;
--- a/Userland/Libraries/LibJS/Parser.cpp
+++ b/Userland/Libraries/LibJS/Parser.cpp
@ -404,6 +404,11 @@ NonnullRefPtr<Statement> Parser::parse_statement(AllowLabelledFunction allow_lab
        m_state.current_token = m_state.lexer.force_slash_as_regex();
        [[fallthrough]];
    default:
+        if (m_state.current_token.type() == TokenType::EscapedKeyword
+            && (m_state.strict_mode
+                || (m_state.current_token.value() != "yield"sv && m_state.current_token.value() != "let"sv)))
+            syntax_error("Keyword must not contain escaped characters");
+
        if (match_identifier_name()) {
            auto result = try_parse_labelled_statement(allow_labelled_function);
            if (!result.is_null())
@ -545,7 +550,7 @@ RefPtr<Statement> Parser::try_parse_labelled_statement(AllowLabelledFunction all
        load_state();
    };

-    if (match(TokenType::Yield) && (m_state.strict_mode || m_state.in_generator_function_context)) {
+    if (m_state.current_token.value() == "yield"sv && (m_state.strict_mode || m_state.in_generator_function_context)) {
        syntax_error("'yield' label not allowed in this context");
        return {};
    }
@ -604,7 +609,8 @@ RefPtr<MetaProperty> Parser::try_parse_new_target_expression()
    consume();
    if (!match(TokenType::Identifier))
        return {};
-    if (consume().value() != "target")
+    // The string 'target' cannot contain escapes, so we check the original value.
+    if (consume().original_value() != "target"sv)
        return {};

    state_rollback_guard.disarm();
@ -847,6 +853,9 @@ Parser::PrimaryExpressionParseResult Parser::parse_primary_expression()
        if (!m_state.allow_super_property_lookup)
            syntax_error("'super' keyword unexpected here");
        return { create_ast_node<SuperExpression>({ m_state.current_token.filename(), rule_start.position(), position() }) };
+    case TokenType::EscapedKeyword:
+        syntax_error("Keyword must not contain escaped characters");
+        [[fallthrough]];
    case TokenType::Identifier: {
    read_as_identifier:;
        if (!try_parse_arrow_function_expression_failed_at_position(position())) {
@ -2800,6 +2809,14 @@ bool Parser::match_variable_declaration()

 bool Parser::match_identifier() const
 {
+    if (m_state.current_token.type() == TokenType::EscapedKeyword) {
+        if (m_state.current_token.value() == "let"sv)
+            return !m_state.strict_mode;
+        if (m_state.current_token.value() == "yield"sv)
+            return !m_state.strict_mode && !m_state.in_generator_function_context;
+        return true;
+    }
+
    return m_state.current_token.type() == TokenType::Identifier
        || (m_state.current_token.type() == TokenType::Let && !m_state.strict_mode)
        || (m_state.current_token.type() == TokenType::Yield && !m_state.in_generator_function_context && !m_state.strict_mode); // See note in Parser::parse_identifier().
@ -2859,6 +2876,9 @@ Token Parser::consume_identifier()
    if (match(TokenType::Identifier))
        return consume(TokenType::Identifier);

+    if (match(TokenType::EscapedKeyword))
+        return consume(TokenType::EscapedKeyword);
+
    // Note that 'let' is not a reserved keyword, but our lexer considers it such
    // As it's pretty nice to have that (for syntax highlighting and such), we'll
    // special-case it here instead.
@ -2884,6 +2904,16 @@ Token Parser::consume_identifier_reference()
    if (match(TokenType::Identifier))
        return consume(TokenType::Identifier);

+    if (match(TokenType::EscapedKeyword)) {
+        auto name = m_state.current_token.value();
+        if (name == "await"sv)
+            syntax_error("Identifier reference may not be 'await'");
+        else if (m_state.strict_mode && (name == "let"sv || name == "yield"sv))
+            syntax_error(String::formatted("'{}' is not allowed as an identifier in strict mode", name));
+
+        return consume();
+    }
+
    // See note in Parser::parse_identifier().
    if (match(TokenType::Let)) {
        if (m_state.strict_mode)
--- a/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
+++ b/Userland/Libraries/LibJS/Tests/unicode-identifier-escape.js
@ -13,7 +13,74 @@ test("non-ascii escapes", () => {
    foo.𝓑𝓻𝓸𝔀𝓷 = 12389;

    expect(foo.𝓑𝓻𝓸𝔀𝓷).toBe(12389);
-    expect(foo.𝓑𝓻\ud835\udcf8𝔀𝓷).toBe(12389);
    expect(foo.𝓑𝓻\u{1d4f8}𝔀𝓷).toBe(12389);
    expect(foo.\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}).toBe(12389);
+
+    // An escaped UTF-16 surrogate pair is allowed in a string but not in an identifier.
+    expect("foo.𝓑𝓻\ud835\udcf8𝔀𝓷").toEval();
+    expect("foo.𝓑𝓻\\ud835\\udcf8𝔀𝓷").not.toEval();
+});
+
+describe("escaped keywords", () => {
+    // Backslashes inside the source strings below must be doubled; otherwise the string
+    // literal decodes the escape itself (strings are more lenient) before it is parsed.
+    test("keywords cannot be used in an escaped form", () => {
+        expect("\\u{69}\\u{66}(true) throw 'Should fail'").not.toEval();
+        expect("wh\\u{69}le(true) throw 'Should fail'").not.toEval();
+
+        expect("l\\u{65}t a = 3;").not.toEval();
+        expect("function *G(){ yiel\\u0064 3; }").not.toEval();
+    });
+
+    test("escaped keywords cannot be used as standalone variables", () => {
+        expect("var fu\\u{6e}ction = 4").not.toEval();
+        expect("var \\u0077ith = 4").not.toEval();
+    });
+
+    test("'yield' and 'let' can be escaped as variables", () => {
+        var l\u{65}t = 3;
+        var yi\u0065ld = 5;
+        expect(let).toBe(3);
+        expect(yield).toBe(5);
+    });
+
+    test("'let' cannot be used in a lexical declaration but 'yield' can", () => {
+        expect("const l\\u{65}t = 3;").not.toEval();
+
+        const yi\u0065ld = 5;
+        expect(yield).toBe(5);
+    });
+
+    test("escaped 'yield' and 'let' variables are not allowed in strict mode", () => {
+        expect("function f() { 'use strict'; var l\\u{65}t = 3; }").not.toEval();
+        expect("function g() { 'use strict'; var yi\\u0065ld = 5; }").not.toEval();
+    });
+
+    test("cannot use escaped 'yield' variable or label in generator context", () => {
+        expect("function *g() { var yi\\u0065ld = 5; }").not.toEval();
+        expect("function *g() { yi\\u0065ld: 5; }").not.toEval();
+    });
+
+    test("can use escaped 'let' variable and label in generator context", () => {
+        expect("function *i() { var \\u{6c}et = 6; }").toEval();
+        expect("function *j() { \\u{6c}et: 6; }").toEval();
+    });
+
+    test("can use keywords in some contexts", () => {
+        var obj = {
+            \u{69}\u{66}: 3,
+            wh\u{69}le() {
+                return 4;
+            },
+            ca\u0073e: "case",
+            get true() {
+                return false;
+            },
+        };
+
+        expect(obj.\u{69}f).toBe(3);
+        expect(obj.whi\u{6c}e()).toBe(4);
+        expect(obj.\u{63}ase).toBe("case");
+        expect(obj.\u0074r\u{0000075}e).toBeFalse();
+    });
 });
--- a/Userland/Libraries/LibJS/Token.cpp
+++ b/Userland/Libraries/LibJS/Token.cpp
@ -204,6 +204,7 @@ bool Token::is_identifier_name() const
    // The standard defines this reversed: Identifiers are IdentifierNames except reserved words
    // https://tc39.es/ecma262/#prod-Identifier
    return m_type == TokenType::Identifier
+        || m_type == TokenType::EscapedKeyword
        || m_type == TokenType::Await
        || m_type == TokenType::BoolLiteral
        || m_type == TokenType::Break
--- a/Userland/Libraries/LibJS/Token.h
+++ b/Userland/Libraries/LibJS/Token.h
@ -74,6 +74,7 @@ constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
    __ENUMERATE_JS_TOKEN(Equals, Operator)                      \
    __ENUMERATE_JS_TOKEN(EqualsEquals, Operator)                \
    __ENUMERATE_JS_TOKEN(EqualsEqualsEquals, Operator)          \
+    __ENUMERATE_JS_TOKEN(EscapedKeyword, Identifier)            \
    __ENUMERATE_JS_TOKEN(ExclamationMark, Operator)             \
    __ENUMERATE_JS_TOKEN(ExclamationMarkEquals, Operator)       \
    __ENUMERATE_JS_TOKEN(ExclamationMarkEqualsEquals, Operator) \