From 9d0ce4df0f2e6981338688ecd6e00854929c8660 Mon Sep 17 00:00:00 2001 From: Andrew Kaster Date: Sun, 13 Oct 2024 17:47:29 -0600 Subject: [PATCH] LibWeb: Add support for parsing comments in the Swift HTML tokenizer --- Tests/LibWeb/TestHTMLTokenizerSwift.swift | 57 +++++ .../LibWeb/HTML/Parser/HTMLToken.swift | 22 ++ .../LibWeb/HTML/Parser/HTMLTokenizer.swift | 194 ++++++++++++++++++ 3 files changed, 273 insertions(+) diff --git a/Tests/LibWeb/TestHTMLTokenizerSwift.swift b/Tests/LibWeb/TestHTMLTokenizerSwift.swift index d641fee8935..e773023efcf 100644 --- a/Tests/LibWeb/TestHTMLTokenizerSwift.swift +++ b/Tests/LibWeb/TestHTMLTokenizerSwift.swift @@ -252,4 +252,61 @@ struct TestHTMLTokenizerSwift { let token3 = tokenizer.nextToken() #expect(token3?.type == .EndOfFile) } + + @Test func xmlDeclaration() { + guard let tokenizer = HTMLTokenizer(input: "") else { + Issue.record("Failed to create tokenizer for ''") + return + } + + let token = tokenizer.nextToken() + #expect(token?.type == .Comment(data: "?xml version=\"1.0\" encoding=\"UTF-8\"?")) + + let token2 = tokenizer.nextToken() + #expect(token2?.type == .EndOfFile) + } + + @Test func simpleComment() { + guard let tokenizer = HTMLTokenizer(input: "") else { + Issue.record("Failed to create tokenizer for ''") + return + } + + let token = tokenizer.nextToken() + #expect(token?.type == .Comment(data: " comment ")) + + let token2 = tokenizer.nextToken() + #expect(token2?.type == .EndOfFile) + } + + @Test func nestedComment() { + guard let tokenizer = HTMLTokenizer(input: " -->") else { + Issue.record("Failed to create tokenizer for ' -->'") + return + } + + let token = tokenizer.nextToken() + #expect(token?.type == .Comment(data: " " { + let token = tokenizer.nextToken() + #expect(token?.type == .Character(codePoint: codePoint)) + } + + let token2 = tokenizer.nextToken() + #expect(token2?.type == .EndOfFile) + } + + @Test func commentWithScriptTagInside() { + guard let tokenizer = HTMLTokenizer(input: "") else { + Issue.record("Failed to create tokenizer for ''") + return + } + + let token = tokenizer.nextToken() + #expect(token?.type == .Comment(data: " ")) + + let token2 = tokenizer.nextToken() + #expect(token2?.type == .EndOfFile) + } } diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift index bb391503352..bdecd8b8eb5 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift @@ -141,6 +141,28 @@ public class HTMLToken { } } } + public var selfClosing: Bool { + get { + switch self.type { + case .StartTag(_, let selfClosing, _, _): + return selfClosing + case .EndTag(_, let selfClosing, _, _): + return selfClosing + default: + preconditionFailure("selfClosing called on non-tag token") + } + } + set { + switch self.type { + case .StartTag(let tagName, _, let selfClosingAcknowledged, let attributes): + self.type = .StartTag(tagName: tagName, selfClosing: newValue, selfClosingAcknowledged: selfClosingAcknowledged, attributes: attributes) + case .EndTag(let tagName, _, let selfClosingAcknowledged, let attributes): + self.type = .EndTag(tagName: tagName, selfClosing: newValue, selfClosingAcknowledged: selfClosingAcknowledged, attributes: attributes) + default: + preconditionFailure("selfClosing= called on non-tag token") + } + } + } public init() {} public init(type: TokenType) { diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift index 3bffa831939..9ef32a532ef 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift @@ -173,6 +173,13 @@ public class HTMLTokenizer { return self.input[index] } + func peekNext(count: Int) -> Swift.Substring? { + guard let endIndex = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.index(before: self.input.endIndex)) else { + return nil + } + return self.input[self.cursor.. Character? { guard self.cursor < self.input.endIndex else { return nil @@ -1031,6 +1038,193 @@ public class HTMLTokenizer { self.finalizeCurrentAttribute(.SetValue) return reconsume(currentInputCharacter!, in: .BeforeAttributeName) } + // 13.2.5.40 Self-closing start tag state, https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state + case .SelfClosingStartTag: + switch currentInputCharacter { + case ">": + self.currentToken.selfClosing = true + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + return emitEOF() + default: + // FIXME: log_parse_error() + return reconsume(currentInputCharacter!, in: .BeforeAttributeName) + } + // 13.2.5.41 Bogus comment state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state + case .BogusComment: + switch currentInputCharacter { + case ">": + currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString())) + return switchToAndEmitCurrentToken(.Data) + case nil: + currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString())) + return emitCurrentTokenFollowedByEOF() + case "\0": + // FIXME: log_parse_error() + currentBuilder.append("\u{FFFD}") + return continueInCurrentState() + default: + self.currentBuilder.append(currentInputCharacter!) + return continueInCurrentState() + } + // 13.2.5.42 Markup declaration open state, https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state + case .MarkupDeclarationOpen: + dontConsumeNextInputCharacter() + if let nextTwo = peekNext(count: 2), nextTwo == "--" { + skip(2) + return switchTo(.CommentStart) + } else if let nextSeven = peekNext(count: 7), nextSeven.uppercased() == "DOCTYPE" { + skip(7) + return switchTo(.DOCTYPE) + } else if let nextSeven = peekNext(count: 7), nextSeven.uppercased() == "[CDATA[" { + skip(7) + // FIXME: If there is an adjusted current node and it is not an element in the HTML namespace, + // then switch to the CDATA section state. + // FIXME: log_parse_error() + self.currentBuilder = "[CDATA[" + self.currentToken = HTMLToken(type: .Comment(data: "")) + return switchTo(.BogusComment) + } else { + // FIXME: log_parse_error() + self.currentToken = HTMLToken(type: .Comment(data: "")) + return switchTo(.BogusComment) + } + // 13.2.5.43 Comment start state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state + case .CommentStart: + switch currentInputCharacter { + case "-": + return switchTo(.CommentStartDash) + case ">": + // FIXME: log_parse_error() + return switchToAndEmitCurrentToken(.Data) + default: + return reconsume(currentInputCharacter, in: .Comment) + } + // 13.2.5.44 Comment start dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state + case .CommentStartDash: + switch currentInputCharacter { + case "-": + return switchTo(.CommentEnd) + case ">": + // FIXME: log_parse_error() + currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString())) + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString())) + return emitCurrentTokenFollowedByEOF() + default: + currentBuilder.append("-") + return reconsume(currentInputCharacter, in: .Comment) + } + // 13.2.5.45 Comment state, https://html.spec.whatwg.org/multipage/parsing.html#comment-state + case .Comment: + switch currentInputCharacter { + case "<": + currentBuilder.append("<") + return switchTo(.CommentLessThanSign) + case "-": + return switchTo(.CommentEndDash) + case "\0": + // FIXME: log_parse_error() + currentBuilder.append("\u{FFFD}") + return continueInCurrentState() + case nil: + // FIXME: log_parse_error() + currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString())) + return emitCurrentTokenFollowedByEOF() + default: + currentBuilder.append(currentInputCharacter!) + return continueInCurrentState() + } + // 13.2.5.46 Comment less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state + case .CommentLessThanSign: + switch currentInputCharacter { + case "!": + currentBuilder.append(currentInputCharacter!) + return switchTo(.CommentLessThanSignBang) + case "<": + currentBuilder.append(currentInputCharacter!) + return continueInCurrentState() + default: + return reconsume(currentInputCharacter, in: .Comment) + } + // 13.2.5.47 Comment less-than sign bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state + case .CommentLessThanSignBang: + switch currentInputCharacter { + case "-": + return switchTo(.CommentLessThanSignBangDash) + default: + return reconsume(currentInputCharacter, in: .Comment) + } + // 13.2.5.48 Comment less-than sign bang dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state + case .CommentLessThanSignBangDash: + switch currentInputCharacter { + case "-": + return switchTo(.CommentLessThanSignBangDashDash) + default: + return reconsume(currentInputCharacter, in: .CommentEndDash) + } + // 13.2.5.49 Comment less-than sign bang dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state + case .CommentLessThanSignBangDashDash: + switch currentInputCharacter { + case ">", nil: + return reconsume(currentInputCharacter, in: .CommentEnd) + default: + // FIXME: log_parse_error() + return reconsume(currentInputCharacter, in: .CommentEnd) + } + // 13.2.5.50 Comment end dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state + case .CommentEndDash: + switch currentInputCharacter { + case "-": + return switchTo(.CommentEnd) + case nil: + // FIXME: log_parse_error() + currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString())) + return emitCurrentTokenFollowedByEOF() + default: + currentBuilder.append("-") + return reconsume(currentInputCharacter, in: .Comment) + } + // 13.2.5.51 Comment end state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state + case .CommentEnd: + switch currentInputCharacter { + case ">": + currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString())) + return switchToAndEmitCurrentToken(.Data) + case "!": + return switchTo(.CommentEndBang) + case "-": + currentBuilder.append("-") + return continueInCurrentState() + case nil: + // FIXME: log_parse_error() + currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString())) + return emitCurrentTokenFollowedByEOF() + default: + currentBuilder.append("--") + return reconsume(currentInputCharacter, in: .Comment) + } + // 13.2.5.52 Comment end bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state + case .CommentEndBang: + switch currentInputCharacter { + case "-": + currentBuilder.append("--!") + return switchTo(.CommentEndDash) + case ">": + // FIXME: log_parse_error() + currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString())) + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString())) + return emitCurrentTokenFollowedByEOF() + default: + currentBuilder.append("--!") + return reconsume(currentInputCharacter, in: .Comment) + } default: print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))") return emitEOF()