LibWeb: Add support for parsing comments in the Swift HTML tokenizer

This commit is contained in:
Andrew Kaster 2024-10-13 17:47:29 -06:00 committed by Andreas Kling
parent 36a8ad9157
commit 9d0ce4df0f
Notes: github-actions[bot] 2024-10-16 06:32:37 +00:00
3 changed files with 273 additions and 0 deletions

View file

@ -252,4 +252,61 @@ struct TestHTMLTokenizerSwift {
let token3 = tokenizer.nextToken()
#expect(token3?.type == .EndOfFile)
}
@Test func xmlDeclaration() {
guard let tokenizer = HTMLTokenizer(input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>") else {
Issue.record("Failed to create tokenizer for '<?xml version=\"1.0\" encoding=\"UTF-8\"?>'")
return
}
let token = tokenizer.nextToken()
#expect(token?.type == .Comment(data: "?xml version=\"1.0\" encoding=\"UTF-8\"?"))
let token2 = tokenizer.nextToken()
#expect(token2?.type == .EndOfFile)
}
@Test func simpleComment() {
guard let tokenizer = HTMLTokenizer(input: "<!-- comment -->") else {
Issue.record("Failed to create tokenizer for '<!-- comment -->'")
return
}
let token = tokenizer.nextToken()
#expect(token?.type == .Comment(data: " comment "))
let token2 = tokenizer.nextToken()
#expect(token2?.type == .EndOfFile)
}
@Test func nestedComment() {
guard let tokenizer = HTMLTokenizer(input: "<!-- <!-- nested --> -->") else {
Issue.record("Failed to create tokenizer for '<!-- <!-- nested --> -->'")
return
}
let token = tokenizer.nextToken()
#expect(token?.type == .Comment(data: " <!-- nested "))
for codePoint in " -->" {
let token = tokenizer.nextToken()
#expect(token?.type == .Character(codePoint: codePoint))
}
let token2 = tokenizer.nextToken()
#expect(token2?.type == .EndOfFile)
}
@Test func commentWithScriptTagInside() {
guard let tokenizer = HTMLTokenizer(input: "<!-- <script>var x = 1;</script> -->") else {
Issue.record("Failed to create tokenizer for '<!-- <script>var x = 1;</script> -->'")
return
}
let token = tokenizer.nextToken()
#expect(token?.type == .Comment(data: " <script>var x = 1;</script> "))
let token2 = tokenizer.nextToken()
#expect(token2?.type == .EndOfFile)
}
}

View file

@ -141,6 +141,28 @@ public class HTMLToken {
}
}
}
public var selfClosing: Bool {
get {
switch self.type {
case .StartTag(_, let selfClosing, _, _):
return selfClosing
case .EndTag(_, let selfClosing, _, _):
return selfClosing
default:
preconditionFailure("selfClosing called on non-tag token")
}
}
set {
switch self.type {
case .StartTag(let tagName, _, let selfClosingAcknowledged, let attributes):
self.type = .StartTag(tagName: tagName, selfClosing: newValue, selfClosingAcknowledged: selfClosingAcknowledged, attributes: attributes)
case .EndTag(let tagName, _, let selfClosingAcknowledged, let attributes):
self.type = .EndTag(tagName: tagName, selfClosing: newValue, selfClosingAcknowledged: selfClosingAcknowledged, attributes: attributes)
default:
preconditionFailure("selfClosing= called on non-tag token")
}
}
}
public init() {}
public init(type: TokenType) {

View file

@ -173,6 +173,13 @@ public class HTMLTokenizer {
return self.input[index]
}
func peekNext(count: Int) -> Swift.Substring? {
guard let endIndex = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.index(before: self.input.endIndex)) else {
return nil
}
return self.input[self.cursor..<endIndex]
}
func nextCodePoint() -> Character? {
guard self.cursor < self.input.endIndex else {
return nil
@ -1031,6 +1038,193 @@ public class HTMLTokenizer {
self.finalizeCurrentAttribute(.SetValue)
return reconsume(currentInputCharacter!, in: .BeforeAttributeName)
}
// 13.2.5.40 Self-closing start tag state, https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
case .SelfClosingStartTag:
switch currentInputCharacter {
case ">":
self.currentToken.selfClosing = true
return switchToAndEmitCurrentToken(.Data)
case nil:
// FIXME: log_parse_error()
return emitEOF()
default:
// FIXME: log_parse_error()
return reconsume(currentInputCharacter!, in: .BeforeAttributeName)
}
// 13.2.5.41 Bogus comment state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
case .BogusComment:
switch currentInputCharacter {
case ">":
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
return switchToAndEmitCurrentToken(.Data)
case nil:
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
return emitCurrentTokenFollowedByEOF()
case "\0":
// FIXME: log_parse_error()
currentBuilder.append("\u{FFFD}")
return continueInCurrentState()
default:
self.currentBuilder.append(currentInputCharacter!)
return continueInCurrentState()
}
// 13.2.5.42 Markup declaration open state, https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
case .MarkupDeclarationOpen:
dontConsumeNextInputCharacter()
if let nextTwo = peekNext(count: 2), nextTwo == "--" {
skip(2)
return switchTo(.CommentStart)
} else if let nextSeven = peekNext(count: 7), nextSeven.uppercased() == "DOCTYPE" {
skip(7)
return switchTo(.DOCTYPE)
} else if let nextSeven = peekNext(count: 7), nextSeven.uppercased() == "[CDATA[" {
skip(7)
// FIXME: If there is an adjusted current node and it is not an element in the HTML namespace,
// then switch to the CDATA section state.
// FIXME: log_parse_error()
self.currentBuilder = "[CDATA["
self.currentToken = HTMLToken(type: .Comment(data: ""))
return switchTo(.BogusComment)
} else {
// FIXME: log_parse_error()
self.currentToken = HTMLToken(type: .Comment(data: ""))
return switchTo(.BogusComment)
}
// 13.2.5.43 Comment start state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
case .CommentStart:
switch currentInputCharacter {
case "-":
return switchTo(.CommentStartDash)
case ">":
// FIXME: log_parse_error()
return switchToAndEmitCurrentToken(.Data)
default:
return reconsume(currentInputCharacter, in: .Comment)
}
// 13.2.5.44 Comment start dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
case .CommentStartDash:
switch currentInputCharacter {
case "-":
return switchTo(.CommentEnd)
case ">":
// FIXME: log_parse_error()
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
return switchToAndEmitCurrentToken(.Data)
case nil:
// FIXME: log_parse_error()
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
return emitCurrentTokenFollowedByEOF()
default:
currentBuilder.append("-")
return reconsume(currentInputCharacter, in: .Comment)
}
// 13.2.5.45 Comment state, https://html.spec.whatwg.org/multipage/parsing.html#comment-state
case .Comment:
switch currentInputCharacter {
case "<":
currentBuilder.append("<")
return switchTo(.CommentLessThanSign)
case "-":
return switchTo(.CommentEndDash)
case "\0":
// FIXME: log_parse_error()
currentBuilder.append("\u{FFFD}")
return continueInCurrentState()
case nil:
// FIXME: log_parse_error()
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
return emitCurrentTokenFollowedByEOF()
default:
currentBuilder.append(currentInputCharacter!)
return continueInCurrentState()
}
// 13.2.5.46 Comment less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
case .CommentLessThanSign:
switch currentInputCharacter {
case "!":
currentBuilder.append(currentInputCharacter!)
return switchTo(.CommentLessThanSignBang)
case "<":
currentBuilder.append(currentInputCharacter!)
return continueInCurrentState()
default:
return reconsume(currentInputCharacter, in: .Comment)
}
// 13.2.5.47 Comment less-than sign bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
case .CommentLessThanSignBang:
switch currentInputCharacter {
case "-":
return switchTo(.CommentLessThanSignBangDash)
default:
return reconsume(currentInputCharacter, in: .Comment)
}
// 13.2.5.48 Comment less-than sign bang dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
case .CommentLessThanSignBangDash:
switch currentInputCharacter {
case "-":
return switchTo(.CommentLessThanSignBangDashDash)
default:
return reconsume(currentInputCharacter, in: .CommentEndDash)
}
// 13.2.5.49 Comment less-than sign bang dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
case .CommentLessThanSignBangDashDash:
switch currentInputCharacter {
case ">", nil:
return reconsume(currentInputCharacter, in: .CommentEnd)
default:
// FIXME: log_parse_error()
return reconsume(currentInputCharacter, in: .CommentEnd)
}
// 13.2.5.50 Comment end dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
case .CommentEndDash:
switch currentInputCharacter {
case "-":
return switchTo(.CommentEnd)
case nil:
// FIXME: log_parse_error()
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
return emitCurrentTokenFollowedByEOF()
default:
currentBuilder.append("-")
return reconsume(currentInputCharacter, in: .Comment)
}
// 13.2.5.51 Comment end state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
case .CommentEnd:
switch currentInputCharacter {
case ">":
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
return switchToAndEmitCurrentToken(.Data)
case "!":
return switchTo(.CommentEndBang)
case "-":
currentBuilder.append("-")
return continueInCurrentState()
case nil:
// FIXME: log_parse_error()
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
return emitCurrentTokenFollowedByEOF()
default:
currentBuilder.append("--")
return reconsume(currentInputCharacter, in: .Comment)
}
// 13.2.5.52 Comment end bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
case .CommentEndBang:
switch currentInputCharacter {
case "-":
currentBuilder.append("--!")
return switchTo(.CommentEndDash)
case ">":
// FIXME: log_parse_error()
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
return switchToAndEmitCurrentToken(.Data)
case nil:
// FIXME: log_parse_error()
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
return emitCurrentTokenFollowedByEOF()
default:
currentBuilder.append("--!")
return reconsume(currentInputCharacter, in: .Comment)
}
default:
print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
return emitEOF()