mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-01-22 09:12:13 -05:00
LibWeb: Add support for parsing comments in the Swift HTML tokenizer
This commit is contained in:
parent
36a8ad9157
commit
9d0ce4df0f
Notes:
github-actions[bot]
2024-10-16 06:32:37 +00:00
Author: https://github.com/ADKaster Commit: https://github.com/LadybirdBrowser/ladybird/commit/9d0ce4df0f2 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1819
3 changed files with 273 additions and 0 deletions
|
@ -252,4 +252,61 @@ struct TestHTMLTokenizerSwift {
|
|||
let token3 = tokenizer.nextToken()
|
||||
#expect(token3?.type == .EndOfFile)
|
||||
}
|
||||
|
||||
/// Checks that an XML declaration is tokenized as a single comment token
/// followed by end-of-file.
@Test func xmlDeclaration() {
    guard let tokenizer = HTMLTokenizer(input: "<?xml version=\"1.0\" encoding=\"UTF-8\"?>") else {
        Issue.record("Failed to create tokenizer for '<?xml version=\"1.0\" encoding=\"UTF-8\"?>'")
        return
    }

    // Everything between the angle brackets is expected as the comment data.
    let commentToken = tokenizer.nextToken()
    #expect(commentToken?.type == .Comment(data: "?xml version=\"1.0\" encoding=\"UTF-8\"?"))

    // Nothing else is in the input, so the next token must be end-of-file.
    let endToken = tokenizer.nextToken()
    #expect(endToken?.type == .EndOfFile)
}
|
||||
|
||||
/// Checks that a well-formed comment yields one comment token whose data is
/// the text between `<!--` and `-->`, followed by end-of-file.
@Test func simpleComment() {
    guard let tokenizer = HTMLTokenizer(input: "<!-- comment -->") else {
        Issue.record("Failed to create tokenizer for '<!-- comment -->'")
        return
    }

    // Surrounding spaces are part of the comment data.
    let commentToken = tokenizer.nextToken()
    #expect(commentToken?.type == .Comment(data: " comment "))

    let endToken = tokenizer.nextToken()
    #expect(endToken?.type == .EndOfFile)
}
|
||||
|
||||
/// Checks that comments do not nest: the first `-->` ends the comment and the
/// remaining " -->" is tokenized as individual character tokens.
@Test func nestedComment() {
    guard let tokenizer = HTMLTokenizer(input: "<!-- <!-- nested --> -->") else {
        Issue.record("Failed to create tokenizer for '<!-- <!-- nested --> -->'")
        return
    }

    // The inner "<!--" is kept verbatim inside the comment data.
    let commentToken = tokenizer.nextToken()
    #expect(commentToken?.type == .Comment(data: " <!-- nested "))

    // The text after the first "-->" comes back one character at a time.
    for expectedCharacter in " -->" {
        let characterToken = tokenizer.nextToken()
        #expect(characterToken?.type == .Character(codePoint: expectedCharacter))
    }

    let endToken = tokenizer.nextToken()
    #expect(endToken?.type == .EndOfFile)
}
|
||||
|
||||
/// Checks that markup inside a comment (here a complete `<script>` element)
/// is kept as plain comment data rather than being tokenized as tags.
@Test func commentWithScriptTagInside() {
    guard let tokenizer = HTMLTokenizer(input: "<!-- <script>var x = 1;</script> -->") else {
        Issue.record("Failed to create tokenizer for '<!-- <script>var x = 1;</script> -->'")
        return
    }

    // The whole script element is expected verbatim in the comment data.
    let commentToken = tokenizer.nextToken()
    #expect(commentToken?.type == .Comment(data: " <script>var x = 1;</script> "))

    let endToken = tokenizer.nextToken()
    #expect(endToken?.type == .EndOfFile)
}
|
||||
}
|
||||
|
|
|
@ -141,6 +141,28 @@ public class HTMLToken {
|
|||
}
|
||||
}
|
||||
}
|
||||
/// The self-closing flag of a tag token.
///
/// Only `.StartTag` and `.EndTag` tokens carry this flag. Reading or writing
/// it on any other token type is treated as a programmer error and traps via
/// `preconditionFailure`.
public var selfClosing: Bool {
    get {
        // Both tag cases store the flag as their second associated value.
        switch self.type {
        case .StartTag(_, let flag, _, _), .EndTag(_, let flag, _, _):
            return flag
        default:
            preconditionFailure("selfClosing called on non-tag token")
        }
    }
    set(flag) {
        // Rebuild the same case with the new flag; every other associated
        // value is carried over unchanged.
        switch self.type {
        case .StartTag(let name, _, let acknowledged, let attributeList):
            self.type = .StartTag(tagName: name, selfClosing: flag, selfClosingAcknowledged: acknowledged, attributes: attributeList)
        case .EndTag(let name, _, let acknowledged, let attributeList):
            self.type = .EndTag(tagName: name, selfClosing: flag, selfClosingAcknowledged: acknowledged, attributes: attributeList)
        default:
            preconditionFailure("selfClosing= called on non-tag token")
        }
    }
}
|
||||
|
||||
/// Creates a token without assigning anything explicitly; every stored
/// property keeps its declared default value.
public init() {}
|
||||
public init(type: TokenType) {
|
||||
|
|
|
@ -173,6 +173,13 @@ public class HTMLTokenizer {
|
|||
return self.input[index]
|
||||
}
|
||||
|
||||
/// Returns the next `count` characters starting at the cursor without
/// moving the cursor.
///
/// - Parameter count: Number of characters to look ahead.
/// - Returns: The slice of `input` covering `count` characters from `cursor`,
///   or `nil` when fewer than `count` characters remain.
func peekNext(count: Int) -> Swift.Substring? {
    // The slice's upper bound is exclusive, so it is allowed to equal
    // `endIndex`. Limiting by `endIndex` (rather than the index before it)
    // lets a lookahead that ends exactly at the end of the input succeed,
    // and avoids calling `index(before:)` on an empty input, which traps.
    guard let endIndex = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex) else {
        return nil
    }
    return self.input[self.cursor..<endIndex]
}
|
||||
|
||||
func nextCodePoint() -> Character? {
|
||||
guard self.cursor < self.input.endIndex else {
|
||||
return nil
|
||||
|
@ -1031,6 +1038,193 @@ public class HTMLTokenizer {
|
|||
self.finalizeCurrentAttribute(.SetValue)
|
||||
return reconsume(currentInputCharacter!, in: .BeforeAttributeName)
|
||||
}
|
||||
// 13.2.5.40 Self-closing start tag state, https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
case .SelfClosingStartTag:
    guard let character = currentInputCharacter else {
        // End of input: only EOF is emitted from here.
        // FIXME: log_parse_error()
        return emitEOF()
    }
    if character == ">" {
        // ">" completes the tag: set the self-closing flag and emit it.
        self.currentToken.selfClosing = true
        return switchToAndEmitCurrentToken(.Data)
    }
    // Any other character is handed back to attribute-name parsing.
    // FIXME: log_parse_error()
    return reconsume(character, in: .BeforeAttributeName)
|
||||
// 13.2.5.41 Bogus comment state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
case .BogusComment:
    guard let character = currentInputCharacter else {
        // End of input: emit what was collected as a comment, then EOF.
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return emitCurrentTokenFollowedByEOF()
    }
    if character == ">" {
        // ">" ends the bogus comment; the collected text becomes its data.
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return switchToAndEmitCurrentToken(.Data)
    }
    if character == "\0" {
        // NUL is replaced with U+FFFD REPLACEMENT CHARACTER.
        // FIXME: log_parse_error()
        currentBuilder.append("\u{FFFD}")
    } else {
        self.currentBuilder.append(character)
    }
    return continueInCurrentState()
|
||||
// 13.2.5.42 Markup declaration open state, https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
case .MarkupDeclarationOpen:
    // This state only looks ahead; characters are consumed (via skip) solely
    // when one of the keywords below matches.
    dontConsumeNextInputCharacter()
    if let nextTwo = peekNext(count: 2), nextTwo == "--" {
        skip(2)
        // Per spec, the (empty) comment token is created here so that the
        // following comment states have a comment token to emit.
        self.currentToken = HTMLToken(type: .Comment(data: ""))
        return switchTo(.CommentStart)
    }
    if let nextSeven = peekNext(count: 7) {
        // "DOCTYPE" is matched case-insensitively.
        if nextSeven.uppercased() == "DOCTYPE" {
            skip(7)
            return switchTo(.DOCTYPE)
        }
        // "[CDATA[" must match exactly (case-sensitive). A differently cased
        // spelling such as "<![cdata[" falls through to the bogus-comment
        // path below with its original text left unconsumed.
        if nextSeven == "[CDATA[" {
            skip(7)
            // FIXME: If there is an adjusted current node and it is not an element in the HTML namespace,
            //        then switch to the CDATA section state.
            // FIXME: log_parse_error()
            self.currentBuilder = "[CDATA["
            self.currentToken = HTMLToken(type: .Comment(data: ""))
            return switchTo(.BogusComment)
        }
    }
    // Nothing matched: treat whatever follows as a bogus comment.
    // FIXME: log_parse_error()
    self.currentToken = HTMLToken(type: .Comment(data: ""))
    return switchTo(.BogusComment)
|
||||
// 13.2.5.43 Comment start state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
case .CommentStart:
    switch currentInputCharacter {
    case "-":
        return switchTo(.CommentStartDash)
    case ">":
        // "<!-->": the comment is closed before any data was seen.
        // FIXME: log_parse_error()
        // Build the comment token before emitting, as every other emitting
        // comment state does; otherwise the token that happened to be
        // current before the comment would be emitted instead.
        // NOTE(review): assumes currentBuilder holds no leftover text when
        // a comment starts - confirm.
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return switchToAndEmitCurrentToken(.Data)
    default:
        // Anything else (including end of input) is handled by the comment state.
        return reconsume(currentInputCharacter, in: .Comment)
    }
|
||||
// 13.2.5.44 Comment start dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
case .CommentStartDash:
    guard let character = currentInputCharacter else {
        // End of input: emit the comment collected so far, then EOF.
        // FIXME: log_parse_error()
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return emitCurrentTokenFollowedByEOF()
    }
    switch character {
    case "-":
        return switchTo(.CommentEnd)
    case ">":
        // FIXME: log_parse_error()
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return switchToAndEmitCurrentToken(.Data)
    default:
        // The dash seen earlier was ordinary comment text after all.
        currentBuilder.append("-")
        return reconsume(character, in: .Comment)
    }
|
||||
// 13.2.5.45 Comment state, https://html.spec.whatwg.org/multipage/parsing.html#comment-state
case .Comment:
    guard let character = currentInputCharacter else {
        // End of input: emit the comment collected so far, then EOF.
        // FIXME: log_parse_error()
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return emitCurrentTokenFollowedByEOF()
    }
    switch character {
    case "<":
        // "<" is kept as data; the next state watches for a following "!".
        currentBuilder.append("<")
        return switchTo(.CommentLessThanSign)
    case "-":
        // Possibly the start of "-->"; the dash is not appended yet.
        return switchTo(.CommentEndDash)
    case "\0":
        // NUL is replaced with U+FFFD REPLACEMENT CHARACTER.
        // FIXME: log_parse_error()
        currentBuilder.append("\u{FFFD}")
        return continueInCurrentState()
    default:
        currentBuilder.append(character)
        return continueInCurrentState()
    }
|
||||
// 13.2.5.46 Comment less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
case .CommentLessThanSign:
    guard let character = currentInputCharacter else {
        // End of input is handed back to the comment state.
        return reconsume(currentInputCharacter, in: .Comment)
    }
    if character == "!" {
        // "<!" inside a comment: keep the "!" and look for following dashes.
        currentBuilder.append(character)
        return switchTo(.CommentLessThanSignBang)
    }
    if character == "<" {
        // Another "<": keep it and stay in this state.
        currentBuilder.append(character)
        return continueInCurrentState()
    }
    return reconsume(character, in: .Comment)
|
||||
// 13.2.5.47 Comment less-than sign bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
case .CommentLessThanSignBang:
    // Only a dash continues the "<!-" sequence; anything else goes back to
    // the comment state.
    if currentInputCharacter == "-" {
        return switchTo(.CommentLessThanSignBangDash)
    }
    return reconsume(currentInputCharacter, in: .Comment)
|
||||
// 13.2.5.48 Comment less-than sign bang dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
case .CommentLessThanSignBangDash:
    // A second dash completes "<!--"; anything else is reconsumed in the
    // comment end dash state.
    if currentInputCharacter == "-" {
        return switchTo(.CommentLessThanSignBangDashDash)
    }
    return reconsume(currentInputCharacter, in: .CommentEndDash)
|
||||
// 13.2.5.49 Comment less-than sign bang dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
case .CommentLessThanSignBangDashDash:
    // Every input is reconsumed in the comment end state. The only
    // difference between inputs is that anything other than ">" or end of
    // input should additionally be reported as a parse error.
    // FIXME: log_parse_error() when the input is neither ">" nor end of input
    return reconsume(currentInputCharacter, in: .CommentEnd)
|
||||
// 13.2.5.50 Comment end dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
case .CommentEndDash:
    guard let character = currentInputCharacter else {
        // End of input: emit the comment collected so far, then EOF.
        // FIXME: log_parse_error()
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return emitCurrentTokenFollowedByEOF()
    }
    if character == "-" {
        return switchTo(.CommentEnd)
    }
    // The single dash seen earlier was ordinary comment text.
    currentBuilder.append("-")
    return reconsume(character, in: .Comment)
|
||||
// 13.2.5.51 Comment end state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
case .CommentEnd:
    guard let character = currentInputCharacter else {
        // End of input: emit the comment collected so far, then EOF.
        // FIXME: log_parse_error()
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return emitCurrentTokenFollowedByEOF()
    }
    switch character {
    case ">":
        // "-->" seen: the comment is complete.
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return switchToAndEmitCurrentToken(.Data)
    case "!":
        return switchTo(.CommentEndBang)
    case "-":
        // One more dash becomes comment data; keep waiting for ">".
        currentBuilder.append("-")
        return continueInCurrentState()
    default:
        // The two dashes did not close the comment, so they are data.
        currentBuilder.append("--")
        return reconsume(character, in: .Comment)
    }
|
||||
// 13.2.5.52 Comment end bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
case .CommentEndBang:
    guard let character = currentInputCharacter else {
        // End of input: emit the comment collected so far, then EOF.
        // FIXME: log_parse_error()
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return emitCurrentTokenFollowedByEOF()
    }
    if character == ">" {
        // "--!>" closes the comment.
        // FIXME: log_parse_error()
        currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
        return switchToAndEmitCurrentToken(.Data)
    }
    // In both remaining cases the "--!" already seen becomes comment data;
    // they differ only in which state handles what comes next.
    currentBuilder.append("--!")
    if character == "-" {
        return switchTo(.CommentEndDash)
    }
    return reconsume(character, in: .Comment)
|
||||
default:
    // States without an implementation yet: print a diagnostic with the
    // state and current input, then end tokenization with EOF.
    let inputDescription = Swift.String(describing: currentInputCharacter)
    print("TODO: In state \(self.state) with input \(inputDescription)")
    return emitEOF()
|
||||
|
|
Loading…
Reference in a new issue