LibWeb: Add more HTML tokenization states to Swift implementation

This patch adds support for start and end tags, as well as script tag
rules.
This commit is contained in:
Andrew Kaster 2024-09-28 18:28:54 -06:00 committed by Andreas Kling
parent 91de0438fe
commit d96c7edfb6
Notes: github-actions[bot] 2024-10-02 07:45:32 +00:00
3 changed files with 942 additions and 22 deletions

View file

@ -76,7 +76,7 @@ struct TestHTMLTokenizerSwift {
#expect(token2 == nil)
}
@Test func dataStateTagOpen() {
@Test func tagOpenOnly() {
guard let tokenizer = HTMLTokenizer(input: "<") else {
Issue.record("Failed to create tokenizer for '<'")
return
@ -84,11 +84,14 @@ struct TestHTMLTokenizerSwift {
#expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state
let token = tokenizer.nextToken()
#expect(token?.type == .EndOfFile)
#expect(tokenizer.state == HTMLTokenizer.State.TagOpen)
#expect(token?.type == .Character(codePoint: "<"))
let token2 = tokenizer.nextToken()
#expect(token2 == nil)
#expect(token2?.type == .EndOfFile)
#expect(tokenizer.state == HTMLTokenizer.State.TagOpen)
let token3 = tokenizer.nextToken()
#expect(token3 == nil)
}
@Test func dataStateNulChar() {
@ -112,4 +115,141 @@ struct TestHTMLTokenizerSwift {
#expect(tokenizer.state == HTMLTokenizer.State.Data)
}
/// A lone `<script>` start tag with one double-quoted attribute is expected to
/// produce a single StartTag token followed by EndOfFile, with the tokenizer
/// reporting the Data state both before and after.
@Test func scriptTagWithAttributes() {
    guard let tokenizer = HTMLTokenizer(input: "<script type=\"text/javascript\">") else {
        Issue.record("Failed to create tokenizer for '<script type=\"text/javascript\">'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    // Build the expected attribute once so the expectation line stays readable.
    let typeAttribute = HTMLToken.Attribute(localName: "type", value: "text/javascript")
    let startTag = tokenizer.nextToken()
    #expect(startTag?.type == .StartTag(tagName: "script", attributes: [typeAttribute]))

    let endOfFile = tokenizer.nextToken()
    #expect(endOfFile?.type == .EndOfFile)
    #expect(tokenizer.state == HTMLTokenizer.State.Data)
}
/// `<script>…</script>` with inline content is expected to produce a StartTag,
/// one Character token per character of the content, an EndTag, and EndOfFile.
@Test func scriptWithContent() {
    guard let tokenizer = HTMLTokenizer(input: "<script>var x = 1;</script>") else {
        Issue.record("Failed to create tokenizer for '<script>var x = 1;</script>'")
        return
    }

    let openingTag = tokenizer.nextToken()
    #expect(openingTag?.type == .StartTag(tagName: "script", attributes: []))

    // Every character between the tags must come back as its own token, in order.
    for expectedCharacter in "var x = 1;" {
        let characterToken = tokenizer.nextToken()
        #expect(characterToken?.type == .Character(codePoint: expectedCharacter))
    }

    let closingTag = tokenizer.nextToken()
    #expect(closingTag?.type == .EndTag(tagName: "script"))

    let endOfFile = tokenizer.nextToken()
    #expect(endOfFile?.type == .EndOfFile)
}
/// `<div>hi</div>` is expected to produce StartTag, the characters "h" and "i",
/// EndTag, and finally EndOfFile.
@Test func simpleDivWithContent() {
    guard let tokenizer = HTMLTokenizer(input: "<div>hi</div>") else {
        Issue.record("Failed to create tokenizer for '<div>hi</div>'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    let openingTag = tokenizer.nextToken()
    #expect(openingTag?.type == .StartTag(tagName: "div", attributes: []))

    // The text content arrives one character token at a time.
    for expectedCharacter in "hi" {
        let characterToken = tokenizer.nextToken()
        #expect(characterToken?.type == .Character(codePoint: expectedCharacter))
    }

    let closingTag = tokenizer.nextToken()
    #expect(closingTag?.type == .EndTag(tagName: "div"))

    let endOfFile = tokenizer.nextToken()
    #expect(endOfFile?.type == .EndOfFile)
}
/// `<div class="foo">hi</div>` is expected to produce a StartTag carrying the
/// `class` attribute, the characters "h" and "i", an EndTag, and EndOfFile.
@Test func simpleDivWithContentAndAttributes() {
    guard let tokenizer = HTMLTokenizer(input: "<div class=\"foo\">hi</div>") else {
        Issue.record("Failed to create tokenizer for '<div class=\"foo\">hi</div>'")
        return
    }
    #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state

    let classAttribute = HTMLToken.Attribute(localName: "class", value: "foo")
    let openingTag = tokenizer.nextToken()
    #expect(openingTag?.type == .StartTag(tagName: "div", attributes: [classAttribute]))

    // The text content arrives one character token at a time.
    for expectedCharacter in "hi" {
        let characterToken = tokenizer.nextToken()
        #expect(characterToken?.type == .Character(codePoint: expectedCharacter))
    }

    let closingTag = tokenizer.nextToken()
    #expect(closingTag?.type == .EndTag(tagName: "div"))

    let endOfFile = tokenizer.nextToken()
    #expect(endOfFile?.type == .EndOfFile)
}
/// Two consecutive `<div>` elements, the first with an unquoted attribute value
/// and the second with a single-quoted one, are expected to tokenize to the
/// same attribute shape as the double-quoted form.
@Test func severalDivsWithAttributesAndContent() {
    // Explicitly use unquoted and single quotes for attribute values
    guard let tokenizer = HTMLTokenizer(input: "<div class=foo>hi</div><div class='bar'>bye</div>") else {
        // The failure message quotes the exact input above so a failure report
        // is not misleading about which quoting style was being exercised.
        Issue.record("Failed to create tokenizer for '<div class=foo>hi</div><div class='bar'>bye</div>'")
        return
    }

    let firstStartTag = tokenizer.nextToken()
    #expect(firstStartTag?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "foo")]))

    for codePoint in "hi" {
        let characterToken = tokenizer.nextToken()
        #expect(characterToken?.type == .Character(codePoint: codePoint))
    }

    let firstEndTag = tokenizer.nextToken()
    #expect(firstEndTag?.type == .EndTag(tagName: "div"))

    let secondStartTag = tokenizer.nextToken()
    #expect(secondStartTag?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "bar")]))

    for codePoint in "bye" {
        let characterToken = tokenizer.nextToken()
        #expect(characterToken?.type == .Character(codePoint: codePoint))
    }

    let secondEndTag = tokenizer.nextToken()
    #expect(secondEndTag?.type == .EndTag(tagName: "div"))

    let endOfFile = tokenizer.nextToken()
    #expect(endOfFile?.type == .EndOfFile)
}
/// A start tag with two attributes, followed by content and an end tag that
/// itself carries an attribute. The test expects the end tag's attribute to be
/// reported on the EndTag token.
@Test func startTagWithMultipleAttributes() {
    guard let tokenizer = HTMLTokenizer(input: "<div class=\"foo\" id=\"bar\">hi</div attr=endTagAttributeWhee>") else {
        // The failure message quotes the full input, including the end-tag attribute.
        Issue.record("Failed to create tokenizer for '<div class=\"foo\" id=\"bar\">hi</div attr=endTagAttributeWhee>'")
        return
    }

    let startTag = tokenizer.nextToken()
    #expect(startTag?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "foo"), HTMLToken.Attribute(localName: "id", value: "bar")]))

    for codePoint in "hi" {
        let characterToken = tokenizer.nextToken()
        #expect(characterToken?.type == .Character(codePoint: codePoint))
    }

    let endTag = tokenizer.nextToken()
    #expect(endTag?.type == .EndTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "attr", value: "endTagAttributeWhee")]))

    let endOfFile = tokenizer.nextToken()
    #expect(endOfFile?.type == .EndOfFile)
}
}

View file

@ -14,14 +14,19 @@ public class HTMLToken {
}
public struct Attribute: Equatable {
var prefix: Swift.String?
var localName: Swift.String
var namespace_: Swift.String?
var value: Swift.String
var nameStartPosition: Position
var nameEndPosition: Position
var valueStartPosition: Position
var valueEndPosition: Position
public var prefix: Swift.String? = nil
public var localName: Swift.String
public var namespace_: Swift.String? = nil
public var value: Swift.String
public var nameStartPosition = Position()
public var nameEndPosition = Position()
public var valueStartPosition = Position()
public var valueEndPosition = Position()
public init(localName: Swift.String, value: Swift.String) {
self.localName = localName
self.value = value
}
}
public enum TokenType: Equatable {
@ -33,14 +38,14 @@ public class HTMLToken {
forceQuirksMode: Bool)
case StartTag(
tagName: Swift.String,
selfClosing: Bool,
selfClosingAcknowledged: Bool,
attributes: [Attribute])
selfClosing: Bool = false,
selfClosingAcknowledged: Bool = false,
attributes: [Attribute] = [])
case EndTag(
tagName: Swift.String,
selfClosing: Bool,
selfClosingAcknowledged: Bool,
attributes: [Attribute])
selfClosing: Bool = false,
selfClosingAcknowledged: Bool = false,
attributes: [Attribute] = [])
case Comment(data: Swift.String)
case Character(codePoint: Character)
case EndOfFile
@ -53,6 +58,24 @@ public class HTMLToken {
return false
}
/// Returns `true` when this token's type is `.EndTag`, regardless of its payload.
public func isEndTag() -> Bool {
    switch self.type {
    case .EndTag:
        return true
    default:
        return false
    }
}
/// Returns `true` when this token's type is `.StartTag`, regardless of its payload.
public func isStartTag() -> Bool {
    switch self.type {
    case .StartTag:
        return true
    default:
        return false
    }
}
/// Returns `true` when this token is either a start tag or an end tag.
public func isTag() -> Bool {
    switch self.type {
    case .StartTag, .EndTag:
        return true
    default:
        return false
    }
}
public func isParserWhitespace() -> Bool {
precondition(isCharacter(), "isParserWhitespace() called on non-character token")
@ -73,6 +96,52 @@ public class HTMLToken {
public var startPosition = Position()
public var endPosition = Position()
// Enum associated values do not appear to be mutable in place
// (https://forums.swift.org/t/in-place-mutation-of-an-enum-associated-value/11747),
// so the setter below rebuilds the whole case around the new attribute list.

/// The attribute list of a start or end tag token.
///
/// Reading or writing this on any other token type is a programmer error and
/// traps via `preconditionFailure`.
public var attributes: [Attribute] {
    get {
        switch self.type {
        case .StartTag(_, _, _, let existing), .EndTag(_, _, _, let existing):
            return existing
        default:
            preconditionFailure("attributes called on non-tag token")
        }
    }
    set {
        switch self.type {
        case let .StartTag(tagName, selfClosing, selfClosingAcknowledged, _):
            // Keep every other field; only the attribute list is replaced.
            self.type = .StartTag(
                tagName: tagName,
                selfClosing: selfClosing,
                selfClosingAcknowledged: selfClosingAcknowledged,
                attributes: newValue)
        case let .EndTag(tagName, selfClosing, selfClosingAcknowledged, _):
            self.type = .EndTag(
                tagName: tagName,
                selfClosing: selfClosing,
                selfClosingAcknowledged: selfClosingAcknowledged,
                attributes: newValue)
        default:
            preconditionFailure("attributes= called on non-tag token")
        }
    }
}
/// The tag name of a start or end tag token.
///
/// Reading or writing this on any other token type is a programmer error and
/// traps via `preconditionFailure`.
public var tagName: Swift.String {
    get {
        switch self.type {
        case .StartTag(let name, _, _, _), .EndTag(let name, _, _, _):
            return name
        default:
            preconditionFailure("tagName called on non-tag token")
        }
    }
    set {
        // The setter rebuilds the whole case, carrying the other fields over unchanged.
        switch self.type {
        case let .StartTag(_, selfClosing, selfClosingAcknowledged, attributes):
            self.type = .StartTag(
                tagName: newValue,
                selfClosing: selfClosing,
                selfClosingAcknowledged: selfClosingAcknowledged,
                attributes: attributes)
        case let .EndTag(_, selfClosing, selfClosingAcknowledged, attributes):
            self.type = .EndTag(
                tagName: newValue,
                selfClosing: selfClosing,
                selfClosingAcknowledged: selfClosingAcknowledged,
                attributes: attributes)
        default:
            preconditionFailure("tagName= called on non-tag token")
        }
    }
}
public init() {}
public init(type: TokenType) {
self.type = type

View file

@ -18,6 +18,12 @@ extension Swift.String {
return nil
}
}
/// Returns the string's current contents and leaves the string empty.
public mutating func takeString() -> Swift.String {
    var taken = Swift.String()
    // After the swap, `taken` holds the old contents and `self` is empty.
    swap(&taken, &self)
    return taken
}
}
public class HTMLTokenizer {
@ -115,9 +121,24 @@ public class HTMLTokenizer {
private var currentToken = HTMLToken()
private var queuedTokens = Deque<HTMLToken>()
private var currentBuilder = Swift.String()
private var temporaryBuffer = Swift.String()
private var lastStartTagName: Swift.String? = nil
private var currentTokensAttributes: [HTMLToken.Attribute]? = nil
private var currentAttribute: HTMLToken.Attribute? = nil
private var aborted = false
private var hasEmittedEOF = false
// The three sets below are fixed lookup tables, so they are declared `let`:
// nothing in the visible code mutates them, and an immutable static avoids
// shared mutable global state.

// https://infra.spec.whatwg.org/#ascii-upper-alpha
static private let asciiUpperAlpha = CharacterSet(charactersIn: "ABCDEFGHIJKLMNOPQRSTUVWXYZ")
// https://infra.spec.whatwg.org/#ascii-lower-alpha
static private let asciiLowerAlpha = CharacterSet(charactersIn: "abcdefghijklmnopqrstuvwxyz")
// https://infra.spec.whatwg.org/#ascii-alpha
static private let asciiAlpha = asciiUpperAlpha.union(asciiLowerAlpha)
public init() {
self.cursor = self.input.startIndex
self.previousCursor = self.input.startIndex
@ -181,14 +202,37 @@ public class HTMLTokenizer {
/// Makes `token` the current token. For start and end tags it also starts a
/// fresh, empty list of pending attributes.
func createNewToken(_ token: HTMLToken) {
    // FIXME: Assign Position
    self.currentToken = token
    guard self.currentToken.isTag() else {
        return
    }
    self.currentTokensAttributes = []
}
// Tells finalizeCurrentAttribute(_:) what to do with the text accumulated in
// currentBuilder when the current attribute is finished.
enum AttributeStringBehavior {
// Take the builder text and store it as the attribute's localName.
case SetName
// Take the builder text and store it as the attribute's value.
case SetValue
// Take the builder text and discard it; the attribute is left as it is.
case IgnoreString
}
/// Completes the attribute currently being built and appends it to the current
/// token's pending attribute list.
///
/// The text accumulated in `currentBuilder` is always taken (leaving the
/// builder empty). `behavior` decides whether it becomes the attribute's name,
/// its value, or is thrown away. Afterwards `currentAttribute` is reset to `nil`.
///
/// Traps if there is no current attribute or no pending attribute list.
func finalizeCurrentAttribute(_ behavior: AttributeStringBehavior) {
    precondition(self.currentAttribute != nil && self.currentTokensAttributes != nil)

    var finishedAttribute = self.currentAttribute!
    let builderText = self.currentBuilder.takeString()

    switch behavior {
    case .SetName:
        finishedAttribute.localName = builderText
    case .SetValue:
        finishedAttribute.value = builderText
    case .IgnoreString:
        // The builder has already been drained above; nothing to store.
        break
    }

    self.currentTokensAttributes!.append(finishedAttribute)
    self.currentAttribute = nil
}
// Outcome of one step of the tokenizer state machine; this is the return type
// of nextTokenImpl and is interpreted by the loop in nextToken.
enum NextTokenState {
// A token (or nil) is ready; nextToken returns it to its caller.
case Emit(token: HTMLToken?)
// Returned by switchTo(_:) after it has changed the state; nextToken clears
// the pending input character before the next step.
case SwitchTo
// Returned by reconsume(_:in:) after it has changed the state; carries the
// character that should be processed again (nil presumably stands for
// end of input — TODO confirm against nextToken's handling).
case Reconsume(inputCharacter: Character?)
// NOTE(review): neither the producer nor the handling of this case is
// visible in this part of the file — confirm its meaning before relying on it.
case ReprocessQueue
// Returned by continueInCurrentState(); nextToken handles it the same way
// as SwitchTo.
case Continue
}
public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
@ -210,7 +254,7 @@ public class HTMLTokenizer {
switch nextTokenImpl(nextInputCharacter) {
case .Emit(let token):
return token
case .SwitchTo:
case .SwitchTo, .Continue:
nextInputCharacter = nil
break
case .Reconsume(let character):
@ -226,12 +270,16 @@ public class HTMLTokenizer {
}
}
/// Signals that the tokenizer should keep going without changing state.
func continueInCurrentState() -> NextTokenState {
    .Continue
}
/// Moves the tokenizer into `newState` and reports `.SwitchTo` to the caller.
func switchTo(_ newState: State) -> NextTokenState {
    state = newState
    return .SwitchTo
}
func reconsume(_ character: Character, `in` state: State) -> NextTokenState {
/// Moves the tokenizer into `newState` and asks for `character` to be processed
/// again there, by returning it inside `.Reconsume`.
func reconsume(_ character: Character?, `in` newState: State) -> NextTokenState {
    state = newState
    return .Reconsume(inputCharacter: character)
}
@ -251,6 +299,10 @@ public class HTMLTokenizer {
func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
self.state = state
if self.currentToken.isTag() {
self.currentToken.attributes = self.currentTokensAttributes ?? []
self.currentTokensAttributes = nil
}
self.queuedTokens.append(self.currentToken)
self.currentToken = HTMLToken()
return .Emit(token: self.queuedTokens.popFirst()!)
@ -280,6 +332,10 @@ public class HTMLTokenizer {
func emitCurrentTokenFollowedByEOF() -> NextTokenState {
precondition(!self.hasEmittedEOF)
if self.currentToken.isTag() {
self.currentToken.attributes = self.currentTokensAttributes ?? []
self.currentTokensAttributes = nil
}
self.queuedTokens.append(self.currentToken)
self.currentToken = HTMLToken()
return emitEOF()
@ -292,20 +348,44 @@ public class HTMLTokenizer {
return .Emit(token: self.queuedTokens.popFirst()!)
}
/// Empties the temporary buffer, sending its contents to the right place.
///
/// If the character reference was consumed as part of an attribute value, the
/// buffered text is appended to `currentBuilder`. Otherwise each buffered
/// character is queued as its own Character token.
func flushCodepointsConsumedAsACharacterReference() {
    guard !consumedAsPartOfAnAttribute() else {
        self.currentBuilder.append(self.temporaryBuffer.takeString())
        return
    }

    let flushedText = self.temporaryBuffer.takeString()
    for character in flushedText {
        self.queuedTokens.append(HTMLToken(type: .Character(codePoint: character)))
    }
}
/// Returns `true` when the return state is one of the three attribute-value
/// states (double-quoted, single-quoted, or unquoted).
func consumedAsPartOfAnAttribute() -> Bool {
    switch self.returnState {
    case .AttributeValueDoubleQuoted, .AttributeValueSingleQuoted, .AttributeValueUnquoted:
        return true
    default:
        return false
    }
}
/// Returns whether `token`'s tag name equals the last recorded start tag name.
///
/// Returns `false` when no start tag name has been recorded. Traps if `token`
/// is not an end tag.
func isAppropriateEndTagToken(_ token: HTMLToken) -> Bool {
    guard case let .EndTag(endTagName, _, _, _) = token.type else {
        preconditionFailure("isAppropriateEndTagToken called with non-end-tag token")
    }
    guard let startTagName = self.lastStartTagName else {
        return false
    }
    return startTagName == endTagName
}
func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
let dontConsumeNextInputCharacter = {
self.restoreCursorToPrevious()
}
let _ = dontConsumeNextInputCharacter
// FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder
// Handle reconsume by passing the character around in the state enum
let currentInputCharacter = nextInputCharacter ?? nextCodePoint()
switch self.state {
// 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
case .Data:
precondition(currentTokensAttributes == nil)
switch currentInputCharacter {
case "&":
self.returnState = .Data
@ -320,6 +400,637 @@ public class HTMLTokenizer {
default:
return emitCharacter(currentInputCharacter!)
}
// 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
case .RCDATA:
switch currentInputCharacter {
case "&":
self.returnState = .RCDATA
return switchTo(.CharacterReference)
case "<":
return switchTo(.RCDATALessThanSign)
case "\0":
// FIXME: log_parse_error()
return emitCharacter("\u{FFFD}")
case nil:
return emitEOF()
default:
return emitCharacter(currentInputCharacter!)
}
// 13.2.5.3. RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
case .RAWTEXT:
switch currentInputCharacter {
case "<":
return switchTo(.RAWTEXTLessThanSign)
case "\0":
// FIXME: log_parse_error()
return emitCharacter("\u{FFFD}")
case nil:
return emitEOF()
default:
return emitCharacter(currentInputCharacter!)
}
// 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
case .ScriptData:
switch currentInputCharacter {
case "<":
return switchTo(.ScriptDataLessThanSign)
case "\0":
// FIXME: log_parse_error()
return emitCharacter("\u{FFFD}")
case nil:
return emitEOF()
default:
return emitCharacter(currentInputCharacter!)
}
// 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
case .PLAINTEXT:
switch currentInputCharacter {
case "\0":
// FIXME: log_parse_error()
return emitCharacter("\u{FFFD}")
case nil:
return emitEOF()
default:
return emitCharacter(currentInputCharacter!)
}
// 13.2.5.6 Tag open state https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
case .TagOpen:
switch currentInputCharacter {
case "!":
return switchTo(.MarkupDeclarationOpen)
case "/":
return switchTo(.EndTagOpen)
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
createNewToken(HTMLToken(type: .StartTag(tagName: "")))
return reconsume(currentInputCharacter!, in: .TagName)
case "?":
// FIXME: log_parse_error()
createNewToken(HTMLToken(type: .Comment(data: "")))
return reconsume(currentInputCharacter!, in: .BogusComment)
case nil:
// FIXME: log_parse_error()
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
return emitEOF()
default:
// FIXME: log_parse_error()
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
return reconsume(currentInputCharacter!, in: .Data)
}
// 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
case .EndTagOpen:
switch currentInputCharacter {
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
createNewToken(HTMLToken(type: .EndTag(tagName: "")))
return reconsume(currentInputCharacter!, in: .TagName)
default:
return emitEOF()
}
// 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
case .TagName:
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ":
self.currentToken.tagName = self.currentBuilder.takeString()
return switchTo(.BeforeAttributeName)
case "/":
self.currentToken.tagName = self.currentBuilder.takeString()
return switchTo(.SelfClosingStartTag)
case ">":
self.currentToken.tagName = self.currentBuilder.takeString()
return switchToAndEmitCurrentToken(.Data)
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
return continueInCurrentState()
case "\0":
// FIXME: log_parse_error()
currentBuilder += "\u{FFFD}"
return continueInCurrentState()
case nil:
// FIXME: log_parse_error()
return emitEOF()
default:
currentBuilder.append(currentInputCharacter!)
return continueInCurrentState()
}
// 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
case .RCDATALessThanSign:
switch currentInputCharacter {
case "/":
self.temporaryBuffer = ""
return switchTo(.RCDATAEndTagOpen)
default:
return emitCharacterAndReconsume("<", in: .RCDATA, currentInputCharacter: currentInputCharacter)
}
// 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
case .RCDATAEndTagOpen:
switch currentInputCharacter {
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
createNewToken(HTMLToken(type: .EndTag(tagName: "")))
return reconsume(currentInputCharacter!, in: .RCDATAEndTagName)
default:
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
return reconsume(currentInputCharacter, in: .RCDATA)
}
// 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
case .RCDATAEndTagName:
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ":
if self.isAppropriateEndTagToken(currentToken) {
return switchTo(.BeforeAttributeName)
}
break
case "/":
if self.isAppropriateEndTagToken(currentToken) {
return switchTo(.SelfClosingStartTag)
}
break
case ">":
if self.isAppropriateEndTagToken(currentToken) {
return switchToAndEmitCurrentToken(.Data)
}
break
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
self.temporaryBuffer.append(c)
return continueInCurrentState()
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
self.currentBuilder.append(c)
self.temporaryBuffer.append(c)
return continueInCurrentState()
default:
break
}
// First three steps fall through to the "anything else" block
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
self.currentBuilder = ""
for codePoint in self.temporaryBuffer {
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
}
return reconsume(currentInputCharacter, in: .RCDATA)
// 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
case .ScriptDataLessThanSign:
switch currentInputCharacter {
case "/":
self.temporaryBuffer = ""
return switchTo(.ScriptDataEndTagOpen)
case "!":
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "!")))
return switchTo(.ScriptDataEscapeStart)
default:
return emitCharacterAndReconsume("<", in: .ScriptData, currentInputCharacter: currentInputCharacter)
}
// 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
case .ScriptDataEndTagOpen:
switch currentInputCharacter {
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
createNewToken(HTMLToken(type: .EndTag(tagName: "")))
return reconsume(currentInputCharacter!, in: .ScriptDataEndTagName)
default:
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
return reconsume(currentInputCharacter, in: .ScriptData)
}
// 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
case .ScriptDataEndTagName:
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ":
if self.isAppropriateEndTagToken(currentToken) {
return switchTo(.BeforeAttributeName)
}
break
case "/":
if self.isAppropriateEndTagToken(currentToken) {
return switchTo(.SelfClosingStartTag)
}
break
case ">":
if self.isAppropriateEndTagToken(currentToken) {
return switchToAndEmitCurrentToken(.Data)
}
break
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
self.temporaryBuffer.append(c)
return continueInCurrentState()
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
self.currentBuilder.append(c)
self.temporaryBuffer.append(c)
return continueInCurrentState()
default:
break
}
// First three steps fall through to the "anything else" block
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
self.currentBuilder = ""
for codePoint in self.temporaryBuffer {
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
}
return reconsume(currentInputCharacter, in: .ScriptData)
// 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
case .ScriptDataEscapeStart:
switch currentInputCharacter {
case "-":
return switchToAndEmitCharacter(.ScriptDataEscapeStartDash, character: "-")
default:
return reconsume(currentInputCharacter, in: .ScriptData)
}
// 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
case .ScriptDataEscapeStartDash:
switch currentInputCharacter {
case "-":
return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
default:
return reconsume(currentInputCharacter, in: .ScriptData)
}
// 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
case .ScriptDataEscaped:
switch currentInputCharacter {
case "-":
return switchToAndEmitCharacter(.ScriptDataEscapedDash, character: "-")
case "<":
return switchTo(.ScriptDataEscapedLessThanSign)
case "\0":
// FIXME: log_parse_error()
return emitCharacter("\u{FFFD}")
case nil:
// FIXME: log_parse_error()
return emitEOF()
default:
return emitCharacter(currentInputCharacter!)
}
// 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
case .ScriptDataEscapedDash:
switch currentInputCharacter {
case "-":
return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
case "<":
return switchTo(.ScriptDataEscapedLessThanSign)
case "\0":
// FIXME: log_parse_error()
return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
case nil:
// FIXME: log_parse_error()
return emitEOF()
default:
return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
}
// 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
case .ScriptDataEscapedDashDash:
switch currentInputCharacter {
case "-":
return emitCharacter("-")
case "<":
return switchTo(.ScriptDataEscapedLessThanSign)
case ">":
return switchToAndEmitCharacter(.ScriptData, character: ">")
case "\0":
// FIXME: log_parse_error()
return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
case nil:
// FIXME: log_parse_error()
return emitEOF()
default:
return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
}
// 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
case .ScriptDataEscapedLessThanSign:
switch currentInputCharacter {
case "/":
self.temporaryBuffer = ""
return switchTo(.ScriptDataEscapedEndTagOpen)
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
self.temporaryBuffer = ""
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
return reconsume(currentInputCharacter!, in: .ScriptDataDoubleEscapeStart)
default:
return emitCharacterAndReconsume("<", in: .ScriptDataEscaped, currentInputCharacter: currentInputCharacter)
}
// 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
case .ScriptDataEscapedEndTagOpen:
switch currentInputCharacter {
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
createNewToken(HTMLToken(type: .EndTag(tagName: "")))
return reconsume(currentInputCharacter!, in: .ScriptDataEscapedEndTagName)
default:
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
}
// 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
case .ScriptDataEscapedEndTagName:
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ":
if self.isAppropriateEndTagToken(currentToken) {
return switchTo(.BeforeAttributeName)
}
break
case "/":
if self.isAppropriateEndTagToken(currentToken) {
return switchTo(.SelfClosingStartTag)
}
break
case ">":
if self.isAppropriateEndTagToken(currentToken) {
return switchToAndEmitCurrentToken(.Data)
}
break
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
self.temporaryBuffer.append(c)
return continueInCurrentState()
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
self.currentBuilder.append(c)
self.temporaryBuffer.append(c)
return continueInCurrentState()
default:
break
}
// First three steps fall through to the "anything else" block
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
self.currentBuilder = ""
for codePoint in self.temporaryBuffer {
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
}
return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
// 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
case .ScriptDataDoubleEscapeStart:
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ", "/", ">":
if self.temporaryBuffer == "script" {
return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
} else {
return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
}
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
return emitCharacter(currentInputCharacter!)
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
self.temporaryBuffer.append(c)
return emitCharacter(currentInputCharacter!)
default:
return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
}
// 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
case .ScriptDataDoubleEscaped:
switch currentInputCharacter {
case "-":
return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDash, character: "-")
case "<":
return switchTo(.ScriptDataDoubleEscapedLessThanSign)
case "\0":
// FIXME: log_parse_error()
return emitCharacter("\u{FFFD}")
case nil:
// FIXME: log_parse_error()
return emitEOF()
default:
return emitCharacter(currentInputCharacter!)
}
// 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
case .ScriptDataDoubleEscapedDash:
switch currentInputCharacter {
case "-":
return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDashDash, character: "-")
case "<":
return switchTo(.ScriptDataDoubleEscapedLessThanSign)
case "\0":
// FIXME: log_parse_error()
return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
case nil:
// FIXME: log_parse_error()
return emitEOF()
default:
return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
}
// 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
case .ScriptDataDoubleEscapedDashDash:
switch currentInputCharacter {
case "-":
return emitCharacter("-")
case "<":
return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
case ">":
return switchToAndEmitCharacter(.ScriptData, character: ">")
case "\0":
// FIXME: log_parse_error()
return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
case nil:
// FIXME: log_parse_error()
return emitEOF()
default:
return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
}
// 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
case .ScriptDataDoubleEscapedLessThanSign:
switch currentInputCharacter {
case "/":
self.temporaryBuffer = ""
return switchToAndEmitCharacter(.ScriptDataDoubleEscapeEnd, character: "/")
default:
return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
}
// 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
case .ScriptDataDoubleEscapeEnd:
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ", "/", ">":
if self.temporaryBuffer == "script" {
return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
} else {
return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
}
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
return emitCharacter(currentInputCharacter!)
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
self.temporaryBuffer.append(c)
return emitCharacter(currentInputCharacter!)
default:
return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
}
// 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
case .BeforeAttributeName:
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ":
return continueInCurrentState()
case "/", ">", nil:
return reconsume(currentInputCharacter, in: .AfterAttributeName)
case "=":
// FIXME: log_parse_error()
self.currentBuilder = Swift.String(currentInputCharacter!)
self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
return switchTo(.AttributeName)
default:
self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
return reconsume(currentInputCharacter!, in: .AttributeName)
}
// 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
case .AttributeName:
// FIXME: When the user agent leaves the attribute name state (and before emitting the tag token, if appropriate),
// the complete attribute's name must be compared to the other attributes on the same token;
// if there is already an attribute on the token with the exact same name, then this is a duplicate-attribute
// parse error and the new attribute must be removed from the token.
// NOTE: If an attribute is so removed from a token, it, and the value that gets associated with it, if any,
// are never subsequently used by the parser, and are therefore effectively discarded. Removing the attribute
// in this way does not change its status as the "current attribute" for the purposes of the tokenizer, however.
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
// FIXME: set name position
self.currentAttribute!.localName = self.currentBuilder.takeString()
return reconsume(currentInputCharacter, in: .AfterAttributeName)
case "=":
// FIXME: set name position
self.currentAttribute!.localName = self.currentBuilder.takeString()
return switchTo(.BeforeAttributeValue)
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
return continueInCurrentState()
case "\0":
// FIXME: log_parse_error()
self.currentBuilder.append("\u{FFFD}")
return continueInCurrentState()
default:
self.currentBuilder.append(currentInputCharacter!)
return continueInCurrentState()
}
// 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
case .AfterAttributeName:
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ":
return continueInCurrentState()
case "/":
self.finalizeCurrentAttribute(.SetName)
return switchTo(.SelfClosingStartTag)
case "=":
self.finalizeCurrentAttribute(.SetName)
return switchTo(.BeforeAttributeValue)
case ">":
self.finalizeCurrentAttribute(.SetName)
return switchToAndEmitCurrentToken(.Data)
case nil:
// FIXME: log_parse_error()
self.finalizeCurrentAttribute(.IgnoreString)
return emitEOF()
default:
self.finalizeCurrentAttribute(.SetName)
self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
return reconsume(currentInputCharacter!, in: .AttributeName)
}
        // 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
        // Runs after the "=" of an attribute: skips whitespace and picks the quoted or unquoted value state.
        case .BeforeAttributeValue:
            switch currentInputCharacter {
            case "\t", "\n", "\u{000C}", " ":
                // Whitespace between "=" and the value is ignored.
                return continueInCurrentState()
            case "\"":
                // The opening quote is consumed here and does not become part of the value.
                return switchTo(.AttributeValueDoubleQuoted)
            case "'":
                return switchTo(.AttributeValueSingleQuoted)
            case ">":
                // FIXME: log_parse_error()
                // The tag ends without a value: finalize the attribute with .IgnoreString and emit the current token.
                self.finalizeCurrentAttribute(.IgnoreString)
                return switchToAndEmitCurrentToken(.Data)
            default:
                // Everything else (EOF included, since there is no nil case) is reprocessed as an unquoted value.
                return reconsume(currentInputCharacter, in: .AttributeValueUnquoted)
            }
// 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
case .AttributeValueDoubleQuoted:
switch currentInputCharacter {
case "\"":
return switchTo(.AfterAttributeValueQuoted)
case "&":
self.returnState = .AttributeValueDoubleQuoted
return switchTo(.CharacterReference)
case "\0":
// FIXME: log_parse_error()
self.currentBuilder.append("\u{FFFD}")
return continueInCurrentState()
case nil:
// FIXME: log_parse_error()
self.finalizeCurrentAttribute(.IgnoreString)
return emitEOF()
default:
self.currentBuilder.append(currentInputCharacter!)
return continueInCurrentState()
}
// 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
case .AttributeValueSingleQuoted:
switch currentInputCharacter {
case "'":
return switchTo(.AfterAttributeValueQuoted)
case "&":
self.returnState = .AttributeValueSingleQuoted
return switchTo(.CharacterReference)
case "\0":
// FIXME: log_parse_error()
self.currentBuilder.append("\u{FFFD}")
return continueInCurrentState()
case nil:
// FIXME: log_parse_error()
return emitEOF()
default:
self.currentBuilder.append(currentInputCharacter!)
return continueInCurrentState()
}
// 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
case .AttributeValueUnquoted:
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ":
self.finalizeCurrentAttribute(.SetValue)
return switchTo(.BeforeAttributeName)
case "&":
self.returnState = .AttributeValueUnquoted
return switchTo(.CharacterReference)
case ">":
self.finalizeCurrentAttribute(.SetValue)
return switchToAndEmitCurrentToken(.Data)
case "\0":
// FIXME: log_parse_error()
self.currentBuilder.append("\u{FFFD}")
return continueInCurrentState()
case "\"", "'", "<", "=", "`":
// FIXME: log_parse_error()
self.currentBuilder.append(currentInputCharacter!)
return continueInCurrentState()
case nil:
// FIXME: log_parse_error()
self.finalizeCurrentAttribute(.IgnoreString)
return emitEOF()
default:
self.currentBuilder.append(currentInputCharacter!)
return continueInCurrentState()
}
// 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
case .AfterAttributeValueQuoted:
switch currentInputCharacter {
case "\t", "\n", "\u{000C}", " ":
self.finalizeCurrentAttribute(.SetValue)
return switchTo(.BeforeAttributeName)
case "/":
self.finalizeCurrentAttribute(.SetValue)
return switchTo(.SelfClosingStartTag)
case ">":
self.finalizeCurrentAttribute(.SetValue)
return switchToAndEmitCurrentToken(.Data)
case nil:
// FIXME: log_parse_error()
self.finalizeCurrentAttribute(.IgnoreString)
return emitEOF()
default:
// FIXME: log_parse_error()
self.finalizeCurrentAttribute(.SetValue)
return reconsume(currentInputCharacter!, in: .BeforeAttributeName)
}
        default:
            // Fallback for any tokenizer state without a dedicated case above: print the state and the
            // current input character, then emit EOF.
            // NOTE(review): presumably a placeholder until the remaining states are ported; confirm.
            print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
            return emitEOF()