diff --git a/Tests/LibWeb/TestHTMLTokenizerSwift.swift b/Tests/LibWeb/TestHTMLTokenizerSwift.swift index b6be94539de..d641fee8935 100644 --- a/Tests/LibWeb/TestHTMLTokenizerSwift.swift +++ b/Tests/LibWeb/TestHTMLTokenizerSwift.swift @@ -76,7 +76,7 @@ struct TestHTMLTokenizerSwift { #expect(token2 == nil) } - @Test func dataStateTagOpen() { + @Test func tagOpenOnly() { guard let tokenizer = HTMLTokenizer(input: "<") else { Issue.record("Failed to create tokenizer for '<'") return @@ -84,11 +84,14 @@ struct TestHTMLTokenizerSwift { #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state let token = tokenizer.nextToken() - #expect(token?.type == .EndOfFile) - #expect(tokenizer.state == HTMLTokenizer.State.TagOpen) + #expect(token?.type == .Character(codePoint: "<")) let token2 = tokenizer.nextToken() - #expect(token2 == nil) + #expect(token2?.type == .EndOfFile) + #expect(tokenizer.state == HTMLTokenizer.State.TagOpen) + + let token3 = tokenizer.nextToken() + #expect(token3 == nil) } @Test func dataStateNulChar() { @@ -112,4 +115,141 @@ struct TestHTMLTokenizerSwift { #expect(tokenizer.state == HTMLTokenizer.State.Data) } + + @Test func scriptTagWithAttributes() { + guard let tokenizer = HTMLTokenizer(input: "") else { + Issue.record("Failed to create tokenizer for ''") + return + } + + let token = tokenizer.nextToken() + #expect(token?.type == .StartTag(tagName: "script", attributes: [])) + + for codePoint in "var x = 1;" { + let token = tokenizer.nextToken() + #expect(token?.type == .Character(codePoint: codePoint)) + } + + let token2 = tokenizer.nextToken() + #expect(token2?.type == .EndTag(tagName: "script")) + + let token3 = tokenizer.nextToken() + #expect(token3?.type == .EndOfFile) + } + + @Test func simpleDivWithContent() { + guard let tokenizer = HTMLTokenizer(input: "
hi
") else { + Issue.record("Failed to create tokenizer for '
hi
'") + return + } + #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state + + let token = tokenizer.nextToken() + #expect(token?.type == .StartTag(tagName: "div", attributes: [])) + + let token2 = tokenizer.nextToken() + #expect(token2?.type == .Character(codePoint: "h")) + + let token3 = tokenizer.nextToken() + #expect(token3?.type == .Character(codePoint: "i")) + + let token4 = tokenizer.nextToken() + #expect(token4?.type == .EndTag(tagName: "div")) + + let token5 = tokenizer.nextToken() + #expect(token5?.type == .EndOfFile) + } + + @Test func simpleDivWithContentAndAttributes() { + guard let tokenizer = HTMLTokenizer(input: "
hi
") else { + Issue.record("Failed to create tokenizer for '
hi
'") + return + } + #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state + + let token = tokenizer.nextToken() + #expect(token?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "foo")])) + + let token2 = tokenizer.nextToken() + #expect(token2?.type == .Character(codePoint: "h")) + + let token3 = tokenizer.nextToken() + #expect(token3?.type == .Character(codePoint: "i")) + + let token4 = tokenizer.nextToken() + #expect(token4?.type == .EndTag(tagName: "div")) + + let token5 = tokenizer.nextToken() + #expect(token5?.type == .EndOfFile) + } + + @Test func severalDivsWithAttributesAndContent() { + // Explicitly use unquoted and single quotes for attribute values + guard let tokenizer = HTMLTokenizer(input: "
hi
bye
") else { + Issue.record("Failed to create tokenizer for '
hi
bye
'") + return + } + + let token = tokenizer.nextToken() + #expect(token?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "foo")])) + + for codePoint in "hi" { + let token = tokenizer.nextToken() + #expect(token?.type == .Character(codePoint: codePoint)) + } + + let token2 = tokenizer.nextToken() + #expect(token2?.type == .EndTag(tagName: "div")) + + let token3 = tokenizer.nextToken() + #expect(token3?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "bar")])) + + for codePoint in "bye" { + let token = tokenizer.nextToken() + #expect(token?.type == .Character(codePoint: codePoint)) + } + + let token4 = tokenizer.nextToken() + #expect(token4?.type == .EndTag(tagName: "div")) + + let token5 = tokenizer.nextToken() + #expect(token5?.type == .EndOfFile) + } + + @Test func startTagWithMultipleAttributes() { + guard let tokenizer = HTMLTokenizer(input: "
hi
") else { + Issue.record("Failed to create tokenizer for '
hi
'") + return + } + + let token = tokenizer.nextToken() + #expect(token?.type == .StartTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "class", value: "foo"), HTMLToken.Attribute(localName: "id", value: "bar")])) + + for codePoint in "hi" { + let token = tokenizer.nextToken() + #expect(token?.type == .Character(codePoint: codePoint)) + } + + let token2 = tokenizer.nextToken() + #expect(token2?.type == .EndTag(tagName: "div", attributes: [HTMLToken.Attribute(localName: "attr", value: "endTagAttributeWhee")])) + + let token3 = tokenizer.nextToken() + #expect(token3?.type == .EndOfFile) + } } diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift index c5920d13cc6..bb391503352 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.swift @@ -14,14 +14,19 @@ public class HTMLToken { } public struct Attribute: Equatable { - var prefix: Swift.String? - var localName: Swift.String - var namespace_: Swift.String? - var value: Swift.String - var nameStartPosition: Position - var nameEndPosition: Position - var valueStartPosition: Position - var valueEndPosition: Position + public var prefix: Swift.String? = nil + public var localName: Swift.String + public var namespace_: Swift.String? 
= nil + public var value: Swift.String + public var nameStartPosition = Position() + public var nameEndPosition = Position() + public var valueStartPosition = Position() + public var valueEndPosition = Position() + + public init(localName: Swift.String, value: Swift.String) { + self.localName = localName + self.value = value + } } public enum TokenType: Equatable { @@ -33,14 +38,14 @@ public class HTMLToken { forceQuirksMode: Bool) case StartTag( tagName: Swift.String, - selfClosing: Bool, - selfClosingAcknowledged: Bool, - attributes: [Attribute]) + selfClosing: Bool = false, + selfClosingAcknowledged: Bool = false, + attributes: [Attribute] = []) case EndTag( tagName: Swift.String, - selfClosing: Bool, - selfClosingAcknowledged: Bool, - attributes: [Attribute]) + selfClosing: Bool = false, + selfClosingAcknowledged: Bool = false, + attributes: [Attribute] = []) case Comment(data: Swift.String) case Character(codePoint: Character) case EndOfFile @@ -53,6 +58,24 @@ public class HTMLToken { return false } + public func isEndTag() -> Bool { + if case .EndTag(_, _, _, _) = self.type { + return true + } + return false + } + + public func isStartTag() -> Bool { + if case .StartTag(_, _, _, _) = self.type { + return true + } + return false + } + + public func isTag() -> Bool { + return isStartTag() || isEndTag() + } + public func isParserWhitespace() -> Bool { precondition(isCharacter(), "isParserWhitespace() called on non-character token") @@ -73,6 +96,52 @@ public class HTMLToken { public var startPosition = Position() public var endPosition = Position() + // Is in-place mutating enums a thing? 
Seems not https://forums.swift.org/t/in-place-mutation-of-an-enum-associated-value/11747 + public var attributes: [Attribute] { + get { + switch self.type { + case .StartTag(_, _, _, let attributes): + return attributes + case .EndTag(_, _, _, let attributes): + return attributes + default: + preconditionFailure("attributes called on non-tag token") + } + } + set { + switch self.type { + case .StartTag(let tagName, let selfClosing, let selfClosingAcknowledged, attributes: _): + self.type = .StartTag(tagName: tagName, selfClosing: selfClosing, selfClosingAcknowledged: selfClosingAcknowledged, attributes: newValue) + case .EndTag(let tagName, let selfClosing, let selfClosingAcknowledged, attributes: _): + self.type = .EndTag(tagName: tagName, selfClosing: selfClosing, selfClosingAcknowledged: selfClosingAcknowledged, attributes: newValue) + default: + preconditionFailure("attributes= called on non-tag token") + } + } + } + public var tagName: Swift.String { + get { + switch self.type { + case .StartTag(let tagName, _, _, _): + return tagName + case .EndTag(let tagName, _, _, _): + return tagName + default: + preconditionFailure("tagName called on non-tag token") + } + } + set { + switch self.type { + case .StartTag(tagName: _, let selfClosing, let selfClosingAcknowledged, let attributes): + self.type = .StartTag(tagName: newValue, selfClosing: selfClosing, selfClosingAcknowledged: selfClosingAcknowledged, attributes: attributes) + case .EndTag(tagName: _, let selfClosing, let selfClosingAcknowledged, let attributes): + self.type = .EndTag(tagName: newValue, selfClosing: selfClosing, selfClosingAcknowledged: selfClosingAcknowledged, attributes: attributes) + default: + preconditionFailure("tagName= called on non-tag token") + } + } + } + public init() {} public init(type: TokenType) { self.type = type diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift index 79bce616082..3bffa831939 100644 
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift @@ -18,6 +18,12 @@ extension Swift.String { return nil } } + + public mutating func takeString() -> Swift.String { // Hands back the current contents and leaves the receiver empty. + let result = self + self = "" + return result + } } public class HTMLTokenizer { @@ -115,9 +121,24 @@ public class HTMLTokenizer { private var currentToken = HTMLToken() private var queuedTokens = Deque() + private var currentBuilder = Swift.String() + private var temporaryBuffer = Swift.String() + private var lastStartTagName: Swift.String? = nil + private var currentTokensAttributes: [HTMLToken.Attribute]? = nil + private var currentAttribute: HTMLToken.Attribute? = nil + private var aborted = false private var hasEmittedEOF = false + // https://infra.spec.whatwg.org/#ascii-upper-alpha + static private var asciiUpperAlpha = CharacterSet(charactersIn: "ABCDEFGHIJKLMNOPQRSTUVWXYZ") + + // https://infra.spec.whatwg.org/#ascii-lower-alpha + static private var asciiLowerAlpha = CharacterSet(charactersIn: "abcdefghijklmnopqrstuvwxyz") + + // https://infra.spec.whatwg.org/#ascii-alpha + static private var asciiAlpha = asciiUpperAlpha.union(asciiLowerAlpha) + public init() { self.cursor = self.input.startIndex self.previousCursor = self.input.startIndex @@ -181,14 +202,37 @@ public class HTMLTokenizer { func createNewToken(_ token: HTMLToken) { self.currentToken = token + if self.currentToken.isTag() { + self.currentTokensAttributes = [] + } // FIXME: Assign Position } + enum AttributeStringBehavior { + case SetName + case SetValue + case IgnoreString + } + func finalizeCurrentAttribute(_ behavior: AttributeStringBehavior) { // Optionally moves currentBuilder into the attribute's name or value, then appends the attribute to currentTokensAttributes and clears currentAttribute. + precondition(self.currentAttribute != nil && self.currentTokensAttributes != nil) + switch behavior { + case .SetName: + self.currentAttribute!.localName = self.currentBuilder.takeString() + case .SetValue: + self.currentAttribute!.value = self.currentBuilder.takeString() + case .IgnoreString: + _ = 
self.currentBuilder.takeString() + } + self.currentTokensAttributes!.append(self.currentAttribute!) + self.currentAttribute = nil + } + enum NextTokenState { case Emit(token: HTMLToken?) case SwitchTo case Reconsume(inputCharacter: Character?) case ReprocessQueue + case Continue } public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? { @@ -210,7 +254,7 @@ public class HTMLTokenizer { switch nextTokenImpl(nextInputCharacter) { case .Emit(let token): return token - case .SwitchTo: + case .SwitchTo, .Continue: nextInputCharacter = nil break case .Reconsume(let character): @@ -226,12 +270,16 @@ public class HTMLTokenizer { } } + func continueInCurrentState() -> NextTokenState { + return .Continue + } + func switchTo(_ state: State) -> NextTokenState { self.state = state return .SwitchTo } - func reconsume(_ character: Character, `in` state: State) -> NextTokenState { + func reconsume(_ character: Character?, `in` state: State) -> NextTokenState { self.state = state return .Reconsume(inputCharacter: character) } @@ -251,6 +299,10 @@ public class HTMLTokenizer { func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState { self.state = state + if self.currentToken.isTag() { + self.currentToken.attributes = self.currentTokensAttributes ?? [] + self.currentTokensAttributes = nil + } self.queuedTokens.append(self.currentToken) self.currentToken = HTMLToken() return .Emit(token: self.queuedTokens.popFirst()!) @@ -280,6 +332,10 @@ public class HTMLTokenizer { func emitCurrentTokenFollowedByEOF() -> NextTokenState { precondition(!self.hasEmittedEOF) + if self.currentToken.isTag() { + self.currentToken.attributes = self.currentTokensAttributes ?? [] + self.currentTokensAttributes = nil + } self.queuedTokens.append(self.currentToken) self.currentToken = HTMLToken() return emitEOF() @@ -292,20 +348,44 @@ public class HTMLTokenizer { return .Emit(token: self.queuedTokens.popFirst()!) 
} + func flushCodepointsConsumedAsACharacterReference() { + if consumedAsPartOfAnAttribute() { + self.currentBuilder += self.temporaryBuffer.takeString() + } else { + for codePoint in self.temporaryBuffer.takeString() { + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint))) + } + } + } + + func consumedAsPartOfAnAttribute() -> Bool { + return self.returnState == .AttributeValueDoubleQuoted || self.returnState == .AttributeValueSingleQuoted || self.returnState == .AttributeValueUnquoted + } + + func isAppropriateEndTagToken(_ token: HTMLToken) -> Bool { + guard case let .EndTag(endTagName, _, _, _) = token.type else { + preconditionFailure("isAppropriateEndTagToken called with non-end-tag token") + } + if let startTagName = self.lastStartTagName { + return startTagName == endTagName + } else { + return false + } + } + func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState { let dontConsumeNextInputCharacter = { self.restoreCursorToPrevious() } let _ = dontConsumeNextInputCharacter - // FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder - // Handle reconsume by passing the character around in the state enum let currentInputCharacter = nextInputCharacter ?? nextCodePoint() switch self.state { // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state case .Data: + precondition(currentTokensAttributes == nil) switch currentInputCharacter { case "&": self.returnState = .Data @@ -320,6 +400,637 @@ public class HTMLTokenizer { default: return emitCharacter(currentInputCharacter!) 
} + + // 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state + case .RCDATA: + switch currentInputCharacter { + case "&": + self.returnState = .RCDATA + return switchTo(.CharacterReference) + case "<": + return switchTo(.RCDATALessThanSign) + case "\0": + // FIXME: log_parse_error() + return emitCharacter("\u{FFFD}") + case nil: + return emitEOF() + default: + return emitCharacter(currentInputCharacter!) + } + + // 13.2.5.3. RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state + case .RAWTEXT: + switch currentInputCharacter { + case "<": + return switchTo(.RAWTEXTLessThanSign) + case "\0": + // FIXME: log_parse_error() + return emitCharacter("\u{FFFD}") + case nil: + return emitEOF() + default: + return emitCharacter(currentInputCharacter!) + } + // 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state + case .ScriptData: + switch currentInputCharacter { + case "<": + return switchTo(.ScriptDataLessThanSign) + case "\0": + // FIXME: log_parse_error() + return emitCharacter("\u{FFFD}") + case nil: + return emitEOF() + default: + return emitCharacter(currentInputCharacter!) + } + // 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state + case .PLAINTEXT: + switch currentInputCharacter { + case "\0": + // FIXME: log_parse_error() + return emitCharacter("\u{FFFD}") + case nil: + return emitEOF() + default: + return emitCharacter(currentInputCharacter!) + } + // 13.2.5.6 Tag open state https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state + case .TagOpen: + switch currentInputCharacter { + case "!": + return switchTo(.MarkupDeclarationOpen) + case "/": + return switchTo(.EndTagOpen) + case let c? 
where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!): + createNewToken(HTMLToken(type: .StartTag(tagName: ""))) + return reconsume(currentInputCharacter!, in: .TagName) + case "?": + // FIXME: log_parse_error() + createNewToken(HTMLToken(type: .Comment(data: ""))) + return reconsume(currentInputCharacter!, in: .BogusComment) + case nil: + // FIXME: log_parse_error() + queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + return emitEOF() + default: + // FIXME: log_parse_error() + queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + return reconsume(currentInputCharacter!, in: .Data) + } + // 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state + case .EndTagOpen: + switch currentInputCharacter { + case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!): + createNewToken(HTMLToken(type: .EndTag(tagName: ""))) + return reconsume(currentInputCharacter!, in: .TagName) + case ">": + // "</>" is dropped entirely: no token is emitted and tokenization resumes in the data state. + // FIXME: log_parse_error() + return switchTo(.Data) + case nil: + // EOF right after "</": both characters are text, followed by the end-of-file token. + // FIXME: log_parse_error() + queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + queuedTokens.append(HTMLToken(type: .Character(codePoint: "/"))) + return emitEOF() + default: + // Any other character turns "</" into a bogus comment instead of ending the token stream. + // FIXME: log_parse_error() + createNewToken(HTMLToken(type: .Comment(data: ""))) + return reconsume(currentInputCharacter!, in: .BogusComment) + } + // 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state + case .TagName: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + self.currentToken.tagName = self.currentBuilder.takeString() + return switchTo(.BeforeAttributeName) + case "/": + self.currentToken.tagName = self.currentBuilder.takeString() + return switchTo(.SelfClosingStartTag) + case ">": + self.currentToken.tagName = self.currentBuilder.takeString() + return switchToAndEmitCurrentToken(.Data) + case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!): + currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20))) + return continueInCurrentState() + case "\0": + // FIXME: log_parse_error() + currentBuilder += "\u{FFFD}" + return continueInCurrentState() + case nil: + // FIXME: log_parse_error() + return emitEOF() + default: + currentBuilder.append(currentInputCharacter!) 
+ return continueInCurrentState() + } + // 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state + case .RCDATALessThanSign: + switch currentInputCharacter { + case "/": + self.temporaryBuffer = "" + return switchTo(.RCDATAEndTagOpen) + default: + return emitCharacterAndReconsume("<", in: .RCDATA, currentInputCharacter: currentInputCharacter) + } + // 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state + case .RCDATAEndTagOpen: + switch currentInputCharacter { + case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!): + createNewToken(HTMLToken(type: .EndTag(tagName: ""))) + return reconsume(currentInputCharacter!, in: .RCDATAEndTagName) + default: + queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + queuedTokens.append(HTMLToken(type: .Character(codePoint: "/"))) + return reconsume(currentInputCharacter, in: .RCDATA) + } + // 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state + case .RCDATAEndTagName: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + if self.isAppropriateEndTagToken(currentToken) { + return switchTo(.BeforeAttributeName) + } + break + case "/": + if self.isAppropriateEndTagToken(currentToken) { + return switchTo(.SelfClosingStartTag) + } + break + case ">": + if self.isAppropriateEndTagToken(currentToken) { + return switchToAndEmitCurrentToken(.Data) + } + break + case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!): + self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20))) + self.temporaryBuffer.append(c) + return continueInCurrentState() + case let c? 
where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!): + self.currentBuilder.append(c) + self.temporaryBuffer.append(c) + return continueInCurrentState() + default: + break + } + + // First three steps fall through to the "anything else" block + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/"))) + // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. + self.currentBuilder = "" + for codePoint in self.temporaryBuffer { + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint))) + } + return reconsume(currentInputCharacter, in: .RCDATA) + // 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state + case .ScriptDataLessThanSign: + switch currentInputCharacter { + case "/": + self.temporaryBuffer = "" + return switchTo(.ScriptDataEndTagOpen) + case "!": + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "!"))) + return switchTo(.ScriptDataEscapeStart) + default: + return emitCharacterAndReconsume("<", in: .ScriptData, currentInputCharacter: currentInputCharacter) + } + // 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state + case .ScriptDataEndTagOpen: + switch currentInputCharacter { + case let c? 
where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!): + createNewToken(HTMLToken(type: .EndTag(tagName: ""))) + return reconsume(currentInputCharacter!, in: .ScriptDataEndTagName) + default: + queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + queuedTokens.append(HTMLToken(type: .Character(codePoint: "/"))) + return reconsume(currentInputCharacter, in: .ScriptData) + } + // 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state + case .ScriptDataEndTagName: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + if self.isAppropriateEndTagToken(currentToken) { + return switchTo(.BeforeAttributeName) + } + break + case "/": + if self.isAppropriateEndTagToken(currentToken) { + return switchTo(.SelfClosingStartTag) + } + break + case ">": + if self.isAppropriateEndTagToken(currentToken) { + return switchToAndEmitCurrentToken(.Data) + } + break + case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!): + self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20))) + self.temporaryBuffer.append(c) + return continueInCurrentState() + case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!): + self.currentBuilder.append(c) + self.temporaryBuffer.append(c) + return continueInCurrentState() + default: + break + } + + // First three steps fall through to the "anything else" block + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/"))) + // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 
+ self.currentBuilder = "" + for codePoint in self.temporaryBuffer { + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint))) + } + return reconsume(currentInputCharacter, in: .ScriptData) + // 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state + case .ScriptDataEscapeStart: + switch currentInputCharacter { + case "-": + return switchToAndEmitCharacter(.ScriptDataEscapeStartDash, character: "-") + default: + return reconsume(currentInputCharacter, in: .ScriptData) + } + // 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state + case .ScriptDataEscapeStartDash: + switch currentInputCharacter { + case "-": + return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-") + default: + return reconsume(currentInputCharacter, in: .ScriptData) + } + // 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state + case .ScriptDataEscaped: + switch currentInputCharacter { + case "-": + return switchToAndEmitCharacter(.ScriptDataEscapedDash, character: "-") + case "<": + return switchTo(.ScriptDataEscapedLessThanSign) + case "\0": + // FIXME: log_parse_error() + return emitCharacter("\u{FFFD}") + case nil: + // FIXME: log_parse_error() + return emitEOF() + default: + return emitCharacter(currentInputCharacter!) 
+ } + // 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state + case .ScriptDataEscapedDash: + switch currentInputCharacter { + case "-": + return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-") + case "<": + return switchTo(.ScriptDataEscapedLessThanSign) + case "\0": + // FIXME: log_parse_error() + return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}") + case nil: + // FIXME: log_parse_error() + return emitEOF() + default: + return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!) + } + // 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state + case .ScriptDataEscapedDashDash: + switch currentInputCharacter { + case "-": + return emitCharacter("-") + case "<": + return switchTo(.ScriptDataEscapedLessThanSign) + case ">": + return switchToAndEmitCharacter(.ScriptData, character: ">") + case "\0": + // FIXME: log_parse_error() + return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}") + case nil: + // FIXME: log_parse_error() + return emitEOF() + default: + return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!) + } + // 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state + case .ScriptDataEscapedLessThanSign: + switch currentInputCharacter { + case "/": + self.temporaryBuffer = "" + return switchTo(.ScriptDataEscapedEndTagOpen) + case let c? 
where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!): + self.temporaryBuffer = "" + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + return reconsume(currentInputCharacter!, in: .ScriptDataDoubleEscapeStart) + default: + return emitCharacterAndReconsume("<", in: .ScriptDataEscaped, currentInputCharacter: currentInputCharacter) + } + // 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state + case .ScriptDataEscapedEndTagOpen: + switch currentInputCharacter { + case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!): + createNewToken(HTMLToken(type: .EndTag(tagName: ""))) + return reconsume(currentInputCharacter!, in: .ScriptDataEscapedEndTagName) + default: + queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + queuedTokens.append(HTMLToken(type: .Character(codePoint: "/"))) + return reconsume(currentInputCharacter, in: .ScriptDataEscaped) + } + // 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state + case .ScriptDataEscapedEndTagName: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + if self.isAppropriateEndTagToken(currentToken) { + return switchTo(.BeforeAttributeName) + } + break + case "/": + if self.isAppropriateEndTagToken(currentToken) { + return switchTo(.SelfClosingStartTag) + } + break + case ">": + if self.isAppropriateEndTagToken(currentToken) { + return switchToAndEmitCurrentToken(.Data) + } + break + case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!): + self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20))) + self.temporaryBuffer.append(c) + return continueInCurrentState() + case let c? 
where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!): + self.currentBuilder.append(c) + self.temporaryBuffer.append(c) + return continueInCurrentState() + default: + break + } + + // First three steps fall through to the "anything else" block + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/"))) + // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. + self.currentBuilder = "" + for codePoint in self.temporaryBuffer { + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint))) + } + return reconsume(currentInputCharacter, in: .ScriptDataEscaped) + // 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state + case .ScriptDataDoubleEscapeStart: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ", "/", ">": + if self.temporaryBuffer == "script" { + return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!) + } else { + return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!) + } + case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!): + self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20))) + return emitCharacter(currentInputCharacter!) + case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!): + self.temporaryBuffer.append(c) + return emitCharacter(currentInputCharacter!) 
+ default: + return reconsume(currentInputCharacter, in: .ScriptDataEscaped) + } + // 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state + case .ScriptDataDoubleEscaped: + switch currentInputCharacter { + case "-": + return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDash, character: "-") + case "<": + return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<") // The "<" must be emitted here; the less-than sign state never re-emits it. + case "\0": + // FIXME: log_parse_error() + return emitCharacter("\u{FFFD}") + case nil: + // FIXME: log_parse_error() + return emitEOF() + default: + return emitCharacter(currentInputCharacter!) + } + // 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state + case .ScriptDataDoubleEscapedDash: + switch currentInputCharacter { + case "-": + return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDashDash, character: "-") + case "<": + return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<") // Same as the state above: emit the "<" while switching, matching the dash dash state. + case "\0": + // FIXME: log_parse_error() + return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}") + case nil: + // FIXME: log_parse_error() + return emitEOF() + default: + return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!) 
+ } + // 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state + case .ScriptDataDoubleEscapedDashDash: + switch currentInputCharacter { + case "-": + return emitCharacter("-") + case "<": + return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<") + case ">": + return switchToAndEmitCharacter(.ScriptData, character: ">") + case "\0": + // FIXME: log_parse_error() + return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}") + case nil: + // FIXME: log_parse_error() + return emitEOF() + default: + return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!) + } + // 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state + case .ScriptDataDoubleEscapedLessThanSign: + switch currentInputCharacter { + case "/": + self.temporaryBuffer = "" + return switchToAndEmitCharacter(.ScriptDataDoubleEscapeEnd, character: "/") + default: + return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped) + } + // 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state + case .ScriptDataDoubleEscapeEnd: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ", "/", ">": + if self.temporaryBuffer == "script" { + return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!) + } else { + return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!) + } + case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!): + self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20))) + return emitCharacter(currentInputCharacter!) + case let c? 
where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!): + self.temporaryBuffer.append(c) + return emitCharacter(currentInputCharacter!) + default: + return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped) + } + // 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state + case .BeforeAttributeName: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return continueInCurrentState() + case "/", ">", nil: + return reconsume(currentInputCharacter, in: .AfterAttributeName) + case "=": + // FIXME: log_parse_error() + self.currentBuilder = Swift.String(currentInputCharacter!) + self.currentAttribute = HTMLToken.Attribute(localName: "", value: "") + return switchTo(.AttributeName) + default: + self.currentAttribute = HTMLToken.Attribute(localName: "", value: "") + return reconsume(currentInputCharacter!, in: .AttributeName) + } + // 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state + case .AttributeName: + // FIXME: When the user agent leaves the attribute name state (and before emitting the tag token, if appropriate), + // the complete attribute's name must be compared to the other attributes on the same token; + // if there is already an attribute on the token with the exact same name, then this is a duplicate-attribute + // parse error and the new attribute must be removed from the token. + // NOTE: If an attribute is so removed from a token, it, and the value that gets associated with it, if any, + // are never subsequently used by the parser, and are therefore effectively discarded. Removing the attribute + // in this way does not change its status as the "current attribute" for the purposes of the tokenizer, however. 
+ switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ", "/", ">", nil: + // FIXME: set name position + self.currentAttribute!.localName = self.currentBuilder.takeString() + return reconsume(currentInputCharacter, in: .AfterAttributeName) + case "=": + // FIXME: set name position + self.currentAttribute!.localName = self.currentBuilder.takeString() + return switchTo(.BeforeAttributeValue) + case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!): + self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20))) + return continueInCurrentState() + case "\0": + // FIXME: log_parse_error() + self.currentBuilder.append("\u{FFFD}") + return continueInCurrentState() + default: + self.currentBuilder.append(currentInputCharacter!) + return continueInCurrentState() + } + // 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state + case .AfterAttributeName: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return continueInCurrentState() + case "/": + self.finalizeCurrentAttribute(.SetName) + return switchTo(.SelfClosingStartTag) + case "=": + self.finalizeCurrentAttribute(.SetName) + return switchTo(.BeforeAttributeValue) + case ">": + self.finalizeCurrentAttribute(.SetName) + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + self.finalizeCurrentAttribute(.IgnoreString) + return emitEOF() + default: + self.finalizeCurrentAttribute(.SetName) + self.currentAttribute = HTMLToken.Attribute(localName: "", value: "") + return reconsume(currentInputCharacter!, in: .AttributeName) + } + // 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state + case .BeforeAttributeValue: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return continueInCurrentState() + case "\"": + return switchTo(.AttributeValueDoubleQuoted) + case "'": + 
return switchTo(.AttributeValueSingleQuoted) + case ">": + // FIXME: log_parse_error() + self.finalizeCurrentAttribute(.IgnoreString) + return switchToAndEmitCurrentToken(.Data) + default: + return reconsume(currentInputCharacter, in: .AttributeValueUnquoted) + } + // 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state + case .AttributeValueDoubleQuoted: + switch currentInputCharacter { + case "\"": + return switchTo(.AfterAttributeValueQuoted) + case "&": + self.returnState = .AttributeValueDoubleQuoted + return switchTo(.CharacterReference) + case "\0": + // FIXME: log_parse_error() + self.currentBuilder.append("\u{FFFD}") + return continueInCurrentState() + case nil: + // FIXME: log_parse_error() + self.finalizeCurrentAttribute(.IgnoreString) + return emitEOF() + default: + self.currentBuilder.append(currentInputCharacter!) + return continueInCurrentState() + } + // 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state + case .AttributeValueSingleQuoted: + switch currentInputCharacter { + case "'": + return switchTo(.AfterAttributeValueQuoted) + case "&": + self.returnState = .AttributeValueSingleQuoted + return switchTo(.CharacterReference) + case "\0": + // FIXME: log_parse_error() + self.currentBuilder.append("\u{FFFD}") + return continueInCurrentState() + case nil: + // FIXME: log_parse_error() + return emitEOF() + default: + self.currentBuilder.append(currentInputCharacter!) 
+ return continueInCurrentState() + } + // 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state + case .AttributeValueUnquoted: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + self.finalizeCurrentAttribute(.SetValue) + return switchTo(.BeforeAttributeName) + case "&": + self.returnState = .AttributeValueUnquoted + return switchTo(.CharacterReference) + case ">": + self.finalizeCurrentAttribute(.SetValue) + return switchToAndEmitCurrentToken(.Data) + case "\0": + // FIXME: log_parse_error() + self.currentBuilder.append("\u{FFFD}") + return continueInCurrentState() + case "\"", "'", "<", "=", "`": + // FIXME: log_parse_error() + self.currentBuilder.append(currentInputCharacter!) + return continueInCurrentState() + case nil: + // FIXME: log_parse_error() + self.finalizeCurrentAttribute(.IgnoreString) + return emitEOF() + default: + self.currentBuilder.append(currentInputCharacter!) + return continueInCurrentState() + } + // 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state + case .AfterAttributeValueQuoted: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + self.finalizeCurrentAttribute(.SetValue) + return switchTo(.BeforeAttributeName) + case "/": + self.finalizeCurrentAttribute(.SetValue) + return switchTo(.SelfClosingStartTag) + case ">": + self.finalizeCurrentAttribute(.SetValue) + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + self.finalizeCurrentAttribute(.IgnoreString) + return emitEOF() + default: + // FIXME: log_parse_error() + self.finalizeCurrentAttribute(.SetValue) + return reconsume(currentInputCharacter!, in: .BeforeAttributeName) + } default: print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))") return emitEOF()