/* * Copyright (c) 2020, Andreas Kling * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * 2. Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ //#define PARSER_DEBUG #include #include #include #include #include #include #include #include #include #include #include #include #define PARSE_ERROR() \ do { \ dbg() << "Parse error! " << __PRETTY_FUNCTION__ << " @ " << __LINE__; \ } while (0) namespace Web { RefPtr parse_html_document(const StringView& data, const URL& url, const String& encoding) { HTMLDocumentParser parser(data, encoding); parser.run(url); return parser.document(); } HTMLDocumentParser::HTMLDocumentParser(const StringView& input, const String& encoding) : m_tokenizer(input, encoding) { } HTMLDocumentParser::~HTMLDocumentParser() { } void HTMLDocumentParser::run(const URL& url) { m_document = adopt(*new Document); m_document->set_url(url); m_document->set_source(m_tokenizer.source()); for (;;) { auto optional_token = m_tokenizer.next_token(); if (!optional_token.has_value()) break; auto& token = optional_token.value(); #ifdef PARSER_DEBUG dbg() << "[" << insertion_mode_name() << "] " << token.to_string(); #endif process_using_the_rules_for(m_insertion_mode, token); if (m_stop_parsing) { dbg() << "Stop parsing! :^)"; break; } } flush_character_insertions(); // "The end" auto scripts_to_execute_when_parsing_has_finished = m_document->take_scripts_to_execute_when_parsing_has_finished({}); for (auto& script : scripts_to_execute_when_parsing_has_finished) { script.execute_script(); } m_document->dispatch_event(Event::create("DOMContentLoaded")); auto scripts_to_execute_as_soon_as_possible = m_document->take_scripts_to_execute_as_soon_as_possible({}); for (auto& script : scripts_to_execute_as_soon_as_possible) { script.execute_script(); } } void HTMLDocumentParser::process_using_the_rules_for(InsertionMode mode, HTMLToken& token) { switch (mode) { case InsertionMode::Initial: handle_initial(token); break; case InsertionMode::BeforeHTML: handle_before_html(token); break; case InsertionMode::BeforeHead: handle_before_head(token); break; case InsertionMode::InHead: handle_in_head(token); break; case InsertionMode::InHeadNoscript: handle_in_head_noscript(token); break; case InsertionMode::AfterHead: handle_after_head(token); break; case InsertionMode::InBody: handle_in_body(token); break; case InsertionMode::AfterBody: handle_after_body(token); break; case InsertionMode::AfterAfterBody: handle_after_after_body(token); break; case InsertionMode::Text: handle_text(token); break; case InsertionMode::InTable: handle_in_table(token); break; case InsertionMode::InTableBody: handle_in_table_body(token); break; case InsertionMode::InRow: handle_in_row(token); break; case InsertionMode::InCell: handle_in_cell(token); break; case InsertionMode::InTableText: handle_in_table_text(token); break; case InsertionMode::InSelectInTable: handle_in_select_in_table(token); break; case InsertionMode::InSelect: handle_in_select(token); break; case InsertionMode::InCaption: handle_in_caption(token); break; case InsertionMode::InColumnGroup: handle_in_column_group(token); break; case InsertionMode::InTemplate: handle_in_template(token); break; case InsertionMode::InFrameset: handle_in_frameset(token); break; case InsertionMode::AfterFrameset: handle_after_frameset(token); break; case InsertionMode::AfterAfterFrameset: handle_after_after_frameset(token); break; default: ASSERT_NOT_REACHED(); } } void HTMLDocumentParser::handle_initial(HTMLToken& token) { if (token.is_character() && token.is_parser_whitespace()) { return; } if (token.is_comment()) { auto comment = adopt(*new Comment(document(), token.m_comment_or_character.data.to_string())); document().append_child(move(comment)); return; } if (token.is_doctype()) { auto doctype = adopt(*new DocumentType(document())); doctype->set_name(token.m_doctype.name.to_string()); document().append_child(move(doctype)); document().set_quirks_mode(token.m_doctype.force_quirks); m_insertion_mode = InsertionMode::BeforeHTML; return; } PARSE_ERROR(); document().set_quirks_mode(true); m_insertion_mode = InsertionMode::BeforeHTML; process_using_the_rules_for(InsertionMode::BeforeHTML, token); } void HTMLDocumentParser::handle_before_html(HTMLToken& token) { if (token.is_doctype()) { PARSE_ERROR(); return; } if (token.is_comment()) { auto comment = adopt(*new Comment(document(), token.m_comment_or_character.data.to_string())); document().append_child(move(comment)); return; } if (token.is_character() && token.is_parser_whitespace()) { return; } if (token.is_start_tag() && token.tag_name() == HTML::TagNames::html) { auto element = create_element_for(token); document().append_child(element); m_stack_of_open_elements.push(move(element)); m_insertion_mode = InsertionMode::BeforeHead; return; } if (token.is_end_tag() && token.tag_name().is_one_of(HTML::TagNames::head, HTML::TagNames::body, HTML::TagNames::html, HTML::TagNames::br)) { goto AnythingElse; } if (token.is_end_tag()) { PARSE_ERROR(); return; } AnythingElse: auto element = create_element(document(), HTML::TagNames::html); document().append_child(element); m_stack_of_open_elements.push(element); // FIXME: If the Document is being loaded as part of navigation of a browsing context, then: run the application cache selection algorithm with no manifest, passing it the Document object. m_insertion_mode = InsertionMode::BeforeHead; process_using_the_rules_for(InsertionMode::BeforeHead, token); return; } Element& HTMLDocumentParser::current_node() { return m_stack_of_open_elements.current_node(); } Element& HTMLDocumentParser::node_before_current_node() { return m_stack_of_open_elements.elements().at(m_stack_of_open_elements.elements().size() - 2); } HTMLDocumentParser::AdjustedInsertionLocation HTMLDocumentParser::find_appropriate_place_for_inserting_node() { auto& target = current_node(); if (m_foster_parenting && target.tag_name().is_one_of(HTML::TagNames::table, HTML::TagNames::tbody, HTML::TagNames::tfoot, HTML::TagNames::thead, HTML::TagNames::tr)) { // FIXME: There's a bunch of steps for