mirror of
https://github.com/SerenityOS/serenity.git
synced 2025-01-23 09:51:57 -05:00
LibHTML: Start working on a very simplified HTML parser.
This commit is contained in:
parent
a67e823838
commit
581d6b00c8
4 changed files with 139 additions and 22 deletions
15
Base/home/anon/lorem.html
Normal file
15
Base/home/anon/lorem.html
Normal file
|
@ -0,0 +1,15 @@
|
|||
<html>
|
||||
<head><title>Lorem Ipsum</title></head>
|
||||
<body>
|
||||
<h1>Lorem Ipsum</h1>
|
||||
<p>Lorem ipsum dolor sit amet, consectetur adipiscing elit. In non elit dignissim, lobortis velit id, rutrum enim. Fusce urna nulla, semper in nisl consectetur, dictum dignissim felis. Vivamus mollis porttitor neque non pulvinar. Donec sollicitudin pulvinar nisi, nec vestibulum massa rutrum id. Aenean convallis tincidunt diam vel egestas. Pellentesque laoreet commodo arcu id dignissim. Etiam mattis elementum lectus, ut ultricies nibh dapibus sit amet. Curabitur sodales cursus ipsum vitae porttitor. Vestibulum ac nulla auctor, imperdiet augue accumsan, ornare eros.</p>
|
||||
|
||||
<p>Proin vel orci lobortis, ultrices nunc non, placerat odio. Proin nec nibh et odio pellentesque lobortis. Donec id urna ac sapien commodo facilisis in quis magna. Ut tempus aliquet elit, ut semper ante accumsan ornare. Morbi ac egestas quam. Pellentesque ut convallis metus, sit amet dignissim turpis. Sed feugiat hendrerit nibh, id tincidunt tortor euismod et. In vel fringilla ante. Etiam volutpat risus egestas congue sollicitudin.</p>
|
||||
|
||||
<p>Sed libero urna, fermentum quis leo at, lacinia suscipit ipsum. Vivamus in dignissim nibh. Proin ultricies sapien quis tortor luctus vehicula. Morbi ut consequat ipsum. Morbi imperdiet lectus libero, at tristique erat scelerisque sed. Duis eu risus at lectus vehicula facilisis. In tempor felis a nulla imperdiet volutpat. Quisque at auctor libero. Nunc ornare eros eget libero faucibus, vehicula ullamcorper erat laoreet. Aliquam dignissim eget est et aliquam. Phasellus imperdiet tincidunt mi, vitae viverra enim elementum a. Nullam pellentesque odio eu mauris bibendum tempor.</p>
|
||||
|
||||
<p>Sed mattis, elit eu pulvinar sagittis, ipsum enim interdum nisl, eu ornare augue orci at enim. Sed cursus, dolor in vestibulum maximus, mauris magna bibendum enim, in fringilla mauris metus vel nunc. Cras in quam mi. Nullam aliquam velit mauris, quis aliquet nulla pretium auctor. Donec non lobortis tellus. Nunc sodales libero id libero ultricies cursus. Cras ipsum nibh, dictum eu augue fermentum, blandit bibendum odio. Pellentesque tincidunt hendrerit aliquam. Donec sit amet justo vel magna pretium lobortis tempus vitae lorem. Maecenas quam purus, scelerisque dapibus lectus at, mattis tempus enim. Suspendisse ac ante turpis. Suspendisse aliquet, velit at hendrerit elementum, risus tortor accumsan est, quis luctus nisl sapien sed risus. Donec cursus ex diam, nec iaculis urna bibendum eget. Cras neque lacus, ornare eget elit eu, fringilla vestibulum velit. Phasellus lacinia condimentum enim accumsan aliquam. Nulla finibus ex elit, id semper erat posuere suscipit.</p>
|
||||
|
||||
<p>Integer at libero purus. Maecenas eu cursus nunc, vitae pellentesque sapien. Mauris auctor condimentum massa. Sed pharetra nibh varius leo rutrum, vel auctor tortor venenatis. Donec tincidunt tempus libero vel iaculis. Nam pretium non augue et pretium. Nunc dignissim tortor venenatis, blandit sem ac, mattis dolor. Donec et lacinia nunc. Vestibulum enim eros, aliquam pulvinar cursus ornare, volutpat eu mi. Quisque semper mi id metus elementum malesuada. Suspendisse nisl felis, pretium id consectetur quis, lacinia sit amet est.</p>
|
||||
</body>
|
||||
</html>
|
6
Base/home/anon/small.html
Normal file
6
Base/home/anon/small.html
Normal file
|
@ -0,0 +1,6 @@
|
|||
<html>
|
||||
<head><title>Small test page</title></head>
|
||||
<body>
|
||||
<p>This is a <b>very small</b> test page :^)</p>
|
||||
</body>
|
||||
</html>
|
|
@ -1,32 +1,120 @@
|
|||
#include <LibHTML/Element.h>
|
||||
#include <LibHTML/Parser.h>
|
||||
#include <LibHTML/Text.h>
|
||||
#include <ctype.h>
|
||||
|
||||
// Heap-allocates an Element named `tag_name` and wraps it with adopt() to
// produce the Retained<Element> handed back to the caller.
static Retained<Element> create_element(const String& tag_name)
{
    auto* raw_element = new Element(tag_name);
    return adopt(*raw_element);
}
|
||||
|
||||
// NOTE(review): this span is a rendered diff hunk without +/- markers. The
// `parse` signature directly below and the statements from `auto doc = ...`
// through the first `return doc;` look like the removed placeholder parse()
// that built a fixed head/title/body/h1 tree; they are interleaved with the
// newly added is_self_closing_tag(). Confirm against the commit before
// treating either side as live code.
Retained<Document> parse(const String& html)
|
||||
// Returns true when `tag_name` equals one of the fourteen element names
// compared below (the elements HTML treats as void, i.e. written without a
// closing tag). The parser's open_tag helper uses this to pop such an element
// right after pushing it.
// NOTE(review): the match relies on String's operator== against literals, so
// it is presumably case-sensitive ("BR" would not match) -- verify.
static bool is_self_closing_tag(const String& tag_name)
|
||||
{
|
||||
auto doc = adopt(*new Document);
|
||||
|
||||
auto head = create_element("head");
|
||||
auto title = create_element("title");
|
||||
auto title_text = adopt(*new Text("Page Title"));
|
||||
title->append_child(title_text);
|
||||
head->append_child(title);
|
||||
|
||||
doc->append_child(head);
|
||||
|
||||
auto body = create_element("body");
|
||||
auto h1 = create_element("h1");
|
||||
auto h1_text = adopt(*new Text("Hello World!"));
|
||||
|
||||
h1->append_child(h1_text);
|
||||
body->append_child(h1);
|
||||
doc->append_child(body);
|
||||
|
||||
return doc;
|
||||
// Body of is_self_closing_tag(): a plain chain of equality tests.
return tag_name == "area"
|
||||
|| tag_name == "base"
|
||||
|| tag_name == "br"
|
||||
|| tag_name == "col"
|
||||
|| tag_name == "embed"
|
||||
|| tag_name == "hr"
|
||||
|| tag_name == "img"
|
||||
|| tag_name == "input"
|
||||
|| tag_name == "link"
|
||||
|| tag_name == "meta"
|
||||
|| tag_name == "param"
|
||||
|| tag_name == "source"
|
||||
|| tag_name == "track"
|
||||
|| tag_name == "wbr";
|
||||
}
|
||||
|
||||
// Parses `html` into a Document tree with a small hand-written state machine.
//
// - Text outside of tags is collected and appended as a Text node to the
//   innermost open node.
// - `<name>` creates an Element, appends it to the innermost open node and
//   makes it the new innermost node; elements for which
//   is_self_closing_tag() is true are popped again immediately.
// - `</...>` pops the innermost open element. The name after the slash is not
//   compared with the element being closed, and the Document at the bottom of
//   the stack is never popped.
//
// Known limitation: only Free, BeforeTagName and InTagName are handled. A
// space inside a tag name switches to InAttributeList, which has no handler
// yet, so every following character is ignored and the tag is never opened.
Retained<Document> parse(const String& html)
{
    // Currently open nodes, outermost first; the Document stays at index 0.
    Vector<Retained<ParentNode>> node_stack;

    auto doc = adopt(*new Document);
    node_stack.append(doc);

    enum class State {
        Free,
        BeforeTagName,
        InTagName,
        InAttributeList,
        InAttributeName,
        InAttributeValueNoQuote,
        InAttributeValueSingleQuote,
        InAttributeValueDoubleQuote,
    };

    auto state = State::Free;

    // Accumulates either text content (Free) or a tag name (InTagName).
    Vector<char, 256> buffer;

    // Set when the tag being read started with "</".
    bool is_slash_tag = false;

    // Switches state. When leaving Free with pending text, that text is
    // emitted as a Text node first. The buffer is always cleared.
    auto move_to_state = [&](State new_state) {
        if (new_state == State::BeforeTagName)
            is_slash_tag = false;
        if (state == State::Free && !buffer.is_empty()) {
            auto text_node = adopt(*new Text(String::copy(buffer)));
            node_stack.last()->append_child(text_node);
        }
        state = new_state;
        buffer.clear();
    };

    // Pops the innermost open element, but never the Document itself.
    auto close_tag = [&] {
        if (node_stack.size() > 1)
            node_stack.take_last();
    };

    // Creates an element named after the buffered tag name, attaches it to
    // the previously innermost node and pushes it onto the stack.
    auto open_tag = [&] {
        auto new_element = create_element(String::copy(buffer));
        node_stack.append(new_element);
        if (node_stack.size() != 1)
            node_stack[node_stack.size() - 2]->append_child(new_element);

        if (is_self_closing_tag(new_element->tag_name()))
            close_tag();
    };

    for (int i = 0; i < html.length(); ++i) {
        char ch = html[i];
        switch (state) {
        case State::Free:
            if (ch == '<') {
                move_to_state(State::BeforeTagName);
                break;
            }
            buffer.append(ch);
            break;
        case State::BeforeTagName:
            if (ch == '/') {
                is_slash_tag = true;
                break;
            }
            if (ch == '>') {
                move_to_state(State::Free);
                break;
            }
            // Skip anything that cannot start a tag name (whitespace, digits,
            // punctuation, non-ASCII bytes). isascii() accepted nearly every
            // byte here, so "< p>" produced an empty tag name. The cast keeps
            // isalpha() well-defined for bytes >= 0x80 when char is signed.
            if (!isalpha(static_cast<unsigned char>(ch)))
                break;
            move_to_state(State::InTagName);
            [[fallthrough]];
        case State::InTagName:
            if (ch == ' ') {
                move_to_state(State::InAttributeList);
                break;
            }
            if (ch == '>') {
                if (is_slash_tag)
                    close_tag();
                else
                    open_tag();
                move_to_state(State::Free);
                break;
            }
            buffer.append(ch);
            break;
        default:
            // Attribute states are declared but not implemented yet; input
            // seen while in one of them is dropped.
            break;
        }
    }

    // Flush text that follows the last tag. Without this, trailing text (or
    // an input containing no tags at all) never became a Text node.
    if (state == State::Free)
        move_to_state(State::Free);

    return doc;
}
|
||||
|
|
|
@ -1,10 +1,18 @@
|
|||
#include <LibCore/CFile.h>
|
||||
#include <LibHTML/Dump.h>
|
||||
#include <LibHTML/Element.h>
|
||||
#include <LibHTML/Parser.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// NOTE(review): this span is a rendered diff hunk without +/- markers. The
// bare `int main()` signature and the hard-coded `String html = "<html>..."`
// line look like the removed side of the change; the argc/argv signature and
// the CFile-based loading look like the added side. Confirm against the commit.
//
// Test driver: opens the file named by argv[1], or "/home/anon/small.html"
// when no argument is given (argc == 1), feeds its contents to parse(), and
// passes the resulting document to dump_tree(). Returns 1 after printing the
// file's error string to stderr if the file cannot be opened read-only, and 0
// otherwise.
int main()
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
String html = "<html><head><title>my page</title></head><body><h1>Hi there</h1><p>Hello World!</p></body></html>";
|
||||
// No command-line argument: fall back to the bundled small test page.
CFile f(argc == 1 ? "/home/anon/small.html" : argv[1]);
|
||||
if (!f.open(CIODevice::ReadOnly)) {
|
||||
fprintf(stderr, "Error: %s\n", f.error_string());
|
||||
return 1;
|
||||
}
|
||||
// Copy what read_all() returns into a String to hand to the parser.
String html = String::copy(f.read_all());
|
||||
auto doc = parse(html);
|
||||
dump_tree(doc);
|
||||
return 0;
|
||||
}
|
||||
|
|
Loading…
Add table
Reference in a new issue