2024-10-29 09:19:14 -04:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2024-12-15 15:39:03 -05:00
|
|
|
from collections import namedtuple
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from enum import Enum
|
2024-11-01 03:09:06 -04:00
|
|
|
from html.parser import HTMLParser
|
2024-10-29 09:19:14 -04:00
|
|
|
from pathlib import Path
|
2024-11-22 19:53:20 -05:00
|
|
|
from urllib.parse import urljoin, urlparse
|
2024-10-29 09:19:14 -04:00
|
|
|
from urllib.request import urlopen
|
2024-11-06 07:32:57 -05:00
|
|
|
import re
|
2024-12-15 15:39:03 -05:00
|
|
|
import os
|
|
|
|
import sys
|
2024-10-29 09:19:14 -04:00
|
|
|
|
|
|
|
# Base URL of the live web-platform-tests mirror that all test files are fetched from.
wpt_base_url = 'https://wpt.live/'
|
2024-11-05 09:17:38 -05:00
|
|
|
|
|
|
|
|
|
|
|
class TestType(Enum):
    """The kinds of WPT tests this script can import.

    Each member carries the repository directories that the test's input and
    expected files are written into. Crash tests have no expected directory.
    """

    TEXT = 1, 'Tests/LibWeb/Text/input/wpt-import', 'Tests/LibWeb/Text/expected/wpt-import'
    REF = 2, 'Tests/LibWeb/Ref/input/wpt-import', 'Tests/LibWeb/Ref/expected/wpt-import'
    CRASH = 3, 'Tests/LibWeb/Crash/wpt-import', ''

    def __new__(cls, *args, **kwds):
        # Only the first element of the member tuple acts as the enum value;
        # the remaining elements are consumed by __init__ below.
        member = object.__new__(cls)
        member._value_ = args[0]
        return member

    def __init__(self, _: str, input_path: str, expected_path: str):
        # Attach the directory paths from the member tuple as plain attributes.
        self.input_path = input_path
        self.expected_path = expected_path
|
|
|
|
|
|
|
|
|
2024-10-29 09:19:14 -04:00
|
|
|
# Pairs a remote source URL with the local destination path it downloads to.
PathMapping = namedtuple('PathMapping', 'source destination')
|
|
|
|
|
2024-12-15 15:39:03 -05:00
|
|
|
|
|
|
|
class ResourceType(Enum):
    """Which side of a test a resource belongs to: the input page or its expected/reference page."""
    INPUT = 1
    EXPECTED = 2
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class ResourceAndType:
    """A resource referenced by a test page, tagged with which side of the test it belongs to."""
    # URL or path of the resource exactly as written in the page source.
    resource: str
    # ResourceType.INPUT for resources of the test page itself,
    # ResourceType.EXPECTED for resources of the reference page.
    type: ResourceType
|
|
|
|
|
|
|
|
|
2024-11-05 09:17:38 -05:00
|
|
|
# Module-level state shared between the helpers below; overwritten by main()
# once the imported page has been identified.
test_type = TestType.TEXT
raw_reference_path = None  # As specified in the test HTML
reference_path = None  # With parent directories
|
2024-10-29 09:19:14 -04:00
|
|
|
|
|
|
|
|
2024-11-06 07:32:57 -05:00
|
|
|
class LinkedResourceFinder(HTMLParser):
    """Collects the URLs of resources a test page links to.

    Resources are gathered from <script src=...>, <img src=...>,
    <link rel="stylesheet" href=...>, and url()/@import references inside
    inline <style> blocks. Collected URLs are available via `resources`.
    """

    def __init__(self):
        super().__init__()
        self._tag_stack_ = []
        self._match_css_url_ = re.compile(r"url\(\"?(?P<url>[^\")]+)\"?\)")
        self._match_css_import_string_ = re.compile(r"@import\s+\"(?P<url>[^\")]+)\"")
        self._resources = []

    @property
    def resources(self):
        return self._resources

    def handle_starttag(self, tag, attrs):
        self._tag_stack_.append(tag)
        attr_dict = dict(attrs)
        if tag in ["script", "img"] and "src" in attr_dict:
            self._resources.append(attr_dict["src"])
        # Guard with .get()/membership: a stylesheet link without an href
        # previously raised KeyError.
        if tag == "link" and attr_dict.get("rel") == "stylesheet" and "href" in attr_dict:
            self._resources.append(attr_dict["href"])

    def handle_endtag(self, tag):
        # Malformed HTML can produce an end tag with no matching start tag;
        # popping an empty stack previously raised IndexError.
        if self._tag_stack_:
            self._tag_stack_.pop()

    def handle_data(self, data):
        if self._tag_stack_ and self._tag_stack_[-1] == "style":
            # Look for uses of url()
            url_iterator = self._match_css_url_.finditer(data)
            for match in url_iterator:
                self._resources.append(match.group("url"))
            # Look for @imports that use plain strings - we already found the url() ones
            import_iterator = self._match_css_import_string_.finditer(data)
            for match in import_iterator:
                self._resources.append(match.group("url"))
|
2024-10-29 09:19:14 -04:00
|
|
|
|
|
|
|
|
2024-11-05 09:17:38 -05:00
|
|
|
class TestTypeIdentifier(HTMLParser):
    """Identifies what kind of test the page is, and stores it in self.test_type

    For reference tests, the URL of the reference page is saved as self.reference_path
    """

    def __init__(self, url):
        super().__init__()
        self.url = url
        self.test_type = TestType.TEXT
        self.reference_path = None
        self.ref_test_link_found = False

    def handle_starttag(self, tag, attrs):
        # Only <link rel="match"> / <link rel="mismatch"> are of interest here.
        if tag != "link":
            return
        attributes = dict(attrs)
        if attributes.get("rel") not in ("match", "mismatch"):
            return
        if self.ref_test_link_found:
            raise RuntimeError("Ref tests with multiple match or mismatch links are not currently supported")
        self.test_type = TestType.REF
        self.reference_path = attributes["href"]
        self.ref_test_link_found = True
|
2024-11-05 09:17:38 -05:00
|
|
|
|
|
|
|
|
2024-12-15 15:39:03 -05:00
|
|
|
def map_to_path(sources: list[ResourceAndType], is_resource=True, resource_path=None) -> list[PathMapping]:
    """Maps each source resource to a (remote URL, local destination path) pair.

    Resources starting with '/' (and non-resources) are placed relative to the
    test type's base directory; relative resources are placed next to the file
    referenced by resource_path.
    """
    mappings: list[PathMapping] = []

    for entry in sources:
        if entry.type == ResourceType.INPUT:
            base_directory = test_type.input_path
        else:
            base_directory = test_type.expected_path

        if not is_resource or entry.resource.startswith('/'):
            local_path = Path(base_directory, entry.resource.lstrip('/'))
        else:
            # A relative resource lives alongside the file that referenced it.
            sibling_directory = Path(resource_path).parent
            local_path = Path(base_directory, sibling_directory, entry.resource)

        # Strip the local base directory back off to recover the remote URL.
        remote_url = wpt_base_url + str(local_path).replace(base_directory, '')
        mappings.append(PathMapping(remote_url, local_path.absolute()))

    return mappings
|
|
|
|
|
|
|
|
|
2024-11-22 19:53:20 -05:00
|
|
|
def is_crash_test(url_string):
    """Returns True if the given WPT URL points at a crash test.

    https://web-platform-tests.org/writing-tests/crashtest.html
    A test file is treated as a crash test if they have -crash in their name before the file extension, or they are
    located in a folder named crashtests
    """
    parsed_url = urlparse(url_string)
    path_segments = parsed_url.path.strip('/').split('/')
    # Membership doesn't depend on order, so there is no need to reverse the segments.
    if len(path_segments) > 1 and "crashtests" in path_segments:
        return True
    file_name = path_segments[-1]
    file_name_parts = file_name.split('.')
    # any() accepts a generator directly; no intermediate list needed.
    if len(file_name_parts) > 1 and any(part.endswith('-crash') for part in file_name_parts[:-1]):
        return True
    return False
|
|
|
|
|
|
|
|
|
2024-12-15 15:39:03 -05:00
|
|
|
def modify_sources(files, resources: list[ResourceAndType]) -> None:
    """Rewrites downloaded test files in place so absolute resource URLs become relative.

    For each file, absolute resource references ('/...') are replaced with
    paths relative to the file's depth below the wpt-import directory, and
    references to the reference page (if any) are redirected to the local
    expected/wpt-import copy.
    """
    for file in files:
        # Get the distance to the wpt-imports folder
        folder_index = str(file).find(test_type.input_path)
        if folder_index == -1:
            folder_index = str(file).find(test_type.expected_path)
            non_prefixed_path = str(file)[folder_index + len(test_type.expected_path):]
        else:
            non_prefixed_path = str(file)[folder_index + len(test_type.input_path):]

        parent_folder_count = len(Path(non_prefixed_path).parent.parts) - 1
        parent_folder_path = '../' * parent_folder_count

        with open(file, 'r') as f:
            page_source = f.read()

        # Iterate all scripts and overwrite the src attribute
        # (plain loop over the resource strings; the index was unused).
        for resource in (r.resource for r in resources):
            if resource.startswith('/'):
                new_src_value = parent_folder_path + resource[1:]
                page_source = page_source.replace(resource, new_src_value)

        # Look for mentions of the reference page, and update their href
        if raw_reference_path is not None:
            new_reference_path = parent_folder_path + '../../expected/wpt-import/' + reference_path
            page_source = page_source.replace(raw_reference_path, new_reference_path)

        with open(file, 'w') as f:
            f.write(page_source)
|
2024-10-29 09:19:14 -04:00
|
|
|
|
|
|
|
|
|
|
|
def download_files(filepaths):
    """Downloads each mapping's source URL to its destination path.

    Existing destination files are skipped. Returns the list of destination
    paths that were actually written.
    """
    downloaded_files = []

    for file in filepaths:
        # Re-resolve the path portion against the host, which also collapses
        # any duplicate slashes introduced while building the URL.
        source = urljoin(file.source, "/".join(file.source.split('/')[3:]))
        destination = Path(os.path.normpath(file.destination))

        if destination.exists():
            print(f"Skipping {destination} as it already exists")
            continue

        print(f"Downloading {source} to {destination}")

        # Use a context manager so the connection is always closed;
        # previously it leaked, including on the non-200 early-continue path.
        with urlopen(source) as connection:
            if connection.status != 200:
                print(f"Failed to download {file.source}")
                continue

            os.makedirs(destination.parent, exist_ok=True)

            with open(destination, 'wb') as f:
                f.write(connection.read())

        downloaded_files.append(destination)

    return downloaded_files
|
|
|
|
|
|
|
|
|
|
|
|
def create_expectation_files(files):
    """Creates an empty .txt expectation file alongside each downloaded text test."""
    # Ref and crash tests have no expectation text file.
    if test_type in [TestType.REF, TestType.CRASH]:
        return

    for mapping in files:
        expectation = str(mapping.destination).replace(test_type.input_path, test_type.expected_path)
        expectation = expectation.rsplit(".", 1)[0] + '.txt'

        expected_file = Path(expectation)
        if expected_file.exists():
            print(f"Skipping {expected_file} as it already exists")
            continue

        os.makedirs(expected_file.parent, exist_ok=True)
        expected_file.touch()
|
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Imports a WPT test given on the command line, along with its linked resources.

    Downloads the test page (and, for ref tests, its reference page), rewrites
    resource URLs to local relative paths, creates expectation files for text
    tests, and downloads every linked resource.
    """
    if len(sys.argv) != 2:
        print("Usage: import-wpt-test.py <url>")
        return

    url_to_import = sys.argv[1]
    # Drop the scheme and host parts of the URL, keeping only the test's path.
    resource_path = '/'.join(Path(url_to_import).parts[2::])

    with urlopen(url_to_import) as response:
        page = response.read().decode("utf-8")

    global test_type, reference_path, raw_reference_path
    if is_crash_test(url_to_import):
        test_type = TestType.CRASH
    else:
        identifier = TestTypeIdentifier(url_to_import)
        identifier.feed(page)
        test_type = identifier.test_type
        raw_reference_path = identifier.reference_path

    print(f"Identified {url_to_import} as type {test_type}, ref {raw_reference_path}")

    main_file = [ResourceAndType(resource_path, ResourceType.INPUT)]
    main_paths = map_to_path(main_file, False)

    if test_type == TestType.REF and raw_reference_path is None:
        # Message fixed: previously read 'Failed to file reference path'.
        raise RuntimeError('Failed to find reference path in ref test')

    if raw_reference_path is not None:
        if raw_reference_path.startswith('/'):
            reference_path = raw_reference_path
            main_paths.append(PathMapping(
                wpt_base_url + raw_reference_path,
                Path(test_type.expected_path + raw_reference_path).absolute()
            ))
        else:
            # Relative reference pages live next to the test file itself.
            reference_path = str(Path(resource_path).parent.joinpath(raw_reference_path))
            main_paths.append(PathMapping(
                wpt_base_url + '/' + reference_path,
                Path(test_type.expected_path + '/' + reference_path).absolute()
            ))

    files_to_modify = download_files(main_paths)
    create_expectation_files(main_paths)

    input_parser = LinkedResourceFinder()
    input_parser.feed(page)
    additional_resources = list(map(lambda s: ResourceAndType(s, ResourceType.INPUT), input_parser.resources))

    # Reference pages (everything after the main file) may link resources too.
    expected_parser = LinkedResourceFinder()
    for path in main_paths[1:]:
        with urlopen(path.source) as response:
            page = response.read().decode("utf-8")
            expected_parser.feed(page)
    additional_resources.extend(
        list(map(lambda s: ResourceAndType(s, ResourceType.EXPECTED), expected_parser.resources))
    )

    modify_sources(files_to_modify, additional_resources)
    script_paths = map_to_path(additional_resources, True, resource_path)
    download_files(script_paths)
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point.
if __name__ == "__main__":
    main()
|