2021-09-10 21:37:42 +02:00
/*
* Copyright ( c ) 2021 , Ben Wiederhake < BenWiederhake . GitHub @ gmx . de >
*
* SPDX - License - Identifier : BSD - 2 - Clause
*/
/*
 * You may want to invoke the checker like this:
 * $ ninja -C Build/lagom
 * $ export SERENITY_SOURCE_DIR=/path/to/serenity
 * $ find AK Base Documentation Kernel Meta Ports Tests Userland -type f -name '*.md' -print0 | xargs -0 Build/lagom/bin/markdown-check README.md CONTRIBUTING.md
 */
# include <AK/Format.h>
# include <AK/HashMap.h>
# include <AK/HashTable.h>
# include <AK/LexicalPath.h>
2022-01-12 12:40:36 +01:00
# include <AK/RecursionDecision.h>
2022-02-25 13:44:52 +01:00
# include <AK/URL.h>
2021-09-10 21:37:42 +02:00
# include <AK/Vector.h>
2022-03-28 21:12:18 +02:00
# include <LibCore/ArgsParser.h>
2023-02-09 03:02:46 +01:00
# include <LibCore/File.h>
2023-03-22 02:35:30 +11:00
# include <LibFileSystem/FileSystem.h>
2022-03-28 20:59:25 +02:00
# include <LibMain/Main.h>
2023-07-02 13:20:23 +02:00
# include <LibManual/PageNode.h>
# include <LibManual/Path.h>
# include <LibManual/SectionNode.h>
2021-09-10 21:37:42 +02:00
# include <LibMarkdown/Document.h>
# include <LibMarkdown/Visitor.h>
2022-02-26 16:51:24 +01:00
# include <stdlib.h>
2021-09-10 21:37:42 +02:00
2023-05-20 01:35:18 +02:00
// Decides whether a link target may be reported as "fine" even though it is
// missing: returns true when `filename` ends with any of the allow-listed
// suffixes below, false otherwise.
static bool is_missing_file_acceptable(String const& filename)
{
    StringView const tolerated_suffixes[] = {
        // FIXME: Please write these manpages!
        " /usr/share/man/man2/exec.md "sv,
        " /usr/share/man/man2/fcntl.md "sv,
        " /usr/share/man/man2/fork.md "sv,
        " /usr/share/man/man2/ioctl.md "sv,
        " /usr/share/man/man2/listen.md "sv,
        " /usr/share/man/man2/mmap.md "sv,
        " /usr/share/man/man2/mprotect.md "sv,
        " /usr/share/man/man2/open.md "sv,
        " /usr/share/man/man2/ptrace.md "sv,
        " /usr/share/man/man5/perfcore.md "sv,
        // These ones are okay:
        " /home/anon/Tests/js-tests/test-common.js "sv,
        " /man1/index.html "sv,
        " /man2/index.html "sv,
        " /man3/index.html "sv,
        " /man4/index.html "sv,
        " /man5/index.html "sv,
        " /man6/index.html "sv,
        " /man7/index.html "sv,
        " /man8/index.html "sv,
        " index.html "sv,
    };

    bool is_tolerated = false;
    for (auto const& candidate : tolerated_suffixes) {
        if (!filename.ends_with_bytes(candidate))
            continue;
        // The first matching suffix settles the question.
        is_tolerated = true;
        break;
    }
    return is_tolerated;
}
2021-09-10 21:37:42 +02:00
// A single link collected from a Markdown document: the file it targets, the
// optional anchor inside that file, and the text the link was labelled with.
struct FileLink {
    // May be empty, but not null
    DeprecatedString file_path;
    // May be null ("foo.md", "bar.png"), may be empty ("baz.md#")
    DeprecatedString anchor;
    // May be empty, but not null
    DeprecatedString label;
};
class MarkdownLinkage final : Markdown : : Visitor {
public :
~ MarkdownLinkage ( ) = default ;
static MarkdownLinkage analyze ( Markdown : : Document const & ) ;
2022-12-04 18:02:33 +00:00
bool has_anchor ( DeprecatedString const & anchor ) const { return m_anchors . contains ( anchor ) ; }
HashTable < DeprecatedString > const & anchors ( ) const { return m_anchors ; }
2022-02-25 13:42:51 +01:00
bool has_invalid_link ( ) const { return m_has_invalid_link ; }
2021-09-10 21:37:42 +02:00
Vector < FileLink > const & file_links ( ) const { return m_file_links ; }
private :
2022-02-26 16:51:24 +01:00
MarkdownLinkage ( )
{
auto const * source_directory = getenv ( " SERENITY_SOURCE_DIR " ) ;
if ( source_directory ! = nullptr ) {
m_serenity_source_directory = source_directory ;
} else {
warnln ( " The environment variable SERENITY_SOURCE_DIR was not found. Link checking inside Serenity's filesystem will fail. " ) ;
}
}
2021-09-10 21:37:42 +02:00
virtual RecursionDecision visit ( Markdown : : Heading const & ) override ;
virtual RecursionDecision visit ( Markdown : : Text : : LinkNode const & ) override ;
2022-12-04 18:02:33 +00:00
HashTable < DeprecatedString > m_anchors ;
2021-09-10 21:37:42 +02:00
Vector < FileLink > m_file_links ;
2022-02-25 13:42:51 +01:00
bool m_has_invalid_link { false } ;
2022-02-26 16:51:24 +01:00
2022-12-04 18:02:33 +00:00
DeprecatedString m_serenity_source_directory ;
2021-09-10 21:37:42 +02:00
} ;
// Builds a MarkdownLinkage, lets the document walk it, and hands the
// populated visitor back to the caller.
MarkdownLinkage MarkdownLinkage::analyze(Markdown::Document const& document)
{
    MarkdownLinkage collected;
    document.walk(collected);
    return collected;
}
class StringCollector final : Markdown : : Visitor {
public :
StringCollector ( ) = default ;
virtual ~ StringCollector ( ) = default ;
2023-01-26 18:58:09 +00:00
DeprecatedString build ( ) { return m_builder . to_deprecated_string ( ) ; }
2021-09-10 21:37:42 +02:00
2022-12-04 18:02:33 +00:00
static DeprecatedString from ( Markdown : : Heading const & heading )
2021-09-10 21:37:42 +02:00
{
StringCollector collector ;
heading . walk ( collector ) ;
return collector . build ( ) ;
}
2022-12-04 18:02:33 +00:00
static DeprecatedString from ( Markdown : : Text : : Node const & node )
2021-09-10 21:37:42 +02:00
{
StringCollector collector ;
node . walk ( collector ) ;
return collector . build ( ) ;
}
private :
2022-12-04 18:02:33 +00:00
virtual RecursionDecision visit ( DeprecatedString const & text ) override
2021-09-10 21:37:42 +02:00
{
m_builder . append ( text ) ;
return RecursionDecision : : Recurse ;
}
StringBuilder m_builder ;
} ;
2022-12-04 18:02:33 +00:00
// Turns heading text into the anchor slug used for link checking: the text is
// lowercased and then each substitution in the table below is applied, in
// order, to every occurrence.
static DeprecatedString slugify(DeprecatedString const& text)
{
    // TODO: This feels like it belongs into LibWeb.
    struct Substitution {
        StringView needle;
        StringView replacement;
    };

    // Reverse-engineered through github, using:
    // find AK/ Base/ Documentation/ Kernel/ Meta/ Ports/ Tests/ Userland/ -name '*.md' | xargs grep --color=always -Pin '^##+ .*[^a-z0-9 ?()`_:/!&|.$'"'"',<>"+-]' README.md
    Substitution const substitutions[] = {
        { " "sv, " - "sv },
        { " ! "sv, " "sv },
        { " ? "sv, " "sv },
        { " ( "sv, " "sv },
        { " ) "sv, " "sv },
        { " : "sv, " "sv },
        { " / "sv, " - "sv },
        { " & "sv, " "sv },
        { " | "sv, " "sv },
        { " . "sv, " "sv },
        { " $ "sv, " "sv },
        { " ' "sv, " "sv },
        { " , "sv, " "sv },
        { " \" "sv, " "sv },
        { " + "sv, " "sv },
        { " \\ "sv, " "sv },
        { " < "sv, " "sv },
        { " > "sv, " "sv },
    };
    // What about "="?

    DeprecatedString slug = text.to_lowercase();
    for (auto const& substitution : substitutions)
        slug = slug.replace(substitution.needle, substitution.replacement, ReplaceMode::All);
    return slug;
}
// Records the slug of every heading as an available anchor of this document.
RecursionDecision MarkdownLinkage::visit(Markdown::Heading const& heading)
{
    auto heading_text = StringCollector::from(heading);
    m_anchors.set(slugify(heading_text));
    return RecursionDecision::Recurse;
}
// Classifies one link of the document.
// - Links with a null href are ignored.
// - http/https links are reported as unchecked.
// - help:// links are validated and recorded as a path under the source
//   directory's Base/usr/share/man tree.
// - file:// links are rejected when they look like manpage links, recorded when
//   they point at icons, checked for their label when they point at binaries,
//   and otherwise reported as unchecked.
// - Anything else is recorded as a file link, split at the last '#' into file
//   path and anchor.
// Always returns RecursionDecision::Recurse.
RecursionDecision MarkdownLinkage::visit(Markdown::Text::LinkNode const& link_node)
{
    DeprecatedString const& href = link_node.href;
    if (href.is_null()) {
        // Nothing to do here.
        return RecursionDecision::Recurse;
    }

    // Shared tail of every "this link is invalid" path.
    auto reject_link = [this] {
        m_has_invalid_link = true;
        return RecursionDecision::Recurse;
    };

    auto url = URL::create_with_url_or_path(href);
    if (url.is_valid()) {
        auto const& scheme = url.scheme();

        bool const is_web_link = scheme == " https " || scheme == " http ";
        if (is_web_link) {
            outln(" Not checking external link {} ", href);
            return RecursionDecision::Recurse;
        }

        if (scheme == " help ") {
            if (url.host() != " man ") {
                warnln(" help:// URL without 'man': {} ", href);
                return reject_link();
            }
            if (url.path_segment_count() < 2) {
                warnln(" help://man URL is missing section or page: {} ", href);
                return reject_link();
            }

            // Remove leading '/' from the path.
            auto manpage_file = DeprecatedString::formatted(" {}/Base/usr/share/man/man{}.md ", m_serenity_source_directory, url.serialize_path().substring(1));

            m_file_links.append({ manpage_file, DeprecatedString(), StringCollector::from(*link_node.text) });
            return RecursionDecision::Recurse;
        }

        if (scheme == " file ") {
            auto file_path = url.serialize_path();
            bool const looks_like_manpage = file_path.contains(" man "sv) && file_path.ends_with(" .md "sv);
            if (looks_like_manpage) {
                warnln(" Inter-manpage link without the help:// scheme: {} \n Please use help URLs of the form 'help://man/<section>/<subsection...>/<page>' ", href);
                return reject_link();
            }

            // TODO: Check more possible links other than icons.
            if (file_path.starts_with(" /res/icons/ "sv)) {
                auto icon_file = DeprecatedString::formatted(" {}/Base{} ", m_serenity_source_directory, file_path);
                m_file_links.append({ icon_file, DeprecatedString(), StringCollector::from(*link_node.text) });
            } else if (file_path.starts_with(" /bin "sv)) {
                StringBuilder html_builder;
                link_node.text->render_to_html(html_builder);
                auto rendered_label = html_builder.string_view();
                if (rendered_label != " Open "sv) {
                    warnln(" Binary link named '{}' is not allowed, binary links must be called 'Open'. Linked binary: {} ", rendered_label, href);
                    m_has_invalid_link = true;
                }
            } else {
                outln(" Not checking local link {} ", href);
            }
            return RecursionDecision::Recurse;
        }
    }

    // Everything that was not handled above is recorded as a file link.
    DeprecatedString label = StringCollector::from(*link_node.text);
    Optional<size_t> hash_index = href.find_last(' # ');
    if (!hash_index.has_value()) {
        m_file_links.append({ href, DeprecatedString(), label });
        return RecursionDecision::Recurse;
    }

    auto const split_at = hash_index.value();
    m_file_links.append({ href.substring(0, split_at), href.substring(split_at + 1), label });
    return RecursionDecision::Recurse;
}
2023-07-02 13:20:23 +02:00
// Renders `page_links` (page -> pages it links to) as the text of a directed
// graph: one labelled node statement per key page, followed by one edge
// statement per link whose target is itself a key of `page_links`.
// Returns the graph text, or the first error hit while building it.
static ErrorOr<String> generate_link_graph(HashMap<NonnullRefPtr<Manual::PageNode const>, Vector<NonnullRefPtr<Manual::PageNode const>>> const& page_links)
{
    StringBuilder graph;
    TRY(graph.try_append(" digraph manpage_links { \n "sv));

    // Not displayed to the user.
    HashMap<NonnullRefPtr<Manual::PageNode const>, String> page_identifiers;

    // Pass 1: emit one node per page and remember its identifier.
    for (auto const& page : page_links.keys()) {
        auto path = TRY(page->path());

        // Only allow alphanumerics, replace everything else with underscores.
        StringBuilder identifier_builder;
        for (auto const& code_point : path.code_points()) {
            if (!AK::is_ascii_alphanumeric(code_point)) {
                TRY(identifier_builder.try_append(' _ '));
                continue;
            }
            TRY(identifier_builder.try_append_code_point(code_point));
        }
        auto const identifier = TRY(identifier_builder.to_string());

        TRY(graph.try_appendff(" {} [label= \" {}({}) \" ]; \n ", identifier, TRY(page->name()), page->section_number()));
        TRY(page_identifiers.try_set(page, identifier));
    }

    // Pass 2: emit one edge per link between two known pages.
    for (auto const& entry : page_links) {
        auto const& source_page = entry.key;
        for (auto const& target_page : entry.value) {
            // Target page doesn't actually exist; it's probably an ignored page.
            if (!page_identifiers.contains(target_page))
                continue;

            TRY(graph.try_appendff(" {} -> {}; \n ", page_identifiers.get(source_page).value(), page_identifiers.get(target_page).value()));
        }
    }

    TRY(graph.try_append(" } \n "sv));
    return graph.to_string();
}
2022-03-28 20:59:25 +02:00
// Entry point of the Markdown link checker.
//
// 1. Parses the command line (positional file paths, --base, --link-graph).
// 2. Reads and parses every given Markdown file and records its anchors and
//    links, keyed by the file's real path.
// 3. Checks every recorded link: the target must exist (or be on the
//    allow-list of acceptable missing files) and, if the link carries an
//    anchor, the target must have been scanned and must contain that anchor.
// 4. Optionally writes a graph of the links between man pages to
//    "manpage-links.gv".
//
// Returns 0 when no problems were found, 1 when any link problem was reported
// or a file could not be parsed, and propagates the error when a file cannot
// be opened or read or any other fallible step fails.
ErrorOr<int> serenity_main(Main::Arguments arguments)
{
    Core::ArgsParser args_parser;
    Vector<StringView> file_paths;
    bool output_link_graph { false };
    StringView base_path = " / "sv;
    args_parser.add_positional_argument(file_paths, " Path to markdown files to read and parse ", " paths ", Core::ArgsParser::Required::Yes);
    args_parser.add_option(base_path, " System base path (default: \" / \" ) ", " base ", ' b ', " path ");
    args_parser.add_option(output_link_graph, " Output a page link graph into \" manpage-links.gv \" . The recommended tool to process this graph is `fdp`. ", " link-graph ", ' g ');
    args_parser.parse(arguments);

    outln(" Reading and parsing Markdown files ... ");
    HashMap<String, MarkdownLinkage> files;
    for (auto path : file_paths) {
        auto file_or_error = Core::File::open(path, Core::File::OpenMode::Read);
        if (file_or_error.is_error()) {
            warnln(" Failed to open {}: {} ", path, file_or_error.error());
            // Since this should never happen anyway, fail early.
            return file_or_error.release_error();
        }
        auto file = file_or_error.release_value();

        auto content_buffer_or_error = file->read_until_eof();
        if (content_buffer_or_error.is_error()) {
            // Report and propagate the *read* error. `file_or_error` must not be
            // used here: its value was released above and it holds no error.
            warnln(" Failed to read {}: {} ", path, content_buffer_or_error.error());
            // Since this should never happen anyway, fail early.
            return content_buffer_or_error.release_error();
        }
        auto content_buffer = content_buffer_or_error.release_value();

        auto content = StringView(content_buffer);
        auto document = Markdown::Document::parse(content);
        if (!document) {
            warnln(" Failed to parse {} due to an unspecified error. ", path);
            // Since this should never happen anyway, fail early.
            return 1;
        }
        files.set(TRY(FileSystem::real_path(path)), MarkdownLinkage::analyze(*document));
    }

    outln(" Checking links ... ");
    bool any_problems = false;
    for (auto const& file_item : files) {
        if (file_item.value.has_invalid_link()) {
            outln(" File '{}' has invalid links. ", file_item.key);
            any_problems = true;
            continue;
        }

        auto file_lexical_path = LexicalPath(file_item.key.to_deprecated_string());
        auto file_dir = file_lexical_path.dirname();
        for (auto const& file_link : file_item.value.file_links()) {
            // An empty file path means the link points into the current file.
            String pointee_file;
            if (file_link.file_path.is_empty()) {
                pointee_file = file_item.key;
            } else {
                pointee_file = TRY(String::from_deprecated_string(LexicalPath::absolute_path(file_dir, file_link.file_path)));
            }
            if (!FileSystem::exists(pointee_file) && !is_missing_file_acceptable(pointee_file)) {
                outln(" File '{}' points to '{}' (label '{}'), but '{}' does not exist! ",
                    file_item.key, file_link.file_path, file_link.label, pointee_file);
                any_problems = true;
                continue;
            }
            if (file_link.anchor.is_empty()) {
                // No anchor to test for.
                continue;
            }
            auto pointee_linkage = files.find(pointee_file);
            if (pointee_linkage == files.end()) {
                outln(" File '{}' points to file '{}', which exists, but was not scanned. Add it to the command-line arguments and re-run. ",
                    file_item.key, pointee_file);
                any_problems = true;
                continue;
            }
            if (!pointee_linkage->value.has_anchor(file_link.anchor)) {
                outln(" File '{}' points to '{}#{}' (label '{}'), but file '{}' does not have any heading that results in the anchor '{}'. ",
                    file_item.key, file_link.file_path, file_link.anchor, file_link.label, pointee_file, file_link.anchor);
                out(" The following anchors seem to be available: \n ");
                bool any_anchors = false;
                for (auto const& anchor : pointee_linkage->value.anchors()) {
                    if (any_anchors)
                        out(" , ");
                    out(" '{}' ", anchor);
                    any_anchors = true;
                }
                if (!any_anchors)
                    out(" (none) ");
                outln();
                any_problems = true;
            }
        }
    }

    if (output_link_graph) {
        // First, collect all pages, and collect links between pages in a second step after all pages must have been collected.
        HashMap<String, NonnullRefPtr<Manual::PageNode const>> pages;
        for (auto const& file : files) {
            auto base_relative_path = TRY(String::formatted(" /{} ", LexicalPath::relative_path(file.key, base_path)));
            auto page = Manual::Node::try_create_from_query({ base_relative_path });
            if (page.is_error()) {
                dbgln(" Not including {} in the link graph since it's not a man page. ", file.key);
                continue;
            }
            TRY(pages.try_set(file.key, page.value()));

            for (auto const& link : file.value.file_links()) {
                auto base_relative_path = TRY(String::formatted(" /{} ", LexicalPath::relative_path(link.file_path, base_path)));
                auto maybe_target_page = Manual::Node::try_create_from_query({ base_relative_path });
                if (maybe_target_page.is_error()) {
                    dbgln(" Not including {} in the link graph since it's not a man page. ", link.file_path);
                    continue;
                }
                TRY(pages.try_set(TRY(String::from_deprecated_string(link.file_path)), maybe_target_page.value()));
            }
        }

        // Second step: for every scanned file that is a known page, list the known pages it links to.
        HashMap<NonnullRefPtr<Manual::PageNode const>, Vector<NonnullRefPtr<Manual::PageNode const>>> page_links;
        for (auto const& file : files) {
            auto page = pages.get(file.key);
            if (!page.has_value())
                continue;

            Vector<NonnullRefPtr<Manual::PageNode const>> linked_pages;
            for (auto const& link : file.value.file_links()) {
                auto linked_page = pages.get(TRY(String::from_deprecated_string(link.file_path)));
                if (!linked_page.has_value())
                    continue;

                TRY(linked_pages.try_append(*linked_page.value()));
            }
            TRY(page_links.try_set(*page.value(), move(linked_pages)));
        }

        auto const graph_text = TRY(generate_link_graph(page_links));
        auto const graph_file = TRY(Core::File::open(" manpage-links.gv "sv, Core::File::OpenMode::Write | Core::File::OpenMode::Truncate));
        TRY(graph_file->write_until_depleted(graph_text.bytes()));
    }

    if (any_problems) {
        outln(" Done. Some errors were encountered, please check above log. ");
        return 1;
    } else {
        outln(" Done. No problems detected. ");
        return 0;
    }
}