Skip to content


Add reference- and in-article link checking to the wikilink checker
Browse files Browse the repository at this point in the history
The wikilink checker is updated with changes
from Walavouchey/osu-wiki-tools@76f2b53
to Walavouchey/osu-wiki-tools@cdbfa96
to make some more progress towards ppy#6233. The full commit history along
with tests can be viewed in that repo.

The two main features of reference- and in-article link checking require
additional parsing logic to handle ignoring comments and getting the
correct identifier from headings and custom identifier tags. For
in-article links, the articles they point to are parsed for their
identifiers in order to check if the #link-fragment is correct, and if
not, all possible identifiers are printed as suggestions.

By default, all links from outdated articles are ignored. The plan is to
eventually turn on link checking for all files in CI runs when the
remaining errors have been fixed, so this is to prevent contributors
from having to edit already outdated articles.

Currently on master this runs for about 40s with `--outdated` and
about 32s without on my Windows machine. There are 152 remaining errors
without counting outdated articles and 253 errors in total.
  • Loading branch information
Walavouchey committed May 6, 2022
1 parent 7ab84e0 commit 8b52363
Show file tree
Hide file tree
Showing 12 changed files with 1,115 additions and 212 deletions.
314 changes: 102 additions & 212 deletions scripts/ci/
Original file line number Diff line number Diff line change
Expand Up @@ -3,198 +3,11 @@
import sys
import typing

Redirects = typing.Dict[str, typing.Tuple[str, int]]

class Link(typing.NamedTuple):
A Markdown link, external or internal. May be relative. Example:
See [Difficulty Names](/wiki/Beatmap/Difficulty#naming-conventions)
- title: 'Difficulty Names'
- location: '/wiki/Beatmap/Difficulty'
- extra: '#naming-conventions'
Another example:
![Player is AFK](img/chat-console-afk.png "Player is away from keyboard")
- title: 'Player is AFK'
- location: 'img/chat-console-afk.png'
- extra: ' "Player is away from keyboard"'

title: str
location: str
extra: str

# Link position within the line. Example:
# See also: [Difficulty names](/wiki/Beatmap/Difficulty#naming-conventions)
# ^ link_start ^ link_end
link_start: int
link_end: int

# Sections of a link. Example:
# ![Player is AFK](img/chat-console-afk.png "Player is away from keyboard")
# ^ ----- location ----- ^
# ^ ---------- extra ---------- ^
# ^ --------------------- content --------------------- ^
# ^ ------------------ full_link / full_coloured_link ------------------ ^
def content(self):
return self.location + self.extra

def full_link(self):
return f"[{self.title}]{self.content}"

def full_coloured_link(self):
return "{title_in_braces}{left_brace}{location}{extra}{right_brace}".format(

def red(s):
    """Return *s* wrapped in the ANSI escape codes for red text."""
    return "\x1b[31m{}\x1b[0m".format(s)

def green(s):
    """Return *s* wrapped in the ANSI escape codes for green text."""
    return "\x1b[32m{}\x1b[0m".format(s)

def yellow(s):
    """Return *s* wrapped in the ANSI escape codes for yellow text."""
    return "\x1b[33m{}\x1b[0m".format(s)

def blue(s):
    """Return *s* wrapped in the ANSI escape codes for blue text."""
    return "\x1b[34m{}\x1b[0m".format(s)

def load_redirects(path: str) -> Redirects:
    """Read a redirect file and return a mapping of source -> (destination, line).

    Each line is split on double quotes; the second and fourth pieces are
    taken as the redirect source and destination, i.e. lines shaped like
    `"source": "destination"`. Lines with fewer than two quoted strings
    (comments, blank lines) are skipped.

    Args:
        path: location of the redirect file, read as UTF-8.

    Returns:
        A dict keyed by redirect source; each value is a tuple of the
        destination and the 1-based line number the redirect was found on.
    """
    redirects = {}
    with open(path, 'r', encoding='utf-8') as fd:
        for line_number, line in enumerate(fd, start=1):
            split = line.split('"')
            try:
                redirects[split[1]] = (split[3], line_number)
            except IndexError:
                # Not a `"source": "destination"` line; nothing to record.
                continue
    return redirects

def child(path: str) -> str:
    """Return what follows the first '/' located at index 1 or later in *path*.

    When no such separator exists the whole of *path* is returned.
    """
    separator = path.find('/', 1)
    return path[separator + 1:]

def directory(filename: str) -> str:
    """Return the slice of *filename* between its first and last '/'.

    Example: 'wiki/Beatmap/en.md' gives 'Beatmap'.
    """
    begin = filename.find('/') + 1
    stop = filename.rfind('/')
    return filename[begin:stop]

def check_redirect(redirects: Redirects, link: str) -> typing.Tuple[bool, str]:
    """Check whether *link* is covered by a working redirect.

    The link is lower-cased before it is looked up in *redirects*.

    Args:
        redirects: mapping of redirect source to (destination, line number).
        link: wiki-relative link location to look up.

    Returns:
        (True, "") when a redirect exists and `wiki/<destination>` exists on
        disk; (False, "") when there is no redirect for the link; and
        (False, note) when the redirect exists but its destination does not,
        where the note names the offending redirect line.
    """
    link = link.lower()
    try:
        destination, line_no = redirects[link]
    except KeyError:
        return (False, "")
    if not os.path.exists(f"wiki/{destination}"):
        note = f"{blue('Note:')} Broken redirect (redirect.yaml:{line_no}: {link} --> {destination})"
        return (False, note)
    return (True, "")

def check_link(redirects: Redirects, directory: str, link: str) -> typing.Tuple[bool, str]:
    """Validate one link location found in an article living in *directory*.

    Links beginning with '/wiki/' are resolved from the repository root;
    links beginning with 'http://', 'https://' or 'mailto:' are accepted
    without any check; everything else is resolved relative to
    `wiki/<directory>`. A path that is missing on disk is given a second
    chance through the redirect table.

    Returns a (success, note) pair as produced by check_redirect.
    """
    unchecked_prefixes = ("http://", "https://", "mailto:")

    if link.startswith("/wiki/"):
        # absolute wikilink
        path_on_disk = link[1:]
        redirect_key = child(link)
    elif link.startswith(unchecked_prefixes):
        # some other link; don't care
        return (True, "")
    else:
        # relative wikilink
        path_on_disk = f"wiki/{directory}/{link}"
        redirect_key = f"{directory}/{link}"

    if os.path.exists(path_on_disk):
        return (True, "")
    # may have a redirect
    return check_redirect(redirects, redirect_key)

def is_in_comment(s, start, end):
    """Tell whether the span s[start:end + 1] sits inside an HTML comment.

    The span counts as commented out when the nearest `<!--` before *start*
    has not been closed before *start* and a `-->` follows *end* on the same
    string.

    Args:
        s: the line being inspected.
        start: index of the first character of the span.
        end: index of the last character of the span.

    Returns:
        True if the span is enclosed by `<!--` ... `-->`, otherwise False.
    """
    opener = s.rfind("<!--", 0, start)
    if opener == -1:
        return False
    # A comment that was already closed before the span does not enclose it.
    if s.find("-->", opener, start) != -1:
        return False
    # str.find returns -1 (which is truthy) when nothing is found, so compare
    # explicitly; search to the end of the string so a trailing `-->` counts.
    return s.find("-->", end) != -1

def find_link(s: str, index=0) -> typing.Optional[Link]:
    """Find the first inline Markdown link `[title](location extra)` in *s*.

    Scanning starts at *index*. Square brackets may nest inside the title and
    parentheses may nest inside the target. The "extra" part begins at the
    first space, '#' or '?' seen once the target has started, and runs up to
    the closing parenthesis.

    Args:
        s: the line to scan.
        index: position in *s* to start scanning from.

    Returns:
        A Link whose link_start is the index of the opening '[' and whose
        link_end is the index of the closing ')'. None is returned when no
        link is found, or when the first link found lies inside an HTML
        comment (scanning does not continue past it).
    """
    found_brackets = False
    started = False
    start = None
    mid = None
    extra = None
    end = None
    square_bracket_level = 0
    parenthesis_level = 0
    for i, c in enumerate(s[index:], start=index):
        if not found_brackets and c == '[':
            # `start` may legitimately be 0, so compare against None.
            if start is None:
                start = i
            started = True
            square_bracket_level += 1
        if started and not found_brackets and c == ']':
            square_bracket_level -= 1
            if square_bracket_level == 0:
                if len(s) > i + 1 and s[i + 1] == '(':
                    found_brackets = True
                    mid = i + 1
                else:
                    # Bracketed text that is not a link: forget it so the
                    # next '[' begins a fresh candidate.
                    start = None
                    started = False
        if found_brackets and (c == ' ' or c == '#' or c == '?'):
            if extra is None:
                extra = i
        if found_brackets and c == '(':
            parenthesis_level += 1
        if found_brackets and c == ')':
            parenthesis_level -= 1
            if parenthesis_level == 0:
                end = i
                if is_in_comment(s, start, end):
                    return None
                if extra is None:
                    extra = end

                return Link(
                    location=s[mid + 1: extra],
                    title=s[start + 1: mid - 1],
                    extra=s[extra: end],
                    link_start=start,
                    link_end=end,
                )

    return None

def find_links(s: str) -> typing.List[Link]:
    """Collect every link that find_link reports in *s*, left to right.

    Each search resumes one character past the end of the previous match.
    Collection stops at the first position from which find_link returns None.
    """
    results = []
    match = find_link(s, 0)
    while match is not None:
        results.append(match)
        match = find_link(s, match.link_end + 1)
    return results
from wikitools import article_parser, console, link_checker, redirect_parser, errors as error_types

def print_error():
print(f"{red('Error:')} Some wiki or image links in the files you've changed have errors.\n")
print(f"{'Error:')} Some wiki or image links in the files you've changed have errors.\n")
print("This can happen in one of the following ways:\n")
print("- The article or image that the link points to has since been moved or renamed (make sure to match capitalisation)")
print("- The link simply contains typos or formatting errors")
Expand All @@ -208,52 +21,129 @@ def print_clean():
print("Notice: No broken wiki or image links detected.")

def s(i: int, s: str) -> str:
    """Format a count with its noun, adding a plural 's' unless the count is 1."""
    suffix = '' if i == 1 else 's'
    return "{} {}{}".format(i, s, suffix)

def print_count(errors: int, matches: int, error_files: int, files: int):
print(f"{'Note:')} Found {s(errors, 'error')} in {s(error_files, 'file')} ({s(matches, 'link')} in {s(files, 'file')} checked).")

def highlight_links(s: str, errors: typing.List[error_types.LinkError]) -> str:
highlighted_line = ""
prev_index = 0
for error in errors:
highlighted_line += s[prev_index:]
highlighted_line += error.pretty_link
prev_index = + 1
highlighted_line += s[prev_index: -1]
return highlighted_line

def pretty_location(path, lineno, pos, location):
return f"{console.yellow(path)}:{lineno}:{pos}: {}"

def parse_args(args):
    """Parse the command-line arguments in *args* and return the namespace.

    Recognised options: -t/--target (zero or more paths), -a/--all,
    -s/--separate and --outdated (all three are boolean switches).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-t", "--target", nargs='*', help="paths to the articles you want to check")

    # The remaining options are plain on/off switches.
    switches = (
        (("-a", "--all"), "check all articles"),
        (("-s", "--separate"), "print errors that appear on the same line separately"),
        (("--outdated",), "check links in outdated articles"),
    )
    for flags, description in switches:
        parser.add_argument(*flags, action='store_true', help=description)

    return parser.parse_args(args)

def file_iterator(roots: list):
    """Yield file paths for every entry in *roots*.

    A directory entry is walked recursively and each file beneath it is
    yielded; a file entry is yielded as-is; anything else is ignored.
    """
    for entry in roots:
        if os.path.isdir(entry):
            for parent, _subdirs, names in os.walk(entry):
                yield from (os.path.join(parent, name) for name in names)
        elif os.path.isfile(entry):
            yield entry

def identifier_suggestions(e, articles):
    """List the identifiers of the article *e* refers to, one per line.

    Looks up `articles[e.path]`, sorts its `identifiers` mapping by value
    (used as the line number), and formats each entry as
    'line <lineno>: <identifier>'. Entries are joined with a newline and a tab.

    Args:
        e: an error object exposing a `path` attribute.
        articles: mapping of path to an object with an `identifiers` dict
            (identifier -> line number).

    Returns:
        The joined suggestion text; an empty string when the article has no
        identifiers.
    """
    return '\n\t'.join(
        'line {}: {}'.format(lineno, identifier)
        for identifier, lineno in sorted(
            articles[e.path].identifiers.items(), key=lambda tuple_: tuple_[1]
        )
    )

def main():
args = parse_args(sys.argv[1:])
if not
print("Notice: No articles to check.")
if not and not args.all:
print(f"{console.grey('Notice:')} No articles to check.")

redirects = load_redirects("wiki/redirect.yaml")
filenames = []
if args.all:
filenames = file_iterator(["wiki", "news"])
filenames =

redirects = redirect_parser.load_redirects("wiki/redirect.yaml")
exit_code = 0
for filename in
filename = filename.replace('\\', '/')
if filename.startswith("./"):
filename = filename[2:]

articles: typing.Dict[str, article_parser.Article] = {}
for filename in filenames:
if any((
not filename.endswith(".md"),
"TEMPLATE" in filename,
"README" in filename,
"Article_styling_criteria" in filename,

with open(filename, 'r', encoding='utf-8') as fd:
for linenumber, line in enumerate(fd, start=1):
for match in find_links(line):
if match.content == "/wiki/Sitemap":
success, note = check_link(redirects, directory(filename), match.location)
if success:
a = article_parser.parse(filename)
articles[a.path] = a

if exit_code == 0:
exit_code = 1
print(f"{yellow(filename)}:{linenumber}:{match.link_start + 1}: {red(match.location)}")
if note:
error_count = 0
link_count = 0
error_file_count = 0
file_count = 0

print("{}{}{}".format(line[:match.link_start], match.full_coloured_link, line[match.link_end + 1:]), end="\n\n")
for _, a in sorted(articles.items()):
if a.front_matter.get("outdated", False) and not args.outdated:

link_count += sum(len(_.links) for _ in a.lines.values())
file_count += 1

errors = link_checker.check_article(a, redirects, articles)
if not errors:

error_file_count += 1
if exit_code == 0:
exit_code = 1

for lineno, errors_on_line in sorted(errors.items()):
error_count += len(errors_on_line)
for e in errors_on_line:
print(e.pretty_location(a.path, lineno))
for e in errors_on_line:
if isinstance(e, error_types.MissingIdentifierError):
suggestions = identifier_suggestions(e, articles)
if suggestions:
print('{}\n\t{}'.format('Suggestions:'), suggestions))

if args.separate:
for e in errors_on_line:
print(highlight_links(a.lines[lineno].raw_line, [e]), end="\n\n")
print(highlight_links(a.lines[lineno].raw_line, errors_on_line), end="\n\n")

if exit_code == 0:

print_count(error_count, link_count, error_file_count, file_count)

Expand Down
Empty file.

0 comments on commit 8b52363

Please sign in to comment.