diff --git a/Doc/.ruff.toml b/Doc/.ruff.toml
index 3e676e13c3f41a..3531c586337686 100644
--- a/Doc/.ruff.toml
+++ b/Doc/.ruff.toml
@@ -30,6 +30,7 @@ select = [
 ]
 ignore = [
     "E501",  # Ignore line length errors (we use auto-formatting)
+    "I001",  # Import block is un-sorted or un-formatted
 ]
 
 [format]
diff --git a/Doc/Makefile b/Doc/Makefile
index 4d605980a62904..d39c2fe3c3f22a 100644
--- a/Doc/Makefile
+++ b/Doc/Makefile
@@ -336,3 +336,9 @@ autobuild-stable-html:
 	  exit 1;; \
 	esac
 	@$(MAKE) autobuild-dev-html
+
+# Collect HTML IDs to a JSON document
+.PHONY: html-ids
+html-ids:
+	$(PYTHON) tools/check-html-ids.py collect build/html \
+		-o build/html/html-ids.json.gz
diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py
new file mode 100644
index 00000000000000..8e8e0a581df72d
--- /dev/null
+++ b/Doc/tools/check-html-ids.py
@@ -0,0 +1,181 @@
from compression import gzip
import concurrent.futures
from pathlib import Path
import html.parser
import functools
import argparse
import json
import sys
import re


IGNORED_ID_RE = re.compile(
    r"""
    index-\d+
    | id\d+
    | [_a-z]+_\d+
""",
    re.VERBOSE,
)


class IDGatherer(html.parser.HTMLParser):
    def __init__(self, ids):
        super().__init__()
        self.__ids = ids

    def handle_starttag(self, tag, attrs):
        for name, value in attrs:
            if name == 'id':
                if not IGNORED_ID_RE.fullmatch(value):
                    self.__ids.add(value)


def get_ids_from_file(path):
    ids = set()
    gatherer = IDGatherer(ids)
    with path.open(encoding='utf-8') as file:
        while chunk := file.read(4096):
            gatherer.feed(chunk)
    # Flush any data still buffered by the parser at the last chunk boundary.
    gatherer.close()
    return ids


def gather_ids(htmldir, *, verbose_print):
    if not htmldir.joinpath('objects.inv').exists():
        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    # With the GIL enabled, use processes to parse pages in parallel;
    # on free-threaded builds, threads are sufficient.
    if sys._is_gil_enabled():
        pool = concurrent.futures.ProcessPoolExecutor()
    else:
        pool = concurrent.futures.ThreadPoolExecutor()
    tasks = {}
    for path in htmldir.glob('**/*.html'):
        relative_path = path.relative_to(htmldir)
        if '_static' in relative_path.parts:
            continue
        if 'whatsnew' in relative_path.parts:
            continue
        tasks[relative_path] = pool.submit(get_ids_from_file, path=path)

    ids_by_page = {}
    for relative_path, future in tasks.items():
        verbose_print(relative_path)
        ids = future.result()
        ids_by_page[str(relative_path)] = ids
        verbose_print(f' - {len(ids)} ids found')

    common = set.intersection(*ids_by_page.values())
    verbose_print(f'Filtering out {len(common)} common ids')
    for key, page_ids in ids_by_page.items():
        ids_by_page[key] = sorted(page_ids - common)

    return ids_by_page


def do_check(baseline, checked, excluded, *, verbose_print):
    successful = True
    for name, baseline_ids in sorted(baseline.items()):
        try:
            checked_ids = checked[name]
        except KeyError:
            successful = False
            print(f'{name}: (page missing)')
            print()
        else:
            missing_ids = set(baseline_ids) - set(checked_ids)
            if missing_ids:
                missing_ids = {
                    a
                    for a in missing_ids
                    if not IGNORED_ID_RE.fullmatch(a)
                    and (name, a) not in excluded
                }
                if missing_ids:
                    successful = False
                    for missing_id in sorted(missing_ids):
                        print(f'{name}: {missing_id}')
                    print()
    return successful


def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-v',
        '--verbose',
        action='store_true',
        help='print out more information',
    )
    subparsers = parser.add_subparsers(dest='command', required=True)

    collect = subparsers.add_parser(
        'collect', help='collect IDs from a set of HTML files'
    )
    collect.add_argument(
        'htmldir', type=Path, help='directory with HTML documentation'
    )
    collect.add_argument(
        '-o',
        '--outfile',
        help='file to save the result in; default: <htmldir>/html-ids.json.gz',
    )

    check = subparsers.add_parser('check', help='check two archives of IDs')
    check.add_argument(
        'baseline_file', type=Path, help='file with baseline IDs'
    )
    check.add_argument('checked_file', type=Path, help='file with checked IDs')
    check.add_argument(
        '-x',
        '--exclude-file',
        type=Path,
        help='file with IDs to exclude from the check',
    )

    args = parser.parse_args(argv[1:])

    if args.verbose:
        verbose_print = functools.partial(print, file=sys.stderr)
    else:

        def verbose_print(*args, **kwargs):
            """do nothing"""

    if args.command == 'collect':
        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
        if args.outfile is None:
            args.outfile = args.htmldir / 'html-ids.json.gz'
        with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
            json.dump({'ids_by_page': ids}, zfile)

    if args.command == 'check':
        with gzip.open(args.baseline_file) as zfile:
            baseline = json.load(zfile)['ids_by_page']
        with gzip.open(args.checked_file) as zfile:
            checked = json.load(zfile)['ids_by_page']
        excluded = set()
        if args.exclude_file:
            with open(args.exclude_file, encoding='utf-8') as file:
                for line in file:
                    line = line.strip()
                    if line and not line.startswith('#'):
                        name, sep, excluded_id = line.partition(':')
                        if sep:
                            excluded.add((name.strip(), excluded_id.strip()))
        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
            verbose_print('All OK')
        else:
            sys.stdout.flush()
            print(
                'ERROR: Removed IDs found',
                'The above HTML IDs were removed from the documentation, '
                + 'resulting in broken links. Please add them back.',
                sep='\n',
                file=sys.stderr,
            )
            if args.exclude_file:
                print(
                    f'Alternatively, add them to {args.exclude_file}.',
                    file=sys.stderr,
                )
            # Fail with a non-zero status so CI treats removed IDs as an error.
            sys.exit(1)


if __name__ == '__main__':
    main(sys.argv)
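
A rough usage sketch, not part of the patch: the 'collect' subcommand archives the HTML IDs of a known-good build, and 'check' compares a later build against that archive, failing if any IDs disappeared. The baseline and exclude file names below are illustrative; only the Makefile's default output path comes from the diff.

    # From the Doc/ directory: build the docs, then archive their HTML IDs
    # using the new Makefile target (writes build/html/html-ids.json.gz).
    make html
    make html-ids

    # Compare a rebuilt tree against a previously saved baseline archive.
    # The exclude file (optional) holds lines like "library/os.html: some-id",
    # with '#' starting a comment.
    python tools/check-html-ids.py check \
        baseline-ids.json.gz build/html/html-ids.json.gz \
        -x known-removed-ids.txt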