1 change: 1 addition & 0 deletions Doc/.ruff.toml
@@ -30,6 +30,7 @@ select = [
]
ignore = [
"E501", # Ignore line length errors (we use auto-formatting)
"I001", # Import block is un-sorted or un-formatted
]

[format]
6 changes: 6 additions & 0 deletions Doc/Makefile
@@ -336,3 +336,9 @@ autobuild-stable-html:
exit 1;; \
esac
@$(MAKE) autobuild-dev-html

# Collect HTML IDs to a JSON document
.PHONY: html-ids
html-ids:
$(PYTHON) tools/check-html-ids.py collect build/html \
-o build/html/html-ids.json.gz
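
A hedged sketch, not part of this change: a companion target that compares the collected archive against a previously saved baseline could look roughly like this (the baseline path and target name are illustrative; recipe lines must be tab-indented like the other targets):

.PHONY: check-html-ids
check-html-ids: html-ids
	$(PYTHON) tools/check-html-ids.py check \
		baseline/html-ids.json.gz build/html/html-ids.json.gz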
181 changes: 181 additions & 0 deletions Doc/tools/check-html-ids.py
@@ -0,0 +1,181 @@
from compression import gzip
import concurrent.futures
from pathlib import Path
import html.parser
import functools
import argparse
import json
import sys
import re


IGNORED_ID_RE = re.compile(
r"""
index-\d+
| id\d+
| [_a-z]+_\d+
""",
re.VERBOSE,
)
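# Illustrative (hypothetical) examples: auto-generated anchors such as
# "index-12", "id3" or "footnote_7" match and are dropped, while explicitly
# named anchors such as "json-to-py-table" do not match and are kept.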


class IDGatherer(html.parser.HTMLParser):
    """HTML parser that collects non-ignored ``id`` attribute values."""

def __init__(self, ids):
super().__init__()
self.__ids = ids

def handle_starttag(self, tag, attrs):
for name, value in attrs:
if name == 'id':
if not IGNORED_ID_RE.fullmatch(value):
self.__ids.add(value)


def get_ids_from_file(path):
    """Return the set of non-ignored HTML IDs found in the file at *path*."""
ids = set()
gatherer = IDGatherer(ids)
with path.open(encoding='utf-8') as file:
while chunk := file.read(4096):
gatherer.feed(chunk)
return ids


def gather_ids(htmldir, *, verbose_print):
    """Collect IDs from every HTML page under *htmldir*.

    Returns a mapping of page path (relative to *htmldir*, as a string) to a
    sorted list of IDs, with IDs common to every page filtered out.
    """
if not htmldir.joinpath('objects.inv').exists():
raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')

    # With the GIL enabled, processes provide real parallelism; on a
    # free-threaded build, threads avoid the process start-up cost.
    if sys._is_gil_enabled():
        pool = concurrent.futures.ProcessPoolExecutor()
    else:
        pool = concurrent.futures.ThreadPoolExecutor()
tasks = {}
for path in htmldir.glob('**/*.html'):
relative_path = path.relative_to(htmldir)
if '_static' in relative_path.parts:
continue
if 'whatsnew' in relative_path.parts:
continue
tasks[relative_path] = pool.submit(get_ids_from_file, path=path)

ids_by_page = {}
for relative_path, future in tasks.items():
verbose_print(relative_path)
ids = future.result()
ids_by_page[str(relative_path)] = ids
verbose_print(f' - {len(ids)} ids found')

common = set.intersection(*ids_by_page.values())
verbose_print(f'Filtering out {len(common)} common ids')
for key, page_ids in ids_by_page.items():
ids_by_page[key] = sorted(page_ids - common)

return ids_by_page


def do_check(baseline, checked, excluded, *, verbose_print):
    """Print baseline IDs missing from *checked*; return True if none are missing."""
successful = True
for name, baseline_ids in sorted(baseline.items()):
try:
checked_ids = checked[name]
except KeyError:
successful = False
print(f'{name}: (page missing)')
print()
else:
missing_ids = set(baseline_ids) - set(checked_ids)
if missing_ids:
missing_ids = {
a
for a in missing_ids
if not IGNORED_ID_RE.fullmatch(a)
and (name, a) not in excluded
}
if missing_ids:
successful = False
for missing_id in sorted(missing_ids):
print(f'{name}: {missing_id}')
print()
return successful


def main(argv):
parser = argparse.ArgumentParser()
parser.add_argument(
'-v',
'--verbose',
action='store_true',
help='print out more information',
)
subparsers = parser.add_subparsers(dest='command', required=True)

collect = subparsers.add_parser(
'collect', help='collect IDs from a set of HTML files'
)
collect.add_argument(
'htmldir', type=Path, help='directory with HTML documentation'
)
collect.add_argument(
'-o',
'--outfile',
help='File to save the result in; default <htmldir>/html-ids.json.gz',
)

check = subparsers.add_parser('check', help='check two archives of IDs')
check.add_argument(
'baseline_file', type=Path, help='file with baseline IDs'
)
check.add_argument('checked_file', type=Path, help='file with checked IDs')
check.add_argument(
'-x',
'--exclude-file',
type=Path,
help='file with IDs to exclude from the check',
)

args = parser.parse_args(argv[1:])

if args.verbose:
verbose_print = functools.partial(print, file=sys.stderr)
else:

def verbose_print(*args, **kwargs):
"""do nothing"""

if args.command == 'collect':
ids = gather_ids(args.htmldir, verbose_print=verbose_print)
if args.outfile is None:
args.outfile = args.htmldir / 'html-ids.json.gz'
with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
json.dump({'ids_by_page': ids}, zfile)

if args.command == 'check':
with gzip.open(args.baseline_file) as zfile:
baseline = json.load(zfile)['ids_by_page']
Member commented:
json.load() requires a text file, gzip.open() returns a binary file.

Member (Author) replied:
That's not true.
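
(For context, a minimal check of this with an illustrative path: json.load() accepts binary file objects and detects the encoding of the bytes itself, so the default binary mode of gzip.open() is enough.)

    import gzip
    import json

    # gzip.open() defaults to 'rb'; json.load() reads the bytes and decodes
    # them via json.detect_encoding(), so no text-mode wrapper is needed.
    with gzip.open('html-ids.json.gz') as zfile:
        data = json.load(zfile)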

with gzip.open(args.checked_file) as zfile:
checked = json.load(zfile)['ids_by_page']
excluded = set()
if args.exclude_file:
with open(args.exclude_file, encoding='utf-8') as file:
for line in file:
line = line.strip()
if line and not line.startswith('#'):
name, sep, excluded_id = line.partition(':')
if sep:
excluded.add((name.strip(), excluded_id.strip()))
if do_check(baseline, checked, excluded, verbose_print=verbose_print):
verbose_print('All OK')
else:
sys.stdout.flush()
print(
'ERROR: Removed IDs found',
'The above HTML IDs were removed from the documentation, '
+ 'resulting in broken links. Please add them back.',
sep='\n',
file=sys.stderr,
)
            if args.exclude_file:
                print(
                    f'Alternatively, add them to {args.exclude_file}.',
                    file=sys.stderr,
                )
            # Exit non-zero so callers (e.g. CI) can detect the failure.
            sys.exit(1)


if __name__ == '__main__':
main(sys.argv)
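
For reference, the file passed to --exclude-file is plain text, parsed as shown above: blank lines and lines starting with '#' are skipped, and each remaining line pairs a page path with an ID, separated by a colon. A sketch with illustrative entries:

    # known-removed anchors, one "page: id" pair per line
    library/example.html: old-anchor-name
    reference/example.html: renamed-section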