From 3aa8db06993c9d7784801e16740bf88f4fdc42a4 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 5 Feb 2026 16:52:49 +0100 Subject: [PATCH 1/5] Add a tool to check removed HTML anchors --- Doc/.ruff.toml | 1 + Doc/Makefile | 6 ++ Doc/data/removed-ids.txt | 93 ++++++++++++++++++++ Doc/tools/check-html-ids.py | 171 ++++++++++++++++++++++++++++++++++++ 4 files changed, 271 insertions(+) create mode 100644 Doc/data/removed-ids.txt create mode 100644 Doc/tools/check-html-ids.py diff --git a/Doc/.ruff.toml b/Doc/.ruff.toml index 3e676e13c3f41a..3531c586337686 100644 --- a/Doc/.ruff.toml +++ b/Doc/.ruff.toml @@ -30,6 +30,7 @@ select = [ ] ignore = [ "E501", # Ignore line length errors (we use auto-formatting) + "I001", # Import block is un-sorted or un-formatted ] [format] diff --git a/Doc/Makefile b/Doc/Makefile index 4d605980a62904..d39c2fe3c3f22a 100644 --- a/Doc/Makefile +++ b/Doc/Makefile @@ -336,3 +336,9 @@ autobuild-stable-html: exit 1;; \ esac @$(MAKE) autobuild-dev-html + +# Collect HTML IDs to a JSON document +.PHONY: html-ids +html-ids: + $(PYTHON) tools/check-html-ids.py collect build/html \ + -o build/html/html-ids.json.gz diff --git a/Doc/data/removed-ids.txt b/Doc/data/removed-ids.txt new file mode 100644 index 00000000000000..8b38b2fd4df64c --- /dev/null +++ b/Doc/data/removed-ids.txt @@ -0,0 +1,93 @@ +# Known removed HTML IDs: + +c-api/complex.html: complex-numbers-as-python-objects + +c-api/extension-modules.html: initialization-function + +c-api/import.html: c.PyImport_ImportModuleNoBlock + +c-api/init.html: c.Py_GetExecPrefix +c-api/init.html: c.Py_GetPath +c-api/init.html: c.Py_GetPrefix +c-api/init.html: c.Py_GetProgramFullPath +c-api/init.html: c.Py_GetProgramName +c-api/init.html: c.Py_GetPythonHome + +c-api/module.html: module-definitions +c-api/module.html: module-slots + +c-api/stable.html: c-api-stability + +c-api/sys.html: c.PySys_ResetWarnOptions + +c-api/weakref.html: c.PyWeakref_GET_OBJECT +c-api/weakref.html: c.PyWeakref_GetObject + +extending/extending.html: a-simple-example +extending/extending.html: back-to-the-example +extending/extending.html: backtoexample +extending/extending.html: compilation-and-linkage +extending/extending.html: extending-python-with-c-or-c +extending/extending.html: extending-simpleexample +extending/extending.html: intermezzo-errors-and-exceptions +extending/extending.html: methodtable +extending/extending.html: the-module-s-method-table-and-initialization-function + +extending/index.html: creating-extensions-without-third-party-tools + +howto/perf_profiling.html: python-support-for-the-linux-perf-profiler + +library/dis.html: opcode-LOAD_CONST_IMMORTAL + +library/ftplib.html: ftplib.FTP_TLS.ssl_version + +library/http.server.html: cmdoption-http.server-cgi +library/http.server.html: http.server.CGIHTTPRequestHandler +library/http.server.html: http.server.CGIHTTPRequestHandler.cgi_directories +library/http.server.html: http.server.CGIHTTPRequestHandler.do_POST + +library/importlib.html: importlib.abc.FileLoader.load_module +library/importlib.html: importlib.abc.InspectLoader.load_module +library/importlib.html: importlib.abc.Loader.load_module +library/importlib.html: importlib.abc.SourceLoader.load_module +library/importlib.html: importlib.machinery.SourceFileLoader.load_module +library/importlib.html: importlib.machinery.SourcelessFileLoader.load_module + +library/pathlib.html: pathlib.PurePath.is_reserved + +library/platform.html: java-platform +library/platform.html: platform.java_ver + +library/profile.html: cmdoption-cProfile-m +library/profile.html: cmdoption-cProfile-o +library/profile.html: cmdoption-cProfile-s +library/profile.html: instant-user-s-manual +library/profile.html: introduction-to-the-profilers +library/profile.html: module-cProfile +library/profile.html: module-pstats +library/profile.html: profile-and-cprofile-module-reference +library/profile.html: profile-cli +library/profile.html: profile-instant +library/profile.html: profile-stats +library/profile.html: profiler-introduction +library/profile.html: pstats.Stats +library/profile.html: pstats.Stats.add +library/profile.html: pstats.Stats.dump_stats +library/profile.html: pstats.Stats.get_stats_profile +library/profile.html: pstats.Stats.print_callees +library/profile.html: pstats.Stats.print_callers +library/profile.html: pstats.Stats.print_stats +library/profile.html: pstats.Stats.reverse_order +library/profile.html: pstats.Stats.sort_stats +library/profile.html: pstats.Stats.strip_dirs +library/profile.html: the-python-profilers +library/profile.html: the-stats-class + +library/typing.html: typing.no_type_check_decorator + +library/wave.html: wave.Wave_read.getmark +library/wave.html: wave.Wave_read.getmarkers + +library/zipimport.html: zipimport.zipimporter.load_module + +reference/datamodel.html: module.__cached__ diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py new file mode 100644 index 00000000000000..9d3cb2463f35b4 --- /dev/null +++ b/Doc/tools/check-html-ids.py @@ -0,0 +1,171 @@ +from compression import gzip +import concurrent.futures +from pathlib import Path +import html.parser +import functools +import argparse +import json +import sys +import re + + +IGNORED_ID_RE = re.compile(r""" + index-\d+ + | id\d+ + | [_a-z]+_\d+ +""", re.VERBOSE) + + +class IDGatherer(html.parser.HTMLParser): + def __init__(self, ids): + super().__init__() + self.__ids = ids + + def handle_starttag(self, tag, attrs): + for name, value in attrs: + if name == 'id': + if not IGNORED_ID_RE.fullmatch(value): + self.__ids.add(value) + + +def get_ids_from_file(path): + ids = set() + gatherer = IDGatherer(ids) + with path.open() as file: + while chunk := file.read(4096): + gatherer.feed(chunk) + return ids + + +def gather_ids(htmldir, *, verbose_print): + if not htmldir.joinpath('objects.inv').exists(): + raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory') + + if sys._is_gil_enabled: + pool = concurrent.futures.ProcessPoolExecutor() + else: + pool = concurrent.futures.ThreadPoolExecutor() + tasks = {} + for path in htmldir.glob('**/*.html'): + relative_path = path.relative_to(htmldir) + if '_static' in relative_path.parts: + continue + if 'whatsnew' in relative_path.parts: + continue + tasks[relative_path] = pool.submit(get_ids_from_file, path=path) + + ids_by_page = {} + for relative_path, future in tasks.items(): + verbose_print(relative_path) + ids = future.result() + ids_by_page[str(relative_path)] = future.result() + verbose_print(f' - {len(ids)} ids found') + + common = set.intersection(*ids_by_page.values()) + verbose_print(f'Filtering out {len(common)} common ids') + for key, page_ids in ids_by_page.items(): + ids_by_page[key] = sorted(page_ids - common) + + return ids_by_page + + +def do_check(baseline, checked, excluded, *, verbose_print): + successful = True + for name, baseline_ids in sorted(baseline.items()): + try: + checked_ids = checked[name] + except KeyError: + successful = False + print(f'{name}: (page missing)') + print() + else: + missing_ids = set(baseline_ids) - set(checked_ids) + if missing_ids: + missing_ids = { + a for a in missing_ids + if not IGNORED_ID_RE.fullmatch(a) + and (name, a) not in excluded + } + if missing_ids: + successful = False + for missing_id in sorted(missing_ids): + print(f'{name}: {missing_id}') + print() + return successful + + +def main(argv): + parser = argparse.ArgumentParser() + parser.add_argument( + '-v', '--verbose', action='store_true', + help='print out more information') + subparsers = parser.add_subparsers(dest='command', required=True) + + collect = subparsers.add_parser( + 'collect', + help='collect IDs from a set of HTML files') + collect.add_argument( + 'htmldir', type=Path, + help='directory with HTML documentation') + collect.add_argument( + '-o', '--outfile', + help='File to save the result in; default /html-ids.json.gz') + + check = subparsers.add_parser( + 'check', + help='check two archives of IDs') + check.add_argument( + 'baseline_file', type=Path, + help='file with baseline IDs') + check.add_argument( + 'checked_file', type=Path, + help='file with checked IDs') + check.add_argument( + '-x', '--exclude-file', type=Path, + help='file with IDs to exclude from the check') + + args = parser.parse_args(argv[1:]) + + if args.verbose: + verbose_print = functools.partial(print, file=sys.stderr) + else: + def verbose_print(*args, **kwargs): + """do nothing""" + + if args.command == 'collect': + ids = gather_ids(args.htmldir, verbose_print=verbose_print) + if args.outfile is None: + args.outfile = args.htmldir / 'html-ids.json.gz' + with gzip.open(args.outfile, 'wt') as zfile: + json.dump({'ids_by_page': ids}, zfile) + + if args.command == 'check': + with gzip.open(args.baseline_file) as zfile: + baseline = json.load(zfile)['ids_by_page'] + with gzip.open(args.checked_file) as zfile: + checked = json.load(zfile)['ids_by_page'] + excluded = set() + if args.exclude_file: + with open(args.exclude_file) as file: + for line in file: + line = line.strip() + if line and not line.startswith('#'): + name, sep, excluded_id = line.partition(':') + if sep: + excluded.add((name.strip(), excluded_id.strip())) + if do_check(baseline, checked, excluded, verbose_print=verbose_print): + verbose_print('All OK') + else: + sys.stdout.flush() + print( + 'ERROR: Removed IDs found', + 'The above HTML IDs were removed from the documentation, ' + + 'resulting in broken links. Please add them back.', + sep='\n', + file=sys.stderr) + if args.exclude_file: + print(f'Alternatively, add them to {args.exclude_file}.') + + +if __name__ == '__main__': + main(sys.argv) From c782058f0a5b692eab95faa8030d4a4b05297a8f Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 19 Feb 2026 14:51:47 +0100 Subject: [PATCH 2/5] Remove the list for now; to be added when we start checking --- Doc/data/removed-ids.txt | 93 ---------------------------------------- 1 file changed, 93 deletions(-) delete mode 100644 Doc/data/removed-ids.txt diff --git a/Doc/data/removed-ids.txt b/Doc/data/removed-ids.txt deleted file mode 100644 index 8b38b2fd4df64c..00000000000000 --- a/Doc/data/removed-ids.txt +++ /dev/null @@ -1,93 +0,0 @@ -# Known removed HTML IDs: - -c-api/complex.html: complex-numbers-as-python-objects - -c-api/extension-modules.html: initialization-function - -c-api/import.html: c.PyImport_ImportModuleNoBlock - -c-api/init.html: c.Py_GetExecPrefix -c-api/init.html: c.Py_GetPath -c-api/init.html: c.Py_GetPrefix -c-api/init.html: c.Py_GetProgramFullPath -c-api/init.html: c.Py_GetProgramName -c-api/init.html: c.Py_GetPythonHome - -c-api/module.html: module-definitions -c-api/module.html: module-slots - -c-api/stable.html: c-api-stability - -c-api/sys.html: c.PySys_ResetWarnOptions - -c-api/weakref.html: c.PyWeakref_GET_OBJECT -c-api/weakref.html: c.PyWeakref_GetObject - -extending/extending.html: a-simple-example -extending/extending.html: back-to-the-example -extending/extending.html: backtoexample -extending/extending.html: compilation-and-linkage -extending/extending.html: extending-python-with-c-or-c -extending/extending.html: extending-simpleexample -extending/extending.html: intermezzo-errors-and-exceptions -extending/extending.html: methodtable -extending/extending.html: the-module-s-method-table-and-initialization-function - -extending/index.html: creating-extensions-without-third-party-tools - -howto/perf_profiling.html: python-support-for-the-linux-perf-profiler - -library/dis.html: opcode-LOAD_CONST_IMMORTAL - -library/ftplib.html: ftplib.FTP_TLS.ssl_version - -library/http.server.html: cmdoption-http.server-cgi -library/http.server.html: http.server.CGIHTTPRequestHandler -library/http.server.html: http.server.CGIHTTPRequestHandler.cgi_directories -library/http.server.html: http.server.CGIHTTPRequestHandler.do_POST - -library/importlib.html: importlib.abc.FileLoader.load_module -library/importlib.html: importlib.abc.InspectLoader.load_module -library/importlib.html: importlib.abc.Loader.load_module -library/importlib.html: importlib.abc.SourceLoader.load_module -library/importlib.html: importlib.machinery.SourceFileLoader.load_module -library/importlib.html: importlib.machinery.SourcelessFileLoader.load_module - -library/pathlib.html: pathlib.PurePath.is_reserved - -library/platform.html: java-platform -library/platform.html: platform.java_ver - -library/profile.html: cmdoption-cProfile-m -library/profile.html: cmdoption-cProfile-o -library/profile.html: cmdoption-cProfile-s -library/profile.html: instant-user-s-manual -library/profile.html: introduction-to-the-profilers -library/profile.html: module-cProfile -library/profile.html: module-pstats -library/profile.html: profile-and-cprofile-module-reference -library/profile.html: profile-cli -library/profile.html: profile-instant -library/profile.html: profile-stats -library/profile.html: profiler-introduction -library/profile.html: pstats.Stats -library/profile.html: pstats.Stats.add -library/profile.html: pstats.Stats.dump_stats -library/profile.html: pstats.Stats.get_stats_profile -library/profile.html: pstats.Stats.print_callees -library/profile.html: pstats.Stats.print_callers -library/profile.html: pstats.Stats.print_stats -library/profile.html: pstats.Stats.reverse_order -library/profile.html: pstats.Stats.sort_stats -library/profile.html: pstats.Stats.strip_dirs -library/profile.html: the-python-profilers -library/profile.html: the-stats-class - -library/typing.html: typing.no_type_check_decorator - -library/wave.html: wave.Wave_read.getmark -library/wave.html: wave.Wave_read.getmarkers - -library/zipimport.html: zipimport.zipimporter.load_module - -reference/datamodel.html: module.__cached__ From cbb6c1436a68a344b9eba6bd0aeeaec2737efd68 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Thu, 19 Feb 2026 15:08:21 +0100 Subject: [PATCH 3/5] Decrease readability --- Doc/tools/check-html-ids.py | 54 ++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py index 9d3cb2463f35b4..eb8cc661ff15df 100644 --- a/Doc/tools/check-html-ids.py +++ b/Doc/tools/check-html-ids.py @@ -9,11 +9,14 @@ import re -IGNORED_ID_RE = re.compile(r""" +IGNORED_ID_RE = re.compile( + r""" index-\d+ | id\d+ | [_a-z]+_\d+ -""", re.VERBOSE) +""", + re.VERBOSE, +) class IDGatherer(html.parser.HTMLParser): @@ -82,7 +85,8 @@ def do_check(baseline, checked, excluded, *, verbose_print): missing_ids = set(baseline_ids) - set(checked_ids) if missing_ids: missing_ids = { - a for a in missing_ids + a + for a in missing_ids if not IGNORED_ID_RE.fullmatch(a) and (name, a) not in excluded } @@ -97,38 +101,43 @@ def do_check(baseline, checked, excluded, *, verbose_print): def main(argv): parser = argparse.ArgumentParser() parser.add_argument( - '-v', '--verbose', action='store_true', - help='print out more information') + '-v', + '--verbose', + action='store_true', + help='print out more information', + ) subparsers = parser.add_subparsers(dest='command', required=True) collect = subparsers.add_parser( - 'collect', - help='collect IDs from a set of HTML files') + 'collect', help='collect IDs from a set of HTML files' + ) collect.add_argument( - 'htmldir', type=Path, - help='directory with HTML documentation') + 'htmldir', type=Path, help='directory with HTML documentation' + ) collect.add_argument( - '-o', '--outfile', - help='File to save the result in; default /html-ids.json.gz') + '-o', + '--outfile', + help='File to save the result in; default /html-ids.json.gz', + ) - check = subparsers.add_parser( - 'check', - help='check two archives of IDs') + check = subparsers.add_parser('check', help='check two archives of IDs') check.add_argument( - 'baseline_file', type=Path, - help='file with baseline IDs') + 'baseline_file', type=Path, help='file with baseline IDs' + ) + check.add_argument('checked_file', type=Path, help='file with checked IDs') check.add_argument( - 'checked_file', type=Path, - help='file with checked IDs') - check.add_argument( - '-x', '--exclude-file', type=Path, - help='file with IDs to exclude from the check') + '-x', + '--exclude-file', + type=Path, + help='file with IDs to exclude from the check', + ) args = parser.parse_args(argv[1:]) if args.verbose: verbose_print = functools.partial(print, file=sys.stderr) else: + def verbose_print(*args, **kwargs): """do nothing""" @@ -162,7 +171,8 @@ def verbose_print(*args, **kwargs): 'The above HTML IDs were removed from the documentation, ' + 'resulting in broken links. Please add them back.', sep='\n', - file=sys.stderr) + file=sys.stderr, + ) if args.exclude_file: print(f'Alternatively, add them to {args.exclude_file}.') From 99c82d16f9de3c3ed484920455bb8158b2e3bc23 Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 20 Feb 2026 17:14:06 +0100 Subject: [PATCH 4/5] Only get the result once --- Doc/tools/check-html-ids.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py index eb8cc661ff15df..578b0ab2663650 100644 --- a/Doc/tools/check-html-ids.py +++ b/Doc/tools/check-html-ids.py @@ -61,7 +61,7 @@ def gather_ids(htmldir, *, verbose_print): for relative_path, future in tasks.items(): verbose_print(relative_path) ids = future.result() - ids_by_page[str(relative_path)] = future.result() + ids_by_page[str(relative_path)] = ids verbose_print(f' - {len(ids)} ids found') common = set.intersection(*ids_by_page.values()) From ac992ee0cf34ac2e33a942c4a501ff4d8c83a40d Mon Sep 17 00:00:00 2001 From: Petr Viktorin Date: Fri, 20 Feb 2026 17:13:04 +0100 Subject: [PATCH 5/5] Set encoding='utf-8' to ease backports --- Doc/tools/check-html-ids.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py index 578b0ab2663650..8e8e0a581df72d 100644 --- a/Doc/tools/check-html-ids.py +++ b/Doc/tools/check-html-ids.py @@ -34,7 +34,7 @@ def handle_starttag(self, tag, attrs): def get_ids_from_file(path): ids = set() gatherer = IDGatherer(ids) - with path.open() as file: + with path.open(encoding='utf-8') as file: while chunk := file.read(4096): gatherer.feed(chunk) return ids @@ -145,7 +145,7 @@ def verbose_print(*args, **kwargs): ids = gather_ids(args.htmldir, verbose_print=verbose_print) if args.outfile is None: args.outfile = args.htmldir / 'html-ids.json.gz' - with gzip.open(args.outfile, 'wt') as zfile: + with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile: json.dump({'ids_by_page': ids}, zfile) if args.command == 'check': @@ -155,7 +155,7 @@ def verbose_print(*args, **kwargs): checked = json.load(zfile)['ids_by_page'] excluded = set() if args.exclude_file: - with open(args.exclude_file) as file: + with open(args.exclude_file, encoding='utf-8') as file: for line in file: line = line.strip() if line and not line.startswith('#'):