From 3aa8db06993c9d7784801e16740bf88f4fdc42a4 Mon Sep 17 00:00:00 2001
From: Petr Viktorin <encukou@gmail.com>
Date: Thu, 5 Feb 2026 16:52:49 +0100
Subject: [PATCH 1/5] Add a tool to check removed HTML anchors

---
 Doc/.ruff.toml              |   1 +
 Doc/Makefile                |   6 ++
 Doc/data/removed-ids.txt    |  93 ++++++++++++++++++++
 Doc/tools/check-html-ids.py | 171 ++++++++++++++++++++++++++++++++++++
 4 files changed, 271 insertions(+)
 create mode 100644 Doc/data/removed-ids.txt
 create mode 100644 Doc/tools/check-html-ids.py

diff --git a/Doc/.ruff.toml b/Doc/.ruff.toml
index 3e676e13c3f41a..3531c586337686 100644
--- a/Doc/.ruff.toml
+++ b/Doc/.ruff.toml
@@ -30,6 +30,7 @@ select = [
 ]
 ignore = [
     "E501",  # Ignore line length errors (we use auto-formatting)
+    "I001",  # Import block is un-sorted or un-formatted
 ]
 
 [format]
diff --git a/Doc/Makefile b/Doc/Makefile
index 4d605980a62904..d39c2fe3c3f22a 100644
--- a/Doc/Makefile
+++ b/Doc/Makefile
@@ -336,3 +336,9 @@ autobuild-stable-html:
 		exit 1;; \
 	esac
 	@$(MAKE) autobuild-dev-html
+
+# Collect HTML IDs into a gzip-compressed JSON document
+.PHONY: html-ids
+html-ids:
+	$(PYTHON) tools/check-html-ids.py collect build/html \
+		-o build/html/html-ids.json.gz
diff --git a/Doc/data/removed-ids.txt b/Doc/data/removed-ids.txt
new file mode 100644
index 00000000000000..8b38b2fd4df64c
--- /dev/null
+++ b/Doc/data/removed-ids.txt
@@ -0,0 +1,93 @@
+# Known removed HTML IDs:
+
+c-api/complex.html: complex-numbers-as-python-objects
+
+c-api/extension-modules.html: initialization-function
+
+c-api/import.html: c.PyImport_ImportModuleNoBlock
+
+c-api/init.html: c.Py_GetExecPrefix
+c-api/init.html: c.Py_GetPath
+c-api/init.html: c.Py_GetPrefix
+c-api/init.html: c.Py_GetProgramFullPath
+c-api/init.html: c.Py_GetProgramName
+c-api/init.html: c.Py_GetPythonHome
+
+c-api/module.html: module-definitions
+c-api/module.html: module-slots
+
+c-api/stable.html: c-api-stability
+
+c-api/sys.html: c.PySys_ResetWarnOptions
+
+c-api/weakref.html: c.PyWeakref_GET_OBJECT
+c-api/weakref.html: c.PyWeakref_GetObject
+
+extending/extending.html: a-simple-example
+extending/extending.html: back-to-the-example
+extending/extending.html: backtoexample
+extending/extending.html: compilation-and-linkage
+extending/extending.html: extending-python-with-c-or-c
+extending/extending.html: extending-simpleexample
+extending/extending.html: intermezzo-errors-and-exceptions
+extending/extending.html: methodtable
+extending/extending.html: the-module-s-method-table-and-initialization-function
+
+extending/index.html: creating-extensions-without-third-party-tools
+
+howto/perf_profiling.html: python-support-for-the-linux-perf-profiler
+
+library/dis.html: opcode-LOAD_CONST_IMMORTAL
+
+library/ftplib.html: ftplib.FTP_TLS.ssl_version
+
+library/http.server.html: cmdoption-http.server-cgi
+library/http.server.html: http.server.CGIHTTPRequestHandler
+library/http.server.html: http.server.CGIHTTPRequestHandler.cgi_directories
+library/http.server.html: http.server.CGIHTTPRequestHandler.do_POST
+
+library/importlib.html: importlib.abc.FileLoader.load_module
+library/importlib.html: importlib.abc.InspectLoader.load_module
+library/importlib.html: importlib.abc.Loader.load_module
+library/importlib.html: importlib.abc.SourceLoader.load_module
+library/importlib.html: importlib.machinery.SourceFileLoader.load_module
+library/importlib.html: importlib.machinery.SourcelessFileLoader.load_module
+
+library/pathlib.html: pathlib.PurePath.is_reserved
+
+library/platform.html: java-platform
+library/platform.html: platform.java_ver
+
+library/profile.html: cmdoption-cProfile-m
+library/profile.html: cmdoption-cProfile-o
+library/profile.html: cmdoption-cProfile-s
+library/profile.html: instant-user-s-manual
+library/profile.html: introduction-to-the-profilers
+library/profile.html: module-cProfile
+library/profile.html: module-pstats
+library/profile.html: profile-and-cprofile-module-reference
+library/profile.html: profile-cli
+library/profile.html: profile-instant
+library/profile.html: profile-stats
+library/profile.html: profiler-introduction
+library/profile.html: pstats.Stats
+library/profile.html: pstats.Stats.add
+library/profile.html: pstats.Stats.dump_stats
+library/profile.html: pstats.Stats.get_stats_profile
+library/profile.html: pstats.Stats.print_callees
+library/profile.html: pstats.Stats.print_callers
+library/profile.html: pstats.Stats.print_stats
+library/profile.html: pstats.Stats.reverse_order
+library/profile.html: pstats.Stats.sort_stats
+library/profile.html: pstats.Stats.strip_dirs
+library/profile.html: the-python-profilers
+library/profile.html: the-stats-class
+
+library/typing.html: typing.no_type_check_decorator
+
+library/wave.html: wave.Wave_read.getmark
+library/wave.html: wave.Wave_read.getmarkers
+
+library/zipimport.html: zipimport.zipimporter.load_module
+
+reference/datamodel.html: module.__cached__
diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py
new file mode 100644
index 00000000000000..9d3cb2463f35b4
--- /dev/null
+++ b/Doc/tools/check-html-ids.py
@@ -0,0 +1,171 @@
+from compression import gzip
+import concurrent.futures
+from pathlib import Path
+import html.parser
+import functools
+import argparse
+import json
+import sys
+import re
+
+
+IGNORED_ID_RE = re.compile(r"""
+    index-\d+
+    | id\d+
+    | [_a-z]+_\d+
+""", re.VERBOSE)
+
+
+class IDGatherer(html.parser.HTMLParser):
+    def __init__(self, ids):
+        super().__init__()
+        self.__ids = ids
+
+    def handle_starttag(self, tag, attrs):
+        # A valueless attribute (<a id>) has value None; skip it and id="".
+        for name, value in attrs:
+            if name == 'id' and value and not IGNORED_ID_RE.fullmatch(value):
+                self.__ids.add(value)
+
+
+def get_ids_from_file(path):
+    ids = set()
+    gatherer = IDGatherer(ids)
+    with path.open() as file:
+        while chunk := file.read(4096):
+            gatherer.feed(chunk)
+    return ids
+
+
+def gather_ids(htmldir, *, verbose_print):
+    """Map each page's relative path to its sorted IDs, minus shared ones."""
+    if not htmldir.joinpath('objects.inv').exists():
+        raise ValueError(f'{htmldir!r} is not a Sphinx HTML output directory')
+
+    # sys._is_gil_enabled is a function; uncalled, it is always truthy.
+    if sys._is_gil_enabled():
+        pool = concurrent.futures.ProcessPoolExecutor()
+    else:
+        pool = concurrent.futures.ThreadPoolExecutor()
+    tasks = {}
+    for path in htmldir.glob('**/*.html'):
+        relative_path = path.relative_to(htmldir)
+        # Paths with a _static or whatsnew component are not scanned.
+        if {'_static', 'whatsnew'}.isdisjoint(relative_path.parts):
+            tasks[relative_path] = pool.submit(get_ids_from_file, path=path)
+
+    ids_by_page = {}
+    for relative_path, future in tasks.items():
+        verbose_print(relative_path)
+        ids = future.result()
+        ids_by_page[str(relative_path)] = future.result()
+        verbose_print(f'    - {len(ids)} ids found')
+
+    common = set.intersection(*ids_by_page.values())
+    verbose_print(f'Filtering out {len(common)} common ids')
+    for key, page_ids in ids_by_page.items():
+        ids_by_page[key] = sorted(page_ids - common)
+
+    return ids_by_page
+
+
+def do_check(baseline, checked, excluded, *, verbose_print):
+    successful = True
+    for name, baseline_ids in sorted(baseline.items()):
+        try:
+            checked_ids = checked[name]
+        except KeyError:
+            successful = False
+            print(f'{name}: (page missing)')
+            print()
+        else:
+            missing_ids = set(baseline_ids) - set(checked_ids)
+            if missing_ids:
+                missing_ids = {
+                    a for a in missing_ids
+                    if not IGNORED_ID_RE.fullmatch(a)
+                    and (name, a) not in excluded
+                }
+            if missing_ids:
+                successful = False
+                for missing_id in sorted(missing_ids):
+                    print(f'{name}: {missing_id}')
+                print()
+    return successful
+
+
+def main(argv):
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '-v', '--verbose', action='store_true',
+        help='print out more information')
+    subparsers = parser.add_subparsers(dest='command', required=True)
+
+    collect = subparsers.add_parser(
+        'collect',
+        help='collect IDs from a set of HTML files')
+    collect.add_argument(
+        'htmldir', type=Path,
+        help='directory with HTML documentation')
+    collect.add_argument(
+        '-o', '--outfile',
+        help='File to save the result in; default <htmldir>/html-ids.json.gz')
+
+    check = subparsers.add_parser(
+        'check',
+        help='check two archives of IDs')
+    check.add_argument(
+        'baseline_file', type=Path,
+        help='file with baseline IDs')
+    check.add_argument(
+        'checked_file', type=Path,
+        help='file with checked IDs')
+    check.add_argument(
+        '-x', '--exclude-file', type=Path,
+        help='file with IDs to exclude from the check')
+
+    args = parser.parse_args(argv[1:])
+
+    if args.verbose:
+        verbose_print = functools.partial(print, file=sys.stderr)
+    else:
+        def verbose_print(*args, **kwargs):
+            """do nothing"""
+
+    if args.command == 'collect':
+        ids = gather_ids(args.htmldir, verbose_print=verbose_print)
+        if args.outfile is None:
+            args.outfile = args.htmldir / 'html-ids.json.gz'
+        with gzip.open(args.outfile, 'wt') as zfile:
+            json.dump({'ids_by_page': ids}, zfile)
+
+    if args.command == 'check':
+        with gzip.open(args.baseline_file) as zfile:
+            baseline = json.load(zfile)['ids_by_page']
+        with gzip.open(args.checked_file) as zfile:
+            checked = json.load(zfile)['ids_by_page']
+        excluded = set()
+        if args.exclude_file:
+            with open(args.exclude_file) as file:
+                for line in file:
+                    line = line.strip()
+                    if line and not line.startswith('#'):
+                        name, sep, excluded_id = line.partition(':')
+                        if sep:
+                            excluded.add((name.strip(), excluded_id.strip()))
+        if do_check(baseline, checked, excluded, verbose_print=verbose_print):
+            verbose_print('All OK')
+        else:
+            sys.stdout.flush()
+            print(
+                'ERROR: Removed IDs found',
+                'The above HTML IDs were removed from the documentation, '
+                + 'resulting in broken links. Please add them back.',
+                sep='\n',
+                file=sys.stderr)
+            if args.exclude_file:
+                print(f'Alternatively, add them to {args.exclude_file}.')
+
+
+if __name__ == '__main__':
+    main(sys.argv)

From c782058f0a5b692eab95faa8030d4a4b05297a8f Mon Sep 17 00:00:00 2001
From: Petr Viktorin <encukou@gmail.com>
Date: Thu, 19 Feb 2026 14:51:47 +0100
Subject: [PATCH 2/5] Remove the list for now; to be added when we start
 checking

---
 Doc/data/removed-ids.txt | 93 ----------------------------------------
 1 file changed, 93 deletions(-)
 delete mode 100644 Doc/data/removed-ids.txt

diff --git a/Doc/data/removed-ids.txt b/Doc/data/removed-ids.txt
deleted file mode 100644
index 8b38b2fd4df64c..00000000000000
--- a/Doc/data/removed-ids.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-# Known removed HTML IDs:
-
-c-api/complex.html: complex-numbers-as-python-objects
-
-c-api/extension-modules.html: initialization-function
-
-c-api/import.html: c.PyImport_ImportModuleNoBlock
-
-c-api/init.html: c.Py_GetExecPrefix
-c-api/init.html: c.Py_GetPath
-c-api/init.html: c.Py_GetPrefix
-c-api/init.html: c.Py_GetProgramFullPath
-c-api/init.html: c.Py_GetProgramName
-c-api/init.html: c.Py_GetPythonHome
-
-c-api/module.html: module-definitions
-c-api/module.html: module-slots
-
-c-api/stable.html: c-api-stability
-
-c-api/sys.html: c.PySys_ResetWarnOptions
-
-c-api/weakref.html: c.PyWeakref_GET_OBJECT
-c-api/weakref.html: c.PyWeakref_GetObject
-
-extending/extending.html: a-simple-example
-extending/extending.html: back-to-the-example
-extending/extending.html: backtoexample
-extending/extending.html: compilation-and-linkage
-extending/extending.html: extending-python-with-c-or-c
-extending/extending.html: extending-simpleexample
-extending/extending.html: intermezzo-errors-and-exceptions
-extending/extending.html: methodtable
-extending/extending.html: the-module-s-method-table-and-initialization-function
-
-extending/index.html: creating-extensions-without-third-party-tools
-
-howto/perf_profiling.html: python-support-for-the-linux-perf-profiler
-
-library/dis.html: opcode-LOAD_CONST_IMMORTAL
-
-library/ftplib.html: ftplib.FTP_TLS.ssl_version
-
-library/http.server.html: cmdoption-http.server-cgi
-library/http.server.html: http.server.CGIHTTPRequestHandler
-library/http.server.html: http.server.CGIHTTPRequestHandler.cgi_directories
-library/http.server.html: http.server.CGIHTTPRequestHandler.do_POST
-
-library/importlib.html: importlib.abc.FileLoader.load_module
-library/importlib.html: importlib.abc.InspectLoader.load_module
-library/importlib.html: importlib.abc.Loader.load_module
-library/importlib.html: importlib.abc.SourceLoader.load_module
-library/importlib.html: importlib.machinery.SourceFileLoader.load_module
-library/importlib.html: importlib.machinery.SourcelessFileLoader.load_module
-
-library/pathlib.html: pathlib.PurePath.is_reserved
-
-library/platform.html: java-platform
-library/platform.html: platform.java_ver
-
-library/profile.html: cmdoption-cProfile-m
-library/profile.html: cmdoption-cProfile-o
-library/profile.html: cmdoption-cProfile-s
-library/profile.html: instant-user-s-manual
-library/profile.html: introduction-to-the-profilers
-library/profile.html: module-cProfile
-library/profile.html: module-pstats
-library/profile.html: profile-and-cprofile-module-reference
-library/profile.html: profile-cli
-library/profile.html: profile-instant
-library/profile.html: profile-stats
-library/profile.html: profiler-introduction
-library/profile.html: pstats.Stats
-library/profile.html: pstats.Stats.add
-library/profile.html: pstats.Stats.dump_stats
-library/profile.html: pstats.Stats.get_stats_profile
-library/profile.html: pstats.Stats.print_callees
-library/profile.html: pstats.Stats.print_callers
-library/profile.html: pstats.Stats.print_stats
-library/profile.html: pstats.Stats.reverse_order
-library/profile.html: pstats.Stats.sort_stats
-library/profile.html: pstats.Stats.strip_dirs
-library/profile.html: the-python-profilers
-library/profile.html: the-stats-class
-
-library/typing.html: typing.no_type_check_decorator
-
-library/wave.html: wave.Wave_read.getmark
-library/wave.html: wave.Wave_read.getmarkers
-
-library/zipimport.html: zipimport.zipimporter.load_module
-
-reference/datamodel.html: module.__cached__

From cbb6c1436a68a344b9eba6bd0aeeaec2737efd68 Mon Sep 17 00:00:00 2001
From: Petr Viktorin <encukou@gmail.com>
Date: Thu, 19 Feb 2026 15:08:21 +0100
Subject: [PATCH 3/5] Decrease readability

---
 Doc/tools/check-html-ids.py | 54 ++++++++++++++++++++++---------------
 1 file changed, 32 insertions(+), 22 deletions(-)

diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py
index 9d3cb2463f35b4..eb8cc661ff15df 100644
--- a/Doc/tools/check-html-ids.py
+++ b/Doc/tools/check-html-ids.py
@@ -9,11 +9,14 @@
 import re
 
 
-IGNORED_ID_RE = re.compile(r"""
+IGNORED_ID_RE = re.compile(
+    r"""
     index-\d+
     | id\d+
     | [_a-z]+_\d+
-""", re.VERBOSE)
+""",
+    re.VERBOSE,
+)
 
 
 class IDGatherer(html.parser.HTMLParser):
@@ -82,7 +85,8 @@ def do_check(baseline, checked, excluded, *, verbose_print):
             missing_ids = set(baseline_ids) - set(checked_ids)
             if missing_ids:
                 missing_ids = {
-                    a for a in missing_ids
+                    a
+                    for a in missing_ids
                     if not IGNORED_ID_RE.fullmatch(a)
                     and (name, a) not in excluded
                 }
@@ -97,38 +101,43 @@ def do_check(baseline, checked, excluded, *, verbose_print):
 def main(argv):
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        '-v', '--verbose', action='store_true',
-        help='print out more information')
+        '-v',
+        '--verbose',
+        action='store_true',
+        help='print out more information',
+    )
     subparsers = parser.add_subparsers(dest='command', required=True)
 
     collect = subparsers.add_parser(
-        'collect',
-        help='collect IDs from a set of HTML files')
+        'collect', help='collect IDs from a set of HTML files'
+    )
     collect.add_argument(
-        'htmldir', type=Path,
-        help='directory with HTML documentation')
+        'htmldir', type=Path, help='directory with HTML documentation'
+    )
     collect.add_argument(
-        '-o', '--outfile',
-        help='File to save the result in; default <htmldir>/html-ids.json.gz')
+        '-o',
+        '--outfile',
+        help='File to save the result in; default <htmldir>/html-ids.json.gz',
+    )
 
-    check = subparsers.add_parser(
-        'check',
-        help='check two archives of IDs')
+    check = subparsers.add_parser('check', help='check two archives of IDs')
     check.add_argument(
-        'baseline_file', type=Path,
-        help='file with baseline IDs')
+        'baseline_file', type=Path, help='file with baseline IDs'
+    )
+    check.add_argument('checked_file', type=Path, help='file with checked IDs')
     check.add_argument(
-        'checked_file', type=Path,
-        help='file with checked IDs')
-    check.add_argument(
-        '-x', '--exclude-file', type=Path,
-        help='file with IDs to exclude from the check')
+        '-x',
+        '--exclude-file',
+        type=Path,
+        help='file with IDs to exclude from the check',
+    )
 
     args = parser.parse_args(argv[1:])
 
     if args.verbose:
         verbose_print = functools.partial(print, file=sys.stderr)
     else:
+
         def verbose_print(*args, **kwargs):
             """do nothing"""
 
@@ -162,7 +171,8 @@ def verbose_print(*args, **kwargs):
                 'The above HTML IDs were removed from the documentation, '
                 + 'resulting in broken links. Please add them back.',
                 sep='\n',
-                file=sys.stderr)
+                file=sys.stderr,
+            )
             if args.exclude_file:
                 print(f'Alternatively, add them to {args.exclude_file}.')
 

From 99c82d16f9de3c3ed484920455bb8158b2e3bc23 Mon Sep 17 00:00:00 2001
From: Petr Viktorin <encukou@gmail.com>
Date: Fri, 20 Feb 2026 17:14:06 +0100
Subject: [PATCH 4/5] Only get the result once

---
 Doc/tools/check-html-ids.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py
index eb8cc661ff15df..578b0ab2663650 100644
--- a/Doc/tools/check-html-ids.py
+++ b/Doc/tools/check-html-ids.py
@@ -61,7 +61,7 @@ def gather_ids(htmldir, *, verbose_print):
     for relative_path, future in tasks.items():
         verbose_print(relative_path)
         ids = future.result()
-        ids_by_page[str(relative_path)] = future.result()
+        ids_by_page[str(relative_path)] = ids
         verbose_print(f'    - {len(ids)} ids found')
 
     common = set.intersection(*ids_by_page.values())

From ac992ee0cf34ac2e33a942c4a501ff4d8c83a40d Mon Sep 17 00:00:00 2001
From: Petr Viktorin <encukou@gmail.com>
Date: Fri, 20 Feb 2026 17:13:04 +0100
Subject: [PATCH 5/5] Set encoding='utf-8' to ease backports

---
 Doc/tools/check-html-ids.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Doc/tools/check-html-ids.py b/Doc/tools/check-html-ids.py
index 578b0ab2663650..8e8e0a581df72d 100644
--- a/Doc/tools/check-html-ids.py
+++ b/Doc/tools/check-html-ids.py
@@ -34,7 +34,7 @@ def handle_starttag(self, tag, attrs):
 def get_ids_from_file(path):
     ids = set()
     gatherer = IDGatherer(ids)
-    with path.open() as file:
+    with path.open(encoding='utf-8') as file:
         while chunk := file.read(4096):
             gatherer.feed(chunk)
     return ids
@@ -145,7 +145,7 @@ def verbose_print(*args, **kwargs):
         ids = gather_ids(args.htmldir, verbose_print=verbose_print)
         if args.outfile is None:
             args.outfile = args.htmldir / 'html-ids.json.gz'
-        with gzip.open(args.outfile, 'wt') as zfile:
+        with gzip.open(args.outfile, 'wt', encoding='utf-8') as zfile:
             json.dump({'ids_by_page': ids}, zfile)
 
     if args.command == 'check':
@@ -155,7 +155,7 @@ def verbose_print(*args, **kwargs):
             checked = json.load(zfile)['ids_by_page']
         excluded = set()
         if args.exclude_file:
-            with open(args.exclude_file) as file:
+            with open(args.exclude_file, encoding='utf-8') as file:
                 for line in file:
                     line = line.strip()
                     if line and not line.startswith('#'):