1 #!/usr/bin/env python3
2 """
3 Check the output of running Sphinx in nit-picky mode (missing references).
4 """
5 from __future__ import annotations
6
7 import argparse
8 import itertools
9 import os
10 import re
11 import subprocess
12 import sys
13 from pathlib import Path
14 from typing import TextIO
15
# Files left out of the checks whether they are dirty or clean,
# since they trigger a rebuild of dirty files.
EXCLUDE_FILES = {"Doc/whatsnew/changelog.rst"}
21
# Names of the subdirectories of Doc/ that are left out.
EXCLUDE_SUBDIRS = {".env", ".venv", "env", "includes", "venv"}
30
# Splits a Sphinx warning line into named groups: "file" (optionally
# starting with a drive letter such as "C:\"), "line" and "msg".
WARNING_PATTERN = re.compile(
    r"(?P<file>([A-Za-z]:[\\/])?[^:]+)"
    r":(?P<line>\d+)"
    r": WARNING: (?P<msg>.+)"
)
35
# Matches each hunk header of a Git unified diff and captures the old
# start line ("linea"), old count ("removed"), new start line ("lineb")
# and new count ("added"); both counts are optional in the header.
DIFF_PATTERN = re.compile(
    r"^@@ -(?P<linea>\d+)(?:,(?P<removed>\d+))?"
    r" \+(?P<lineb>\d+)(?:,(?P<added>\d+))? @@",
    flags=re.MULTILINE,
)
41
42
def get_diff_files(ref_a: str, ref_b: str, filter_mode: str = "") -> set[Path]:
    """
    Return the paths Git reports as changed between two refs.

    Runs ``git diff --name-only`` on the three-dot range ``ref_a...ref_b``
    with ``--diff-filter`` set to *filter_mode*, and turns every non-blank
    line of its output into a ``Path``.

    Raises ``subprocess.CalledProcessError`` if Git exits with a non-zero
    status.
    """
    command = [
        "git",
        "diff",
        f"--diff-filter={filter_mode}",
        "--name-only",
        f"{ref_a}...{ref_b}",
        "--",
    ]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        check=True,
        text=True,
        encoding="UTF-8",
    )

    changed: set[Path] = set()
    for raw_name in completed.stdout.strip().split("\n"):
        name = raw_name.strip()
        # Empty output still yields one empty string from split(); skip it.
        if name:
            changed.add(Path(name))
    return changed
62
63
def get_diff_lines(ref_a: str, ref_b: str, file: Path) -> list[int]:
    """
    Return the line numbers of *file* that changed between two Git refs.

    Runs ``git diff --unified=0`` on the three-dot range ``ref_a...ref_b``
    for *file* and expands the "+start,count" part of every hunk header
    into the individual line numbers ``start .. start + count - 1``.

    Raises ``subprocess.CalledProcessError`` if Git exits with a non-zero
    status.
    """
    command = [
        "git",
        "diff",
        "--unified=0",
        f"{ref_a}...{ref_b}",
        "--",
        str(file),
    ]
    completed = subprocess.run(
        command,
        stdout=subprocess.PIPE,
        check=True,
        text=True,
        encoding="UTF-8",
    )

    changed_lines: list[int] = []
    for hunk in DIFF_PATTERN.finditer(completed.stdout):
        # Git omits a count from the hunk header when it is 1.
        fields = hunk.groupdict(default=1)
        first = int(fields["lineb"])
        count = int(fields["added"])
        changed_lines.extend(range(first, first + count))
    return changed_lines
97
98
def get_para_line_numbers(file_obj: TextIO) -> list[list[int]]:
    """
    Group the 1-based line numbers of *file_obj* by paragraph.

    A new group starts at the first line of the file and at every
    non-blank line that follows a blank one.  Each group is seeded with
    the number of the line just before it (0 for the first group), and
    blank lines are appended to the group that precedes them.
    """
    groups: list[list[int]] = []
    previous_was_blank = None  # None until the first line has been read
    for number, text in enumerate(file_obj, start=1):
        is_blank = not text.strip()
        starts_group = previous_was_blank is None or (
            previous_was_blank and not is_blank
        )
        if starts_group:
            groups.append([number - 1])
        groups[-1].append(number)
        previous_was_blank = is_blank
    return groups
111
112
def filter_and_parse_warnings(
    warnings: list[str], files: set[Path]
) -> list[re.Match[str]]:
    """
    Parse the warnings that mention any of the passed files.

    A warning is kept when the string form of at least one path in *files*
    occurs in it and the stripped warning fully matches WARNING_PATTERN;
    the resulting match objects are returned in input order.
    """
    wanted_names = [str(path) for path in files]
    parsed: list[re.Match[str]] = []
    for text in warnings:
        if not any(name in text for name in wanted_names):
            continue
        found = WARNING_PATTERN.fullmatch(text.strip())
        # Lines that mention a file but are not in warning format are dropped.
        if found:
            parsed.append(found)
    return parsed
128
129
def filter_warnings_by_diff(
    warnings: list[re.Match[str]], ref_a: str, ref_b: str, file: Path
) -> list[re.Match[str]]:
    """
    Filter the passed warnings to those in paragraphs of *file* that changed.

    The changed line numbers come from get_diff_lines() and the paragraph
    grouping from get_para_line_numbers().  Every paragraph containing at
    least one changed line counts as touched, and a warning is kept when
    the string form of *file* occurs in its "file" group and its "line"
    group is one of the touched paragraphs' line numbers.
    """
    # Build the set of changed lines once rather than once per paragraph.
    changed_lines = set(get_diff_lines(ref_a, ref_b, file))
    with file.open(encoding="UTF-8") as file_obj:
        paragraphs = get_para_line_numbers(file_obj)

    touched_para_lines: set[int] = set()
    for para_lines in paragraphs:
        if not changed_lines.isdisjoint(para_lines):
            touched_para_lines.update(para_lines)

    file_name = str(file)
    return [
        warning
        for warning in warnings
        if file_name in warning["file"]
        and int(warning["line"]) in touched_para_lines
    ]
152
153
def process_touched_warnings(
    warnings: list[str], ref_a: str, ref_b: str
) -> list[re.Match[str]]:
    """
    Reduce a list of Sphinx warnings to those affecting touched lines.

    Warnings in files added between the two refs are all kept.  Warnings
    in modified files are kept only when filter_warnings_by_diff() reports
    them as lying in a changed paragraph.
    """
    added_files = get_diff_files(ref_a, ref_b, filter_mode="A")
    modified_files = get_diff_files(ref_a, ref_b, filter_mode="M")

    added_matches = filter_and_parse_warnings(warnings, added_files)
    modified_matches = filter_and_parse_warnings(warnings, modified_files)

    # Only diff the modified files that actually have a parsed warning.
    warned_files = set()
    for path in modified_files:
        name = str(path)
        if any(name in match["file"] for match in modified_matches):
            warned_files.add(path)

    touched = list(added_matches)
    for path in warned_files:
        touched.extend(
            filter_warnings_by_diff(modified_matches, ref_a, ref_b, path)
        )
    return touched
180
181
def annotate_diff(
    warnings: list[str], ref_a: str = "main", ref_b: str = "HEAD"
) -> None:
    """
    Print GitHub Actions annotations for Sphinx warnings in changed paragraphs.

    Each warning selected by process_touched_warnings() is printed twice:
    first as a workflow command, then as the original warning text.
    A line such as:
        .../Doc/library/cgi.rst:98: WARNING: reference target not found
    becomes:
        ::warning file=.../Doc/library/cgi.rst,line=98::reference target not found

    "None" is printed when no warning was selected.

    See:
    https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-a-warning-message
    """
    touched = process_touched_warnings(warnings, ref_a, ref_b)
    print("Emitting doc warnings matching modified lines:")
    if not touched:
        print("None")
        return

    template = "::warning file={file},line={line}::{msg}"
    for match in touched:
        # The match object is used as the mapping for the named fields.
        print(template.format_map(match))
        print(match[0])
203
204
def fail_if_regression(
    warnings: list[str], files_with_expected_nits: set[str], files_with_nits: set[str]
) -> int:
    """
    Check that files expected to be clean have no Sphinx nit-picky warnings.

    The files expected to be clean are all ``*.rst`` files under ``Doc/``
    whose first path component below ``Doc`` is not in EXCLUDE_SUBDIRS,
    minus *files_with_expected_nits* and EXCLUDE_FILES.  If any of them is
    in *files_with_nits*, each is printed together with its parsed
    warnings and -1 is returned; otherwise 0 is returned.
    """
    all_rst = set()
    for rst in Path("Doc/").rglob("*.rst"):
        if rst.parts[1] in EXCLUDE_SUBDIRS:
            continue
        all_rst.add(str(rst))

    should_be_clean = all_rst - files_with_expected_nits - EXCLUDE_FILES
    problem_files = sorted(should_be_clean & files_with_nits)
    if not problem_files:
        return 0

    print("\nError: must not contain warnings:\n")
    for filename in problem_files:
        print(filename)
        for warning in warnings:
            if filename not in warning:
                continue
            match = WARNING_PATTERN.fullmatch(warning)
            if match:
                print(" {line}: {msg}".format_map(match))
    return -1
229
230
def fail_if_improved(
    files_with_expected_nits: set[str], files_with_nits: set[str]
) -> int:
    """
    Report files that were expected to have warnings but no longer do.

    Such files are now completely clean, so the printed message asks for
    them to be removed from Doc/tools/.nitignore.  Returns -1 when at
    least one such file exists, otherwise 0.
    """
    now_clean = sorted(files_with_expected_nits - files_with_nits)
    if not now_clean:
        return 0

    print("\nCongratulations! You improved:\n")
    for name in now_clean:
        print(name)
    print("\nPlease remove from Doc/tools/.nitignore\n")
    return -1
246
247
def main(argv: list[str] | None = None) -> int:
    """
    Run the selected checks on Doc/sphinx-warnings.txt.

    *argv* is the list of command-line arguments; None makes argparse read
    them from ``sys.argv``.  The script must be run from a directory that
    contains ``Doc``; otherwise SystemExit is raised with an explanatory
    message.

    Returns the sum of the values returned by the requested
    ``--fail-if-regression`` and ``--fail-if-improved`` checks (0 when
    neither reports a problem).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--annotate-diff",
        nargs="*",
        metavar=("BASE_REF", "HEAD_REF"),
        help="Add GitHub Actions annotations on the diff for warnings on "
        "lines changed between the given refs (main and HEAD, by default)",
    )
    parser.add_argument(
        "--fail-if-regression",
        action="store_true",
        help="Fail if known-good files have warnings",
    )
    parser.add_argument(
        "--fail-if-improved",
        action="store_true",
        help="Fail if new files with no nits are found",
    )

    args = parser.parse_args(argv)
    if args.annotate_diff is not None and len(args.annotate_diff) > 2:
        parser.error(
            "--annotate-diff takes between 0 and 2 ref args, not "
            f"{len(args.annotate_diff)} {tuple(args.annotate_diff)}"
        )
    exit_code = 0

    # Checked explicitly rather than with assert, which is stripped when
    # Python runs with -O.  A string SystemExit prints the message to
    # stderr and exits with status 1.
    wrong_directory_msg = "Must run this script from the repo root"
    if not Path("Doc").is_dir():
        raise SystemExit(wrong_directory_msg)

    with Path("Doc/sphinx-warnings.txt").open(encoding="UTF-8") as f:
        warnings = f.read().splitlines()

    # Drop the absolute prefix so the names are relative paths, then keep
    # only the part before the first colon.
    cwd = str(Path.cwd()) + os.path.sep
    files_with_nits = {
        warning.removeprefix(cwd).split(":")[0]
        for warning in warnings
        if "Doc/" in warning
    }

    with Path("Doc/tools/.nitignore").open(encoding="UTF-8") as clean_files:
        # Test the stripped entry so indented "#" comments are skipped too.
        files_with_expected_nits = {
            entry
            for entry in map(str.strip, clean_files)
            if entry and not entry.startswith("#")
        }

    if args.annotate_diff is not None:
        annotate_diff(warnings, *args.annotate_diff)

    if args.fail_if_regression:
        exit_code += fail_if_regression(
            warnings, files_with_expected_nits, files_with_nits
        )

    if args.fail_if_improved:
        exit_code += fail_if_improved(files_with_expected_nits, files_with_nits)

    return exit_code
308
309
# When run as a script, use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())