1  #!/usr/bin/env python3
       2  """
       3  Check the output of running Sphinx in nit-picky mode (missing references).
       4  """
       5  from __future__ import annotations
       6  
       7  import argparse
       8  import itertools
       9  import os
      10  import re
      11  import subprocess
      12  import sys
      13  from pathlib import Path
      14  from typing import TextIO
      15  
# Exclude these whether they're dirty or clean,
# because they trigger a rebuild of dirty files.
# fail_if_regression() subtracts this set from the files that must be clean,
# so entries are repo-root-relative paths written with forward slashes.
EXCLUDE_FILES = {
    "Doc/whatsnew/changelog.rst",
}
      21  
# Subdirectories of Doc/ to exclude.
# fail_if_regression() compares these names against the first path component
# below Doc/ of every .rst file it finds, and skips the file on a match.
EXCLUDE_SUBDIRS = {
    ".env",
    ".venv",
    "env",
    "includes",
    "venv",
}
      30  
# Regex pattern to match the parts of a Sphinx warning, i.e. a line shaped
# like "<file>:<line>: WARNING: <msg>", exposed as the named groups
# "file", "line" and "msg".
# The optional "X:\" / "X:/" prefix inside "file" lets a Windows drive letter
# through even though the rest of the path may not contain a colon.
WARNING_PATTERN = re.compile(
    r"(?P<file>([A-Za-z]:[\\/])?[^:]+):(?P<line>\d+): WARNING: (?P<msg>.+)"
)
      35  
# Regex pattern to match the line numbers in a Git unified diff, taken from
# hunk headers such as "@@ -12,3 +14,2 @@".
# "linea"/"removed" describe the old side and "lineb"/"added" the new side.
# Both counts are optional groups, so callers must supply a default for them.
# re.MULTILINE makes "^" match at the start of every line of the diff text.
DIFF_PATTERN = re.compile(
    r"^@@ -(?P<linea>\d+)(?:,(?P<removed>\d+))? \+(?P<lineb>\d+)(?:,(?P<added>\d+))? @@",
    flags=re.MULTILINE,
)
      41  
      42  
      43  def get_diff_files(ref_a: str, ref_b: str, filter_mode: str = "") -> set[Path]:
      44      """List the files changed between two Git refs, filtered by change type."""
      45      added_files_result = subprocess.run(
      46          [
      47              "git",
      48              "diff",
      49              f"--diff-filter={filter_mode}",
      50              "--name-only",
      51              f"{ref_a}...{ref_b}",
      52              "--",
      53          ],
      54          stdout=subprocess.PIPE,
      55          check=True,
      56          text=True,
      57          encoding="UTF-8",
      58      )
      59  
      60      added_files = added_files_result.stdout.strip().split("\n")
      61      return {Path(file.strip()) for file in added_files if file.strip()}
      62  
      63  
      64  def get_diff_lines(ref_a: str, ref_b: str, file: Path) -> list[int]:
      65      """List the lines changed between two Git refs for a specific file."""
      66      diff_output = subprocess.run(
      67          [
      68              "git",
      69              "diff",
      70              "--unified=0",
      71              f"{ref_a}...{ref_b}",
      72              "--",
      73              str(file),
      74          ],
      75          stdout=subprocess.PIPE,
      76          check=True,
      77          text=True,
      78          encoding="UTF-8",
      79      )
      80  
      81      # Scrape line offsets + lengths from diff and convert to line numbers
      82      line_matches = DIFF_PATTERN.finditer(diff_output.stdout)
      83      # Removed and added line counts are 1 if not printed
      84      line_match_values = [
      85          line_match.groupdict(default=1) for line_match in line_matches
      86      ]
      87      line_ints = [
      88          (int(match_value["lineb"]), int(match_value["added"]))
      89          for match_value in line_match_values
      90      ]
      91      line_ranges = [
      92          range(line_b, line_b + added) for line_b, added in line_ints
      93      ]
      94      line_numbers = list(itertools.chain(*line_ranges))
      95  
      96      return line_numbers
      97  
      98  
      99  def get_para_line_numbers(file_obj: TextIO) -> list[list[int]]:
     100      """Get the line numbers of text in a file object, grouped by paragraph."""
     101      paragraphs = []
     102      prev_line = None
     103      for lineno, line in enumerate(file_obj):
     104          lineno = lineno + 1
     105          if prev_line is None or (line.strip() and not prev_line.strip()):
     106              paragraph = [lineno - 1]
     107              paragraphs.append(paragraph)
     108          paragraph.append(lineno)
     109          prev_line = line
     110      return paragraphs
     111  
     112  
     113  def filter_and_parse_warnings(
     114      warnings: list[str], files: set[Path]
     115  ) -> list[re.Match[str]]:
     116      """Get the warnings matching passed files and parse them with regex."""
     117      filtered_warnings = [
     118          warning
     119          for warning in warnings
     120          if any(str(file) in warning for file in files)
     121      ]
     122      warning_matches = [
     123          WARNING_PATTERN.fullmatch(warning.strip())
     124          for warning in filtered_warnings
     125      ]
     126      non_null_matches = [warning for warning in warning_matches if warning]
     127      return non_null_matches
     128  
     129  
     130  def filter_warnings_by_diff(
     131      warnings: list[re.Match[str]], ref_a: str, ref_b: str, file: Path
     132  ) -> list[re.Match[str]]:
     133      """Filter the passed per-file warnings to just those on changed lines."""
     134      diff_lines = get_diff_lines(ref_a, ref_b, file)
     135      with file.open(encoding="UTF-8") as file_obj:
     136          paragraphs = get_para_line_numbers(file_obj)
     137      touched_paras = [
     138          para_lines
     139          for para_lines in paragraphs
     140          if set(diff_lines) & set(para_lines)
     141      ]
     142      touched_para_lines = set(itertools.chain(*touched_paras))
     143      warnings_infile = [
     144          warning for warning in warnings if str(file) in warning["file"]
     145      ]
     146      warnings_touched = [
     147          warning
     148          for warning in warnings_infile
     149          if int(warning["line"]) in touched_para_lines
     150      ]
     151      return warnings_touched
     152  
     153  
     154  def process_touched_warnings(
     155      warnings: list[str], ref_a: str, ref_b: str
     156  ) -> list[re.Match[str]]:
     157      """Filter a list of Sphinx warnings to those affecting touched lines."""
     158      added_files, modified_files = tuple(
     159          get_diff_files(ref_a, ref_b, filter_mode=mode) for mode in ("A", "M")
     160      )
     161  
     162      warnings_added = filter_and_parse_warnings(warnings, added_files)
     163      warnings_modified = filter_and_parse_warnings(warnings, modified_files)
     164  
     165      modified_files_warned = {
     166          file
     167          for file in modified_files
     168          if any(str(file) in warning["file"] for warning in warnings_modified)
     169      }
     170  
     171      warnings_modified_touched = [
     172          filter_warnings_by_diff(warnings_modified, ref_a, ref_b, file)
     173          for file in modified_files_warned
     174      ]
     175      warnings_touched = warnings_added + list(
     176          itertools.chain(*warnings_modified_touched)
     177      )
     178  
     179      return warnings_touched
     180  
     181  
     182  def annotate_diff(
     183      warnings: list[str], ref_a: str = "main", ref_b: str = "HEAD"
     184  ) -> None:
     185      """
     186      Convert Sphinx warning messages to GitHub Actions for changed paragraphs.
     187  
     188      Converts lines like:
     189          .../Doc/library/cgi.rst:98: WARNING: reference target not found
     190      to:
     191          ::warning file=.../Doc/library/cgi.rst,line=98::reference target not found
     192  
     193      See:
     194      https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#setting-a-warning-message
     195      """
     196      warnings_touched = process_touched_warnings(warnings, ref_a, ref_b)
     197      print("Emitting doc warnings matching modified lines:")
     198      for warning in warnings_touched:
     199          print("::warning file={file},line={line}::{msg}".format_map(warning))
     200          print(warning[0])
     201      if not warnings_touched:
     202          print("None")
     203  
     204  
     205  def fail_if_regression(
     206      warnings: list[str], files_with_expected_nits: set[str], files_with_nits: set[str]
     207  ) -> int:
     208      """
     209      Ensure some files always pass Sphinx nit-picky mode (no missing references).
     210      These are files which are *not* in .nitignore.
     211      """
     212      all_rst = {
     213          str(rst)
     214          for rst in Path("Doc/").rglob("*.rst")
     215          if rst.parts[1] not in EXCLUDE_SUBDIRS
     216      }
     217      should_be_clean = all_rst - files_with_expected_nits - EXCLUDE_FILES
     218      problem_files = sorted(should_be_clean & files_with_nits)
     219      if problem_files:
     220          print("\nError: must not contain warnings:\n")
     221          for filename in problem_files:
     222              print(filename)
     223              for warning in warnings:
     224                  if filename in warning:
     225                      if match := WARNING_PATTERN.fullmatch(warning):
     226                          print("  {line}: {msg}".format_map(match))
     227          return -1
     228      return 0
     229  
     230  
     231  def fail_if_improved(
     232      files_with_expected_nits: set[str], files_with_nits: set[str]
     233  ) -> int:
     234      """
     235      We may have fixed warnings in some files so that the files are now completely clean.
     236      Good news! Let's add them to .nitignore to prevent regression.
     237      """
     238      files_with_no_nits = files_with_expected_nits - files_with_nits
     239      if files_with_no_nits:
     240          print("\nCongratulations! You improved:\n")
     241          for filename in sorted(files_with_no_nits):
     242              print(filename)
     243          print("\nPlease remove from Doc/tools/.nitignore\n")
     244          return -1
     245      return 0
     246  
     247  
     248  def main(argv: list[str] | None = None) -> int:
     249      parser = argparse.ArgumentParser()
     250      parser.add_argument(
     251          "--annotate-diff",
     252          nargs="*",
     253          metavar=("BASE_REF", "HEAD_REF"),
     254          help="Add GitHub Actions annotations on the diff for warnings on "
     255          "lines changed between the given refs (main and HEAD, by default)",
     256      )
     257      parser.add_argument(
     258          "--fail-if-regression",
     259          action="store_true",
     260          help="Fail if known-good files have warnings",
     261      )
     262      parser.add_argument(
     263          "--fail-if-improved",
     264          action="store_true",
     265          help="Fail if new files with no nits are found",
     266      )
     267  
     268      args = parser.parse_args(argv)
     269      if args.annotate_diff is not None and len(args.annotate_diff) > 2:
     270          parser.error(
     271              "--annotate-diff takes between 0 and 2 ref args, not "
     272              f"{len(args.annotate_diff)} {tuple(args.annotate_diff)}"
     273          )
     274      exit_code = 0
     275  
     276      wrong_directory_msg = "Must run this script from the repo root"
     277      assert Path("Doc").exists() and Path("Doc").is_dir(), wrong_directory_msg
     278  
     279      with Path("Doc/sphinx-warnings.txt").open(encoding="UTF-8") as f:
     280          warnings = f.read().splitlines()
     281  
     282      cwd = str(Path.cwd()) + os.path.sep
     283      files_with_nits = {
     284          warning.removeprefix(cwd).split(":")[0]
     285          for warning in warnings
     286          if "Doc/" in warning
     287      }
     288  
     289      with Path("Doc/tools/.nitignore").open(encoding="UTF-8") as clean_files:
     290          files_with_expected_nits = {
     291              filename.strip()
     292              for filename in clean_files
     293              if filename.strip() and not filename.startswith("#")
     294          }
     295  
     296      if args.annotate_diff is not None:
     297          annotate_diff(warnings, *args.annotate_diff)
     298  
     299      if args.fail_if_regression:
     300          exit_code += fail_if_regression(
     301              warnings, files_with_expected_nits, files_with_nits
     302          )
     303  
     304      if args.fail_if_improved:
     305          exit_code += fail_if_improved(files_with_expected_nits, files_with_nits)
     306  
     307      return exit_code
     308  
     309  
     310  if __name__ == "__main__":
     311      sys.exit(main())