(root)/
gcc-13.2.0/
contrib/
check-internal-format-escaping.py
       1  #!/usr/bin/env python3
       2  #
       3  # Check gcc.pot file for stylistic issues as described in
       4  # https://gcc.gnu.org/onlinedocs/gccint/Guidelines-for-Diagnostics.html,
       5  # especially in gcc-internal-format messages.
       6  #
       7  # This file is part of GCC.
       8  #
       9  # GCC is free software; you can redistribute it and/or modify it under
      10  # the terms of the GNU General Public License as published by the Free
      11  # Software Foundation; either version 3, or (at your option) any later
      12  # version.
      13  #
      14  # GCC is distributed in the hope that it will be useful, but WITHOUT ANY
      15  # WARRANTY; without even the implied warranty of MERCHANTABILITY or
      16  # FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      17  # for more details.
      18  #
      19  # You should have received a copy of the GNU General Public License
      20  # along with GCC; see the file COPYING3.  If not see
      21  # <http://www.gnu.org/licenses/>.
      22  
      23  import argparse
      24  import re
      25  from collections import Counter
      26  from typing import Dict, Match
      27  
      28  import polib
      29  
# Counts how often each diagnostic text has been reported.  Incremented
# by warn() and printed as a summary by main().
seen_warnings = Counter()
      31  
      32  
      33  def location(msg: polib.POEntry):
      34      if msg.occurrences:
      35          occ = msg.occurrences[0]
      36          return f'{occ[0]}:{occ[1]}'
      37      return '<unknown location>'
      38  
      39  
      40  def warn(msg: polib.POEntry,
      41           diagnostic_id: str, diagnostic: str, include_msgid=True):
      42      """
      43      To suppress a warning for a particular message,
      44      add a line "#, gcclint:ignore:{diagnostic_id}" to the message.
      45      """
      46  
      47      if f'gcclint:ignore:{diagnostic_id}' in msg.flags:
      48          return
      49  
      50      seen_warnings[diagnostic] += 1
      51  
      52      if include_msgid:
      53          print(f'{location(msg)}: {diagnostic} in {repr(msg.msgid)}')
      54      else:
      55          print(f'{location(msg)}: {diagnostic}')
      56  
      57  
      58  def lint_gcc_internal_format(msg: polib.POEntry):
      59      """
      60      Checks a single message that has the gcc-internal-format. These
      61      messages use a variety of placeholders like %qs, %<quotes%> and
      62      %q#E.
      63      """
      64  
      65      msgid: str = msg.msgid
      66  
      67      def outside_quotes(m: Match[str]):
      68          before = msgid[:m.start(0)]
      69          return before.count('%<') == before.count('%>')
      70  
      71      def lint_matching_placeholders():
      72          """
      73          Warns when literal values in placeholders are not exactly equal
      74          in the translation. This can happen when doing copy-and-paste
      75          translations of similar messages.
      76  
      77          To avoid these mismatches in the first place,
      78          structurally equal messages are found by
      79          lint_diagnostics_differing_only_in_placeholders.
      80  
      81          This check only applies when checking a finished translation
      82          such as de.po, not gcc.pot.
      83          """
      84  
      85          if not msg.translated():
      86              return
      87  
      88          in_msgid = re.findall('%<[^%]+%>', msgid)
      89          in_msgstr = re.findall('%<[^%]+%>', msg.msgstr)
      90  
      91          if set(in_msgid) != set(in_msgstr):
      92              warn(msg,
      93                   'placeholder-mismatch',
      94                   f'placeholder mismatch: msgid has {in_msgid}, '
      95                   f'msgstr has {in_msgstr}',
      96                   include_msgid=False)
      97  
      98      def lint_option_outside_quotes():
      99          for match in re.finditer(r'\S+', msgid):
     100              part = match.group()
     101              if not outside_quotes(match):
     102                  continue
     103  
     104              if part.startswith('-'):
     105                  if len(part) >= 2 and part[1].isalpha():
     106                      if part == '-INF':
     107                          continue
     108  
     109                      warn(msg,
     110                           'option-outside-quotes',
     111                           'command line option outside %<quotes%>')
     112  
     113              if part.startswith('__builtin_'):
     114                  warn(msg,
     115                       'builtin-outside-quotes',
     116                       'builtin function outside %<quotes%>')
     117  
     118      def lint_plain_apostrophe():
     119          for match in re.finditer("[^%]'", msgid):
     120              if outside_quotes(match):
     121                  warn(msg, 'apostrophe', 'apostrophe without leading %')
     122  
     123      def lint_space_before_quote():
     124          """
     125          A space before %< is often the result of string literals that
     126          are joined by the C compiler and neither literal has a space
     127          to separate the words.
     128          """
     129  
     130          for match in re.finditer('(.?[a-zA-Z0-9])%<', msgid):
     131              if match.group(1) != '%s':
     132                  warn(msg,
     133                       'no-space-before-quote',
     134                       '%< directly following a letter or digit')
     135  
     136      def lint_underscore_outside_quotes():
     137          """
     138          An underscore outside of quotes is used in several contexts,
     139          and many of them violate the GCC Guidelines for Diagnostics:
     140  
     141          * names of GCC-internal compiler functions
     142          * names of GCC-internal data structures
     143          * static_cast and the like (which are legitimate)
     144          """
     145  
     146          for match in re.finditer('_', msgid):
     147              if outside_quotes(match):
     148                  warn(msg,
     149                       'underscore-outside-quotes',
     150                       'underscore outside of %<quotes%>')
     151                  return
     152  
     153      def lint_may_not():
     154          """
     155          The term "may not" may either mean "it could be the case"
     156          or "should not". These two different meanings are sometimes
     157          hard to tell apart.
     158          """
     159  
     160          if re.search(r'\bmay not\b', msgid):
     161              warn(msg,
     162                   'ambiguous-may-not',
     163                   'the term "may not" is ambiguous')
     164  
     165      def lint_unbalanced_quotes():
     166          if msgid.count('%<') != msgid.count('%>'):
     167              warn(msg,
     168                   'unbalanced-quotes',
     169                   'unbalanced %< and %> quotes')
     170  
     171          if msg.translated():
     172              if msg.msgstr.count('%<') != msg.msgstr.count('%>'):
     173                  warn(msg,
     174                       'unbalanced-quotes',
     175                       'unbalanced %< and %> quotes')
     176  
     177      def lint_single_space_after_sentence():
     178          """
     179          After a sentence there should be two spaces.
     180          """
     181  
     182          if re.search(r'[.] [A-Z]', msgid):
     183              warn(msg,
     184                   'single-space-after-sentence',
     185                   'single space after sentence')
     186  
     187      def lint_non_canonical_quotes():
     188          """
     189          Catches %<%s%>, which can be written in the shorter form %qs.
     190          """
     191          match = re.search("%<%s%>|'%s'|\"%s\"|`%s'", msgid)
     192          if match:
     193              warn(msg,
     194                   'non-canonical-quotes',
     195                   f'placeholder {match.group()} should be written as %qs')
     196  
     197      lint_option_outside_quotes()
     198      lint_plain_apostrophe()
     199      lint_space_before_quote()
     200      lint_underscore_outside_quotes()
     201      lint_may_not()
     202      lint_unbalanced_quotes()
     203      lint_matching_placeholders()
     204      lint_single_space_after_sentence()
     205      lint_non_canonical_quotes()
     206  
     207  
     208  def lint_diagnostics_differing_only_in_placeholders(po: polib.POFile):
     209      """
     210      Detects messages that are structurally the same, except that they
     211      use different plain strings inside %<quotes%>. These messages can
     212      be merged in order to prevent copy-and-paste mistakes by the
     213      translators.
     214  
     215      See bug 90119.
     216      """
     217  
     218      seen: Dict[str, polib.POEntry] = {}
     219  
     220      for msg in po:
     221          msg: polib.POEntry
     222          msgid = msg.msgid
     223  
     224          normalized = re.sub('%<[^%]+%>', '%qs', msgid)
     225          if normalized not in seen:
     226              seen[normalized] = msg
     227              seen[msgid] = msg
     228              continue
     229  
     230          prev = seen[normalized]
     231          warn(msg,
     232               'same-pattern',
     233               f'same pattern for {repr(msgid)} and '
     234               f'{repr(prev.msgid)} in {location(prev)}',
     235               include_msgid=False)
     236  
     237  
     238  def lint_file(po: polib.POFile):
     239      for msg in po:
     240          msg: polib.POEntry
     241  
     242          if not msg.obsolete and not msg.fuzzy:
     243              if 'gcc-internal-format' in msg.flags:
     244                  lint_gcc_internal_format(msg)
     245  
     246      lint_diagnostics_differing_only_in_placeholders(po)
     247  
     248  
     249  def main():
     250      parser = argparse.ArgumentParser(description='')
     251      parser.add_argument('file', help='pot file')
     252  
     253      args = parser.parse_args()
     254  
     255      po = polib.pofile(args.file)
     256      lint_file(po)
     257  
     258      print()
     259      print('summary:')
     260      for entry in seen_warnings.most_common():
     261          if entry[1] > 1:
     262              print(f'{entry[1]}\t{entry[0]}')
     263  
     264  
     265  if __name__ == '__main__':
     266      main()