1  #! /usr/bin/env python3
       2  # This script generates Lib/re/_casefix.py.
       3  
       4  import collections
       5  import sys
       6  import unicodedata
       7  
# Path of this script as recorded in the "Auto-generated by" header line of
# the generated file (it is interpolated into re_casefix_template).
SCRIPT_NAME = 'Tools/build/generate_re_casefix.py'
       9  
      10  def update_file(file, content):
      11      try:
      12          with open(file, 'r', encoding='utf-8') as fobj:
      13              if fobj.read() == content:
      14                  return False
      15      except (OSError, ValueError):
      16          pass
      17      with open(file, 'w', encoding='utf-8') as fobj:
      18          fobj.write(content)
      19      return True
      20  
      21  re_casefix_template = f"""\
      22  # Auto-generated by {SCRIPT_NAME}.
      23  
      24  # Maps the code of lowercased character to codes of different lowercased
      25  # characters which have the same uppercase.
      26  _EXTRA_CASES = {
      27  %s
      28  }
      29  """
      30  
      31  def uname(i):
      32      return unicodedata.name(chr(i), r'U+%04X' % i)
      33  
      34  class ESC[4;38;5;81mhexint(ESC[4;38;5;149mint):
      35      def __repr__(self):
      36          return '%#06x' % self
      37  
      38  def alpha(i):
      39      c = chr(i)
      40      return c if c.isalpha() else ascii(c)[1:-1]
      41  
      42  
      43  def main(outfile='Lib/re/_casefix.py'):
      44      # Find sets of characters which have the same uppercase.
      45      equivalent_chars = collections.defaultdict(str)
      46      for c in map(chr, range(sys.maxunicode + 1)):
      47          equivalent_chars[c.upper()] += c
      48      equivalent_chars = [t for t in equivalent_chars.values() if len(t) > 1]
      49  
      50      # List of codes of lowercased characters which have the same uppercase.
      51      equivalent_lower_codes = [sorted(t)
      52                                for s in equivalent_chars
      53                                for t in [set(ord(c.lower()) for c in s)]
      54                                if len(t) > 1]
      55  
      56      bad_codes = []
      57      for t in equivalent_lower_codes:
      58          for i in t:
      59              if i > 0xffff:
      60                  bad_codes.extend(t)
      61                  try:
      62                      bad_codes.append(ord(chr(i).upper()))
      63                  except (ValueError, TypeError):
      64                      pass
      65                  break
      66      if bad_codes:
      67          print('Case-insensitive matching may not work correctly for character:',
      68                file=sys.stderr)
      69          for i in sorted(bad_codes):
      70              print("  '%s' (U+%04x, %s)" % (alpha(i), i, uname(i)),
      71                    file=sys.stderr)
      72          sys.exit(1)
      73  
      74      mapping = {i: tuple(j for j in t if i != j)
      75                 for t in equivalent_lower_codes
      76                 for i in t}
      77  
      78      items = []
      79      for i, t in sorted(mapping.items()):
      80          items.append('    # %s: %s' % (
      81              uname(i),
      82              ', '.join(map(uname, t)),
      83          ))
      84          items.append("    %r: %r, # '%s': '%s'" % (
      85              hexint(i),
      86              tuple(map(hexint, t)),
      87              alpha(i),
      88              ''.join(map(alpha, t)),
      89          ))
      90  
      91      update_file(outfile, re_casefix_template % '\n'.join(items))
      92  
      93  
      94  if __name__ == '__main__':
      95      import sys
      96      main(*sys.argv[1:])