1  #! /usr/bin/env python3
       2  # This script generates Lib/re/_casefix.py.
       3  
       4  import collections
       5  import re
       6  import sys
       7  import unicodedata
       8  
       9  def update_file(file, content):
      10      try:
      11          with open(file, 'r', encoding='utf-8') as fobj:
      12              if fobj.read() == content:
      13                  return False
      14      except (OSError, ValueError):
      15          pass
      16      with open(file, 'w', encoding='utf-8') as fobj:
      17          fobj.write(content)
      18      return True
      19  
# Skeleton of the generated module.  The single %s placeholder is filled by
# main() with the formatted _EXTRA_CASES entries, joined by newlines.
re_casefix_template = """\
# Auto-generated by Tools/scripts/generate_re_casefix.py.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {
%s
}
"""
      29  
      30  def uname(i):
      31      return unicodedata.name(chr(i), r'U+%04X' % i)
      32  
      33  class ESC[4;38;5;81mhexint(ESC[4;38;5;149mint):
      34      def __repr__(self):
      35          return '%#06x' % self
      36  
      37  def alpha(i):
      38      c = chr(i)
      39      return c if c.isalpha() else ascii(c)[1:-1]
      40  
      41  
      42  def main(outfile='Lib/re/_casefix.py'):
      43      # Find sets of characters which have the same uppercase.
      44      equivalent_chars = collections.defaultdict(str)
      45      for c in map(chr, range(sys.maxunicode + 1)):
      46          equivalent_chars[c.upper()] += c
      47      equivalent_chars = [t for t in equivalent_chars.values() if len(t) > 1]
      48  
      49      # List of codes of lowercased characters which have the same uppercase.
      50      equivalent_lower_codes = [sorted(t)
      51                                for s in equivalent_chars
      52                                for t in [set(ord(c.lower()) for c in s)]
      53                                if len(t) > 1]
      54  
      55      bad_codes = []
      56      for t in equivalent_lower_codes:
      57          for i in t:
      58              if i > 0xffff:
      59                  bad_codes.extend(t)
      60                  try:
      61                      bad_codes.append(ord(chr(i).upper()))
      62                  except (ValueError, TypeError):
      63                      pass
      64                  break
      65      if bad_codes:
      66          print('Case-insensitive matching may not work correctly for character:',
      67                file=sys.stderr)
      68          for i in sorted(bad_codes):
      69              print("  '%s' (U+%04x, %s)" % (alpha(i), i, uname(i)),
      70                    file=sys.stderr)
      71          sys.exit(1)
      72  
      73      mapping = {i: tuple(j for j in t if i != j)
      74                 for t in equivalent_lower_codes
      75                 for i in t}
      76  
      77      items = []
      78      for i, t in sorted(mapping.items()):
      79          items.append('    # %s: %s' % (
      80              uname(i),
      81              ', '.join(map(uname, t)),
      82          ))
      83          items.append("    %r: %r, # '%s': '%s'" % (
      84              hexint(i),
      85              tuple(map(hexint, t)),
      86              alpha(i),
      87              ''.join(map(alpha, t)),
      88          ))
      89  
      90      update_file(outfile, re_casefix_template % '\n'.join(items))
      91  
      92  
      93  if __name__ == '__main__':
      94      import sys
      95      main(*sys.argv[1:])