1 #! /usr/bin/env python3
2 # This script generates Lib/re/_casefix.py.
3
4 import collections
5 import re
6 import sys
7 import unicodedata
8
def update_file(file, content):
    """Store *content* in *file* unless the file already holds that exact text.

    Returns True if the file was (re)written and False if it was left
    untouched because its current text equals *content*.
    """
    try:
        with open(file, 'r', encoding='utf-8') as existing:
            unchanged = existing.read() == content
    except (OSError, ValueError):
        # The file cannot be opened, read or decoded: rewrite it below.
        unchanged = False
    if unchanged:
        return False
    with open(file, 'w', encoding='utf-8') as target:
        target.write(content)
    return True
19
# Skeleton of the generated module.  main() fills the single %s placeholder
# with the rendered _EXTRA_CASES entries, joined with newlines.
re_casefix_template = """\
# Auto-generated by Tools/scripts/generate_re_casefix.py.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {
%s
}
"""
29
def uname(i):
    """Return the Unicode name of code point *i*.

    Code points without a name are rendered as ``U+XXXX`` (uppercase hex,
    padded to at least four digits).
    """
    char = chr(i)
    placeholder = r'U+%04X' % i
    return unicodedata.name(char, placeholder)
32
class hexint(int):
    """An int whose repr() is hexadecimal, zero-padded to six characters.

    The ``0x`` prefix counts toward the width, so 0x61 is shown as ``0x0061``.
    """

    def __repr__(self):
        rendered = '%#06x' % (self,)
        return rendered
36
def alpha(i):
    """Return a printable form of code point *i*.

    Alphabetic characters are returned as themselves; anything else is
    returned as its ascii() representation with the enclosing quotes removed.
    """
    char = chr(i)
    if char.isalpha():
        return char
    # ascii() produces a quoted literal; drop the first and last character.
    return ascii(char)[1:-1]
40
41
def main(outfile='Lib/re/_casefix.py'):
    """Regenerate the ``_EXTRA_CASES`` table and store it in *outfile*.

    Every code point up to ``sys.maxunicode`` is grouped by its uppercase
    form.  Groups whose members lowercase to more than one distinct code
    point yield the table entries: each lowercase code maps to the other
    lowercase codes of its group.  If such a group contains a code above
    U+FFFF, the affected characters are reported on stderr and the script
    exits with status 1 before anything is written.
    """
    # Collect, per uppercase form, all characters that uppercase to it.
    by_upper = collections.defaultdict(str)
    for code in range(sys.maxunicode + 1):
        char = chr(code)
        by_upper[char.upper()] += char
    shared_upper = [chars for chars in by_upper.values() if len(chars) > 1]

    # Sorted lists of lowercase codes for groups that keep more than one
    # distinct lowercase form.
    equivalent_lower_codes = []
    for chars in shared_upper:
        lower_codes = {ord(char.lower()) for char in chars}
        if len(lower_codes) > 1:
            equivalent_lower_codes.append(sorted(lower_codes))

    # Any group holding a code above U+FFFF is reported in full, together
    # with the uppercase of the first such code when that is one character.
    bad_codes = []
    for codes in equivalent_lower_codes:
        wide = next((code for code in codes if code > 0xffff), None)
        if wide is None:
            continue
        bad_codes.extend(codes)
        try:
            bad_codes.append(ord(chr(wide).upper()))
        except (ValueError, TypeError):
            pass
    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for code in sorted(bad_codes):
            report = " '%s' (U+%04x, %s)" % (alpha(code), code, uname(code))
            print(report, file=sys.stderr)
        sys.exit(1)

    # Each lowercase code maps to the remaining codes of its group.
    mapping = {}
    for codes in equivalent_lower_codes:
        for code in codes:
            mapping[code] = tuple(other for other in codes if other != code)

    # Two output lines per entry: a comment with the names, then the item.
    items = []
    for code in sorted(mapping):
        others = mapping[code]
        other_names = ', '.join(uname(other) for other in others)
        items.append(' # %s: %s' % (uname(code), other_names))
        hex_others = tuple(hexint(other) for other in others)
        other_chars = ''.join(alpha(other) for other in others)
        items.append(" %r: %r, # '%s': '%s'" % (
            hexint(code),
            hex_others,
            alpha(code),
            other_chars,
        ))

    update_file(outfile, re_casefix_template % '\n'.join(items))
91
92
if __name__ == '__main__':
    # sys is already imported at module top, so no re-import is needed here.
    # An optional first command-line argument overrides main()'s default
    # output path.
    main(*sys.argv[1:])