1 #! /usr/bin/env python3
2 # This script generates Lib/re/_casefix.py.
3
4 import collections
5 import sys
6 import unicodedata
7
# Repository-relative path of this script; it is interpolated into the
# "Auto-generated by ..." header line of the generated file.
SCRIPT_NAME = 'Tools/build/generate_re_casefix.py'
9
def update_file(file, content):
    """Write *content* to *file* unless the file already holds exactly that text.

    The file is read and written as UTF-8.  A file that cannot be opened or
    decoded is treated as out of date and is overwritten.

    Returns True when the file was (re)written, False when it was left
    untouched.
    """
    try:
        with open(file, 'r', encoding='utf-8') as existing:
            unchanged = existing.read() == content
    except (OSError, ValueError):
        # Missing/unreadable file (OSError) or undecodable bytes
        # (UnicodeDecodeError is a ValueError): fall through and rewrite.
        unchanged = False

    if unchanged:
        return False

    with open(file, 'w', encoding='utf-8') as target:
        target.write(content)
    return True
20
# Template for the generated module.  It is an f-string only so that
# SCRIPT_NAME can be embedded in the header; the "%s" placeholder is filled
# in afterwards with the dictionary entries via %-formatting.  The literal
# braces of the dict display must be doubled, otherwise the f-string parser
# treats "{ %s }" as a replacement field and the script fails to compile.
re_casefix_template = f"""\
# Auto-generated by {SCRIPT_NAME}.

# Maps the code of lowercased character to codes of different lowercased
# characters which have the same uppercase.
_EXTRA_CASES = {{
%s
}}
"""
30
def uname(i):
    """Return the Unicode name of code point *i*.

    Code points without a name in the Unicode database are rendered as
    ``U+XXXX`` (uppercase hex, at least four digits).
    """
    fallback = r'U+%04X' % i
    return unicodedata.name(chr(i), fallback)
33
class hexint(int):
    """An ``int`` whose ``repr()`` is zero-padded lowercase hexadecimal.

    Formatting an instance (or a tuple of instances) with ``%r`` therefore
    yields text such as ``0x0069`` or ``(0x0131,)``.
    """

    def __repr__(self):
        # '#' adds the '0x' prefix; the minimum width of 6 includes that
        # prefix, so values are padded to at least four hex digits.
        return '%#06x' % self
37
def alpha(i):
    """Return a printable form of code point *i*.

    Alphabetic characters are returned as themselves; anything else is
    returned as its ``ascii()`` escape with the surrounding quotes removed.
    """
    ch = chr(i)
    if ch.isalpha():
        return ch
    # ascii() yields a quoted literal; strip the opening and closing quote.
    quoted = ascii(ch)
    return quoted[1:-1]
41
42
def main(outfile='Lib/re/_casefix.py'):
    """Generate the extra-case table and write it to *outfile*.

    Scans every code point up to ``sys.maxunicode``, groups characters that
    share an uppercase form, and maps each lowercase code in a group to the
    other lowercase codes of that group.  The rendered table is written via
    update_file().

    If any group contains a lowercase code above U+FFFF, the affected codes
    are listed on stderr and the process exits with status 1.
    """
    # Group all characters by their uppercase form.
    by_upper = collections.defaultdict(str)
    for code in range(sys.maxunicode + 1):
        ch = chr(code)
        by_upper[ch.upper()] += ch
    shared_upper = [group for group in by_upper.values() if len(group) > 1]

    # For each group, collect the distinct lowercase codes; only groups with
    # more than one distinct lowercase code are of interest.
    equivalent_lower_codes = []
    for group in shared_upper:
        lower_codes = {ord(ch.lower()) for ch in group}
        if len(lower_codes) > 1:
            equivalent_lower_codes.append(sorted(lower_codes))

    # Collect groups containing a code above U+FFFF, together with the
    # uppercase code of the first such member when it is a single character.
    bad_codes = []
    for codes in equivalent_lower_codes:
        astral = next((code for code in codes if code > 0xffff), None)
        if astral is None:
            continue
        bad_codes.extend(codes)
        try:
            bad_codes.append(ord(chr(astral).upper()))
        except (ValueError, TypeError):
            # ord() rejects a multi-character uppercase form; skip it.
            pass

    if bad_codes:
        print('Case-insensitive matching may not work correctly for character:',
              file=sys.stderr)
        for code in sorted(bad_codes):
            print(" '%s' (U+%04x, %s)" % (alpha(code), code, uname(code)),
                  file=sys.stderr)
        sys.exit(1)

    # Map each code to the other members of its group.
    mapping = {}
    for codes in equivalent_lower_codes:
        for code in codes:
            mapping[code] = tuple(other for other in codes if code != other)

    # Render two lines per entry: a comment with the Unicode names, then the
    # dictionary item itself with a trailing comment showing the characters.
    items = []
    for code, others in sorted(mapping.items()):
        names_comment = ' # %s: %s' % (
            uname(code),
            ', '.join(map(uname, others)),
        )
        entry = " %r: %r, # '%s': '%s'" % (
            hexint(code),
            tuple(map(hexint, others)),
            alpha(code),
            ''.join(map(alpha, others)),
        )
        items.extend((names_comment, entry))

    body = '\n'.join(items)
    update_file(outfile, re_casefix_template % body)
92
93
if __name__ == '__main__':
    import sys
    # Forward any command-line arguments positionally; with none given,
    # main() falls back to its default output path.
    cli_args = sys.argv[1:]
    main(*cli_args)