1 #!/usr/bin/env python3
2 # Copyright (C) 1998, 1999 Tom Tromey
3 # Copyright (C) 2001 Red Hat Software
4 #
5 # SPDX-License-Identifier: GPL-2.0-or-later
6 #
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2, or (at your option)
10 # any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
16 #
17 # You should have received a copy of the GNU General Public License
18 # along with this program; if not, see <http://www.gnu.org/licenses/>.
19
20 """
21 gen-casemap-txt.py - Generate test cases for case mapping from Unicode data.
22 See http://www.unicode.org/Public/UNIDATA/UnicodeCharacterDatabase.html
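Usage:
    gen-casemap-txt.py UNICODE-VERSION UnicodeData.txt SpecialCasing.txt

The generated test cases are written to standard output.

I consider the output of this program to be unrestricted.
Use it as you will.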
26 """
27
28 import sys
29 import argparse
30
31
32 # Disable line length warnings as wrapping the test templates would be hard
33 # flake8: noqa: E501
34
35
def main(argv):
    """Generate case-mapping test cases from the Unicode data files.

    Parses the command line in *argv* (program name first, as in
    ``sys.argv``), collects the case mappings of every Ll, Lu and Lt
    character from UnicodeData.txt, overrides them with the unconditional
    entries of SpecialCasing.txt and passes the resulting tables to
    print_tests().

    Raises SystemExit if a data line has an unexpected number of fields;
    argparse exits the same way on a malformed command line.
    """
    parser = argparse.ArgumentParser(
        description="Generate test cases for case mapping from Unicode data"
    )
    parser.add_argument("UNICODE-VERSION")
    parser.add_argument("UnicodeData.txt")
    parser.add_argument("SpecialCasing.txt")
    args = parser.parse_args(argv[1:])
    # The argument names are not valid identifiers, hence getattr().
    version = getattr(args, "UNICODE-VERSION")
    filename_udata = getattr(args, "UnicodeData.txt")
    filename_casing = getattr(args, "SpecialCasing.txt")

    # Names of fields in Unicode data table.
    (
        CODE,
        NAME,
        CATEGORY,
        COMBINING_CLASSES,
        BIDI_CATEGORY,
        DECOMPOSITION,
        DECIMAL_VALUE,
        DIGIT_VALUE,
        NUMERIC_VALUE,
        MIRRORED,
        OLD_NAME,
        COMMENT,
        UPPER,
        LOWER,
        TITLE,
    ) = range(15)

    # Names of fields in the SpecialCasing table
    CASE_CODE, CASE_LOWER, CASE_TITLE, CASE_UPPER, CASE_CONDITION = range(5)

    # Code point (int) -> mapped string, one table per target case.
    upper = {}
    title = {}
    lower = {}

    def make_hex(codes):
        """Converts a string of white space separated code points encoded as
        hex values to a Unicode string. Any extra white space is ignored.
        """
        return "".join([chr(int(c, 16)) for c in codes.split()])

    def process_one(code, fields):
        """Record the case mappings of *code* if it is a cased letter.

        *fields* is a parsed UnicodeData.txt row; only its category and
        case-mapping columns are read. Characters whose category is not
        Ll, Lu or Lt are ignored.
        """
        type_ = fields[CATEGORY]
        if type_ == "Ll":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lu":
            lower[code] = make_hex(fields[LOWER])
            upper[code] = chr(code)
            title[code] = make_hex(fields[TITLE])
        elif type_ == "Lt":
            upper[code] = make_hex(fields[UPPER])
            lower[code] = make_hex(fields[LOWER])
            # A titlecase letter is its own titlecase form, in the same way
            # that Ll maps to itself for lower and Lu for upper. The LOWER
            # field must not be used here.
            title[code] = chr(code)

    with open(filename_udata, encoding="utf-8") as fileobj:
        last_code = -1
        for line in fileobj:
            line = line.strip()
            if not line:
                # Tolerate blank lines, as is done for SpecialCasing.txt
                continue

            fields = [f.strip() for f in line.split(";")]
            if len(fields) != 15:
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)"
                    % (fields[CODE], len(fields))
                )

            code = int(fields[CODE], 16)

            # A gap before a "<..., Last>" entry is a range specified in the
            # char database: every code point in it shares this entry's
            # properties. Any other gap consists of undefined characters,
            # which have no case mappings and need no processing.
            if code > last_code + 1 and fields[NAME].endswith("Last>"):
                for gap_code in range(last_code + 1, code):
                    process_one(gap_code, fields)

            process_one(code, fields)
            last_code = code

    with open(filename_casing, encoding="utf-8") as fileobj:
        for line in fileobj:
            # strip comments and skip empty lines
            line = line.split("#", 1)[0].strip()
            if not line:
                continue

            # all lines end with ";" so just remove it
            line = line.rstrip(";").rstrip()
            fields = [f.strip() for f in line.split(";")]
            if len(fields) not in (4, 5):
                raise SystemExit(
                    "Entry for %s has wrong number of fields (%d)"
                    % (fields[CASE_CODE], len(fields))
                )

            if len(fields) == 5:
                # Ignore conditional special cases - we'll handle them manually
                continue

            code = int(fields[CASE_CODE], 16)

            # Unconditional special casings replace the simple mappings
            # collected from UnicodeData.txt.
            upper[code] = make_hex(fields[CASE_UPPER])
            lower[code] = make_hex(fields[CASE_LOWER])
            title[code] = make_hex(fields[CASE_TITLE])

    print_tests(version, upper, title, lower)
172
173
def print_tests(version, upper, title, lower):
    """Print the case-mapping test file to standard output.

    The output starts with a fixed header of hand-crafted tests, in which
    *version* is substituted as the Unicode version, followed by one
    generated row per code point that has at least one non-empty mapping.

    *upper*, *title* and *lower* map integer code points to the mapped
    string for the respective case; missing entries count as empty. Each
    generated row holds the character, its lower, title and upper mappings
    and a comment with the code point in hex.
    """
    # NOTE(review): the three "final and nonfinal sigma" rows below separate
    # their columns with plain whitespace rather than \t escapes, unlike
    # every other row - confirm that this is what the consumer expects.
    print(
        """\
# Test cases generated from Unicode {} data
# by gen-casemap-txt.py. Do not edit.
#
# Some special hand crafted tests
#
tr_TR\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\ti\ti\t\u0130\t\u0130\t# i => LATIN CAPITAL LETTER I WITH DOT ABOVE
tr_TR.UTF-8\tI\t\u0131\tI\tI\t# I => LATIN SMALL LETTER DOTLESS I
tr_TR.UTF-8\tI\u0307\ti\tI\u0307\tI\u0307\t# I => LATIN SMALL LETTER DOTLESS I
# Test reordering of YPOGEGRAMMENI across other accents
\t\u03b1\u0345\u0314\t\u03b1\u0345\u0314\t\u0391\u0345\u0314\t\u0391\u0314\u0399\t
\t\u03b1\u0314\u0345\t\u03b1\u0314\u0345\t\u0391\u0314\u0345\t\u0391\u0314\u0399\t
# Handling of final and nonfinal sigma
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ \t
\tΜΆΙΟΣ μάιος Μάιος ΜΆΙΟΣ\t
\tΣΙΓΜΑ σιγμα Σιγμα ΣΙΓΜΑ\t
# Lithuanian rule of i followed by letter with dot. Not at all sure
# about the titlecase part here
lt_LT\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
lt_LT.UTF-8\ti\u0117\ti\u0117\tIe\tIE\t
lt_LT.UTF-8\tie\u0307\tie\u0307\tIe\tIE\t
lt_LT.UTF-8\t\u00cc\ti\u0307\u0300\t\u00cc\t\u00cc\t # LATIN CAPITAL LETTER I WITH GRAVE
lt_LT.UTF-8\t\u00CD\ti\u0307\u0301\t\u00CD\t\u00CD\t # LATIN CAPITAL LETTER I WITH ACUTE
lt_LT.UTF-8\t\u0128\ti\u0307\u0303\t\u0128\t\u0128\t # LATIN CAPITAL LETTER I WITH TILDE
lt_LT.UTF-8\tI\u0301\ti\u0307\u0301\tI\u0301\tI\u0301\t # LATIN CAPITAL LETTER I (with acute accent)
lt_LT.UTF-8\tI\u0300\ti\u0307\u0300\tI\u0300\tI\u0300\t # LATIN CAPITAL LETTER I (with grave accent)
lt_LT.UTF-8\tI\u0303\ti\u0307\u0303\tI\u0303\tI\u0303\t # LATIN CAPITAL LETTER I (with tilde above)
lt_LT.UTF-8\tI\u0328\u0301\ti\u0307\u0328\u0301\tI\u0328\u0301\tI\u0328\u0301\t # LATIN CAPITAL LETTER I (with ogonek and acute accent)
lt_LT.UTF-8\tJ\u0301\tj\u0307\u0301\tJ\u0301\tJ\u0301\t # LATIN CAPITAL LETTER J (with acute accent)
lt_LT.UTF-8\t\u012e\u0301\t\u012f\u0307\u0301\t\u012e\u0301\t\u012e\u0301\t # LATIN CAPITAL LETTER I WITH OGONEK (with acute accent)
# Special case not at initial position
\ta\ufb04\ta\ufb04\tAffl\tAFFL\t# FB04
#
# Now the automatic tests
#""".format(
            version
        )
    )

    # Only code points present in a table can produce a row, so walk those
    # in ascending order instead of scanning the whole code space. The
    # range is inclusive of U+10FFFF, the last valid code point.
    codes = set(upper) | set(lower) | set(title)
    for i in sorted(c for c in codes if 0 <= c <= 0x10FFFF):
        if i == 0x3A3:
            # Greek sigma needs special tests
            continue

        up = upper.get(i, "")
        lo = lower.get(i, "")
        ti = title.get(i, "")

        if up or lo or ti:
            print("\t%s\t%s\t%s\t%s\t# %4X" % (chr(i), lo, ti, up, i))
239
240
if __name__ == "__main__":
    # Run as a script: hand the full argument vector to main() and use
    # whatever it returns as the process exit status.
    exit_status = main(sys.argv)
    sys.exit(exit_status)