1 """ Tests for the unicodedata module.
2
3 Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7 """
8
9 import hashlib
10 from http.client import HTTPException
11 import sys
12 import unicodedata
13 import unittest
14 from test.support import (open_urlresource, requires_resource, script_helper,
15 cpython_only, check_disallow_instantiation)
16
17
18 class ESC[4;38;5;81mUnicodeMethodsTest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
19
20 # update this, if the database changes
21 expectedchecksum = 'e708c31c0d51f758adf475cb7201cf80917362be'
22
23 @requires_resource('cpu')
24 def test_method_checksum(self):
25 h = hashlib.sha1()
26 for i in range(sys.maxunicode + 1):
27 char = chr(i)
28 data = [
29 # Predicates (single char)
30 "01"[char.isalnum()],
31 "01"[char.isalpha()],
32 "01"[char.isdecimal()],
33 "01"[char.isdigit()],
34 "01"[char.islower()],
35 "01"[char.isnumeric()],
36 "01"[char.isspace()],
37 "01"[char.istitle()],
38 "01"[char.isupper()],
39
40 # Predicates (multiple chars)
41 "01"[(char + 'abc').isalnum()],
42 "01"[(char + 'abc').isalpha()],
43 "01"[(char + '123').isdecimal()],
44 "01"[(char + '123').isdigit()],
45 "01"[(char + 'abc').islower()],
46 "01"[(char + '123').isnumeric()],
47 "01"[(char + ' \t').isspace()],
48 "01"[(char + 'abc').istitle()],
49 "01"[(char + 'ABC').isupper()],
50
51 # Mappings (single char)
52 char.lower(),
53 char.upper(),
54 char.title(),
55
56 # Mappings (multiple chars)
57 (char + 'abc').lower(),
58 (char + 'ABC').upper(),
59 (char + 'abc').title(),
60 (char + 'ABC').title(),
61
62 ]
63 h.update(''.join(data).encode('utf-8', 'surrogatepass'))
64 result = h.hexdigest()
65 self.assertEqual(result, self.expectedchecksum)
66
class UnicodeDatabaseTest(unittest.TestCase):
    """Base class for test cases that query a Unicode database via ``self.db``."""

    # Database object the subclasses query; bound to the unicodedata module.
    db = unicodedata
69
class UnicodeFunctionsTest(UnicodeDatabaseTest):
    """Tests for the per-character query functions of the database ``self.db``.

    Except where a test deliberately compares against ``ucd_3_2_0``, every
    lookup goes through ``self.db`` so the whole class exercises one database
    object.
    """

    # Update this if the database changes. Make sure to do a full rebuild
    # (e.g. 'make distclean && make') to get the correct checksum.
    expectedchecksum = '26ff0d31c14194b4606a5b3a81ac36df3a14e331'

    @requires_resource('cpu')
    def test_function_checksum(self):
        """Hash every queried property of every code point; compare digests."""
        h = hashlib.sha1()

        for i in range(sys.maxunicode + 1):
            char = chr(i)
            data = [
                # Properties
                # -1 is the default returned when the character has no value.
                format(self.db.digit(char, -1), '.12g'),
                format(self.db.numeric(char, -1), '.12g'),
                format(self.db.decimal(char, -1), '.12g'),
                self.db.category(char),
                self.db.bidirectional(char),
                self.db.decomposition(char),
                str(self.db.mirrored(char)),
                str(self.db.combining(char)),
                self.db.east_asian_width(char),
                self.db.name(char, ""),
            ]
            h.update(''.join(data).encode("ascii"))
        result = h.hexdigest()
        self.assertEqual(result, self.expectedchecksum)

    @requires_resource('cpu')
    def test_name_inverse_lookup(self):
        """Every character that has a name must be found again by lookup()."""
        for i in range(sys.maxunicode + 1):
            char = chr(i)
            if looked_name := self.db.name(char, None):
                self.assertEqual(self.db.lookup(looked_name), char)

    def test_digit(self):
        """digit() returns values, honours the default, and rejects bad input."""
        self.assertEqual(self.db.digit('A', None), None)
        self.assertEqual(self.db.digit('9'), 9)
        self.assertEqual(self.db.digit('\u215b', None), None)
        self.assertEqual(self.db.digit('\u2468'), 9)
        self.assertEqual(self.db.digit('\U00020000', None), None)
        self.assertEqual(self.db.digit('\U0001D7FD'), 7)

        self.assertRaises(TypeError, self.db.digit)
        self.assertRaises(TypeError, self.db.digit, 'xx')
        self.assertRaises(ValueError, self.db.digit, 'x')

    def test_numeric(self):
        """numeric() returns values, honours the default, and rejects bad input."""
        self.assertEqual(self.db.numeric('A', None), None)
        self.assertEqual(self.db.numeric('9'), 9)
        self.assertEqual(self.db.numeric('\u215b'), 0.125)
        self.assertEqual(self.db.numeric('\u2468'), 9.0)
        self.assertEqual(self.db.numeric('\ua627'), 7.0)
        self.assertEqual(self.db.numeric('\U00020000', None), None)
        self.assertEqual(self.db.numeric('\U0001012A'), 9000)

        self.assertRaises(TypeError, self.db.numeric)
        self.assertRaises(TypeError, self.db.numeric, 'xx')
        self.assertRaises(ValueError, self.db.numeric, 'x')

    def test_decimal(self):
        """decimal() returns values, honours the default, and rejects bad input."""
        self.assertEqual(self.db.decimal('A', None), None)
        self.assertEqual(self.db.decimal('9'), 9)
        self.assertEqual(self.db.decimal('\u215b', None), None)
        self.assertEqual(self.db.decimal('\u2468', None), None)
        self.assertEqual(self.db.decimal('\U00020000', None), None)
        self.assertEqual(self.db.decimal('\U0001D7FD'), 7)

        self.assertRaises(TypeError, self.db.decimal)
        self.assertRaises(TypeError, self.db.decimal, 'xx')
        self.assertRaises(ValueError, self.db.decimal, 'x')

    def test_category(self):
        """category() returns the expected general-category codes."""
        self.assertEqual(self.db.category('\uFFFE'), 'Cn')
        self.assertEqual(self.db.category('a'), 'Ll')
        self.assertEqual(self.db.category('A'), 'Lu')
        self.assertEqual(self.db.category('\U00020000'), 'Lo')
        self.assertEqual(self.db.category('\U0001012A'), 'No')

        self.assertRaises(TypeError, self.db.category)
        self.assertRaises(TypeError, self.db.category, 'xx')

    def test_bidirectional(self):
        """bidirectional() returns the expected bidi class strings."""
        self.assertEqual(self.db.bidirectional('\uFFFE'), '')
        self.assertEqual(self.db.bidirectional(' '), 'WS')
        self.assertEqual(self.db.bidirectional('A'), 'L')
        self.assertEqual(self.db.bidirectional('\U00020000'), 'L')

        self.assertRaises(TypeError, self.db.bidirectional)
        self.assertRaises(TypeError, self.db.bidirectional, 'xx')

    def test_decomposition(self):
        """decomposition() returns the mapping string, or '' when there is none."""
        self.assertEqual(self.db.decomposition('\uFFFE'), '')
        self.assertEqual(self.db.decomposition('\u00bc'),
                         '<fraction> 0031 2044 0034')

        self.assertRaises(TypeError, self.db.decomposition)
        self.assertRaises(TypeError, self.db.decomposition, 'xx')

    def test_mirrored(self):
        """mirrored() returns 1 for mirrored characters and 0 otherwise."""
        self.assertEqual(self.db.mirrored('\uFFFE'), 0)
        self.assertEqual(self.db.mirrored('a'), 0)
        self.assertEqual(self.db.mirrored('\u2201'), 1)
        self.assertEqual(self.db.mirrored('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.mirrored)
        self.assertRaises(TypeError, self.db.mirrored, 'xx')

    def test_combining(self):
        """combining() returns the canonical combining class as an integer."""
        self.assertEqual(self.db.combining('\uFFFE'), 0)
        self.assertEqual(self.db.combining('a'), 0)
        self.assertEqual(self.db.combining('\u20e1'), 230)
        self.assertEqual(self.db.combining('\U00020000'), 0)

        self.assertRaises(TypeError, self.db.combining)
        self.assertRaises(TypeError, self.db.combining, 'xx')

    def test_pr29(self):
        """NFC must leave these already-composed strings unchanged."""
        # https://www.unicode.org/review/pr-29.html
        # See issues #1054943 and #10254.
        composed = ("\u0b47\u0300\u0b3e", "\u1100\u0300\u1161",
                    'Li\u030dt-s\u1e73\u0301',
                    '\u092e\u093e\u0930\u094d\u0915 \u091c\u093c'
                    + '\u0941\u0915\u0947\u0930\u092c\u0930\u094d\u0917',
                    '\u0915\u093f\u0930\u094d\u0917\u093f\u091c\u093c'
                    + '\u0938\u094d\u0924\u093e\u0928')
        for text in composed:
            self.assertEqual(self.db.normalize('NFC', text), text)

    def test_issue10254(self):
        """NFC of a long run of combining sequences must not crash."""
        # Crash reported in #10254
        a = 'C\u0338' * 20 + 'C\u0327'
        b = 'C\u0338' * 20 + '\xC7'
        self.assertEqual(self.db.normalize('NFC', a), b)

    def test_issue29456(self):
        """NFC composition results for three Hangul jamo sequences."""
        # Fix #29456
        # The u1176 input is expected to come back unchanged by NFC.
        u1176_str_a = '\u1100\u1176\u11a8'
        u1176_str_b = '\u1100\u1176\u11a8'
        u11a7_str_a = '\u1100\u1175\u11a7'
        u11a7_str_b = '\uae30\u11a7'
        u11c3_str_a = '\u1100\u1175\u11c3'
        u11c3_str_b = '\uae30\u11c3'
        self.assertEqual(self.db.normalize('NFC', u1176_str_a), u1176_str_b)
        self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
        self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

    def test_east_asian_width(self):
        """east_asian_width() rejects non-single-char input and returns codes."""
        eaw = self.db.east_asian_width
        self.assertRaises(TypeError, eaw, b'a')
        self.assertRaises(TypeError, eaw, bytearray())
        self.assertRaises(TypeError, eaw, '')
        self.assertRaises(TypeError, eaw, 'ra')
        self.assertEqual(eaw('\x1e'), 'N')
        self.assertEqual(eaw('\x20'), 'Na')
        self.assertEqual(eaw('\uC894'), 'W')
        self.assertEqual(eaw('\uFF66'), 'H')
        self.assertEqual(eaw('\uFF1F'), 'F')
        self.assertEqual(eaw('\u2010'), 'A')
        self.assertEqual(eaw('\U00020000'), 'W')

    def test_east_asian_width_unassigned(self):
        """Width codes for nameless code points in three kinds of ranges."""
        eaw = self.db.east_asian_width
        # unassigned
        for char in '\u0530\u0ecf\u10c6\u20fc\uaaca\U000107bd\U000115f2':
            self.assertEqual(eaw(char), 'N')
            self.assertIs(self.db.name(char, None), None)

        # unassigned but reserved for CJK
        for char in '\uFA6E\uFADA\U0002A6E0\U0002FA20\U0003134B\U0003FFFD':
            self.assertEqual(eaw(char), 'W')
            self.assertIs(self.db.name(char, None), None)

        # private use areas
        for char in '\uE000\uF800\U000F0000\U000FFFEE\U00100000\U0010FFF0':
            self.assertEqual(eaw(char), 'A')
            self.assertIs(self.db.name(char, None), None)

    def test_east_asian_width_9_0_changes(self):
        """U+231A is 'N' in the 3.2.0 database but 'W' in the current one."""
        self.assertEqual(self.db.ucd_3_2_0.east_asian_width('\u231a'), 'N')
        self.assertEqual(self.db.east_asian_width('\u231a'), 'W')
252
class UnicodeMiscTest(UnicodeDatabaseTest):
    """Regression tests for assorted unicodedata and str-method issues."""

    @cpython_only
    def test_disallow_instantiation(self):
        """The UCD type must not be instantiable from Python code."""
        # Ensure that the type disallows instantiation (bpo-43916)
        check_disallow_instantiation(self, unicodedata.UCD)

    def test_failed_import_during_compiling(self):
        """A blocked unicodedata import must produce SyntaxError, not a crash."""
        # Issue 4367
        # Decoding \N escapes requires the unicodedata module. If it can't be
        # imported, we shouldn't segfault.

        # This program should raise a SyntaxError in the eval.
        code = "import sys;" \
            "sys.modules['unicodedata'] = None;" \
            """eval("'\\\\N{SOFT HYPHEN}'")"""
        # We use a separate process because the unicodedata module may already
        # have been loaded in this process.
        result = script_helper.assert_python_failure("-c", code)
        error = "SyntaxError: (unicode error) \\N escapes not supported " \
            "(can't load unicodedata module)"
        self.assertIn(error, result.err.decode("ascii"))

    def test_decimal_numeric_consistent(self):
        """Within U+0000..U+FFFF, a decimal value must equal the numeric value."""
        # Test that decimal and numeric are consistent,
        # i.e. if a character has a decimal value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            dec = self.db.decimal(c, -1)
            if dec != -1:
                self.assertEqual(dec, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_digit_numeric_consistent(self):
        """Within U+0000..U+FFFF, a digit value must equal the numeric value."""
        # Test that digit and numeric are consistent,
        # i.e. if a character has a digit value,
        # its numeric value should be the same.
        count = 0
        for i in range(0x10000):
            c = chr(i)
            digit = self.db.digit(c, -1)
            if digit != -1:
                self.assertEqual(digit, self.db.numeric(c))
                count += 1
        # should have tested at least the ASCII digits
        self.assertGreaterEqual(count, 10)

    def test_bug_1704793(self):
        """lookup() must resolve a name that maps to a non-BMP character."""
        self.assertEqual(self.db.lookup("GOTHIC LETTER FAIHU"), '\U00010346')

    def test_ucd_510(self):
        """Property and case-mapping changes between UCD 3.2.0 and 5.1.0."""
        # In UCD 5.1.0, a mirrored property changed wrt. UCD 3.2.0
        self.assertTrue(self.db.mirrored("\u0f3a"))
        self.assertFalse(self.db.ucd_3_2_0.mirrored("\u0f3a"))
        # Also, we now have two ways of representing
        # the upper-case mapping: as delta, or as absolute value
        self.assertEqual("a".upper(), 'A')
        self.assertEqual("\u1d79".upper(), '\ua77d')
        self.assertEqual(".".upper(), '.')

    @requires_resource('cpu')
    def test_bug_5828(self):
        """No code point other than U+0000 may case-map to U+0000."""
        self.assertEqual("\u1d79".lower(), "\u1d79")
        # Only U+0000 should have U+0000 as its upper/lower/titlecase variant
        self.assertEqual(
            [
                c for c in range(sys.maxunicode+1)
                if "\x00" in chr(c).lower()+chr(c).upper()+chr(c).title()
            ],
            [0]
        )

    def test_bug_4971(self):
        """All three DZ-with-caron forms titlecase to U+01C5."""
        # LETTER DZ WITH CARON: DZ, Dz, dz
        self.assertEqual("\u01c4".title(), "\u01c5")
        self.assertEqual("\u01c5".title(), "\u01c5")
        self.assertEqual("\u01c6".title(), "\u01c5")

    def test_linebreak_7643(self):
        """str.splitlines() must break on exactly the listed BMP code points."""
        for i in range(0x10000):
            lines = (chr(i) + 'A').splitlines()
            if i in (0x0a, 0x0b, 0x0c, 0x0d, 0x85,
                     0x1c, 0x1d, 0x1e, 0x2028, 0x2029):
                self.assertEqual(len(lines), 2,
                                 r"\u%.4x should be a linebreak" % i)
            else:
                self.assertEqual(len(lines), 1,
                                 r"\u%.4x should not be a linebreak" % i)
344
class NormalizationTest(unittest.TestCase):
    """Tests for unicodedata.normalize() and unicodedata.is_normalized()."""

    @staticmethod
    def check_version(testfile):
        """Return True if the first line of *testfile* names our UCD version.

        *testfile* is an open text file; exactly one line is read from it.
        """
        hdr = testfile.readline()
        return unicodedata.unidata_version in hdr

    @staticmethod
    def unistr(data):
        """Convert space-separated hex code points (e.g. "0041 0308") to a str."""
        return "".join(chr(int(x, 16)) for x in data.split(" "))

    @requires_resource('network')
    @requires_resource('cpu')
    def test_normalization(self):
        """Fetch NormalizationTest.txt for our UCD version and run its checks.

        The test is skipped, not failed, when the data file cannot be
        obtained.
        """
        TESTDATAFILE = "NormalizationTest.txt"
        TESTDATAURL = f"http://www.pythontest.net/unicode/{unicodedata.unidata_version}/{TESTDATAFILE}"

        # Hit the exception early
        try:
            testdata = open_urlresource(TESTDATAURL, encoding="utf-8",
                                        check=self.check_version)
        except PermissionError:
            self.skipTest(f"Permission error when downloading {TESTDATAURL} "
                          f"into the test data directory")
        except (OSError, HTTPException) as exc:
            self.skipTest(f"Failed to download {TESTDATAURL}: {exc}")

        with testdata:
            self.run_normalization_tests(testdata)

    def run_normalization_tests(self, testdata):
        """Check every data line of *testdata*, then all remaining code points.

        *testdata* is an iterable of text lines.  After '#' comments are
        stripped, each data line holds five ';'-terminated fields of
        space-separated hex code points (c1..c5).  Lines starting with
        "@Part" switch the current part.  The c1 values seen in "@Part1" are
        remembered; every code point not among them must be unchanged by all
        four normalization forms.
        """
        part = None
        # c1 strings seen in @Part1, which the final sweep must skip.
        part1_chars = set()

        def NFC(text):
            return unicodedata.normalize("NFC", text)

        def NFKC(text):
            return unicodedata.normalize("NFKC", text)

        def NFD(text):
            return unicodedata.normalize("NFD", text)

        def NFKD(text):
            return unicodedata.normalize("NFKD", text)

        for line in testdata:
            # Drop any trailing comment, then surrounding whitespace.
            line = line.split('#')[0].strip()
            if not line:
                continue
            if line.startswith("@Part"):
                part = line.split()[0]
                continue
            # The trailing ';' yields an empty last field, hence [:-1].
            c1, c2, c3, c4, c5 = [self.unistr(x)
                                  for x in line.split(';')[:-1]]

            # Perform tests
            self.assertTrue(c2 == NFC(c1) == NFC(c2) == NFC(c3), line)
            self.assertTrue(c4 == NFC(c4) == NFC(c5), line)
            self.assertTrue(c3 == NFD(c1) == NFD(c2) == NFD(c3), line)
            self.assertTrue(c5 == NFD(c4) == NFD(c5), line)
            self.assertTrue(c4 == NFKC(c1) == NFKC(c2) ==
                            NFKC(c3) == NFKC(c4) == NFKC(c5),
                            line)
            self.assertTrue(c5 == NFKD(c1) == NFKD(c2) ==
                            NFKD(c3) == NFKD(c4) == NFKD(c5),
                            line)

            self.assertTrue(unicodedata.is_normalized("NFC", c2), line)
            self.assertTrue(unicodedata.is_normalized("NFC", c4), line)

            self.assertTrue(unicodedata.is_normalized("NFD", c3), line)
            self.assertTrue(unicodedata.is_normalized("NFD", c5), line)

            self.assertTrue(unicodedata.is_normalized("NFKC", c4), line)
            self.assertTrue(unicodedata.is_normalized("NFKD", c5), line)

            # Record part 1 data
            if part == "@Part1":
                part1_chars.add(c1)

        # Perform tests for all other data
        for c in range(sys.maxunicode+1):
            X = chr(c)
            if X in part1_chars:
                continue
            self.assertTrue(X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X), c)

    def test_edge_cases(self):
        """normalize() rejects missing or unknown forms and accepts ''."""
        self.assertRaises(TypeError, unicodedata.normalize)
        self.assertRaises(ValueError, unicodedata.normalize, 'unknown', 'xx')
        self.assertEqual(unicodedata.normalize('NFKC', ''), '')

    def test_bug_834676(self):
        """Normalizing these Hangul syllables must complete without error."""
        # Check for bug 834676
        unicodedata.normalize('NFC', '\ud55c\uae00')
442
443
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()