python (3.11.7)
1 """ Test script for the Unicode implementation.
2
3 Written by Bill Tutt.
4 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
5
6 (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
7
8 """#"
9
10 import ast
11 import unittest
12 import unicodedata
13
14 from test import support
15 from http.client import HTTPException
16
# The _testcapi import is optional: when it fails, all three limits are bound
# to the same fallback value. With equal values the condition
# INT_MAX < PY_SSIZE_T_MAX used by the skipUnless guard on test_issue16335
# is false, so that test is skipped.
try:
    from _testcapi import INT_MAX, PY_SSIZE_T_MAX, UINT_MAX
except ImportError:
    INT_MAX = PY_SSIZE_T_MAX = UINT_MAX = 2**64 - 1
21
class UnicodeNamesTest(unittest.TestCase):
    """Tests for Unicode character names.

    Covers ``unicodedata.name`` / ``unicodedata.lookup``, named-character
    escapes in string literals (through ``checkletter``) and the error
    handling of the ``unicode-escape`` codec for such escapes.
    """

    def checkletter(self, name, code):
        """Evaluate a named-character escape for *name* and compare with *code*.

        The escape is assembled inside a raw string and only evaluated at run
        time with ``ast.literal_eval``, so this script still runs even if the
        compiler chokes on named-character escapes.

        Returns the evaluated string.  Tests in this class expect a
        ``SyntaxError`` when *name* is not usable in a string literal.
        """
        res = ast.literal_eval(r'"\N{%s}"' % name)
        self.assertEqual(res, code)
        return res

    def test_general(self):
        # General and case insensitivity test:
        chars = [
            "LATIN CAPITAL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER R",
            "LATIN CAPITAL LETTER E",
            "LATIN SMALL LETTER D",
            "SPACE",
            "LATIN SMALL LETTER f",
            "LATIN CAPITAL LeTtEr o",
            "LATIN SMaLl LETTER x",
            "SPACE",
            "LATIN SMALL LETTER A",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER E",
            "SPACE",
            "LATIN SMALL LETTER T",
            "LATIN SMALL LETTER H",
            "LATIN SMALL LETTER E",
            "SpAcE",
            "LATIN SMALL LETTER S",
            "LATIN SMALL LETTER H",
            "LATIN small LETTER e",
            "LATIN small LETTER e",
            "LATIN SMALL LETTER P",
            "FULL STOP",
        ]
        string = "The rEd fOx ate the sheep."

        # zip() stops at the shorter input, so surplus names in `chars`
        # would otherwise go unnoticed; check the lengths explicitly.
        self.assertEqual(len(chars), len(string))
        self.assertEqual(
            "".join(self.checkletter(*args) for args in zip(chars, string)),
            string
        )

    def test_ascii_letters(self):
        # Round-trip every lowercase ASCII letter through lookup() and name().
        # range() excludes its stop value, hence ord("z") + 1: without the
        # + 1 the letter "z" would never be checked.
        for codepoint in range(ord("a"), ord("z") + 1):
            letter = chr(codepoint)
            with self.subTest(letter=letter):
                name = "LATIN SMALL LETTER %s" % letter.upper()
                found = unicodedata.lookup(name)
                self.assertEqual(unicodedata.name(found), name)

    def test_hangul_syllables(self):
        self.checkletter("HANGUL SYLLABLE GA", "\uac00")
        self.checkletter("HANGUL SYLLABLE GGWEOSS", "\uafe8")
        self.checkletter("HANGUL SYLLABLE DOLS", "\ub3d0")
        self.checkletter("HANGUL SYLLABLE RYAN", "\ub7b8")
        self.checkletter("HANGUL SYLLABLE MWIK", "\ubba0")
        self.checkletter("HANGUL SYLLABLE BBWAEM", "\ubf88")
        self.checkletter("HANGUL SYLLABLE SSEOL", "\uc370")
        self.checkletter("HANGUL SYLLABLE YI", "\uc758")
        self.checkletter("HANGUL SYLLABLE JJYOSS", "\ucb40")
        self.checkletter("HANGUL SYLLABLE KYEOLS", "\ucf28")
        self.checkletter("HANGUL SYLLABLE PAN", "\ud310")
        self.checkletter("HANGUL SYLLABLE HWEOK", "\ud6f8")
        self.checkletter("HANGUL SYLLABLE HIH", "\ud7a3")

        # The code point right after HANGUL SYLLABLE HIH must have no name.
        self.assertRaises(ValueError, unicodedata.name, "\ud7a4")

    def test_cjk_unified_ideographs(self):
        self.checkletter("CJK UNIFIED IDEOGRAPH-3400", "\u3400")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4DB5", "\u4db5")
        self.checkletter("CJK UNIFIED IDEOGRAPH-4E00", "\u4e00")
        self.checkletter("CJK UNIFIED IDEOGRAPH-9FCB", "\u9fCB")
        self.checkletter("CJK UNIFIED IDEOGRAPH-20000", "\U00020000")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A6D6", "\U0002a6d6")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2A700", "\U0002A700")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B734", "\U0002B734")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B740", "\U0002B740")
        self.checkletter("CJK UNIFIED IDEOGRAPH-2B81D", "\U0002B81D")
        self.checkletter("CJK UNIFIED IDEOGRAPH-3134A", "\U0003134A")

    def test_bmp_characters(self):
        # Every named character below 0x10000 must round-trip through
        # lookup().  No subTest here: the loop runs 65536 times.
        for code in range(0x10000):
            char = chr(code)
            name = unicodedata.name(char, None)
            if name is not None:
                self.assertEqual(unicodedata.lookup(name), char)

    def test_misc_symbols(self):
        self.checkletter("PILCROW SIGN", "\u00b6")
        self.checkletter("REPLACEMENT CHARACTER", "\uFFFD")
        self.checkletter("HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK", "\uFF9F")
        self.checkletter("FULLWIDTH LATIN SMALL LETTER A", "\uFF41")

    def test_aliases(self):
        # Check that the aliases defined in the NameAliases.txt file work.
        # This should be updated when new aliases are added or the file
        # should be downloaded and parsed instead.  See #12753.
        aliases = [
            ('LATIN CAPITAL LETTER GHA', 0x01A2),
            ('LATIN SMALL LETTER GHA', 0x01A3),
            ('KANNADA LETTER LLLA', 0x0CDE),
            ('LAO LETTER FO FON', 0x0E9D),
            ('LAO LETTER FO FAY', 0x0E9F),
            ('LAO LETTER RO', 0x0EA3),
            ('LAO LETTER LO', 0x0EA5),
            ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
            ('YI SYLLABLE ITERATION MARK', 0xA015),
            ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET', 0xFE18),
            ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
        ]
        for alias, codepoint in aliases:
            # subTest reports the failing alias and keeps checking the others.
            with self.subTest(alias=alias):
                self.checkletter(alias, chr(codepoint))
                name = unicodedata.name(chr(codepoint))
                # name() must report a name different from the alias ...
                self.assertNotEqual(name, alias)
                # ... while both spellings resolve to the same character.
                self.assertEqual(unicodedata.lookup(alias),
                                 unicodedata.lookup(name))
                # The 3.2.0 database must not know the alias.
                with self.assertRaises(KeyError):
                    unicodedata.ucd_3_2_0.lookup(alias)

    def test_aliases_names_in_pua_range(self):
        # We are storing aliases in the PUA 15, but their names shouldn't leak
        for cp in range(0xf0000, 0xf0100):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_names_in_pua_range(self):
        # We are storing named seq in the PUA 15, but their names shouldn't leak
        # The stop value is 0xf1000 so that U+F0FFF itself is checked too;
        # range() excludes its stop value.
        for cp in range(0xf0100, 0xf1000):
            with self.assertRaises(ValueError) as cm:
                unicodedata.name(chr(cp))
            self.assertEqual(str(cm.exception), 'no such name')

    def test_named_sequences_sample(self):
        # Check a few named sequences.  See #12753.
        sequences = [
            ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
            ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
            ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
            ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
            ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
        ]
        for seqname, codepoints in sequences:
            with self.subTest(seqname=seqname):
                self.assertEqual(unicodedata.lookup(seqname), codepoints)
                # Sequence names work in lookup() but not in string literals.
                with self.assertRaises(SyntaxError):
                    self.checkletter(seqname, None)
                with self.assertRaises(KeyError):
                    unicodedata.ucd_3_2_0.lookup(seqname)

    def test_named_sequences_full(self):
        # Check all the named sequences
        def check_version(testfile):
            # Accept the file only if its first line mentions the Unicode
            # version of the unicodedata module.
            hdr = testfile.readline()
            return unicodedata.unidata_version in hdr
        url = ("http://www.pythontest.net/unicode/%s/NamedSequences.txt" %
               unicodedata.unidata_version)
        try:
            testdata = support.open_urlresource(url, encoding="utf-8",
                                                check=check_version)
        except (OSError, HTTPException):
            # skipTest() raises, so `testdata` is always bound below.
            self.skipTest("Could not retrieve " + url)
        self.addCleanup(testdata.close)
        for line in testdata:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            # Each data line is "<name>;<space separated hex code points>".
            seqname, codepoints = line.split(';')
            codepoints = ''.join(chr(int(cp, 16)) for cp in codepoints.split())
            self.assertEqual(unicodedata.lookup(seqname), codepoints)
            with self.assertRaises(SyntaxError):
                self.checkletter(seqname, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seqname)

    def test_errors(self):
        self.assertRaises(TypeError, unicodedata.name)
        self.assertRaises(TypeError, unicodedata.name, 'xx')
        self.assertRaises(TypeError, unicodedata.lookup)
        self.assertRaises(KeyError, unicodedata.lookup, 'unknown')

    def test_strict_error_handling(self):
        # bogus character name
        self.assertRaises(
            UnicodeError,
            str, b"\\N{blah}", 'unicode-escape', 'strict'
        )
        # long bogus character name
        self.assertRaises(
            UnicodeError,
            str, bytes("\\N{%s}" % ("x" * 100000), "ascii"), 'unicode-escape', 'strict'
        )
        # missing closing brace
        self.assertRaises(
            UnicodeError,
            str, b"\\N{SPACE", 'unicode-escape', 'strict'
        )
        # missing opening brace
        self.assertRaises(
            UnicodeError,
            str, b"\\NSPACE", 'unicode-escape', 'strict'
        )

    # NOTE(review): the skip message names UINT_MAX/SIZE_MAX while the
    # condition compares INT_MAX with PY_SSIZE_T_MAX; the message is kept
    # unchanged -- confirm which wording is intended.
    @support.cpython_only
    @unittest.skipUnless(INT_MAX < PY_SSIZE_T_MAX, "needs UINT_MAX < SIZE_MAX")
    @support.bigmemtest(size=UINT_MAX + 1, memuse=2 + 1, dry_run=False)
    def test_issue16335(self, size):
        # very very long bogus character name
        x = b'\\N{SPACE' + b'x' * (UINT_MAX + 1) + b'}'
        self.assertEqual(len(x), len(b'\\N{SPACE}') + (UINT_MAX + 1))
        self.assertRaisesRegex(UnicodeError,
            'unknown Unicode character name',
            x.decode, 'unicode-escape'
        )
238
239
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()