1 import unittest
2 import sys
3 from test import support
4 from test.support import import_helper
5
# _testcapi is optional: when it cannot be imported the name is bound to None,
# which the skipIf() decorators in this file check before running a test.
try:
    import _testcapi
except ImportError:
    _testcapi = None
10
11
12 class ESC[4;38;5;81mCAPITest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
13
14 # Test PyUnicode_FromFormat()
15 def test_from_format(self):
16 import_helper.import_module('ctypes')
17 from ctypes import (
18 c_char_p,
19 pythonapi, py_object, sizeof,
20 c_int, c_long, c_longlong, c_ssize_t,
21 c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
22 name = "PyUnicode_FromFormat"
23 _PyUnicode_FromFormat = getattr(pythonapi, name)
24 _PyUnicode_FromFormat.argtypes = (c_char_p,)
25 _PyUnicode_FromFormat.restype = py_object
26
27 def PyUnicode_FromFormat(format, *args):
28 cargs = tuple(
29 py_object(arg) if isinstance(arg, str) else arg
30 for arg in args)
31 return _PyUnicode_FromFormat(format, *cargs)
32
33 def check_format(expected, format, *args):
34 text = PyUnicode_FromFormat(format, *args)
35 self.assertEqual(expected, text)
36
37 # ascii format, non-ascii argument
38 check_format('ascii\x7f=unicode\xe9',
39 b'ascii\x7f=%U', 'unicode\xe9')
40
41 # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
42 # raises an error
43 self.assertRaisesRegex(ValueError,
44 r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
45 'string, got a non-ASCII byte: 0xe9$',
46 PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
47
48 # test "%c"
49 check_format('\uabcd',
50 b'%c', c_int(0xabcd))
51 check_format('\U0010ffff',
52 b'%c', c_int(0x10ffff))
53 with self.assertRaises(OverflowError):
54 PyUnicode_FromFormat(b'%c', c_int(0x110000))
55 # Issue #18183
56 check_format('\U00010000\U00100000',
57 b'%c%c', c_int(0x10000), c_int(0x100000))
58
59 # test "%"
60 check_format('%',
61 b'%')
62 check_format('%',
63 b'%%')
64 check_format('%s',
65 b'%%s')
66 check_format('[%]',
67 b'[%%]')
68 check_format('%abc',
69 b'%%%s', b'abc')
70
71 # truncated string
72 check_format('abc',
73 b'%.3s', b'abcdef')
74 check_format('abc[\ufffd',
75 b'%.5s', 'abc[\u20ac]'.encode('utf8'))
76 check_format("'\\u20acABC'",
77 b'%A', '\u20acABC')
78 check_format("'\\u20",
79 b'%.5A', '\u20acABCDEF')
80 check_format("'\u20acABC'",
81 b'%R', '\u20acABC')
82 check_format("'\u20acA",
83 b'%.3R', '\u20acABCDEF')
84 check_format('\u20acAB',
85 b'%.3S', '\u20acABCDEF')
86 check_format('\u20acAB',
87 b'%.3U', '\u20acABCDEF')
88 check_format('\u20acAB',
89 b'%.3V', '\u20acABCDEF', None)
90 check_format('abc[\ufffd',
91 b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
92
93 # following tests comes from #7330
94 # test width modifier and precision modifier with %S
95 check_format("repr= abc",
96 b'repr=%5S', 'abc')
97 check_format("repr=ab",
98 b'repr=%.2S', 'abc')
99 check_format("repr= ab",
100 b'repr=%5.2S', 'abc')
101
102 # test width modifier and precision modifier with %R
103 check_format("repr= 'abc'",
104 b'repr=%8R', 'abc')
105 check_format("repr='ab",
106 b'repr=%.3R', 'abc')
107 check_format("repr= 'ab",
108 b'repr=%5.3R', 'abc')
109
110 # test width modifier and precision modifier with %A
111 check_format("repr= 'abc'",
112 b'repr=%8A', 'abc')
113 check_format("repr='ab",
114 b'repr=%.3A', 'abc')
115 check_format("repr= 'ab",
116 b'repr=%5.3A', 'abc')
117
118 # test width modifier and precision modifier with %s
119 check_format("repr= abc",
120 b'repr=%5s', b'abc')
121 check_format("repr=ab",
122 b'repr=%.2s', b'abc')
123 check_format("repr= ab",
124 b'repr=%5.2s', b'abc')
125
126 # test width modifier and precision modifier with %U
127 check_format("repr= abc",
128 b'repr=%5U', 'abc')
129 check_format("repr=ab",
130 b'repr=%.2U', 'abc')
131 check_format("repr= ab",
132 b'repr=%5.2U', 'abc')
133
134 # test width modifier and precision modifier with %V
135 check_format("repr= abc",
136 b'repr=%5V', 'abc', b'123')
137 check_format("repr=ab",
138 b'repr=%.2V', 'abc', b'123')
139 check_format("repr= ab",
140 b'repr=%5.2V', 'abc', b'123')
141 check_format("repr= 123",
142 b'repr=%5V', None, b'123')
143 check_format("repr=12",
144 b'repr=%.2V', None, b'123')
145 check_format("repr= 12",
146 b'repr=%5.2V', None, b'123')
147
148 # test integer formats (%i, %d, %u)
149 check_format('010',
150 b'%03i', c_int(10))
151 check_format('0010',
152 b'%0.4i', c_int(10))
153 check_format('-123',
154 b'%i', c_int(-123))
155 check_format('-123',
156 b'%li', c_long(-123))
157 check_format('-123',
158 b'%lli', c_longlong(-123))
159 check_format('-123',
160 b'%zi', c_ssize_t(-123))
161
162 check_format('-123',
163 b'%d', c_int(-123))
164 check_format('-123',
165 b'%ld', c_long(-123))
166 check_format('-123',
167 b'%lld', c_longlong(-123))
168 check_format('-123',
169 b'%zd', c_ssize_t(-123))
170
171 check_format('123',
172 b'%u', c_uint(123))
173 check_format('123',
174 b'%lu', c_ulong(123))
175 check_format('123',
176 b'%llu', c_ulonglong(123))
177 check_format('123',
178 b'%zu', c_size_t(123))
179
180 # test long output
181 min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
182 max_longlong = -min_longlong - 1
183 check_format(str(min_longlong),
184 b'%lld', c_longlong(min_longlong))
185 check_format(str(max_longlong),
186 b'%lld', c_longlong(max_longlong))
187 max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
188 check_format(str(max_ulonglong),
189 b'%llu', c_ulonglong(max_ulonglong))
190 PyUnicode_FromFormat(b'%p', c_void_p(-1))
191
192 # test padding (width and/or precision)
193 check_format('123'.rjust(10, '0'),
194 b'%010i', c_int(123))
195 check_format('123'.rjust(100),
196 b'%100i', c_int(123))
197 check_format('123'.rjust(100, '0'),
198 b'%.100i', c_int(123))
199 check_format('123'.rjust(80, '0').rjust(100),
200 b'%100.80i', c_int(123))
201
202 check_format('123'.rjust(10, '0'),
203 b'%010u', c_uint(123))
204 check_format('123'.rjust(100),
205 b'%100u', c_uint(123))
206 check_format('123'.rjust(100, '0'),
207 b'%.100u', c_uint(123))
208 check_format('123'.rjust(80, '0').rjust(100),
209 b'%100.80u', c_uint(123))
210
211 check_format('123'.rjust(10, '0'),
212 b'%010x', c_int(0x123))
213 check_format('123'.rjust(100),
214 b'%100x', c_int(0x123))
215 check_format('123'.rjust(100, '0'),
216 b'%.100x', c_int(0x123))
217 check_format('123'.rjust(80, '0').rjust(100),
218 b'%100.80x', c_int(0x123))
219
220 # test %A
221 check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
222 b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
223
224 # test %V
225 check_format('repr=abc',
226 b'repr=%V', 'abc', b'xyz')
227
228 # test %p
229 # We cannot test the exact result,
230 # because it returns a hex representation of a C pointer,
231 # which is going to be different each time. But, we can test the format.
232 p_format_regex = r'^0x[a-zA-Z0-9]{3,}$'
233 p_format1 = PyUnicode_FromFormat(b'%p', 'abc')
234 self.assertIsInstance(p_format1, str)
235 self.assertRegex(p_format1, p_format_regex)
236
237 p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz')
238 self.assertIsInstance(p_format2, str)
239 self.assertRegex(p_format2,
240 r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}')
241
242 # Extra args are ignored:
243 p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz')
244 self.assertIsInstance(p_format3, str)
245 self.assertRegex(p_format3, p_format_regex)
246
247 # Test string decode from parameter of %s using utf-8.
248 # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
249 # '\u4eba\u6c11'
250 check_format('repr=\u4eba\u6c11',
251 b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
252
253 #Test replace error handler.
254 check_format('repr=abc\ufffd',
255 b'repr=%V', None, b'abc\xff')
256
257 # not supported: copy the raw format string. these tests are just here
258 # to check for crashes and should not be considered as specifications
259 check_format('%s',
260 b'%1%s', b'abc')
261 check_format('%1abc',
262 b'%1abc')
263 check_format('%+i',
264 b'%+i', c_int(10))
265 check_format('%.%s',
266 b'%.%s', b'abc')
267
268 # Issue #33817: empty strings
269 check_format('',
270 b'')
271 check_format('',
272 b'%s', b'')
273
274 # Test PyUnicode_AsWideChar()
275 @support.cpython_only
276 @unittest.skipIf(_testcapi is None, 'need _testcapi module')
277 def test_aswidechar(self):
278 from _testcapi import unicode_aswidechar
279 import_helper.import_module('ctypes')
280 from ctypes import c_wchar, sizeof
281
282 wchar, size = unicode_aswidechar('abcdef', 2)
283 self.assertEqual(size, 2)
284 self.assertEqual(wchar, 'ab')
285
286 wchar, size = unicode_aswidechar('abc', 3)
287 self.assertEqual(size, 3)
288 self.assertEqual(wchar, 'abc')
289
290 wchar, size = unicode_aswidechar('abc', 4)
291 self.assertEqual(size, 3)
292 self.assertEqual(wchar, 'abc\0')
293
294 wchar, size = unicode_aswidechar('abc', 10)
295 self.assertEqual(size, 3)
296 self.assertEqual(wchar, 'abc\0')
297
298 wchar, size = unicode_aswidechar('abc\0def', 20)
299 self.assertEqual(size, 7)
300 self.assertEqual(wchar, 'abc\0def\0')
301
302 nonbmp = chr(0x10ffff)
303 if sizeof(c_wchar) == 2:
304 buflen = 3
305 nchar = 2
306 else: # sizeof(c_wchar) == 4
307 buflen = 2
308 nchar = 1
309 wchar, size = unicode_aswidechar(nonbmp, buflen)
310 self.assertEqual(size, nchar)
311 self.assertEqual(wchar, nonbmp + '\0')
312
313 # Test PyUnicode_AsWideCharString()
314 @support.cpython_only
315 @unittest.skipIf(_testcapi is None, 'need _testcapi module')
316 def test_aswidecharstring(self):
317 from _testcapi import unicode_aswidecharstring
318 import_helper.import_module('ctypes')
319 from ctypes import c_wchar, sizeof
320
321 wchar, size = unicode_aswidecharstring('abc')
322 self.assertEqual(size, 3)
323 self.assertEqual(wchar, 'abc\0')
324
325 wchar, size = unicode_aswidecharstring('abc\0def')
326 self.assertEqual(size, 7)
327 self.assertEqual(wchar, 'abc\0def\0')
328
329 nonbmp = chr(0x10ffff)
330 if sizeof(c_wchar) == 2:
331 nchar = 2
332 else: # sizeof(c_wchar) == 4
333 nchar = 1
334 wchar, size = unicode_aswidecharstring(nonbmp)
335 self.assertEqual(size, nchar)
336 self.assertEqual(wchar, nonbmp + '\0')
337
338 # Test PyUnicode_AsUCS4()
339 @support.cpython_only
340 @unittest.skipIf(_testcapi is None, 'need _testcapi module')
341 def test_asucs4(self):
342 from _testcapi import unicode_asucs4
343 for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
344 'a\ud800b\udfffc', '\ud834\udd1e']:
345 l = len(s)
346 self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
347 self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
348 self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
349 self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
350 self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
351 self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
352 s = '\0'.join([s, s])
353 self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
354 self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
355
356 # Test PyUnicode_AsUTF8()
357 @support.cpython_only
358 @unittest.skipIf(_testcapi is None, 'need _testcapi module')
359 def test_asutf8(self):
360 from _testcapi import unicode_asutf8
361
362 bmp = '\u0100'
363 bmp2 = '\uffff'
364 nonbmp = chr(0x10ffff)
365
366 self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
367 self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
368 self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
369 self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
370
371 # Test PyUnicode_AsUTF8AndSize()
372 @support.cpython_only
373 @unittest.skipIf(_testcapi is None, 'need _testcapi module')
374 def test_asutf8andsize(self):
375 from _testcapi import unicode_asutf8andsize
376
377 bmp = '\u0100'
378 bmp2 = '\uffff'
379 nonbmp = chr(0x10ffff)
380
381 self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
382 self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
383 self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
384 self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
385
386 # Test PyUnicode_FindChar()
387 @support.cpython_only
388 @unittest.skipIf(_testcapi is None, 'need _testcapi module')
389 def test_findchar(self):
390 from _testcapi import unicode_findchar
391
392 for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
393 for i, ch in enumerate(str):
394 self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
395 self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
396
397 str = "!>_<!"
398 self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
399 self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
400 # start < end
401 self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
402 self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
403 # start >= end
404 self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
405 self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
406 # negative
407 self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
408 self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
409
410 # Test PyUnicode_CopyCharacters()
411 @support.cpython_only
412 @unittest.skipIf(_testcapi is None, 'need _testcapi module')
413 def test_copycharacters(self):
414 from _testcapi import unicode_copycharacters
415
416 strings = [
417 'abcde', '\xa1\xa2\xa3\xa4\xa5',
418 '\u4f60\u597d\u4e16\u754c\uff01',
419 '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
420 ]
421
422 for idx, from_ in enumerate(strings):
423 # wide -> narrow: exceed maxchar limitation
424 for to in strings[:idx]:
425 self.assertRaises(
426 SystemError,
427 unicode_copycharacters, to, 0, from_, 0, 5
428 )
429 # same kind
430 for from_start in range(5):
431 self.assertEqual(
432 unicode_copycharacters(from_, 0, from_, from_start, 5),
433 (from_[from_start:from_start+5].ljust(5, '\0'),
434 5-from_start)
435 )
436 for to_start in range(5):
437 self.assertEqual(
438 unicode_copycharacters(from_, to_start, from_, to_start, 5),
439 (from_[to_start:to_start+5].rjust(5, '\0'),
440 5-to_start)
441 )
442 # narrow -> wide
443 # Tests omitted since this creates invalid strings.
444
445 s = strings[0]
446 self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
447 self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
448 self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
449 self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
450 self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
451 self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
452 self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
453
454 @support.cpython_only
455 @unittest.skipIf(_testcapi is None, 'need _testcapi module')
456 def test_pep393_utf8_caching_bug(self):
457 # Issue #25709: Problem with string concatenation and utf-8 cache
458 from _testcapi import getargs_s_hash
459 for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
460 s = ''
461 for i in range(5):
462 # Due to CPython specific optimization the 's' string can be
463 # resized in-place.
464 s += chr(k)
465 # Parsing with the "s#" format code calls indirectly
466 # PyUnicode_AsUTF8AndSize() which creates the UTF-8
467 # encoded string cached in the Unicode object.
468 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
469 # Check that the second call returns the same result
470 self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
471
472
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()