(root)/
Python-3.11.7/
Lib/
test/
test_capi/
test_unicode.py
       1  import unittest
       2  import sys
       3  from test import support
       4  from test.support import import_helper
       5  
       6  try:
       7      import _testcapi
       8  except ImportError:
       9      _testcapi = None
      10  
      11  
       12  class CAPITest(unittest.TestCase):
      13  
      14      # Test PyUnicode_FromFormat()
      15      def test_from_format(self):
      16          import_helper.import_module('ctypes')
      17          from ctypes import (
      18              c_char_p,
      19              pythonapi, py_object, sizeof,
      20              c_int, c_long, c_longlong, c_ssize_t,
      21              c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)
      22          name = "PyUnicode_FromFormat"
      23          _PyUnicode_FromFormat = getattr(pythonapi, name)
      24          _PyUnicode_FromFormat.argtypes = (c_char_p,)
      25          _PyUnicode_FromFormat.restype = py_object
      26  
      27          def PyUnicode_FromFormat(format, *args):
      28              cargs = tuple(
      29                  py_object(arg) if isinstance(arg, str) else arg
      30                  for arg in args)
      31              return _PyUnicode_FromFormat(format, *cargs)
      32  
      33          def check_format(expected, format, *args):
      34              text = PyUnicode_FromFormat(format, *args)
      35              self.assertEqual(expected, text)
      36  
      37          # ascii format, non-ascii argument
      38          check_format('ascii\x7f=unicode\xe9',
      39                       b'ascii\x7f=%U', 'unicode\xe9')
      40  
      41          # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
      42          # raises an error
      43          self.assertRaisesRegex(ValueError,
      44              r'^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
      45              'string, got a non-ASCII byte: 0xe9$',
      46              PyUnicode_FromFormat, b'unicode\xe9=%s', 'ascii')
      47  
      48          # test "%c"
      49          check_format('\uabcd',
      50                       b'%c', c_int(0xabcd))
      51          check_format('\U0010ffff',
      52                       b'%c', c_int(0x10ffff))
      53          with self.assertRaises(OverflowError):
      54              PyUnicode_FromFormat(b'%c', c_int(0x110000))
      55          # Issue #18183
      56          check_format('\U00010000\U00100000',
      57                       b'%c%c', c_int(0x10000), c_int(0x100000))
      58  
      59          # test "%"
      60          check_format('%',
      61                       b'%')
      62          check_format('%',
      63                       b'%%')
      64          check_format('%s',
      65                       b'%%s')
      66          check_format('[%]',
      67                       b'[%%]')
      68          check_format('%abc',
      69                       b'%%%s', b'abc')
      70  
      71          # truncated string
      72          check_format('abc',
      73                       b'%.3s', b'abcdef')
      74          check_format('abc[\ufffd',
      75                       b'%.5s', 'abc[\u20ac]'.encode('utf8'))
      76          check_format("'\\u20acABC'",
      77                       b'%A', '\u20acABC')
      78          check_format("'\\u20",
      79                       b'%.5A', '\u20acABCDEF')
      80          check_format("'\u20acABC'",
      81                       b'%R', '\u20acABC')
      82          check_format("'\u20acA",
      83                       b'%.3R', '\u20acABCDEF')
      84          check_format('\u20acAB',
      85                       b'%.3S', '\u20acABCDEF')
      86          check_format('\u20acAB',
      87                       b'%.3U', '\u20acABCDEF')
      88          check_format('\u20acAB',
      89                       b'%.3V', '\u20acABCDEF', None)
      90          check_format('abc[\ufffd',
      91                       b'%.5V', None, 'abc[\u20ac]'.encode('utf8'))
      92  
      93          # following tests comes from #7330
      94          # test width modifier and precision modifier with %S
      95          check_format("repr=  abc",
      96                       b'repr=%5S', 'abc')
      97          check_format("repr=ab",
      98                       b'repr=%.2S', 'abc')
      99          check_format("repr=   ab",
     100                       b'repr=%5.2S', 'abc')
     101  
     102          # test width modifier and precision modifier with %R
     103          check_format("repr=   'abc'",
     104                       b'repr=%8R', 'abc')
     105          check_format("repr='ab",
     106                       b'repr=%.3R', 'abc')
     107          check_format("repr=  'ab",
     108                       b'repr=%5.3R', 'abc')
     109  
     110          # test width modifier and precision modifier with %A
     111          check_format("repr=   'abc'",
     112                       b'repr=%8A', 'abc')
     113          check_format("repr='ab",
     114                       b'repr=%.3A', 'abc')
     115          check_format("repr=  'ab",
     116                       b'repr=%5.3A', 'abc')
     117  
     118          # test width modifier and precision modifier with %s
     119          check_format("repr=  abc",
     120                       b'repr=%5s', b'abc')
     121          check_format("repr=ab",
     122                       b'repr=%.2s', b'abc')
     123          check_format("repr=   ab",
     124                       b'repr=%5.2s', b'abc')
     125  
     126          # test width modifier and precision modifier with %U
     127          check_format("repr=  abc",
     128                       b'repr=%5U', 'abc')
     129          check_format("repr=ab",
     130                       b'repr=%.2U', 'abc')
     131          check_format("repr=   ab",
     132                       b'repr=%5.2U', 'abc')
     133  
     134          # test width modifier and precision modifier with %V
     135          check_format("repr=  abc",
     136                       b'repr=%5V', 'abc', b'123')
     137          check_format("repr=ab",
     138                       b'repr=%.2V', 'abc', b'123')
     139          check_format("repr=   ab",
     140                       b'repr=%5.2V', 'abc', b'123')
     141          check_format("repr=  123",
     142                       b'repr=%5V', None, b'123')
     143          check_format("repr=12",
     144                       b'repr=%.2V', None, b'123')
     145          check_format("repr=   12",
     146                       b'repr=%5.2V', None, b'123')
     147  
     148          # test integer formats (%i, %d, %u)
     149          check_format('010',
     150                       b'%03i', c_int(10))
     151          check_format('0010',
     152                       b'%0.4i', c_int(10))
     153          check_format('-123',
     154                       b'%i', c_int(-123))
     155          check_format('-123',
     156                       b'%li', c_long(-123))
     157          check_format('-123',
     158                       b'%lli', c_longlong(-123))
     159          check_format('-123',
     160                       b'%zi', c_ssize_t(-123))
     161  
     162          check_format('-123',
     163                       b'%d', c_int(-123))
     164          check_format('-123',
     165                       b'%ld', c_long(-123))
     166          check_format('-123',
     167                       b'%lld', c_longlong(-123))
     168          check_format('-123',
     169                       b'%zd', c_ssize_t(-123))
     170  
     171          check_format('123',
     172                       b'%u', c_uint(123))
     173          check_format('123',
     174                       b'%lu', c_ulong(123))
     175          check_format('123',
     176                       b'%llu', c_ulonglong(123))
     177          check_format('123',
     178                       b'%zu', c_size_t(123))
     179  
     180          # test long output
     181          min_longlong = -(2 ** (8 * sizeof(c_longlong) - 1))
     182          max_longlong = -min_longlong - 1
     183          check_format(str(min_longlong),
     184                       b'%lld', c_longlong(min_longlong))
     185          check_format(str(max_longlong),
     186                       b'%lld', c_longlong(max_longlong))
     187          max_ulonglong = 2 ** (8 * sizeof(c_ulonglong)) - 1
     188          check_format(str(max_ulonglong),
     189                       b'%llu', c_ulonglong(max_ulonglong))
     190          PyUnicode_FromFormat(b'%p', c_void_p(-1))
     191  
     192          # test padding (width and/or precision)
     193          check_format('123'.rjust(10, '0'),
     194                       b'%010i', c_int(123))
     195          check_format('123'.rjust(100),
     196                       b'%100i', c_int(123))
     197          check_format('123'.rjust(100, '0'),
     198                       b'%.100i', c_int(123))
     199          check_format('123'.rjust(80, '0').rjust(100),
     200                       b'%100.80i', c_int(123))
     201  
     202          check_format('123'.rjust(10, '0'),
     203                       b'%010u', c_uint(123))
     204          check_format('123'.rjust(100),
     205                       b'%100u', c_uint(123))
     206          check_format('123'.rjust(100, '0'),
     207                       b'%.100u', c_uint(123))
     208          check_format('123'.rjust(80, '0').rjust(100),
     209                       b'%100.80u', c_uint(123))
     210  
     211          check_format('123'.rjust(10, '0'),
     212                       b'%010x', c_int(0x123))
     213          check_format('123'.rjust(100),
     214                       b'%100x', c_int(0x123))
     215          check_format('123'.rjust(100, '0'),
     216                       b'%.100x', c_int(0x123))
     217          check_format('123'.rjust(80, '0').rjust(100),
     218                       b'%100.80x', c_int(0x123))
     219  
     220          # test %A
     221          check_format(r"%A:'abc\xe9\uabcd\U0010ffff'",
     222                       b'%%A:%A', 'abc\xe9\uabcd\U0010ffff')
     223  
     224          # test %V
     225          check_format('repr=abc',
     226                       b'repr=%V', 'abc', b'xyz')
     227  
     228          # test %p
     229          # We cannot test the exact result,
     230          # because it returns a hex representation of a C pointer,
     231          # which is going to be different each time. But, we can test the format.
     232          p_format_regex = r'^0x[a-zA-Z0-9]{3,}$'
     233          p_format1 = PyUnicode_FromFormat(b'%p', 'abc')
     234          self.assertIsInstance(p_format1, str)
     235          self.assertRegex(p_format1, p_format_regex)
     236  
     237          p_format2 = PyUnicode_FromFormat(b'%p %p', '123456', b'xyz')
     238          self.assertIsInstance(p_format2, str)
     239          self.assertRegex(p_format2,
     240                           r'0x[a-zA-Z0-9]{3,} 0x[a-zA-Z0-9]{3,}')
     241  
     242          # Extra args are ignored:
     243          p_format3 = PyUnicode_FromFormat(b'%p', '123456', None, b'xyz')
     244          self.assertIsInstance(p_format3, str)
     245          self.assertRegex(p_format3, p_format_regex)
     246  
     247          # Test string decode from parameter of %s using utf-8.
     248          # b'\xe4\xba\xba\xe6\xb0\x91' is utf-8 encoded byte sequence of
     249          # '\u4eba\u6c11'
     250          check_format('repr=\u4eba\u6c11',
     251                       b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
     252  
     253          #Test replace error handler.
     254          check_format('repr=abc\ufffd',
     255                       b'repr=%V', None, b'abc\xff')
     256  
     257          # not supported: copy the raw format string. these tests are just here
     258          # to check for crashes and should not be considered as specifications
     259          check_format('%s',
     260                       b'%1%s', b'abc')
     261          check_format('%1abc',
     262                       b'%1abc')
     263          check_format('%+i',
     264                       b'%+i', c_int(10))
     265          check_format('%.%s',
     266                       b'%.%s', b'abc')
     267  
     268          # Issue #33817: empty strings
     269          check_format('',
     270                       b'')
     271          check_format('',
     272                       b'%s', b'')
     273  
     274      # Test PyUnicode_AsWideChar()
     275      @support.cpython_only
     276      @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     277      def test_aswidechar(self):
     278          from _testcapi import unicode_aswidechar
     279          import_helper.import_module('ctypes')
     280          from ctypes import c_wchar, sizeof
     281  
     282          wchar, size = unicode_aswidechar('abcdef', 2)
     283          self.assertEqual(size, 2)
     284          self.assertEqual(wchar, 'ab')
     285  
     286          wchar, size = unicode_aswidechar('abc', 3)
     287          self.assertEqual(size, 3)
     288          self.assertEqual(wchar, 'abc')
     289  
     290          wchar, size = unicode_aswidechar('abc', 4)
     291          self.assertEqual(size, 3)
     292          self.assertEqual(wchar, 'abc\0')
     293  
     294          wchar, size = unicode_aswidechar('abc', 10)
     295          self.assertEqual(size, 3)
     296          self.assertEqual(wchar, 'abc\0')
     297  
     298          wchar, size = unicode_aswidechar('abc\0def', 20)
     299          self.assertEqual(size, 7)
     300          self.assertEqual(wchar, 'abc\0def\0')
     301  
     302          nonbmp = chr(0x10ffff)
     303          if sizeof(c_wchar) == 2:
     304              buflen = 3
     305              nchar = 2
     306          else: # sizeof(c_wchar) == 4
     307              buflen = 2
     308              nchar = 1
     309          wchar, size = unicode_aswidechar(nonbmp, buflen)
     310          self.assertEqual(size, nchar)
     311          self.assertEqual(wchar, nonbmp + '\0')
     312  
     313      # Test PyUnicode_AsWideCharString()
     314      @support.cpython_only
     315      @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     316      def test_aswidecharstring(self):
     317          from _testcapi import unicode_aswidecharstring
     318          import_helper.import_module('ctypes')
     319          from ctypes import c_wchar, sizeof
     320  
     321          wchar, size = unicode_aswidecharstring('abc')
     322          self.assertEqual(size, 3)
     323          self.assertEqual(wchar, 'abc\0')
     324  
     325          wchar, size = unicode_aswidecharstring('abc\0def')
     326          self.assertEqual(size, 7)
     327          self.assertEqual(wchar, 'abc\0def\0')
     328  
     329          nonbmp = chr(0x10ffff)
     330          if sizeof(c_wchar) == 2:
     331              nchar = 2
     332          else: # sizeof(c_wchar) == 4
     333              nchar = 1
     334          wchar, size = unicode_aswidecharstring(nonbmp)
     335          self.assertEqual(size, nchar)
     336          self.assertEqual(wchar, nonbmp + '\0')
     337  
     338      # Test PyUnicode_AsUCS4()
     339      @support.cpython_only
     340      @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     341      def test_asucs4(self):
     342          from _testcapi import unicode_asucs4
     343          for s in ['abc', '\xa1\xa2', '\u4f60\u597d', 'a\U0001f600',
     344                    'a\ud800b\udfffc', '\ud834\udd1e']:
     345              l = len(s)
     346              self.assertEqual(unicode_asucs4(s, l, True), s+'\0')
     347              self.assertEqual(unicode_asucs4(s, l, False), s+'\uffff')
     348              self.assertEqual(unicode_asucs4(s, l+1, True), s+'\0\uffff')
     349              self.assertEqual(unicode_asucs4(s, l+1, False), s+'\0\uffff')
     350              self.assertRaises(SystemError, unicode_asucs4, s, l-1, True)
     351              self.assertRaises(SystemError, unicode_asucs4, s, l-2, False)
     352              s = '\0'.join([s, s])
     353              self.assertEqual(unicode_asucs4(s, len(s), True), s+'\0')
     354              self.assertEqual(unicode_asucs4(s, len(s), False), s+'\uffff')
     355  
     356      # Test PyUnicode_AsUTF8()
     357      @support.cpython_only
     358      @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     359      def test_asutf8(self):
     360          from _testcapi import unicode_asutf8
     361  
     362          bmp = '\u0100'
     363          bmp2 = '\uffff'
     364          nonbmp = chr(0x10ffff)
     365  
     366          self.assertEqual(unicode_asutf8(bmp), b'\xc4\x80')
     367          self.assertEqual(unicode_asutf8(bmp2), b'\xef\xbf\xbf')
     368          self.assertEqual(unicode_asutf8(nonbmp), b'\xf4\x8f\xbf\xbf')
     369          self.assertRaises(UnicodeEncodeError, unicode_asutf8, 'a\ud800b\udfffc')
     370  
     371      # Test PyUnicode_AsUTF8AndSize()
     372      @support.cpython_only
     373      @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     374      def test_asutf8andsize(self):
     375          from _testcapi import unicode_asutf8andsize
     376  
     377          bmp = '\u0100'
     378          bmp2 = '\uffff'
     379          nonbmp = chr(0x10ffff)
     380  
     381          self.assertEqual(unicode_asutf8andsize(bmp), (b'\xc4\x80', 2))
     382          self.assertEqual(unicode_asutf8andsize(bmp2), (b'\xef\xbf\xbf', 3))
     383          self.assertEqual(unicode_asutf8andsize(nonbmp), (b'\xf4\x8f\xbf\xbf', 4))
     384          self.assertRaises(UnicodeEncodeError, unicode_asutf8andsize, 'a\ud800b\udfffc')
     385  
     386      # Test PyUnicode_FindChar()
     387      @support.cpython_only
     388      @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     389      def test_findchar(self):
     390          from _testcapi import unicode_findchar
     391  
     392          for str in "\xa1", "\u8000\u8080", "\ud800\udc02", "\U0001f100\U0001f1f1":
     393              for i, ch in enumerate(str):
     394                  self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), 1), i)
     395                  self.assertEqual(unicode_findchar(str, ord(ch), 0, len(str), -1), i)
     396  
     397          str = "!>_<!"
     398          self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), 1), -1)
     399          self.assertEqual(unicode_findchar(str, 0x110000, 0, len(str), -1), -1)
     400          # start < end
     401          self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, 1), 4)
     402          self.assertEqual(unicode_findchar(str, ord('!'), 1, len(str)+1, -1), 4)
     403          # start >= end
     404          self.assertEqual(unicode_findchar(str, ord('!'), 0, 0, 1), -1)
     405          self.assertEqual(unicode_findchar(str, ord('!'), len(str), 0, 1), -1)
     406          # negative
     407          self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, 1), 0)
     408          self.assertEqual(unicode_findchar(str, ord('!'), -len(str), -1, -1), 0)
     409  
     410      # Test PyUnicode_CopyCharacters()
     411      @support.cpython_only
     412      @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     413      def test_copycharacters(self):
     414          from _testcapi import unicode_copycharacters
     415  
     416          strings = [
     417              'abcde', '\xa1\xa2\xa3\xa4\xa5',
     418              '\u4f60\u597d\u4e16\u754c\uff01',
     419              '\U0001f600\U0001f601\U0001f602\U0001f603\U0001f604'
     420          ]
     421  
     422          for idx, from_ in enumerate(strings):
     423              # wide -> narrow: exceed maxchar limitation
     424              for to in strings[:idx]:
     425                  self.assertRaises(
     426                      SystemError,
     427                      unicode_copycharacters, to, 0, from_, 0, 5
     428                  )
     429              # same kind
     430              for from_start in range(5):
     431                  self.assertEqual(
     432                      unicode_copycharacters(from_, 0, from_, from_start, 5),
     433                      (from_[from_start:from_start+5].ljust(5, '\0'),
     434                       5-from_start)
     435                  )
     436              for to_start in range(5):
     437                  self.assertEqual(
     438                      unicode_copycharacters(from_, to_start, from_, to_start, 5),
     439                      (from_[to_start:to_start+5].rjust(5, '\0'),
     440                       5-to_start)
     441                  )
     442              # narrow -> wide
     443              # Tests omitted since this creates invalid strings.
     444  
     445          s = strings[0]
     446          self.assertRaises(IndexError, unicode_copycharacters, s, 6, s, 0, 5)
     447          self.assertRaises(IndexError, unicode_copycharacters, s, -1, s, 0, 5)
     448          self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, 6, 5)
     449          self.assertRaises(IndexError, unicode_copycharacters, s, 0, s, -1, 5)
     450          self.assertRaises(SystemError, unicode_copycharacters, s, 1, s, 0, 5)
     451          self.assertRaises(SystemError, unicode_copycharacters, s, 0, s, 0, -1)
     452          self.assertRaises(SystemError, unicode_copycharacters, s, 0, b'', 0, 0)
     453  
     454      @support.cpython_only
     455      @unittest.skipIf(_testcapi is None, 'need _testcapi module')
     456      def test_pep393_utf8_caching_bug(self):
     457          # Issue #25709: Problem with string concatenation and utf-8 cache
     458          from _testcapi import getargs_s_hash
     459          for k in 0x24, 0xa4, 0x20ac, 0x1f40d:
     460              s = ''
     461              for i in range(5):
     462                  # Due to CPython specific optimization the 's' string can be
     463                  # resized in-place.
     464                  s += chr(k)
     465                  # Parsing with the "s#" format code calls indirectly
     466                  # PyUnicode_AsUTF8AndSize() which creates the UTF-8
     467                  # encoded string cached in the Unicode object.
     468                  self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
     469                  # Check that the second call returns the same result
     470                  self.assertEqual(getargs_s_hash(s), chr(k).encode() * (i + 1))
     471  
     472  
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()