python (3.11.7)

(root)/
lib/
python3.11/
test/
test_source_encoding.py
       1  # -*- coding: koi8-r -*-
       2  
       3  import unittest
       4  from test.support import script_helper, captured_stdout, requires_subprocess, requires_resource
       5  from test.support.os_helper import TESTFN, unlink, rmtree
       6  from test.support.import_helper import unload
       7  import importlib
       8  import os
       9  import sys
      10  import subprocess
      11  import tempfile
      12  
      13  class ESC[4;38;5;81mMiscSourceEncodingTest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
      14  
      15      def test_pep263(self):
      16          self.assertEqual(
      17              "�����".encode("utf-8"),
      18              b'\xd0\x9f\xd0\xb8\xd1\x82\xd0\xbe\xd0\xbd'
      19          )
      20          self.assertEqual(
      21              "\�".encode("utf-8"),
      22              b'\\\xd0\x9f'
      23          )
      24  
      25      def test_compilestring(self):
      26          # see #1882
      27          c = compile(b"\n# coding: utf-8\nu = '\xc3\xb3'\n", "dummy", "exec")
      28          d = {}
      29          exec(c, d)
      30          self.assertEqual(d['u'], '\xf3')
      31  
      32      def test_issue2301(self):
      33          try:
      34              compile(b"# coding: cp932\nprint '\x94\x4e'", "dummy", "exec")
      35          except SyntaxError as v:
      36              self.assertEqual(v.text.rstrip('\n'), "print '\u5e74'")
      37          else:
      38              self.fail()
      39  
      40      def test_issue4626(self):
      41          c = compile("# coding=latin-1\n\u00c6 = '\u00c6'", "dummy", "exec")
      42          d = {}
      43          exec(c, d)
      44          self.assertEqual(d['\xc6'], '\xc6')
      45  
      46      def test_issue3297(self):
      47          c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec")
      48          d = {}
      49          exec(c, d)
      50          self.assertEqual(d['a'], d['b'])
      51          self.assertEqual(len(d['a']), len(d['b']))
      52          self.assertEqual(ascii(d['a']), ascii(d['b']))
      53  
      54      def test_issue7820(self):
      55          # Ensure that check_bom() restores all bytes in the right order if
      56          # check_bom() fails in pydebug mode: a buffer starts with the first
      57          # byte of a valid BOM, but next bytes are different
      58  
      59          # one byte in common with the UTF-16-LE BOM
      60          self.assertRaises(SyntaxError, eval, b'\xff\x20')
      61  
      62          # one byte in common with the UTF-8 BOM
      63          self.assertRaises(SyntaxError, eval, b'\xef\x20')
      64  
      65          # two bytes in common with the UTF-8 BOM
      66          self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')
      67  
      68      @requires_subprocess()
      69      def test_20731(self):
      70          sub = subprocess.Popen([sys.executable,
      71                          os.path.join(os.path.dirname(__file__),
      72                                       'tokenizedata',
      73                                       'coding20731.py')],
      74                          stderr=subprocess.PIPE)
      75          err = sub.communicate()[1]
      76          self.assertEqual(sub.returncode, 0)
      77          self.assertNotIn(b'SyntaxError', err)
      78  
      79      def test_error_message(self):
      80          compile(b'# -*- coding: iso-8859-15 -*-\n', 'dummy', 'exec')
      81          compile(b'\xef\xbb\xbf\n', 'dummy', 'exec')
      82          compile(b'\xef\xbb\xbf# -*- coding: utf-8 -*-\n', 'dummy', 'exec')
      83          with self.assertRaisesRegex(SyntaxError, 'fake'):
      84              compile(b'# -*- coding: fake -*-\n', 'dummy', 'exec')
      85          with self.assertRaisesRegex(SyntaxError, 'iso-8859-15'):
      86              compile(b'\xef\xbb\xbf# -*- coding: iso-8859-15 -*-\n',
      87                      'dummy', 'exec')
      88          with self.assertRaisesRegex(SyntaxError, 'BOM'):
      89              compile(b'\xef\xbb\xbf# -*- coding: iso-8859-15 -*-\n',
      90                      'dummy', 'exec')
      91          with self.assertRaisesRegex(SyntaxError, 'fake'):
      92              compile(b'\xef\xbb\xbf# -*- coding: fake -*-\n', 'dummy', 'exec')
      93          with self.assertRaisesRegex(SyntaxError, 'BOM'):
      94              compile(b'\xef\xbb\xbf# -*- coding: fake -*-\n', 'dummy', 'exec')
      95  
      96      def test_bad_coding(self):
      97          module_name = 'bad_coding'
      98          self.verify_bad_module(module_name)
      99  
     100      def test_bad_coding2(self):
     101          module_name = 'bad_coding2'
     102          self.verify_bad_module(module_name)
     103  
     104      def verify_bad_module(self, module_name):
     105          self.assertRaises(SyntaxError, __import__, 'test.tokenizedata.' + module_name)
     106  
     107          path = os.path.dirname(__file__)
     108          filename = os.path.join(path, 'tokenizedata', module_name + '.py')
     109          with open(filename, "rb") as fp:
     110              bytes = fp.read()
     111          self.assertRaises(SyntaxError, compile, bytes, filename, 'exec')
     112  
     113      def test_exec_valid_coding(self):
     114          d = {}
     115          exec(b'# coding: cp949\na = "\xaa\xa7"\n', d)
     116          self.assertEqual(d['a'], '\u3047')
     117  
     118      def test_file_parse(self):
     119          # issue1134: all encodings outside latin-1 and utf-8 fail on
     120          # multiline strings and long lines (>512 columns)
     121          unload(TESTFN)
     122          filename = TESTFN + ".py"
     123          f = open(filename, "w", encoding="cp1252")
     124          sys.path.insert(0, os.curdir)
     125          try:
     126              with f:
     127                  f.write("# -*- coding: cp1252 -*-\n")
     128                  f.write("'''A short string\n")
     129                  f.write("'''\n")
     130                  f.write("'A very long string %s'\n" % ("X" * 1000))
     131  
     132              importlib.invalidate_caches()
     133              __import__(TESTFN)
     134          finally:
     135              del sys.path[0]
     136              unlink(filename)
     137              unlink(filename + "c")
     138              unlink(filename + "o")
     139              unload(TESTFN)
     140              rmtree('__pycache__')
     141  
     142      def test_error_from_string(self):
     143          # See http://bugs.python.org/issue6289
     144          input = "# coding: ascii\n\N{SNOWMAN}".encode('utf-8')
     145          with self.assertRaises(SyntaxError) as c:
     146              compile(input, "<string>", "exec")
     147          expected = "'ascii' codec can't decode byte 0xe2 in position 16: " \
     148                     "ordinal not in range(128)"
     149          self.assertTrue(c.exception.args[0].startswith(expected),
     150                          msg=c.exception.args[0])
     151  
     152      def test_file_parse_error_multiline(self):
     153          # gh96611:
     154          with open(TESTFN, "wb") as fd:
     155              fd.write(b'print("""\n\xb1""")\n')
     156  
     157          try:
     158              retcode, stdout, stderr = script_helper.assert_python_failure(TESTFN)
     159  
     160              self.assertGreater(retcode, 0)
     161              self.assertIn(b"Non-UTF-8 code starting with '\\xb1'", stderr)
     162          finally:
     163              os.unlink(TESTFN)
     164  
     165      def test_tokenizer_fstring_warning_in_first_line(self):
     166          source = "0b1and 2"
     167          with open(TESTFN, "w") as fd:
     168              fd.write("{}".format(source))
     169          try:
     170              retcode, stdout, stderr = script_helper.assert_python_ok(TESTFN)
     171              self.assertIn(b"SyntaxWarning: invalid binary litera", stderr)
     172              self.assertEqual(stderr.count(source.encode()), 1)
     173          finally:
     174              os.unlink(TESTFN)
     175  
     176  
     177  class ESC[4;38;5;81mAbstractSourceEncodingTest:
     178  
     179      def test_default_coding(self):
     180          src = (b'print(ascii("\xc3\xa4"))\n')
     181          self.check_script_output(src, br"'\xe4'")
     182  
     183      def test_first_coding_line(self):
     184          src = (b'#coding:iso8859-15\n'
     185                 b'print(ascii("\xc3\xa4"))\n')
     186          self.check_script_output(src, br"'\xc3\u20ac'")
     187  
     188      def test_second_coding_line(self):
     189          src = (b'#\n'
     190                 b'#coding:iso8859-15\n'
     191                 b'print(ascii("\xc3\xa4"))\n')
     192          self.check_script_output(src, br"'\xc3\u20ac'")
     193  
     194      def test_third_coding_line(self):
     195          # Only first two lines are tested for a magic comment.
     196          src = (b'#\n'
     197                 b'#\n'
     198                 b'#coding:iso8859-15\n'
     199                 b'print(ascii("\xc3\xa4"))\n')
     200          self.check_script_output(src, br"'\xe4'")
     201  
     202      def test_double_coding_line(self):
     203          # If the first line matches the second line is ignored.
     204          src = (b'#coding:iso8859-15\n'
     205                 b'#coding:latin1\n'
     206                 b'print(ascii("\xc3\xa4"))\n')
     207          self.check_script_output(src, br"'\xc3\u20ac'")
     208  
     209      def test_double_coding_same_line(self):
     210          src = (b'#coding:iso8859-15 coding:latin1\n'
     211                 b'print(ascii("\xc3\xa4"))\n')
     212          self.check_script_output(src, br"'\xc3\u20ac'")
     213  
     214      def test_first_non_utf8_coding_line(self):
     215          src = (b'#coding:iso-8859-15 \xa4\n'
     216                 b'print(ascii("\xc3\xa4"))\n')
     217          self.check_script_output(src, br"'\xc3\u20ac'")
     218  
     219      def test_second_non_utf8_coding_line(self):
     220          src = (b'\n'
     221                 b'#coding:iso-8859-15 \xa4\n'
     222                 b'print(ascii("\xc3\xa4"))\n')
     223          self.check_script_output(src, br"'\xc3\u20ac'")
     224  
     225      def test_utf8_bom(self):
     226          src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
     227          self.check_script_output(src, br"'\xe4'")
     228  
     229      def test_utf8_bom_and_utf8_coding_line(self):
     230          src = (b'\xef\xbb\xbf#coding:utf-8\n'
     231                 b'print(ascii("\xc3\xa4"))\n')
     232          self.check_script_output(src, br"'\xe4'")
     233  
     234      def test_crlf(self):
     235          src = (b'print(ascii("""\r\n"""))\n')
     236          out = self.check_script_output(src, br"'\n'")
     237  
     238      def test_crcrlf(self):
     239          src = (b'print(ascii("""\r\r\n"""))\n')
     240          out = self.check_script_output(src, br"'\n\n'")
     241  
     242      def test_crcrcrlf(self):
     243          src = (b'print(ascii("""\r\r\r\n"""))\n')
     244          out = self.check_script_output(src, br"'\n\n\n'")
     245  
     246      def test_crcrcrlf2(self):
     247          src = (b'#coding:iso-8859-1\n'
     248                 b'print(ascii("""\r\r\r\n"""))\n')
     249          out = self.check_script_output(src, br"'\n\n\n'")
     250  
     251  
     252  class ESC[4;38;5;81mUTF8ValidatorTest(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
     253      @unittest.skipIf(not sys.platform.startswith("linux"),
     254                       "Too slow to run on non-Linux platforms")
     255      @requires_resource('cpu')
     256      def test_invalid_utf8(self):
     257          # This is a port of test_utf8_decode_invalid_sequences in
     258          # test_unicode.py to exercise the separate utf8 validator in
     259          # Parser/tokenizer.c used when reading source files.
     260  
     261          # That file is written using low-level C file I/O, so the only way to
     262          # test it is to write actual files to disk.
     263  
     264          # Each example is put inside a string at the top of the file so
     265          # it's an otherwise valid Python source file. Put some newlines
     266          # beforehand so we can assert that the error is reported on the
     267          # correct line.
     268          template = b'\n\n\n"%s"\n'
     269  
     270          fn = TESTFN
     271          self.addCleanup(unlink, fn)
     272  
     273          def check(content):
     274              with open(fn, 'wb') as fp:
     275                  fp.write(template % content)
     276              rc, stdout, stderr = script_helper.assert_python_failure(fn)
     277              # We want to assert that the python subprocess failed gracefully,
     278              # not via a signal.
     279              self.assertGreaterEqual(rc, 1)
     280              self.assertIn(b"Non-UTF-8 code starting with", stderr)
     281              self.assertIn(b"on line 4", stderr)
     282  
     283          # continuation bytes in a sequence of 2, 3, or 4 bytes
     284          continuation_bytes = [bytes([x]) for x in range(0x80, 0xC0)]
     285          # start bytes of a 2-byte sequence equivalent to code points < 0x7F
     286          invalid_2B_seq_start_bytes = [bytes([x]) for x in range(0xC0, 0xC2)]
     287          # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
     288          invalid_4B_seq_start_bytes = [bytes([x]) for x in range(0xF5, 0xF8)]
     289          invalid_start_bytes = (
     290              continuation_bytes + invalid_2B_seq_start_bytes +
     291              invalid_4B_seq_start_bytes + [bytes([x]) for x in range(0xF7, 0x100)]
     292          )
     293  
     294          for byte in invalid_start_bytes:
     295              check(byte)
     296  
     297          for sb in invalid_2B_seq_start_bytes:
     298              for cb in continuation_bytes:
     299                  check(sb + cb)
     300  
     301          for sb in invalid_4B_seq_start_bytes:
     302              for cb1 in continuation_bytes[:3]:
     303                  for cb3 in continuation_bytes[:3]:
     304                      check(sb+cb1+b'\x80'+cb3)
     305  
     306          for cb in [bytes([x]) for x in range(0x80, 0xA0)]:
     307              check(b'\xE0'+cb+b'\x80')
     308              check(b'\xE0'+cb+b'\xBF')
     309              # surrogates
     310          for cb in [bytes([x]) for x in range(0xA0, 0xC0)]:
     311              check(b'\xED'+cb+b'\x80')
     312              check(b'\xED'+cb+b'\xBF')
     313          for cb in [bytes([x]) for x in range(0x80, 0x90)]:
     314              check(b'\xF0'+cb+b'\x80\x80')
     315              check(b'\xF0'+cb+b'\xBF\xBF')
     316          for cb in [bytes([x]) for x in range(0x90, 0xC0)]:
     317              check(b'\xF4'+cb+b'\x80\x80')
     318              check(b'\xF4'+cb+b'\xBF\xBF')
     319  
     320  
     321  class ESC[4;38;5;81mBytesSourceEncodingTest(ESC[4;38;5;149mAbstractSourceEncodingTest, ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
     322  
     323      def check_script_output(self, src, expected):
     324          with captured_stdout() as stdout:
     325              exec(src)
     326          out = stdout.getvalue().encode('latin1')
     327          self.assertEqual(out.rstrip(), expected)
     328  
     329  
     330  class ESC[4;38;5;81mFileSourceEncodingTest(ESC[4;38;5;149mAbstractSourceEncodingTest, ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
     331  
     332      def check_script_output(self, src, expected):
     333          with tempfile.TemporaryDirectory() as tmpd:
     334              fn = os.path.join(tmpd, 'test.py')
     335              with open(fn, 'wb') as fp:
     336                  fp.write(src)
     337              res = script_helper.assert_python_ok(fn)
     338          self.assertEqual(res.out.rstrip(), expected)
     339  
     340  
# Run every test in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()