1  """
Test the implementation of PEP 540: the UTF-8 Mode.
       3  """
       4  
       5  import locale
       6  import subprocess
       7  import sys
       8  import textwrap
       9  import unittest
      10  from test import support
      11  from test.support.script_helper import assert_python_ok, assert_python_failure
      12  from test.support import os_helper
      13  
      14  
      15  MS_WINDOWS = (sys.platform == 'win32')
      16  POSIX_LOCALES = ('C', 'POSIX')
      17  VXWORKS = (sys.platform == "vxworks")
      18  
      19  class ESC[4;38;5;81mUTF8ModeTests(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
      20      DEFAULT_ENV = {
      21          'PYTHONUTF8': '',
      22          'PYTHONLEGACYWINDOWSFSENCODING': '',
      23          'PYTHONCOERCECLOCALE': '0',
      24      }
      25  
      26      def posix_locale(self):
      27          loc = locale.setlocale(locale.LC_CTYPE, None)
      28          return (loc in POSIX_LOCALES)
      29  
      30      def get_output(self, *args, failure=False, **kw):
      31          kw = dict(self.DEFAULT_ENV, **kw)
      32          if failure:
      33              out = assert_python_failure(*args, **kw)
      34              out = out[2]
      35          else:
      36              out = assert_python_ok(*args, **kw)
      37              out = out[1]
      38          return out.decode().rstrip("\n\r")
      39  
      40      @unittest.skipIf(MS_WINDOWS, 'Windows has no POSIX locale')
      41      def test_posix_locale(self):
      42          code = 'import sys; print(sys.flags.utf8_mode)'
      43  
      44          for loc in POSIX_LOCALES:
      45              with self.subTest(LC_ALL=loc):
      46                  out = self.get_output('-c', code, LC_ALL=loc)
      47                  self.assertEqual(out, '1')
      48  
      49      def test_xoption(self):
      50          code = 'import sys; print(sys.flags.utf8_mode)'
      51  
      52          out = self.get_output('-X', 'utf8', '-c', code)
      53          self.assertEqual(out, '1')
      54  
      55          # undocumented but accepted syntax: -X utf8=1
      56          out = self.get_output('-X', 'utf8=1', '-c', code)
      57          self.assertEqual(out, '1')
      58  
      59          out = self.get_output('-X', 'utf8=0', '-c', code)
      60          self.assertEqual(out, '0')
      61  
      62          if MS_WINDOWS:
      63              # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 Mode
      64              # and has the priority over -X utf8
      65              out = self.get_output('-X', 'utf8', '-c', code,
      66                                    PYTHONLEGACYWINDOWSFSENCODING='1')
      67              self.assertEqual(out, '0')
      68  
      69      def test_env_var(self):
      70          code = 'import sys; print(sys.flags.utf8_mode)'
      71  
      72          out = self.get_output('-c', code, PYTHONUTF8='1')
      73          self.assertEqual(out, '1')
      74  
      75          out = self.get_output('-c', code, PYTHONUTF8='0')
      76          self.assertEqual(out, '0')
      77  
      78          # -X utf8 has the priority over PYTHONUTF8
      79          out = self.get_output('-X', 'utf8=0', '-c', code, PYTHONUTF8='1')
      80          self.assertEqual(out, '0')
      81  
      82          if MS_WINDOWS:
      83              # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
      84              # and has the priority over PYTHONUTF8
      85              out = self.get_output('-X', 'utf8', '-c', code, PYTHONUTF8='1',
      86                                    PYTHONLEGACYWINDOWSFSENCODING='1')
      87              self.assertEqual(out, '0')
      88  
      89          # Cannot test with the POSIX locale, since the POSIX locale enables
      90          # the UTF-8 mode
      91          if not self.posix_locale():
      92              # PYTHONUTF8 should be ignored if -E is used
      93              out = self.get_output('-E', '-c', code, PYTHONUTF8='1')
      94              self.assertEqual(out, '0')
      95  
      96          # invalid mode
      97          out = self.get_output('-c', code, PYTHONUTF8='xxx', failure=True)
      98          self.assertIn('invalid PYTHONUTF8 environment variable value',
      99                        out.rstrip())
     100  
     101      def test_filesystemencoding(self):
     102          code = textwrap.dedent('''
     103              import sys
     104              print("{}/{}".format(sys.getfilesystemencoding(),
     105                                   sys.getfilesystemencodeerrors()))
     106          ''')
     107  
     108          if MS_WINDOWS:
     109              expected = 'utf-8/surrogatepass'
     110          else:
     111              expected = 'utf-8/surrogateescape'
     112  
     113          out = self.get_output('-X', 'utf8', '-c', code)
     114          self.assertEqual(out, expected)
     115  
     116          if MS_WINDOWS:
     117              # PYTHONLEGACYWINDOWSFSENCODING disables the UTF-8 mode
     118              # and has the priority over -X utf8 and PYTHONUTF8
     119              out = self.get_output('-X', 'utf8', '-c', code,
     120                                    PYTHONUTF8='strict',
     121                                    PYTHONLEGACYWINDOWSFSENCODING='1')
     122              self.assertEqual(out, 'mbcs/replace')
     123  
     124      def test_stdio(self):
     125          code = textwrap.dedent('''
     126              import sys
     127              print(f"stdin: {sys.stdin.encoding}/{sys.stdin.errors}")
     128              print(f"stdout: {sys.stdout.encoding}/{sys.stdout.errors}")
     129              print(f"stderr: {sys.stderr.encoding}/{sys.stderr.errors}")
     130          ''')
     131  
     132          out = self.get_output('-X', 'utf8', '-c', code,
     133                                PYTHONIOENCODING='')
     134          self.assertEqual(out.splitlines(),
     135                           ['stdin: utf-8/surrogateescape',
     136                            'stdout: utf-8/surrogateescape',
     137                            'stderr: utf-8/backslashreplace'])
     138  
     139          # PYTHONIOENCODING has the priority over PYTHONUTF8
     140          out = self.get_output('-X', 'utf8', '-c', code,
     141                                PYTHONIOENCODING="latin1")
     142          self.assertEqual(out.splitlines(),
     143                           ['stdin: iso8859-1/strict',
     144                            'stdout: iso8859-1/strict',
     145                            'stderr: iso8859-1/backslashreplace'])
     146  
     147          out = self.get_output('-X', 'utf8', '-c', code,
     148                                PYTHONIOENCODING=":namereplace")
     149          self.assertEqual(out.splitlines(),
     150                           ['stdin: utf-8/namereplace',
     151                            'stdout: utf-8/namereplace',
     152                            'stderr: utf-8/backslashreplace'])
     153  
     154      def test_io(self):
     155          code = textwrap.dedent('''
     156              import sys
     157              filename = sys.argv[1]
     158              with open(filename) as fp:
     159                  print(f"{fp.encoding}/{fp.errors}")
     160          ''')
     161          filename = __file__
     162  
     163          out = self.get_output('-c', code, filename, PYTHONUTF8='1')
     164          self.assertEqual(out.lower(), 'utf-8/strict')
     165  
     166      def _check_io_encoding(self, module, encoding=None, errors=None):
     167          filename = __file__
     168  
     169          # Encoding explicitly set
     170          args = []
     171          if encoding:
     172              args.append(f'encoding={encoding!r}')
     173          if errors:
     174              args.append(f'errors={errors!r}')
     175          code = textwrap.dedent('''
     176              import sys
     177              from %s import open
     178              filename = sys.argv[1]
     179              with open(filename, %s) as fp:
     180                  print(f"{fp.encoding}/{fp.errors}")
     181          ''') % (module, ', '.join(args))
     182          out = self.get_output('-c', code, filename,
     183                                PYTHONUTF8='1')
     184  
     185          if not encoding:
     186              encoding = 'utf-8'
     187          if not errors:
     188              errors = 'strict'
     189          self.assertEqual(out.lower(), f'{encoding}/{errors}')
     190  
     191      def check_io_encoding(self, module):
     192          self._check_io_encoding(module, encoding="latin1")
     193          self._check_io_encoding(module, errors="namereplace")
     194          self._check_io_encoding(module,
     195                                  encoding="latin1", errors="namereplace")
     196  
     197      def test_io_encoding(self):
     198          self.check_io_encoding('io')
     199  
     200      def test_pyio_encoding(self):
     201          self.check_io_encoding('_pyio')
     202  
     203      def test_locale_getpreferredencoding(self):
     204          code = 'import locale; print(locale.getpreferredencoding(False), locale.getpreferredencoding(True))'
     205          out = self.get_output('-X', 'utf8', '-c', code)
     206          self.assertEqual(out, 'utf-8 utf-8')
     207  
     208          for loc in POSIX_LOCALES:
     209              with self.subTest(LC_ALL=loc):
     210                  out = self.get_output('-X', 'utf8', '-c', code, LC_ALL=loc)
     211                  self.assertEqual(out, 'utf-8 utf-8')
     212  
     213      @unittest.skipIf(MS_WINDOWS, 'test specific to Unix')
     214      def test_cmd_line(self):
     215          arg = 'h\xe9\u20ac'.encode('utf-8')
     216          arg_utf8 = arg.decode('utf-8')
     217          arg_ascii = arg.decode('ascii', 'surrogateescape')
     218          code = 'import locale, sys; print("%s:%s" % (locale.getpreferredencoding(), ascii(sys.argv[1:])))'
     219  
     220          def check(utf8_opt, expected, **kw):
     221              out = self.get_output('-X', utf8_opt, '-c', code, arg, **kw)
     222              args = out.partition(':')[2].rstrip()
     223              self.assertEqual(args, ascii(expected), out)
     224  
     225          check('utf8', [arg_utf8])
     226          for loc in POSIX_LOCALES:
     227              with self.subTest(LC_ALL=loc):
     228                  check('utf8', [arg_utf8], LC_ALL=loc)
     229  
     230          if sys.platform == 'darwin' or support.is_android or VXWORKS:
     231              c_arg = arg_utf8
     232          elif sys.platform.startswith("aix"):
     233              c_arg = arg.decode('iso-8859-1')
     234          else:
     235              c_arg = arg_ascii
     236          for loc in POSIX_LOCALES:
     237              with self.subTest(LC_ALL=loc):
     238                  check('utf8=0', [c_arg], LC_ALL=loc)
     239  
     240      def test_optim_level(self):
     241          # CPython: check that Py_Main() doesn't increment Py_OptimizeFlag
     242          # twice when -X utf8 requires to parse the configuration twice (when
     243          # the encoding changes after reading the configuration, the
     244          # configuration is read again with the new encoding).
     245          code = 'import sys; print(sys.flags.optimize)'
     246          out = self.get_output('-X', 'utf8', '-O', '-c', code)
     247          self.assertEqual(out, '1')
     248          out = self.get_output('-X', 'utf8', '-OO', '-c', code)
     249          self.assertEqual(out, '2')
     250  
     251          code = 'import sys; print(sys.flags.ignore_environment)'
     252          out = self.get_output('-X', 'utf8', '-E', '-c', code)
     253          self.assertEqual(out, '1')
     254  
     255      @unittest.skipIf(MS_WINDOWS,
     256                       "os.device_encoding() doesn't implement "
     257                       "the UTF-8 Mode on Windows")
     258      @support.requires_subprocess()
     259      def test_device_encoding(self):
     260          # Use stdout as TTY
     261          if not sys.stdout.isatty():
     262              self.skipTest("sys.stdout is not a TTY")
     263  
     264          filename = 'out.txt'
     265          self.addCleanup(os_helper.unlink, filename)
     266  
     267          code = (f'import os, sys; fd = sys.stdout.fileno(); '
     268                  f'out = open({filename!r}, "w", encoding="utf-8"); '
     269                  f'print(os.isatty(fd), os.device_encoding(fd), file=out); '
     270                  f'out.close()')
     271          cmd = [sys.executable, '-X', 'utf8', '-c', code]
     272          # The stdout TTY is inherited to the child process
     273          proc = subprocess.run(cmd, text=True)
     274          self.assertEqual(proc.returncode, 0, proc)
     275  
     276          # In UTF-8 Mode, device_encoding(fd) returns "UTF-8" if fd is a TTY
     277          with open(filename, encoding="utf8") as fp:
     278              out = fp.read().rstrip()
     279          self.assertEqual(out, 'True utf-8')
     280  
     281  
# Run the tests of this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()