(root)/
Python-3.12.0/
Lib/
test/
test_c_locale_coercion.py
       1  # Tests the attempted automatic coercion of the C locale to a UTF-8 locale
       2  
       3  import locale
       4  import os
       5  import subprocess
       6  import sys
       7  import sysconfig
       8  import unittest
       9  from collections import namedtuple
      10  
      11  from test import support
      12  from test.support.script_helper import run_python_until_end
      13  
      14  
      15  # Set the list of ways we expect to be able to ask for the "C" locale
      16  EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]
      17  
      18  # Set our expectation for the default encoding used in the C locale
      19  # for the filesystem encoding and the standard streams
      20  EXPECTED_C_LOCALE_STREAM_ENCODING = "ascii"
      21  EXPECTED_C_LOCALE_FS_ENCODING = "ascii"
      22  
      23  # Set our expectation for the default locale used when none is specified
      24  EXPECT_COERCION_IN_DEFAULT_LOCALE = True
      25  
      26  TARGET_LOCALES = ["C.UTF-8", "C.utf8", "UTF-8"]
      27  
      28  # Apply some platform dependent overrides
      29  if sys.platform.startswith("linux"):
      30      if support.is_android:
      31          # Android defaults to using UTF-8 for all system interfaces
      32          EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8"
      33          EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
      34      else:
      35          # Linux distros typically alias the POSIX locale directly to the C
      36          # locale.
      37          # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
      38          #       able to check this case unconditionally
      39          EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX")
      40  elif sys.platform.startswith("aix"):
      41      # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
      42      EXPECTED_C_LOCALE_STREAM_ENCODING = "iso8859-1"
      43      EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
      44  elif sys.platform == "darwin":
      45      # FS encoding is UTF-8 on macOS
      46      EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
      47  elif sys.platform == "cygwin":
      48      # Cygwin defaults to using C.UTF-8
      49      # TODO: Work out a robust dynamic test for this that doesn't rely on
      50      #       CPython's own locale handling machinery
      51      EXPECT_COERCION_IN_DEFAULT_LOCALE = False
      52  elif sys.platform == "vxworks":
      53      # VxWorks defaults to using UTF-8 for all system interfaces
      54      EXPECTED_C_LOCALE_STREAM_ENCODING = "utf-8"
      55      EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
      56  
      57  # Note that the above expectations are still wrong in some cases, such as:
      58  # * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
      59  # * Any platform other than AIX that uses latin-1 in the C locale
      60  # * Any Linux distro where POSIX isn't a simple alias for the C locale
      61  # * Any Linux distro where the default locale is something other than "C"
      62  #
      63  # Options for dealing with this:
      64  # * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
      65  #   such platforms (e.g. it isn't set on Windows)
      66  # * Fix the test expectations to match the actual platform behaviour
      67  
      68  # In order to get the warning messages to match up as expected, the candidate
      69  # order here must match the target locale order in Python/pylifecycle.c
      70  _C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")
      71  
      72  # There's no reliable cross-platform way of checking locale alias
      73  # lists, so the only way of knowing which of these locales will work
      74  # is to try them with locale.setlocale(). We do that in a subprocess
      75  # in setUpModule() below to avoid altering the locale of the test runner.
      76  #
      77  # If the relevant locale module attributes exist, and we're not on a platform
      78  # where we expect it to always succeed, we also check that
      79  # `locale.nl_langinfo(locale.CODESET)` works, as if it fails, the interpreter
      80  # will skip locale coercion for that particular target locale
      81  _check_nl_langinfo_CODESET = bool(
      82      sys.platform not in ("darwin", "linux") and
      83      hasattr(locale, "nl_langinfo") and
      84      hasattr(locale, "CODESET")
      85  )
      86  
      87  def _set_locale_in_subprocess(locale_name):
      88      cmd_fmt = "import locale; print(locale.setlocale(locale.LC_CTYPE, '{}'))"
      89      if _check_nl_langinfo_CODESET:
      90          # If there's no valid CODESET, we expect coercion to be skipped
      91          cmd_fmt += "; import sys; sys.exit(not locale.nl_langinfo(locale.CODESET))"
      92      cmd = cmd_fmt.format(locale_name)
      93      result, py_cmd = run_python_until_end("-c", cmd, PYTHONCOERCECLOCALE='')
      94      return result.rc == 0
      95  
      96  
      97  
      98  _fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
      99  _EncodingDetails = namedtuple("EncodingDetails", _fields)
     100  
     101  class ESC[4;38;5;81mEncodingDetails(ESC[4;38;5;149m_EncodingDetails):
     102      # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
     103      CHILD_PROCESS_SCRIPT = ";".join([
     104          "import sys, os",
     105          "print(sys.getfilesystemencoding())",
     106          "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
     107          "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
     108          "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
     109          "print(os.environ.get('LANG', 'not set'))",
     110          "print(os.environ.get('LC_CTYPE', 'not set'))",
     111          "print(os.environ.get('LC_ALL', 'not set'))",
     112      ])
     113  
     114      @classmethod
     115      def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
     116          """Returns expected child process details for a given encoding"""
     117          _stream = stream_encoding + ":{}"
     118          # stdin and stdout should use surrogateescape either because the
     119          # coercion triggered, or because the C locale was detected
     120          stream_info = 2*[_stream.format("surrogateescape")]
     121          # stderr should always use backslashreplace
     122          stream_info.append(_stream.format("backslashreplace"))
     123          expected_lang = env_vars.get("LANG", "not set")
     124          if coercion_expected:
     125              expected_lc_ctype = CLI_COERCION_TARGET
     126          else:
     127              expected_lc_ctype = env_vars.get("LC_CTYPE", "not set")
     128          expected_lc_all = env_vars.get("LC_ALL", "not set")
     129          env_info = expected_lang, expected_lc_ctype, expected_lc_all
     130          return dict(cls(fs_encoding, *stream_info, *env_info)._asdict())
     131  
     132      @classmethod
     133      def get_child_details(cls, env_vars):
     134          """Retrieves fsencoding and standard stream details from a child process
     135  
     136          Returns (encoding_details, stderr_lines):
     137  
     138          - encoding_details: EncodingDetails for eager decoding
     139          - stderr_lines: result of calling splitlines() on the stderr output
     140  
     141          The child is run in isolated mode if the current interpreter supports
     142          that.
     143          """
     144          result, py_cmd = run_python_until_end(
     145              "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
     146              **env_vars
     147          )
     148          if not result.rc == 0:
     149              result.fail(py_cmd)
     150          # All subprocess outputs in this test case should be pure ASCII
     151          stdout_lines = result.out.decode("ascii").splitlines()
     152          child_encoding_details = dict(cls(*stdout_lines)._asdict())
     153          stderr_lines = result.err.decode("ascii").rstrip().splitlines()
     154          return child_encoding_details, stderr_lines
     155  
     156  
     157  # Details of the shared library warning emitted at runtime
     158  LEGACY_LOCALE_WARNING = (
     159      "Python runtime initialized with LC_CTYPE=C (a locale with default ASCII "
     160      "encoding), which may cause Unicode compatibility problems. Using C.UTF-8, "
     161      "C.utf8, or UTF-8 (if available) as alternative Unicode-compatible "
     162      "locales is recommended."
     163  )
     164  
     165  # Details of the CLI locale coercion warning emitted at runtime
     166  CLI_COERCION_WARNING_FMT = (
     167      "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} (set another locale "
     168      "or PYTHONCOERCECLOCALE=0 to disable this locale coercion behavior)."
     169  )
     170  
     171  
     172  AVAILABLE_TARGETS = None
     173  CLI_COERCION_TARGET = None
     174  CLI_COERCION_WARNING = None
     175  
     176  def setUpModule():
     177      global AVAILABLE_TARGETS
     178      global CLI_COERCION_TARGET
     179      global CLI_COERCION_WARNING
     180  
     181      if AVAILABLE_TARGETS is not None:
     182          # initialization already done
     183          return
     184      AVAILABLE_TARGETS = []
     185  
     186      # Find the target locales available in the current system
     187      for target_locale in _C_UTF8_LOCALES:
     188          if _set_locale_in_subprocess(target_locale):
     189              AVAILABLE_TARGETS.append(target_locale)
     190  
     191      if AVAILABLE_TARGETS:
     192          # Coercion is expected to use the first available target locale
     193          CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
     194          CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)
     195  
     196      if support.verbose:
     197          print(f"AVAILABLE_TARGETS = {AVAILABLE_TARGETS!r}")
     198          print(f"EXPECTED_C_LOCALE_EQUIVALENTS = {EXPECTED_C_LOCALE_EQUIVALENTS!r}")
     199          print(f"EXPECTED_C_LOCALE_STREAM_ENCODING = {EXPECTED_C_LOCALE_STREAM_ENCODING!r}")
     200          print(f"EXPECTED_C_LOCALE_FS_ENCODING = {EXPECTED_C_LOCALE_FS_ENCODING!r}")
     201          print(f"EXPECT_COERCION_IN_DEFAULT_LOCALE = {EXPECT_COERCION_IN_DEFAULT_LOCALE!r}")
     202          print(f"_C_UTF8_LOCALES = {_C_UTF8_LOCALES!r}")
     203          print(f"_check_nl_langinfo_CODESET = {_check_nl_langinfo_CODESET!r}")
     204  
     205  
     206  class ESC[4;38;5;81m_LocaleHandlingTestCase(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
     207      # Base class to check expected locale handling behaviour
     208  
     209      def _check_child_encoding_details(self,
     210                                        env_vars,
     211                                        expected_fs_encoding,
     212                                        expected_stream_encoding,
     213                                        expected_warnings,
     214                                        coercion_expected):
     215          """Check the C locale handling for the given process environment
     216  
     217          Parameters:
     218              expected_fs_encoding: expected sys.getfilesystemencoding() result
     219              expected_stream_encoding: expected encoding for standard streams
     220              expected_warning: stderr output to expect (if any)
     221          """
     222          result = EncodingDetails.get_child_details(env_vars)
     223          encoding_details, stderr_lines = result
     224          expected_details = EncodingDetails.get_expected_details(
     225              coercion_expected,
     226              expected_fs_encoding,
     227              expected_stream_encoding,
     228              env_vars
     229          )
     230          self.assertEqual(encoding_details, expected_details)
     231          if expected_warnings is None:
     232              expected_warnings = []
     233          self.assertEqual(stderr_lines, expected_warnings)
     234  
     235  
     236  class ESC[4;38;5;81mLocaleConfigurationTests(ESC[4;38;5;149m_LocaleHandlingTestCase):
     237      # Test explicit external configuration via the process environment
     238  
     239      @classmethod
     240      def setUpClass(cls):
     241          # This relies on setUpModule() having been run, so it can't be
     242          # handled via the @unittest.skipUnless decorator
     243          if not AVAILABLE_TARGETS:
     244              raise unittest.SkipTest("No C-with-UTF-8 locale available")
     245  
     246      def test_external_target_locale_configuration(self):
     247  
     248          # Explicitly setting a target locale should give the same behaviour as
     249          # is seen when implicitly coercing to that target locale
     250          self.maxDiff = None
     251  
     252          expected_fs_encoding = "utf-8"
     253          expected_stream_encoding = "utf-8"
     254  
     255          base_var_dict = {
     256              "LANG": "",
     257              "LC_CTYPE": "",
     258              "LC_ALL": "",
     259              "PYTHONCOERCECLOCALE": "",
     260          }
     261          for env_var in ("LANG", "LC_CTYPE"):
     262              for locale_to_set in AVAILABLE_TARGETS:
     263                  # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
     264                  #                 expected, so skip that combination for now
     265                  # See https://bugs.python.org/issue30672 for discussion
     266                  if env_var == "LANG" and locale_to_set == "UTF-8":
     267                      continue
     268  
     269                  with self.subTest(env_var=env_var,
     270                                    configured_locale=locale_to_set):
     271                      var_dict = base_var_dict.copy()
     272                      var_dict[env_var] = locale_to_set
     273                      self._check_child_encoding_details(var_dict,
     274                                                         expected_fs_encoding,
     275                                                         expected_stream_encoding,
     276                                                         expected_warnings=None,
     277                                                         coercion_expected=False)
     278  
     279  
     280  
     281  @support.cpython_only
     282  @unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
     283                       "C locale coercion disabled at build time")
     284  class ESC[4;38;5;81mLocaleCoercionTests(ESC[4;38;5;149m_LocaleHandlingTestCase):
     285      # Test implicit reconfiguration of the environment during CLI startup
     286  
     287      def _check_c_locale_coercion(self,
     288                                   fs_encoding, stream_encoding,
     289                                   coerce_c_locale,
     290                                   expected_warnings=None,
     291                                   coercion_expected=True,
     292                                   **extra_vars):
     293          """Check the C locale handling for various configurations
     294  
     295          Parameters:
     296              fs_encoding: expected sys.getfilesystemencoding() result
     297              stream_encoding: expected encoding for standard streams
     298              coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
     299                None: don't set the variable at all
     300                str: the value set in the child's environment
     301              expected_warnings: expected warning lines on stderr
     302              extra_vars: additional environment variables to set in subprocess
     303          """
     304          self.maxDiff = None
     305  
     306          if not AVAILABLE_TARGETS:
     307              # Locale coercion is disabled when there aren't any target locales
     308              fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
     309              stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
     310              coercion_expected = False
     311              if expected_warnings:
     312                  expected_warnings = [LEGACY_LOCALE_WARNING]
     313  
     314          base_var_dict = {
     315              "LANG": "",
     316              "LC_CTYPE": "",
     317              "LC_ALL": "",
     318              "PYTHONCOERCECLOCALE": "",
     319          }
     320          base_var_dict.update(extra_vars)
     321          if coerce_c_locale is not None:
     322              base_var_dict["PYTHONCOERCECLOCALE"] = coerce_c_locale
     323  
     324          # Check behaviour for the default locale
     325          with self.subTest(default_locale=True,
     326                            PYTHONCOERCECLOCALE=coerce_c_locale):
     327              if EXPECT_COERCION_IN_DEFAULT_LOCALE:
     328                  _expected_warnings = expected_warnings
     329                  _coercion_expected = coercion_expected
     330              else:
     331                  _expected_warnings = None
     332                  _coercion_expected = False
     333              # On Android CLI_COERCION_WARNING is not printed when all the
     334              # locale environment variables are undefined or empty. When
     335              # this code path is run with environ['LC_ALL'] == 'C', then
     336              # LEGACY_LOCALE_WARNING is printed.
     337              if (support.is_android and
     338                      _expected_warnings == [CLI_COERCION_WARNING]):
     339                  _expected_warnings = None
     340              self._check_child_encoding_details(base_var_dict,
     341                                                 fs_encoding,
     342                                                 stream_encoding,
     343                                                 _expected_warnings,
     344                                                 _coercion_expected)
     345  
     346          # Check behaviour for explicitly configured locales
     347          for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS:
     348              for env_var in ("LANG", "LC_CTYPE"):
     349                  with self.subTest(env_var=env_var,
     350                                    nominal_locale=locale_to_set,
     351                                    PYTHONCOERCECLOCALE=coerce_c_locale):
     352                      var_dict = base_var_dict.copy()
     353                      var_dict[env_var] = locale_to_set
     354                      # Check behaviour on successful coercion
     355                      self._check_child_encoding_details(var_dict,
     356                                                         fs_encoding,
     357                                                         stream_encoding,
     358                                                         expected_warnings,
     359                                                         coercion_expected)
     360  
     361      def test_PYTHONCOERCECLOCALE_not_set(self):
     362          # This should coerce to the first available target locale by default
     363          self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)
     364  
     365      def test_PYTHONCOERCECLOCALE_not_zero(self):
     366          # *Any* string other than "0" is considered "set" for our purposes
     367          # and hence should result in the locale coercion being enabled
     368          for setting in ("", "1", "true", "false"):
     369              self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)
     370  
     371      def test_PYTHONCOERCECLOCALE_set_to_warn(self):
     372          # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
     373          self._check_c_locale_coercion("utf-8", "utf-8",
     374                                        coerce_c_locale="warn",
     375                                        expected_warnings=[CLI_COERCION_WARNING])
     376  
     377  
     378      def test_PYTHONCOERCECLOCALE_set_to_zero(self):
     379          # The setting "0" should result in the locale coercion being disabled
     380          self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
     381                                        EXPECTED_C_LOCALE_STREAM_ENCODING,
     382                                        coerce_c_locale="0",
     383                                        coercion_expected=False)
     384          # Setting LC_ALL=C shouldn't make any difference to the behaviour
     385          self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
     386                                        EXPECTED_C_LOCALE_STREAM_ENCODING,
     387                                        coerce_c_locale="0",
     388                                        LC_ALL="C",
     389                                        coercion_expected=False)
     390  
     391      def test_LC_ALL_set_to_C(self):
     392          # Setting LC_ALL should render the locale coercion ineffective
     393          self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
     394                                        EXPECTED_C_LOCALE_STREAM_ENCODING,
     395                                        coerce_c_locale=None,
     396                                        LC_ALL="C",
     397                                        coercion_expected=False)
     398          # And result in a warning about a lack of locale compatibility
     399          self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
     400                                        EXPECTED_C_LOCALE_STREAM_ENCODING,
     401                                        coerce_c_locale="warn",
     402                                        LC_ALL="C",
     403                                        expected_warnings=[LEGACY_LOCALE_WARNING],
     404                                        coercion_expected=False)
     405  
     406      def test_PYTHONCOERCECLOCALE_set_to_one(self):
     407          # skip the test if the LC_CTYPE locale is C or coerced
     408          old_loc = locale.setlocale(locale.LC_CTYPE, None)
     409          self.addCleanup(locale.setlocale, locale.LC_CTYPE, old_loc)
     410          try:
     411              loc = locale.setlocale(locale.LC_CTYPE, "")
     412          except locale.Error as e:
     413              self.skipTest(str(e))
     414          if loc == "C":
     415              self.skipTest("test requires LC_CTYPE locale different than C")
     416          if loc in TARGET_LOCALES :
     417              self.skipTest("coerced LC_CTYPE locale: %s" % loc)
     418  
     419          # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale
     420          # if it's not equal to "C"
     421          code = 'import locale; print(locale.setlocale(locale.LC_CTYPE, None))'
     422          env = dict(os.environ, PYTHONCOERCECLOCALE='1')
     423          cmd = subprocess.run([sys.executable, '-c', code],
     424                               stdout=subprocess.PIPE,
     425                               env=env,
     426                               text=True)
     427          self.assertEqual(cmd.stdout.rstrip(), loc)
     428  
     429  
     430  def tearDownModule():
     431      support.reap_children()
     432  
     433  
     434  if __name__ == "__main__":
     435      unittest.main()