1 # Tests the attempted automatic coercion of the C locale to a UTF-8 locale
2
3 import locale
4 import os
5 import subprocess
6 import sys
7 import sysconfig
8 import unittest
9 from collections import namedtuple
10
11 from test import support
12 from test.support.script_helper import run_python_until_end
13
14
# Locale names that we expect to be usable as a way of asking for the "C"
# locale
EXPECTED_C_LOCALE_EQUIVALENTS = ["C", "invalid.ascii"]

# Default expectations for the C locale: the encoding of the standard
# streams and the encoding used for the filesystem
EXPECTED_C_LOCALE_STREAM_ENCODING = "ascii"
EXPECTED_C_LOCALE_FS_ENCODING = "ascii"

# Whether coercion is expected when no locale is specified at all
EXPECT_COERCION_IN_DEFAULT_LOCALE = True

TARGET_LOCALES = ["C.UTF-8", "C.utf8", "UTF-8"]

# Platform dependent overrides of the defaults above
if sys.platform.startswith("linux") and support.is_android:
    # Android defaults to using UTF-8 for all system interfaces
    EXPECTED_C_LOCALE_STREAM_ENCODING = EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform.startswith("linux"):
    # Linux distros typically alias the POSIX locale directly to the C
    # locale.
    # TODO: Once https://bugs.python.org/issue30672 is addressed, we'll be
    #       able to check this case unconditionally
    EXPECTED_C_LOCALE_EQUIVALENTS.append("POSIX")
elif sys.platform.startswith("aix"):
    # AIX uses iso8859-1 in the C locale, other *nix platforms use ASCII
    EXPECTED_C_LOCALE_STREAM_ENCODING = EXPECTED_C_LOCALE_FS_ENCODING = "iso8859-1"
elif sys.platform == "darwin":
    # Only the FS encoding differs on macOS: it is UTF-8
    EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
elif sys.platform == "cygwin":
    # Cygwin defaults to using C.UTF-8
    # TODO: Work out a robust dynamic test for this that doesn't rely on
    #       CPython's own locale handling machinery
    EXPECT_COERCION_IN_DEFAULT_LOCALE = False
elif sys.platform == "vxworks":
    # VxWorks defaults to using UTF-8 for all system interfaces
    EXPECTED_C_LOCALE_STREAM_ENCODING = EXPECTED_C_LOCALE_FS_ENCODING = "utf-8"
56
57 # Note that the above expectations are still wrong in some cases, such as:
58 # * Windows when PYTHONLEGACYWINDOWSFSENCODING is set
59 # * Any platform other than AIX that uses latin-1 in the C locale
60 # * Any Linux distro where POSIX isn't a simple alias for the C locale
61 # * Any Linux distro where the default locale is something other than "C"
62 #
63 # Options for dealing with this:
64 # * Don't set the PY_COERCE_C_LOCALE preprocessor definition on
65 # such platforms (e.g. it isn't set on Windows)
66 # * Fix the test expectations to match the actual platform behaviour
67
# In order to get the warning messages to match up as expected, the candidate
# order here must match the target locale order in Python/pylifecycle.c
_C_UTF8_LOCALES = ("C.UTF-8", "C.utf8", "UTF-8")

# Locale alias lists can't be inspected reliably across platforms, so the
# only way to learn which of the locales above actually work is to try each
# one with locale.setlocale(). That is done in a subprocess from
# setUpModule() below, so the locale of the test runner is left untouched.
#
# Outside the platforms where we expect it to always succeed, and provided
# the locale module exposes the needed attributes, we additionally check that
# `locale.nl_langinfo(locale.CODESET)` works: if it fails, the interpreter
# will skip locale coercion for that particular target locale
_check_nl_langinfo_CODESET = (
    sys.platform not in ("darwin", "linux")
    and all(hasattr(locale, attr) for attr in ("nl_langinfo", "CODESET"))
)
86
def _set_locale_in_subprocess(locale_name):
    """Report whether a child interpreter can select *locale_name*.

    The child is started through run_python_until_end() with
    PYTHONCOERCECLOCALE set to the empty string and runs
    locale.setlocale(locale.LC_CTYPE, locale_name). When
    _check_nl_langinfo_CODESET is true, the child additionally exits with a
    non-zero status if locale.nl_langinfo(locale.CODESET) is falsy.

    Returns True if the child's return code is 0, False otherwise.
    """
    statements = [
        "import locale",
        "print(locale.setlocale(locale.LC_CTYPE, '{}'))".format(locale_name),
    ]
    if _check_nl_langinfo_CODESET:
        # If there's no valid CODESET, we expect coercion to be skipped
        statements.append("import sys")
        statements.append("sys.exit(not locale.nl_langinfo(locale.CODESET))")
    child, _ = run_python_until_end("-c", "; ".join(statements),
                                    PYTHONCOERCECLOCALE='')
    return child.rc == 0
95
96
97
_fields = "fsencoding stdin_info stdout_info stderr_info lang lc_ctype lc_all"
_EncodingDetails = namedtuple("EncodingDetails", _fields)


class EncodingDetails(_EncodingDetails):
    """Encoding and locale environment details reported by a child process.

    Both classmethods return a plain dict keyed by the namedtuple field
    names, so expected and actual details can be compared directly.
    """

    # XXX (ncoghlan): Using JSON for child state reporting may be less fragile
    # One value is printed per line, in the same order as the fields above
    CHILD_PROCESS_SCRIPT = ";".join((
        "import sys, os",
        "print(sys.getfilesystemencoding())",
        "print(sys.stdin.encoding + ':' + sys.stdin.errors)",
        "print(sys.stdout.encoding + ':' + sys.stdout.errors)",
        "print(sys.stderr.encoding + ':' + sys.stderr.errors)",
        "print(os.environ.get('LANG', 'not set'))",
        "print(os.environ.get('LC_CTYPE', 'not set'))",
        "print(os.environ.get('LC_ALL', 'not set'))",
    ))

    @classmethod
    def get_expected_details(cls, coercion_expected, fs_encoding, stream_encoding, env_vars):
        """Build the details dict a child process is expected to report.

        Parameters:
            coercion_expected: if true, the expected LC_CTYPE is the module
              level CLI_COERCION_TARGET rather than the value in env_vars
            fs_encoding: expected filesystem encoding
            stream_encoding: expected encoding of the standard streams
            env_vars: mapping of the environment variables given to the child
        """
        template = stream_encoding + ":{}"
        # stdin and stdout should use surrogateescape either because the
        # coercion triggered, or because the C locale was detected, while
        # stderr should always use backslashreplace
        stdin_info, stdout_info, stderr_info = (
            template.format(handler)
            for handler in ("surrogateescape", "surrogateescape", "backslashreplace")
        )
        lang = env_vars.get("LANG", "not set")
        if coercion_expected:
            lc_ctype = CLI_COERCION_TARGET
        else:
            lc_ctype = env_vars.get("LC_CTYPE", "not set")
        lc_all = env_vars.get("LC_ALL", "not set")
        details = cls(
            fsencoding=fs_encoding,
            stdin_info=stdin_info,
            stdout_info=stdout_info,
            stderr_info=stderr_info,
            lang=lang,
            lc_ctype=lc_ctype,
            lc_all=lc_all,
        )
        return dict(details._asdict())

    @classmethod
    def get_child_details(cls, env_vars):
        """Run CHILD_PROCESS_SCRIPT in a child process and parse its output.

        The child is started through run_python_until_end() with the
        "-X utf8=0" option, and env_vars is passed on as keyword arguments.
        If the child's return code is not 0, result.fail() is called with
        the command that was run.

        Returns (encoding_details, stderr_lines):

        - encoding_details: dict keyed by the EncodingDetails field names,
          filled from the lines the child printed on stdout
        - stderr_lines: result of calling splitlines() on the stderr output
        """
        outcome, py_cmd = run_python_until_end(
            "-X", "utf8=0", "-c", cls.CHILD_PROCESS_SCRIPT,
            **env_vars
        )
        if outcome.rc != 0:
            outcome.fail(py_cmd)
        # All subprocess outputs in this test case should be pure ASCII
        reported_values = outcome.out.decode("ascii").splitlines()
        reported_details = dict(cls(*reported_values)._asdict())
        reported_stderr = outcome.err.decode("ascii").rstrip().splitlines()
        return reported_details, reported_stderr
155
156
# Text of the shared library warning emitted at runtime
LEGACY_LOCALE_WARNING = (
    "Python runtime initialized with LC_CTYPE=C "
    "(a locale with default ASCII encoding), "
    "which may cause Unicode compatibility problems. "
    "Using C.UTF-8, C.utf8, or UTF-8 (if available) "
    "as alternative Unicode-compatible locales is recommended."
)

# Template of the CLI locale coercion warning emitted at runtime; the
# placeholder receives the name of the locale that LC_CTYPE was coerced to
CLI_COERCION_WARNING_FMT = (
    "Python detected LC_CTYPE=C: LC_CTYPE coerced to {} "
    "(set another locale or PYTHONCOERCECLOCALE=0 "
    "to disable this locale coercion behavior)."
)


# Module state filled in by setUpModule(); None means "not initialized yet"
AVAILABLE_TARGETS = None
CLI_COERCION_TARGET = None
CLI_COERCION_WARNING = None
175
def setUpModule():
    """Probe the target locales once and record the results in module globals.

    Fills AVAILABLE_TARGETS with every entry of _C_UTF8_LOCALES that
    _set_locale_in_subprocess() accepts. When at least one is available,
    CLI_COERCION_TARGET is set to the first of them and CLI_COERCION_WARNING
    to the matching warning text. Calling this again after AVAILABLE_TARGETS
    has been set is a no-op.
    """
    global AVAILABLE_TARGETS, CLI_COERCION_TARGET, CLI_COERCION_WARNING

    if AVAILABLE_TARGETS is not None:
        # initialization already done
        return

    # Find the target locales available in the current system. The list is
    # bound first and filled one entry at a time as each probe completes.
    AVAILABLE_TARGETS = []
    AVAILABLE_TARGETS.extend(
        candidate for candidate in _C_UTF8_LOCALES
        if _set_locale_in_subprocess(candidate)
    )

    if AVAILABLE_TARGETS:
        # Coercion is expected to use the first available target locale
        CLI_COERCION_TARGET = AVAILABLE_TARGETS[0]
        CLI_COERCION_WARNING = CLI_COERCION_WARNING_FMT.format(CLI_COERCION_TARGET)

    if not support.verbose:
        return

    # In verbose mode, dump the expectations the tests will run against
    settings = (
        ("AVAILABLE_TARGETS", AVAILABLE_TARGETS),
        ("EXPECTED_C_LOCALE_EQUIVALENTS", EXPECTED_C_LOCALE_EQUIVALENTS),
        ("EXPECTED_C_LOCALE_STREAM_ENCODING", EXPECTED_C_LOCALE_STREAM_ENCODING),
        ("EXPECTED_C_LOCALE_FS_ENCODING", EXPECTED_C_LOCALE_FS_ENCODING),
        ("EXPECT_COERCION_IN_DEFAULT_LOCALE", EXPECT_COERCION_IN_DEFAULT_LOCALE),
        ("_C_UTF8_LOCALES", _C_UTF8_LOCALES),
        ("_check_nl_langinfo_CODESET", _check_nl_langinfo_CODESET),
    )
    for label, value in settings:
        print(f"{label} = {value!r}")
204
205
class _LocaleHandlingTestCase(unittest.TestCase):
    # Base class to check expected locale handling behaviour

    def _check_child_encoding_details(self,
                                      env_vars,
                                      expected_fs_encoding,
                                      expected_stream_encoding,
                                      expected_warnings,
                                      coercion_expected):
        """Check the C locale handling for the given process environment

        Parameters:
            env_vars: environment variables to set in the child process
            expected_fs_encoding: expected sys.getfilesystemencoding() result
            expected_stream_encoding: expected encoding for standard streams
            expected_warnings: list of stderr lines to expect; None means
              that no stderr output is expected
            coercion_expected: passed on to
              EncodingDetails.get_expected_details()
        """
        actual_details, stderr_lines = EncodingDetails.get_child_details(env_vars)
        wanted_details = EncodingDetails.get_expected_details(
            coercion_expected,
            expected_fs_encoding,
            expected_stream_encoding,
            env_vars
        )
        self.assertEqual(actual_details, wanted_details)
        wanted_stderr = [] if expected_warnings is None else expected_warnings
        self.assertEqual(stderr_lines, wanted_stderr)
234
235
class LocaleConfigurationTests(_LocaleHandlingTestCase):
    # Test explicit external configuration via the process environment

    @classmethod
    def setUpClass(cls):
        # AVAILABLE_TARGETS is only populated once setUpModule() has run, so
        # this check can't be expressed as a @unittest.skipUnless decorator
        if AVAILABLE_TARGETS:
            return
        raise unittest.SkipTest("No C-with-UTF-8 locale available")

    def test_external_target_locale_configuration(self):
        # Explicitly setting a target locale should give the same behaviour as
        # is seen when implicitly coercing to that target locale
        self.maxDiff = None

        utf8 = "utf-8"
        cleared_env = dict.fromkeys(
            ("LANG", "LC_CTYPE", "LC_ALL", "PYTHONCOERCECLOCALE"), ""
        )
        for env_var in ("LANG", "LC_CTYPE"):
            for locale_to_set in AVAILABLE_TARGETS:
                # XXX (ncoghlan): LANG=UTF-8 doesn't appear to work as
                #                 expected, so skip that combination for now
                # See https://bugs.python.org/issue30672 for discussion
                if (env_var, locale_to_set) == ("LANG", "UTF-8"):
                    continue

                with self.subTest(env_var=env_var,
                                  configured_locale=locale_to_set):
                    child_env = {**cleared_env, env_var: locale_to_set}
                    self._check_child_encoding_details(child_env,
                                                       utf8,
                                                       utf8,
                                                       expected_warnings=None,
                                                       coercion_expected=False)
278
279
280
@support.cpython_only
@unittest.skipUnless(sysconfig.get_config_var("PY_COERCE_C_LOCALE"),
                     "C locale coercion disabled at build time")
class LocaleCoercionTests(_LocaleHandlingTestCase):
    # Test implicit reconfiguration of the environment during CLI startup

    def _check_c_locale_coercion(self,
                                 fs_encoding, stream_encoding,
                                 coerce_c_locale,
                                 expected_warnings=None,
                                 coercion_expected=True,
                                 **extra_vars):
        """Check the C locale handling for various configurations

        The check is run once with all locale variables empty, and then once
        per combination of EXPECTED_C_LOCALE_EQUIVALENTS entry and locale
        variable (LANG, LC_CTYPE).

        Parameters:
            fs_encoding: expected sys.getfilesystemencoding() result
            stream_encoding: expected encoding for standard streams
            coerce_c_locale: setting to use for PYTHONCOERCECLOCALE
              None: leave the variable at its empty default
              str: the value set in the child's environment
            expected_warnings: expected warning lines on stderr
            coercion_expected: whether the child's LC_CTYPE is expected to
              have been coerced to the target locale
            extra_vars: additional environment variables to set in subprocess
        """
        self.maxDiff = None

        if not AVAILABLE_TARGETS:
            # Locale coercion is disabled when there aren't any target locales
            fs_encoding = EXPECTED_C_LOCALE_FS_ENCODING
            stream_encoding = EXPECTED_C_LOCALE_STREAM_ENCODING
            coercion_expected = False
            if expected_warnings:
                expected_warnings = [LEGACY_LOCALE_WARNING]

        child_env = {
            "LANG": "",
            "LC_CTYPE": "",
            "LC_ALL": "",
            "PYTHONCOERCECLOCALE": "",
            **extra_vars,
        }
        if coerce_c_locale is not None:
            child_env["PYTHONCOERCECLOCALE"] = coerce_c_locale

        # Check behaviour for the default locale
        with self.subTest(default_locale=True,
                          PYTHONCOERCECLOCALE=coerce_c_locale):
            if EXPECT_COERCION_IN_DEFAULT_LOCALE:
                default_warnings = expected_warnings
                default_coercion = coercion_expected
            else:
                default_warnings = None
                default_coercion = False
            # On Android CLI_COERCION_WARNING is not printed when all the
            # locale environment variables are undefined or empty. When
            # this code path is run with environ['LC_ALL'] == 'C', then
            # LEGACY_LOCALE_WARNING is printed.
            if (support.is_android and
                    default_warnings == [CLI_COERCION_WARNING]):
                default_warnings = None
            self._check_child_encoding_details(child_env,
                                               fs_encoding,
                                               stream_encoding,
                                               default_warnings,
                                               default_coercion)

        # Check behaviour for explicitly configured locales
        for locale_to_set in EXPECTED_C_LOCALE_EQUIVALENTS:
            for env_var in ("LANG", "LC_CTYPE"):
                with self.subTest(env_var=env_var,
                                  nominal_locale=locale_to_set,
                                  PYTHONCOERCECLOCALE=coerce_c_locale):
                    configured_env = {**child_env, env_var: locale_to_set}
                    # Check behaviour on successful coercion
                    self._check_child_encoding_details(configured_env,
                                                       fs_encoding,
                                                       stream_encoding,
                                                       expected_warnings,
                                                       coercion_expected)

    def test_PYTHONCOERCECLOCALE_not_set(self):
        # This should coerce to the first available target locale by default
        self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=None)

    def test_PYTHONCOERCECLOCALE_not_zero(self):
        # *Any* string other than "0" is considered "set" for our purposes
        # and hence should result in the locale coercion being enabled
        for setting in ("", "1", "true", "false"):
            self._check_c_locale_coercion("utf-8", "utf-8", coerce_c_locale=setting)

    def test_PYTHONCOERCECLOCALE_set_to_warn(self):
        # PYTHONCOERCECLOCALE=warn enables runtime warnings for legacy locales
        self._check_c_locale_coercion("utf-8", "utf-8",
                                      coerce_c_locale="warn",
                                      expected_warnings=[CLI_COERCION_WARNING])

    def test_PYTHONCOERCECLOCALE_set_to_zero(self):
        # The setting "0" should result in the locale coercion being disabled,
        # and setting LC_ALL=C shouldn't make any difference to the behaviour
        for extra_env in ({}, {"LC_ALL": "C"}):
            self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                          EXPECTED_C_LOCALE_STREAM_ENCODING,
                                          coerce_c_locale="0",
                                          coercion_expected=False,
                                          **extra_env)

    def test_LC_ALL_set_to_C(self):
        # Setting LC_ALL should render the locale coercion ineffective
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale=None,
                                      LC_ALL="C",
                                      coercion_expected=False)
        # And result in a warning about a lack of locale compatibility
        self._check_c_locale_coercion(EXPECTED_C_LOCALE_FS_ENCODING,
                                      EXPECTED_C_LOCALE_STREAM_ENCODING,
                                      coerce_c_locale="warn",
                                      LC_ALL="C",
                                      expected_warnings=[LEGACY_LOCALE_WARNING],
                                      coercion_expected=False)

    def test_PYTHONCOERCECLOCALE_set_to_one(self):
        # skip the test if the LC_CTYPE locale is C or coerced
        saved_locale = locale.setlocale(locale.LC_CTYPE, None)
        self.addCleanup(locale.setlocale, locale.LC_CTYPE, saved_locale)
        try:
            loc = locale.setlocale(locale.LC_CTYPE, "")
        except locale.Error as exc:
            self.skipTest(str(exc))
        if loc == "C":
            self.skipTest("test requires LC_CTYPE locale different than C")
        if loc in TARGET_LOCALES:
            self.skipTest("coerced LC_CTYPE locale: %s" % loc)

        # bpo-35336: PYTHONCOERCECLOCALE=1 must not coerce the LC_CTYPE locale
        # if it's not equal to "C"
        code = 'import locale; print(locale.setlocale(locale.LC_CTYPE, None))'
        child_env = {**os.environ, "PYTHONCOERCECLOCALE": "1"}
        proc = subprocess.run([sys.executable, '-c', code],
                              stdout=subprocess.PIPE,
                              env=child_env,
                              text=True)
        self.assertEqual(proc.stdout.rstrip(), loc)
428
429
def tearDownModule():
    """Call support.reap_children() once the tests in this module are done."""
    support.reap_children()
432
433
# Run the tests through unittest when this file is executed as a script
if __name__ == "__main__":
    unittest.main()