1 # -*- coding: utf-8 -*-
2
3 import unittest
4 from test.support import script_helper, captured_stdout, requires_subprocess, requires_resource
5 from test.support.os_helper import TESTFN, unlink, rmtree
6 from test.support.import_helper import unload
7 import importlib
8 import os
9 import sys
10 import subprocess
11 import tempfile
12
class MiscSourceEncodingTest(unittest.TestCase):
    """Assorted regression tests for decoding Python source text.

    The tests feed byte strings, str objects, helper modules and temporary
    files to compile()/exec()/import/the interpreter and check either the
    decoded result or the SyntaxError that is raised.
    """

    def test_import_encoded_module(self):
        from test.encoded_modules import test_strings
        # Make sure we're actually testing something
        self.assertGreaterEqual(len(test_strings), 1)
        for modname, encoding, teststr in test_strings:
            mod = importlib.import_module('test.encoded_modules.'
                                          'module_' + modname)
            self.assertEqual(teststr, mod.test)

    def test_compilestring(self):
        # see #1882
        # The coding cookie sits on the second line of a bytes source.
        c = compile(b"\n# coding: utf-8\nu = '\xc3\xb3'\n", "dummy", "exec")
        d = {}
        exec(c, d)
        self.assertEqual(d['u'], '\xf3')

    def test_issue2301(self):
        # The source is invalid Python 3 (print statement); the text attached
        # to the SyntaxError must already be decoded from cp932.
        with self.assertRaises(SyntaxError) as cm:
            compile(b"# coding: cp932\nprint '\x94\x4e'", "dummy", "exec")
        self.assertEqual(cm.exception.text.rstrip('\n'), "print '\u5e74'")

    def test_issue4626(self):
        # A str source carrying a latin-1 cookie and a non-ASCII identifier.
        c = compile("# coding=latin-1\n\u00c6 = '\u00c6'", "dummy", "exec")
        d = {}
        exec(c, d)
        self.assertEqual(d['\xc6'], '\xc6')

    def test_issue3297(self):
        # The same non-BMP character written literally and as a \U escape
        # must produce equal strings.
        c = compile("a, b = '\U0001010F', '\\U0001010F'", "dummy", "exec")
        d = {}
        exec(c, d)
        self.assertEqual(d['a'], d['b'])
        self.assertEqual(len(d['a']), len(d['b']))
        self.assertEqual(ascii(d['a']), ascii(d['b']))

    def test_issue7820(self):
        # Ensure that check_bom() restores all bytes in the right order if
        # check_bom() fails in pydebug mode: a buffer starts with the first
        # byte of a valid BOM, but next bytes are different

        # one byte in common with the UTF-16-LE BOM
        self.assertRaises(SyntaxError, eval, b'\xff\x20')

        # one byte in common with the UTF-8 BOM
        self.assertRaises(SyntaxError, eval, b'\xef\x20')

        # two bytes in common with the UTF-8 BOM
        self.assertRaises(SyntaxError, eval, b'\xef\xbb\x20')

    @requires_subprocess()
    def test_20731(self):
        # Run the helper script that lives next to this file and make sure
        # the child interpreter neither fails nor reports a SyntaxError.
        script = os.path.join(os.path.dirname(__file__), 'coding20731.py')
        # The context manager closes the stderr pipe even if communicate()
        # raises.
        with subprocess.Popen([sys.executable, script],
                              stderr=subprocess.PIPE) as sub:
            err = sub.communicate()[1]
        self.assertEqual(sub.returncode, 0)
        self.assertNotIn(b'SyntaxError', err)

    def test_error_message(self):
        # These three sources must compile without error.
        compile(b'# -*- coding: iso-8859-15 -*-\n', 'dummy', 'exec')
        compile(b'\xef\xbb\xbf\n', 'dummy', 'exec')
        compile(b'\xef\xbb\xbf# -*- coding: utf-8 -*-\n', 'dummy', 'exec')
        # An unknown codec name must appear in the error message.
        with self.assertRaisesRegex(SyntaxError, 'fake'):
            compile(b'# -*- coding: fake -*-\n', 'dummy', 'exec')
        # A UTF-8 BOM followed by a non-UTF-8 cookie: the message must
        # mention both the cookie's encoding and the BOM.
        with self.assertRaisesRegex(SyntaxError, 'iso-8859-15'):
            compile(b'\xef\xbb\xbf# -*- coding: iso-8859-15 -*-\n',
                    'dummy', 'exec')
        with self.assertRaisesRegex(SyntaxError, 'BOM'):
            compile(b'\xef\xbb\xbf# -*- coding: iso-8859-15 -*-\n',
                    'dummy', 'exec')
        with self.assertRaisesRegex(SyntaxError, 'fake'):
            compile(b'\xef\xbb\xbf# -*- coding: fake -*-\n', 'dummy', 'exec')
        with self.assertRaisesRegex(SyntaxError, 'BOM'):
            compile(b'\xef\xbb\xbf# -*- coding: fake -*-\n', 'dummy', 'exec')

    def test_bad_coding(self):
        module_name = 'bad_coding'
        self.verify_bad_module(module_name)

    def test_bad_coding2(self):
        module_name = 'bad_coding2'
        self.verify_bad_module(module_name)

    def verify_bad_module(self, module_name):
        # Helper (not a test): `module_name` names a module in the `test`
        # package whose source sits next to this file.  Both importing it
        # and compiling its raw bytes must raise SyntaxError.
        self.assertRaises(SyntaxError, __import__, 'test.' + module_name)

        path = os.path.dirname(__file__)
        filename = os.path.join(path, module_name + '.py')
        with open(filename, "rb") as fp:
            source = fp.read()
        self.assertRaises(SyntaxError, compile, source, filename, 'exec')

    def test_exec_valid_coding(self):
        # exec() of bytes must honour the cp949 cookie.
        d = {}
        exec(b'# coding: cp949\na = "\xaa\xa7"\n', d)
        self.assertEqual(d['a'], '\u3047')

    def test_file_parse(self):
        # issue1134: all encodings outside latin-1 and utf-8 fail on
        # multiline strings and long lines (>512 columns)
        unload(TESTFN)
        filename = TESTFN + ".py"
        sys.path.insert(0, os.curdir)
        try:
            # Opening inside the try block guarantees the cleanup below runs
            # even if creating or writing the file fails.
            with open(filename, "w", encoding="cp1252") as f:
                f.write("# -*- coding: cp1252 -*-\n")
                f.write("'''A short string\n")
                f.write("'''\n")
                f.write("'A very long string %s'\n" % ("X" * 1000))

            importlib.invalidate_caches()
            __import__(TESTFN)
        finally:
            del sys.path[0]
            unlink(filename)
            unlink(filename + "c")
            unlink(filename + "o")
            unload(TESTFN)
            rmtree('__pycache__')

    def test_error_from_string(self):
        # See http://bugs.python.org/issue6289
        source = "# coding: ascii\n\N{SNOWMAN}".encode('utf-8')
        with self.assertRaises(SyntaxError) as c:
            compile(source, "<string>", "exec")
        expected = "'ascii' codec can't decode byte 0xe2 in position 16: " \
                   "ordinal not in range(128)"
        self.assertTrue(c.exception.args[0].startswith(expected),
                        msg=c.exception.args[0])

    def test_file_parse_error_multiline(self):
        # gh96611:
        # Registered before the file is created so it is removed on every
        # exit path; unlink() tolerates a file that is already gone.
        self.addCleanup(unlink, TESTFN)
        with open(TESTFN, "wb") as fd:
            fd.write(b'print("""\n\xb1""")\n')

        retcode, _, stderr = script_helper.assert_python_failure(TESTFN)

        self.assertGreater(retcode, 0)
        self.assertIn(b"Non-UTF-8 code starting with '\\xb1'", stderr)

    def test_tokenizer_fstring_warning_in_first_line(self):
        # The warning must be reported, and the offending source line must
        # show up exactly once in stderr.
        source = "0b1and 2"
        self.addCleanup(unlink, TESTFN)
        # The source is pure ASCII; the explicit encoding only removes the
        # dependency on the locale default.
        with open(TESTFN, "w", encoding="utf-8") as fd:
            fd.write(source)
        retcode, _, stderr = script_helper.assert_python_ok(TESTFN)
        self.assertIn(b"SyntaxWarning: invalid binary litera", stderr)
        self.assertEqual(stderr.count(source.encode()), 1)
173
174
class AbstractSourceEncodingTest:
    """Mixin with encoding-detection test cases shared by several runners.

    Concrete subclasses must provide ``check_script_output(src, expected)``,
    which runs the bytes source *src* and verifies that what it prints
    (ignoring trailing whitespace) equals the bytes *expected*.
    """

    def test_default_coding(self):
        # No cookie, no BOM: the two bytes are read as one UTF-8 character.
        src = (b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xe4'")

    def test_first_coding_line(self):
        # Cookie on line 1: each byte is decoded separately as iso8859-15.
        src = (b'#coding:iso8859-15\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_second_coding_line(self):
        # Cookie on line 2 is honoured as well.
        src = (b'#\n'
               b'#coding:iso8859-15\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_third_coding_line(self):
        # Only first two lines are tested for a magic comment.
        src = (b'#\n'
               b'#\n'
               b'#coding:iso8859-15\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xe4'")

    def test_double_coding_line(self):
        # If the first line matches the second line is ignored.
        src = (b'#coding:iso8859-15\n'
               b'#coding:latin1\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_double_coding_same_line(self):
        # Two cookies on one line: the expected output is the iso8859-15
        # decoding, i.e. the first cookie is the one applied.
        src = (b'#coding:iso8859-15 coding:latin1\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_first_non_utf8_coding_line(self):
        # The cookie line itself contains a byte that is not valid UTF-8.
        src = (b'#coding:iso-8859-15 \xa4\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_second_non_utf8_coding_line(self):
        # Same as above with the cookie on the second line.
        src = (b'\n'
               b'#coding:iso-8859-15 \xa4\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xc3\u20ac'")

    def test_utf8_bom(self):
        # A UTF-8 BOM without a cookie.
        src = (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xe4'")

    def test_utf8_bom_and_utf8_coding_line(self):
        # A UTF-8 BOM together with a matching utf-8 cookie.
        src = (b'\xef\xbb\xbf#coding:utf-8\n'
               b'print(ascii("\xc3\xa4"))\n')
        self.check_script_output(src, br"'\xe4'")

    # The remaining cases embed CR / CRLF sequences in a triple-quoted
    # string; the expected output shows each CR and each CRLF read as a
    # single '\n'.

    def test_crlf(self):
        src = (b'print(ascii("""\r\n"""))\n')
        self.check_script_output(src, br"'\n'")

    def test_crcrlf(self):
        src = (b'print(ascii("""\r\r\n"""))\n')
        self.check_script_output(src, br"'\n\n'")

    def test_crcrcrlf(self):
        src = (b'print(ascii("""\r\r\r\n"""))\n')
        self.check_script_output(src, br"'\n\n\n'")

    def test_crcrcrlf2(self):
        # Same as test_crcrcrlf, with an explicit iso-8859-1 cookie.
        src = (b'#coding:iso-8859-1\n'
               b'print(ascii("""\r\r\r\n"""))\n')
        self.check_script_output(src, br"'\n\n\n'")
248
249
class UTF8ValidatorTest(unittest.TestCase):
    """Check that source files containing invalid UTF-8 are rejected."""

    @unittest.skipIf(not sys.platform.startswith("linux"),
                     "Too slow to run on non-Linux platforms")
    @requires_resource('cpu')
    def test_invalid_utf8(self):
        # This is a port of test_utf8_decode_invalid_sequences in
        # test_unicode.py to exercise the separate utf8 validator in
        # Parser/tokenizer.c used when reading source files.

        # That file is written using low-level C file I/O, so the only way to
        # test it is to write actual files to disk.

        # Each example is put inside a string at the top of the file so
        # it's an otherwise valid Python source file. Put some newlines
        # beforehand so we can assert that the error is reported on the
        # correct line.
        template = b'\n\n\n"%s"\n'

        fn = TESTFN
        self.addCleanup(unlink, fn)

        def check(content):
            # Write `content` inside the template, run the file in a child
            # interpreter and verify the reported error.
            with open(fn, 'wb') as fp:
                fp.write(template % content)
            rc, stdout, stderr = script_helper.assert_python_failure(fn)
            # We want to assert that the python subprocess failed gracefully,
            # not via a signal.
            self.assertGreaterEqual(rc, 1)
            self.assertIn(b"Non-UTF-8 code starting with", stderr)
            self.assertIn(b"on line 4", stderr)

        def byte_range(start, stop):
            # One single-byte bytes object per value in range(start, stop).
            return [bytes([x]) for x in range(start, stop)]

        # continuation bytes in a sequence of 2, 3, or 4 bytes
        continuation_bytes = byte_range(0x80, 0xC0)
        # start bytes of a 2-byte sequence equivalent to code points < 0x7F
        invalid_2B_seq_start_bytes = byte_range(0xC0, 0xC2)
        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
        invalid_4B_seq_start_bytes = byte_range(0xF5, 0xF8)
        # The last range starts at 0xF8 because 0xF5-0xF7 are already covered
        # by invalid_4B_seq_start_bytes; starting at 0xF7 would spawn a
        # redundant subprocess for that byte.
        invalid_start_bytes = (
            continuation_bytes + invalid_2B_seq_start_bytes +
            invalid_4B_seq_start_bytes + byte_range(0xF8, 0x100)
        )

        for byte in invalid_start_bytes:
            check(byte)

        for sb in invalid_2B_seq_start_bytes:
            for cb in continuation_bytes:
                check(sb + cb)

        for sb in invalid_4B_seq_start_bytes:
            for cb1 in continuation_bytes[:3]:
                for cb3 in continuation_bytes[:3]:
                    check(sb + cb1 + b'\x80' + cb3)

        for cb in byte_range(0x80, 0xA0):
            check(b'\xE0' + cb + b'\x80')
            check(b'\xE0' + cb + b'\xBF')
        # surrogates
        for cb in byte_range(0xA0, 0xC0):
            check(b'\xED' + cb + b'\x80')
            check(b'\xED' + cb + b'\xBF')
        for cb in byte_range(0x80, 0x90):
            check(b'\xF0' + cb + b'\x80\x80')
            check(b'\xF0' + cb + b'\xBF\xBF')
        for cb in byte_range(0x90, 0xC0):
            check(b'\xF4' + cb + b'\x80\x80')
            check(b'\xF4' + cb + b'\xBF\xBF')
317
318
class BytesSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
    """Run the shared encoding cases by exec()-ing the bytes in-process."""

    def check_script_output(self, src, expected):
        # Execute the bytes source `src` and compare what it printed
        # (trailing whitespace stripped) with the bytes `expected`.
        with captured_stdout() as stdout:
            # A fresh namespace keeps the executed source from seeing or
            # modifying this method's variables and the module globals.
            exec(src, {})
        # `expected` is bytes, so convert the captured text back to bytes.
        out = stdout.getvalue().encode('latin1')
        self.assertEqual(out.rstrip(), expected)
326
327
class FileSourceEncodingTest(AbstractSourceEncodingTest, unittest.TestCase):
    """Run the shared encoding cases as script files in a child interpreter."""

    def check_script_output(self, src, expected):
        # Write the bytes source `src` to a script in a throw-away
        # directory, run it, and compare its stdout (trailing whitespace
        # stripped) with the bytes `expected`.
        with tempfile.TemporaryDirectory() as tmpd:
            fn = os.path.join(tmpd, 'test.py')
            # Binary mode: the bytes must reach the file unmodified.
            with open(fn, 'wb') as fp:
                fp.write(src)
            res = script_helper.assert_python_ok(fn)
        self.assertEqual(res.out.rstrip(), expected)
337
338
# Run the tests only when this file is executed as a script.
if __name__ == "__main__":
    unittest.main()