1 from test import support
2 from test.support import os_helper
3 from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
4 STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
5 open as tokenize_open, Untokenizer, generate_tokens,
6 NEWLINE, _generate_tokens_from_c_tokenizer, DEDENT)
7 from io import BytesIO, StringIO
8 import unittest
9 from textwrap import dedent
10 from unittest import TestCase, mock
11 from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
12 INVALID_UNDERSCORE_LITERALS)
14 from test.support.script_helper import run_test_script, make_script
15 import os
16 import token
17
18 # Converts a source string into a list of textual representations
19 # of the tokens such as:
20 # ` NAME 'if' (1, 0) (1, 2)`
21 # to make writing tests easier.
22 def stringify_tokens_from_source(token_generator, source_string):
23 result = []
24 num_lines = len(source_string.splitlines())
25 missing_trailing_nl = source_string[-1] not in '\r\n'
26
27 for type, token, start, end, line in token_generator:
28 if type == ENDMARKER:
29 break
30 # Ignore the new line on the last line if the input lacks one
31 if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
32 continue
33 type = tok_name[type]
34 result.append(f" {type:10} {token!r:13} {start} {end}")
35
36 return result
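# Illustrative sketch (not part of the original suite): for the one-line source
# "1 + 1" read through tokenize(), the helper above yields rows along the
# lines of:
#      ENCODING   'utf-8'       (0, 0) (0, 0)
#      NUMBER     '1'           (1, 0) (1, 1)
#      OP         '+'           (1, 2) (1, 3)
#      NUMBER     '1'           (1, 4) (1, 5)
# The implicit trailing NEWLINE is skipped because the input has no trailing
# newline, and ENDMARKER ends the loop.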
37
38 class TokenizeTest(TestCase):
39 # Tests for the tokenize module.
40
41 # The tests can be really simple. Given a small fragment of source
42 # code, print out a table with tokens. The ENDMARKER, ENCODING and
43 # final NEWLINE are omitted for brevity.
44
45 def check_tokenize(self, s, expected):
46 # Format the tokens in s in a table format.
47 # The ENDMARKER and final NEWLINE are omitted.
48 f = BytesIO(s.encode('utf-8'))
49 result = stringify_tokens_from_source(tokenize(f.readline), s)
50 self.assertEqual(result,
51 [" ENCODING 'utf-8' (0, 0) (0, 0)"] +
52 expected.rstrip().splitlines())
53
54 def test_implicit_newline(self):
55 # Make sure that the tokenizer puts in an implicit NEWLINE
56 # when the input lacks a trailing new line.
57 f = BytesIO("x".encode('utf-8'))
58 tokens = list(tokenize(f.readline))
59 self.assertEqual(tokens[-2].type, NEWLINE)
60 self.assertEqual(tokens[-1].type, ENDMARKER)
61
62 def test_basic(self):
63 self.check_tokenize("1 + 1", """\
64 NUMBER '1' (1, 0) (1, 1)
65 OP '+' (1, 2) (1, 3)
66 NUMBER '1' (1, 4) (1, 5)
67 """)
68 self.check_tokenize("if False:\n"
69 " # NL\n"
70 " \n"
71 " True = False # NEWLINE\n", """\
72 NAME 'if' (1, 0) (1, 2)
73 NAME 'False' (1, 3) (1, 8)
74 OP ':' (1, 8) (1, 9)
75 NEWLINE '\\n' (1, 9) (1, 10)
76 COMMENT '# NL' (2, 4) (2, 8)
77 NL '\\n' (2, 8) (2, 9)
78 NL '\\n' (3, 4) (3, 5)
79 INDENT ' ' (4, 0) (4, 4)
80 NAME 'True' (4, 4) (4, 8)
81 OP '=' (4, 9) (4, 10)
82 NAME 'False' (4, 11) (4, 16)
83 COMMENT '# NEWLINE' (4, 17) (4, 26)
84 NEWLINE '\\n' (4, 26) (4, 27)
85 DEDENT '' (5, 0) (5, 0)
86 """)
87 indent_error_file = b"""\
88 def k(x):
89 x += 2
90 x += 5
91 """
92 readline = BytesIO(indent_error_file).readline
93 with self.assertRaisesRegex(IndentationError,
94 "unindent does not match any "
95 "outer indentation level"):
96 for tok in tokenize(readline):
97 pass
98
99 def test_int(self):
100 # Ordinary integers and binary operators
101 self.check_tokenize("0xff <= 255", """\
102 NUMBER '0xff' (1, 0) (1, 4)
103 OP '<=' (1, 5) (1, 7)
104 NUMBER '255' (1, 8) (1, 11)
105 """)
106 self.check_tokenize("0b10 <= 255", """\
107 NUMBER '0b10' (1, 0) (1, 4)
108 OP '<=' (1, 5) (1, 7)
109 NUMBER '255' (1, 8) (1, 11)
110 """)
111 self.check_tokenize("0o123 <= 0O123", """\
112 NUMBER '0o123' (1, 0) (1, 5)
113 OP '<=' (1, 6) (1, 8)
114 NUMBER '0O123' (1, 9) (1, 14)
115 """)
116 self.check_tokenize("1234567 > ~0x15", """\
117 NUMBER '1234567' (1, 0) (1, 7)
118 OP '>' (1, 8) (1, 9)
119 OP '~' (1, 10) (1, 11)
120 NUMBER '0x15' (1, 11) (1, 15)
121 """)
122 self.check_tokenize("2134568 != 1231515", """\
123 NUMBER '2134568' (1, 0) (1, 7)
124 OP '!=' (1, 8) (1, 10)
125 NUMBER '1231515' (1, 11) (1, 18)
126 """)
127 self.check_tokenize("(-124561-1) & 200000000", """\
128 OP '(' (1, 0) (1, 1)
129 OP '-' (1, 1) (1, 2)
130 NUMBER '124561' (1, 2) (1, 8)
131 OP '-' (1, 8) (1, 9)
132 NUMBER '1' (1, 9) (1, 10)
133 OP ')' (1, 10) (1, 11)
134 OP '&' (1, 12) (1, 13)
135 NUMBER '200000000' (1, 14) (1, 23)
136 """)
137 self.check_tokenize("0xdeadbeef != -1", """\
138 NUMBER '0xdeadbeef' (1, 0) (1, 10)
139 OP '!=' (1, 11) (1, 13)
140 OP '-' (1, 14) (1, 15)
141 NUMBER '1' (1, 15) (1, 16)
142 """)
143 self.check_tokenize("0xdeadc0de & 12345", """\
144 NUMBER '0xdeadc0de' (1, 0) (1, 10)
145 OP '&' (1, 11) (1, 12)
146 NUMBER '12345' (1, 13) (1, 18)
147 """)
148 self.check_tokenize("0xFF & 0x15 | 1234", """\
149 NUMBER '0xFF' (1, 0) (1, 4)
150 OP '&' (1, 5) (1, 6)
151 NUMBER '0x15' (1, 7) (1, 11)
152 OP '|' (1, 12) (1, 13)
153 NUMBER '1234' (1, 14) (1, 18)
154 """)
155
156 def test_long(self):
157 # Long integers
158 self.check_tokenize("x = 0", """\
159 NAME 'x' (1, 0) (1, 1)
160 OP '=' (1, 2) (1, 3)
161 NUMBER '0' (1, 4) (1, 5)
162 """)
163 self.check_tokenize("x = 0xfffffffffff", """\
164 NAME 'x' (1, 0) (1, 1)
165 OP '=' (1, 2) (1, 3)
166 NUMBER '0xfffffffffff' (1, 4) (1, 17)
167 """)
168 self.check_tokenize("x = 123141242151251616110", """\
169 NAME 'x' (1, 0) (1, 1)
170 OP '=' (1, 2) (1, 3)
171 NUMBER '123141242151251616110' (1, 4) (1, 25)
172 """)
173 self.check_tokenize("x = -15921590215012591", """\
174 NAME 'x' (1, 0) (1, 1)
175 OP '=' (1, 2) (1, 3)
176 OP '-' (1, 4) (1, 5)
177 NUMBER '15921590215012591' (1, 5) (1, 22)
178 """)
179
180 def test_float(self):
181 # Floating point numbers
182 self.check_tokenize("x = 3.14159", """\
183 NAME 'x' (1, 0) (1, 1)
184 OP '=' (1, 2) (1, 3)
185 NUMBER '3.14159' (1, 4) (1, 11)
186 """)
187 self.check_tokenize("x = 314159.", """\
188 NAME 'x' (1, 0) (1, 1)
189 OP '=' (1, 2) (1, 3)
190 NUMBER '314159.' (1, 4) (1, 11)
191 """)
192 self.check_tokenize("x = .314159", """\
193 NAME 'x' (1, 0) (1, 1)
194 OP '=' (1, 2) (1, 3)
195 NUMBER '.314159' (1, 4) (1, 11)
196 """)
197 self.check_tokenize("x = 3e14159", """\
198 NAME 'x' (1, 0) (1, 1)
199 OP '=' (1, 2) (1, 3)
200 NUMBER '3e14159' (1, 4) (1, 11)
201 """)
202 self.check_tokenize("x = 3E123", """\
203 NAME 'x' (1, 0) (1, 1)
204 OP '=' (1, 2) (1, 3)
205 NUMBER '3E123' (1, 4) (1, 9)
206 """)
207 self.check_tokenize("x+y = 3e-1230", """\
208 NAME 'x' (1, 0) (1, 1)
209 OP '+' (1, 1) (1, 2)
210 NAME 'y' (1, 2) (1, 3)
211 OP '=' (1, 4) (1, 5)
212 NUMBER '3e-1230' (1, 6) (1, 13)
213 """)
214 self.check_tokenize("x = 3.14e159", """\
215 NAME 'x' (1, 0) (1, 1)
216 OP '=' (1, 2) (1, 3)
217 NUMBER '3.14e159' (1, 4) (1, 12)
218 """)
219
220 def test_underscore_literals(self):
221 def number_token(s):
222 f = BytesIO(s.encode('utf-8'))
223 for toktype, token, start, end, line in tokenize(f.readline):
224 if toktype == NUMBER:
225 return token
226 return 'invalid token'
227 for lit in VALID_UNDERSCORE_LITERALS:
228 if '(' in lit:
229 # this won't work with compound complex inputs
230 continue
231 self.assertEqual(number_token(lit), lit)
232 for lit in INVALID_UNDERSCORE_LITERALS:
233 self.assertNotEqual(number_token(lit), lit)
234
235 def test_string(self):
236 # String literals
237 self.check_tokenize("x = ''; y = \"\"", """\
238 NAME 'x' (1, 0) (1, 1)
239 OP '=' (1, 2) (1, 3)
240 STRING "''" (1, 4) (1, 6)
241 OP ';' (1, 6) (1, 7)
242 NAME 'y' (1, 8) (1, 9)
243 OP '=' (1, 10) (1, 11)
244 STRING '""' (1, 12) (1, 14)
245 """)
246 self.check_tokenize("x = '\"'; y = \"'\"", """\
247 NAME 'x' (1, 0) (1, 1)
248 OP '=' (1, 2) (1, 3)
249 STRING '\\'"\\'' (1, 4) (1, 7)
250 OP ';' (1, 7) (1, 8)
251 NAME 'y' (1, 9) (1, 10)
252 OP '=' (1, 11) (1, 12)
253 STRING '"\\'"' (1, 13) (1, 16)
254 """)
255 self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
256 NAME 'x' (1, 0) (1, 1)
257 OP '=' (1, 2) (1, 3)
258 STRING '"doesn\\'t "' (1, 4) (1, 14)
259 NAME 'shrink' (1, 14) (1, 20)
260 STRING '", does it"' (1, 20) (1, 31)
261 """)
262 self.check_tokenize("x = 'abc' + 'ABC'", """\
263 NAME 'x' (1, 0) (1, 1)
264 OP '=' (1, 2) (1, 3)
265 STRING "'abc'" (1, 4) (1, 9)
266 OP '+' (1, 10) (1, 11)
267 STRING "'ABC'" (1, 12) (1, 17)
268 """)
269 self.check_tokenize('y = "ABC" + "ABC"', """\
270 NAME 'y' (1, 0) (1, 1)
271 OP '=' (1, 2) (1, 3)
272 STRING '"ABC"' (1, 4) (1, 9)
273 OP '+' (1, 10) (1, 11)
274 STRING '"ABC"' (1, 12) (1, 17)
275 """)
276 self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
277 NAME 'x' (1, 0) (1, 1)
278 OP '=' (1, 2) (1, 3)
279 STRING "r'abc'" (1, 4) (1, 10)
280 OP '+' (1, 11) (1, 12)
281 STRING "r'ABC'" (1, 13) (1, 19)
282 OP '+' (1, 20) (1, 21)
283 STRING "R'ABC'" (1, 22) (1, 28)
284 OP '+' (1, 29) (1, 30)
285 STRING "R'ABC'" (1, 31) (1, 37)
286 """)
287 self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
288 NAME 'y' (1, 0) (1, 1)
289 OP '=' (1, 2) (1, 3)
290 STRING 'r"abc"' (1, 4) (1, 10)
291 OP '+' (1, 11) (1, 12)
292 STRING 'r"ABC"' (1, 13) (1, 19)
293 OP '+' (1, 20) (1, 21)
294 STRING 'R"ABC"' (1, 22) (1, 28)
295 OP '+' (1, 29) (1, 30)
296 STRING 'R"ABC"' (1, 31) (1, 37)
297 """)
298
299 self.check_tokenize("u'abc' + U'abc'", """\
300 STRING "u'abc'" (1, 0) (1, 6)
301 OP '+' (1, 7) (1, 8)
302 STRING "U'abc'" (1, 9) (1, 15)
303 """)
304 self.check_tokenize('u"abc" + U"abc"', """\
305 STRING 'u"abc"' (1, 0) (1, 6)
306 OP '+' (1, 7) (1, 8)
307 STRING 'U"abc"' (1, 9) (1, 15)
308 """)
309
310 self.check_tokenize("b'abc' + B'abc'", """\
311 STRING "b'abc'" (1, 0) (1, 6)
312 OP '+' (1, 7) (1, 8)
313 STRING "B'abc'" (1, 9) (1, 15)
314 """)
315 self.check_tokenize('b"abc" + B"abc"', """\
316 STRING 'b"abc"' (1, 0) (1, 6)
317 OP '+' (1, 7) (1, 8)
318 STRING 'B"abc"' (1, 9) (1, 15)
319 """)
320 self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
321 STRING "br'abc'" (1, 0) (1, 7)
322 OP '+' (1, 8) (1, 9)
323 STRING "bR'abc'" (1, 10) (1, 17)
324 OP '+' (1, 18) (1, 19)
325 STRING "Br'abc'" (1, 20) (1, 27)
326 OP '+' (1, 28) (1, 29)
327 STRING "BR'abc'" (1, 30) (1, 37)
328 """)
329 self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
330 STRING 'br"abc"' (1, 0) (1, 7)
331 OP '+' (1, 8) (1, 9)
332 STRING 'bR"abc"' (1, 10) (1, 17)
333 OP '+' (1, 18) (1, 19)
334 STRING 'Br"abc"' (1, 20) (1, 27)
335 OP '+' (1, 28) (1, 29)
336 STRING 'BR"abc"' (1, 30) (1, 37)
337 """)
338 self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
339 STRING "rb'abc'" (1, 0) (1, 7)
340 OP '+' (1, 8) (1, 9)
341 STRING "rB'abc'" (1, 10) (1, 17)
342 OP '+' (1, 18) (1, 19)
343 STRING "Rb'abc'" (1, 20) (1, 27)
344 OP '+' (1, 28) (1, 29)
345 STRING "RB'abc'" (1, 30) (1, 37)
346 """)
347 self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
348 STRING 'rb"abc"' (1, 0) (1, 7)
349 OP '+' (1, 8) (1, 9)
350 STRING 'rB"abc"' (1, 10) (1, 17)
351 OP '+' (1, 18) (1, 19)
352 STRING 'Rb"abc"' (1, 20) (1, 27)
353 OP '+' (1, 28) (1, 29)
354 STRING 'RB"abc"' (1, 30) (1, 37)
355 """)
356 # Check 0, 1, and 2 character string prefixes.
357 self.check_tokenize(r'"a\
358 de\
359 fg"', """\
360 STRING '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
361 """)
362 self.check_tokenize(r'u"a\
363 de"', """\
364 STRING 'u"a\\\\\\nde"\' (1, 0) (2, 3)
365 """)
366 self.check_tokenize(r'rb"a\
367 d"', """\
368 STRING 'rb"a\\\\\\nd"\' (1, 0) (2, 2)
369 """)
370 self.check_tokenize(r'"""a\
371 b"""', """\
372 STRING '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
373 """)
374 self.check_tokenize(r'u"""a\
375 b"""', """\
376 STRING 'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
377 """)
378 self.check_tokenize(r'rb"""a\
379 b\
380 c"""', """\
381 STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
382 """)
383 self.check_tokenize('f"abc"', """\
384 STRING 'f"abc"' (1, 0) (1, 6)
385 """)
386 self.check_tokenize('fR"a{b}c"', """\
387 STRING 'fR"a{b}c"' (1, 0) (1, 9)
388 """)
389 self.check_tokenize('f"""abc"""', """\
390 STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10)
391 """)
392 self.check_tokenize(r'f"abc\
393 def"', """\
394 STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4)
395 """)
396 self.check_tokenize(r'Rf"abc\
397 def"', """\
398 STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
399 """)
400
401 def test_function(self):
402 self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
403 NAME 'def' (1, 0) (1, 3)
404 NAME 'd22' (1, 4) (1, 7)
405 OP '(' (1, 7) (1, 8)
406 NAME 'a' (1, 8) (1, 9)
407 OP ',' (1, 9) (1, 10)
408 NAME 'b' (1, 11) (1, 12)
409 OP ',' (1, 12) (1, 13)
410 NAME 'c' (1, 14) (1, 15)
411 OP '=' (1, 15) (1, 16)
412 NUMBER '2' (1, 16) (1, 17)
413 OP ',' (1, 17) (1, 18)
414 NAME 'd' (1, 19) (1, 20)
415 OP '=' (1, 20) (1, 21)
416 NUMBER '2' (1, 21) (1, 22)
417 OP ',' (1, 22) (1, 23)
418 OP '*' (1, 24) (1, 25)
419 NAME 'k' (1, 25) (1, 26)
420 OP ')' (1, 26) (1, 27)
421 OP ':' (1, 27) (1, 28)
422 NAME 'pass' (1, 29) (1, 33)
423 """)
424 self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
425 NAME 'def' (1, 0) (1, 3)
426 NAME 'd01v_' (1, 4) (1, 9)
427 OP '(' (1, 9) (1, 10)
428 NAME 'a' (1, 10) (1, 11)
429 OP '=' (1, 11) (1, 12)
430 NUMBER '1' (1, 12) (1, 13)
431 OP ',' (1, 13) (1, 14)
432 OP '*' (1, 15) (1, 16)
433 NAME 'k' (1, 16) (1, 17)
434 OP ',' (1, 17) (1, 18)
435 OP '**' (1, 19) (1, 21)
436 NAME 'w' (1, 21) (1, 22)
437 OP ')' (1, 22) (1, 23)
438 OP ':' (1, 23) (1, 24)
439 NAME 'pass' (1, 25) (1, 29)
440 """)
441 self.check_tokenize("def d23(a: str, b: int=3) -> int: pass", """\
442 NAME 'def' (1, 0) (1, 3)
443 NAME 'd23' (1, 4) (1, 7)
444 OP '(' (1, 7) (1, 8)
445 NAME 'a' (1, 8) (1, 9)
446 OP ':' (1, 9) (1, 10)
447 NAME 'str' (1, 11) (1, 14)
448 OP ',' (1, 14) (1, 15)
449 NAME 'b' (1, 16) (1, 17)
450 OP ':' (1, 17) (1, 18)
451 NAME 'int' (1, 19) (1, 22)
452 OP '=' (1, 22) (1, 23)
453 NUMBER '3' (1, 23) (1, 24)
454 OP ')' (1, 24) (1, 25)
455 OP '->' (1, 26) (1, 28)
456 NAME 'int' (1, 29) (1, 32)
457 OP ':' (1, 32) (1, 33)
458 NAME 'pass' (1, 34) (1, 38)
459 """)
460
461 def test_comparison(self):
462 # Comparison
463 self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
464 "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
465 NAME 'if' (1, 0) (1, 2)
466 NUMBER '1' (1, 3) (1, 4)
467 OP '<' (1, 5) (1, 6)
468 NUMBER '1' (1, 7) (1, 8)
469 OP '>' (1, 9) (1, 10)
470 NUMBER '1' (1, 11) (1, 12)
471 OP '==' (1, 13) (1, 15)
472 NUMBER '1' (1, 16) (1, 17)
473 OP '>=' (1, 18) (1, 20)
474 NUMBER '5' (1, 21) (1, 22)
475 OP '<=' (1, 23) (1, 25)
476 NUMBER '0x15' (1, 26) (1, 30)
477 OP '<=' (1, 31) (1, 33)
478 NUMBER '0x12' (1, 34) (1, 38)
479 OP '!=' (1, 39) (1, 41)
480 NUMBER '1' (1, 42) (1, 43)
481 NAME 'and' (1, 44) (1, 47)
482 NUMBER '5' (1, 48) (1, 49)
483 NAME 'in' (1, 50) (1, 52)
484 NUMBER '1' (1, 53) (1, 54)
485 NAME 'not' (1, 55) (1, 58)
486 NAME 'in' (1, 59) (1, 61)
487 NUMBER '1' (1, 62) (1, 63)
488 NAME 'is' (1, 64) (1, 66)
489 NUMBER '1' (1, 67) (1, 68)
490 NAME 'or' (1, 69) (1, 71)
491 NUMBER '5' (1, 72) (1, 73)
492 NAME 'is' (1, 74) (1, 76)
493 NAME 'not' (1, 77) (1, 80)
494 NUMBER '1' (1, 81) (1, 82)
495 OP ':' (1, 82) (1, 83)
496 NAME 'pass' (1, 84) (1, 88)
497 """)
498
499 def test_shift(self):
500 # Shift
501 self.check_tokenize("x = 1 << 1 >> 5", """\
502 NAME 'x' (1, 0) (1, 1)
503 OP '=' (1, 2) (1, 3)
504 NUMBER '1' (1, 4) (1, 5)
505 OP '<<' (1, 6) (1, 8)
506 NUMBER '1' (1, 9) (1, 10)
507 OP '>>' (1, 11) (1, 13)
508 NUMBER '5' (1, 14) (1, 15)
509 """)
510
511 def test_additive(self):
512 # Additive
513 self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
514 NAME 'x' (1, 0) (1, 1)
515 OP '=' (1, 2) (1, 3)
516 NUMBER '1' (1, 4) (1, 5)
517 OP '-' (1, 6) (1, 7)
518 NAME 'y' (1, 8) (1, 9)
519 OP '+' (1, 10) (1, 11)
520 NUMBER '15' (1, 12) (1, 14)
521 OP '-' (1, 15) (1, 16)
522 NUMBER '1' (1, 17) (1, 18)
523 OP '+' (1, 19) (1, 20)
524 NUMBER '0x124' (1, 21) (1, 26)
525 OP '+' (1, 27) (1, 28)
526 NAME 'z' (1, 29) (1, 30)
527 OP '+' (1, 31) (1, 32)
528 NAME 'a' (1, 33) (1, 34)
529 OP '[' (1, 34) (1, 35)
530 NUMBER '5' (1, 35) (1, 36)
531 OP ']' (1, 36) (1, 37)
532 """)
533
534 def test_multiplicative(self):
535 # Multiplicative
536 self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
537 NAME 'x' (1, 0) (1, 1)
538 OP '=' (1, 2) (1, 3)
539 NUMBER '1' (1, 4) (1, 5)
540 OP '//' (1, 5) (1, 7)
541 NUMBER '1' (1, 7) (1, 8)
542 OP '*' (1, 8) (1, 9)
543 NUMBER '1' (1, 9) (1, 10)
544 OP '/' (1, 10) (1, 11)
545 NUMBER '5' (1, 11) (1, 12)
546 OP '*' (1, 12) (1, 13)
547 NUMBER '12' (1, 13) (1, 15)
548 OP '%' (1, 15) (1, 16)
549 NUMBER '0x12' (1, 16) (1, 20)
550 OP '@' (1, 20) (1, 21)
551 NUMBER '42' (1, 21) (1, 23)
552 """)
553
554 def test_unary(self):
555 # Unary
556 self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
557 OP '~' (1, 0) (1, 1)
558 NUMBER '1' (1, 1) (1, 2)
559 OP '^' (1, 3) (1, 4)
560 NUMBER '1' (1, 5) (1, 6)
561 OP '&' (1, 7) (1, 8)
562 NUMBER '1' (1, 9) (1, 10)
563 OP '|' (1, 11) (1, 12)
564 NUMBER '1' (1, 12) (1, 13)
565 OP '^' (1, 14) (1, 15)
566 OP '-' (1, 16) (1, 17)
567 NUMBER '1' (1, 17) (1, 18)
568 """)
569 self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
570 OP '-' (1, 0) (1, 1)
571 NUMBER '1' (1, 1) (1, 2)
572 OP '*' (1, 2) (1, 3)
573 NUMBER '1' (1, 3) (1, 4)
574 OP '/' (1, 4) (1, 5)
575 NUMBER '1' (1, 5) (1, 6)
576 OP '+' (1, 6) (1, 7)
577 NUMBER '1' (1, 7) (1, 8)
578 OP '*' (1, 8) (1, 9)
579 NUMBER '1' (1, 9) (1, 10)
580 OP '//' (1, 10) (1, 12)
581 NUMBER '1' (1, 12) (1, 13)
582 OP '-' (1, 14) (1, 15)
583 OP '-' (1, 16) (1, 17)
584 OP '-' (1, 17) (1, 18)
585 OP '-' (1, 18) (1, 19)
586 NUMBER '1' (1, 19) (1, 20)
587 OP '**' (1, 20) (1, 22)
588 NUMBER '1' (1, 22) (1, 23)
589 """)
590
591 def test_selector(self):
592 # Selector
593 self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
594 NAME 'import' (1, 0) (1, 6)
595 NAME 'sys' (1, 7) (1, 10)
596 OP ',' (1, 10) (1, 11)
597 NAME 'time' (1, 12) (1, 16)
598 NEWLINE '\\n' (1, 16) (1, 17)
599 NAME 'x' (2, 0) (2, 1)
600 OP '=' (2, 2) (2, 3)
601 NAME 'sys' (2, 4) (2, 7)
602 OP '.' (2, 7) (2, 8)
603 NAME 'modules' (2, 8) (2, 15)
604 OP '[' (2, 15) (2, 16)
605 STRING "'time'" (2, 16) (2, 22)
606 OP ']' (2, 22) (2, 23)
607 OP '.' (2, 23) (2, 24)
608 NAME 'time' (2, 24) (2, 28)
609 OP '(' (2, 28) (2, 29)
610 OP ')' (2, 29) (2, 30)
611 """)
612
613 def test_method(self):
614 # Methods
615 self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
616 OP '@' (1, 0) (1, 1)
617 NAME 'staticmethod' (1, 1) (1, 13)
618 NEWLINE '\\n' (1, 13) (1, 14)
619 NAME 'def' (2, 0) (2, 3)
620 NAME 'foo' (2, 4) (2, 7)
621 OP '(' (2, 7) (2, 8)
622 NAME 'x' (2, 8) (2, 9)
623 OP ',' (2, 9) (2, 10)
624 NAME 'y' (2, 10) (2, 11)
625 OP ')' (2, 11) (2, 12)
626 OP ':' (2, 12) (2, 13)
627 NAME 'pass' (2, 14) (2, 18)
628 """)
629
630 def test_tabs(self):
631 # Evil tabs
632 self.check_tokenize("def f():\n"
633 "\tif x\n"
634 " \tpass", """\
635 NAME 'def' (1, 0) (1, 3)
636 NAME 'f' (1, 4) (1, 5)
637 OP '(' (1, 5) (1, 6)
638 OP ')' (1, 6) (1, 7)
639 OP ':' (1, 7) (1, 8)
640 NEWLINE '\\n' (1, 8) (1, 9)
641 INDENT '\\t' (2, 0) (2, 1)
642 NAME 'if' (2, 1) (2, 3)
643 NAME 'x' (2, 4) (2, 5)
644 NEWLINE '\\n' (2, 5) (2, 6)
645 INDENT ' \\t' (3, 0) (3, 9)
646 NAME 'pass' (3, 9) (3, 13)
647 DEDENT '' (4, 0) (4, 0)
648 DEDENT '' (4, 0) (4, 0)
649 """)
650
651 def test_non_ascii_identifiers(self):
652 # Non-ascii identifiers
653 self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
654 NAME 'Örter' (1, 0) (1, 5)
655 OP '=' (1, 6) (1, 7)
656 STRING "'places'" (1, 8) (1, 16)
657 NEWLINE '\\n' (1, 16) (1, 17)
658 NAME 'grün' (2, 0) (2, 4)
659 OP '=' (2, 5) (2, 6)
660 STRING "'green'" (2, 7) (2, 14)
661 """)
662
663 def test_unicode(self):
664 # Legacy unicode literals:
665 self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
666 NAME 'Örter' (1, 0) (1, 5)
667 OP '=' (1, 6) (1, 7)
668 STRING "u'places'" (1, 8) (1, 17)
669 NEWLINE '\\n' (1, 17) (1, 18)
670 NAME 'grün' (2, 0) (2, 4)
671 OP '=' (2, 5) (2, 6)
672 STRING "U'green'" (2, 7) (2, 15)
673 """)
674
675 def test_async(self):
676 # Async/await extension:
677 self.check_tokenize("async = 1", """\
678 NAME 'async' (1, 0) (1, 5)
679 OP '=' (1, 6) (1, 7)
680 NUMBER '1' (1, 8) (1, 9)
681 """)
682
683 self.check_tokenize("a = (async = 1)", """\
684 NAME 'a' (1, 0) (1, 1)
685 OP '=' (1, 2) (1, 3)
686 OP '(' (1, 4) (1, 5)
687 NAME 'async' (1, 5) (1, 10)
688 OP '=' (1, 11) (1, 12)
689 NUMBER '1' (1, 13) (1, 14)
690 OP ')' (1, 14) (1, 15)
691 """)
692
693 self.check_tokenize("async()", """\
694 NAME 'async' (1, 0) (1, 5)
695 OP '(' (1, 5) (1, 6)
696 OP ')' (1, 6) (1, 7)
697 """)
698
699 self.check_tokenize("class async(Bar):pass", """\
700 NAME 'class' (1, 0) (1, 5)
701 NAME 'async' (1, 6) (1, 11)
702 OP '(' (1, 11) (1, 12)
703 NAME 'Bar' (1, 12) (1, 15)
704 OP ')' (1, 15) (1, 16)
705 OP ':' (1, 16) (1, 17)
706 NAME 'pass' (1, 17) (1, 21)
707 """)
708
709 self.check_tokenize("class async:pass", """\
710 NAME 'class' (1, 0) (1, 5)
711 NAME 'async' (1, 6) (1, 11)
712 OP ':' (1, 11) (1, 12)
713 NAME 'pass' (1, 12) (1, 16)
714 """)
715
716 self.check_tokenize("await = 1", """\
717 NAME 'await' (1, 0) (1, 5)
718 OP '=' (1, 6) (1, 7)
719 NUMBER '1' (1, 8) (1, 9)
720 """)
721
722 self.check_tokenize("foo.async", """\
723 NAME 'foo' (1, 0) (1, 3)
724 OP '.' (1, 3) (1, 4)
725 NAME 'async' (1, 4) (1, 9)
726 """)
727
728 self.check_tokenize("async for a in b: pass", """\
729 NAME 'async' (1, 0) (1, 5)
730 NAME 'for' (1, 6) (1, 9)
731 NAME 'a' (1, 10) (1, 11)
732 NAME 'in' (1, 12) (1, 14)
733 NAME 'b' (1, 15) (1, 16)
734 OP ':' (1, 16) (1, 17)
735 NAME 'pass' (1, 18) (1, 22)
736 """)
737
738 self.check_tokenize("async with a as b: pass", """\
739 NAME 'async' (1, 0) (1, 5)
740 NAME 'with' (1, 6) (1, 10)
741 NAME 'a' (1, 11) (1, 12)
742 NAME 'as' (1, 13) (1, 15)
743 NAME 'b' (1, 16) (1, 17)
744 OP ':' (1, 17) (1, 18)
745 NAME 'pass' (1, 19) (1, 23)
746 """)
747
748 self.check_tokenize("async.foo", """\
749 NAME 'async' (1, 0) (1, 5)
750 OP '.' (1, 5) (1, 6)
751 NAME 'foo' (1, 6) (1, 9)
752 """)
753
754 self.check_tokenize("async", """\
755 NAME 'async' (1, 0) (1, 5)
756 """)
757
758 self.check_tokenize("async\n#comment\nawait", """\
759 NAME 'async' (1, 0) (1, 5)
760 NEWLINE '\\n' (1, 5) (1, 6)
761 COMMENT '#comment' (2, 0) (2, 8)
762 NL '\\n' (2, 8) (2, 9)
763 NAME 'await' (3, 0) (3, 5)
764 """)
765
766 self.check_tokenize("async\n...\nawait", """\
767 NAME 'async' (1, 0) (1, 5)
768 NEWLINE '\\n' (1, 5) (1, 6)
769 OP '...' (2, 0) (2, 3)
770 NEWLINE '\\n' (2, 3) (2, 4)
771 NAME 'await' (3, 0) (3, 5)
772 """)
773
774 self.check_tokenize("async\nawait", """\
775 NAME 'async' (1, 0) (1, 5)
776 NEWLINE '\\n' (1, 5) (1, 6)
777 NAME 'await' (2, 0) (2, 5)
778 """)
779
780 self.check_tokenize("foo.async + 1", """\
781 NAME 'foo' (1, 0) (1, 3)
782 OP '.' (1, 3) (1, 4)
783 NAME 'async' (1, 4) (1, 9)
784 OP '+' (1, 10) (1, 11)
785 NUMBER '1' (1, 12) (1, 13)
786 """)
787
788 self.check_tokenize("async def foo(): pass", """\
789 NAME 'async' (1, 0) (1, 5)
790 NAME 'def' (1, 6) (1, 9)
791 NAME 'foo' (1, 10) (1, 13)
792 OP '(' (1, 13) (1, 14)
793 OP ')' (1, 14) (1, 15)
794 OP ':' (1, 15) (1, 16)
795 NAME 'pass' (1, 17) (1, 21)
796 """)
797
798 self.check_tokenize('''\
799 async def foo():
800 def foo(await):
801 await = 1
802 if 1:
803 await
804 async += 1
805 ''', """\
806 NAME 'async' (1, 0) (1, 5)
807 NAME 'def' (1, 6) (1, 9)
808 NAME 'foo' (1, 10) (1, 13)
809 OP '(' (1, 13) (1, 14)
810 OP ')' (1, 14) (1, 15)
811 OP ':' (1, 15) (1, 16)
812 NEWLINE '\\n' (1, 16) (1, 17)
813 INDENT ' ' (2, 0) (2, 2)
814 NAME 'def' (2, 2) (2, 5)
815 NAME 'foo' (2, 6) (2, 9)
816 OP '(' (2, 9) (2, 10)
817 NAME 'await' (2, 10) (2, 15)
818 OP ')' (2, 15) (2, 16)
819 OP ':' (2, 16) (2, 17)
820 NEWLINE '\\n' (2, 17) (2, 18)
821 INDENT ' ' (3, 0) (3, 4)
822 NAME 'await' (3, 4) (3, 9)
823 OP '=' (3, 10) (3, 11)
824 NUMBER '1' (3, 12) (3, 13)
825 NEWLINE '\\n' (3, 13) (3, 14)
826 DEDENT '' (4, 2) (4, 2)
827 NAME 'if' (4, 2) (4, 4)
828 NUMBER '1' (4, 5) (4, 6)
829 OP ':' (4, 6) (4, 7)
830 NEWLINE '\\n' (4, 7) (4, 8)
831 INDENT ' ' (5, 0) (5, 4)
832 NAME 'await' (5, 4) (5, 9)
833 NEWLINE '\\n' (5, 9) (5, 10)
834 DEDENT '' (6, 0) (6, 0)
835 DEDENT '' (6, 0) (6, 0)
836 NAME 'async' (6, 0) (6, 5)
837 OP '+=' (6, 6) (6, 8)
838 NUMBER '1' (6, 9) (6, 10)
839 NEWLINE '\\n' (6, 10) (6, 11)
840 """)
841
842 self.check_tokenize('''\
843 async def foo():
844 async for i in 1: pass''', """\
845 NAME 'async' (1, 0) (1, 5)
846 NAME 'def' (1, 6) (1, 9)
847 NAME 'foo' (1, 10) (1, 13)
848 OP '(' (1, 13) (1, 14)
849 OP ')' (1, 14) (1, 15)
850 OP ':' (1, 15) (1, 16)
851 NEWLINE '\\n' (1, 16) (1, 17)
852 INDENT ' ' (2, 0) (2, 2)
853 NAME 'async' (2, 2) (2, 7)
854 NAME 'for' (2, 8) (2, 11)
855 NAME 'i' (2, 12) (2, 13)
856 NAME 'in' (2, 14) (2, 16)
857 NUMBER '1' (2, 17) (2, 18)
858 OP ':' (2, 18) (2, 19)
859 NAME 'pass' (2, 20) (2, 24)
860 DEDENT '' (3, 0) (3, 0)
861 """)
862
863 self.check_tokenize('''async def foo(async): await''', """\
864 NAME 'async' (1, 0) (1, 5)
865 NAME 'def' (1, 6) (1, 9)
866 NAME 'foo' (1, 10) (1, 13)
867 OP '(' (1, 13) (1, 14)
868 NAME 'async' (1, 14) (1, 19)
869 OP ')' (1, 19) (1, 20)
870 OP ':' (1, 20) (1, 21)
871 NAME 'await' (1, 22) (1, 27)
872 """)
873
874 self.check_tokenize('''\
875 def f():
876
877 def baz(): pass
878 async def bar(): pass
879
880 await = 2''', """\
881 NAME 'def' (1, 0) (1, 3)
882 NAME 'f' (1, 4) (1, 5)
883 OP '(' (1, 5) (1, 6)
884 OP ')' (1, 6) (1, 7)
885 OP ':' (1, 7) (1, 8)
886 NEWLINE '\\n' (1, 8) (1, 9)
887 NL '\\n' (2, 0) (2, 1)
888 INDENT ' ' (3, 0) (3, 2)
889 NAME 'def' (3, 2) (3, 5)
890 NAME 'baz' (3, 6) (3, 9)
891 OP '(' (3, 9) (3, 10)
892 OP ')' (3, 10) (3, 11)
893 OP ':' (3, 11) (3, 12)
894 NAME 'pass' (3, 13) (3, 17)
895 NEWLINE '\\n' (3, 17) (3, 18)
896 NAME 'async' (4, 2) (4, 7)
897 NAME 'def' (4, 8) (4, 11)
898 NAME 'bar' (4, 12) (4, 15)
899 OP '(' (4, 15) (4, 16)
900 OP ')' (4, 16) (4, 17)
901 OP ':' (4, 17) (4, 18)
902 NAME 'pass' (4, 19) (4, 23)
903 NEWLINE '\\n' (4, 23) (4, 24)
904 NL '\\n' (5, 0) (5, 1)
905 NAME 'await' (6, 2) (6, 7)
906 OP '=' (6, 8) (6, 9)
907 NUMBER '2' (6, 10) (6, 11)
908 DEDENT '' (7, 0) (7, 0)
909 """)
910
911 self.check_tokenize('''\
912 async def f():
913
914 def baz(): pass
915 async def bar(): pass
916
917 await = 2''', """\
918 NAME 'async' (1, 0) (1, 5)
919 NAME 'def' (1, 6) (1, 9)
920 NAME 'f' (1, 10) (1, 11)
921 OP '(' (1, 11) (1, 12)
922 OP ')' (1, 12) (1, 13)
923 OP ':' (1, 13) (1, 14)
924 NEWLINE '\\n' (1, 14) (1, 15)
925 NL '\\n' (2, 0) (2, 1)
926 INDENT ' ' (3, 0) (3, 2)
927 NAME 'def' (3, 2) (3, 5)
928 NAME 'baz' (3, 6) (3, 9)
929 OP '(' (3, 9) (3, 10)
930 OP ')' (3, 10) (3, 11)
931 OP ':' (3, 11) (3, 12)
932 NAME 'pass' (3, 13) (3, 17)
933 NEWLINE '\\n' (3, 17) (3, 18)
934 NAME 'async' (4, 2) (4, 7)
935 NAME 'def' (4, 8) (4, 11)
936 NAME 'bar' (4, 12) (4, 15)
937 OP '(' (4, 15) (4, 16)
938 OP ')' (4, 16) (4, 17)
939 OP ':' (4, 17) (4, 18)
940 NAME 'pass' (4, 19) (4, 23)
941 NEWLINE '\\n' (4, 23) (4, 24)
942 NL '\\n' (5, 0) (5, 1)
943 NAME 'await' (6, 2) (6, 7)
944 OP '=' (6, 8) (6, 9)
945 NUMBER '2' (6, 10) (6, 11)
946 DEDENT '' (7, 0) (7, 0)
947 """)
948
949 class GenerateTokensTest(TokenizeTest):
950 def check_tokenize(self, s, expected):
951 # Format the tokens in s in a table format.
952 # The ENDMARKER and final NEWLINE are omitted.
953 f = StringIO(s)
954 result = stringify_tokens_from_source(generate_tokens(f.readline), s)
955 self.assertEqual(result, expected.rstrip().splitlines())
956
957
958 def decistmt(s):
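"""Substitute Decimal('...') constructor calls for float literals (NUMBER
tokens containing a '.') in the statement string s, using tokenize() and
untokenize(); this is the example from the tokenize docs, exercised by
TestMisc.test_decistmt below.  Because only (type, string) 2-tuples are fed
back to untokenize(), spacing in the output is approximate, e.g.
"Decimal ('3.14')".
"""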
959 result = []
960 g = tokenize(BytesIO(s.encode('utf-8')).readline) # tokenize the string
961 for toknum, tokval, _, _, _ in g:
962 if toknum == NUMBER and '.' in tokval: # replace NUMBER tokens
963 result.extend([
964 (NAME, 'Decimal'),
965 (OP, '('),
966 (STRING, repr(tokval)),
967 (OP, ')')
968 ])
969 else:
970 result.append((toknum, tokval))
971 return untokenize(result).decode('utf-8')
972
973 class TestMisc(TestCase):
974
975 def test_decistmt(self):
976 # Substitute Decimals for floats in a string of statements.
977 # This is an example from the docs.
978
979 from decimal import Decimal
980 s = '+21.3e-5*-.1234/81.7'
981 self.assertEqual(decistmt(s),
982 "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")
983
984 # The format of the exponent is inherited from the platform C library.
985 # Known cases are "e-007" (Windows) and "e-07" (not Windows). Since
986 # we're only showing 11 digits, and the 12th isn't close to 5, the
987 # rest of the output should be platform-independent.
988 self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')
989
990 # Output from calculations with Decimal should be identical across all
991 # platforms.
992 self.assertEqual(eval(decistmt(s)),
993 Decimal('-3.217160342717258261933904529E-7'))
994
995
996 class TestTokenizerAdheresToPep0263(TestCase):
997 """
998 Test that the tokenizer adheres to the coding behaviour stipulated in PEP 0263.
999 """
1000
1001 def _testFile(self, filename):
1002 path = os.path.join(os.path.dirname(__file__), 'tokenizedata', filename)
1003 TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
1004
1005 def test_utf8_coding_cookie_and_no_utf8_bom(self):
1006 f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
1007 self._testFile(f)
1008
1009 def test_latin1_coding_cookie_and_utf8_bom(self):
1010 """
1011 As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
1012 allowed encoding for the comment is 'utf-8'. The text file used in
1013 this test starts with a BOM signature, but specifies latin1 as the
1014 coding, so verify that a SyntaxError is raised, which matches the
1015 behaviour of the interpreter when it encounters a similar condition.
1016 """
1017 f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
1018 self.assertRaises(SyntaxError, self._testFile, f)
1019
1020 def test_no_coding_cookie_and_utf8_bom(self):
1021 f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
1022 self._testFile(f)
1023
1024 def test_utf8_coding_cookie_and_utf8_bom(self):
1025 f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
1026 self._testFile(f)
1027
1028 def test_bad_coding_cookie(self):
1029 self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
1030 self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')
1031
1032
1033 class Test_Tokenize(TestCase):
1034
1035 def test__tokenize_decodes_with_specified_encoding(self):
1036 literal = '"ЉЊЈЁЂ"'
1037 line = literal.encode('utf-8')
1038 first = False
1039 def readline():
1040 nonlocal first
1041 if not first:
1042 first = True
1043 return line
1044 else:
1045 return b''
1046
1047 # skip the initial encoding token and the end tokens
1048 tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
1049 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
1050 self.assertEqual(tokens, expected_tokens,
1051 "bytes not decoded with encoding")
1052
1053 def test__tokenize_does_not_decode_with_encoding_none(self):
1054 literal = '"ЉЊЈЁЂ"'
1055 first = False
1056 def readline():
1057 nonlocal first
1058 if not first:
1059 first = True
1060 return literal
1061 else:
1062 return b''
1063
1064 # skip the end tokens
1065 tokens = list(_tokenize(readline, encoding=None))[:-2]
1066 expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
1067 self.assertEqual(tokens, expected_tokens,
1068 "string not tokenized when encoding is None")
1069
1070
1071 class TestDetectEncoding(TestCase):
1072
1073 def get_readline(self, lines):
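"""Return a readline-like callable that yields the byte strings in lines
one at a time and raises StopIteration once they are exhausted, mimicking
a file object's readline for detect_encoding()."""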
1074 index = 0
1075 def readline():
1076 nonlocal index
1077 if index == len(lines):
1078 raise StopIteration
1079 line = lines[index]
1080 index += 1
1081 return line
1082 return readline
1083
1084 def test_no_bom_no_encoding_cookie(self):
1085 lines = (
1086 b'# something\n',
1087 b'print(something)\n',
1088 b'do_something(else)\n'
1089 )
1090 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1091 self.assertEqual(encoding, 'utf-8')
1092 self.assertEqual(consumed_lines, list(lines[:2]))
1093
1094 def test_bom_no_cookie(self):
1095 lines = (
1096 b'\xef\xbb\xbf# something\n',
1097 b'print(something)\n',
1098 b'do_something(else)\n'
1099 )
1100 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1101 self.assertEqual(encoding, 'utf-8-sig')
1102 self.assertEqual(consumed_lines,
1103 [b'# something\n', b'print(something)\n'])
1104
1105 def test_cookie_first_line_no_bom(self):
1106 lines = (
1107 b'# -*- coding: latin-1 -*-\n',
1108 b'print(something)\n',
1109 b'do_something(else)\n'
1110 )
1111 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1112 self.assertEqual(encoding, 'iso-8859-1')
1113 self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
1114
1115 def test_matched_bom_and_cookie_first_line(self):
1116 lines = (
1117 b'\xef\xbb\xbf# coding=utf-8\n',
1118 b'print(something)\n',
1119 b'do_something(else)\n'
1120 )
1121 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1122 self.assertEqual(encoding, 'utf-8-sig')
1123 self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
1124
1125 def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
1126 lines = (
1127 b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
1128 b'print(something)\n',
1129 b'do_something(else)\n'
1130 )
1131 readline = self.get_readline(lines)
1132 self.assertRaises(SyntaxError, detect_encoding, readline)
1133
1134 def test_cookie_second_line_no_bom(self):
1135 lines = (
1136 b'#! something\n',
1137 b'# vim: set fileencoding=ascii :\n',
1138 b'print(something)\n',
1139 b'do_something(else)\n'
1140 )
1141 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1142 self.assertEqual(encoding, 'ascii')
1143 expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
1144 self.assertEqual(consumed_lines, expected)
1145
1146 def test_matched_bom_and_cookie_second_line(self):
1147 lines = (
1148 b'\xef\xbb\xbf#! something\n',
1149 b'f# coding=utf-8\n',
1150 b'print(something)\n',
1151 b'do_something(else)\n'
1152 )
1153 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1154 self.assertEqual(encoding, 'utf-8-sig')
1155 self.assertEqual(consumed_lines,
1156 [b'#! something\n', b'f# coding=utf-8\n'])
1157
1158 def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
1159 lines = (
1160 b'\xef\xbb\xbf#! something\n',
1161 b'# vim: set fileencoding=ascii :\n',
1162 b'print(something)\n',
1163 b'do_something(else)\n'
1164 )
1165 readline = self.get_readline(lines)
1166 self.assertRaises(SyntaxError, detect_encoding, readline)
1167
1168 def test_cookie_second_line_noncommented_first_line(self):
1169 lines = (
1170 b"print('\xc2\xa3')\n",
1171 b'# vim: set fileencoding=iso8859-15 :\n',
1172 b"print('\xe2\x82\xac')\n"
1173 )
1174 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1175 self.assertEqual(encoding, 'utf-8')
1176 expected = [b"print('\xc2\xa3')\n"]
1177 self.assertEqual(consumed_lines, expected)
1178
1179 def test_cookie_second_line_commented_first_line(self):
1180 lines = (
1181 b"#print('\xc2\xa3')\n",
1182 b'# vim: set fileencoding=iso8859-15 :\n',
1183 b"print('\xe2\x82\xac')\n"
1184 )
1185 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1186 self.assertEqual(encoding, 'iso8859-15')
1187 expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
1188 self.assertEqual(consumed_lines, expected)
1189
1190 def test_cookie_second_line_empty_first_line(self):
1191 lines = (
1192 b'\n',
1193 b'# vim: set fileencoding=iso8859-15 :\n',
1194 b"print('\xe2\x82\xac')\n"
1195 )
1196 encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1197 self.assertEqual(encoding, 'iso8859-15')
1198 expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
1199 self.assertEqual(consumed_lines, expected)
1200
1201 def test_latin1_normalization(self):
1202 # See get_normal_name() in tokenizer.c.
1203 encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
1204 "iso-8859-1-unix", "iso-latin-1-mac")
1205 for encoding in encodings:
1206 for rep in ("-", "_"):
1207 enc = encoding.replace("-", rep)
1208 lines = (b"#!/usr/bin/python\n",
1209 b"# coding: " + enc.encode("ascii") + b"\n",
1210 b"print(things)\n",
1211 b"do_something += 4\n")
1212 rl = self.get_readline(lines)
1213 found, consumed_lines = detect_encoding(rl)
1214 self.assertEqual(found, "iso-8859-1")
1215
1216 def test_syntaxerror_latin1(self):
1217 # Issue 14629: need to raise SyntaxError if the first
1218 # line(s) have non-UTF-8 characters
1219 lines = (
1220 b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1221 )
1222 readline = self.get_readline(lines)
1223 self.assertRaises(SyntaxError, detect_encoding, readline)
1224
1225
1226 def test_utf8_normalization(self):
1227 # See get_normal_name() in tokenizer.c.
1228 encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
1229 for encoding in encodings:
1230 for rep in ("-", "_"):
1231 enc = encoding.replace("-", rep)
1232 lines = (b"#!/usr/bin/python\n",
1233 b"# coding: " + enc.encode("ascii") + b"\n",
1234 b"1 + 3\n")
1235 rl = self.get_readline(lines)
1236 found, consumed_lines = detect_encoding(rl)
1237 self.assertEqual(found, "utf-8")
1238
1239 def test_short_files(self):
1240 readline = self.get_readline((b'print(something)\n',))
1241 encoding, consumed_lines = detect_encoding(readline)
1242 self.assertEqual(encoding, 'utf-8')
1243 self.assertEqual(consumed_lines, [b'print(something)\n'])
1244
1245 encoding, consumed_lines = detect_encoding(self.get_readline(()))
1246 self.assertEqual(encoding, 'utf-8')
1247 self.assertEqual(consumed_lines, [])
1248
1249 readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
1250 encoding, consumed_lines = detect_encoding(readline)
1251 self.assertEqual(encoding, 'utf-8-sig')
1252 self.assertEqual(consumed_lines, [b'print(something)\n'])
1253
1254 readline = self.get_readline((b'\xef\xbb\xbf',))
1255 encoding, consumed_lines = detect_encoding(readline)
1256 self.assertEqual(encoding, 'utf-8-sig')
1257 self.assertEqual(consumed_lines, [])
1258
1259 readline = self.get_readline((b'# coding: bad\n',))
1260 self.assertRaises(SyntaxError, detect_encoding, readline)
1261
1262 def test_false_encoding(self):
1263 # Issue 18873: "Encoding" detected in non-comment lines
1264 readline = self.get_readline((b'print("#coding=fake")',))
1265 encoding, consumed_lines = detect_encoding(readline)
1266 self.assertEqual(encoding, 'utf-8')
1267 self.assertEqual(consumed_lines, [b'print("#coding=fake")'])
1268
1269 def test_open(self):
1270 filename = os_helper.TESTFN + '.py'
1271 self.addCleanup(os_helper.unlink, filename)
1272
1273 # test coding cookie
1274 for encoding in ('iso-8859-15', 'utf-8'):
1275 with open(filename, 'w', encoding=encoding) as fp:
1276 print("# coding: %s" % encoding, file=fp)
1277 print("print('euro:\u20ac')", file=fp)
1278 with tokenize_open(filename) as fp:
1279 self.assertEqual(fp.encoding, encoding)
1280 self.assertEqual(fp.mode, 'r')
1281
1282 # test BOM (no coding cookie)
1283 with open(filename, 'w', encoding='utf-8-sig') as fp:
1284 print("print('euro:\u20ac')", file=fp)
1285 with tokenize_open(filename) as fp:
1286 self.assertEqual(fp.encoding, 'utf-8-sig')
1287 self.assertEqual(fp.mode, 'r')
1288
1289 def test_filename_in_exception(self):
1290 # When possible, include the file name in the exception.
1291 path = 'some_file_path'
1292 lines = (
1293 b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1294 )
1295 class Bunk:
1296 def __init__(self, lines, path):
1297 self.name = path
1298 self._lines = lines
1299 self._index = 0
1300
1301 def readline(self):
1302 if self._index == len(self._lines):
1303 raise StopIteration
1304 line = self._lines[self._index]
1305 self._index += 1
1306 return line
1307
1308 with self.assertRaises(SyntaxError):
1309 ins = Bunk(lines, path)
1310 # Make sure lacking a name isn't an issue.
1311 del ins.name
1312 detect_encoding(ins.readline)
1313 with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
1314 ins = Bunk(lines, path)
1315 detect_encoding(ins.readline)
1316
1317 def test_open_error(self):
1318 # Issue #23840: open() must close the binary file on error
1319 m = BytesIO(b'#coding:xxx')
1320 with mock.patch('tokenize._builtin_open', return_value=m):
1321 self.assertRaises(SyntaxError, tokenize_open, 'foobar')
1322 self.assertTrue(m.closed)
1323
1324
1325 class TestTokenize(TestCase):
1326
1327 def test_tokenize(self):
1328 import tokenize as tokenize_module
1329 encoding = object()
1330 encoding_used = None
1331 def mock_detect_encoding(readline):
1332 return encoding, [b'first', b'second']
1333
1334 def mock__tokenize(readline, encoding):
1335 nonlocal encoding_used
1336 encoding_used = encoding
1337 out = []
1338 while True:
1339 next_line = readline()
1340 if next_line:
1341 out.append(next_line)
1342 continue
1343 return out
1344
1345 counter = 0
1346 def mock_readline():
1347 nonlocal counter
1348 counter += 1
1349 if counter == 5:
1350 return b''
1351 return str(counter).encode()
1352
1353 orig_detect_encoding = tokenize_module.detect_encoding
1354 orig__tokenize = tokenize_module._tokenize
1355 tokenize_module.detect_encoding = mock_detect_encoding
1356 tokenize_module._tokenize = mock__tokenize
1357 try:
1358 results = tokenize(mock_readline)
1359 self.assertEqual(list(results),
1360 [b'first', b'second', b'1', b'2', b'3', b'4'])
1361 finally:
1362 tokenize_module.detect_encoding = orig_detect_encoding
1363 tokenize_module._tokenize = orig__tokenize
1364
1365 self.assertEqual(encoding_used, encoding)
1366
1367 def test_oneline_defs(self):
1368 buf = []
1369 for i in range(500):
1370 buf.append('def i{i}(): return {i}'.format(i=i))
1371 buf.append('OK')
1372 buf = '\n'.join(buf)
1373
1374 # Test that 500 consecutive one-line defs are OK
1375 toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
1376 self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
1377 # [-2] is always NEWLINE
1378
1379 def assertExactTypeEqual(self, opstr, *optypes):
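"""Tokenize opstr and assert that, between the leading ENCODING token and
the trailing NEWLINE/ENDMARKER pair, the exact token types match optypes
one for one."""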
1380 tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
1381 num_optypes = len(optypes)
1382 self.assertEqual(len(tokens), 3 + num_optypes)
1383 self.assertEqual(tok_name[tokens[0].exact_type],
1384 tok_name[ENCODING])
1385 for i in range(num_optypes):
1386 self.assertEqual(tok_name[tokens[i + 1].exact_type],
1387 tok_name[optypes[i]])
1388 self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
1389 tok_name[token.NEWLINE])
1390 self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
1391 tok_name[token.ENDMARKER])
1392
1393 def test_exact_type(self):
1394 self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
1395 self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
1396 self.assertExactTypeEqual(':', token.COLON)
1397 self.assertExactTypeEqual(',', token.COMMA)
1398 self.assertExactTypeEqual(';', token.SEMI)
1399 self.assertExactTypeEqual('+', token.PLUS)
1400 self.assertExactTypeEqual('-', token.MINUS)
1401 self.assertExactTypeEqual('*', token.STAR)
1402 self.assertExactTypeEqual('/', token.SLASH)
1403 self.assertExactTypeEqual('|', token.VBAR)
1404 self.assertExactTypeEqual('&', token.AMPER)
1405 self.assertExactTypeEqual('<', token.LESS)
1406 self.assertExactTypeEqual('>', token.GREATER)
1407 self.assertExactTypeEqual('=', token.EQUAL)
1408 self.assertExactTypeEqual('.', token.DOT)
1409 self.assertExactTypeEqual('%', token.PERCENT)
1410 self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
1411 self.assertExactTypeEqual('==', token.EQEQUAL)
1412 self.assertExactTypeEqual('!=', token.NOTEQUAL)
1413 self.assertExactTypeEqual('<=', token.LESSEQUAL)
1414 self.assertExactTypeEqual('>=', token.GREATEREQUAL)
1415 self.assertExactTypeEqual('~', token.TILDE)
1416 self.assertExactTypeEqual('^', token.CIRCUMFLEX)
1417 self.assertExactTypeEqual('<<', token.LEFTSHIFT)
1418 self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
1419 self.assertExactTypeEqual('**', token.DOUBLESTAR)
1420 self.assertExactTypeEqual('+=', token.PLUSEQUAL)
1421 self.assertExactTypeEqual('-=', token.MINEQUAL)
1422 self.assertExactTypeEqual('*=', token.STAREQUAL)
1423 self.assertExactTypeEqual('/=', token.SLASHEQUAL)
1424 self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
1425 self.assertExactTypeEqual('&=', token.AMPEREQUAL)
1426 self.assertExactTypeEqual('|=', token.VBAREQUAL)
1427 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1428 self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1429 self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
1430 self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
1431 self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
1432 self.assertExactTypeEqual('//', token.DOUBLESLASH)
1433 self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
1434 self.assertExactTypeEqual(':=', token.COLONEQUAL)
1435 self.assertExactTypeEqual('...', token.ELLIPSIS)
1436 self.assertExactTypeEqual('->', token.RARROW)
1437 self.assertExactTypeEqual('@', token.AT)
1438 self.assertExactTypeEqual('@=', token.ATEQUAL)
1439
1440 self.assertExactTypeEqual('a**2+b**2==c**2',
1441 NAME, token.DOUBLESTAR, NUMBER,
1442 token.PLUS,
1443 NAME, token.DOUBLESTAR, NUMBER,
1444 token.EQEQUAL,
1445 NAME, token.DOUBLESTAR, NUMBER)
1446 self.assertExactTypeEqual('{1, 2, 3}',
1447 token.LBRACE,
1448 token.NUMBER, token.COMMA,
1449 token.NUMBER, token.COMMA,
1450 token.NUMBER,
1451 token.RBRACE)
1452 self.assertExactTypeEqual('^(x & 0x1)',
1453 token.CIRCUMFLEX,
1454 token.LPAR,
1455 token.NAME, token.AMPER, token.NUMBER,
1456 token.RPAR)
1457
1458 def test_pathological_trailing_whitespace(self):
1459 # See http://bugs.python.org/issue16152
1460 self.assertExactTypeEqual('@ ', token.AT)
1461
1462 def test_comment_at_the_end_of_the_source_without_newline(self):
1463 # See http://bugs.python.org/issue44667
1464 source = 'b = 1\n\n#test'
1465 expected_tokens = [token.NAME, token.EQUAL, token.NUMBER, token.NEWLINE, token.NL, token.COMMENT]
1466
1467 tokens = list(tokenize(BytesIO(source.encode('utf-8')).readline))
1468 self.assertEqual(tok_name[tokens[0].exact_type], tok_name[ENCODING])
1469 for i in range(6):
1470 self.assertEqual(tok_name[tokens[i + 1].exact_type], tok_name[expected_tokens[i]])
1471 self.assertEqual(tok_name[tokens[-1].exact_type], tok_name[token.ENDMARKER])
1472
1473 class UntokenizeTest(TestCase):
1474
1475 def test_bad_input_order(self):
1476 # raise if previous row
1477 u = Untokenizer()
1478 u.prev_row = 2
1479 u.prev_col = 2
1480 with self.assertRaises(ValueError) as cm:
1481 u.add_whitespace((1,3))
1482 self.assertEqual(cm.exception.args[0],
1483 'start (1,3) precedes previous end (2,2)')
1484 # raise if previous column in row
1485 self.assertRaises(ValueError, u.add_whitespace, (2,1))
1486
1487 def test_backslash_continuation(self):
1488 # The problem is that <whitespace>\<newline> leaves no token
1489 u = Untokenizer()
1490 u.prev_row = 1
1491 u.prev_col = 1
1492 u.tokens = []
1493 u.add_whitespace((2, 0))
1494 self.assertEqual(u.tokens, ['\\\n'])
1495 u.prev_row = 2
1496 u.add_whitespace((4, 4))
1497 self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', ' '])
1498 TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n')
1499
1500 def test_iter_compat(self):
1501 u = Untokenizer()
1502 token = (NAME, 'Hello')
1503 tokens = [(ENCODING, 'utf-8'), token]
1504 u.compat(token, iter([]))
1505 self.assertEqual(u.tokens, ["Hello "])
1506 u = Untokenizer()
1507 self.assertEqual(u.untokenize(iter([token])), 'Hello ')
1508 u = Untokenizer()
1509 self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
1510 self.assertEqual(u.encoding, 'utf-8')
1511 self.assertEqual(untokenize(iter(tokens)), b'Hello ')
1512
1513
1514 class TestRoundtrip(TestCase):
1515
1516 def check_roundtrip(self, f):
1517 """
1518 Test roundtrip for `untokenize`. `f` is an open file or a string.
1519 The source code in f is tokenized to both 5- and 2-tuples.
1520 Both sequences are converted back to source code via
1521 tokenize.untokenize(), and the latter tokenized again to 2-tuples.
1522 The test fails if the 3 pair tokenizations do not match.
1523
1524 When untokenize bugs are fixed, untokenize with 5-tuples should
1525 reproduce code that does not contain a backslash continuation
1526 following spaces. A proper test should test this.
1527 """
1528 # Get source code and original tokenizations
1529 if isinstance(f, str):
1530 code = f.encode('utf-8')
1531 else:
1532 code = f.read()
1533 f.close()
1534 readline = iter(code.splitlines(keepends=True)).__next__
1535 tokens5 = list(tokenize(readline))
1536 tokens2 = [tok[:2] for tok in tokens5]
1537 # Reproduce tokens2 from pairs
1538 bytes_from2 = untokenize(tokens2)
1539 readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
1540 tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
1541 self.assertEqual(tokens2_from2, tokens2)
1542 # Reproduce tokens2 from 5-tuples
1543 bytes_from5 = untokenize(tokens5)
1544 readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
1545 tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
1546 self.assertEqual(tokens2_from5, tokens2)
1547
1548 def test_roundtrip(self):
1549 # There are some standard formatting practices that are easy to get right.
1550
1551 self.check_roundtrip("if x == 1:\n"
1552 " print(x)\n")
1553 self.check_roundtrip("# This is a comment\n"
1554 "# This also\n")
1555
1556 # Some people use different formatting conventions, which makes
1557 # untokenize a little trickier. Note that this test involves trailing
1558 # whitespace after the colon; the round trip must still reproduce the
1559 # same token sequence.
1560
1561 self.check_roundtrip("if x == 1 : \n"
1562 " print(x)\n")
1563 fn = support.findfile("tokenize_tests.txt", subdir="tokenizedata")
1564 with open(fn, 'rb') as f:
1565 self.check_roundtrip(f)
1566 self.check_roundtrip("if x == 1:\n"
1567 " # A comment by itself.\n"
1568 " print(x) # Comment here, too.\n"
1569 " # Another comment.\n"
1570 "after_if = True\n")
1571 self.check_roundtrip("if (x # The comments need to go in the right place\n"
1572 " == 1):\n"
1573 " print('x==1')\n")
1574 self.check_roundtrip("class Test: # A comment here\n"
1575 " # A comment with weird indent\n"
1576 " after_com = 5\n"
1577 " def x(m): return m*5 # a one liner\n"
1578 " def y(m): # A whitespace after the colon\n"
1579 " return y*4 # 3-space indent\n")
1580
1581 # Some error-handling code
1582 self.check_roundtrip("try: import somemodule\n"
1583 "except ImportError: # comment\n"
1584 " print('Can not import' # comment2\n)"
1585 "else: print('Loaded')\n")
1586
1587 def test_continuation(self):
1588 # Balancing continuation
1589 self.check_roundtrip("a = (3,4, \n"
1590 "5,6)\n"
1591 "y = [3, 4,\n"
1592 "5]\n"
1593 "z = {'a': 5,\n"
1594 "'b':15, 'c':True}\n"
1595 "x = len(y) + 5 - a[\n"
1596 "3] - a[2]\n"
1597 "+ len(z) - z[\n"
1598 "'b']\n")
1599
1600 def test_backslash_continuation(self):
1601 # Backslash means line continuation, except for comments
1602 self.check_roundtrip("x=1+\\\n"
1603 "1\n"
1604 "# This is a comment\\\n"
1605 "# This also\n")
1606 self.check_roundtrip("# Comment \\\n"
1607 "x = 0")
1608
1609 def test_string_concatenation(self):
1610 # Two string literals on the same line
1611 self.check_roundtrip("'' ''")
1612
1613 def test_random_files(self):
1614 # Test roundtrip on random python modules.
1615 # pass the '-ucpu' option to process the full directory.
1616
1617 import glob, random
1618 tempdir = os.path.dirname(__file__) or os.curdir
1619 testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py"))
1620
1621 # Tokenize is broken on test_pep3131.py because regular expressions are
1622 # broken on the obscure unicode identifiers in it. *sigh*
1623 # With roundtrip extended to test the 5-tuple mode of untokenize,
1624 # 7 more testfiles fail. Remove them also until the failure is diagnosed.
1625
1626 testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
1627 for f in ('buffer', 'builtin', 'fileio', 'os', 'platform', 'sys'):
1628 testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)
1629
1630 if not support.is_resource_enabled("cpu"):
1631 testfiles = random.sample(testfiles, 10)
1632
1633 for testfile in testfiles:
1634 if support.verbose >= 2:
1635 print('tokenize', testfile)
1636 with open(testfile, 'rb') as f:
1637 with self.subTest(file=testfile):
1638 self.check_roundtrip(f)
1639
1640
1641 def roundtrip(self, code):
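"""Untokenize the full 5-tuple token stream of code (str or bytes) and
return the regenerated source, decoded as UTF-8."""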
1642 if isinstance(code, str):
1643 code = code.encode('utf-8')
1644 return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
1645
1646 def test_indentation_semantics_retained(self):
1647 """
1648 Ensure that although whitespace might be mutated in a roundtrip,
1649 the semantic meaning of the indentation remains consistent.
1650 """
1651 code = "if False:\n\tx=3\n\tx=3\n"
1652 codelines = self.roundtrip(code).split('\n')
1653 self.assertEqual(codelines[1], codelines[2])
1654 self.check_roundtrip(code)
1655
1656
1657 class CTokenizeTest(TestCase):
1658 def check_tokenize(self, s, expected):
1659 # Format the tokens in s in a table format.
1660 # The ENDMARKER and final NEWLINE are omitted.
1661 with self.subTest(source=s):
1662 result = stringify_tokens_from_source(
1663 _generate_tokens_from_c_tokenizer(s), s
1664 )
1665 self.assertEqual(result, expected.rstrip().splitlines())
1666
1667 def test_int(self):
1668
1669 self.check_tokenize('0xff <= 255', """\
1670 NUMBER '0xff' (1, 0) (1, 4)
1671 LESSEQUAL '<=' (1, 5) (1, 7)
1672 NUMBER '255' (1, 8) (1, 11)
1673 """)
1674
1675 self.check_tokenize('0b10 <= 255', """\
1676 NUMBER '0b10' (1, 0) (1, 4)
1677 LESSEQUAL '<=' (1, 5) (1, 7)
1678 NUMBER '255' (1, 8) (1, 11)
1679 """)
1680
1681 self.check_tokenize('0o123 <= 0O123', """\
1682 NUMBER '0o123' (1, 0) (1, 5)
1683 LESSEQUAL '<=' (1, 6) (1, 8)
1684 NUMBER '0O123' (1, 9) (1, 14)
1685 """)
1686
1687 self.check_tokenize('1234567 > ~0x15', """\
1688 NUMBER '1234567' (1, 0) (1, 7)
1689 GREATER '>' (1, 8) (1, 9)
1690 TILDE '~' (1, 10) (1, 11)
1691 NUMBER '0x15' (1, 11) (1, 15)
1692 """)
1693
1694 self.check_tokenize('2134568 != 1231515', """\
1695 NUMBER '2134568' (1, 0) (1, 7)
1696 NOTEQUAL '!=' (1, 8) (1, 10)
1697 NUMBER '1231515' (1, 11) (1, 18)
1698 """)
1699
1700 self.check_tokenize('(-124561-1) & 200000000', """\
1701 LPAR '(' (1, 0) (1, 1)
1702 MINUS '-' (1, 1) (1, 2)
1703 NUMBER '124561' (1, 2) (1, 8)
1704 MINUS '-' (1, 8) (1, 9)
1705 NUMBER '1' (1, 9) (1, 10)
1706 RPAR ')' (1, 10) (1, 11)
1707 AMPER '&' (1, 12) (1, 13)
1708 NUMBER '200000000' (1, 14) (1, 23)
1709 """)
1710
1711 self.check_tokenize('0xdeadbeef != -1', """\
1712 NUMBER '0xdeadbeef' (1, 0) (1, 10)
1713 NOTEQUAL '!=' (1, 11) (1, 13)
1714 MINUS '-' (1, 14) (1, 15)
1715 NUMBER '1' (1, 15) (1, 16)
1716 """)
1717
1718 self.check_tokenize('0xdeadc0de & 12345', """\
1719 NUMBER '0xdeadc0de' (1, 0) (1, 10)
1720 AMPER '&' (1, 11) (1, 12)
1721 NUMBER '12345' (1, 13) (1, 18)
1722 """)
1723
1724 self.check_tokenize('0xFF & 0x15 | 1234', """\
1725 NUMBER '0xFF' (1, 0) (1, 4)
1726 AMPER '&' (1, 5) (1, 6)
1727 NUMBER '0x15' (1, 7) (1, 11)
1728 VBAR '|' (1, 12) (1, 13)
1729 NUMBER '1234' (1, 14) (1, 18)
1730 """)
1731
1732 def test_float(self):
1733
1734 self.check_tokenize('x = 3.14159', """\
1735 NAME 'x' (1, 0) (1, 1)
1736 EQUAL '=' (1, 2) (1, 3)
1737 NUMBER '3.14159' (1, 4) (1, 11)
1738 """)
1739
1740 self.check_tokenize('x = 314159.', """\
1741 NAME 'x' (1, 0) (1, 1)
1742 EQUAL '=' (1, 2) (1, 3)
1743 NUMBER '314159.' (1, 4) (1, 11)
1744 """)
1745
1746 self.check_tokenize('x = .314159', """\
1747 NAME 'x' (1, 0) (1, 1)
1748 EQUAL '=' (1, 2) (1, 3)
1749 NUMBER '.314159' (1, 4) (1, 11)
1750 """)
1751
1752 self.check_tokenize('x = 3e14159', """\
1753 NAME 'x' (1, 0) (1, 1)
1754 EQUAL '=' (1, 2) (1, 3)
1755 NUMBER '3e14159' (1, 4) (1, 11)
1756 """)
1757
1758 self.check_tokenize('x = 3E123', """\
1759 NAME 'x' (1, 0) (1, 1)
1760 EQUAL '=' (1, 2) (1, 3)
1761 NUMBER '3E123' (1, 4) (1, 9)
1762 """)
1763
1764 self.check_tokenize('x+y = 3e-1230', """\
1765 NAME 'x' (1, 0) (1, 1)
1766 PLUS '+' (1, 1) (1, 2)
1767 NAME 'y' (1, 2) (1, 3)
1768 EQUAL '=' (1, 4) (1, 5)
1769 NUMBER '3e-1230' (1, 6) (1, 13)
1770 """)
1771
1772 self.check_tokenize('x = 3.14e159', """\
1773 NAME 'x' (1, 0) (1, 1)
1774 EQUAL '=' (1, 2) (1, 3)
1775 NUMBER '3.14e159' (1, 4) (1, 12)
1776 """)
1777
1778 def test_string(self):
1779
1780 self.check_tokenize('x = \'\'; y = ""', """\
1781 NAME 'x' (1, 0) (1, 1)
1782 EQUAL '=' (1, 2) (1, 3)
1783 STRING "''" (1, 4) (1, 6)
1784 SEMI ';' (1, 6) (1, 7)
1785 NAME 'y' (1, 8) (1, 9)
1786 EQUAL '=' (1, 10) (1, 11)
1787 STRING '""' (1, 12) (1, 14)
1788 """)
1789
1790 self.check_tokenize('x = \'"\'; y = "\'"', """\
1791 NAME 'x' (1, 0) (1, 1)
1792 EQUAL '=' (1, 2) (1, 3)
1793 STRING '\\'"\\'' (1, 4) (1, 7)
1794 SEMI ';' (1, 7) (1, 8)
1795 NAME 'y' (1, 9) (1, 10)
1796 EQUAL '=' (1, 11) (1, 12)
1797 STRING '"\\'"' (1, 13) (1, 16)
1798 """)
1799
1800 self.check_tokenize('x = "doesn\'t "shrink", does it"', """\
1801 NAME 'x' (1, 0) (1, 1)
1802 EQUAL '=' (1, 2) (1, 3)
1803 STRING '"doesn\\'t "' (1, 4) (1, 14)
1804 NAME 'shrink' (1, 14) (1, 20)
1805 STRING '", does it"' (1, 20) (1, 31)
1806 """)
1807
1808 self.check_tokenize("x = 'abc' + 'ABC'", """\
1809 NAME 'x' (1, 0) (1, 1)
1810 EQUAL '=' (1, 2) (1, 3)
1811 STRING "'abc'" (1, 4) (1, 9)
1812 PLUS '+' (1, 10) (1, 11)
1813 STRING "'ABC'" (1, 12) (1, 17)
1814 """)
1815
1816 self.check_tokenize('y = "ABC" + "ABC"', """\
1817 NAME 'y' (1, 0) (1, 1)
1818 EQUAL '=' (1, 2) (1, 3)
1819 STRING '"ABC"' (1, 4) (1, 9)
1820 PLUS '+' (1, 10) (1, 11)
1821 STRING '"ABC"' (1, 12) (1, 17)
1822 """)
1823
1824 self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
1825 NAME 'x' (1, 0) (1, 1)
1826 EQUAL '=' (1, 2) (1, 3)
1827 STRING "r'abc'" (1, 4) (1, 10)
1828 PLUS '+' (1, 11) (1, 12)
1829 STRING "r'ABC'" (1, 13) (1, 19)
1830 PLUS '+' (1, 20) (1, 21)
1831 STRING "R'ABC'" (1, 22) (1, 28)
1832 PLUS '+' (1, 29) (1, 30)
1833 STRING "R'ABC'" (1, 31) (1, 37)
1834 """)
1835
1836 self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
1837 NAME 'y' (1, 0) (1, 1)
1838 EQUAL '=' (1, 2) (1, 3)
1839 STRING 'r"abc"' (1, 4) (1, 10)
1840 PLUS '+' (1, 11) (1, 12)
1841 STRING 'r"ABC"' (1, 13) (1, 19)
1842 PLUS '+' (1, 20) (1, 21)
1843 STRING 'R"ABC"' (1, 22) (1, 28)
1844 PLUS '+' (1, 29) (1, 30)
1845 STRING 'R"ABC"' (1, 31) (1, 37)
1846 """)
1847
1848 self.check_tokenize("u'abc' + U'abc'", """\
1849 STRING "u'abc'" (1, 0) (1, 6)
1850 PLUS '+' (1, 7) (1, 8)
1851 STRING "U'abc'" (1, 9) (1, 15)
1852 """)
1853
1854 self.check_tokenize('u"abc" + U"abc"', """\
1855 STRING 'u"abc"' (1, 0) (1, 6)
1856 PLUS '+' (1, 7) (1, 8)
1857 STRING 'U"abc"' (1, 9) (1, 15)
1858 """)
1859
1860 self.check_tokenize("b'abc' + B'abc'", """\
1861 STRING "b'abc'" (1, 0) (1, 6)
1862 PLUS '+' (1, 7) (1, 8)
1863 STRING "B'abc'" (1, 9) (1, 15)
1864 """)
1865
1866 self.check_tokenize('b"abc" + B"abc"', """\
1867 STRING 'b"abc"' (1, 0) (1, 6)
1868 PLUS '+' (1, 7) (1, 8)
1869 STRING 'B"abc"' (1, 9) (1, 15)
1870 """)
1871
1872 self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
1873 STRING "br'abc'" (1, 0) (1, 7)
1874 PLUS '+' (1, 8) (1, 9)
1875 STRING "bR'abc'" (1, 10) (1, 17)
1876 PLUS '+' (1, 18) (1, 19)
1877 STRING "Br'abc'" (1, 20) (1, 27)
1878 PLUS '+' (1, 28) (1, 29)
1879 STRING "BR'abc'" (1, 30) (1, 37)
1880 """)
1881
1882 self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
1883 STRING 'br"abc"' (1, 0) (1, 7)
1884 PLUS '+' (1, 8) (1, 9)
1885 STRING 'bR"abc"' (1, 10) (1, 17)
1886 PLUS '+' (1, 18) (1, 19)
1887 STRING 'Br"abc"' (1, 20) (1, 27)
1888 PLUS '+' (1, 28) (1, 29)
1889 STRING 'BR"abc"' (1, 30) (1, 37)
1890 """)
1891
1892 self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
1893 STRING "rb'abc'" (1, 0) (1, 7)
1894 PLUS '+' (1, 8) (1, 9)
1895 STRING "rB'abc'" (1, 10) (1, 17)
1896 PLUS '+' (1, 18) (1, 19)
1897 STRING "Rb'abc'" (1, 20) (1, 27)
1898 PLUS '+' (1, 28) (1, 29)
1899 STRING "RB'abc'" (1, 30) (1, 37)
1900 """)
1901
1902 self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
1903 STRING 'rb"abc"' (1, 0) (1, 7)
1904 PLUS '+' (1, 8) (1, 9)
1905 STRING 'rB"abc"' (1, 10) (1, 17)
1906 PLUS '+' (1, 18) (1, 19)
1907 STRING 'Rb"abc"' (1, 20) (1, 27)
1908 PLUS '+' (1, 28) (1, 29)
1909 STRING 'RB"abc"' (1, 30) (1, 37)
1910 """)
1911
1912 self.check_tokenize('"a\\\nde\\\nfg"', """\
1913 STRING '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
1914 """)
1915
1916 self.check_tokenize('u"a\\\nde"', """\
1917 STRING 'u"a\\\\\\nde"\' (1, 0) (2, 3)
1918 """)
1919
1920 self.check_tokenize('rb"a\\\nd"', """\
1921 STRING 'rb"a\\\\\\nd"\' (1, 0) (2, 2)
1922 """)
1923
1924 self.check_tokenize(r'"""a\
1925 b"""', """\
1926 STRING '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
1927 """)
1928 self.check_tokenize(r'u"""a\
1929 b"""', """\
1930 STRING 'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
1931 """)
1932 self.check_tokenize(r'rb"""a\
1933 b\
1934 c"""', """\
1935 STRING 'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
1936 """)
1937
1938 self.check_tokenize('f"abc"', """\
1939 STRING 'f"abc"' (1, 0) (1, 6)
1940 """)
1941
1942 self.check_tokenize('fR"a{b}c"', """\
1943 STRING 'fR"a{b}c"' (1, 0) (1, 9)
1944 """)
1945
1946 self.check_tokenize('f"""abc"""', """\
1947 STRING 'f\"\"\"abc\"\"\"' (1, 0) (1, 10)
1948 """)
1949
1950 self.check_tokenize(r'f"abc\
1951 def"', """\
1952 STRING 'f"abc\\\\\\ndef"' (1, 0) (2, 4)
1953 """)
1954
1955 self.check_tokenize(r'Rf"abc\
1956 def"', """\
1957 STRING 'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
1958 """)
1959
1960 def test_function(self):
1961 # Function definitions with defaults, *args, **kwargs and annotations
1962 self.check_tokenize('def d22(a, b, c=2, d=2, *k): pass', """\
1963 NAME 'def' (1, 0) (1, 3)
1964 NAME 'd22' (1, 4) (1, 7)
1965 LPAR '(' (1, 7) (1, 8)
1966 NAME 'a' (1, 8) (1, 9)
1967 COMMA ',' (1, 9) (1, 10)
1968 NAME 'b' (1, 11) (1, 12)
1969 COMMA ',' (1, 12) (1, 13)
1970 NAME 'c' (1, 14) (1, 15)
1971 EQUAL '=' (1, 15) (1, 16)
1972 NUMBER '2' (1, 16) (1, 17)
1973 COMMA ',' (1, 17) (1, 18)
1974 NAME 'd' (1, 19) (1, 20)
1975 EQUAL '=' (1, 20) (1, 21)
1976 NUMBER '2' (1, 21) (1, 22)
1977 COMMA ',' (1, 22) (1, 23)
1978 STAR '*' (1, 24) (1, 25)
1979 NAME 'k' (1, 25) (1, 26)
1980 RPAR ')' (1, 26) (1, 27)
1981 COLON ':' (1, 27) (1, 28)
1982 NAME 'pass' (1, 29) (1, 33)
1983 """)
1984
1985 self.check_tokenize('def d01v_(a=1, *k, **w): pass', """\
1986 NAME 'def' (1, 0) (1, 3)
1987 NAME 'd01v_' (1, 4) (1, 9)
1988 LPAR '(' (1, 9) (1, 10)
1989 NAME 'a' (1, 10) (1, 11)
1990 EQUAL '=' (1, 11) (1, 12)
1991 NUMBER '1' (1, 12) (1, 13)
1992 COMMA ',' (1, 13) (1, 14)
1993 STAR '*' (1, 15) (1, 16)
1994 NAME 'k' (1, 16) (1, 17)
1995 COMMA ',' (1, 17) (1, 18)
1996 DOUBLESTAR '**' (1, 19) (1, 21)
1997 NAME 'w' (1, 21) (1, 22)
1998 RPAR ')' (1, 22) (1, 23)
1999 COLON ':' (1, 23) (1, 24)
2000 NAME 'pass' (1, 25) (1, 29)
2001 """)
2002
2003 self.check_tokenize('def d23(a: str, b: int=3) -> int: pass', """\
2004 NAME 'def' (1, 0) (1, 3)
2005 NAME 'd23' (1, 4) (1, 7)
2006 LPAR '(' (1, 7) (1, 8)
2007 NAME 'a' (1, 8) (1, 9)
2008 COLON ':' (1, 9) (1, 10)
2009 NAME 'str' (1, 11) (1, 14)
2010 COMMA ',' (1, 14) (1, 15)
2011 NAME 'b' (1, 16) (1, 17)
2012 COLON ':' (1, 17) (1, 18)
2013 NAME 'int' (1, 19) (1, 22)
2014 EQUAL '=' (1, 22) (1, 23)
2015 NUMBER '3' (1, 23) (1, 24)
2016 RPAR ')' (1, 24) (1, 25)
2017 RARROW '->' (1, 26) (1, 28)
2018 NAME 'int' (1, 29) (1, 32)
2019 COLON ':' (1, 32) (1, 33)
2020 NAME 'pass' (1, 34) (1, 38)
2021 """)
2022
2023 def test_comparison(self):
2024 # Chained comparison, membership and identity operators
2025 self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
2026 "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
2027 NAME 'if' (1, 0) (1, 2)
2028 NUMBER '1' (1, 3) (1, 4)
2029 LESS '<' (1, 5) (1, 6)
2030 NUMBER '1' (1, 7) (1, 8)
2031 GREATER '>' (1, 9) (1, 10)
2032 NUMBER '1' (1, 11) (1, 12)
2033 EQEQUAL '==' (1, 13) (1, 15)
2034 NUMBER '1' (1, 16) (1, 17)
2035 GREATEREQUAL '>=' (1, 18) (1, 20)
2036 NUMBER '5' (1, 21) (1, 22)
2037 LESSEQUAL '<=' (1, 23) (1, 25)
2038 NUMBER '0x15' (1, 26) (1, 30)
2039 LESSEQUAL '<=' (1, 31) (1, 33)
2040 NUMBER '0x12' (1, 34) (1, 38)
2041 NOTEQUAL '!=' (1, 39) (1, 41)
2042 NUMBER '1' (1, 42) (1, 43)
2043 NAME 'and' (1, 44) (1, 47)
2044 NUMBER '5' (1, 48) (1, 49)
2045 NAME 'in' (1, 50) (1, 52)
2046 NUMBER '1' (1, 53) (1, 54)
2047 NAME 'not' (1, 55) (1, 58)
2048 NAME 'in' (1, 59) (1, 61)
2049 NUMBER '1' (1, 62) (1, 63)
2050 NAME 'is' (1, 64) (1, 66)
2051 NUMBER '1' (1, 67) (1, 68)
2052 NAME 'or' (1, 69) (1, 71)
2053 NUMBER '5' (1, 72) (1, 73)
2054 NAME 'is' (1, 74) (1, 76)
2055 NAME 'not' (1, 77) (1, 80)
2056 NUMBER '1' (1, 81) (1, 82)
2057 COLON ':' (1, 82) (1, 83)
2058 NAME 'pass' (1, 84) (1, 88)
2059 """)
2060
2061 def test_additive(self):
2062 # Additive operators and subscription
2063 self.check_tokenize('x = 1 - y + 15 - 1 + 0x124 + z + a[5]', """\
2064 NAME 'x' (1, 0) (1, 1)
2065 EQUAL '=' (1, 2) (1, 3)
2066 NUMBER '1' (1, 4) (1, 5)
2067 MINUS '-' (1, 6) (1, 7)
2068 NAME 'y' (1, 8) (1, 9)
2069 PLUS '+' (1, 10) (1, 11)
2070 NUMBER '15' (1, 12) (1, 14)
2071 MINUS '-' (1, 15) (1, 16)
2072 NUMBER '1' (1, 17) (1, 18)
2073 PLUS '+' (1, 19) (1, 20)
2074 NUMBER '0x124' (1, 21) (1, 26)
2075 PLUS '+' (1, 27) (1, 28)
2076 NAME 'z' (1, 29) (1, 30)
2077 PLUS '+' (1, 31) (1, 32)
2078 NAME 'a' (1, 33) (1, 34)
2079 LSQB '[' (1, 34) (1, 35)
2080 NUMBER '5' (1, 35) (1, 36)
2081 RSQB ']' (1, 36) (1, 37)
2082 """)
2083
2084 def test_multiplicative(self):
2085 # Multiplicative operators, including matrix multiplication (@)
2086 self.check_tokenize('x = 1//1*1/5*12%0x12@42', """\
2087 NAME 'x' (1, 0) (1, 1)
2088 EQUAL '=' (1, 2) (1, 3)
2089 NUMBER '1' (1, 4) (1, 5)
2090 DOUBLESLASH '//' (1, 5) (1, 7)
2091 NUMBER '1' (1, 7) (1, 8)
2092 STAR '*' (1, 8) (1, 9)
2093 NUMBER '1' (1, 9) (1, 10)
2094 SLASH '/' (1, 10) (1, 11)
2095 NUMBER '5' (1, 11) (1, 12)
2096 STAR '*' (1, 12) (1, 13)
2097 NUMBER '12' (1, 13) (1, 15)
2098 PERCENT '%' (1, 15) (1, 16)
2099 NUMBER '0x12' (1, 16) (1, 20)
2100 AT '@' (1, 20) (1, 21)
2101 NUMBER '42' (1, 21) (1, 23)
2102 """)
2103
2104 def test_unary(self):
2105 # Unary and bitwise operators
2106 self.check_tokenize('~1 ^ 1 & 1 |1 ^ -1', """\
2107 TILDE '~' (1, 0) (1, 1)
2108 NUMBER '1' (1, 1) (1, 2)
2109 CIRCUMFLEX '^' (1, 3) (1, 4)
2110 NUMBER '1' (1, 5) (1, 6)
2111 AMPER '&' (1, 7) (1, 8)
2112 NUMBER '1' (1, 9) (1, 10)
2113 VBAR '|' (1, 11) (1, 12)
2114 NUMBER '1' (1, 12) (1, 13)
2115 CIRCUMFLEX '^' (1, 14) (1, 15)
2116 MINUS '-' (1, 16) (1, 17)
2117 NUMBER '1' (1, 17) (1, 18)
2118 """)
2119
2120 self.check_tokenize('-1*1/1+1*1//1 - ---1**1', """\
2121 MINUS '-' (1, 0) (1, 1)
2122 NUMBER '1' (1, 1) (1, 2)
2123 STAR '*' (1, 2) (1, 3)
2124 NUMBER '1' (1, 3) (1, 4)
2125 SLASH '/' (1, 4) (1, 5)
2126 NUMBER '1' (1, 5) (1, 6)
2127 PLUS '+' (1, 6) (1, 7)
2128 NUMBER '1' (1, 7) (1, 8)
2129 STAR '*' (1, 8) (1, 9)
2130 NUMBER '1' (1, 9) (1, 10)
2131 DOUBLESLASH '//' (1, 10) (1, 12)
2132 NUMBER '1' (1, 12) (1, 13)
2133 MINUS '-' (1, 14) (1, 15)
2134 MINUS '-' (1, 16) (1, 17)
2135 MINUS '-' (1, 17) (1, 18)
2136 MINUS '-' (1, 18) (1, 19)
2137 NUMBER '1' (1, 19) (1, 20)
2138 DOUBLESTAR '**' (1, 20) (1, 22)
2139 NUMBER '1' (1, 22) (1, 23)
2140 """)
2141
2142 def test_selector(self):
2143 # Attribute access, subscription and calls
2144 self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
2145 NAME 'import' (1, 0) (1, 6)
2146 NAME 'sys' (1, 7) (1, 10)
2147 COMMA ',' (1, 10) (1, 11)
2148 NAME 'time' (1, 12) (1, 16)
2149 NEWLINE '' (1, 16) (1, 16)
2150 NAME 'x' (2, 0) (2, 1)
2151 EQUAL '=' (2, 2) (2, 3)
2152 NAME 'sys' (2, 4) (2, 7)
2153 DOT '.' (2, 7) (2, 8)
2154 NAME 'modules' (2, 8) (2, 15)
2155 LSQB '[' (2, 15) (2, 16)
2156 STRING "'time'" (2, 16) (2, 22)
2157 RSQB ']' (2, 22) (2, 23)
2158 DOT '.' (2, 23) (2, 24)
2159 NAME 'time' (2, 24) (2, 28)
2160 LPAR '(' (2, 28) (2, 29)
2161 RPAR ')' (2, 29) (2, 30)
2162 """)
2163
2164 def test_method(self):
2165 # A decorated function definition
2166 self.check_tokenize('@staticmethod\ndef foo(x,y): pass', """\
2167 AT '@' (1, 0) (1, 1)
2168 NAME 'staticmethod' (1, 1) (1, 13)
2169 NEWLINE '' (1, 13) (1, 13)
2170 NAME 'def' (2, 0) (2, 3)
2171 NAME 'foo' (2, 4) (2, 7)
2172 LPAR '(' (2, 7) (2, 8)
2173 NAME 'x' (2, 8) (2, 9)
2174 COMMA ',' (2, 9) (2, 10)
2175 NAME 'y' (2, 10) (2, 11)
2176 RPAR ')' (2, 11) (2, 12)
2177 COLON ':' (2, 12) (2, 13)
2178 NAME 'pass' (2, 14) (2, 18)
2179 """)
2180
2181 def test_tabs(self):
2182 # Like test_method, but with a tab in the parameter list; the tab advances the column by a single character
2183 self.check_tokenize('@staticmethod\ndef foo(x,\ty): pass', """\
2184 AT '@' (1, 0) (1, 1)
2185 NAME 'staticmethod' (1, 1) (1, 13)
2186 NEWLINE '' (1, 13) (1, 13)
2187 NAME 'def' (2, 0) (2, 3)
2188 NAME 'foo' (2, 4) (2, 7)
2189 LPAR '(' (2, 7) (2, 8)
2190 NAME 'x' (2, 8) (2, 9)
2191 COMMA ',' (2, 9) (2, 10)
2192 NAME 'y' (2, 11) (2, 12)
2193 RPAR ')' (2, 12) (2, 13)
2194 COLON ':' (2, 13) (2, 14)
2195 NAME 'pass' (2, 15) (2, 19)
2196 """)
2197
2198 def test_async(self):
2199 # 'async' and 'await' in various contexts; the C tokenizer emits dedicated ASYNC/AWAIT tokens
2200 self.check_tokenize('async = 1', """\
2201 ASYNC 'async' (1, 0) (1, 5)
2202 EQUAL '=' (1, 6) (1, 7)
2203 NUMBER '1' (1, 8) (1, 9)
2204 """)
2205
2206 self.check_tokenize('a = (async = 1)', """\
2207 NAME 'a' (1, 0) (1, 1)
2208 EQUAL '=' (1, 2) (1, 3)
2209 LPAR '(' (1, 4) (1, 5)
2210 ASYNC 'async' (1, 5) (1, 10)
2211 EQUAL '=' (1, 11) (1, 12)
2212 NUMBER '1' (1, 13) (1, 14)
2213 RPAR ')' (1, 14) (1, 15)
2214 """)
2215
2216 self.check_tokenize('async()', """\
2217 ASYNC 'async' (1, 0) (1, 5)
2218 LPAR '(' (1, 5) (1, 6)
2219 RPAR ')' (1, 6) (1, 7)
2220 """)
2221
2222 self.check_tokenize('class async(Bar):pass', """\
2223 NAME 'class' (1, 0) (1, 5)
2224 ASYNC 'async' (1, 6) (1, 11)
2225 LPAR '(' (1, 11) (1, 12)
2226 NAME 'Bar' (1, 12) (1, 15)
2227 RPAR ')' (1, 15) (1, 16)
2228 COLON ':' (1, 16) (1, 17)
2229 NAME 'pass' (1, 17) (1, 21)
2230 """)
2231
2232 self.check_tokenize('class async:pass', """\
2233 NAME 'class' (1, 0) (1, 5)
2234 ASYNC 'async' (1, 6) (1, 11)
2235 COLON ':' (1, 11) (1, 12)
2236 NAME 'pass' (1, 12) (1, 16)
2237 """)
2238
2239 self.check_tokenize('await = 1', """\
2240 AWAIT 'await' (1, 0) (1, 5)
2241 EQUAL '=' (1, 6) (1, 7)
2242 NUMBER '1' (1, 8) (1, 9)
2243 """)
2244
2245 self.check_tokenize('foo.async', """\
2246 NAME 'foo' (1, 0) (1, 3)
2247 DOT '.' (1, 3) (1, 4)
2248 ASYNC 'async' (1, 4) (1, 9)
2249 """)
2250
2251 self.check_tokenize('async for a in b: pass', """\
2252 ASYNC 'async' (1, 0) (1, 5)
2253 NAME 'for' (1, 6) (1, 9)
2254 NAME 'a' (1, 10) (1, 11)
2255 NAME 'in' (1, 12) (1, 14)
2256 NAME 'b' (1, 15) (1, 16)
2257 COLON ':' (1, 16) (1, 17)
2258 NAME 'pass' (1, 18) (1, 22)
2259 """)
2260
2261 self.check_tokenize('async with a as b: pass', """\
2262 ASYNC 'async' (1, 0) (1, 5)
2263 NAME 'with' (1, 6) (1, 10)
2264 NAME 'a' (1, 11) (1, 12)
2265 NAME 'as' (1, 13) (1, 15)
2266 NAME 'b' (1, 16) (1, 17)
2267 COLON ':' (1, 17) (1, 18)
2268 NAME 'pass' (1, 19) (1, 23)
2269 """)
2270
2271 self.check_tokenize('async.foo', """\
2272 ASYNC 'async' (1, 0) (1, 5)
2273 DOT '.' (1, 5) (1, 6)
2274 NAME 'foo' (1, 6) (1, 9)
2275 """)
2276
2277 self.check_tokenize('async', """\
2278 ASYNC 'async' (1, 0) (1, 5)
2279 """)
2280
2281 self.check_tokenize('async\n#comment\nawait', """\
2282 ASYNC 'async' (1, 0) (1, 5)
2283 NEWLINE '' (1, 5) (1, 5)
2284 AWAIT 'await' (3, 0) (3, 5)
2285 """)
2286
2287 self.check_tokenize('async\n...\nawait', """\
2288 ASYNC 'async' (1, 0) (1, 5)
2289 NEWLINE '' (1, 5) (1, 5)
2290 ELLIPSIS '...' (2, 0) (2, 3)
2291 NEWLINE '' (2, 3) (2, 3)
2292 AWAIT 'await' (3, 0) (3, 5)
2293 """)
2294
2295 self.check_tokenize('async\nawait', """\
2296 ASYNC 'async' (1, 0) (1, 5)
2297 NEWLINE '' (1, 5) (1, 5)
2298 AWAIT 'await' (2, 0) (2, 5)
2299 """)
2300
2301 self.check_tokenize('foo.async + 1', """\
2302 NAME 'foo' (1, 0) (1, 3)
2303 DOT '.' (1, 3) (1, 4)
2304 ASYNC 'async' (1, 4) (1, 9)
2305 PLUS '+' (1, 10) (1, 11)
2306 NUMBER '1' (1, 12) (1, 13)
2307 """)
2308
2309 self.check_tokenize('async def foo(): pass', """\
2310 ASYNC 'async' (1, 0) (1, 5)
2311 NAME 'def' (1, 6) (1, 9)
2312 NAME 'foo' (1, 10) (1, 13)
2313 LPAR '(' (1, 13) (1, 14)
2314 RPAR ')' (1, 14) (1, 15)
2315 COLON ':' (1, 15) (1, 16)
2316 NAME 'pass' (1, 17) (1, 21)
2317 """)
2318
2319 self.check_tokenize('''\
2320 async def foo():
2321 def foo(await):
2322 await = 1
2323 if 1:
2324 await
2325 async += 1
2326 ''', """\
2327 ASYNC 'async' (1, 0) (1, 5)
2328 NAME 'def' (1, 6) (1, 9)
2329 NAME 'foo' (1, 10) (1, 13)
2330 LPAR '(' (1, 13) (1, 14)
2331 RPAR ')' (1, 14) (1, 15)
2332 COLON ':' (1, 15) (1, 16)
2333 NEWLINE '' (1, 16) (1, 16)
2334 INDENT '' (2, -1) (2, -1)
2335 NAME 'def' (2, 2) (2, 5)
2336 NAME 'foo' (2, 6) (2, 9)
2337 LPAR '(' (2, 9) (2, 10)
2338 AWAIT 'await' (2, 10) (2, 15)
2339 RPAR ')' (2, 15) (2, 16)
2340 COLON ':' (2, 16) (2, 17)
2341 NEWLINE '' (2, 17) (2, 17)
2342 INDENT '' (3, -1) (3, -1)
2343 AWAIT 'await' (3, 4) (3, 9)
2344 EQUAL '=' (3, 10) (3, 11)
2345 NUMBER '1' (3, 12) (3, 13)
2346 NEWLINE '' (3, 13) (3, 13)
2347 DEDENT '' (4, -1) (4, -1)
2348 NAME 'if' (4, 2) (4, 4)
2349 NUMBER '1' (4, 5) (4, 6)
2350 COLON ':' (4, 6) (4, 7)
2351 NEWLINE '' (4, 7) (4, 7)
2352 INDENT '' (5, -1) (5, -1)
2353 AWAIT 'await' (5, 4) (5, 9)
2354 NEWLINE '' (5, 9) (5, 9)
2355 DEDENT '' (6, -1) (6, -1)
2356 DEDENT '' (6, -1) (6, -1)
2357 ASYNC 'async' (6, 0) (6, 5)
2358 PLUSEQUAL '+=' (6, 6) (6, 8)
2359 NUMBER '1' (6, 9) (6, 10)
2360 NEWLINE '' (6, 10) (6, 10)
2361 """)
2362
2363 self.check_tokenize('async def foo():\n  async for i in 1: pass', """\
2364 ASYNC 'async' (1, 0) (1, 5)
2365 NAME 'def' (1, 6) (1, 9)
2366 NAME 'foo' (1, 10) (1, 13)
2367 LPAR '(' (1, 13) (1, 14)
2368 RPAR ')' (1, 14) (1, 15)
2369 COLON ':' (1, 15) (1, 16)
2370 NEWLINE '' (1, 16) (1, 16)
2371 INDENT '' (2, -1) (2, -1)
2372 ASYNC 'async' (2, 2) (2, 7)
2373 NAME 'for' (2, 8) (2, 11)
2374 NAME 'i' (2, 12) (2, 13)
2375 NAME 'in' (2, 14) (2, 16)
2376 NUMBER '1' (2, 17) (2, 18)
2377 COLON ':' (2, 18) (2, 19)
2378 NAME 'pass' (2, 20) (2, 24)
2379 DEDENT '' (2, -1) (2, -1)
2380 """)
2381
2382 self.check_tokenize('async def foo(async): await', """\
2383 ASYNC 'async' (1, 0) (1, 5)
2384 NAME 'def' (1, 6) (1, 9)
2385 NAME 'foo' (1, 10) (1, 13)
2386 LPAR '(' (1, 13) (1, 14)
2387 ASYNC 'async' (1, 14) (1, 19)
2388 RPAR ')' (1, 19) (1, 20)
2389 COLON ':' (1, 20) (1, 21)
2390 AWAIT 'await' (1, 22) (1, 27)
2391 """)
2392
2393 self.check_tokenize('''\
2394 def f():
2395
2396 def baz(): pass
2397 async def bar(): pass
2398
2399 await = 2''', """\
2400 NAME 'def' (1, 0) (1, 3)
2401 NAME 'f' (1, 4) (1, 5)
2402 LPAR '(' (1, 5) (1, 6)
2403 RPAR ')' (1, 6) (1, 7)
2404 COLON ':' (1, 7) (1, 8)
2405 NEWLINE '' (1, 8) (1, 8)
2406 INDENT '' (3, -1) (3, -1)
2407 NAME 'def' (3, 2) (3, 5)
2408 NAME 'baz' (3, 6) (3, 9)
2409 LPAR '(' (3, 9) (3, 10)
2410 RPAR ')' (3, 10) (3, 11)
2411 COLON ':' (3, 11) (3, 12)
2412 NAME 'pass' (3, 13) (3, 17)
2413 NEWLINE '' (3, 17) (3, 17)
2414 ASYNC 'async' (4, 2) (4, 7)
2415 NAME 'def' (4, 8) (4, 11)
2416 NAME 'bar' (4, 12) (4, 15)
2417 LPAR '(' (4, 15) (4, 16)
2418 RPAR ')' (4, 16) (4, 17)
2419 COLON ':' (4, 17) (4, 18)
2420 NAME 'pass' (4, 19) (4, 23)
2421 NEWLINE '' (4, 23) (4, 23)
2422 AWAIT 'await' (6, 2) (6, 7)
2423 EQUAL '=' (6, 8) (6, 9)
2424 NUMBER '2' (6, 10) (6, 11)
2425 DEDENT '' (6, -1) (6, -1)
2426 """)
2427
2428 self.check_tokenize('''\
2429 async def f():
2430
2431 def baz(): pass
2432 async def bar(): pass
2433
2434 await = 2''', """\
2435 ASYNC 'async' (1, 0) (1, 5)
2436 NAME 'def' (1, 6) (1, 9)
2437 NAME 'f' (1, 10) (1, 11)
2438 LPAR '(' (1, 11) (1, 12)
2439 RPAR ')' (1, 12) (1, 13)
2440 COLON ':' (1, 13) (1, 14)
2441 NEWLINE '' (1, 14) (1, 14)
2442 INDENT '' (3, -1) (3, -1)
2443 NAME 'def' (3, 2) (3, 5)
2444 NAME 'baz' (3, 6) (3, 9)
2445 LPAR '(' (3, 9) (3, 10)
2446 RPAR ')' (3, 10) (3, 11)
2447 COLON ':' (3, 11) (3, 12)
2448 NAME 'pass' (3, 13) (3, 17)
2449 NEWLINE '' (3, 17) (3, 17)
2450 ASYNC 'async' (4, 2) (4, 7)
2451 NAME 'def' (4, 8) (4, 11)
2452 NAME 'bar' (4, 12) (4, 15)
2453 LPAR '(' (4, 15) (4, 16)
2454 RPAR ')' (4, 16) (4, 17)
2455 COLON ':' (4, 17) (4, 18)
2456 NAME 'pass' (4, 19) (4, 23)
2457 NEWLINE '' (4, 23) (4, 23)
2458 AWAIT 'await' (6, 2) (6, 7)
2459 EQUAL '=' (6, 8) (6, 9)
2460 NUMBER '2' (6, 10) (6, 11)
2461 DEDENT '' (6, -1) (6, -1)
2462 """)
2463
2464 def test_unicode(self):
2465 # Non-ASCII identifiers; the reported columns are byte offsets
2466 self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
2467 NAME 'Örter' (1, 0) (1, 6)
2468 EQUAL '=' (1, 7) (1, 8)
2469 STRING "u'places'" (1, 9) (1, 18)
2470 NEWLINE '' (1, 18) (1, 18)
2471 NAME 'grün' (2, 0) (2, 5)
2472 EQUAL '=' (2, 6) (2, 7)
2473 STRING "U'green'" (2, 8) (2, 16)
2474 """)
2475
2476 def test_invalid_syntax(self):
2477 def get_tokens(string):
2478 return list(_generate_tokens_from_c_tokenizer(string))
2479
2480 self.assertRaises(SyntaxError, get_tokens, "(1+2]")
2481 self.assertRaises(SyntaxError, get_tokens, "(1+2}")
2482 self.assertRaises(SyntaxError, get_tokens, "{1+2]")
2483
2484 self.assertRaises(SyntaxError, get_tokens, "1_")
2485 self.assertRaises(SyntaxError, get_tokens, "1.2_")
2486 self.assertRaises(SyntaxError, get_tokens, "1e2_")
2487 self.assertRaises(SyntaxError, get_tokens, "1e+")
2488
2489 self.assertRaises(SyntaxError, get_tokens, "\xa0")
2490 self.assertRaises(SyntaxError, get_tokens, "€")
2491
2492 self.assertRaises(SyntaxError, get_tokens, "0b12")
2493 self.assertRaises(SyntaxError, get_tokens, "0b1_2")
2494 self.assertRaises(SyntaxError, get_tokens, "0b2")
2495 self.assertRaises(SyntaxError, get_tokens, "0b1_")
2496 self.assertRaises(SyntaxError, get_tokens, "0b")
2497 self.assertRaises(SyntaxError, get_tokens, "0o18")
2498 self.assertRaises(SyntaxError, get_tokens, "0o1_8")
2499 self.assertRaises(SyntaxError, get_tokens, "0o8")
2500 self.assertRaises(SyntaxError, get_tokens, "0o1_")
2501 self.assertRaises(SyntaxError, get_tokens, "0o")
2502 self.assertRaises(SyntaxError, get_tokens, "0x1_")
2503 self.assertRaises(SyntaxError, get_tokens, "0x")
2504 self.assertRaises(SyntaxError, get_tokens, "1_")
2505 self.assertRaises(SyntaxError, get_tokens, "012")
2506 self.assertRaises(SyntaxError, get_tokens, "1.2_")
2507 self.assertRaises(SyntaxError, get_tokens, "1e2_")
2508 self.assertRaises(SyntaxError, get_tokens, "1e+")
2509
2510 self.assertRaises(SyntaxError, get_tokens, "'sdfsdf")
2511 self.assertRaises(SyntaxError, get_tokens, "'''sdfsdf''")
2512
2513 self.assertRaises(SyntaxError, get_tokens, "("*1000+"a"+")"*1000)
2514 self.assertRaises(SyntaxError, get_tokens, "]")
2515
2516 def test_max_indent(self):
2517 MAXINDENT = 100
2518
2519 def generate_source(indents):
2520 source = ''.join((' ' * x) + 'if True:\n' for x in range(indents))
2521 source += ' ' * indents + 'pass\n'
2522 return source
2523
2524 valid = generate_source(MAXINDENT - 1)
2525 tokens = list(_generate_tokens_from_c_tokenizer(valid))
2526 self.assertEqual(tokens[-1].type, DEDENT)
2527 compile(valid, "<string>", "exec")
2528
2529 invalid = generate_source(MAXINDENT)
2530 tokens = list(_generate_tokens_from_c_tokenizer(invalid))
2531 self.assertEqual(tokens[-1].type, NEWLINE)
2532 self.assertRaises(
2533 IndentationError, compile, invalid, "<string>", "exec"
2534 )
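# For illustration only (not exercised by the test): with the one-space
# indent step used by generate_source above, generate_source(2) yields
#
#     if True:
#      if True:
#       pass
#
# i.e. one extra indentation level per nested "if", so MAXINDENT nested
# blocks overflow the tokenizer's indentation stack while MAXINDENT - 1
# levels still tokenize and compile.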
2535
2536 def test_continuation_lines_indentation(self):
2537 def get_tokens(string):
2538 return [(kind, string) for (kind, string, *_) in _generate_tokens_from_c_tokenizer(string)]
2539
2540 code = dedent("""
2541 def fib(n):
2542 \\
2543 '''Print a Fibonacci series up to n.'''
2544 \\
2545 a, b = 0, 1
2546 """)
2547
2548 self.check_tokenize(code, """\
2549 NAME 'def' (2, 0) (2, 3)
2550 NAME 'fib' (2, 4) (2, 7)
2551 LPAR '(' (2, 7) (2, 8)
2552 NAME 'n' (2, 8) (2, 9)
2553 RPAR ')' (2, 9) (2, 10)
2554 COLON ':' (2, 10) (2, 11)
2555 NEWLINE '' (2, 11) (2, 11)
2556 INDENT '' (4, -1) (4, -1)
2557 STRING "'''Print a Fibonacci series up to n.'''" (4, 0) (4, 39)
2558 NEWLINE '' (4, 39) (4, 39)
2559 NAME 'a' (6, 0) (6, 1)
2560 COMMA ',' (6, 1) (6, 2)
2561 NAME 'b' (6, 3) (6, 4)
2562 EQUAL '=' (6, 5) (6, 6)
2563 NUMBER '0' (6, 7) (6, 8)
2564 COMMA ',' (6, 8) (6, 9)
2565 NUMBER '1' (6, 10) (6, 11)
2566 NEWLINE '' (6, 11) (6, 11)
2567 DEDENT '' (6, -1) (6, -1)
2568 """)
2569
2570 code_no_cont = dedent("""
2571 def fib(n):
2572 '''Print a Fibonacci series up to n.'''
2573 a, b = 0, 1
2574 """)
2575
2576 self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
2577
2578 code = dedent("""
2579 pass
2580 \\
2581
2582 pass
2583 """)
2584
2585 self.check_tokenize(code, """\
2586 NAME 'pass' (2, 0) (2, 4)
2587 NEWLINE '' (2, 4) (2, 4)
2588 NAME 'pass' (5, 0) (5, 4)
2589 NEWLINE '' (5, 4) (5, 4)
2590 """)
2591
2592 code_no_cont = dedent("""
2593 pass
2594 pass
2595 """)
2596
2597 self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
2598
2599 code = dedent("""
2600 if x:
2601 y = 1
2602 \\
2603 \\
2604 \\
2605 \\
2606 foo = 1
2607 """)
2608
2609 self.check_tokenize(code, """\
2610 NAME 'if' (2, 0) (2, 2)
2611 NAME 'x' (2, 3) (2, 4)
2612 COLON ':' (2, 4) (2, 5)
2613 NEWLINE '' (2, 5) (2, 5)
2614 INDENT '' (3, -1) (3, -1)
2615 NAME 'y' (3, 4) (3, 5)
2616 EQUAL '=' (3, 6) (3, 7)
2617 NUMBER '1' (3, 8) (3, 9)
2618 NEWLINE '' (3, 9) (3, 9)
2619 NAME 'foo' (8, 4) (8, 7)
2620 EQUAL '=' (8, 8) (8, 9)
2621 NUMBER '1' (8, 10) (8, 11)
2622 NEWLINE '' (8, 11) (8, 11)
2623 DEDENT '' (8, -1) (8, -1)
2624 """)
2625
2626 code_no_cont = dedent("""
2627 if x:
2628 y = 1
2629 foo = 1
2630 """)
2631
2632 self.assertEqual(get_tokens(code), get_tokens(code_no_cont))
2633
2634
2635 class CTokenizerBufferTests(unittest.TestCase):
2636 def test_newline_at_the_end_of_buffer(self):
2637 # See issue 99581: Make sure that if we need to add a new line at the
2638 # end of the buffer, we have enough space in the buffer, especially when
2639 # the current line is as long as the buffer space available. (A rough standalone sketch of the input follows the test body below.)
2640 test_script = f"""\
2641 #coding: latin-1
2642 #{"a"*10000}
2643 #{"a"*10002}"""
2644 with os_helper.temp_dir() as temp_dir:
2645 file_name = make_script(temp_dir, 'foo', test_script)
2646 run_test_script(file_name)
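# For illustration only: the script above is what actually gets executed in
# a subprocess via run_test_script. The same input shape can be built and
# tokenized directly with the pure-Python API (this sketch does not reach
# the C tokenizer's buffer handling; it only shows how the input is shaped,
# reusing the 10000/10002 lengths from the script above):
#
#     import io, tokenize
#     src = ("#coding: latin-1\n"
#            "#" + "a" * 10000 + "\n"
#            "#" + "a" * 10002).encode("latin-1")
#     tokens = list(tokenize.tokenize(io.BytesIO(src).readline))
#     assert tokens[-1].type == tokenize.ENDMARKER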
2647
2648
2649 if __name__ == "__main__":
2650 unittest.main()