1 """Tokenization help for Python programs.
2
3 tokenize(readline) is a generator that breaks a stream of bytes into
4 Python tokens. It decodes the bytes according to PEP-0263 for
5 determining source file encoding.
6
7 It accepts a readline-like method which is called repeatedly to get the
8 next line of input (or b"" for EOF). It generates 5-tuples with these
9 members:
10
11 the token type (see token.py)
12 the token (a string)
13 the starting (row, column) indices of the token (a 2-tuple of ints)
14 the ending (row, column) indices of the token (a 2-tuple of ints)
15 the original line (string)
16
17 It is designed to match the working of the Python tokenizer exactly, except
18 that it produces COMMENT tokens for comments and gives type OP for all
19 operators. Additionally, all token lists start with an ENCODING token
20 which tells you which encoding was used to decode the bytes stream.
21 """

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
import functools
from io import TextIOWrapper
import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES
import _tokenize

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type
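# For example (illustrative): the exact_type property refines a generic OP
# token to its specific operator type, e.g.
#
#     tok = TokenInfo(OP, '+', (1, 2), (1, 3), 'x + y\n')
#     tok.type        # OP
#     tok.exact_type  # PLUS, looked up in EXACT_TOKEN_TYPES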

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
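# For example (illustrative): group('a', 'b') yields '(a|b)', any('a') yields
# '(a)*', and maybe('a') yields '(a)?'.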

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result
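# For illustration, the resulting set includes entries such as '', 'b', 'B',
# 'rb', 'Rb', 'bR', 'BR', 'fr', 'rF', and so on -- every case/order variant
# of the prefixes listed above.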

@functools.lru_cache
def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3
del _prefix

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
del t, u
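# For example (illustrative): endpats["'"] is Single, endpats['f"""'] is
# Double3, "rb'" is a member of single_quoted, and 'F"""' is a member of
# triple_quoted.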

tabsize = 8

class TokenError(Exception): pass


class StopTokenizing(Exception): pass

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            elif tok_type == FSTRING_MIDDLE:
                if '{' in token or '}' in token:
                    end_line, end_col = end
                    end = (end_line, end_col + token.count('{') + token.count('}'))
                    token = re.sub('{', '{{', token)
                    token = re.sub('}', '}}', token)

            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            elif toknum == FSTRING_MIDDLE:
                if '{' in tokval or '}' in tokval:
                    tokval = re.sub('{', '{{', tokval)
                    tokval = re.sub('}', '}}', tokval)

            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out
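# Example usage (a minimal sketch, mirroring the docstring above):
#
#     from io import BytesIO
#     source = b"x = 1\n"
#     toks = list(tokenize(BytesIO(source).readline))
#     assert untokenize(toks) == source          # full 5-tuples round-trip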


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
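# For example (illustrative): _get_normal_name("UTF_8") returns "utf-8" and
# _get_normal_name("Latin-1") returns "iso-8859-1".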

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                                                             encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
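# Example usage (an illustrative sketch):
#
#     from io import BytesIO
#     enc, lines = detect_encoding(BytesIO(b"# -*- coding: latin-1 -*-\n").readline)
#     # enc == 'iso-8859-1'; lines holds the single line that was read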


def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise
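# Example usage (illustrative; 'some_module.py' is a hypothetical path):
#
#     with open('some_module.py') as f:   # this module's open(), not builtins.open
#         source_text = f.read()          # decoded using the detected encoding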

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    rl_gen = _itertools.chain(consumed, iter(readline, b""))
    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)
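# Example usage (a minimal sketch): the first token is always ENCODING, e.g.
#
#     from io import BytesIO
#     toks = list(tokenize(BytesIO(b"print('hi')\n").readline))
#     # toks[0].type == ENCODING and toks[0].string == 'utf-8'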

def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
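# Example usage (illustrative): generate_tokens() reads str lines, e.g.
#
#     from io import StringIO
#     tokens = list(generate_tokens(StringIO("a + b\n").readline))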

def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        sys.stderr.write(message)
        sys.stderr.write('\n')

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _generate_tokens_from_c_tokenizer(
                sys.stdin.readline, extra_tokens=True)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

def _transform_msg(msg):
    """Transform error messages from the C tokenizer to match the Python
    tokenize module.

    The C tokenizer is more picky than the Python one, so we need to massage
    the error messages a bit for backwards compatibility.
    """
    if "unterminated triple-quoted string literal" in msg:
        return "EOF in multi-line string"
    return msg

def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
    if encoding is None:
        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
    else:
        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
    try:
        for info in it:
            yield TokenInfo._make(info)
    except SyntaxError as e:
        if type(e) != SyntaxError:
            raise e from None
        msg = _transform_msg(e.msg)
        raise TokenError(msg, (e.lineno, e.offset)) from None


if __name__ == "__main__":
    main()