1  """Tokenization help for Python programs.
       2  
       3  tokenize(readline) is a generator that breaks a stream of bytes into
       4  Python tokens.  It decodes the bytes according to PEP-0263 for
       5  determining source file encoding.
       6  
       7  It accepts a readline-like method which is called repeatedly to get the
       8  next line of input (or b"" for EOF).  It generates 5-tuples with these
       9  members:
      10  
      11      the token type (see token.py)
      12      the token (a string)
      13      the starting (row, column) indices of the token (a 2-tuple of ints)
      14      the ending (row, column) indices of the token (a 2-tuple of ints)
      15      the original line (string)
      16  
      17  It is designed to match the working of the Python tokenizer exactly, except
      18  that it produces COMMENT tokens for comments and gives type OP for all
      19  operators.  Additionally, all token lists start with an ENCODING token
      20  which tells you which encoding was used to decode the bytes stream.
      21  """
      22  
      23  __author__ = 'Ka-Ping Yee <ping@lfw.org>'
      24  __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
      25                 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
      26                 'Michael Foord')
      27  from builtins import open as _builtin_open
      28  from codecs import lookup, BOM_UTF8
      29  import collections
      30  import functools
      31  from io import TextIOWrapper
      32  import itertools as _itertools
      33  import re
      34  import sys
      35  from token import *
      36  from token import EXACT_TOKEN_TYPES
      37  import _tokenize
      38  
      39  cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
      40  blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
      41  
      42  import token
      43  __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
      44                             "untokenize", "TokenInfo"]
      45  del token
      46  
      47  class ESC[4;38;5;81mTokenInfo(ESC[4;38;5;149mcollectionsESC[4;38;5;149m.ESC[4;38;5;149mnamedtuple('TokenInfo', 'type string start end line')):
      48      def __repr__(self):
      49          annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
      50          return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
      51                  self._replace(type=annotated_type))
      52  
      53      @property
      54      def exact_type(self):
      55          if self.type == OP and self.string in EXACT_TOKEN_TYPES:
      56              return EXACT_TOKEN_TYPES[self.string]
      57          else:
      58              return self.type
      59  
      60  def group(*choices): return '(' + '|'.join(choices) + ')'
      61  def any(*choices): return group(*choices) + '*'
      62  def maybe(*choices): return group(*choices) + '?'
      63  
      64  # Note: we use unicode matching for names ("\w") but ascii matching for
      65  # number literals.
      66  Whitespace = r'[ \f\t]*'
      67  Comment = r'#[^\r\n]*'
      68  Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
      69  Name = r'\w+'
      70  
      71  Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
      72  Binnumber = r'0[bB](?:_?[01])+'
      73  Octnumber = r'0[oO](?:_?[0-7])+'
      74  Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
      75  Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
      76  Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
      77  Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
      78                     r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
      79  Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
      80  Floatnumber = group(Pointfloat, Expfloat)
      81  Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
      82  Number = group(Imagnumber, Floatnumber, Intnumber)
      83  
      84  # Return the empty string, plus all of the valid string prefixes.
      85  def _all_string_prefixes():
      86      # The valid string prefixes. Only contain the lower case versions,
      87      #  and don't contain any permutations (include 'fr', but not
      88      #  'rf'). The various permutations will be generated.
      89      _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
      90      # if we add binary f-strings, add: ['fb', 'fbr']
      91      result = {''}
      92      for prefix in _valid_string_prefixes:
      93          for t in _itertools.permutations(prefix):
      94              # create a list with upper and lower versions of each
      95              #  character
      96              for u in _itertools.product(*[(c, c.upper()) for c in t]):
      97                  result.add(''.join(u))
      98      return result
      99  
     100  @functools.lru_cache
     101  def _compile(expr):
     102      return re.compile(expr, re.UNICODE)
     103  
     104  # Note that since _all_string_prefixes includes the empty string,
     105  #  StringPrefix can be the empty string (making it optional).
     106  StringPrefix = group(*_all_string_prefixes())
     107  
     108  # Tail end of ' string.
     109  Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
     110  # Tail end of " string.
     111  Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
     112  # Tail end of ''' string.
     113  Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
     114  # Tail end of """ string.
     115  Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
     116  Triple = group(StringPrefix + "'''", StringPrefix + '"""')
     117  # Single-line ' or " string.
     118  String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
     119                 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
     120  
     121  # Sorting in reverse order puts the long operators before their prefixes.
     122  # Otherwise if = came before ==, == would get recognized as two instances
     123  # of =.
     124  Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
     125  Funny = group(r'\r?\n', Special)
     126  
     127  PlainToken = group(Number, Funny, String, Name)
     128  Token = Ignore + PlainToken
     129  
     130  # First (or only) line of ' or " string.
     131  ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
     132                  group("'", r'\\\r?\n'),
     133                  StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
     134                  group('"', r'\\\r?\n'))
     135  PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
     136  PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
     137  
     138  # For a given string prefix plus quotes, endpats maps it to a regex
     139  #  to match the remainder of that string. _prefix can be empty, for
     140  #  a normal single or triple quoted string (with no prefix).
     141  endpats = {}
     142  for _prefix in _all_string_prefixes():
     143      endpats[_prefix + "'"] = Single
     144      endpats[_prefix + '"'] = Double
     145      endpats[_prefix + "'''"] = Single3
     146      endpats[_prefix + '"""'] = Double3
     147  del _prefix
     148  
     149  # A set of all of the single and triple quoted string prefixes,
     150  #  including the opening quotes.
     151  single_quoted = set()
     152  triple_quoted = set()
     153  for t in _all_string_prefixes():
     154      for u in (t + '"', t + "'"):
     155          single_quoted.add(u)
     156      for u in (t + '"""', t + "'''"):
     157          triple_quoted.add(u)
     158  del t, u
     159  
     160  tabsize = 8
     161  
     162  class ESC[4;38;5;81mTokenError(ESC[4;38;5;149mException): pass
     163  
     164  
     165  class ESC[4;38;5;81mStopTokenizing(ESC[4;38;5;149mException): pass
     166  
     167  class ESC[4;38;5;81mUntokenizer:
     168  
     169      def __init__(self):
     170          self.tokens = []
     171          self.prev_row = 1
     172          self.prev_col = 0
     173          self.encoding = None
     174  
     175      def add_whitespace(self, start):
     176          row, col = start
     177          if row < self.prev_row or row == self.prev_row and col < self.prev_col:
     178              raise ValueError("start ({},{}) precedes previous end ({},{})"
     179                               .format(row, col, self.prev_row, self.prev_col))
     180          row_offset = row - self.prev_row
     181          if row_offset:
     182              self.tokens.append("\\\n" * row_offset)
     183              self.prev_col = 0
     184          col_offset = col - self.prev_col
     185          if col_offset:
     186              self.tokens.append(" " * col_offset)
     187  
     188      def untokenize(self, iterable):
     189          it = iter(iterable)
     190          indents = []
     191          startline = False
     192          for t in it:
     193              if len(t) == 2:
     194                  self.compat(t, it)
     195                  break
     196              tok_type, token, start, end, line = t
     197              if tok_type == ENCODING:
     198                  self.encoding = token
     199                  continue
     200              if tok_type == ENDMARKER:
     201                  break
     202              if tok_type == INDENT:
     203                  indents.append(token)
     204                  continue
     205              elif tok_type == DEDENT:
     206                  indents.pop()
     207                  self.prev_row, self.prev_col = end
     208                  continue
     209              elif tok_type in (NEWLINE, NL):
     210                  startline = True
     211              elif startline and indents:
     212                  indent = indents[-1]
     213                  if start[1] >= len(indent):
     214                      self.tokens.append(indent)
     215                      self.prev_col = len(indent)
     216                  startline = False
     217              elif tok_type == FSTRING_MIDDLE:
     218                  if '{' in token or '}' in token:
     219                      end_line, end_col = end
     220                      end = (end_line, end_col + token.count('{') + token.count('}'))
     221                      token = re.sub('{', '{{', token)
     222                      token = re.sub('}', '}}', token)
     223  
     224  
     225              self.add_whitespace(start)
     226              self.tokens.append(token)
     227              self.prev_row, self.prev_col = end
     228              if tok_type in (NEWLINE, NL):
     229                  self.prev_row += 1
     230                  self.prev_col = 0
     231          return "".join(self.tokens)
     232  
     233      def compat(self, token, iterable):
     234          indents = []
     235          toks_append = self.tokens.append
     236          startline = token[0] in (NEWLINE, NL)
     237          prevstring = False
     238  
     239          for tok in _itertools.chain([token], iterable):
     240              toknum, tokval = tok[:2]
     241              if toknum == ENCODING:
     242                  self.encoding = tokval
     243                  continue
     244  
     245              if toknum in (NAME, NUMBER):
     246                  tokval += ' '
     247  
     248              # Insert a space between two consecutive strings
     249              if toknum == STRING:
     250                  if prevstring:
     251                      tokval = ' ' + tokval
     252                  prevstring = True
     253              else:
     254                  prevstring = False
     255  
     256              if toknum == INDENT:
     257                  indents.append(tokval)
     258                  continue
     259              elif toknum == DEDENT:
     260                  indents.pop()
     261                  continue
     262              elif toknum in (NEWLINE, NL):
     263                  startline = True
     264              elif startline and indents:
     265                  toks_append(indents[-1])
     266                  startline = False
     267              elif toknum == FSTRING_MIDDLE:
     268                  if '{' in tokval or '}' in tokval:
     269                      tokval = re.sub('{', '{{', tokval)
     270                      tokval = re.sub('}', '}}', tokval)
     271  
     272              toks_append(tokval)
     273  
     274  
     275  def untokenize(iterable):
     276      """Transform tokens back into Python source code.
     277      It returns a bytes object, encoded using the ENCODING
     278      token, which is the first token sequence output by tokenize.
     279  
     280      Each element returned by the iterable must be a token sequence
     281      with at least two elements, a token number and token value.  If
     282      only two tokens are passed, the resulting output is poor.
     283  
     284      Round-trip invariant for full input:
     285          Untokenized source will match input source exactly
     286  
     287      Round-trip invariant for limited input:
     288          # Output bytes will tokenize back to the input
     289          t1 = [tok[:2] for tok in tokenize(f.readline)]
     290          newcode = untokenize(t1)
     291          readline = BytesIO(newcode).readline
     292          t2 = [tok[:2] for tok in tokenize(readline)]
     293          assert t1 == t2
     294      """
     295      ut = Untokenizer()
     296      out = ut.untokenize(iterable)
     297      if ut.encoding is not None:
     298          out = out.encode(ut.encoding)
     299      return out
     300  
     301  
     302  def _get_normal_name(orig_enc):
     303      """Imitates get_normal_name in tokenizer.c."""
     304      # Only care about the first 12 characters.
     305      enc = orig_enc[:12].lower().replace("_", "-")
     306      if enc == "utf-8" or enc.startswith("utf-8-"):
     307          return "utf-8"
     308      if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
     309         enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
     310          return "iso-8859-1"
     311      return orig_enc
     312  
     313  def detect_encoding(readline):
     314      """
     315      The detect_encoding() function is used to detect the encoding that should
     316      be used to decode a Python source file.  It requires one argument, readline,
     317      in the same way as the tokenize() generator.
     318  
     319      It will call readline a maximum of twice, and return the encoding used
     320      (as a string) and a list of any lines (left as bytes) it has read in.
     321  
     322      It detects the encoding from the presence of a utf-8 bom or an encoding
     323      cookie as specified in pep-0263.  If both a bom and a cookie are present,
     324      but disagree, a SyntaxError will be raised.  If the encoding cookie is an
     325      invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
     326      'utf-8-sig' is returned.
     327  
     328      If no encoding is specified, then the default of 'utf-8' will be returned.
     329      """
     330      try:
     331          filename = readline.__self__.name
     332      except AttributeError:
     333          filename = None
     334      bom_found = False
     335      encoding = None
     336      default = 'utf-8'
     337      def read_or_stop():
     338          try:
     339              return readline()
     340          except StopIteration:
     341              return b''
     342  
     343      def find_cookie(line):
     344          try:
     345              # Decode as UTF-8. Either the line is an encoding declaration,
     346              # in which case it should be pure ASCII, or it must be UTF-8
     347              # per default encoding.
     348              line_string = line.decode('utf-8')
     349          except UnicodeDecodeError:
     350              msg = "invalid or missing encoding declaration"
     351              if filename is not None:
     352                  msg = '{} for {!r}'.format(msg, filename)
     353              raise SyntaxError(msg)
     354  
     355          match = cookie_re.match(line_string)
     356          if not match:
     357              return None
     358          encoding = _get_normal_name(match.group(1))
     359          try:
     360              codec = lookup(encoding)
     361          except LookupError:
     362              # This behaviour mimics the Python interpreter
     363              if filename is None:
     364                  msg = "unknown encoding: " + encoding
     365              else:
     366                  msg = "unknown encoding for {!r}: {}".format(filename,
     367                          encoding)
     368              raise SyntaxError(msg)
     369  
     370          if bom_found:
     371              if encoding != 'utf-8':
     372                  # This behaviour mimics the Python interpreter
     373                  if filename is None:
     374                      msg = 'encoding problem: utf-8'
     375                  else:
     376                      msg = 'encoding problem for {!r}: utf-8'.format(filename)
     377                  raise SyntaxError(msg)
     378              encoding += '-sig'
     379          return encoding
     380  
     381      first = read_or_stop()
     382      if first.startswith(BOM_UTF8):
     383          bom_found = True
     384          first = first[3:]
     385          default = 'utf-8-sig'
     386      if not first:
     387          return default, []
     388  
     389      encoding = find_cookie(first)
     390      if encoding:
     391          return encoding, [first]
     392      if not blank_re.match(first):
     393          return default, [first]
     394  
     395      second = read_or_stop()
     396      if not second:
     397          return default, [first]
     398  
     399      encoding = find_cookie(second)
     400      if encoding:
     401          return encoding, [first, second]
     402  
     403      return default, [first, second]
     404  
     405  
     406  def open(filename):
     407      """Open a file in read only mode using the encoding detected by
     408      detect_encoding().
     409      """
     410      buffer = _builtin_open(filename, 'rb')
     411      try:
     412          encoding, lines = detect_encoding(buffer.readline)
     413          buffer.seek(0)
     414          text = TextIOWrapper(buffer, encoding, line_buffering=True)
     415          text.mode = 'r'
     416          return text
     417      except:
     418          buffer.close()
     419          raise
     420  
     421  def tokenize(readline):
     422      """
     423      The tokenize() generator requires one argument, readline, which
     424      must be a callable object which provides the same interface as the
     425      readline() method of built-in file objects.  Each call to the function
     426      should return one line of input as bytes.  Alternatively, readline
     427      can be a callable function terminating with StopIteration:
     428          readline = open(myfile, 'rb').__next__  # Example of alternate readline
     429  
     430      The generator produces 5-tuples with these members: the token type; the
     431      token string; a 2-tuple (srow, scol) of ints specifying the row and
     432      column where the token begins in the source; a 2-tuple (erow, ecol) of
     433      ints specifying the row and column where the token ends in the source;
     434      and the line on which the token was found.  The line passed is the
     435      physical line.
     436  
     437      The first token sequence will always be an ENCODING token
     438      which tells you which encoding was used to decode the bytes stream.
     439      """
     440      encoding, consumed = detect_encoding(readline)
     441      rl_gen = _itertools.chain(consumed, iter(readline, b""))
     442      if encoding is not None:
     443          if encoding == "utf-8-sig":
     444              # BOM will already have been stripped.
     445              encoding = "utf-8"
     446          yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
     447      yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)
     448  
     449  def generate_tokens(readline):
     450      """Tokenize a source reading Python code as unicode strings.
     451  
     452      This has the same API as tokenize(), except that it expects the *readline*
     453      callable to return str objects instead of bytes.
     454      """
     455      return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)
     456  
     457  def main():
     458      import argparse
     459  
     460      # Helper error handling routines
     461      def perror(message):
     462          sys.stderr.write(message)
     463          sys.stderr.write('\n')
     464  
     465      def error(message, filename=None, location=None):
     466          if location:
     467              args = (filename,) + location + (message,)
     468              perror("%s:%d:%d: error: %s" % args)
     469          elif filename:
     470              perror("%s: error: %s" % (filename, message))
     471          else:
     472              perror("error: %s" % message)
     473          sys.exit(1)
     474  
     475      # Parse the arguments and options
     476      parser = argparse.ArgumentParser(prog='python -m tokenize')
     477      parser.add_argument(dest='filename', nargs='?',
     478                          metavar='filename.py',
     479                          help='the file to tokenize; defaults to stdin')
     480      parser.add_argument('-e', '--exact', dest='exact', action='store_true',
     481                          help='display token names using the exact type')
     482      args = parser.parse_args()
     483  
     484      try:
     485          # Tokenize the input
     486          if args.filename:
     487              filename = args.filename
     488              with _builtin_open(filename, 'rb') as f:
     489                  tokens = list(tokenize(f.readline))
     490          else:
     491              filename = "<stdin>"
     492              tokens = _generate_tokens_from_c_tokenizer(
     493                  sys.stdin.readline, extra_tokens=True)
     494  
     495  
     496          # Output the tokenization
     497          for token in tokens:
     498              token_type = token.type
     499              if args.exact:
     500                  token_type = token.exact_type
     501              token_range = "%d,%d-%d,%d:" % (token.start + token.end)
     502              print("%-20s%-15s%-15r" %
     503                    (token_range, tok_name[token_type], token.string))
     504      except IndentationError as err:
     505          line, column = err.args[1][1:3]
     506          error(err.args[0], filename, (line, column))
     507      except TokenError as err:
     508          line, column = err.args[1]
     509          error(err.args[0], filename, (line, column))
     510      except SyntaxError as err:
     511          error(err, filename)
     512      except OSError as err:
     513          error(err)
     514      except KeyboardInterrupt:
     515          print("interrupted\n")
     516      except Exception as err:
     517          perror("unexpected error: %s" % err)
     518          raise
     519  
     520  def _transform_msg(msg):
     521      """Transform error messages from the C tokenizer into the Python tokenize
     522  
     523      The C tokenizer is more picky than the Python one, so we need to massage
     524      the error messages a bit for backwards compatibility.
     525      """
     526      if "unterminated triple-quoted string literal" in msg:
     527          return "EOF in multi-line string"
     528      return msg
     529  
     530  def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
     531      """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     532      if encoding is None:
     533          it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
     534      else:
     535          it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
     536      try:
     537          for info in it:
     538              yield TokenInfo._make(info)
     539      except SyntaxError as e:
     540          if type(e) != SyntaxError:
     541              raise e from None
     542          msg = _transform_msg(e.msg)
     543          raise TokenError(msg, (e.lineno, e.offset)) from None
     544  
     545  
     546  if __name__ == "__main__":
     547      main()