# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)
It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
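
# For orientation, a minimal sketch of driving generate_tokens() directly
# (illustrative only; "example.py" is a hypothetical file name, and tok_name
# comes from token.py via the star import below):
#
#     from lib2to3.pgen2 import tokenize
#     with open("example.py") as f:
#         for toktype, tokstr, start, end, line in tokenize.generate_tokens(f.readline):
#             print(tokenize.tok_name[toktype], repr(tokstr), start, end)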

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
def _combinations(*l):
    return set(
        x + y for x in l for y in l + ("",) if x.casefold() != y.casefold()
    )
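
# A rough sketch of what the small helpers above produce (illustrative only):
#     group('a', 'b')          -> '(a|b)'
#     any('a', 'b')            -> '(a|b)*'
#     maybe('a', 'b')          -> '(a|b)?'
#     _combinations('r', 'b')  -> {'r', 'b', 'rb', 'br'}
# (pairs whose two parts are the same letter, ignoring case, are dropped)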

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
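
# For illustration, a few literal forms the Number pattern above accepts (not
# exhaustive): 0b1010_0101, 0o755, 0xFF_FF, 1_000_000L, 3.14, .5e-3, 42j, 2.5J.
# The optional 'L'/'l' suffix is kept because lib2to3 also has to tokenize
# Python 2 sources, which still use long-integer literals.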

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r':=', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
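
# A rough sketch of how the main loop in generate_tokens() uses pseudoprog
# (illustrative only):
#
#     m = pseudoprog.match("    x = 1\n", 0)
#     m.span(1)    # -> (4, 5): the leading whitespace is consumed, and
#                  #    group 1 is the candidate token, here the name "x"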

_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}

triple_quoted = (
    {"'''", '"""'} |
    {f"{prefix}'''" for prefix in _strprefixes} |
    {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'} |
    {f"{prefix}'" for prefix in _strprefixes} |
    {f'{prefix}"' for prefix in _strprefixes}
)
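
# For illustration, the sets above support membership tests such as:
#     "'''" in triple_quoted     -> True
#     'rb"""' in triple_quoted   -> True
#     "f'" in single_quoted      -> True
#     'ur"' in single_quoted     -> True  (Python 2 prefix, kept for 2to3)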

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
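
# Example of the callback interface above (illustrative only; "example.py" is
# a hypothetical file name).  printtoken() is the default tokeneater and
# simply echoes each token:
#
#     with open("example.py") as f:
#         tokenize(f.readline)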

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
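
# Example lines that cookie_re recognizes (illustrative; see PEP 263):
#     # -*- coding: latin-1 -*-
#     # vim: set fileencoding=utf-8 :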

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
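
# A couple of normalizations the helper above performs (illustrative):
#     _get_normal_name("UTF_8")    -> "utf-8"
#     _get_normal_name("Latin-1")  -> "iso-8859-1"
#     _get_normal_name("euc-jp")   -> "euc-jp"   (unknown names pass through)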

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
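
# Example (illustrative only; "example.py" is a hypothetical file name).  The
# file must be opened in binary mode, since detect_encoding() inspects raw
# bytes (BOM and ASCII-decoded cookie lines):
#
#     with open("example.py", "rb") as f:
#         encoding, first_lines = detect_encoding(f.readline)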

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(True)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
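
# Round-trip sketch mirroring the docstring above (illustrative only;
# "example.py" is a hypothetical file name).  With bare (type, value) pairs,
# Untokenizer falls back to compat() and the emitted whitespace is approximate:
#
#     with open("example.py") as f:
#         pairs = [tok[:2] for tok in generate_tokens(f.readline)]
#     source = untokenize(pairs)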

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    physical line.
    """
    lnum = parenlev = continued = 0
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in string.digits or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token in ('def', 'for'):
                        if (stashed
                            and stashed[0] == NAME
                            and stashed[1] == 'async'):

                            if token == 'def':
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')


if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)