Python-3.11.7/Lib/tokenize.py
       1  """Tokenization help for Python programs.
       2  
       3  tokenize(readline) is a generator that breaks a stream of bytes into
       4  Python tokens.  It decodes the bytes according to PEP-0263 for
       5  determining source file encoding.
       6  
       7  It accepts a readline-like method which is called repeatedly to get the
       8  next line of input (or b"" for EOF).  It generates 5-tuples with these
       9  members:
      10  
      11      the token type (see token.py)
      12      the token (a string)
      13      the starting (row, column) indices of the token (a 2-tuple of ints)
      14      the ending (row, column) indices of the token (a 2-tuple of ints)
      15      the original line (string)
      16  
      17  It is designed to match the working of the Python tokenizer exactly, except
      18  that it produces COMMENT tokens for comments and gives type OP for all
      19  operators.  Additionally, all token lists start with an ENCODING token
      20  which tells you which encoding was used to decode the bytes stream.
      21  """
      22  
      23  __author__ = 'Ka-Ping Yee <ping@lfw.org>'
      24  __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
      25                 'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
      26                 'Michael Foord')
      27  from builtins import open as _builtin_open
      28  from codecs import lookup, BOM_UTF8
      29  import collections
      30  import functools
      31  from io import TextIOWrapper
      32  import itertools as _itertools
      33  import re
      34  import sys
      35  from token import *
      36  from token import EXACT_TOKEN_TYPES
      37  
      38  cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
      39  blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
      40  
      41  import token
      42  __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
      43                             "untokenize", "TokenInfo"]
      44  del token
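
           # A minimal usage sketch of the generator interface described in the
           # module docstring.  The helper below is illustrative only (its name and
           # sample source are assumptions) and is never called by this module.
           def _example_token_stream():
               """Yield (type name, string, start, end) for a tiny in-memory program."""
               from io import BytesIO
               source = b"x = 1  # comment\n"
               # tokenize() is defined later in this file; the name is resolved at
               # call time.  The first tuple produced is always the ENCODING token.
               for tok in tokenize(BytesIO(source).readline):
                   yield tok_name[tok.type], tok.string, tok.start, tok.end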
      45  
       46  class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
      47      def __repr__(self):
      48          annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
      49          return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
      50                  self._replace(type=annotated_type))
      51  
      52      @property
      53      def exact_type(self):
      54          if self.type == OP and self.string in EXACT_TOKEN_TYPES:
      55              return EXACT_TOKEN_TYPES[self.string]
      56          else:
      57              return self.type
      58  
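           # For example (values assumed, shown only as an illustration): the "+"
           # in "1+2" is tokenized with .type == OP, while .exact_type resolves it
           # through EXACT_TOKEN_TYPES:
           #
           #     tok = TokenInfo(OP, '+', (1, 1), (1, 2), '1+2\n')
           #     tok.type == OP            # True
           #     tok.exact_type == PLUS    # True
           #     TokenInfo(NAME, 'x', (1, 0), (1, 1), 'x\n').exact_type == NAME  # True
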
      59  def group(*choices): return '(' + '|'.join(choices) + ')'
      60  def any(*choices): return group(*choices) + '*'
      61  def maybe(*choices): return group(*choices) + '?'
      62  
      63  # Note: we use unicode matching for names ("\w") but ascii matching for
      64  # number literals.
      65  Whitespace = r'[ \f\t]*'
      66  Comment = r'#[^\r\n]*'
      67  Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
      68  Name = r'\w+'
      69  
      70  Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
      71  Binnumber = r'0[bB](?:_?[01])+'
      72  Octnumber = r'0[oO](?:_?[0-7])+'
      73  Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
      74  Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
      75  Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
      76  Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
      77                     r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
      78  Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
      79  Floatnumber = group(Pointfloat, Expfloat)
      80  Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
      81  Number = group(Imagnumber, Floatnumber, Intnumber)
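
           # A rough self-check of the number grammar above (the literal forms below
           # are assumed examples; nothing in this module calls this helper):
           def _example_number_regex():
               pat = re.compile(Number)
               for lit in ('0x_FF', '0b1010', '0o777', '1_000', '3.14', '1e-9', '2j'):
                   assert pat.match(lit).group() == lit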
      82  
      83  # Return the empty string, plus all of the valid string prefixes.
      84  def _all_string_prefixes():
       85      # The valid string prefixes: only the lower-case versions are
       86      #  listed, and no permutations are included ('fr' appears, but
       87      #  not 'rf'); the permutations and case variants are generated below.
      88      _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
      89      # if we add binary f-strings, add: ['fb', 'fbr']
      90      result = {''}
      91      for prefix in _valid_string_prefixes:
      92          for t in _itertools.permutations(prefix):
      93              # create a list with upper and lower versions of each
      94              #  character
      95              for u in _itertools.product(*[(c, c.upper()) for c in t]):
      96                  result.add(''.join(u))
      97      return result
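
           # For instance, the returned set contains '' plus case/order variants such
           # as 'b', 'B', 'rb', 'Rb', 'fR' and 'BR', but never a prefix mixing 'b'
           # and 'u' (a sketch; values follow from the construction above):
           #
           #     prefixes = _all_string_prefixes()
           #     assert {'', 'b', 'rb', 'Rb', 'fR', 'BR'} <= prefixes
           #     assert 'bu' not in prefixes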
      98  
      99  @functools.lru_cache
     100  def _compile(expr):
     101      return re.compile(expr, re.UNICODE)
     102  
     103  # Note that since _all_string_prefixes includes the empty string,
     104  #  StringPrefix can be the empty string (making it optional).
     105  StringPrefix = group(*_all_string_prefixes())
     106  
     107  # Tail end of ' string.
     108  Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
     109  # Tail end of " string.
     110  Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
     111  # Tail end of ''' string.
     112  Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
     113  # Tail end of """ string.
     114  Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
     115  Triple = group(StringPrefix + "'''", StringPrefix + '"""')
     116  # Single-line ' or " string.
     117  String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
     118                 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
     119  
     120  # Sorting in reverse order puts the long operators before their prefixes.
     121  # Otherwise if = came before ==, == would get recognized as two instances
     122  # of =.
     123  Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
     124  Funny = group(r'\r?\n', Special)
     125  
     126  PlainToken = group(Number, Funny, String, Name)
     127  Token = Ignore + PlainToken
     128  
     129  # First (or only) line of ' or " string.
     130  ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
     131                  group("'", r'\\\r?\n'),
     132                  StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
     133                  group('"', r'\\\r?\n'))
     134  PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
     135  PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
     136  
     137  # For a given string prefix plus quotes, endpats maps it to a regex
     138  #  to match the remainder of that string. _prefix can be empty, for
     139  #  a normal single or triple quoted string (with no prefix).
     140  endpats = {}
     141  for _prefix in _all_string_prefixes():
     142      endpats[_prefix + "'"] = Single
     143      endpats[_prefix + '"'] = Double
     144      endpats[_prefix + "'''"] = Single3
     145      endpats[_prefix + '"""'] = Double3
     146  del _prefix
     147  
     148  # A set of all of the single and triple quoted string prefixes,
     149  #  including the opening quotes.
     150  single_quoted = set()
     151  triple_quoted = set()
     152  for t in _all_string_prefixes():
     153      for u in (t + '"', t + "'"):
     154          single_quoted.add(u)
     155      for u in (t + '"""', t + "'''"):
     156          triple_quoted.add(u)
     157  del t, u
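
           # A short illustration of how these tables are consulted by the tokenizer
           # below (values follow from the construction above):
           #
           #     endpats["'"] is Single        # tail regex for a plain '...' string
           #     endpats['f"""'] is Double3    # tail regex for an f"""...""" string
           #     "rb'" in single_quoted        # True
           #     'F"""' in triple_quoted       # True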
     158  
     159  tabsize = 8
     160  
      161  class TokenError(Exception): pass
     162  
      163  class StopTokenizing(Exception): pass
     164  
     165  
      166  class Untokenizer:
     167  
     168      def __init__(self):
     169          self.tokens = []
     170          self.prev_row = 1
     171          self.prev_col = 0
     172          self.encoding = None
     173  
     174      def add_whitespace(self, start):
     175          row, col = start
     176          if row < self.prev_row or row == self.prev_row and col < self.prev_col:
     177              raise ValueError("start ({},{}) precedes previous end ({},{})"
     178                               .format(row, col, self.prev_row, self.prev_col))
     179          row_offset = row - self.prev_row
     180          if row_offset:
     181              self.tokens.append("\\\n" * row_offset)
     182              self.prev_col = 0
     183          col_offset = col - self.prev_col
     184          if col_offset:
     185              self.tokens.append(" " * col_offset)
     186  
     187      def untokenize(self, iterable):
     188          it = iter(iterable)
     189          indents = []
     190          startline = False
     191          for t in it:
     192              if len(t) == 2:
     193                  self.compat(t, it)
     194                  break
     195              tok_type, token, start, end, line = t
     196              if tok_type == ENCODING:
     197                  self.encoding = token
     198                  continue
     199              if tok_type == ENDMARKER:
     200                  break
     201              if tok_type == INDENT:
     202                  indents.append(token)
     203                  continue
     204              elif tok_type == DEDENT:
     205                  indents.pop()
     206                  self.prev_row, self.prev_col = end
     207                  continue
     208              elif tok_type in (NEWLINE, NL):
     209                  startline = True
     210              elif startline and indents:
     211                  indent = indents[-1]
     212                  if start[1] >= len(indent):
     213                      self.tokens.append(indent)
     214                      self.prev_col = len(indent)
     215                  startline = False
     216              self.add_whitespace(start)
     217              self.tokens.append(token)
     218              self.prev_row, self.prev_col = end
     219              if tok_type in (NEWLINE, NL):
     220                  self.prev_row += 1
     221                  self.prev_col = 0
     222          return "".join(self.tokens)
     223  
     224      def compat(self, token, iterable):
     225          indents = []
     226          toks_append = self.tokens.append
     227          startline = token[0] in (NEWLINE, NL)
     228          prevstring = False
     229  
     230          for tok in _itertools.chain([token], iterable):
     231              toknum, tokval = tok[:2]
     232              if toknum == ENCODING:
     233                  self.encoding = tokval
     234                  continue
     235  
     236              if toknum in (NAME, NUMBER):
     237                  tokval += ' '
     238  
     239              # Insert a space between two consecutive strings
     240              if toknum == STRING:
     241                  if prevstring:
     242                      tokval = ' ' + tokval
     243                  prevstring = True
     244              else:
     245                  prevstring = False
     246  
     247              if toknum == INDENT:
     248                  indents.append(tokval)
     249                  continue
     250              elif toknum == DEDENT:
     251                  indents.pop()
     252                  continue
     253              elif toknum in (NEWLINE, NL):
     254                  startline = True
     255              elif startline and indents:
     256                  toks_append(indents[-1])
     257                  startline = False
     258              toks_append(tokval)
     259  
     260  
     261  def untokenize(iterable):
     262      """Transform tokens back into Python source code.
     263      It returns a bytes object, encoded using the ENCODING
     264      token, which is the first token sequence output by tokenize.
     265  
     266      Each element returned by the iterable must be a token sequence
      267      with at least two elements: a token number and a token value.  If
      268      only those two elements are supplied, the output loses exact spacing.
     269  
     270      Round-trip invariant for full input:
     271          Untokenized source will match input source exactly
     272  
     273      Round-trip invariant for limited input:
     274          # Output bytes will tokenize back to the input
     275          t1 = [tok[:2] for tok in tokenize(f.readline)]
     276          newcode = untokenize(t1)
     277          readline = BytesIO(newcode).readline
     278          t2 = [tok[:2] for tok in tokenize(readline)]
     279          assert t1 == t2
     280      """
     281      ut = Untokenizer()
     282      out = ut.untokenize(iterable)
     283      if ut.encoding is not None:
     284          out = out.encode(ut.encoding)
     285      return out
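
           # A minimal sketch of the full round-trip invariant documented above
           # (sample source and helper name are assumptions; not called anywhere):
           def _example_untokenize_roundtrip():
               from io import BytesIO
               source = b"def f(x):\n    return x + 1\n"
               tokens = list(tokenize(BytesIO(source).readline))
               # With full 5-tuples the exact source, including whitespace, comes back.
               assert untokenize(tokens) == source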
     286  
     287  
     288  def _get_normal_name(orig_enc):
     289      """Imitates get_normal_name in tokenizer.c."""
     290      # Only care about the first 12 characters.
     291      enc = orig_enc[:12].lower().replace("_", "-")
     292      if enc == "utf-8" or enc.startswith("utf-8-"):
     293          return "utf-8"
     294      if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
     295         enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
     296          return "iso-8859-1"
     297      return orig_enc
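
           # For illustration (assumed inputs): _get_normal_name('UTF_8') returns
           # 'utf-8', _get_normal_name('Latin-1') returns 'iso-8859-1', and an
           # unrecognised name such as 'cp1252' is returned unchanged.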
     298  
     299  def detect_encoding(readline):
     300      """
     301      The detect_encoding() function is used to detect the encoding that should
     302      be used to decode a Python source file.  It requires one argument, readline,
     303      in the same way as the tokenize() generator.
     304  
     305      It will call readline a maximum of twice, and return the encoding used
     306      (as a string) and a list of any lines (left as bytes) it has read in.
     307  
      308      It detects the encoding from the presence of a UTF-8 BOM or an encoding
      309      cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
      310      but disagree, a SyntaxError will be raised.  If the encoding cookie is an
      311      invalid charset, a SyntaxError will also be raised.  Note that if a UTF-8
      312      BOM is found, 'utf-8-sig' is returned.
     313  
     314      If no encoding is specified, then the default of 'utf-8' will be returned.
     315      """
     316      try:
     317          filename = readline.__self__.name
     318      except AttributeError:
     319          filename = None
     320      bom_found = False
     321      encoding = None
     322      default = 'utf-8'
     323      def read_or_stop():
     324          try:
     325              return readline()
     326          except StopIteration:
     327              return b''
     328  
     329      def find_cookie(line):
     330          try:
     331              # Decode as UTF-8. Either the line is an encoding declaration,
     332              # in which case it should be pure ASCII, or it must be UTF-8
     333              # per default encoding.
     334              line_string = line.decode('utf-8')
     335          except UnicodeDecodeError:
     336              msg = "invalid or missing encoding declaration"
     337              if filename is not None:
     338                  msg = '{} for {!r}'.format(msg, filename)
     339              raise SyntaxError(msg)
     340  
     341          match = cookie_re.match(line_string)
     342          if not match:
     343              return None
     344          encoding = _get_normal_name(match.group(1))
     345          try:
     346              codec = lookup(encoding)
     347          except LookupError:
     348              # This behaviour mimics the Python interpreter
     349              if filename is None:
     350                  msg = "unknown encoding: " + encoding
     351              else:
     352                  msg = "unknown encoding for {!r}: {}".format(filename,
     353                          encoding)
     354              raise SyntaxError(msg)
     355  
     356          if bom_found:
     357              if encoding != 'utf-8':
     358                  # This behaviour mimics the Python interpreter
     359                  if filename is None:
     360                      msg = 'encoding problem: utf-8'
     361                  else:
     362                      msg = 'encoding problem for {!r}: utf-8'.format(filename)
     363                  raise SyntaxError(msg)
     364              encoding += '-sig'
     365          return encoding
     366  
     367      first = read_or_stop()
     368      if first.startswith(BOM_UTF8):
     369          bom_found = True
     370          first = first[3:]
     371          default = 'utf-8-sig'
     372      if not first:
     373          return default, []
     374  
     375      encoding = find_cookie(first)
     376      if encoding:
     377          return encoding, [first]
     378      if not blank_re.match(first):
     379          return default, [first]
     380  
     381      second = read_or_stop()
     382      if not second:
     383          return default, [first]
     384  
     385      encoding = find_cookie(second)
     386      if encoding:
     387          return encoding, [first, second]
     388  
     389      return default, [first, second]
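
           # A minimal sketch of detect_encoding() on an in-memory buffer (the sample
           # source and helper name are assumptions, not part of the API):
           def _example_detect_encoding():
               from io import BytesIO
               buf = BytesIO(b"# -*- coding: latin-1 -*-\nprint('hi')\n")
               encoding, lines = detect_encoding(buf.readline)
               # The cookie is found on the first line, so only that line is consumed.
               assert encoding == 'iso-8859-1'
               assert lines == [b"# -*- coding: latin-1 -*-\n"]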
     390  
     391  
     392  def open(filename):
     393      """Open a file in read only mode using the encoding detected by
     394      detect_encoding().
     395      """
     396      buffer = _builtin_open(filename, 'rb')
     397      try:
     398          encoding, lines = detect_encoding(buffer.readline)
     399          buffer.seek(0)
     400          text = TextIOWrapper(buffer, encoding, line_buffering=True)
     401          text.mode = 'r'
     402          return text
     403      except:
     404          buffer.close()
     405          raise
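
           # Rough usage sketch (the path below is a placeholder):
           #
           #     with open('example_module.py') as f:   # tokenize.open, not builtins.open
           #         text = f.read()                    # decoded with the detected encoding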
     406  
     407  
     408  def tokenize(readline):
     409      """
     410      The tokenize() generator requires one argument, readline, which
     411      must be a callable object which provides the same interface as the
     412      readline() method of built-in file objects.  Each call to the function
     413      should return one line of input as bytes.  Alternatively, readline
     414      can be a callable function terminating with StopIteration:
     415          readline = open(myfile, 'rb').__next__  # Example of alternate readline
     416  
     417      The generator produces 5-tuples with these members: the token type; the
     418      token string; a 2-tuple (srow, scol) of ints specifying the row and
     419      column where the token begins in the source; a 2-tuple (erow, ecol) of
     420      ints specifying the row and column where the token ends in the source;
     421      and the line on which the token was found.  The line passed is the
     422      physical line.
     423  
     424      The first token sequence will always be an ENCODING token
     425      which tells you which encoding was used to decode the bytes stream.
     426      """
     427      encoding, consumed = detect_encoding(readline)
     428      empty = _itertools.repeat(b"")
     429      rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
     430      return _tokenize(rl_gen.__next__, encoding)
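
           # A short sketch of driving tokenize() from a file opened in binary mode,
           # as the docstring above describes (the filename is a placeholder):
           #
           #     with _builtin_open('example.py', 'rb') as f:
           #         for tok in tokenize(f.readline):
           #             print(tok.start, tok_name[tok.type], repr(tok.string))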
     431  
     432  
     433  def _tokenize(readline, encoding):
     434      lnum = parenlev = continued = 0
     435      numchars = '0123456789'
     436      contstr, needcont = '', 0
     437      contline = None
     438      indents = [0]
     439  
     440      if encoding is not None:
     441          if encoding == "utf-8-sig":
     442              # BOM will already have been stripped.
     443              encoding = "utf-8"
     444          yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
     445      last_line = b''
     446      line = b''
     447      while True:                                # loop over lines in stream
     448          try:
     449              # We capture the value of the line variable here because
     450              # readline uses the empty string '' to signal end of input,
     451              # hence `line` itself will always be overwritten at the end
     452              # of this loop.
     453              last_line = line
     454              line = readline()
     455          except StopIteration:
     456              line = b''
     457  
     458          if encoding is not None:
     459              line = line.decode(encoding)
     460          lnum += 1
     461          pos, max = 0, len(line)
     462  
     463          if contstr:                            # continued string
     464              if not line:
     465                  raise TokenError("EOF in multi-line string", strstart)
     466              endmatch = endprog.match(line)
     467              if endmatch:
     468                  pos = end = endmatch.end(0)
     469                  yield TokenInfo(STRING, contstr + line[:end],
     470                         strstart, (lnum, end), contline + line)
     471                  contstr, needcont = '', 0
     472                  contline = None
     473              elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
     474                  yield TokenInfo(ERRORTOKEN, contstr + line,
     475                             strstart, (lnum, len(line)), contline)
     476                  contstr = ''
     477                  contline = None
     478                  continue
     479              else:
     480                  contstr = contstr + line
     481                  contline = contline + line
     482                  continue
     483  
     484          elif parenlev == 0 and not continued:  # new statement
     485              if not line: break
     486              column = 0
     487              while pos < max:                   # measure leading whitespace
     488                  if line[pos] == ' ':
     489                      column += 1
     490                  elif line[pos] == '\t':
     491                      column = (column//tabsize + 1)*tabsize
     492                  elif line[pos] == '\f':
     493                      column = 0
     494                  else:
     495                      break
     496                  pos += 1
     497              if pos == max:
     498                  break
     499  
     500              if line[pos] in '#\r\n':           # skip comments or blank lines
     501                  if line[pos] == '#':
     502                      comment_token = line[pos:].rstrip('\r\n')
     503                      yield TokenInfo(COMMENT, comment_token,
     504                             (lnum, pos), (lnum, pos + len(comment_token)), line)
     505                      pos += len(comment_token)
     506  
     507                  yield TokenInfo(NL, line[pos:],
     508                             (lnum, pos), (lnum, len(line)), line)
     509                  continue
     510  
     511              if column > indents[-1]:           # count indents or dedents
     512                  indents.append(column)
     513                  yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
     514              while column < indents[-1]:
     515                  if column not in indents:
     516                      raise IndentationError(
     517                          "unindent does not match any outer indentation level",
     518                          ("<tokenize>", lnum, pos, line))
     519                  indents = indents[:-1]
     520  
     521                  yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)
     522  
     523          else:                                  # continued statement
     524              if not line:
     525                  raise TokenError("EOF in multi-line statement", (lnum, 0))
     526              continued = 0
     527  
     528          while pos < max:
     529              pseudomatch = _compile(PseudoToken).match(line, pos)
     530              if pseudomatch:                                # scan for tokens
     531                  start, end = pseudomatch.span(1)
     532                  spos, epos, pos = (lnum, start), (lnum, end), end
     533                  if start == end:
     534                      continue
     535                  token, initial = line[start:end], line[start]
     536  
     537                  if (initial in numchars or                 # ordinary number
     538                      (initial == '.' and token != '.' and token != '...')):
     539                      yield TokenInfo(NUMBER, token, spos, epos, line)
     540                  elif initial in '\r\n':
     541                      if parenlev > 0:
     542                          yield TokenInfo(NL, token, spos, epos, line)
     543                      else:
     544                          yield TokenInfo(NEWLINE, token, spos, epos, line)
     545  
     546                  elif initial == '#':
     547                      assert not token.endswith("\n")
     548                      yield TokenInfo(COMMENT, token, spos, epos, line)
     549  
     550                  elif token in triple_quoted:
     551                      endprog = _compile(endpats[token])
     552                      endmatch = endprog.match(line, pos)
     553                      if endmatch:                           # all on one line
     554                          pos = endmatch.end(0)
     555                          token = line[start:pos]
     556                          yield TokenInfo(STRING, token, spos, (lnum, pos), line)
     557                      else:
     558                          strstart = (lnum, start)           # multiple lines
     559                          contstr = line[start:]
     560                          contline = line
     561                          break
     562  
     563                  # Check up to the first 3 chars of the token to see if
     564                  #  they're in the single_quoted set. If so, they start
     565                  #  a string.
     566                  # We're using the first 3, because we're looking for
     567                  #  "rb'" (for example) at the start of the token. If
     568                  #  we switch to longer prefixes, this needs to be
     569                  #  adjusted.
     570                  # Note that initial == token[:1].
     571                  # Also note that single quote checking must come after
     572                  #  triple quote checking (above).
     573                  elif (initial in single_quoted or
     574                        token[:2] in single_quoted or
     575                        token[:3] in single_quoted):
     576                      if token[-1] == '\n':                  # continued string
     577                          strstart = (lnum, start)
     578                          # Again, using the first 3 chars of the
     579                          #  token. This is looking for the matching end
     580                          #  regex for the correct type of quote
     581                          #  character. So it's really looking for
     582                          #  endpats["'"] or endpats['"'], by trying to
     583                          #  skip string prefix characters, if any.
     584                          endprog = _compile(endpats.get(initial) or
     585                                             endpats.get(token[1]) or
     586                                             endpats.get(token[2]))
     587                          contstr, needcont = line[start:], 1
     588                          contline = line
     589                          break
     590                      else:                                  # ordinary string
     591                          yield TokenInfo(STRING, token, spos, epos, line)
     592  
     593                  elif initial.isidentifier():               # ordinary name
     594                      yield TokenInfo(NAME, token, spos, epos, line)
     595                  elif initial == '\\':                      # continued stmt
     596                      continued = 1
     597                  else:
     598                      if initial in '([{':
     599                          parenlev += 1
     600                      elif initial in ')]}':
     601                          parenlev -= 1
     602                      yield TokenInfo(OP, token, spos, epos, line)
     603              else:
     604                  yield TokenInfo(ERRORTOKEN, line[pos],
     605                             (lnum, pos), (lnum, pos+1), line)
     606                  pos += 1
     607  
     608      # Add an implicit NEWLINE if the input doesn't end in one
     609      if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
     610          yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
     611      for indent in indents[1:]:                 # pop remaining indent levels
     612          yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
     613      yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')
     614  
     615  
     616  def generate_tokens(readline):
     617      """Tokenize a source reading Python code as unicode strings.
     618  
     619      This has the same API as tokenize(), except that it expects the *readline*
     620      callable to return str objects instead of bytes.
     621      """
     622      return _tokenize(readline, None)
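
           # A minimal sketch with str-based input (sample source is assumed; the
           # helper is never called by this module):
           def _example_generate_tokens():
               from io import StringIO
               src = "a = 1\nb = a + 2\n"
               # No ENCODING token is produced because no bytes were decoded.
               return [(tok_name[tok.type], tok.string)
                       for tok in generate_tokens(StringIO(src).readline)]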
     623  
     624  def main():
     625      import argparse
     626  
     627      # Helper error handling routines
     628      def perror(message):
     629          sys.stderr.write(message)
     630          sys.stderr.write('\n')
     631  
     632      def error(message, filename=None, location=None):
     633          if location:
     634              args = (filename,) + location + (message,)
     635              perror("%s:%d:%d: error: %s" % args)
     636          elif filename:
     637              perror("%s: error: %s" % (filename, message))
     638          else:
     639              perror("error: %s" % message)
     640          sys.exit(1)
     641  
     642      # Parse the arguments and options
     643      parser = argparse.ArgumentParser(prog='python -m tokenize')
     644      parser.add_argument(dest='filename', nargs='?',
     645                          metavar='filename.py',
     646                          help='the file to tokenize; defaults to stdin')
     647      parser.add_argument('-e', '--exact', dest='exact', action='store_true',
     648                          help='display token names using the exact type')
     649      args = parser.parse_args()
     650  
     651      try:
     652          # Tokenize the input
     653          if args.filename:
     654              filename = args.filename
     655              with _builtin_open(filename, 'rb') as f:
     656                  tokens = list(tokenize(f.readline))
     657          else:
     658              filename = "<stdin>"
     659              tokens = _tokenize(sys.stdin.readline, None)
     660  
     661          # Output the tokenization
     662          for token in tokens:
     663              token_type = token.type
     664              if args.exact:
     665                  token_type = token.exact_type
     666              token_range = "%d,%d-%d,%d:" % (token.start + token.end)
     667              print("%-20s%-15s%-15r" %
     668                    (token_range, tok_name[token_type], token.string))
     669      except IndentationError as err:
     670          line, column = err.args[1][1:3]
     671          error(err.args[0], filename, (line, column))
     672      except TokenError as err:
     673          line, column = err.args[1]
     674          error(err.args[0], filename, (line, column))
     675      except SyntaxError as err:
     676          error(err, filename)
     677      except OSError as err:
     678          error(err)
     679      except KeyboardInterrupt:
     680          print("interrupted\n")
     681      except Exception as err:
     682          perror("unexpected error: %s" % err)
     683          raise
     684  
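           # Example command lines handled by main() above (file names are
           # placeholders):
           #
           #     python -m tokenize example.py          # tokenize a file
           #     python -m tokenize -e example.py       # report exact operator types
           #     echo 'x = 1' | python -m tokenize      # tokenize stdin
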
     685  def _generate_tokens_from_c_tokenizer(source):
     686      """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
     687      import _tokenize as c_tokenizer
     688      for info in c_tokenizer.TokenizerIter(source):
     689          tok, type, lineno, end_lineno, col_off, end_col_off, line = info
     690          yield TokenInfo(type, tok, (lineno, col_off), (end_lineno, end_col_off), line)
     691  
     692  
     693  if __name__ == "__main__":
     694      main()