"""Define partial Python code Parser used by editor and hyperparser.

Instances of ParseMap are used with str.translate.

The following bound search and match functions are defined:
_synchre - start of popular statement;
_junkre - whitespace or comment line;
_match_stringre - string, possibly without closer;
_itemre - line that may have bracket structure start;
_closere - line that must be followed by dedent;
_chew_ordinaryre - non-special characters.
"""
import re

      15  # Reason last statement is continued (or C_NONE if it's not).
      16  (C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
      17   C_STRING_NEXT_LINES, C_BRACKET) = range(5)
      18  
      19  # Find what looks like the start of a popular statement.
      20  
      21  _synchre = re.compile(r"""
      22      ^
      23      [ \t]*
      24      (?: while
      25      |   else
      26      |   def
      27      |   return
      28      |   assert
      29      |   break
      30      |   class
      31      |   continue
      32      |   elif
      33      |   try
      34      |   except
      35      |   raise
      36      |   import
      37      |   yield
      38      )
      39      \b
      40  """, re.VERBOSE | re.MULTILINE).search
      41  
      42  # Match blank line or non-indenting comment line.
      43  
      44  _junkre = re.compile(r"""
      45      [ \t]*
      46      (?: \# \S .* )?
      47      \n
      48  """, re.VERBOSE).match
      49  
      50  # Match any flavor of string; the terminating quote is optional
      51  # so that we're robust in the face of incomplete program text.
      52  
      53  _match_stringre = re.compile(r"""
      54      \""" [^"\\]* (?:
      55                       (?: \\. | "(?!"") )
      56                       [^"\\]*
      57                   )*
      58      (?: \""" )?
      59  
      60  |   " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
      61  
      62  |   ''' [^'\\]* (?:
      63                     (?: \\. | '(?!'') )
      64                     [^'\\]*
      65                  )*
      66      (?: ''' )?
      67  
      68  |   ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
      69  """, re.VERBOSE | re.DOTALL).match
      70  
      71  # Match a line that starts with something interesting;
      72  # used to find the first item of a bracket structure.
      73  
      74  _itemre = re.compile(r"""
      75      [ \t]*
      76      [^\s#\\]    # if we match, m.end()-1 is the interesting char
      77  """, re.VERBOSE).match
      78  
      79  # Match start of statements that should be followed by a dedent.
      80  
      81  _closere = re.compile(r"""
      82      \s*
      83      (?: return
      84      |   break
      85      |   continue
      86      |   raise
      87      |   pass
      88      )
      89      \b
      90  """, re.VERBOSE).match
      91  
      92  # Chew up non-special chars as quickly as possible.  If match is
      93  # successful, m.end() less 1 is the index of the last boring char
      94  # matched.  If match is unsuccessful, the string starts with an
      95  # interesting char.
      96  
      97  _chew_ordinaryre = re.compile(r"""
      98      [^[\](){}#'"\\]+
      99  """, re.VERBOSE).match
     100  
     101  
     102  class ESC[4;38;5;81mParseMap(ESC[4;38;5;149mdict):
     103      r"""Dict subclass that maps anything not in dict to 'x'.
     104  
     105      This is designed to be used with str.translate in study1.
     106      Anything not specifically mapped otherwise becomes 'x'.
     107      Example: replace everything except whitespace with 'x'.
     108  
     109      >>> keepwhite = ParseMap((ord(c), ord(c)) for c in ' \t\n\r')
     110      >>> "a + b\tc\nd".translate(keepwhite)
     111      'x x x\tx\nx'
     112      """
     113      # Calling this triples access time; see bpo-32940
     114      def __missing__(self, key):
     115          return 120  # ord('x')
     116  
     117  
     118  # Map all ascii to 120 to avoid __missing__ call, then replace some.
     119  trans = ParseMap.fromkeys(range(128), 120)
     120  trans.update((ord(c), ord('(')) for c in "({[")  # open brackets => '(';
     121  trans.update((ord(c), ord(')')) for c in ")}]")  # close brackets => ')'.
     122  trans.update((ord(c), ord(c)) for c in "\"'\\\n#")  # Keep these.
     123  
     124  
     125  class ESC[4;38;5;81mParser:
     126  
     127      def __init__(self, indentwidth, tabwidth):
     128          self.indentwidth = indentwidth
     129          self.tabwidth = tabwidth
     130  
     131      def set_code(self, s):
     132          assert len(s) == 0 or s[-1] == '\n'
     133          self.code = s
     134          self.study_level = 0
     135  
     136      def find_good_parse_start(self, is_char_in_string):
     137          """
     138          Return index of a good place to begin parsing, as close to the
     139          end of the string as possible.  This will be the start of some
     140          popular stmt like "if" or "def".  Return None if none found:
     141          the caller should pass more prior context then, if possible, or
     142          if not (the entire program text up until the point of interest
     143          has already been tried) pass 0 to set_lo().
     144  
     145          This will be reliable iff given a reliable is_char_in_string()
     146          function, meaning that when it says "no", it's absolutely
     147          guaranteed that the char is not in a string.
     148          """
     149          code, pos = self.code, None
     150  
     151          # Peek back from the end for a good place to start,
     152          # but don't try too often; pos will be left None, or
     153          # bumped to a legitimate synch point.
     154          limit = len(code)
     155          for tries in range(5):
     156              i = code.rfind(":\n", 0, limit)
     157              if i < 0:
     158                  break
     159              i = code.rfind('\n', 0, i) + 1  # start of colon line (-1+1=0)
     160              m = _synchre(code, i, limit)
     161              if m and not is_char_in_string(m.start()):
     162                  pos = m.start()
     163                  break
     164              limit = i
     165          if pos is None:
     166              # Nothing looks like a block-opener, or stuff does
     167              # but is_char_in_string keeps returning true; most likely
     168              # we're in or near a giant string, the colorizer hasn't
     169              # caught up enough to be helpful, or there simply *aren't*
     170              # any interesting stmts.  In any of these cases we're
     171              # going to have to parse the whole thing to be sure, so
     172              # give it one last try from the start, but stop wasting
     173              # time here regardless of the outcome.
     174              m = _synchre(code)
     175              if m and not is_char_in_string(m.start()):
     176                  pos = m.start()
     177              return pos
     178  
     179          # Peeking back worked; look forward until _synchre no longer
     180          # matches.
     181          i = pos + 1
     182          while m := _synchre(code, i):
     183              s, i = m.span()
     184              if not is_char_in_string(s):
     185                  pos = s
     186          return pos
     187  
     188      def set_lo(self, lo):
     189          """ Throw away the start of the string.
     190  
     191          Intended to be called with the result of find_good_parse_start().
     192          """
     193          assert lo == 0 or self.code[lo-1] == '\n'
     194          if lo > 0:
     195              self.code = self.code[lo:]
     196  
     197      def _study1(self):
     198          """Find the line numbers of non-continuation lines.
     199  
     200          As quickly as humanly possible <wink>, find the line numbers (0-
     201          based) of the non-continuation lines.
     202          Creates self.{goodlines, continuation}.
     203          """
     204          if self.study_level >= 1:
     205              return
     206          self.study_level = 1
     207  
     208          # Map all uninteresting characters to "x", all open brackets
     209          # to "(", all close brackets to ")", then collapse runs of
     210          # uninteresting characters.  This can cut the number of chars
     211          # by a factor of 10-40, and so greatly speed the following loop.
     212          code = self.code
     213          code = code.translate(trans)
     214          code = code.replace('xxxxxxxx', 'x')
     215          code = code.replace('xxxx', 'x')
     216          code = code.replace('xx', 'x')
     217          code = code.replace('xx', 'x')
     218          code = code.replace('\nx', '\n')
     219          # Replacing x\n with \n would be incorrect because
     220          # x may be preceded by a backslash.
     221  
     222          # March over the squashed version of the program, accumulating
     223          # the line numbers of non-continued stmts, and determining
     224          # whether & why the last stmt is a continuation.
     225          continuation = C_NONE
     226          level = lno = 0     # level is nesting level; lno is line number
     227          self.goodlines = goodlines = [0]
     228          push_good = goodlines.append
     229          i, n = 0, len(code)
     230          while i < n:
     231              ch = code[i]
     232              i = i+1
     233  
     234              # cases are checked in decreasing order of frequency
     235              if ch == 'x':
     236                  continue
     237  
     238              if ch == '\n':
     239                  lno = lno + 1
     240                  if level == 0:
     241                      push_good(lno)
     242                      # else we're in an unclosed bracket structure
     243                  continue
     244  
     245              if ch == '(':
     246                  level = level + 1
     247                  continue
     248  
     249              if ch == ')':
     250                  if level:
     251                      level = level - 1
     252                      # else the program is invalid, but we can't complain
     253                  continue
     254  
     255              if ch == '"' or ch == "'":
     256                  # consume the string
     257                  quote = ch
     258                  if code[i-1:i+2] == quote * 3:
     259                      quote = quote * 3
     260                  firstlno = lno
     261                  w = len(quote) - 1
     262                  i = i+w
     263                  while i < n:
     264                      ch = code[i]
     265                      i = i+1
     266  
     267                      if ch == 'x':
     268                          continue
     269  
     270                      if code[i-1:i+w] == quote:
     271                          i = i+w
     272                          break
     273  
     274                      if ch == '\n':
     275                          lno = lno + 1
     276                          if w == 0:
     277                              # unterminated single-quoted string
     278                              if level == 0:
     279                                  push_good(lno)
     280                              break
     281                          continue
     282  
     283                      if ch == '\\':
     284                          assert i < n
     285                          if code[i] == '\n':
     286                              lno = lno + 1
     287                          i = i+1
     288                          continue
     289  
     290                      # else comment char or paren inside string
     291  
     292                  else:
     293                      # didn't break out of the loop, so we're still
     294                      # inside a string
     295                      if (lno - 1) == firstlno:
     296                          # before the previous \n in code, we were in the first
     297                          # line of the string
     298                          continuation = C_STRING_FIRST_LINE
     299                      else:
     300                          continuation = C_STRING_NEXT_LINES
     301                  continue    # with outer loop
     302  
     303              if ch == '#':
     304                  # consume the comment
     305                  i = code.find('\n', i)
     306                  assert i >= 0
     307                  continue
     308  
     309              assert ch == '\\'
     310              assert i < n
     311              if code[i] == '\n':
     312                  lno = lno + 1
     313                  if i+1 == n:
     314                      continuation = C_BACKSLASH
     315              i = i+1
     316  
     317          # The last stmt may be continued for all 3 reasons.
     318          # String continuation takes precedence over bracket
     319          # continuation, which beats backslash continuation.
     320          if (continuation != C_STRING_FIRST_LINE
     321              and continuation != C_STRING_NEXT_LINES and level > 0):
     322              continuation = C_BRACKET
     323          self.continuation = continuation
     324  
     325          # Push the final line number as a sentinel value, regardless of
     326          # whether it's continued.
     327          assert (continuation == C_NONE) == (goodlines[-1] == lno)
     328          if goodlines[-1] != lno:
     329              push_good(lno)
     330  
     331      def get_continuation_type(self):
     332          self._study1()
     333          return self.continuation
     334  
     335      def _study2(self):
     336          """
     337          study1 was sufficient to determine the continuation status,
     338          but doing more requires looking at every character.  study2
     339          does this for the last interesting statement in the block.
     340          Creates:
     341              self.stmt_start, stmt_end
     342                  slice indices of last interesting stmt
     343              self.stmt_bracketing
     344                  the bracketing structure of the last interesting stmt; for
     345                  example, for the statement "say(boo) or die",
     346                  stmt_bracketing will be ((0, 0), (0, 1), (2, 0), (2, 1),
     347                  (4, 0)). Strings and comments are treated as brackets, for
     348                  the matter.
     349              self.lastch
     350                  last interesting character before optional trailing comment
     351              self.lastopenbracketpos
     352                  if continuation is C_BRACKET, index of last open bracket
     353          """
     354          if self.study_level >= 2:
     355              return
     356          self._study1()
     357          self.study_level = 2
     358  
     359          # Set p and q to slice indices of last interesting stmt.
     360          code, goodlines = self.code, self.goodlines
     361          i = len(goodlines) - 1  # Index of newest line.
     362          p = len(code)  # End of goodlines[i]
     363          while i:
     364              assert p
     365              # Make p be the index of the stmt at line number goodlines[i].
     366              # Move p back to the stmt at line number goodlines[i-1].
     367              q = p
     368              for nothing in range(goodlines[i-1], goodlines[i]):
     369                  # tricky: sets p to 0 if no preceding newline
     370                  p = code.rfind('\n', 0, p-1) + 1
     371              # The stmt code[p:q] isn't a continuation, but may be blank
     372              # or a non-indenting comment line.
     373              if  _junkre(code, p):
     374                  i = i-1
     375              else:
     376                  break
     377          if i == 0:
     378              # nothing but junk!
     379              assert p == 0
     380              q = p
     381          self.stmt_start, self.stmt_end = p, q
     382  
     383          # Analyze this stmt, to find the last open bracket (if any)
     384          # and last interesting character (if any).
     385          lastch = ""
     386          stack = []  # stack of open bracket indices
     387          push_stack = stack.append
     388          bracketing = [(p, 0)]
     389          while p < q:
     390              # suck up all except ()[]{}'"#\\
     391              m = _chew_ordinaryre(code, p, q)
     392              if m:
     393                  # we skipped at least one boring char
     394                  newp = m.end()
     395                  # back up over totally boring whitespace
     396                  i = newp - 1    # index of last boring char
     397                  while i >= p and code[i] in " \t\n":
     398                      i = i-1
     399                  if i >= p:
     400                      lastch = code[i]
     401                  p = newp
     402                  if p >= q:
     403                      break
     404  
     405              ch = code[p]
     406  
     407              if ch in "([{":
     408                  push_stack(p)
     409                  bracketing.append((p, len(stack)))
     410                  lastch = ch
     411                  p = p+1
     412                  continue
     413  
     414              if ch in ")]}":
     415                  if stack:
     416                      del stack[-1]
     417                  lastch = ch
     418                  p = p+1
     419                  bracketing.append((p, len(stack)))
     420                  continue
     421  
     422              if ch == '"' or ch == "'":
     423                  # consume string
     424                  # Note that study1 did this with a Python loop, but
     425                  # we use a regexp here; the reason is speed in both
     426                  # cases; the string may be huge, but study1 pre-squashed
     427                  # strings to a couple of characters per line.  study1
     428                  # also needed to keep track of newlines, and we don't
     429                  # have to.
     430                  bracketing.append((p, len(stack)+1))
     431                  lastch = ch
     432                  p = _match_stringre(code, p, q).end()
     433                  bracketing.append((p, len(stack)))
     434                  continue
     435  
     436              if ch == '#':
     437                  # consume comment and trailing newline
     438                  bracketing.append((p, len(stack)+1))
     439                  p = code.find('\n', p, q) + 1
     440                  assert p > 0
     441                  bracketing.append((p, len(stack)))
     442                  continue
     443  
     444              assert ch == '\\'
     445              p = p+1     # beyond backslash
     446              assert p < q
     447              if code[p] != '\n':
     448                  # the program is invalid, but can't complain
     449                  lastch = ch + code[p]
     450              p = p+1     # beyond escaped char
     451  
     452          # end while p < q:
     453  
     454          self.lastch = lastch
     455          self.lastopenbracketpos = stack[-1] if stack else None
     456          self.stmt_bracketing = tuple(bracketing)
     457  
     458      def compute_bracket_indent(self):
     459          """Return number of spaces the next line should be indented.
     460  
     461          Line continuation must be C_BRACKET.
     462          """
     463          self._study2()
     464          assert self.continuation == C_BRACKET
     465          j = self.lastopenbracketpos
     466          code = self.code
     467          n = len(code)
     468          origi = i = code.rfind('\n', 0, j) + 1
     469          j = j+1     # one beyond open bracket
     470          # find first list item; set i to start of its line
     471          while j < n:
     472              m = _itemre(code, j)
     473              if m:
     474                  j = m.end() - 1     # index of first interesting char
     475                  extra = 0
     476                  break
     477              else:
     478                  # this line is junk; advance to next line
     479                  i = j = code.find('\n', j) + 1
     480          else:
     481              # nothing interesting follows the bracket;
     482              # reproduce the bracket line's indentation + a level
     483              j = i = origi
     484              while code[j] in " \t":
     485                  j = j+1
     486              extra = self.indentwidth
     487          return len(code[i:j].expandtabs(self.tabwidth)) + extra
     488  
     489      def get_num_lines_in_stmt(self):
     490          """Return number of physical lines in last stmt.
     491  
     492          The statement doesn't have to be an interesting statement.  This is
     493          intended to be called when continuation is C_BACKSLASH.
     494          """
     495          self._study1()
     496          goodlines = self.goodlines
     497          return goodlines[-1] - goodlines[-2]
     498  
     499      def compute_backslash_indent(self):
     500          """Return number of spaces the next line should be indented.
     501  
     502          Line continuation must be C_BACKSLASH.  Also assume that the new
     503          line is the first one following the initial line of the stmt.
     504          """
     505          self._study2()
     506          assert self.continuation == C_BACKSLASH
     507          code = self.code
     508          i = self.stmt_start
     509          while code[i] in " \t":
     510              i = i+1
     511          startpos = i
     512  
     513          # See whether the initial line starts an assignment stmt; i.e.,
     514          # look for an = operator
     515          endpos = code.find('\n', startpos) + 1
     516          found = level = 0
     517          while i < endpos:
     518              ch = code[i]
     519              if ch in "([{":
     520                  level = level + 1
     521                  i = i+1
     522              elif ch in ")]}":
     523                  if level:
     524                      level = level - 1
     525                  i = i+1
     526              elif ch == '"' or ch == "'":
     527                  i = _match_stringre(code, i, endpos).end()
     528              elif ch == '#':
     529                  # This line is unreachable because the # makes a comment of
     530                  # everything after it.
     531                  break
     532              elif level == 0 and ch == '=' and \
     533                     (i == 0 or code[i-1] not in "=<>!") and \
     534                     code[i+1] != '=':
     535                  found = 1
     536                  break
     537              else:
     538                  i = i+1
     539  
     540          if found:
     541              # found a legit =, but it may be the last interesting
     542              # thing on the line
     543              i = i+1     # move beyond the =
     544              found = re.match(r"\s*\\", code[i:endpos]) is None
     545  
     546          if not found:
     547              # oh well ... settle for moving beyond the first chunk
     548              # of non-whitespace chars
     549              i = startpos
     550              while code[i] not in " \t\n":
     551                  i = i+1
     552  
     553          return len(code[self.stmt_start:i].expandtabs(\
     554                                       self.tabwidth)) + 1
     555  
     556      def get_base_indent_string(self):
     557          """Return the leading whitespace on the initial line of the last
     558          interesting stmt.
     559          """
     560          self._study2()
     561          i, n = self.stmt_start, self.stmt_end
     562          j = i
     563          code = self.code
     564          while j < n and code[j] in " \t":
     565              j = j + 1
     566          return code[i:j]
     567  
     568      def is_block_opener(self):
     569          "Return True if the last interesting statement opens a block."
     570          self._study2()
     571          return self.lastch == ':'
     572  
     573      def is_block_closer(self):
     574          "Return True if the last interesting statement closes a block."
     575          self._study2()
     576          return _closere(self.code, self.stmt_start) is not None
     577  
     578      def get_last_stmt_bracketing(self):
     579          """Return bracketing structure of the last interesting statement.
     580  
     581          The returned tuple is in the format defined in _study2().
     582          """
     583          self._study2()
     584          return self.stmt_bracketing
     585  
     586  
     587  if __name__ == '__main__':
     588      from unittest import main
     589      main('idlelib.idle_test.test_pyparse', verbosity=2)