1  """Provide advanced parsing abilities for ParenMatch and other extensions.
       2  
       3  HyperParser uses PyParser.  PyParser mostly gives information on the
       4  proper indentation of code.  HyperParser gives additional information on
       5  the structure of code.
       6  """
       7  from keyword import iskeyword
       8  import string
       9  
      10  from idlelib import pyparse
      11  
      12  # all ASCII chars that may be in an identifier
      13  _ASCII_ID_CHARS = frozenset(string.ascii_letters + string.digits + "_")
      14  # all ASCII chars that may be the first char of an identifier
      15  _ASCII_ID_FIRST_CHARS = frozenset(string.ascii_letters + "_")
      16  
      17  # lookup table for whether 7-bit ASCII chars are valid in a Python identifier
      18  _IS_ASCII_ID_CHAR = [(chr(x) in _ASCII_ID_CHARS) for x in range(128)]
      19  # lookup table for whether 7-bit ASCII chars are valid as the first
      20  # char in a Python identifier
      21  _IS_ASCII_ID_FIRST_CHAR = \
      22      [(chr(x) in _ASCII_ID_FIRST_CHARS) for x in range(128)]
      23  
      24  
      25  class ESC[4;38;5;81mHyperParser:
      26      def __init__(self, editwin, index):
      27          "To initialize, analyze the surroundings of the given index."
      28  
      29          self.editwin = editwin
      30          self.text = text = editwin.text
      31  
      32          parser = pyparse.Parser(editwin.indentwidth, editwin.tabwidth)
      33  
      34          def index2line(index):
      35              return int(float(index))
      36          lno = index2line(text.index(index))
      37  
      38          if not editwin.prompt_last_line:
      39              for context in editwin.num_context_lines:
      40                  startat = max(lno - context, 1)
      41                  startatindex = repr(startat) + ".0"
      42                  stopatindex = "%d.end" % lno
      43                  # We add the newline because PyParse requires a newline
      44                  # at end. We add a space so that index won't be at end
      45                  # of line, so that its status will be the same as the
      46                  # char before it, if should.
      47                  parser.set_code(text.get(startatindex, stopatindex)+' \n')
      48                  bod = parser.find_good_parse_start(
      49                            editwin._build_char_in_string_func(startatindex))
      50                  if bod is not None or startat == 1:
      51                      break
      52              parser.set_lo(bod or 0)
      53          else:
      54              r = text.tag_prevrange("console", index)
      55              if r:
      56                  startatindex = r[1]
      57              else:
      58                  startatindex = "1.0"
      59              stopatindex = "%d.end" % lno
      60              # We add the newline because PyParse requires it. We add a
      61              # space so that index won't be at end of line, so that its
      62              # status will be the same as the char before it, if should.
      63              parser.set_code(text.get(startatindex, stopatindex)+' \n')
      64              parser.set_lo(0)
      65  
      66          # We want what the parser has, minus the last newline and space.
      67          self.rawtext = parser.code[:-2]
      68          # Parser.code apparently preserves the statement we are in, so
      69          # that stopatindex can be used to synchronize the string with
      70          # the text box indices.
      71          self.stopatindex = stopatindex
      72          self.bracketing = parser.get_last_stmt_bracketing()
      73          # find which pairs of bracketing are openers. These always
      74          # correspond to a character of rawtext.
      75          self.isopener = [i>0 and self.bracketing[i][1] >
      76                           self.bracketing[i-1][1]
      77                           for i in range(len(self.bracketing))]
      78  
      79          self.set_index(index)
      80  
      81      def set_index(self, index):
      82          """Set the index to which the functions relate.
      83  
      84          The index must be in the same statement.
      85          """
      86          indexinrawtext = (len(self.rawtext) -
      87                            len(self.text.get(index, self.stopatindex)))
      88          if indexinrawtext < 0:
      89              raise ValueError("Index %s precedes the analyzed statement"
      90                               % index)
      91          self.indexinrawtext = indexinrawtext
      92          # find the rightmost bracket to which index belongs
      93          self.indexbracket = 0
      94          while (self.indexbracket < len(self.bracketing)-1 and
      95                 self.bracketing[self.indexbracket+1][0] < self.indexinrawtext):
      96              self.indexbracket += 1
      97          if (self.indexbracket < len(self.bracketing)-1 and
      98              self.bracketing[self.indexbracket+1][0] == self.indexinrawtext and
      99             not self.isopener[self.indexbracket+1]):
     100              self.indexbracket += 1
     101  
     102      def is_in_string(self):
     103          """Is the index given to the HyperParser in a string?"""
     104          # The bracket to which we belong should be an opener.
     105          # If it's an opener, it has to have a character.
     106          return (self.isopener[self.indexbracket] and
     107                  self.rawtext[self.bracketing[self.indexbracket][0]]
     108                  in ('"', "'"))
     109  
     110      def is_in_code(self):
     111          """Is the index given to the HyperParser in normal code?"""
     112          return (not self.isopener[self.indexbracket] or
     113                  self.rawtext[self.bracketing[self.indexbracket][0]]
     114                  not in ('#', '"', "'"))
     115  
     116      def get_surrounding_brackets(self, openers='([{', mustclose=False):
     117          """Return bracket indexes or None.
     118  
     119          If the index given to the HyperParser is surrounded by a
     120          bracket defined in openers (or at least has one before it),
     121          return the indices of the opening bracket and the closing
     122          bracket (or the end of line, whichever comes first).
     123  
     124          If it is not surrounded by brackets, or the end of line comes
     125          before the closing bracket and mustclose is True, returns None.
     126          """
     127  
     128          bracketinglevel = self.bracketing[self.indexbracket][1]
     129          before = self.indexbracket
     130          while (not self.isopener[before] or
     131                self.rawtext[self.bracketing[before][0]] not in openers or
     132                self.bracketing[before][1] > bracketinglevel):
     133              before -= 1
     134              if before < 0:
     135                  return None
     136              bracketinglevel = min(bracketinglevel, self.bracketing[before][1])
     137          after = self.indexbracket + 1
     138          while (after < len(self.bracketing) and
     139                self.bracketing[after][1] >= bracketinglevel):
     140              after += 1
     141  
     142          beforeindex = self.text.index("%s-%dc" %
     143              (self.stopatindex, len(self.rawtext)-self.bracketing[before][0]))
     144          if (after >= len(self.bracketing) or
     145             self.bracketing[after][0] > len(self.rawtext)):
     146              if mustclose:
     147                  return None
     148              afterindex = self.stopatindex
     149          else:
     150              # We are after a real char, so it is a ')' and we give the
     151              # index before it.
     152              afterindex = self.text.index(
     153                  "%s-%dc" % (self.stopatindex,
     154                   len(self.rawtext)-(self.bracketing[after][0]-1)))
     155  
     156          return beforeindex, afterindex
     157  
     158      # the set of built-in identifiers which are also keywords,
     159      # i.e. keyword.iskeyword() returns True for them
     160      _ID_KEYWORDS = frozenset({"True", "False", "None"})
     161  
     162      @classmethod
     163      def _eat_identifier(cls, str, limit, pos):
     164          """Given a string and pos, return the number of chars in the
     165          identifier which ends at pos, or 0 if there is no such one.
     166  
     167          This ignores non-identifier eywords are not identifiers.
     168          """
     169          is_ascii_id_char = _IS_ASCII_ID_CHAR
     170  
     171          # Start at the end (pos) and work backwards.
     172          i = pos
     173  
     174          # Go backwards as long as the characters are valid ASCII
     175          # identifier characters. This is an optimization, since it
     176          # is faster in the common case where most of the characters
     177          # are ASCII.
     178          while i > limit and (
     179                  ord(str[i - 1]) < 128 and
     180                  is_ascii_id_char[ord(str[i - 1])]
     181          ):
     182              i -= 1
     183  
     184          # If the above loop ended due to reaching a non-ASCII
     185          # character, continue going backwards using the most generic
     186          # test for whether a string contains only valid identifier
     187          # characters.
     188          if i > limit and ord(str[i - 1]) >= 128:
     189              while i - 4 >= limit and ('a' + str[i - 4:pos]).isidentifier():
     190                  i -= 4
     191              if i - 2 >= limit and ('a' + str[i - 2:pos]).isidentifier():
     192                  i -= 2
     193              if i - 1 >= limit and ('a' + str[i - 1:pos]).isidentifier():
     194                  i -= 1
     195  
     196              # The identifier candidate starts here. If it isn't a valid
     197              # identifier, don't eat anything. At this point that is only
     198              # possible if the first character isn't a valid first
     199              # character for an identifier.
     200              if not str[i:pos].isidentifier():
     201                  return 0
     202          elif i < pos:
     203              # All characters in str[i:pos] are valid ASCII identifier
     204              # characters, so it is enough to check that the first is
     205              # valid as the first character of an identifier.
     206              if not _IS_ASCII_ID_FIRST_CHAR[ord(str[i])]:
     207                  return 0
     208  
     209          # All keywords are valid identifiers, but should not be
     210          # considered identifiers here, except for True, False and None.
     211          if i < pos and (
     212                  iskeyword(str[i:pos]) and
     213                  str[i:pos] not in cls._ID_KEYWORDS
     214          ):
     215              return 0
     216  
     217          return pos - i
     218  
    # The characters that get_expression() skips as whitespace: space,
    # tab, newline and the backslash.
     220      _whitespace_chars = " \t\n\\"
     221  
    def get_expression(self):
        """Return a string with the Python expression which ends at the
        given index, which is empty if there is no real one.

        The expression is found by scanning rawtext backwards from the
        index.  It is made of identifiers joined by dots, where a part
        may be followed by bracketed groups; a bracketed group or a
        string (with its prefix) can also begin the expression.
        Whitespace and comments between the parts are skipped.

        Raises ValueError if the index is not in code, i.e. when
        is_in_code() is false.
        """
        if not self.is_in_code():
            raise ValueError("get_expression should only be called "
                             "if index is inside a code.")

        rawtext = self.rawtext
        bracketing = self.bracketing

        # brck_index is the bracketing entry being scanned and
        # brck_limit is the position in rawtext at which it starts;
        # pos is the current scan position and only moves backwards.
        brck_index = self.indexbracket
        brck_limit = bracketing[brck_index][0]
        pos = self.indexinrawtext

        # Start of the expression found so far.
        last_identifier_pos = pos
        # True when an identifier or a bracketed group may come next
        # (at the start and after a dot); False right after an
        # identifier, when only a dot can extend the expression.
        postdot_phase = True

        while True:
            # Eat whitespaces, comments, and if postdot_phase is False - a dot
            while True:
                if pos>brck_limit and rawtext[pos-1] in self._whitespace_chars:
                    # Eat a whitespace
                    pos -= 1
                elif (not postdot_phase and
                      pos > brck_limit and rawtext[pos-1] == '.'):
                    # Eat a dot
                    pos -= 1
                    postdot_phase = True
                # The next line will fail if we are *inside* a comment,
                # but we shouldn't be.
                elif (pos == brck_limit and brck_index > 0 and
                      rawtext[bracketing[brck_index-1][0]] == '#'):
                    # Eat a comment: skip the comment's entry and
                    # resume at the position where the comment starts.
                    brck_index -= 2
                    brck_limit = bracketing[brck_index][0]
                    pos = bracketing[brck_index+1][0]
                else:
                    # If we didn't eat anything, quit.
                    break

            if not postdot_phase:
                # We didn't find a dot, so the expression ends at the
                # last identifier pos.
                break

            ret = self._eat_identifier(rawtext, brck_limit, pos)
            if ret:
                # There is an identifier to eat
                pos = pos - ret
                last_identifier_pos = pos
                # Now, to continue the search, we must find a dot.
                postdot_phase = False
                # (the loop continues now)

            elif pos == brck_limit:
                # We are at a bracketing limit. If it is a closing
                # bracket, eat the bracket, otherwise, stop the search.
                level = bracketing[brck_index][1]
                # Go back over all entries nested deeper than this one.
                while brck_index > 0 and bracketing[brck_index-1][1] > level:
                    brck_index -= 1
                if bracketing[brck_index][0] == brck_limit:
                    # We were not at the end of a closing bracket
                    break
                # Jump to the opening character of the bracketed group
                # and make the group part of the expression.
                pos = bracketing[brck_index][0]
                brck_index -= 1
                brck_limit = bracketing[brck_index][0]
                last_identifier_pos = pos
                if rawtext[pos] in "([":
                    # [] and () may be used after an identifier, so we
                    # continue. postdot_phase is True, so we don't allow a dot.
                    pass
                else:
                    # We can't continue after other types of brackets
                    if rawtext[pos] in "'\"":
                        # Scan a string prefix, so that it is included
                        # in the expression.
                        # NOTE(review): only the r, b and u prefix
                        # letters are recognized here; 'f'/'F' are not
                        # - confirm whether that is intended.
                        while pos > 0 and rawtext[pos - 1] in "rRbBuU":
                            pos -= 1
                        last_identifier_pos = pos
                    break

            else:
                # We've found an operator or something.
                break

        return rawtext[last_identifier_pos:self.indexinrawtext]
     308  
     309  
     310  if __name__ == '__main__':
     311      from unittest import main
     312      main('idlelib.idle_test.test_hyperparser', verbosity=2)