1  """A parser for HTML and XHTML."""
       2  
       3  # This file is based on sgmllib.py, but the API is slightly different.
       4  
       5  # XXX There should be a way to distinguish between PCDATA (parsed
       6  # character data -- the normal case), RCDATA (replaceable character
       7  # data -- only char and entity references and end tags are special)
       8  # and CDATA (character data -- only end tags are special).
       9  
      10  
      11  import re
      12  import _markupbase
      13  
      14  from html import unescape
      15  
      16  
__all__ = ['HTMLParser']

# Regular expressions used for parsing

# Characters that end a run of plain text when character references are
# reported individually: the start of a reference or the start of markup.
interesting_normal = re.compile('[&<]')
# An '&' followed by the first character of a name or by '#'; goahead()
# uses it to recognise a reference that may still be incomplete.
incomplete = re.compile('&[a-zA-Z#]')

# Named (&name) and numeric (&#NNN / &#xHH) references.  Both patterns also
# consume the single character that follows the reference; goahead() steps
# back over that character unless it is the terminating ';'.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' immediately followed by a letter, i.e. the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# The '>' that closes a processing instruction (see parse_pi()).
piclose = re.compile('>')
# End of a comment.  Not referenced by the code visible in this file;
# presumably kept for backward compatibility -- TODO confirm before removing.
commentclose = re.compile(r'--\s*>')
# Note:
#  1) if you change tagfind/attrfind remember to update locatestarttagend too;
#  2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# Tag name (group 1) plus any whitespace or stray '/' that follows it.
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# One attribute: its name (group 1), the optional '= value' part (group 2)
# and the value itself, still carrying its quotes if it had any (group 3).
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
# A start tag from '<' up to, but not including, whatever terminates it;
# check_for_whole_start_tag() inspects the character after the match.
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
        \s*                          # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
# The '>' that closes an end tag (see parse_endtag()).
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
      59  
      60  
      61  
      62  class ESC[4;38;5;81mHTMLParser(ESC[4;38;5;149m_markupbaseESC[4;38;5;149m.ESC[4;38;5;149mParserBase):
      63      """Find tags and other markup and call handler functions.
      64  
      65      Usage:
      66          p = HTMLParser()
      67          p.feed(data)
      68          ...
      69          p.close()
      70  
      71      Start tags are handled by calling self.handle_starttag() or
      72      self.handle_startendtag(); end tags by self.handle_endtag().  The
      73      data between tags is passed from the parser to the derived class
      74      by calling self.handle_data() with the data as argument (the data
      75      may be split up in arbitrary chunks).  If convert_charrefs is
      76      True the character references are converted automatically to the
      77      corresponding Unicode character (and self.handle_data() is no
      78      longer split in chunks), otherwise they are passed by calling
      79      self.handle_entityref() or self.handle_charref() with the string
      80      containing respectively the named or numeric reference as the
      81      argument.
      82      """
      83  
      84      CDATA_CONTENT_ELEMENTS = ("script", "style")
      85  
    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        The flag is keyword-only and is stored on the instance, where
        goahead() reads it on every pass.
        """
        super().__init__()
        self.convert_charrefs = convert_charrefs
        # reset() creates the parsing state (buffer, last tag, CDATA mode).
        self.reset()
      95  
      96      def reset(self):
      97          """Reset this instance.  Loses all unprocessed data."""
      98          self.rawdata = ''
      99          self.lasttag = '???'
     100          self.interesting = interesting_normal
     101          self.cdata_elem = None
     102          super().reset()
     103  
     104      def feed(self, data):
     105          r"""Feed data to the parser.
     106  
     107          Call this as often as you want, with as little or as much text
     108          as you want (may include '\n').
     109          """
     110          self.rawdata = self.rawdata + data
     111          self.goahead(0)
     112  
     113      def close(self):
     114          """Handle any buffered data."""
     115          self.goahead(1)
     116  
     117      __starttag_text = None
     118  
    def get_starttag_text(self):
        """Return full source of start tag: '<...>'.

        The value is recorded by parse_starttag().  It is None until a
        complete start tag has been parsed, and it is set back to None
        whenever parse_starttag() begins work on a new tag.
        """
        return self.__starttag_text
     122  
     123      def set_cdata_mode(self, elem):
     124          self.cdata_elem = elem.lower()
     125          self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
     126  
     127      def clear_cdata_mode(self):
     128          self.interesting = interesting_normal
     129          self.cdata_elem = None
     130  
     131      # Internal -- handle data as far as reasonable.  May leave state
     132      # and data to be processed by a subsequent call.  If 'end' is
     133      # true, force handling all data as if followed by EOF marker.
     134      def goahead(self, end):
     135          rawdata = self.rawdata
     136          i = 0
     137          n = len(rawdata)
     138          while i < n:
     139              if self.convert_charrefs and not self.cdata_elem:
     140                  j = rawdata.find('<', i)
     141                  if j < 0:
     142                      # if we can't find the next <, either we are at the end
     143                      # or there's more text incoming.  If the latter is True,
     144                      # we can't pass the text to handle_data in case we have
     145                      # a charref cut in half at end.  Try to determine if
     146                      # this is the case before proceeding by looking for an
     147                      # & near the end and see if it's followed by a space or ;.
     148                      amppos = rawdata.rfind('&', max(i, n-34))
     149                      if (amppos >= 0 and
     150                          not re.compile(r'[\s;]').search(rawdata, amppos)):
     151                          break  # wait till we get all the text
     152                      j = n
     153              else:
     154                  match = self.interesting.search(rawdata, i)  # < or &
     155                  if match:
     156                      j = match.start()
     157                  else:
     158                      if self.cdata_elem:
     159                          break
     160                      j = n
     161              if i < j:
     162                  if self.convert_charrefs and not self.cdata_elem:
     163                      self.handle_data(unescape(rawdata[i:j]))
     164                  else:
     165                      self.handle_data(rawdata[i:j])
     166              i = self.updatepos(i, j)
     167              if i == n: break
     168              startswith = rawdata.startswith
     169              if startswith('<', i):
     170                  if starttagopen.match(rawdata, i): # < + letter
     171                      k = self.parse_starttag(i)
     172                  elif startswith("</", i):
     173                      k = self.parse_endtag(i)
     174                  elif startswith("<!--", i):
     175                      k = self.parse_comment(i)
     176                  elif startswith("<?", i):
     177                      k = self.parse_pi(i)
     178                  elif startswith("<!", i):
     179                      k = self.parse_html_declaration(i)
     180                  elif (i + 1) < n:
     181                      self.handle_data("<")
     182                      k = i + 1
     183                  else:
     184                      break
     185                  if k < 0:
     186                      if not end:
     187                          break
     188                      k = rawdata.find('>', i + 1)
     189                      if k < 0:
     190                          k = rawdata.find('<', i + 1)
     191                          if k < 0:
     192                              k = i + 1
     193                      else:
     194                          k += 1
     195                      if self.convert_charrefs and not self.cdata_elem:
     196                          self.handle_data(unescape(rawdata[i:k]))
     197                      else:
     198                          self.handle_data(rawdata[i:k])
     199                  i = self.updatepos(i, k)
     200              elif startswith("&#", i):
     201                  match = charref.match(rawdata, i)
     202                  if match:
     203                      name = match.group()[2:-1]
     204                      self.handle_charref(name)
     205                      k = match.end()
     206                      if not startswith(';', k-1):
     207                          k = k - 1
     208                      i = self.updatepos(i, k)
     209                      continue
     210                  else:
     211                      if ";" in rawdata[i:]:  # bail by consuming &#
     212                          self.handle_data(rawdata[i:i+2])
     213                          i = self.updatepos(i, i+2)
     214                      break
     215              elif startswith('&', i):
     216                  match = entityref.match(rawdata, i)
     217                  if match:
     218                      name = match.group(1)
     219                      self.handle_entityref(name)
     220                      k = match.end()
     221                      if not startswith(';', k-1):
     222                          k = k - 1
     223                      i = self.updatepos(i, k)
     224                      continue
     225                  match = incomplete.match(rawdata, i)
     226                  if match:
     227                      # match.group() will contain at least 2 chars
     228                      if end and match.group() == rawdata[i:]:
     229                          k = match.end()
     230                          if k <= i:
     231                              k = n
     232                          i = self.updatepos(i, i + 1)
     233                      # incomplete
     234                      break
     235                  elif (i + 1) < n:
     236                      # not the end of the buffer, and can't be confused
     237                      # with some other construct
     238                      self.handle_data("&")
     239                      i = self.updatepos(i, i + 1)
     240                  else:
     241                      break
     242              else:
     243                  assert 0, "interesting.search() lied"
     244          # end while
     245          if end and i < n and not self.cdata_elem:
     246              if self.convert_charrefs and not self.cdata_elem:
     247                  self.handle_data(unescape(rawdata[i:n]))
     248              else:
     249                  self.handle_data(rawdata[i:n])
     250              i = self.updatepos(i, n)
     251          self.rawdata = rawdata[i:]
     252  
    # Internal -- parse html declarations, return end or -1 if not terminated
     254      # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
     255      # See also parse_declaration in _markupbase
     256      def parse_html_declaration(self, i):
     257          rawdata = self.rawdata
     258          assert rawdata[i:i+2] == '<!', ('unexpected call to '
     259                                          'parse_html_declaration()')
     260          if rawdata[i:i+4] == '<!--':
     261              # this case is actually already handled in goahead()
     262              return self.parse_comment(i)
     263          elif rawdata[i:i+3] == '<![':
     264              return self.parse_marked_section(i)
     265          elif rawdata[i:i+9].lower() == '<!doctype':
     266              # find the closing >
     267              gtpos = rawdata.find('>', i+9)
     268              if gtpos == -1:
     269                  return -1
     270              self.handle_decl(rawdata[i+2:gtpos])
     271              return gtpos+1
     272          else:
     273              return self.parse_bogus_comment(i)
     274  
    # Internal -- parse bogus comment, return end or -1 if not terminated
     276      # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
     277      def parse_bogus_comment(self, i, report=1):
     278          rawdata = self.rawdata
     279          assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
     280                                                  'parse_comment()')
     281          pos = rawdata.find('>', i+2)
     282          if pos == -1:
     283              return -1
     284          if report:
     285              self.handle_comment(rawdata[i+2:pos])
     286          return pos + 1
     287  
     288      # Internal -- parse processing instr, return end or -1 if not terminated
     289      def parse_pi(self, i):
     290          rawdata = self.rawdata
     291          assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
     292          match = piclose.search(rawdata, i+2) # >
     293          if not match:
     294              return -1
     295          j = match.start()
     296          self.handle_pi(rawdata[i+2: j])
     297          j = match.end()
     298          return j
     299  
     300      # Internal -- handle starttag, return end or -1 if not terminated
     301      def parse_starttag(self, i):
     302          self.__starttag_text = None
     303          endpos = self.check_for_whole_start_tag(i)
     304          if endpos < 0:
     305              return endpos
     306          rawdata = self.rawdata
     307          self.__starttag_text = rawdata[i:endpos]
     308  
     309          # Now parse the data between i+1 and j into a tag and attrs
     310          attrs = []
     311          match = tagfind_tolerant.match(rawdata, i+1)
     312          assert match, 'unexpected call to parse_starttag()'
     313          k = match.end()
     314          self.lasttag = tag = match.group(1).lower()
     315          while k < endpos:
     316              m = attrfind_tolerant.match(rawdata, k)
     317              if not m:
     318                  break
     319              attrname, rest, attrvalue = m.group(1, 2, 3)
     320              if not rest:
     321                  attrvalue = None
     322              elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
     323                   attrvalue[:1] == '"' == attrvalue[-1:]:
     324                  attrvalue = attrvalue[1:-1]
     325              if attrvalue:
     326                  attrvalue = unescape(attrvalue)
     327              attrs.append((attrname.lower(), attrvalue))
     328              k = m.end()
     329  
     330          end = rawdata[k:endpos].strip()
     331          if end not in (">", "/>"):
     332              self.handle_data(rawdata[i:endpos])
     333              return endpos
     334          if end.endswith('/>'):
     335              # XHTML-style empty tag: <span attr="value" />
     336              self.handle_startendtag(tag, attrs)
     337          else:
     338              self.handle_starttag(tag, attrs)
     339              if tag in self.CDATA_CONTENT_ELEMENTS:
     340                  self.set_cdata_mode(tag)
     341          return endpos
     342  
     343      # Internal -- check to see if we have a complete starttag; return end
     344      # or -1 if incomplete.
     345      def check_for_whole_start_tag(self, i):
     346          rawdata = self.rawdata
     347          m = locatestarttagend_tolerant.match(rawdata, i)
     348          if m:
     349              j = m.end()
     350              next = rawdata[j:j+1]
     351              if next == ">":
     352                  return j + 1
     353              if next == "/":
     354                  if rawdata.startswith("/>", j):
     355                      return j + 2
     356                  if rawdata.startswith("/", j):
     357                      # buffer boundary
     358                      return -1
     359                  # else bogus input
     360                  if j > i:
     361                      return j
     362                  else:
     363                      return i + 1
     364              if next == "":
     365                  # end of input
     366                  return -1
     367              if next in ("abcdefghijklmnopqrstuvwxyz=/"
     368                          "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
     369                  # end of input in or before attribute value, or we have the
     370                  # '/' from a '/>' ending
     371                  return -1
     372              if j > i:
     373                  return j
     374              else:
     375                  return i + 1
     376          raise AssertionError("we should not get here!")
     377  
     378      # Internal -- parse endtag, return end or -1 if incomplete
     379      def parse_endtag(self, i):
     380          rawdata = self.rawdata
     381          assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
     382          match = endendtag.search(rawdata, i+1) # >
     383          if not match:
     384              return -1
     385          gtpos = match.end()
     386          match = endtagfind.match(rawdata, i) # </ + tag + >
     387          if not match:
     388              if self.cdata_elem is not None:
     389                  self.handle_data(rawdata[i:gtpos])
     390                  return gtpos
     391              # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
     392              namematch = tagfind_tolerant.match(rawdata, i+2)
     393              if not namematch:
     394                  # w3.org/TR/html5/tokenization.html#end-tag-open-state
     395                  if rawdata[i:i+3] == '</>':
     396                      return i+3
     397                  else:
     398                      return self.parse_bogus_comment(i)
     399              tagname = namematch.group(1).lower()
     400              # consume and ignore other stuff between the name and the >
     401              # Note: this is not 100% correct, since we might have things like
     402              # </tag attr=">">, but looking for > after the name should cover
     403              # most of the cases and is much simpler
     404              gtpos = rawdata.find('>', namematch.end())
     405              self.handle_endtag(tagname)
     406              return gtpos+1
     407  
     408          elem = match.group(1).lower() # script or style
     409          if self.cdata_elem is not None:
     410              if elem != self.cdata_elem:
     411                  self.handle_data(rawdata[i:gtpos])
     412                  return gtpos
     413  
     414          self.handle_endtag(elem)
     415          self.clear_cdata_mode()
     416          return gtpos
     417  
     418      # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle an XHTML-style empty tag such as <br/>.

        The default implementation reports it as a start tag immediately
        followed by the matching end tag.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)
     422  
     423      # Overridable -- handle start tag
     424      def handle_starttag(self, tag, attrs):
     425          pass
     426  
     427      # Overridable -- handle end tag
     428      def handle_endtag(self, tag):
     429          pass
     430  
     431      # Overridable -- handle character reference
     432      def handle_charref(self, name):
     433          pass
     434  
     435      # Overridable -- handle entity reference
     436      def handle_entityref(self, name):
     437          pass
     438  
     439      # Overridable -- handle data
     440      def handle_data(self, data):
     441          pass
     442  
     443      # Overridable -- handle comment
     444      def handle_comment(self, data):
     445          pass
     446  
     447      # Overridable -- handle declaration
     448      def handle_decl(self, decl):
     449          pass
     450  
     451      # Overridable -- handle processing instruction
     452      def handle_pi(self, data):
     453          pass
     454  
     455      def unknown_decl(self, data):
     456          pass