1 """A parser for HTML and XHTML."""
2
3 # This file is based on sgmllib.py, but the API is slightly different.
4
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special).
9
10
11 import re
12 import _markupbase
13
14 from html import unescape
15
16
17 __all__ = ['HTMLParser']
18
# Regular expressions used for parsing

# Characters that end a run of plain text: '<' (markup) or '&' (reference).
interesting_normal = re.compile('[&<]')
# '&' followed by a name character or '#': a reference that may still be
# incomplete at the end of the buffered data.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference, e.g. '&amp;'.  The trailing non-name character marks
# where the reference stops; group(1) is the bare name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric reference, decimal ('&#65') or hex ('&#x41'), terminated by the
# first character that cannot belong to it.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')  # '<' + letter opens a start tag
piclose = re.compile('>')               # terminator of a processing instruction
commentclose = re.compile(r'--\s*>')    # terminator of a comment
# Note:
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#    explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         \s*                         # possibly followed by a space
        )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
  """, re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is treated as CDATA: inside them only the
    # matching end tag is recognized as markup.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

86 def __init__(self, *, convert_charrefs=True):
87 """Initialize and reset this instance.
88
89 If convert_charrefs is True (the default), all character references
90 are automatically converted to the corresponding Unicode characters.
91 """
92 self.convert_charrefs = convert_charrefs
93 self.reset()
94
95 def reset(self):
96 """Reset this instance. Loses all unprocessed data."""
97 self.rawdata = ''
98 self.lasttag = '???'
99 self.interesting = interesting_normal
100 self.cdata_elem = None
101 _markupbase.ParserBase.reset(self)
102
103 def feed(self, data):
104 r"""Feed data to the parser.
105
106 Call this as often as you want, with as little or as much text
107 as you want (may include '\n').
108 """
109 self.rawdata = self.rawdata + data
110 self.goahead(0)
111
112 def close(self):
113 """Handle any buffered data."""
114 self.goahead(1)
115
116 __starttag_text = None
117
118 def get_starttag_text(self):
119 """Return full source of start tag: '<...>'."""
120 return self.__starttag_text
121
122 def set_cdata_mode(self, elem):
123 self.cdata_elem = elem.lower()
124 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
125
126 def clear_cdata_mode(self):
127 self.interesting = interesting_normal
128 self.cdata_elem = None
129
    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume as much of self.rawdata as possible.

        Text runs go to handle_data(); markup constructs are dispatched
        to the parse_* helpers, which return -1 when a construct is not
        yet complete in the buffer.  Possibly-incomplete input is left
        in self.rawdata for the next call unless *end* is true.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    # (34 looks chosen to exceed the longest named character
                    # reference -- TODO confirm against the HTML5 entity list.)
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        break
                    j = n
            if i < j:
                # emit the plain text run between i and the next markup
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # lone '<' followed by something that is not markup
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # parse_* reported an incomplete construct
                    if not end:
                        break
                    # EOF with an unfinished construct: emit best-effort
                    # data up to the next delimiter
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # reference without trailing ';': don't consume
                        # the terminating character
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        k = match.end()
                        if k <= i:
                            k = n
                        i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            # flush the remaining tail.  (The inner cdata_elem test is
            # always true here -- it was just checked above -- but is
            # kept for symmetry with the loop body.)
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

252 # Internal -- parse html declarations, return length or -1 if not terminated
253 # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
254 # See also parse_declaration in _markupbase
255 def parse_html_declaration(self, i):
256 rawdata = self.rawdata
257 assert rawdata[i:i+2] == '<!', ('unexpected call to '
258 'parse_html_declaration()')
259 if rawdata[i:i+4] == '<!--':
260 # this case is actually already handled in goahead()
261 return self.parse_comment(i)
262 elif rawdata[i:i+3] == '<![':
263 return self.parse_marked_section(i)
264 elif rawdata[i:i+9].lower() == '<!doctype':
265 # find the closing >
266 gtpos = rawdata.find('>', i+9)
267 if gtpos == -1:
268 return -1
269 self.handle_decl(rawdata[i+2:gtpos])
270 return gtpos+1
271 else:
272 return self.parse_bogus_comment(i)
273
274 # Internal -- parse bogus comment, return length or -1 if not terminated
275 # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
276 def parse_bogus_comment(self, i, report=1):
277 rawdata = self.rawdata
278 assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
279 'parse_comment()')
280 pos = rawdata.find('>', i+2)
281 if pos == -1:
282 return -1
283 if report:
284 self.handle_comment(rawdata[i+2:pos])
285 return pos + 1
286
287 # Internal -- parse processing instr, return end or -1 if not terminated
288 def parse_pi(self, i):
289 rawdata = self.rawdata
290 assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
291 match = piclose.search(rawdata, i+2) # >
292 if not match:
293 return -1
294 j = match.start()
295 self.handle_pi(rawdata[i+2: j])
296 j = match.end()
297 return j
298
299 # Internal -- handle starttag, return end or -1 if not terminated
300 def parse_starttag(self, i):
301 self.__starttag_text = None
302 endpos = self.check_for_whole_start_tag(i)
303 if endpos < 0:
304 return endpos
305 rawdata = self.rawdata
306 self.__starttag_text = rawdata[i:endpos]
307
308 # Now parse the data between i+1 and j into a tag and attrs
309 attrs = []
310 match = tagfind_tolerant.match(rawdata, i+1)
311 assert match, 'unexpected call to parse_starttag()'
312 k = match.end()
313 self.lasttag = tag = match.group(1).lower()
314 while k < endpos:
315 m = attrfind_tolerant.match(rawdata, k)
316 if not m:
317 break
318 attrname, rest, attrvalue = m.group(1, 2, 3)
319 if not rest:
320 attrvalue = None
321 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
322 attrvalue[:1] == '"' == attrvalue[-1:]:
323 attrvalue = attrvalue[1:-1]
324 if attrvalue:
325 attrvalue = unescape(attrvalue)
326 attrs.append((attrname.lower(), attrvalue))
327 k = m.end()
328
329 end = rawdata[k:endpos].strip()
330 if end not in (">", "/>"):
331 self.handle_data(rawdata[i:endpos])
332 return endpos
333 if end.endswith('/>'):
334 # XHTML-style empty tag: <span attr="value" />
335 self.handle_startendtag(tag, attrs)
336 else:
337 self.handle_starttag(tag, attrs)
338 if tag in self.CDATA_CONTENT_ELEMENTS:
339 self.set_cdata_mode(tag)
340 return endpos
341
342 # Internal -- check to see if we have a complete starttag; return end
343 # or -1 if incomplete.
344 def check_for_whole_start_tag(self, i):
345 rawdata = self.rawdata
346 m = locatestarttagend_tolerant.match(rawdata, i)
347 if m:
348 j = m.end()
349 next = rawdata[j:j+1]
350 if next == ">":
351 return j + 1
352 if next == "/":
353 if rawdata.startswith("/>", j):
354 return j + 2
355 if rawdata.startswith("/", j):
356 # buffer boundary
357 return -1
358 # else bogus input
359 if j > i:
360 return j
361 else:
362 return i + 1
363 if next == "":
364 # end of input
365 return -1
366 if next in ("abcdefghijklmnopqrstuvwxyz=/"
367 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
368 # end of input in or before attribute value, or we have the
369 # '/' from a '/>' ending
370 return -1
371 if j > i:
372 return j
373 else:
374 return i + 1
375 raise AssertionError("we should not get here!")
376
377 # Internal -- parse endtag, return end or -1 if incomplete
378 def parse_endtag(self, i):
379 rawdata = self.rawdata
380 assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
381 match = endendtag.search(rawdata, i+1) # >
382 if not match:
383 return -1
384 gtpos = match.end()
385 match = endtagfind.match(rawdata, i) # </ + tag + >
386 if not match:
387 if self.cdata_elem is not None:
388 self.handle_data(rawdata[i:gtpos])
389 return gtpos
390 # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
391 namematch = tagfind_tolerant.match(rawdata, i+2)
392 if not namematch:
393 # w3.org/TR/html5/tokenization.html#end-tag-open-state
394 if rawdata[i:i+3] == '</>':
395 return i+3
396 else:
397 return self.parse_bogus_comment(i)
398 tagname = namematch.group(1).lower()
399 # consume and ignore other stuff between the name and the >
400 # Note: this is not 100% correct, since we might have things like
401 # </tag attr=">">, but looking for > after the name should cover
402 # most of the cases and is much simpler
403 gtpos = rawdata.find('>', namematch.end())
404 self.handle_endtag(tagname)
405 return gtpos+1
406
407 elem = match.group(1).lower() # script or style
408 if self.cdata_elem is not None:
409 if elem != self.cdata_elem:
410 self.handle_data(rawdata[i:gtpos])
411 return gtpos
412
413 self.handle_endtag(elem)
414 self.clear_cdata_mode()
415 return gtpos
416
417 # Overridable -- finish processing of start+end tag: <tag.../>
418 def handle_startendtag(self, tag, attrs):
419 self.handle_starttag(tag, attrs)
420 self.handle_endtag(tag)
421
422 # Overridable -- handle start tag
423 def handle_starttag(self, tag, attrs):
424 pass
425
426 # Overridable -- handle end tag
427 def handle_endtag(self, tag):
428 pass
429
430 # Overridable -- handle character reference
431 def handle_charref(self, name):
432 pass
433
434 # Overridable -- handle entity reference
435 def handle_entityref(self, name):
436 pass
437
438 # Overridable -- handle data
439 def handle_data(self, data):
440 pass
441
442 # Overridable -- handle comment
443 def handle_comment(self, data):
444 pass
445
446 # Overridable -- handle declaration
447 def handle_decl(self, decl):
448 pass
449
450 # Overridable -- handle processing instruction
451 def handle_pi(self, data):
452 pass
453
454 def unknown_decl(self, data):
455 pass