1  """Shared support for scanning document type declarations in HTML and XHTML.
       2  
       3  This module is used as a foundation for the html.parser module.  It has no
       4  documented public API and should not be used directly.
       5  
       6  """
       7  
       8  import re
       9  
# Bound ``match`` method: matches a declaration name (an ASCII letter
# followed by letters, digits, '-', '_' or '.') plus any trailing whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Bound ``match`` method: matches one single- or double-quoted string
# literal plus any trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match
# Comment terminator: "--", optional whitespace, then ">".
_commentclose = re.compile(r'--\s*>')
# Standard marked-section terminator: "]", "]" and ">" with optional
# whitespace between them.
_markedsectionclose = re.compile(r']\s*]\s*>')

# An analysis of the MS-Word extensions is available at
# http://www.planetpublish.com/xmlarena/xap/Thursday/WordtoXML.pdf

# MS Office style marked-section terminator: "]", optional whitespace, ">".
_msmarkedsectionclose = re.compile(r']\s*>')

# The patterns above are already compiled; remove the ``re`` name from this
# module's namespace.
del re
      21  
      22  
      23  class ESC[4;38;5;81mParserBase:
      24      """Parser base class which provides some common support methods used
      25      by the SGML/HTML and XHTML parsers."""
      26  
      27      def __init__(self):
      28          if self.__class__ is ParserBase:
      29              raise RuntimeError(
      30                  "_markupbase.ParserBase must be subclassed")
      31  
      32      def reset(self):
      33          self.lineno = 1
      34          self.offset = 0
      35  
      36      def getpos(self):
      37          """Return current line number and offset."""
      38          return self.lineno, self.offset
      39  
      40      # Internal -- update line number and offset.  This should be
      41      # called for each piece of data exactly once, in order -- in other
      42      # words the concatenation of all the input strings to this
      43      # function should be exactly the entire input.
      44      def updatepos(self, i, j):
      45          if i >= j:
      46              return j
      47          rawdata = self.rawdata
      48          nlines = rawdata.count("\n", i, j)
      49          if nlines:
      50              self.lineno = self.lineno + nlines
      51              pos = rawdata.rindex("\n", i, j) # Should not fail
      52              self.offset = j-(pos+1)
      53          else:
      54              self.offset = self.offset + j-i
      55          return j
      56  
      57      _decl_otherchars = ''
      58  
      59      # Internal -- parse declaration (for use by subclasses).
      60      def parse_declaration(self, i):
      61          # This is some sort of declaration; in "HTML as
      62          # deployed," this should only be the document type
      63          # declaration ("<!DOCTYPE html...>").
      64          # ISO 8879:1986, however, has more complex
      65          # declaration syntax for elements in <!...>, including:
      66          # --comment--
      67          # [marked section]
      68          # name in the following list: ENTITY, DOCTYPE, ELEMENT,
      69          # ATTLIST, NOTATION, SHORTREF, USEMAP,
      70          # LINKTYPE, LINK, IDLINK, USELINK, SYSTEM
      71          rawdata = self.rawdata
      72          j = i + 2
      73          assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
      74          if rawdata[j:j+1] == ">":
      75              # the empty comment <!>
      76              return j + 1
      77          if rawdata[j:j+1] in ("-", ""):
      78              # Start of comment followed by buffer boundary,
      79              # or just a buffer boundary.
      80              return -1
      81          # A simple, practical version could look like: ((name|stringlit) S*) + '>'
      82          n = len(rawdata)
      83          if rawdata[j:j+2] == '--': #comment
      84              # Locate --.*-- as the body of the comment
      85              return self.parse_comment(i)
      86          elif rawdata[j] == '[': #marked section
      87              # Locate [statusWord [...arbitrary SGML...]] as the body of the marked section
      88              # Where statusWord is one of TEMP, CDATA, IGNORE, INCLUDE, RCDATA
      89              # Note that this is extended by Microsoft Office "Save as Web" function
      90              # to include [if...] and [endif].
      91              return self.parse_marked_section(i)
      92          else: #all other declaration elements
      93              decltype, j = self._scan_name(j, i)
      94          if j < 0:
      95              return j
      96          if decltype == "doctype":
      97              self._decl_otherchars = ''
      98          while j < n:
      99              c = rawdata[j]
     100              if c == ">":
     101                  # end of declaration syntax
     102                  data = rawdata[i+2:j]
     103                  if decltype == "doctype":
     104                      self.handle_decl(data)
     105                  else:
     106                      # According to the HTML5 specs sections "8.2.4.44 Bogus
     107                      # comment state" and "8.2.4.45 Markup declaration open
     108                      # state", a comment token should be emitted.
     109                      # Calling unknown_decl provides more flexibility though.
     110                      self.unknown_decl(data)
     111                  return j + 1
     112              if c in "\"'":
     113                  m = _declstringlit_match(rawdata, j)
     114                  if not m:
     115                      return -1 # incomplete
     116                  j = m.end()
     117              elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
     118                  name, j = self._scan_name(j, i)
     119              elif c in self._decl_otherchars:
     120                  j = j + 1
     121              elif c == "[":
     122                  # this could be handled in a separate doctype parser
     123                  if decltype == "doctype":
     124                      j = self._parse_doctype_subset(j + 1, i)
     125                  elif decltype in {"attlist", "linktype", "link", "element"}:
     126                      # must tolerate []'d groups in a content model in an element declaration
     127                      # also in data attribute specifications of attlist declaration
     128                      # also link type declaration subsets in linktype declarations
     129                      # also link attribute specification lists in link declarations
     130                      raise AssertionError("unsupported '[' char in %s declaration" % decltype)
     131                  else:
     132                      raise AssertionError("unexpected '[' char in declaration")
     133              else:
     134                  raise AssertionError("unexpected %r char in declaration" % rawdata[j])
     135              if j < 0:
     136                  return j
     137          return -1 # incomplete
     138  
     139      # Internal -- parse a marked section
     140      # Override this to handle MS-word extension syntax <![if word]>content<![endif]>
     141      def parse_marked_section(self, i, report=1):
     142          rawdata= self.rawdata
     143          assert rawdata[i:i+3] == '<![', "unexpected call to parse_marked_section()"
     144          sectName, j = self._scan_name( i+3, i )
     145          if j < 0:
     146              return j
     147          if sectName in {"temp", "cdata", "ignore", "include", "rcdata"}:
     148              # look for standard ]]> ending
     149              match= _markedsectionclose.search(rawdata, i+3)
     150          elif sectName in {"if", "else", "endif"}:
     151              # look for MS Office ]> ending
     152              match= _msmarkedsectionclose.search(rawdata, i+3)
     153          else:
     154              raise AssertionError(
     155                  'unknown status keyword %r in marked section' % rawdata[i+3:j]
     156              )
     157          if not match:
     158              return -1
     159          if report:
     160              j = match.start(0)
     161              self.unknown_decl(rawdata[i+3: j])
     162          return match.end(0)
     163  
     164      # Internal -- parse comment, return length or -1 if not terminated
     165      def parse_comment(self, i, report=1):
     166          rawdata = self.rawdata
     167          if rawdata[i:i+4] != '<!--':
     168              raise AssertionError('unexpected call to parse_comment()')
     169          match = _commentclose.search(rawdata, i+4)
     170          if not match:
     171              return -1
     172          if report:
     173              j = match.start(0)
     174              self.handle_comment(rawdata[i+4: j])
     175          return match.end(0)
     176  
     177      # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
     178      # returning the index just past any whitespace following the trailing ']'.
     179      def _parse_doctype_subset(self, i, declstartpos):
     180          rawdata = self.rawdata
     181          n = len(rawdata)
     182          j = i
     183          while j < n:
     184              c = rawdata[j]
     185              if c == "<":
     186                  s = rawdata[j:j+2]
     187                  if s == "<":
     188                      # end of buffer; incomplete
     189                      return -1
     190                  if s != "<!":
     191                      self.updatepos(declstartpos, j + 1)
     192                      raise AssertionError(
     193                          "unexpected char in internal subset (in %r)" % s
     194                      )
     195                  if (j + 2) == n:
     196                      # end of buffer; incomplete
     197                      return -1
     198                  if (j + 4) > n:
     199                      # end of buffer; incomplete
     200                      return -1
     201                  if rawdata[j:j+4] == "<!--":
     202                      j = self.parse_comment(j, report=0)
     203                      if j < 0:
     204                          return j
     205                      continue
     206                  name, j = self._scan_name(j + 2, declstartpos)
     207                  if j == -1:
     208                      return -1
     209                  if name not in {"attlist", "element", "entity", "notation"}:
     210                      self.updatepos(declstartpos, j + 2)
     211                      raise AssertionError(
     212                          "unknown declaration %r in internal subset" % name
     213                      )
     214                  # handle the individual names
     215                  meth = getattr(self, "_parse_doctype_" + name)
     216                  j = meth(j, declstartpos)
     217                  if j < 0:
     218                      return j
     219              elif c == "%":
     220                  # parameter entity reference
     221                  if (j + 1) == n:
     222                      # end of buffer; incomplete
     223                      return -1
     224                  s, j = self._scan_name(j + 1, declstartpos)
     225                  if j < 0:
     226                      return j
     227                  if rawdata[j] == ";":
     228                      j = j + 1
     229              elif c == "]":
     230                  j = j + 1
     231                  while j < n and rawdata[j].isspace():
     232                      j = j + 1
     233                  if j < n:
     234                      if rawdata[j] == ">":
     235                          return j
     236                      self.updatepos(declstartpos, j)
     237                      raise AssertionError("unexpected char after internal subset")
     238                  else:
     239                      return -1
     240              elif c.isspace():
     241                  j = j + 1
     242              else:
     243                  self.updatepos(declstartpos, j)
     244                  raise AssertionError("unexpected char %r in internal subset" % c)
     245          # end of buffer reached
     246          return -1
     247  
     248      # Internal -- scan past <!ELEMENT declarations
     249      def _parse_doctype_element(self, i, declstartpos):
     250          name, j = self._scan_name(i, declstartpos)
     251          if j == -1:
     252              return -1
     253          # style content model; just skip until '>'
     254          rawdata = self.rawdata
     255          if '>' in rawdata[j:]:
     256              return rawdata.find(">", j) + 1
     257          return -1
     258  
     259      # Internal -- scan past <!ATTLIST declarations
     260      def _parse_doctype_attlist(self, i, declstartpos):
     261          rawdata = self.rawdata
     262          name, j = self._scan_name(i, declstartpos)
     263          c = rawdata[j:j+1]
     264          if c == "":
     265              return -1
     266          if c == ">":
     267              return j + 1
     268          while 1:
     269              # scan a series of attribute descriptions; simplified:
     270              #   name type [value] [#constraint]
     271              name, j = self._scan_name(j, declstartpos)
     272              if j < 0:
     273                  return j
     274              c = rawdata[j:j+1]
     275              if c == "":
     276                  return -1
     277              if c == "(":
     278                  # an enumerated type; look for ')'
     279                  if ")" in rawdata[j:]:
     280                      j = rawdata.find(")", j) + 1
     281                  else:
     282                      return -1
     283                  while rawdata[j:j+1].isspace():
     284                      j = j + 1
     285                  if not rawdata[j:]:
     286                      # end of buffer, incomplete
     287                      return -1
     288              else:
     289                  name, j = self._scan_name(j, declstartpos)
     290              c = rawdata[j:j+1]
     291              if not c:
     292                  return -1
     293              if c in "'\"":
     294                  m = _declstringlit_match(rawdata, j)
     295                  if m:
     296                      j = m.end()
     297                  else:
     298                      return -1
     299                  c = rawdata[j:j+1]
     300                  if not c:
     301                      return -1
     302              if c == "#":
     303                  if rawdata[j:] == "#":
     304                      # end of buffer
     305                      return -1
     306                  name, j = self._scan_name(j + 1, declstartpos)
     307                  if j < 0:
     308                      return j
     309                  c = rawdata[j:j+1]
     310                  if not c:
     311                      return -1
     312              if c == '>':
     313                  # all done
     314                  return j + 1
     315  
     316      # Internal -- scan past <!NOTATION declarations
     317      def _parse_doctype_notation(self, i, declstartpos):
     318          name, j = self._scan_name(i, declstartpos)
     319          if j < 0:
     320              return j
     321          rawdata = self.rawdata
     322          while 1:
     323              c = rawdata[j:j+1]
     324              if not c:
     325                  # end of buffer; incomplete
     326                  return -1
     327              if c == '>':
     328                  return j + 1
     329              if c in "'\"":
     330                  m = _declstringlit_match(rawdata, j)
     331                  if not m:
     332                      return -1
     333                  j = m.end()
     334              else:
     335                  name, j = self._scan_name(j, declstartpos)
     336                  if j < 0:
     337                      return j
     338  
     339      # Internal -- scan past <!ENTITY declarations
     340      def _parse_doctype_entity(self, i, declstartpos):
     341          rawdata = self.rawdata
     342          if rawdata[i:i+1] == "%":
     343              j = i + 1
     344              while 1:
     345                  c = rawdata[j:j+1]
     346                  if not c:
     347                      return -1
     348                  if c.isspace():
     349                      j = j + 1
     350                  else:
     351                      break
     352          else:
     353              j = i
     354          name, j = self._scan_name(j, declstartpos)
     355          if j < 0:
     356              return j
     357          while 1:
     358              c = self.rawdata[j:j+1]
     359              if not c:
     360                  return -1
     361              if c in "'\"":
     362                  m = _declstringlit_match(rawdata, j)
     363                  if m:
     364                      j = m.end()
     365                  else:
     366                      return -1    # incomplete
     367              elif c == ">":
     368                  return j + 1
     369              else:
     370                  name, j = self._scan_name(j, declstartpos)
     371                  if j < 0:
     372                      return j
     373  
     374      # Internal -- scan a name token and the new position and the token, or
     375      # return -1 if we've reached the end of the buffer.
     376      def _scan_name(self, i, declstartpos):
     377          rawdata = self.rawdata
     378          n = len(rawdata)
     379          if i == n:
     380              return None, -1
     381          m = _declname_match(rawdata, i)
     382          if m:
     383              s = m.group()
     384              name = s.strip()
     385              if (i + len(s)) == n:
     386                  return None, -1  # end of buffer
     387              return name.lower(), m.end()
     388          else:
     389              self.updatepos(declstartpos, i)
     390              raise AssertionError(
     391                  "expected name token at %r" % rawdata[declstartpos:declstartpos+20]
     392              )
     393  
     394      # To be overridden -- handlers for unknown objects
     395      def unknown_decl(self, data):
     396          pass