1  """Tests for HTMLParser.py."""
       2  
       3  import html.parser
       4  import pprint
       5  import unittest
       6  
       7  from unittest.mock import patch
       8  
       9  
      10  class ESC[4;38;5;81mEventCollector(ESC[4;38;5;149mhtmlESC[4;38;5;149m.ESC[4;38;5;149mparserESC[4;38;5;149m.ESC[4;38;5;149mHTMLParser):
      11  
      12      def __init__(self, *args, **kw):
      13          self.events = []
      14          self.append = self.events.append
      15          html.parser.HTMLParser.__init__(self, *args, **kw)
      16  
      17      def get_events(self):
      18          # Normalize the list of events so that buffer artefacts don't
      19          # separate runs of contiguous characters.
      20          L = []
      21          prevtype = None
      22          for event in self.events:
      23              type = event[0]
      24              if type == prevtype == "data":
      25                  L[-1] = ("data", L[-1][1] + event[1])
      26              else:
      27                  L.append(event)
      28              prevtype = type
      29          self.events = L
      30          return L
      31  
      32      # structure markup
      33  
      34      def handle_starttag(self, tag, attrs):
      35          self.append(("starttag", tag, attrs))
      36  
      37      def handle_startendtag(self, tag, attrs):
      38          self.append(("startendtag", tag, attrs))
      39  
      40      def handle_endtag(self, tag):
      41          self.append(("endtag", tag))
      42  
      43      # all other markup
      44  
      45      def handle_comment(self, data):
      46          self.append(("comment", data))
      47  
      48      def handle_charref(self, data):
      49          self.append(("charref", data))
      50  
      51      def handle_data(self, data):
      52          self.append(("data", data))
      53  
      54      def handle_decl(self, data):
      55          self.append(("decl", data))
      56  
      57      def handle_entityref(self, data):
      58          self.append(("entityref", data))
      59  
      60      def handle_pi(self, data):
      61          self.append(("pi", data))
      62  
      63      def unknown_decl(self, decl):
      64          self.append(("unknown decl", decl))
      65  
      66  
      67  class ESC[4;38;5;81mEventCollectorExtra(ESC[4;38;5;149mEventCollector):
      68  
      69      def handle_starttag(self, tag, attrs):
      70          EventCollector.handle_starttag(self, tag, attrs)
      71          self.append(("starttag_text", self.get_starttag_text()))
      72  
      73  
      74  class ESC[4;38;5;81mEventCollectorCharrefs(ESC[4;38;5;149mEventCollector):
      75  
      76      def handle_charref(self, data):
      77          self.fail('This should never be called with convert_charrefs=True')
      78  
      79      def handle_entityref(self, data):
      80          self.fail('This should never be called with convert_charrefs=True')
      81  
      82  
      83  class ESC[4;38;5;81mTestCaseBase(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
      84  
      85      def get_collector(self):
      86          return EventCollector(convert_charrefs=False)
      87  
      88      def _run_check(self, source, expected_events, collector=None):
      89          if collector is None:
      90              collector = self.get_collector()
      91          parser = collector
      92          for s in source:
      93              parser.feed(s)
      94          parser.close()
      95          events = parser.get_events()
      96          if events != expected_events:
      97              self.fail("received events did not match expected events" +
      98                        "\nSource:\n" + repr(source) +
      99                        "\nExpected:\n" + pprint.pformat(expected_events) +
     100                        "\nReceived:\n" + pprint.pformat(events))
     101  
     102      def _run_check_extra(self, source, events):
     103          self._run_check(source, events,
     104                          EventCollectorExtra(convert_charrefs=False))
     105  
     106  
     107  class ESC[4;38;5;81mHTMLParserTestCase(ESC[4;38;5;149mTestCaseBase):
     108  
     109      def test_processing_instruction_only(self):
     110          self._run_check("<?processing instruction>", [
     111              ("pi", "processing instruction"),
     112              ])
     113          self._run_check("<?processing instruction ?>", [
     114              ("pi", "processing instruction ?"),
     115              ])
     116  
     117      def test_simple_html(self):
     118          self._run_check("""
     119  <!DOCTYPE html PUBLIC 'foo'>
     120  <HTML>&entity;&#32;
     121  <!--comment1a
     122  -></foo><bar>&lt;<?pi?></foo<bar
     123  comment1b-->
     124  <Img sRc='Bar' isMAP>sample
     125  text
     126  &#x201C;
     127  <!--comment2a-- --comment2b-->
     128  </Html>
     129  """, [
     130      ("data", "\n"),
     131      ("decl", "DOCTYPE html PUBLIC 'foo'"),
     132      ("data", "\n"),
     133      ("starttag", "html", []),
     134      ("entityref", "entity"),
     135      ("charref", "32"),
     136      ("data", "\n"),
     137      ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
     138      ("data", "\n"),
     139      ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
     140      ("data", "sample\ntext\n"),
     141      ("charref", "x201C"),
     142      ("data", "\n"),
     143      ("comment", "comment2a-- --comment2b"),
     144      ("data", "\n"),
     145      ("endtag", "html"),
     146      ("data", "\n"),
     147      ])
     148  
     149      def test_malformatted_charref(self):
     150          self._run_check("<p>&#bad;</p>", [
     151              ("starttag", "p", []),
     152              ("data", "&#bad;"),
     153              ("endtag", "p"),
     154          ])
     155          # add the [] as a workaround to avoid buffering (see #20288)
     156          self._run_check(["<div>&#bad;</div>"], [
     157              ("starttag", "div", []),
     158              ("data", "&#bad;"),
     159              ("endtag", "div"),
     160          ])
     161  
     162      def test_unclosed_entityref(self):
     163          self._run_check("&entityref foo", [
     164              ("entityref", "entityref"),
     165              ("data", " foo"),
     166              ])
     167  
     168      def test_bad_nesting(self):
     169          # Strangely, this *is* supposed to test that overlapping
     170          # elements are allowed.  HTMLParser is more geared toward
     171          # lexing the input that parsing the structure.
     172          self._run_check("<a><b></a></b>", [
     173              ("starttag", "a", []),
     174              ("starttag", "b", []),
     175              ("endtag", "a"),
     176              ("endtag", "b"),
     177              ])
     178  
     179      def test_bare_ampersands(self):
     180          self._run_check("this text & contains & ampersands &", [
     181              ("data", "this text & contains & ampersands &"),
     182              ])
     183  
     184      def test_bare_pointy_brackets(self):
     185          self._run_check("this < text > contains < bare>pointy< brackets", [
     186              ("data", "this < text > contains < bare>pointy< brackets"),
     187              ])
     188  
     189      def test_starttag_end_boundary(self):
     190          self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
     191          self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])
     192  
     193      def test_buffer_artefacts(self):
     194          output = [("starttag", "a", [("b", "<")])]
     195          self._run_check(["<a b='<'>"], output)
     196          self._run_check(["<a ", "b='<'>"], output)
     197          self._run_check(["<a b", "='<'>"], output)
     198          self._run_check(["<a b=", "'<'>"], output)
     199          self._run_check(["<a b='<", "'>"], output)
     200          self._run_check(["<a b='<'", ">"], output)
     201  
     202          output = [("starttag", "a", [("b", ">")])]
     203          self._run_check(["<a b='>'>"], output)
     204          self._run_check(["<a ", "b='>'>"], output)
     205          self._run_check(["<a b", "='>'>"], output)
     206          self._run_check(["<a b=", "'>'>"], output)
     207          self._run_check(["<a b='>", "'>"], output)
     208          self._run_check(["<a b='>'", ">"], output)
     209  
     210          output = [("comment", "abc")]
     211          self._run_check(["", "<!--abc-->"], output)
     212          self._run_check(["<", "!--abc-->"], output)
     213          self._run_check(["<!", "--abc-->"], output)
     214          self._run_check(["<!-", "-abc-->"], output)
     215          self._run_check(["<!--", "abc-->"], output)
     216          self._run_check(["<!--a", "bc-->"], output)
     217          self._run_check(["<!--ab", "c-->"], output)
     218          self._run_check(["<!--abc", "-->"], output)
     219          self._run_check(["<!--abc-", "->"], output)
     220          self._run_check(["<!--abc--", ">"], output)
     221          self._run_check(["<!--abc-->", ""], output)
     222  
     223      def test_valid_doctypes(self):
     224          # from http://www.w3.org/QA/2002/04/valid-dtd-list.html
     225          dtds = ['HTML',  # HTML5 doctype
     226                  ('HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
     227                   '"http://www.w3.org/TR/html4/strict.dtd"'),
     228                  ('HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
     229                   '"http://www.w3.org/TR/html4/loose.dtd"'),
     230                  ('html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
     231                   '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"'),
     232                  ('html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" '
     233                   '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"'),
     234                  ('math PUBLIC "-//W3C//DTD MathML 2.0//EN" '
     235                   '"http://www.w3.org/Math/DTD/mathml2/mathml2.dtd"'),
     236                  ('html PUBLIC "-//W3C//DTD '
     237                   'XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN" '
     238                   '"http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd"'),
     239                  ('svg PUBLIC "-//W3C//DTD SVG 1.1//EN" '
     240                   '"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"'),
     241                  'html PUBLIC "-//IETF//DTD HTML 2.0//EN"',
     242                  'html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"']
     243          for dtd in dtds:
     244              self._run_check("<!DOCTYPE %s>" % dtd,
     245                              [('decl', 'DOCTYPE ' + dtd)])
     246  
     247      def test_startendtag(self):
     248          self._run_check("<p/>", [
     249              ("startendtag", "p", []),
     250              ])
     251          self._run_check("<p></p>", [
     252              ("starttag", "p", []),
     253              ("endtag", "p"),
     254              ])
     255          self._run_check("<p><img src='foo' /></p>", [
     256              ("starttag", "p", []),
     257              ("startendtag", "img", [("src", "foo")]),
     258              ("endtag", "p"),
     259              ])
     260  
     261      def test_get_starttag_text(self):
     262          s = """<foo:bar   \n   one="1"\ttwo=2   >"""
     263          self._run_check_extra(s, [
     264              ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
     265              ("starttag_text", s)])
     266  
     267      def test_cdata_content(self):
     268          contents = [
     269              '<!-- not a comment --> &not-an-entity-ref;',
     270              "<not a='start tag'>",
     271              '<a href="" /> <p> <span></span>',
     272              'foo = "</scr" + "ipt>";',
     273              'foo = "</SCRIPT" + ">";',
     274              'foo = <\n/script> ',
     275              '<!-- document.write("</scr" + "ipt>"); -->',
     276              ('\n//<![CDATA[\n'
     277               'document.write(\'<s\'+\'cript type="text/javascript" '
     278               'src="http://www.example.org/r=\'+new '
     279               'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
     280              '\n<!-- //\nvar foo = 3.14;\n// -->\n',
     281              'foo = "</sty" + "le>";',
     282              '<!-- \u2603 -->',
     283              # these two should be invalid according to the HTML 5 spec,
     284              # section 8.1.2.2
     285              #'foo = </\nscript>',
     286              #'foo = </ script>',
     287          ]
     288          elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
     289          for content in contents:
     290              for element in elements:
     291                  element_lower = element.lower()
     292                  s = '<{element}>{content}</{element}>'.format(element=element,
     293                                                                 content=content)
     294                  self._run_check(s, [("starttag", element_lower, []),
     295                                      ("data", content),
     296                                      ("endtag", element_lower)])
     297  
     298      def test_cdata_with_closing_tags(self):
     299          # see issue #13358
     300          # make sure that HTMLParser calls handle_data only once for each CDATA.
     301          # The normal event collector normalizes  the events in get_events,
     302          # so we override it to return the original list of events.
     303          class ESC[4;38;5;81mCollector(ESC[4;38;5;149mEventCollector):
     304              def get_events(self):
     305                  return self.events
     306  
     307          content = """<!-- not a comment --> &not-an-entity-ref;
     308                    <a href="" /> </p><p> <span></span></style>
     309                    '</script' + '>'"""
     310          for element in [' script', 'script ', ' script ',
     311                          '\nscript', 'script\n', '\nscript\n']:
     312              element_lower = element.lower().strip()
     313              s = '<script>{content}</{element}>'.format(element=element,
     314                                                         content=content)
     315              self._run_check(s, [("starttag", element_lower, []),
     316                                  ("data", content),
     317                                  ("endtag", element_lower)],
     318                              collector=Collector(convert_charrefs=False))
     319  
     320      def test_comments(self):
     321          html = ("<!-- I'm a valid comment -->"
     322                  '<!--me too!-->'
     323                  '<!------>'
     324                  '<!---->'
     325                  '<!----I have many hyphens---->'
     326                  '<!-- I have a > in the middle -->'
     327                  '<!-- and I have -- in the middle! -->')
     328          expected = [('comment', " I'm a valid comment "),
     329                      ('comment', 'me too!'),
     330                      ('comment', '--'),
     331                      ('comment', ''),
     332                      ('comment', '--I have many hyphens--'),
     333                      ('comment', ' I have a > in the middle '),
     334                      ('comment', ' and I have -- in the middle! ')]
     335          self._run_check(html, expected)
     336  
     337      def test_condcoms(self):
     338          html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
     339                  '<!--[if IE 8]>condcoms<![endif]-->'
     340                  '<!--[if lte IE 7]>pretty?<![endif]-->')
     341          expected = [('comment', "[if IE & !(lte IE 8)]>aren't<![endif]"),
     342                      ('comment', '[if IE 8]>condcoms<![endif]'),
     343                      ('comment', '[if lte IE 7]>pretty?<![endif]')]
     344          self._run_check(html, expected)
     345  
     346      def test_convert_charrefs(self):
     347          # default value for convert_charrefs is now True
     348          collector = lambda: EventCollectorCharrefs()
     349          self.assertTrue(collector().convert_charrefs)
     350          charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
     351          # check charrefs in the middle of the text/attributes
     352          expected = [('starttag', 'a', [('href', 'foo"zar')]),
     353                      ('data', 'a"z'), ('endtag', 'a')]
     354          for charref in charrefs:
     355              self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
     356                              expected, collector=collector())
     357          # check charrefs at the beginning/end of the text/attributes
     358          expected = [('data', '"'),
     359                      ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
     360                      ('data', '"'), ('endtag', 'a'), ('data', '"')]
     361          for charref in charrefs:
     362              self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
     363                              '{0}</a>{0}'.format(charref),
     364                              expected, collector=collector())
     365          # check charrefs in <script>/<style> elements
     366          for charref in charrefs:
     367              text = 'X'.join([charref]*3)
     368              expected = [('data', '"'),
     369                          ('starttag', 'script', []), ('data', text),
     370                          ('endtag', 'script'), ('data', '"'),
     371                          ('starttag', 'style', []), ('data', text),
     372                          ('endtag', 'style'), ('data', '"')]
     373              self._run_check('{1}<script>{0}</script>{1}'
     374                              '<style>{0}</style>{1}'.format(text, charref),
     375                              expected, collector=collector())
     376          # check truncated charrefs at the end of the file
     377          html = '&quo &# &#x'
     378          for x in range(1, len(html)):
     379              self._run_check(html[:x], [('data', html[:x])],
     380                              collector=collector())
     381          # check a string with no charrefs
     382          self._run_check('no charrefs here', [('data', 'no charrefs here')],
     383                          collector=collector())
     384  
    # the remaining tests were for the "tolerant" parser (which is now
    # the default), and check various kinds of broken markup
     387      def test_tolerant_parsing(self):
     388          self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
     389                          '<img src="URL><//img></html</html>', [
     390                              ('starttag', 'html', [('<html', None)]),
     391                              ('data', 'te>>xt'),
     392                              ('entityref', 'a'),
     393                              ('data', '<'),
     394                              ('starttag', 'bc<', [('a', None)]),
     395                              ('endtag', 'html'),
     396                              ('data', '\n<img src="URL>'),
     397                              ('comment', '/img'),
     398                              ('endtag', 'html<')])
     399  
     400      def test_starttag_junk_chars(self):
     401          self._run_check("</>", [])
     402          self._run_check("</$>", [('comment', '$')])
     403          self._run_check("</", [('data', '</')])
     404          self._run_check("</a", [('data', '</a')])
     405          self._run_check("<a<a>", [('starttag', 'a<a', [])])
     406          self._run_check("</a<a>", [('endtag', 'a<a')])
     407          self._run_check("<!", [('data', '<!')])
     408          self._run_check("<a", [('data', '<a')])
     409          self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
     410          self._run_check("<a foo='bar", [('data', "<a foo='bar")])
     411          self._run_check("<a foo='>'", [('data', "<a foo='>'")])
     412          self._run_check("<a foo='>", [('data', "<a foo='>")])
     413          self._run_check("<a$>", [('starttag', 'a$', [])])
     414          self._run_check("<a$b>", [('starttag', 'a$b', [])])
     415          self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
     416          self._run_check("<a$b  >", [('starttag', 'a$b', [])])
     417          self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
     418  
     419      def test_slashes_in_starttag(self):
     420          self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
     421          html = ('<img width=902 height=250px '
     422                  'src="/sites/default/files/images/homepage/foo.jpg" '
     423                  '/*what am I doing here*/ />')
     424          expected = [(
     425              'startendtag', 'img',
     426              [('width', '902'), ('height', '250px'),
     427               ('src', '/sites/default/files/images/homepage/foo.jpg'),
     428               ('*what', None), ('am', None), ('i', None),
     429               ('doing', None), ('here*', None)]
     430          )]
     431          self._run_check(html, expected)
     432          html = ('<a / /foo/ / /=/ / /bar/ / />'
     433                  '<a / /foo/ / /=/ / /bar/ / >')
     434          expected = [
     435              ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
     436              ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
     437          ]
     438          self._run_check(html, expected)
     439          #see issue #14538
     440          html = ('<meta><meta / ><meta // ><meta / / >'
     441                  '<meta/><meta /><meta //><meta//>')
     442          expected = [
     443              ('starttag', 'meta', []), ('starttag', 'meta', []),
     444              ('starttag', 'meta', []), ('starttag', 'meta', []),
     445              ('startendtag', 'meta', []), ('startendtag', 'meta', []),
     446              ('startendtag', 'meta', []), ('startendtag', 'meta', []),
     447          ]
     448          self._run_check(html, expected)
     449  
     450      def test_declaration_junk_chars(self):
     451          self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
     452  
     453      def test_illegal_declarations(self):
     454          self._run_check('<!spacer type="block" height="25">',
     455                          [('comment', 'spacer type="block" height="25"')])
     456  
     457      def test_invalid_end_tags(self):
     458          # A collection of broken end tags. <br> is used as separator.
     459          # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
     460          # and #13993
     461          html = ('<br></label</p><br></div end tmAd-leaderBoard><br></<h4><br>'
     462                  '</li class="unit"><br></li\r\n\t\t\t\t\t\t</ul><br></><br>')
     463          expected = [('starttag', 'br', []),
     464                      # < is part of the name, / is discarded, p is an attribute
     465                      ('endtag', 'label<'),
     466                      ('starttag', 'br', []),
     467                      # text and attributes are discarded
     468                      ('endtag', 'div'),
     469                      ('starttag', 'br', []),
     470                      # comment because the first char after </ is not a-zA-Z
     471                      ('comment', '<h4'),
     472                      ('starttag', 'br', []),
     473                      # attributes are discarded
     474                      ('endtag', 'li'),
     475                      ('starttag', 'br', []),
     476                      # everything till ul (included) is discarded
     477                      ('endtag', 'li'),
     478                      ('starttag', 'br', []),
     479                      # </> is ignored
     480                      ('starttag', 'br', [])]
     481          self._run_check(html, expected)
     482  
     483      def test_broken_invalid_end_tag(self):
     484          # This is technically wrong (the "> shouldn't be included in the 'data')
     485          # but is probably not worth fixing it (in addition to all the cases of
     486          # the previous test, it would require a full attribute parsing).
     487          # see #13993
     488          html = '<b>This</b attr=">"> confuses the parser'
     489          expected = [('starttag', 'b', []),
     490                      ('data', 'This'),
     491                      ('endtag', 'b'),
     492                      ('data', '"> confuses the parser')]
     493          self._run_check(html, expected)
     494  
     495      def test_correct_detection_of_start_tags(self):
     496          # see #13273
     497          html = ('<div style=""    ><b>The <a href="some_url">rain</a> '
     498                  '<br /> in <span>Spain</span></b></div>')
     499          expected = [
     500              ('starttag', 'div', [('style', '')]),
     501              ('starttag', 'b', []),
     502              ('data', 'The '),
     503              ('starttag', 'a', [('href', 'some_url')]),
     504              ('data', 'rain'),
     505              ('endtag', 'a'),
     506              ('data', ' '),
     507              ('startendtag', 'br', []),
     508              ('data', ' in '),
     509              ('starttag', 'span', []),
     510              ('data', 'Spain'),
     511              ('endtag', 'span'),
     512              ('endtag', 'b'),
     513              ('endtag', 'div')
     514          ]
     515          self._run_check(html, expected)
     516  
     517          html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
     518          expected = [
     519              ('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]),
     520              ('starttag', 'b', []),
     521              ('data', 'The '),
     522              ('starttag', 'a', [('href', 'some_url')]),
     523              ('data', 'rain'),
     524              ('endtag', 'a'),
     525          ]
     526          self._run_check(html, expected)
     527  
     528      def test_EOF_in_charref(self):
     529          # see #17802
     530          # This test checks that the UnboundLocalError reported in the issue
     531          # is not raised, however I'm not sure the returned values are correct.
     532          # Maybe HTMLParser should use self.unescape for these
     533          data = [
     534              ('a&', [('data', 'a&')]),
     535              ('a&b', [('data', 'ab')]),
     536              ('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]),
     537              ('a&b;', [('data', 'a'), ('entityref', 'b')]),
     538          ]
     539          for html, expected in data:
     540              self._run_check(html, expected)
     541  
     542      def test_broken_comments(self):
     543          html = ('<! not really a comment >'
     544                  '<! not a comment either -->'
     545                  '<! -- close enough -->'
     546                  '<!><!<-- this was an empty comment>'
     547                  '<!!! another bogus comment !!!>')
     548          expected = [
     549              ('comment', ' not really a comment '),
     550              ('comment', ' not a comment either --'),
     551              ('comment', ' -- close enough --'),
     552              ('comment', ''),
     553              ('comment', '<-- this was an empty comment'),
     554              ('comment', '!! another bogus comment !!!'),
     555          ]
     556          self._run_check(html, expected)
     557  
     558      def test_broken_condcoms(self):
     559          # these condcoms are missing the '--' after '<!' and before the '>'
     560          html = ('<![if !(IE)]>broken condcom<![endif]>'
     561                  '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
     562                  '<![if !IE 6]><img src="firefox.png" /><![endif]>'
     563                  '<![if !ie 6]><b>foo</b><![endif]>'
     564                  '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
     565          # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
     566          # and "8.2.4.45 Markup declaration open state", comment tokens should
     567          # be emitted instead of 'unknown decl', but calling unknown_decl
     568          # provides more flexibility.
     569          # See also Lib/_markupbase.py:parse_declaration
     570          expected = [
     571              ('unknown decl', 'if !(IE)'),
     572              ('data', 'broken condcom'),
     573              ('unknown decl', 'endif'),
     574              ('unknown decl', 'if ! IE'),
     575              ('startendtag', 'link', [('href', 'favicon.tiff')]),
     576              ('unknown decl', 'endif'),
     577              ('unknown decl', 'if !IE 6'),
     578              ('startendtag', 'img', [('src', 'firefox.png')]),
     579              ('unknown decl', 'endif'),
     580              ('unknown decl', 'if !ie 6'),
     581              ('starttag', 'b', []),
     582              ('data', 'foo'),
     583              ('endtag', 'b'),
     584              ('unknown decl', 'endif'),
     585              ('unknown decl', 'if (!IE)|(lt IE 9)'),
     586              ('startendtag', 'img', [('src', 'mammoth.bmp')]),
     587              ('unknown decl', 'endif')
     588          ]
     589          self._run_check(html, expected)
     590  
     591      def test_convert_charrefs_dropped_text(self):
     592          # #23144: make sure that all the events are triggered when
     593          # convert_charrefs is True, even if we don't call .close()
     594          parser = EventCollector(convert_charrefs=True)
     595          # before the fix, bar & baz was missing
     596          parser.feed("foo <a>link</a> bar &amp; baz")
     597          self.assertEqual(
     598              parser.get_events(),
     599              [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
     600               ('endtag', 'a'), ('data', ' bar & baz')]
     601          )
     602  
     603  
     604  class ESC[4;38;5;81mAttributesTestCase(ESC[4;38;5;149mTestCaseBase):
     605  
     606      def test_attr_syntax(self):
     607          output = [
     608            ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
     609          ]
     610          self._run_check("""<a b='v' c="v" d=v e>""", output)
     611          self._run_check("""<a  b = 'v' c = "v" d = v e>""", output)
     612          self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
     613          self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
     614  
     615      def test_attr_values(self):
     616          self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
     617                          [("starttag", "a", [("b", "xxx\n\txxx"),
     618                                              ("c", "yyy\t\nyyy"),
     619                                              ("d", "\txyz\n")])])
     620          self._run_check("""<a b='' c="">""",
     621                          [("starttag", "a", [("b", ""), ("c", "")])])
     622          # Regression test for SF patch #669683.
     623          self._run_check("<e a=rgb(1,2,3)>",
     624                          [("starttag", "e", [("a", "rgb(1,2,3)")])])
     625          # Regression test for SF bug #921657.
     626          self._run_check(
     627              "<a href=mailto:xyz@example.com>",
     628              [("starttag", "a", [("href", "mailto:xyz@example.com")])])
     629  
     630      def test_attr_nonascii(self):
     631          # see issue 7311
     632          self._run_check(
     633              "<img src=/foo/bar.png alt=\u4e2d\u6587>",
     634              [("starttag", "img", [("src", "/foo/bar.png"),
     635                                    ("alt", "\u4e2d\u6587")])])
     636          self._run_check(
     637              "<a title='\u30c6\u30b9\u30c8' href='\u30c6\u30b9\u30c8.html'>",
     638              [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
     639                                  ("href", "\u30c6\u30b9\u30c8.html")])])
     640          self._run_check(
     641              '<a title="\u30c6\u30b9\u30c8" href="\u30c6\u30b9\u30c8.html">',
     642              [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
     643                                  ("href", "\u30c6\u30b9\u30c8.html")])])
     644  
     645      def test_attr_entity_replacement(self):
     646          self._run_check(
     647              "<a b='&amp;&gt;&lt;&quot;&apos;'>",
     648              [("starttag", "a", [("b", "&><\"'")])])
     649  
     650      def test_attr_funky_names(self):
     651          self._run_check(
     652              "<a a.b='v' c:d=v e-f=v>",
     653              [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
     654  
     655      def test_entityrefs_in_attributes(self):
     656          self._run_check(
     657              "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>",
     658              [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
     659  
     660  
     661      def test_attr_funky_names2(self):
     662          self._run_check(
     663              r"<a $><b $=%><c \=/>",
     664              [("starttag", "a", [("$", None)]),
     665               ("starttag", "b", [("$", "%")]),
     666               ("starttag", "c", [("\\", "/")])])
     667  
     668      def test_entities_in_attribute_value(self):
     669          # see #1200313
     670          for entity in ['&', '&amp;', '&#38;', '&#x26;']:
     671              self._run_check('<a href="%s">' % entity,
     672                              [("starttag", "a", [("href", "&")])])
     673              self._run_check("<a href='%s'>" % entity,
     674                              [("starttag", "a", [("href", "&")])])
     675              self._run_check("<a href=%s>" % entity,
     676                              [("starttag", "a", [("href", "&")])])
     677  
     678      def test_malformed_attributes(self):
     679          # see #13357
     680          html = (
     681              "<a href=test'style='color:red;bad1'>test - bad1</a>"
     682              "<a href=test'+style='color:red;ba2'>test - bad2</a>"
     683              "<a href=test'&nbsp;style='color:red;bad3'>test - bad3</a>"
     684              "<a href = test'&nbsp;style='color:red;bad4'  >test - bad4</a>"
     685          )
     686          expected = [
     687              ('starttag', 'a', [('href', "test'style='color:red;bad1'")]),
     688              ('data', 'test - bad1'), ('endtag', 'a'),
     689              ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]),
     690              ('data', 'test - bad2'), ('endtag', 'a'),
     691              ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]),
     692              ('data', 'test - bad3'), ('endtag', 'a'),
     693              ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]),
     694              ('data', 'test - bad4'), ('endtag', 'a')
     695          ]
     696          self._run_check(html, expected)
     697  
     698      def test_malformed_adjacent_attributes(self):
     699          # see #12629
     700          self._run_check('<x><y z=""o"" /></x>',
     701                          [('starttag', 'x', []),
     702                              ('startendtag', 'y', [('z', ''), ('o""', None)]),
     703                              ('endtag', 'x')])
     704          self._run_check('<x><y z="""" /></x>',
     705                          [('starttag', 'x', []),
     706                              ('startendtag', 'y', [('z', ''), ('""', None)]),
     707                              ('endtag', 'x')])
     708  
     709      # see #755670 for the following 3 tests
     710      def test_adjacent_attributes(self):
     711          self._run_check('<a width="100%"cellspacing=0>',
     712                          [("starttag", "a",
     713                            [("width", "100%"), ("cellspacing","0")])])
     714  
     715          self._run_check('<a id="foo"class="bar">',
     716                          [("starttag", "a",
     717                            [("id", "foo"), ("class","bar")])])
     718  
     719      def test_missing_attribute_value(self):
     720          self._run_check('<a v=>',
     721                          [("starttag", "a", [("v", "")])])
     722  
     723      def test_javascript_attribute_value(self):
     724          self._run_check("<a href=javascript:popup('/popup/help.html')>",
     725                          [("starttag", "a",
     726                            [("href", "javascript:popup('/popup/help.html')")])])
     727  
     728      def test_end_tag_in_attribute_value(self):
     729          # see #1745761
     730          self._run_check("<a href='http://www.example.org/\">;'>spam</a>",
     731                          [("starttag", "a",
     732                            [("href", "http://www.example.org/\">;")]),
     733                           ("data", "spam"), ("endtag", "a")])
     734  
     735      def test_with_unquoted_attributes(self):
     736          # see #12008
     737          html = ("<html><body bgcolor=d0ca90 text='181008'>"
     738                  "<table cellspacing=0 cellpadding=1 width=100% ><tr>"
     739                  "<td align=left><font size=-1>"
     740                  "- <a href=/rabota/><span class=en> software-and-i</span></a>"
     741                  "- <a href='/1/'><span class=en> library</span></a></table>")
     742          expected = [
     743              ('starttag', 'html', []),
     744              ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
     745              ('starttag', 'table',
     746                  [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]),
     747              ('starttag', 'tr', []),
     748              ('starttag', 'td', [('align', 'left')]),
     749              ('starttag', 'font', [('size', '-1')]),
     750              ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]),
     751              ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'),
     752              ('endtag', 'span'), ('endtag', 'a'),
     753              ('data', '- '), ('starttag', 'a', [('href', '/1/')]),
     754              ('starttag', 'span', [('class', 'en')]), ('data', ' library'),
     755              ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table')
     756          ]
     757          self._run_check(html, expected)
     758  
     759      def test_comma_between_attributes(self):
     760          # see bpo 41478
     761          # HTMLParser preserves duplicate attributes, leaving the task of
     762          # removing duplicate attributes to a conformant html tree builder
     763          html = ('<div class=bar,baz=asd>'        # between attrs (unquoted)
     764                  '<div class="bar",baz="asd">'    # between attrs (quoted)
     765                  '<div class=bar, baz=asd,>'      # after values (unquoted)
     766                  '<div class="bar", baz="asd",>'  # after values (quoted)
     767                  '<div class="bar",>'             # one comma values (quoted)
     768                  '<div class=,bar baz=,asd>'      # before values (unquoted)
     769                  '<div class=,"bar" baz=,"asd">'  # before values (quoted)
     770                  '<div ,class=bar ,baz=asd>'      # before names
     771                  '<div class,="bar" baz,="asd">'  # after names
     772          )
     773          expected = [
     774              ('starttag', 'div', [('class', 'bar,baz=asd'),]),
     775              ('starttag', 'div', [('class', 'bar'), (',baz', 'asd')]),
     776              ('starttag', 'div', [('class', 'bar,'), ('baz', 'asd,')]),
     777              ('starttag', 'div', [('class', 'bar'), (',', None),
     778                                   ('baz', 'asd'), (',', None)]),
     779              ('starttag', 'div', [('class', 'bar'), (',', None)]),
     780              ('starttag', 'div', [('class', ',bar'), ('baz', ',asd')]),
     781              ('starttag', 'div', [('class', ',"bar"'), ('baz', ',"asd"')]),
     782              ('starttag', 'div', [(',class', 'bar'), (',baz', 'asd')]),
     783              ('starttag', 'div', [('class,', 'bar'), ('baz,', 'asd')]),
     784          ]
     785          self._run_check(html, expected)
     786  
     787      def test_weird_chars_in_unquoted_attribute_values(self):
     788          self._run_check('<form action=bogus|&#()value>', [
     789                              ('starttag', 'form',
     790                                  [('action', 'bogus|&#()value')])])
     791  
     792  
     793  class ESC[4;38;5;81mTestInheritance(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
     794  
     795      @patch("_markupbase.ParserBase.__init__")
     796      @patch("_markupbase.ParserBase.reset")
     797      def test_base_class_methods_called(self, super_reset_method, super_init_method):
     798          with patch('_markupbase.ParserBase') as parser_base:
     799              EventCollector()
     800              super_init_method.assert_called_once()
     801              super_reset_method.assert_called_once()
     802  
     803  
     804  if __name__ == "__main__":
     805      unittest.main()