1  """Tests for HTMLParser.py."""
       2  
       3  import html.parser
       4  import pprint
       5  import unittest
       6  
       7  
       8  class ESC[4;38;5;81mEventCollector(ESC[4;38;5;149mhtmlESC[4;38;5;149m.ESC[4;38;5;149mparserESC[4;38;5;149m.ESC[4;38;5;149mHTMLParser):
       9  
      10      def __init__(self, *args, **kw):
      11          self.events = []
      12          self.append = self.events.append
      13          html.parser.HTMLParser.__init__(self, *args, **kw)
      14  
      15      def get_events(self):
      16          # Normalize the list of events so that buffer artefacts don't
      17          # separate runs of contiguous characters.
      18          L = []
      19          prevtype = None
      20          for event in self.events:
      21              type = event[0]
      22              if type == prevtype == "data":
      23                  L[-1] = ("data", L[-1][1] + event[1])
      24              else:
      25                  L.append(event)
      26              prevtype = type
      27          self.events = L
      28          return L
      29  
      30      # structure markup
      31  
      32      def handle_starttag(self, tag, attrs):
      33          self.append(("starttag", tag, attrs))
      34  
      35      def handle_startendtag(self, tag, attrs):
      36          self.append(("startendtag", tag, attrs))
      37  
      38      def handle_endtag(self, tag):
      39          self.append(("endtag", tag))
      40  
      41      # all other markup
      42  
      43      def handle_comment(self, data):
      44          self.append(("comment", data))
      45  
      46      def handle_charref(self, data):
      47          self.append(("charref", data))
      48  
      49      def handle_data(self, data):
      50          self.append(("data", data))
      51  
      52      def handle_decl(self, data):
      53          self.append(("decl", data))
      54  
      55      def handle_entityref(self, data):
      56          self.append(("entityref", data))
      57  
      58      def handle_pi(self, data):
      59          self.append(("pi", data))
      60  
      61      def unknown_decl(self, decl):
      62          self.append(("unknown decl", decl))
      63  
      64  
      65  class ESC[4;38;5;81mEventCollectorExtra(ESC[4;38;5;149mEventCollector):
      66  
      67      def handle_starttag(self, tag, attrs):
      68          EventCollector.handle_starttag(self, tag, attrs)
      69          self.append(("starttag_text", self.get_starttag_text()))
      70  
      71  
      72  class ESC[4;38;5;81mEventCollectorCharrefs(ESC[4;38;5;149mEventCollector):
      73  
      74      def handle_charref(self, data):
      75          self.fail('This should never be called with convert_charrefs=True')
      76  
      77      def handle_entityref(self, data):
      78          self.fail('This should never be called with convert_charrefs=True')
      79  
      80  
      81  class ESC[4;38;5;81mTestCaseBase(ESC[4;38;5;149munittestESC[4;38;5;149m.ESC[4;38;5;149mTestCase):
      82  
      83      def get_collector(self):
      84          return EventCollector(convert_charrefs=False)
      85  
      86      def _run_check(self, source, expected_events, collector=None):
      87          if collector is None:
      88              collector = self.get_collector()
      89          parser = collector
      90          for s in source:
      91              parser.feed(s)
      92          parser.close()
      93          events = parser.get_events()
      94          if events != expected_events:
      95              self.fail("received events did not match expected events" +
      96                        "\nSource:\n" + repr(source) +
      97                        "\nExpected:\n" + pprint.pformat(expected_events) +
      98                        "\nReceived:\n" + pprint.pformat(events))
      99  
     100      def _run_check_extra(self, source, events):
     101          self._run_check(source, events,
     102                          EventCollectorExtra(convert_charrefs=False))
     103  
     104  
     105  class ESC[4;38;5;81mHTMLParserTestCase(ESC[4;38;5;149mTestCaseBase):
     106  
     107      def test_processing_instruction_only(self):
     108          self._run_check("<?processing instruction>", [
     109              ("pi", "processing instruction"),
     110              ])
     111          self._run_check("<?processing instruction ?>", [
     112              ("pi", "processing instruction ?"),
     113              ])
     114  
     115      def test_simple_html(self):
     116          self._run_check("""
     117  <!DOCTYPE html PUBLIC 'foo'>
     118  <HTML>&entity;&#32;
     119  <!--comment1a
     120  -></foo><bar>&lt;<?pi?></foo<bar
     121  comment1b-->
     122  <Img sRc='Bar' isMAP>sample
     123  text
     124  &#x201C;
     125  <!--comment2a-- --comment2b-->
     126  </Html>
     127  """, [
     128      ("data", "\n"),
     129      ("decl", "DOCTYPE html PUBLIC 'foo'"),
     130      ("data", "\n"),
     131      ("starttag", "html", []),
     132      ("entityref", "entity"),
     133      ("charref", "32"),
     134      ("data", "\n"),
     135      ("comment", "comment1a\n-></foo><bar>&lt;<?pi?></foo<bar\ncomment1b"),
     136      ("data", "\n"),
     137      ("starttag", "img", [("src", "Bar"), ("ismap", None)]),
     138      ("data", "sample\ntext\n"),
     139      ("charref", "x201C"),
     140      ("data", "\n"),
     141      ("comment", "comment2a-- --comment2b"),
     142      ("data", "\n"),
     143      ("endtag", "html"),
     144      ("data", "\n"),
     145      ])
     146  
     147      def test_malformatted_charref(self):
     148          self._run_check("<p>&#bad;</p>", [
     149              ("starttag", "p", []),
     150              ("data", "&#bad;"),
     151              ("endtag", "p"),
     152          ])
     153          # add the [] as a workaround to avoid buffering (see #20288)
     154          self._run_check(["<div>&#bad;</div>"], [
     155              ("starttag", "div", []),
     156              ("data", "&#bad;"),
     157              ("endtag", "div"),
     158          ])
     159  
     160      def test_unclosed_entityref(self):
     161          self._run_check("&entityref foo", [
     162              ("entityref", "entityref"),
     163              ("data", " foo"),
     164              ])
     165  
     166      def test_bad_nesting(self):
     167          # Strangely, this *is* supposed to test that overlapping
     168          # elements are allowed.  HTMLParser is more geared toward
     169          # lexing the input that parsing the structure.
     170          self._run_check("<a><b></a></b>", [
     171              ("starttag", "a", []),
     172              ("starttag", "b", []),
     173              ("endtag", "a"),
     174              ("endtag", "b"),
     175              ])
     176  
     177      def test_bare_ampersands(self):
     178          self._run_check("this text & contains & ampersands &", [
     179              ("data", "this text & contains & ampersands &"),
     180              ])
     181  
     182      def test_bare_pointy_brackets(self):
     183          self._run_check("this < text > contains < bare>pointy< brackets", [
     184              ("data", "this < text > contains < bare>pointy< brackets"),
     185              ])
     186  
     187      def test_starttag_end_boundary(self):
     188          self._run_check("""<a b='<'>""", [("starttag", "a", [("b", "<")])])
     189          self._run_check("""<a b='>'>""", [("starttag", "a", [("b", ">")])])
     190  
     191      def test_buffer_artefacts(self):
     192          output = [("starttag", "a", [("b", "<")])]
     193          self._run_check(["<a b='<'>"], output)
     194          self._run_check(["<a ", "b='<'>"], output)
     195          self._run_check(["<a b", "='<'>"], output)
     196          self._run_check(["<a b=", "'<'>"], output)
     197          self._run_check(["<a b='<", "'>"], output)
     198          self._run_check(["<a b='<'", ">"], output)
     199  
     200          output = [("starttag", "a", [("b", ">")])]
     201          self._run_check(["<a b='>'>"], output)
     202          self._run_check(["<a ", "b='>'>"], output)
     203          self._run_check(["<a b", "='>'>"], output)
     204          self._run_check(["<a b=", "'>'>"], output)
     205          self._run_check(["<a b='>", "'>"], output)
     206          self._run_check(["<a b='>'", ">"], output)
     207  
     208          output = [("comment", "abc")]
     209          self._run_check(["", "<!--abc-->"], output)
     210          self._run_check(["<", "!--abc-->"], output)
     211          self._run_check(["<!", "--abc-->"], output)
     212          self._run_check(["<!-", "-abc-->"], output)
     213          self._run_check(["<!--", "abc-->"], output)
     214          self._run_check(["<!--a", "bc-->"], output)
     215          self._run_check(["<!--ab", "c-->"], output)
     216          self._run_check(["<!--abc", "-->"], output)
     217          self._run_check(["<!--abc-", "->"], output)
     218          self._run_check(["<!--abc--", ">"], output)
     219          self._run_check(["<!--abc-->", ""], output)
     220  
     221      def test_valid_doctypes(self):
     222          # from http://www.w3.org/QA/2002/04/valid-dtd-list.html
     223          dtds = ['HTML',  # HTML5 doctype
     224                  ('HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '
     225                   '"http://www.w3.org/TR/html4/strict.dtd"'),
     226                  ('HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" '
     227                   '"http://www.w3.org/TR/html4/loose.dtd"'),
     228                  ('html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
     229                   '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"'),
     230                  ('html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN" '
     231                   '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"'),
     232                  ('math PUBLIC "-//W3C//DTD MathML 2.0//EN" '
     233                   '"http://www.w3.org/Math/DTD/mathml2/mathml2.dtd"'),
     234                  ('html PUBLIC "-//W3C//DTD '
     235                   'XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN" '
     236                   '"http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd"'),
     237                  ('svg PUBLIC "-//W3C//DTD SVG 1.1//EN" '
     238                   '"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"'),
     239                  'html PUBLIC "-//IETF//DTD HTML 2.0//EN"',
     240                  'html PUBLIC "-//W3C//DTD HTML 3.2 Final//EN"']
     241          for dtd in dtds:
     242              self._run_check("<!DOCTYPE %s>" % dtd,
     243                              [('decl', 'DOCTYPE ' + dtd)])
     244  
     245      def test_startendtag(self):
     246          self._run_check("<p/>", [
     247              ("startendtag", "p", []),
     248              ])
     249          self._run_check("<p></p>", [
     250              ("starttag", "p", []),
     251              ("endtag", "p"),
     252              ])
     253          self._run_check("<p><img src='foo' /></p>", [
     254              ("starttag", "p", []),
     255              ("startendtag", "img", [("src", "foo")]),
     256              ("endtag", "p"),
     257              ])
     258  
     259      def test_get_starttag_text(self):
     260          s = """<foo:bar   \n   one="1"\ttwo=2   >"""
     261          self._run_check_extra(s, [
     262              ("starttag", "foo:bar", [("one", "1"), ("two", "2")]),
     263              ("starttag_text", s)])
     264  
     265      def test_cdata_content(self):
     266          contents = [
     267              '<!-- not a comment --> &not-an-entity-ref;',
     268              "<not a='start tag'>",
     269              '<a href="" /> <p> <span></span>',
     270              'foo = "</scr" + "ipt>";',
     271              'foo = "</SCRIPT" + ">";',
     272              'foo = <\n/script> ',
     273              '<!-- document.write("</scr" + "ipt>"); -->',
     274              ('\n//<![CDATA[\n'
     275               'document.write(\'<s\'+\'cript type="text/javascript" '
     276               'src="http://www.example.org/r=\'+new '
     277               'Date().getTime()+\'"><\\/s\'+\'cript>\');\n//]]>'),
     278              '\n<!-- //\nvar foo = 3.14;\n// -->\n',
     279              'foo = "</sty" + "le>";',
     280              '<!-- \u2603 -->',
     281              # these two should be invalid according to the HTML 5 spec,
     282              # section 8.1.2.2
     283              #'foo = </\nscript>',
     284              #'foo = </ script>',
     285          ]
     286          elements = ['script', 'style', 'SCRIPT', 'STYLE', 'Script', 'Style']
     287          for content in contents:
     288              for element in elements:
     289                  element_lower = element.lower()
     290                  s = '<{element}>{content}</{element}>'.format(element=element,
     291                                                                 content=content)
     292                  self._run_check(s, [("starttag", element_lower, []),
     293                                      ("data", content),
     294                                      ("endtag", element_lower)])
     295  
     296      def test_cdata_with_closing_tags(self):
     297          # see issue #13358
     298          # make sure that HTMLParser calls handle_data only once for each CDATA.
     299          # The normal event collector normalizes  the events in get_events,
     300          # so we override it to return the original list of events.
     301          class ESC[4;38;5;81mCollector(ESC[4;38;5;149mEventCollector):
     302              def get_events(self):
     303                  return self.events
     304  
     305          content = """<!-- not a comment --> &not-an-entity-ref;
     306                    <a href="" /> </p><p> <span></span></style>
     307                    '</script' + '>'"""
     308          for element in [' script', 'script ', ' script ',
     309                          '\nscript', 'script\n', '\nscript\n']:
     310              element_lower = element.lower().strip()
     311              s = '<script>{content}</{element}>'.format(element=element,
     312                                                         content=content)
     313              self._run_check(s, [("starttag", element_lower, []),
     314                                  ("data", content),
     315                                  ("endtag", element_lower)],
     316                              collector=Collector(convert_charrefs=False))
     317  
     318      def test_comments(self):
     319          html = ("<!-- I'm a valid comment -->"
     320                  '<!--me too!-->'
     321                  '<!------>'
     322                  '<!---->'
     323                  '<!----I have many hyphens---->'
     324                  '<!-- I have a > in the middle -->'
     325                  '<!-- and I have -- in the middle! -->')
     326          expected = [('comment', " I'm a valid comment "),
     327                      ('comment', 'me too!'),
     328                      ('comment', '--'),
     329                      ('comment', ''),
     330                      ('comment', '--I have many hyphens--'),
     331                      ('comment', ' I have a > in the middle '),
     332                      ('comment', ' and I have -- in the middle! ')]
     333          self._run_check(html, expected)
     334  
     335      def test_condcoms(self):
     336          html = ('<!--[if IE & !(lte IE 8)]>aren\'t<![endif]-->'
     337                  '<!--[if IE 8]>condcoms<![endif]-->'
     338                  '<!--[if lte IE 7]>pretty?<![endif]-->')
     339          expected = [('comment', "[if IE & !(lte IE 8)]>aren't<![endif]"),
     340                      ('comment', '[if IE 8]>condcoms<![endif]'),
     341                      ('comment', '[if lte IE 7]>pretty?<![endif]')]
     342          self._run_check(html, expected)
     343  
     344      def test_convert_charrefs(self):
     345          # default value for convert_charrefs is now True
     346          collector = lambda: EventCollectorCharrefs()
     347          self.assertTrue(collector().convert_charrefs)
     348          charrefs = ['&quot;', '&#34;', '&#x22;', '&quot', '&#34', '&#x22']
     349          # check charrefs in the middle of the text/attributes
     350          expected = [('starttag', 'a', [('href', 'foo"zar')]),
     351                      ('data', 'a"z'), ('endtag', 'a')]
     352          for charref in charrefs:
     353              self._run_check('<a href="foo{0}zar">a{0}z</a>'.format(charref),
     354                              expected, collector=collector())
     355          # check charrefs at the beginning/end of the text/attributes
     356          expected = [('data', '"'),
     357                      ('starttag', 'a', [('x', '"'), ('y', '"X'), ('z', 'X"')]),
     358                      ('data', '"'), ('endtag', 'a'), ('data', '"')]
     359          for charref in charrefs:
     360              self._run_check('{0}<a x="{0}" y="{0}X" z="X{0}">'
     361                              '{0}</a>{0}'.format(charref),
     362                              expected, collector=collector())
     363          # check charrefs in <script>/<style> elements
     364          for charref in charrefs:
     365              text = 'X'.join([charref]*3)
     366              expected = [('data', '"'),
     367                          ('starttag', 'script', []), ('data', text),
     368                          ('endtag', 'script'), ('data', '"'),
     369                          ('starttag', 'style', []), ('data', text),
     370                          ('endtag', 'style'), ('data', '"')]
     371              self._run_check('{1}<script>{0}</script>{1}'
     372                              '<style>{0}</style>{1}'.format(text, charref),
     373                              expected, collector=collector())
     374          # check truncated charrefs at the end of the file
     375          html = '&quo &# &#x'
     376          for x in range(1, len(html)):
     377              self._run_check(html[:x], [('data', html[:x])],
     378                              collector=collector())
     379          # check a string with no charrefs
     380          self._run_check('no charrefs here', [('data', 'no charrefs here')],
     381                          collector=collector())
     382  
     383      # the remaining tests were for the "tolerant" parser (which is now
     384      # the default), and check various kind of broken markup
     385      def test_tolerant_parsing(self):
     386          self._run_check('<html <html>te>>xt&a<<bc</a></html>\n'
     387                          '<img src="URL><//img></html</html>', [
     388                              ('starttag', 'html', [('<html', None)]),
     389                              ('data', 'te>>xt'),
     390                              ('entityref', 'a'),
     391                              ('data', '<'),
     392                              ('starttag', 'bc<', [('a', None)]),
     393                              ('endtag', 'html'),
     394                              ('data', '\n<img src="URL>'),
     395                              ('comment', '/img'),
     396                              ('endtag', 'html<')])
     397  
     398      def test_starttag_junk_chars(self):
     399          self._run_check("</>", [])
     400          self._run_check("</$>", [('comment', '$')])
     401          self._run_check("</", [('data', '</')])
     402          self._run_check("</a", [('data', '</a')])
     403          self._run_check("<a<a>", [('starttag', 'a<a', [])])
     404          self._run_check("</a<a>", [('endtag', 'a<a')])
     405          self._run_check("<!", [('data', '<!')])
     406          self._run_check("<a", [('data', '<a')])
     407          self._run_check("<a foo='bar'", [('data', "<a foo='bar'")])
     408          self._run_check("<a foo='bar", [('data', "<a foo='bar")])
     409          self._run_check("<a foo='>'", [('data', "<a foo='>'")])
     410          self._run_check("<a foo='>", [('data', "<a foo='>")])
     411          self._run_check("<a$>", [('starttag', 'a$', [])])
     412          self._run_check("<a$b>", [('starttag', 'a$b', [])])
     413          self._run_check("<a$b/>", [('startendtag', 'a$b', [])])
     414          self._run_check("<a$b  >", [('starttag', 'a$b', [])])
     415          self._run_check("<a$b  />", [('startendtag', 'a$b', [])])
     416  
     417      def test_slashes_in_starttag(self):
     418          self._run_check('<a foo="var"/>', [('startendtag', 'a', [('foo', 'var')])])
     419          html = ('<img width=902 height=250px '
     420                  'src="/sites/default/files/images/homepage/foo.jpg" '
     421                  '/*what am I doing here*/ />')
     422          expected = [(
     423              'startendtag', 'img',
     424              [('width', '902'), ('height', '250px'),
     425               ('src', '/sites/default/files/images/homepage/foo.jpg'),
     426               ('*what', None), ('am', None), ('i', None),
     427               ('doing', None), ('here*', None)]
     428          )]
     429          self._run_check(html, expected)
     430          html = ('<a / /foo/ / /=/ / /bar/ / />'
     431                  '<a / /foo/ / /=/ / /bar/ / >')
     432          expected = [
     433              ('startendtag', 'a', [('foo', None), ('=', None), ('bar', None)]),
     434              ('starttag', 'a', [('foo', None), ('=', None), ('bar', None)])
     435          ]
     436          self._run_check(html, expected)
     437          #see issue #14538
     438          html = ('<meta><meta / ><meta // ><meta / / >'
     439                  '<meta/><meta /><meta //><meta//>')
     440          expected = [
     441              ('starttag', 'meta', []), ('starttag', 'meta', []),
     442              ('starttag', 'meta', []), ('starttag', 'meta', []),
     443              ('startendtag', 'meta', []), ('startendtag', 'meta', []),
     444              ('startendtag', 'meta', []), ('startendtag', 'meta', []),
     445          ]
     446          self._run_check(html, expected)
     447  
     448      def test_declaration_junk_chars(self):
     449          self._run_check("<!DOCTYPE foo $ >", [('decl', 'DOCTYPE foo $ ')])
     450  
     451      def test_illegal_declarations(self):
     452          self._run_check('<!spacer type="block" height="25">',
     453                          [('comment', 'spacer type="block" height="25"')])
     454  
     455      def test_invalid_end_tags(self):
     456          # A collection of broken end tags. <br> is used as separator.
     457          # see http://www.w3.org/TR/html5/tokenization.html#end-tag-open-state
     458          # and #13993
     459          html = ('<br></label</p><br></div end tmAd-leaderBoard><br></<h4><br>'
     460                  '</li class="unit"><br></li\r\n\t\t\t\t\t\t</ul><br></><br>')
     461          expected = [('starttag', 'br', []),
     462                      # < is part of the name, / is discarded, p is an attribute
     463                      ('endtag', 'label<'),
     464                      ('starttag', 'br', []),
     465                      # text and attributes are discarded
     466                      ('endtag', 'div'),
     467                      ('starttag', 'br', []),
     468                      # comment because the first char after </ is not a-zA-Z
     469                      ('comment', '<h4'),
     470                      ('starttag', 'br', []),
     471                      # attributes are discarded
     472                      ('endtag', 'li'),
     473                      ('starttag', 'br', []),
     474                      # everything till ul (included) is discarded
     475                      ('endtag', 'li'),
     476                      ('starttag', 'br', []),
     477                      # </> is ignored
     478                      ('starttag', 'br', [])]
     479          self._run_check(html, expected)
     480  
     481      def test_broken_invalid_end_tag(self):
     482          # This is technically wrong (the "> shouldn't be included in the 'data')
     483          # but is probably not worth fixing it (in addition to all the cases of
     484          # the previous test, it would require a full attribute parsing).
     485          # see #13993
     486          html = '<b>This</b attr=">"> confuses the parser'
     487          expected = [('starttag', 'b', []),
     488                      ('data', 'This'),
     489                      ('endtag', 'b'),
     490                      ('data', '"> confuses the parser')]
     491          self._run_check(html, expected)
     492  
     493      def test_correct_detection_of_start_tags(self):
     494          # see #13273
     495          html = ('<div style=""    ><b>The <a href="some_url">rain</a> '
     496                  '<br /> in <span>Spain</span></b></div>')
     497          expected = [
     498              ('starttag', 'div', [('style', '')]),
     499              ('starttag', 'b', []),
     500              ('data', 'The '),
     501              ('starttag', 'a', [('href', 'some_url')]),
     502              ('data', 'rain'),
     503              ('endtag', 'a'),
     504              ('data', ' '),
     505              ('startendtag', 'br', []),
     506              ('data', ' in '),
     507              ('starttag', 'span', []),
     508              ('data', 'Spain'),
     509              ('endtag', 'span'),
     510              ('endtag', 'b'),
     511              ('endtag', 'div')
     512          ]
     513          self._run_check(html, expected)
     514  
     515          html = '<div style="", foo = "bar" ><b>The <a href="some_url">rain</a>'
     516          expected = [
     517              ('starttag', 'div', [('style', ''), (',', None), ('foo', 'bar')]),
     518              ('starttag', 'b', []),
     519              ('data', 'The '),
     520              ('starttag', 'a', [('href', 'some_url')]),
     521              ('data', 'rain'),
     522              ('endtag', 'a'),
     523          ]
     524          self._run_check(html, expected)
     525  
     526      def test_EOF_in_charref(self):
     527          # see #17802
     528          # This test checks that the UnboundLocalError reported in the issue
     529          # is not raised, however I'm not sure the returned values are correct.
     530          # Maybe HTMLParser should use self.unescape for these
     531          data = [
     532              ('a&', [('data', 'a&')]),
     533              ('a&b', [('data', 'ab')]),
     534              ('a&b ', [('data', 'a'), ('entityref', 'b'), ('data', ' ')]),
     535              ('a&b;', [('data', 'a'), ('entityref', 'b')]),
     536          ]
     537          for html, expected in data:
     538              self._run_check(html, expected)
     539  
     540      def test_broken_comments(self):
     541          html = ('<! not really a comment >'
     542                  '<! not a comment either -->'
     543                  '<! -- close enough -->'
     544                  '<!><!<-- this was an empty comment>'
     545                  '<!!! another bogus comment !!!>')
     546          expected = [
     547              ('comment', ' not really a comment '),
     548              ('comment', ' not a comment either --'),
     549              ('comment', ' -- close enough --'),
     550              ('comment', ''),
     551              ('comment', '<-- this was an empty comment'),
     552              ('comment', '!! another bogus comment !!!'),
     553          ]
     554          self._run_check(html, expected)
     555  
     556      def test_broken_condcoms(self):
     557          # these condcoms are missing the '--' after '<!' and before the '>'
     558          html = ('<![if !(IE)]>broken condcom<![endif]>'
     559                  '<![if ! IE]><link href="favicon.tiff"/><![endif]>'
     560                  '<![if !IE 6]><img src="firefox.png" /><![endif]>'
     561                  '<![if !ie 6]><b>foo</b><![endif]>'
     562                  '<![if (!IE)|(lt IE 9)]><img src="mammoth.bmp" /><![endif]>')
     563          # According to the HTML5 specs sections "8.2.4.44 Bogus comment state"
     564          # and "8.2.4.45 Markup declaration open state", comment tokens should
     565          # be emitted instead of 'unknown decl', but calling unknown_decl
     566          # provides more flexibility.
     567          # See also Lib/_markupbase.py:parse_declaration
     568          expected = [
     569              ('unknown decl', 'if !(IE)'),
     570              ('data', 'broken condcom'),
     571              ('unknown decl', 'endif'),
     572              ('unknown decl', 'if ! IE'),
     573              ('startendtag', 'link', [('href', 'favicon.tiff')]),
     574              ('unknown decl', 'endif'),
     575              ('unknown decl', 'if !IE 6'),
     576              ('startendtag', 'img', [('src', 'firefox.png')]),
     577              ('unknown decl', 'endif'),
     578              ('unknown decl', 'if !ie 6'),
     579              ('starttag', 'b', []),
     580              ('data', 'foo'),
     581              ('endtag', 'b'),
     582              ('unknown decl', 'endif'),
     583              ('unknown decl', 'if (!IE)|(lt IE 9)'),
     584              ('startendtag', 'img', [('src', 'mammoth.bmp')]),
     585              ('unknown decl', 'endif')
     586          ]
     587          self._run_check(html, expected)
     588  
     589      def test_convert_charrefs_dropped_text(self):
     590          # #23144: make sure that all the events are triggered when
     591          # convert_charrefs is True, even if we don't call .close()
     592          parser = EventCollector(convert_charrefs=True)
     593          # before the fix, bar & baz was missing
     594          parser.feed("foo <a>link</a> bar &amp; baz")
     595          self.assertEqual(
     596              parser.get_events(),
     597              [('data', 'foo '), ('starttag', 'a', []), ('data', 'link'),
     598               ('endtag', 'a'), ('data', ' bar & baz')]
     599          )
     600  
     601  
     602  class ESC[4;38;5;81mAttributesTestCase(ESC[4;38;5;149mTestCaseBase):
     603  
     604      def test_attr_syntax(self):
     605          output = [
     606            ("starttag", "a", [("b", "v"), ("c", "v"), ("d", "v"), ("e", None)])
     607          ]
     608          self._run_check("""<a b='v' c="v" d=v e>""", output)
     609          self._run_check("""<a  b = 'v' c = "v" d = v e>""", output)
     610          self._run_check("""<a\nb\n=\n'v'\nc\n=\n"v"\nd\n=\nv\ne>""", output)
     611          self._run_check("""<a\tb\t=\t'v'\tc\t=\t"v"\td\t=\tv\te>""", output)
     612  
     613      def test_attr_values(self):
     614          self._run_check("""<a b='xxx\n\txxx' c="yyy\t\nyyy" d='\txyz\n'>""",
     615                          [("starttag", "a", [("b", "xxx\n\txxx"),
     616                                              ("c", "yyy\t\nyyy"),
     617                                              ("d", "\txyz\n")])])
     618          self._run_check("""<a b='' c="">""",
     619                          [("starttag", "a", [("b", ""), ("c", "")])])
     620          # Regression test for SF patch #669683.
     621          self._run_check("<e a=rgb(1,2,3)>",
     622                          [("starttag", "e", [("a", "rgb(1,2,3)")])])
     623          # Regression test for SF bug #921657.
     624          self._run_check(
     625              "<a href=mailto:xyz@example.com>",
     626              [("starttag", "a", [("href", "mailto:xyz@example.com")])])
     627  
     628      def test_attr_nonascii(self):
     629          # see issue 7311
     630          self._run_check(
     631              "<img src=/foo/bar.png alt=\u4e2d\u6587>",
     632              [("starttag", "img", [("src", "/foo/bar.png"),
     633                                    ("alt", "\u4e2d\u6587")])])
     634          self._run_check(
     635              "<a title='\u30c6\u30b9\u30c8' href='\u30c6\u30b9\u30c8.html'>",
     636              [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
     637                                  ("href", "\u30c6\u30b9\u30c8.html")])])
     638          self._run_check(
     639              '<a title="\u30c6\u30b9\u30c8" href="\u30c6\u30b9\u30c8.html">',
     640              [("starttag", "a", [("title", "\u30c6\u30b9\u30c8"),
     641                                  ("href", "\u30c6\u30b9\u30c8.html")])])
     642  
     643      def test_attr_entity_replacement(self):
     644          self._run_check(
     645              "<a b='&amp;&gt;&lt;&quot;&apos;'>",
     646              [("starttag", "a", [("b", "&><\"'")])])
     647  
     648      def test_attr_funky_names(self):
     649          self._run_check(
     650              "<a a.b='v' c:d=v e-f=v>",
     651              [("starttag", "a", [("a.b", "v"), ("c:d", "v"), ("e-f", "v")])])
     652  
     653      def test_entityrefs_in_attributes(self):
     654          self._run_check(
     655              "<html foo='&euro;&amp;&#97;&#x61;&unsupported;'>",
     656              [("starttag", "html", [("foo", "\u20AC&aa&unsupported;")])])
     657  
     658  
     659      def test_attr_funky_names2(self):
     660          self._run_check(
     661              r"<a $><b $=%><c \=/>",
     662              [("starttag", "a", [("$", None)]),
     663               ("starttag", "b", [("$", "%")]),
     664               ("starttag", "c", [("\\", "/")])])
     665  
     666      def test_entities_in_attribute_value(self):
     667          # see #1200313
     668          for entity in ['&', '&amp;', '&#38;', '&#x26;']:
     669              self._run_check('<a href="%s">' % entity,
     670                              [("starttag", "a", [("href", "&")])])
     671              self._run_check("<a href='%s'>" % entity,
     672                              [("starttag", "a", [("href", "&")])])
     673              self._run_check("<a href=%s>" % entity,
     674                              [("starttag", "a", [("href", "&")])])
     675  
     676      def test_malformed_attributes(self):
     677          # see #13357
     678          html = (
     679              "<a href=test'style='color:red;bad1'>test - bad1</a>"
     680              "<a href=test'+style='color:red;ba2'>test - bad2</a>"
     681              "<a href=test'&nbsp;style='color:red;bad3'>test - bad3</a>"
     682              "<a href = test'&nbsp;style='color:red;bad4'  >test - bad4</a>"
     683          )
     684          expected = [
     685              ('starttag', 'a', [('href', "test'style='color:red;bad1'")]),
     686              ('data', 'test - bad1'), ('endtag', 'a'),
     687              ('starttag', 'a', [('href', "test'+style='color:red;ba2'")]),
     688              ('data', 'test - bad2'), ('endtag', 'a'),
     689              ('starttag', 'a', [('href', "test'\xa0style='color:red;bad3'")]),
     690              ('data', 'test - bad3'), ('endtag', 'a'),
     691              ('starttag', 'a', [('href', "test'\xa0style='color:red;bad4'")]),
     692              ('data', 'test - bad4'), ('endtag', 'a')
     693          ]
     694          self._run_check(html, expected)
     695  
     696      def test_malformed_adjacent_attributes(self):
     697          # see #12629
     698          self._run_check('<x><y z=""o"" /></x>',
     699                          [('starttag', 'x', []),
     700                              ('startendtag', 'y', [('z', ''), ('o""', None)]),
     701                              ('endtag', 'x')])
     702          self._run_check('<x><y z="""" /></x>',
     703                          [('starttag', 'x', []),
     704                              ('startendtag', 'y', [('z', ''), ('""', None)]),
     705                              ('endtag', 'x')])
     706  
     707      # see #755670 for the following 3 tests
     708      def test_adjacent_attributes(self):
     709          self._run_check('<a width="100%"cellspacing=0>',
     710                          [("starttag", "a",
     711                            [("width", "100%"), ("cellspacing","0")])])
     712  
     713          self._run_check('<a id="foo"class="bar">',
     714                          [("starttag", "a",
     715                            [("id", "foo"), ("class","bar")])])
     716  
     717      def test_missing_attribute_value(self):
     718          self._run_check('<a v=>',
     719                          [("starttag", "a", [("v", "")])])
     720  
     721      def test_javascript_attribute_value(self):
     722          self._run_check("<a href=javascript:popup('/popup/help.html')>",
     723                          [("starttag", "a",
     724                            [("href", "javascript:popup('/popup/help.html')")])])
     725  
     726      def test_end_tag_in_attribute_value(self):
     727          # see #1745761
     728          self._run_check("<a href='http://www.example.org/\">;'>spam</a>",
     729                          [("starttag", "a",
     730                            [("href", "http://www.example.org/\">;")]),
     731                           ("data", "spam"), ("endtag", "a")])
     732  
     733      def test_with_unquoted_attributes(self):
     734          # see #12008
     735          html = ("<html><body bgcolor=d0ca90 text='181008'>"
     736                  "<table cellspacing=0 cellpadding=1 width=100% ><tr>"
     737                  "<td align=left><font size=-1>"
     738                  "- <a href=/rabota/><span class=en> software-and-i</span></a>"
     739                  "- <a href='/1/'><span class=en> library</span></a></table>")
     740          expected = [
     741              ('starttag', 'html', []),
     742              ('starttag', 'body', [('bgcolor', 'd0ca90'), ('text', '181008')]),
     743              ('starttag', 'table',
     744                  [('cellspacing', '0'), ('cellpadding', '1'), ('width', '100%')]),
     745              ('starttag', 'tr', []),
     746              ('starttag', 'td', [('align', 'left')]),
     747              ('starttag', 'font', [('size', '-1')]),
     748              ('data', '- '), ('starttag', 'a', [('href', '/rabota/')]),
     749              ('starttag', 'span', [('class', 'en')]), ('data', ' software-and-i'),
     750              ('endtag', 'span'), ('endtag', 'a'),
     751              ('data', '- '), ('starttag', 'a', [('href', '/1/')]),
     752              ('starttag', 'span', [('class', 'en')]), ('data', ' library'),
     753              ('endtag', 'span'), ('endtag', 'a'), ('endtag', 'table')
     754          ]
     755          self._run_check(html, expected)
     756  
     757      def test_comma_between_attributes(self):
     758          # see bpo 41478
     759          # HTMLParser preserves duplicate attributes, leaving the task of
     760          # removing duplicate attributes to a conformant html tree builder
     761          html = ('<div class=bar,baz=asd>'        # between attrs (unquoted)
     762                  '<div class="bar",baz="asd">'    # between attrs (quoted)
     763                  '<div class=bar, baz=asd,>'      # after values (unquoted)
     764                  '<div class="bar", baz="asd",>'  # after values (quoted)
     765                  '<div class="bar",>'             # one comma values (quoted)
     766                  '<div class=,bar baz=,asd>'      # before values (unquoted)
     767                  '<div class=,"bar" baz=,"asd">'  # before values (quoted)
     768                  '<div ,class=bar ,baz=asd>'      # before names
     769                  '<div class,="bar" baz,="asd">'  # after names
     770          )
     771          expected = [
     772              ('starttag', 'div', [('class', 'bar,baz=asd'),]),
     773              ('starttag', 'div', [('class', 'bar'), (',baz', 'asd')]),
     774              ('starttag', 'div', [('class', 'bar,'), ('baz', 'asd,')]),
     775              ('starttag', 'div', [('class', 'bar'), (',', None),
     776                                   ('baz', 'asd'), (',', None)]),
     777              ('starttag', 'div', [('class', 'bar'), (',', None)]),
     778              ('starttag', 'div', [('class', ',bar'), ('baz', ',asd')]),
     779              ('starttag', 'div', [('class', ',"bar"'), ('baz', ',"asd"')]),
     780              ('starttag', 'div', [(',class', 'bar'), (',baz', 'asd')]),
     781              ('starttag', 'div', [('class,', 'bar'), ('baz,', 'asd')]),
     782          ]
     783          self._run_check(html, expected)
     784  
     785      def test_weird_chars_in_unquoted_attribute_values(self):
     786          self._run_check('<form action=bogus|&#()value>', [
     787                              ('starttag', 'form',
     788                                  [('action', 'bogus|&#()value')])])
     789  
# Run the tests through unittest's command-line entry point when this file
# is executed directly as a script.
if __name__ == "__main__":
    unittest.main()