1 """A parser for HTML and XHTML."""
2
3 # This file is based on sgmllib.py, but the API is slightly different.
4
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special).
9
10
11 import re
12 import _markupbase
13
14 from html import unescape
15
16
17 __all__ = ['HTMLParser']
18
# Regular expressions used for parsing

# Characters that end a run of plain text: '<' (markup) or '&' (reference).
interesting_normal = re.compile('[&<]')
# '&' followed by a name character or '#': a reference that may still be
# incomplete at the end of the buffered data.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference, e.g. '&amp;'.  The trailing non-name character marks
# where the reference stops; group(1) is the bare name.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric reference, decimal ('&#65') or hex ('&#x41'), terminated by the
# first character that cannot belong to it.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')  # '<' + letter opens a start tag
piclose = re.compile('>')               # terminator of a processing instruction
commentclose = re.compile(r'--\s*>')    # terminator of a comment
# Note:
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#    explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         \s*                         # possibly followed by a space
        )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
  """, re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')


class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is treated as CDATA: inside them only the
    # matching end tag is recognized as markup.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

86 def __init__(self, *, convert_charrefs=True):
87 """Initialize and reset this instance.
88
89 If convert_charrefs is True (the default), all character references
90 are automatically converted to the corresponding Unicode characters.
91 """
92 self.convert_charrefs = convert_charrefs
93 self.reset()
94
95 def reset(self):
96 """Reset this instance. Loses all unprocessed data."""
97 self.rawdata = ''
98 self.lasttag = '???'
99 self.interesting = interesting_normal
100 self.cdata_elem = None
101 _markupbase.ParserBase.reset(self)
102
103 def feed(self, data):
104 r"""Feed data to the parser.
105
106 Call this as often as you want, with as little or as much text
107 as you want (may include '\n').
108 """
109 self.rawdata = self.rawdata + data
110 self.goahead(0)
111
112 def close(self):
113 """Handle any buffered data."""
114 self.goahead(1)
115
116 __starttag_text = None
117
118 def get_starttag_text(self):
119 """Return full source of start tag: '<...>'."""
120 return self.__starttag_text
121
122 def set_cdata_mode(self, elem):
123 self.cdata_elem = elem.lower()
124 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
125
126 def clear_cdata_mode(self):
127 self.interesting = interesting_normal
128 self.cdata_elem = None
129
    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume as much of self.rawdata as possible.

        Text runs go to handle_data(); markup constructs are dispatched
        to the parse_* helpers, which return -1 when a construct is not
        yet complete in the buffer.  Possibly-incomplete input is left
        in self.rawdata for the next call unless *end* is true.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    # (34 looks chosen to exceed the longest named character
                    # reference -- TODO confirm against the HTML5 entity list.)
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                        not re.compile(r'[\s;]').search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        break
                    j = n
            if i < j:
                # emit the plain text run between i and the next markup
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # lone '<' followed by something that is not markup
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # parse_* reported an incomplete construct
                    if not end:
                        break
                    # EOF with an unfinished construct: emit best-effort
                    # data up to the next delimiter
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # reference without trailing ';': don't consume
                        # the terminating character
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        k = match.end()
                        if k <= i:
                            k = n
                        i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            # flush the remaining tail.  (The inner cdata_elem test is
            # always true here -- it was just checked above -- but is
            # kept for symmetry with the loop body.)
            if self.convert_charrefs and not self.cdata_elem:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

252 # Internal -- parse html declarations, return length or -1 if not terminated
253 # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
254 # See also parse_declaration in _markupbase
255 def parse_html_declaration(self, i):
256 rawdata = self.rawdata
257 assert rawdata[i:i+2] == '<!', ('unexpected call to '
258 'parse_html_declaration()')
259 if rawdata[i:i+4] == '<!--':
260 # this case is actually already handled in goahead()
261 return self.parse_comment(i)
262 elif rawdata[i:i+3] == '<![':
263 return self.parse_marked_section(i)
264 elif rawdata[i:i+9].lower() == '<!doctype':
265 # find the closing >
266 gtpos = rawdata.find('>', i+9)
267 if gtpos == -1:
268 return -1
269 self.handle_decl(rawdata[i+2:gtpos])
270 return gtpos+1
271 else:
272 return self.parse_bogus_comment(i)
273
274 # Internal -- parse bogus comment, return length or -1 if not terminated
275 # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
276 def parse_bogus_comment(self, i, report=1):
277 rawdata = self.rawdata
278 assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
279 'parse_comment()')
280 pos = rawdata.find('>', i+2)
281 if pos == -1:
282 return -1
283 if report:
284 self.handle_comment(rawdata[i+2:pos])
285 return pos + 1
286
287 # Internal -- parse processing instr, return end or -1 if not terminated
288 def parse_pi(self, i):
289 rawdata = self.rawdata
290 assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
291 match = piclose.search(rawdata, i+2) # >
292 if not match:
293 return -1
294 j = match.start()
295 self.handle_pi(rawdata[i+2: j])
296 j = match.end()
297 return j
298
299 # Internal -- handle starttag, return end or -1 if not terminated
300 def parse_starttag(self, i):
301 self.__starttag_text = None
302 endpos = self.check_for_whole_start_tag(i)
303 if endpos < 0:
304 return endpos
305 rawdata = self.rawdata
306 self.__starttag_text = rawdata[i:endpos]
307
308 # Now parse the data between i+1 and j into a tag and attrs
309 attrs = []
310 match = tagfind_tolerant.match(rawdata, i+1)
311 assert match, 'unexpected call to parse_starttag()'
312 k = match.end()
313 self.lasttag = tag = match.group(1).lower()
314 while k < endpos:
315 m = attrfind_tolerant.match(rawdata, k)
316 if not m:
317 break
318 attrname, rest, attrvalue = m.group(1, 2, 3)
319 if not rest:
320 attrvalue = None
321 elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
322 attrvalue[:1] == '"' == attrvalue[-1:]:
323 attrvalue = attrvalue[1:-1]
324 if attrvalue:
325 attrvalue = unescape(attrvalue)
326 attrs.append((attrname.lower(), attrvalue))
327 k = m.end()
328
329 end = rawdata[k:endpos].strip()
330 if end not in (">", "/>"):
331 self.handle_data(rawdata[i:endpos])
332 return endpos
333 if end.endswith('/>'):
334 # XHTML-style empty tag: <span attr="value" />
335 self.handle_startendtag(tag, attrs)
336 else:
337 self.handle_starttag(tag, attrs)
338 if tag in self.CDATA_CONTENT_ELEMENTS:
339 self.set_cdata_mode(tag)
340 return endpos
341
342 # Internal -- check to see if we have a complete starttag; return end
343 # or -1 if incomplete.
344 def check_for_whole_start_tag(self, i):
345 rawdata = self.rawdata
346 m = locatestarttagend_tolerant.match(rawdata, i)
347 if m:
348 j = m.end()
349 next = rawdata[j:j+1]
350 if next == ">":
351 return j + 1
352 if next == "/":
353 if rawdata.startswith("/>", j):
354 return j + 2
355 if rawdata.startswith("/", j):
356 # buffer boundary
357 return -1
358 # else bogus input
359 if j > i:
360 return j
361 else:
362 return i + 1
363 if next == "":
364 # end of input
365 return -1
366 if next in ("abcdefghijklmnopqrstuvwxyz=/"
367 "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
368 # end of input in or before attribute value, or we have the
369 # '/' from a '/>' ending
370 return -1
371 if j > i:
372 return j
373 else:
374 return i + 1
375 raise AssertionError("we should not get here!")
376
377 # Internal -- parse endtag, return end or -1 if incomplete
378 def parse_endtag(self, i):
379 rawdata = self.rawdata
380 assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
381 match = endendtag.search(rawdata, i+1) # >
382 if not match:
383 return -1
384 gtpos = match.end()
385 match = endtagfind.match(rawdata, i) # </ + tag + >
386 if not match:
387 if self.cdata_elem is not None:
388 self.handle_data(rawdata[i:gtpos])
389 return gtpos
390 # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
391 namematch = tagfind_tolerant.match(rawdata, i+2)
392 if not namematch:
393 # w3.org/TR/html5/tokenization.html#end-tag-open-state
394 if rawdata[i:i+3] == '</>':
395 return i+3
396 else:
397 return self.parse_bogus_comment(i)
398 tagname = namematch.group(1).lower()
399 # consume and ignore other stuff between the name and the >
400 # Note: this is not 100% correct, since we might have things like
401 # </tag attr=">">, but looking for > after the name should cover
402 # most of the cases and is much simpler
403 gtpos = rawdata.find('>', namematch.end())
404 self.handle_endtag(tagname)
405 return gtpos+1
406
407 elem = match.group(1).lower() # script or style
408 if self.cdata_elem is not None:
409 if elem != self.cdata_elem:
410 self.handle_data(rawdata[i:gtpos])
411 return gtpos
412
413 self.handle_endtag(elem)
414 self.clear_cdata_mode()
415 return gtpos
416
417 # Overridable -- finish processing of start+end tag: <tag.../>
418 def handle_startendtag(self, tag, attrs):
419 self.handle_starttag(tag, attrs)
420 self.handle_endtag(tag)
421
422 # Overridable -- handle start tag
423 def handle_starttag(self, tag, attrs):
424 pass
425
426 # Overridable -- handle end tag
427 def handle_endtag(self, tag):
428 pass
429
430 # Overridable -- handle character reference
431 def handle_charref(self, name):
432 pass
433
434 # Overridable -- handle entity reference
435 def handle_entityref(self, name):
436 pass
437
438 # Overridable -- handle data
439 def handle_data(self, data):
440 pass
441
442 # Overridable -- handle comment
443 def handle_comment(self, data):
444 pass
445
446 # Overridable -- handle declaration
447 def handle_decl(self, decl):
448 pass
449
450 # Overridable -- handle processing instruction
451 def handle_pi(self, data):
452 pass
453
454 def unknown_decl(self, data):
455 pass