python (3.12.0)
1 """A parser for HTML and XHTML."""
2
3 # This file is based on sgmllib.py, but the API is slightly different.
4
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special).
9
10
11 import re
12 import _markupbase
13
14 from html import unescape
15
16
17 __all__ = ['HTMLParser']
18
# Regular expressions used for parsing

# Characters that end a run of ordinary text data: tag open or reference.
interesting_normal = re.compile('[&<]')
# A trailing '&' that could still grow into a complete reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named reference, e.g. '&amp;'; the final char shows how it terminated.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Numeric reference, decimal or hex, e.g. '&#65;' or '&#x41;'.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
commentclose = re.compile(r'--\s*>')
# Note:
# 1) if you change tagfind/attrfind remember to update locatestarttagend too;
# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will
#    explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*          # tag name
  (?:[\s/]*                             # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*     # attribute name
      (?:\s*=+\s*                       # value indicator
        (?:'[^']*'                      # LITA-enclosed value
          |"[^"]*"                      # LIT-enclosed value
          |(?!['"])[^>\s]*              # bare value
         )
         \s*                            # possibly followed by a space
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                   # trailing whitespace
  """, re.VERBOSE)
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
59
60
61
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is CDATA: nothing inside them is treated
    # as markup except the matching end tag (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")
85
86 def __init__(self, *, convert_charrefs=True):
87 """Initialize and reset this instance.
88
89 If convert_charrefs is True (the default), all character references
90 are automatically converted to the corresponding Unicode characters.
91 """
92 super().__init__()
93 self.convert_charrefs = convert_charrefs
94 self.reset()
95
96 def reset(self):
97 """Reset this instance. Loses all unprocessed data."""
98 self.rawdata = ''
99 self.lasttag = '???'
100 self.interesting = interesting_normal
101 self.cdata_elem = None
102 super().reset()
103
104 def feed(self, data):
105 r"""Feed data to the parser.
106
107 Call this as often as you want, with as little or as much text
108 as you want (may include '\n').
109 """
110 self.rawdata = self.rawdata + data
111 self.goahead(0)
112
113 def close(self):
114 """Handle any buffered data."""
115 self.goahead(1)
116
117 __starttag_text = None
118
119 def get_starttag_text(self):
120 """Return full source of start tag: '<...>'."""
121 return self.__starttag_text
122
123 def set_cdata_mode(self, elem):
124 self.cdata_elem = elem.lower()
125 self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
126
127 def clear_cdata_mode(self):
128 self.interesting = interesting_normal
129 self.cdata_elem = None
130
131 # Internal -- handle data as far as reasonable. May leave state
132 # and data to be processed by a subsequent call. If 'end' is
133 # true, force handling all data as if followed by EOF marker.
134 def goahead(self, end):
135 rawdata = self.rawdata
136 i = 0
137 n = len(rawdata)
138 while i < n:
139 if self.convert_charrefs and not self.cdata_elem:
140 j = rawdata.find('<', i)
141 if j < 0:
142 # if we can't find the next <, either we are at the end
143 # or there's more text incoming. If the latter is True,
144 # we can't pass the text to handle_data in case we have
145 # a charref cut in half at end. Try to determine if
146 # this is the case before proceeding by looking for an
147 # & near the end and see if it's followed by a space or ;.
148 amppos = rawdata.rfind('&', max(i, n-34))
149 if (amppos >= 0 and
150 not re.compile(r'[\s;]').search(rawdata, amppos)):
151 break # wait till we get all the text
152 j = n
153 else:
154 match = self.interesting.search(rawdata, i) # < or &
155 if match:
156 j = match.start()
157 else:
158 if self.cdata_elem:
159 break
160 j = n
161 if i < j:
162 if self.convert_charrefs and not self.cdata_elem:
163 self.handle_data(unescape(rawdata[i:j]))
164 else:
165 self.handle_data(rawdata[i:j])
166 i = self.updatepos(i, j)
167 if i == n: break
168 startswith = rawdata.startswith
169 if startswith('<', i):
170 if starttagopen.match(rawdata, i): # < + letter
171 k = self.parse_starttag(i)
172 elif startswith("</", i):
173 k = self.parse_endtag(i)
174 elif startswith("<!--", i):
175 k = self.parse_comment(i)
176 elif startswith("<?", i):
177 k = self.parse_pi(i)
178 elif startswith("<!", i):
179 k = self.parse_html_declaration(i)
180 elif (i + 1) < n:
181 self.handle_data("<")
182 k = i + 1
183 else:
184 break
185 if k < 0:
186 if not end:
187 break
188 k = rawdata.find('>', i + 1)
189 if k < 0:
190 k = rawdata.find('<', i + 1)
191 if k < 0:
192 k = i + 1
193 else:
194 k += 1
195 if self.convert_charrefs and not self.cdata_elem:
196 self.handle_data(unescape(rawdata[i:k]))
197 else:
198 self.handle_data(rawdata[i:k])
199 i = self.updatepos(i, k)
200 elif startswith("&#", i):
201 match = charref.match(rawdata, i)
202 if match:
203 name = match.group()[2:-1]
204 self.handle_charref(name)
205 k = match.end()
206 if not startswith(';', k-1):
207 k = k - 1
208 i = self.updatepos(i, k)
209 continue
210 else:
211 if ";" in rawdata[i:]: # bail by consuming &#
212 self.handle_data(rawdata[i:i+2])
213 i = self.updatepos(i, i+2)
214 break
215 elif startswith('&', i):
216 match = entityref.match(rawdata, i)
217 if match:
218 name = match.group(1)
219 self.handle_entityref(name)
220 k = match.end()
221 if not startswith(';', k-1):
222 k = k - 1
223 i = self.updatepos(i, k)
224 continue
225 match = incomplete.match(rawdata, i)
226 if match:
227 # match.group() will contain at least 2 chars
228 if end and match.group() == rawdata[i:]:
229 k = match.end()
230 if k <= i:
231 k = n
232 i = self.updatepos(i, i + 1)
233 # incomplete
234 break
235 elif (i + 1) < n:
236 # not the end of the buffer, and can't be confused
237 # with some other construct
238 self.handle_data("&")
239 i = self.updatepos(i, i + 1)
240 else:
241 break
242 else:
243 assert 0, "interesting.search() lied"
244 # end while
245 if end and i < n and not self.cdata_elem:
246 if self.convert_charrefs and not self.cdata_elem:
247 self.handle_data(unescape(rawdata[i:n]))
248 else:
249 self.handle_data(rawdata[i:n])
250 i = self.updatepos(i, n)
251 self.rawdata = rawdata[i:]
252
253 # Internal -- parse html declarations, return length or -1 if not terminated
254 # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
255 # See also parse_declaration in _markupbase
256 def parse_html_declaration(self, i):
257 rawdata = self.rawdata
258 assert rawdata[i:i+2] == '<!', ('unexpected call to '
259 'parse_html_declaration()')
260 if rawdata[i:i+4] == '<!--':
261 # this case is actually already handled in goahead()
262 return self.parse_comment(i)
263 elif rawdata[i:i+3] == '<![':
264 return self.parse_marked_section(i)
265 elif rawdata[i:i+9].lower() == '<!doctype':
266 # find the closing >
267 gtpos = rawdata.find('>', i+9)
268 if gtpos == -1:
269 return -1
270 self.handle_decl(rawdata[i+2:gtpos])
271 return gtpos+1
272 else:
273 return self.parse_bogus_comment(i)
274
275 # Internal -- parse bogus comment, return length or -1 if not terminated
276 # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
277 def parse_bogus_comment(self, i, report=1):
278 rawdata = self.rawdata
279 assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
280 'parse_comment()')
281 pos = rawdata.find('>', i+2)
282 if pos == -1:
283 return -1
284 if report:
285 self.handle_comment(rawdata[i+2:pos])
286 return pos + 1
287
288 # Internal -- parse processing instr, return end or -1 if not terminated
289 def parse_pi(self, i):
290 rawdata = self.rawdata
291 assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
292 match = piclose.search(rawdata, i+2) # >
293 if not match:
294 return -1
295 j = match.start()
296 self.handle_pi(rawdata[i+2: j])
297 j = match.end()
298 return j
299
    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse a start tag beginning at *i*.

        Dispatches to handle_starttag()/handle_startendtag(), or to
        handle_data() if the tag end is malformed.  Returns the index
        just past the tag, or -1 if the tag is still incomplete.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        # remember the raw tag text for get_starttag_text()
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute without a value, e.g. <input disabled>
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # strip matching surrounding quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                # resolve character/entity references inside the value
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # malformed tag end: report the whole tag as plain data
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # <script>/<style>: switch to CDATA mode until the
                # matching end tag
                self.set_cdata_mode(tag)
        return endpos
342
    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1 if the
        buffer may end in the middle of the tag."""
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            next = rawdata[j:j+1]
            if next == ">":
                return j + 1
            if next == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                # NOTE(review): rawdata[j] == "/" whenever next == "/",
                # so the startswith("/", j) test above always succeeds
                # and the code below appears unreachable.
                if j > i:
                    return j
                else:
                    return i + 1
            if next == "":
                # end of input
                return -1
            if next in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            # bogus character after the tag: treat what we matched (or a
            # single character) as the tag end
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")
377
    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse an end tag ('</...') starting at *i*.

        Returns the index just past the tag, or -1 if no '>' has been
        buffered yet.  In CDATA mode, anything that is not the matching
        end tag is reported as data instead.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # inside <script>/<style>: malformed end tags are data
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # not the end tag we are waiting for: still CDATA content
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos
417
418 # Overridable -- finish processing of start+end tag: <tag.../>
419 def handle_startendtag(self, tag, attrs):
420 self.handle_starttag(tag, attrs)
421 self.handle_endtag(tag)
422
    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called for each start tag, e.g. <a href="x">.

        *tag* is the lowercased tag name; *attrs* is a list of
        (name, value) pairs where value is None for valueless
        attributes and references in values are already resolved.
        """
        pass
426
    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called for each end tag, e.g. </a>; *tag* is the lowercased
        tag name."""
        pass
430
    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called for numeric character references when convert_charrefs
        is false; *name* is the digits, e.g. '65' for '&#65;' or 'x41'
        for '&#x41;'."""
        pass
434
    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called for named character references when convert_charrefs
        is false; *name* is the name without '&' and ';', e.g. 'amp'."""
        pass
438
    # Overridable -- handle data
    def handle_data(self, data):
        """Called for runs of text between tags (and for raw element
        content while in CDATA mode); may be invoked with arbitrary
        chunks of the text."""
        pass
442
    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called for comments; *data* is the text between '<!--' and
        '-->' (also used for 'bogus comments', see
        parse_bogus_comment())."""
        pass
446
    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called for a doctype declaration; *decl* is the text between
        '<!' and '>', e.g. 'DOCTYPE html'."""
        pass
450
    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called for processing instructions; *data* is the text
        between '<?' and the next '>'."""
        pass
454
    # Overridable -- handle declarations that could not be parsed.
    # NOTE(review): appears to be invoked by _markupbase.ParserBase for
    # unrecognized declarations -- confirm against that module.
    def unknown_decl(self, data):
        """Called for an unrecognized declaration; the default
        implementation ignores it."""
        pass