python (3.12.0)
1 """Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
20 'attributes' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34 """
35
36 #---------------------------------------------------------------------
37 # Licensed to PSF under a Contributor Agreement.
38 # See https://www.python.org/psf/license for licensing details.
39 #
40 # ElementTree
41 # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
42 #
43 # fredrik@pythonware.com
44 # http://www.pythonware.com
45 # --------------------------------------------------------------------
46 # The ElementTree toolkit is
47 #
48 # Copyright (c) 1999-2008 by Fredrik Lundh
49 #
50 # By obtaining, using, and/or copying this software and/or its
51 # associated documentation, you agree that you have read, understood,
52 # and will comply with the following terms and conditions:
53 #
54 # Permission to use, copy, modify, and distribute this software and
55 # its associated documentation for any purpose and without fee is
56 # hereby granted, provided that the above copyright notice appears in
57 # all copies, and that both that copyright notice and this permission
58 # notice appear in supporting documentation, and that the name of
59 # Secret Labs AB or the author not be used in advertising or publicity
60 # pertaining to distribution of the software without specific, written
61 # prior permission.
62 #
63 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70 # OF THIS SOFTWARE.
71 # --------------------------------------------------------------------
72
# Names that make up this module's public API; this list is what
# "from <module> import *" exports.
__all__ = [
    # public symbols
    "Comment",
    "dump",
    "Element", "ElementTree",
    "fromstring", "fromstringlist",
    "indent", "iselement", "iterparse",
    "parse", "ParseError",
    "PI", "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring", "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML", "XMLID",
    "XMLParser", "XMLPullParser",
    "register_namespace",
    "canonicalize", "C14NWriterTarget",
    ]

# Version string published by this module (also exported via __all__).
VERSION = "1.3.0"
94
95 import sys
96 import re
97 import warnings
98 import io
99 import collections
100 import collections.abc
101 import contextlib
102
103 from . import ElementPath
104
105
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides the regular exception value, a ParseError is documented to
    carry two extra attributes:
        'code' - the specific exception code
        'position' - the line and column of the error

    The class itself adds no behaviour on top of SyntaxError.

    """
116
117 # --------------------------------------------------------------------
118
119
def iselement(element):
    """Return True if *element* looks like an Element.

    The check is purely structural: any object exposing a 'tag' attribute
    qualifies.

    """
    try:
        element.tag
    except AttributeError:
        return False
    return True
123
124
class Element:
    """A single node of an XML tree.

    This class is the reference implementation of the Element interface.

    len(element) is the number of direct subelements.  An element is
    therefore only truly empty when BOTH its length is zero AND its text
    attribute is unset.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    # The element's name.
    tag = None

    # Dictionary of the element's attributes.
    attrib = None

    # Text before the first subelement: either a string or None.  When
    # there is no text this may be None or the empty string, depending on
    # the parser.
    text = None

    # Text after this element's end tag but before the next sibling's start
    # tag: either a string or None.  When there is no such text this may be
    # None or the empty string, depending on the parser.
    tail = None

    def __init__(self, tag, attrib={}, **extra):
        # The shared default dict is only ever read: the attributes are
        # copied into a fresh dict below.
        if not isinstance(attrib, dict):
            kind = attrib.__class__.__name__
            raise TypeError("attrib must be dict, not %s" % (kind,))
        merged = dict(attrib)
        merged.update(extra)  # keyword arguments win over *attrib* entries
        self.tag = tag
        self.attrib = merged
        self._children = []

    def __repr__(self):
        details = (self.__class__.__name__, self.tag, id(self))
        return "<%s %r at %#x>" % details

    def makeelement(self, tag, attrib):
        """Build a new element of the same class as this one.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        factory = self.__class__
        return factory(tag, attrib)

    def __copy__(self):
        # Shallow copy: the clone gets its own attribute dict (through the
        # constructor) but shares the child element objects.
        clone = self.makeelement(self.tag, self.attrib)
        clone.text = self.text
        clone.tail = self.tail
        clone[:] = self
        return clone

    def __len__(self):
        # Number of direct subelements.
        return len(self._children)

    def __bool__(self):
        message = (
            "Testing an element's truth value will raise an exception in "
            "future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead."
        )
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        # emulate old behaviour, for now
        return len(self._children) > 0

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        # Validate first, then store: every item of a slice assignment must
        # be an element, as must a single assigned value.
        check = self._assert_is_element
        if not isinstance(index, slice):
            check(element)
        else:
            for candidate in element:
                check(candidate)
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* as the last child of this element.

        In document order the new element follows the last existing
        subelement (or the text, if it is the first subelement) and precedes
        this element's end tag.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append every subelement taken from *elements*.

        *elements* is a sequence with zero or more elements.  Items are
        checked and appended one at a time.

        """
        children = self._children
        for candidate in elements:
            self._assert_is_element(candidate)
            children.append(candidate)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if isinstance(e, _Element_Py):
            return
        raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove *subelement* from the children of this element.

        Unlike the find methods, matching does not look at tag values or
        contents.  To remove subelements by other criteria, build a list of
        the elements to keep and assign it back with slice assignment.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def find(self, path, namespaces=None):
        """Return the first element matching *path*, or None.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        The search is delegated to ElementPath.find.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Return the text of the first element matching *path*.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        The search is delegated to ElementPath.findtext.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Return all subelements matching *path*.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        The search is delegated to ElementPath.findall.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Return an iterable over all subelements matching *path*.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        The search is delegated to ElementPath.iterfind.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset this element.

        All subelements are dropped, the attribute dictionary is emptied in
        place, and the text and tail attributes are set to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = None
        self.tail = None

    def get(self, key, default=None):
        """Return the value of attribute *key*.

        Equivalent to attrib.get: *default* is returned when the attribute
        is not present.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set attribute *key* to *value*.

        Equivalent to attrib[key] = value.

        """
        self.attrib[key] = value

    def keys(self):
        """Return the attribute names.

        Equivalent to attrib.keys().

        """
        return self.attrib.keys()

    def items(self):
        """Return the attributes as (name, value) pairs.

        Equivalent to attrib.items().

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create a tree iterator.

        The iterator visits this element and then, recursively, every
        subelement in document order, yielding those whose tag matches.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, call
        list() on the iterator and loop over the resulting list.

        *tag* is the tag to look for; None or "*" yields every element.

        """
        wanted = None if tag == "*" else tag
        if wanted is None or self.tag == wanted:
            yield self
        for child in self._children:
            yield from child.iter(wanted)

    def itertext(self):
        """Create a text iterator.

        The iterator yields the non-empty inner text of this element and of
        all its subelements, in document order.  Nothing is yielded for an
        element whose tag is neither a string nor None.

        """
        tag = self.tag
        if not (isinstance(tag, str) or tag is None):
            return
        leading = self.text
        if leading:
            yield leading
        for child in self:
            yield from child.itertext()
            trailing = child.tail
            if trailing:
                yield trailing
412
413
def SubElement(parent, tag, attrib={}, **extra):
    """Create a new element and append it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The element is built with parent.makeelement() and returned after it
    has been appended.

    """
    # Merge into a new dict so that the (shared) default is never touched.
    child = parent.makeelement(tag, {**attrib, **extra})
    parent.append(child)
    return child
430
431
def Comment(text=None):
    """Comment element factory.

    Build a special element, tagged with the Comment function itself, which
    the standard serializer writes out as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
444
445
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element, tagged with the ProcessingInstruction function
    itself, which the standard serializer writes out between "<?" and "?>".

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  The
    element's text is *target*, followed by a space and *text* when *text*
    is non-empty.

    """
    node = Element(ProcessingInstruction)
    if text:
        node.text = target + " " + text
    else:
        node.text = target
    return node

# Short alias for the factory above.
PI = ProcessingInstruction
463
464
class QName:
    """Qualified name wrapper.

    Wrap a QName attribute value so that it gets proper namespace handling
    on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Instances hash and compare like their text, both against other QName
    objects and against plain values.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    # Each comparison unwraps the right-hand side when it is a QName and
    # otherwise compares the text with the other object directly.

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs
509
510 # --------------------------------------------------------------------
511
512
class ElementTree:
    """An XML element hierarchy.

    This class wraps a root element and adds support for serialization to
    and from standard XML.

    *element* is an optional root element node,
    *file* is an optional file handle or file name of an XML file whose
    contents will be used to initialize the tree with.

    """

    def __init__(self, element=None, file=None):
        self._root = element  # first node
        if file:
            # Parsing replaces whatever root was passed as *element*.
            self.parse(file)

    def getroot(self):
        """Return root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace root element of this tree.

        The current contents of the tree are discarded in favour of the
        given element.  Use with care!

        """
        self._root = element

    def parse(self, source, parser=None):
        """Load external XML document into element tree.

        *source* is a file name or file object, *parser* is an optional parser
        instance that defaults to XMLParser.

        ParseError is raised if the parser fails to parse the document.

        Returns the root element of the given source document.

        """
        # Anything without a read() method is taken to be a file name; a
        # file opened here is also closed here.
        opened_here = not hasattr(source, "read")
        if opened_here:
            source = open(source, "rb")
        try:
            if parser is None:
                # If no parser was specified, create a default XMLParser
                parser = XMLParser()
                if hasattr(parser, '_parse_whole'):
                    # The default XMLParser, when it comes from an
                    # accelerator, can define an internal _parse_whole API
                    # for efficiency.  It parses the whole source without
                    # being fed chunk by chunk.
                    self._root = parser._parse_whole(source)
                    return self._root
            while True:
                chunk = source.read(65536)
                if not chunk:
                    break
                parser.feed(chunk)
            self._root = parser.close()
            return self._root
        finally:
            if opened_here:
                source.close()

    def iter(self, tag=None):
        """Create and return tree iterator for the root element.

        The iterator loops over all elements in this tree, in document order.

        *tag* is a string with the tag name to iterate over
        (default is to return all elements).

        """
        return self._root.iter(tag)

    @staticmethod
    def _anchor_path(path):
        # Shared by find(), findtext(), findall() and iterfind(): a path
        # starting with "/" is rewritten to start with "./" and a
        # FutureWarning is issued.  stacklevel=3 attributes the warning to
        # the caller of the public method that called this helper.
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=3
            )
        return path

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        Same as getroot().find(path), which is Element.find()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return self._root.find(self._anchor_path(path), namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find the text of the first matching element by tag name or path.

        Same as getroot().findtext(path), which is Element.findtext()

        *path* is a string having either an element tag or an XPath,
        *default* is handed through to the root element's findtext(),
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return whatever the root element's findtext() returns.

        """
        return self._root.findtext(self._anchor_path(path), default,
                                   namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().findall(path), which is Element.findall().

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return list containing all matching elements in document order.

        """
        return self._root.findall(self._anchor_path(path), namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().iterfind(path), which is element.iterfind()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return self._root.iterfind(self._anchor_path(path), namespaces)

    def write(self, file_or_filename,
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None, *,
              short_empty_elements=True):
        """Write element tree to a file as XML.

        Arguments:
          *file_or_filename* -- file name or a file object opened for writing

          *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                        when the method is "c14n")

          *xml_declaration* -- bool indicating if an XML declaration should be
                               added to the output. If None, an XML declaration
                               is added if encoding IS NOT either of:
                               US-ASCII, UTF-8, or Unicode

          *default_namespace* -- sets the default XML namespace (for "xmlns")

          *method* -- either "xml" (default), "html", "text", or "c14n"

          *short_empty_elements* -- controls the formatting of elements
                                    that contain no content. If True (default)
                                    they are emitted as a single self-closed
                                    tag, otherwise they are emitted as a pair
                                    of start/end tags

        ValueError is raised for an unknown *method*.

        """
        if not method:
            method = "xml"
        elif method not in _serialize:
            raise ValueError("unknown method %r" % method)
        if not encoding:
            encoding = "utf-8" if method == "c14n" else "us-ascii"
        with _get_writer(file_or_filename, encoding) as (emit, declared_encoding):
            if method == "xml":
                wants_declaration = xml_declaration
                if xml_declaration is None:
                    # Undecided: declare only encodings other than the
                    # "unicode" pseudo-encoding, UTF-8 and US-ASCII.
                    wants_declaration = (
                        encoding.lower() != "unicode" and
                        declared_encoding.lower() not in ("utf-8", "us-ascii")
                    )
                if wants_declaration:
                    emit("<?xml version='1.0' encoding='%s'?>\n" % (
                        declared_encoding,))
            if method == "text":
                _serialize_text(emit, self._root)
            else:
                qnames, namespaces = _namespaces(self._root, default_namespace)
                serializer = _serialize[method]
                serializer(emit, self._root, qnames, namespaces,
                           short_empty_elements=short_empty_elements)

    def write_c14n(self, file):
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")
734
735 # --------------------------------------------------------------------
736 # serialization support
737
738 @contextlib.contextmanager
739 def _get_writer(file_or_filename, encoding):
740 # returns text write method and release all resources after using
741 try:
742 write = file_or_filename.write
743 except AttributeError:
744 # file_or_filename is a file name
745 if encoding.lower() == "unicode":
746 encoding="utf-8"
747 with open(file_or_filename, "w", encoding=encoding,
748 errors="xmlcharrefreplace") as file:
749 yield file.write, encoding
750 else:
751 # file_or_filename is a file-like object
752 # encoding determines if it is a text or binary writer
753 if encoding.lower() == "unicode":
754 # use a text writer as is
755 yield write, getattr(file_or_filename, "encoding", None) or "utf-8"
756 else:
757 # wrap a binary writer with TextIOWrapper
758 with contextlib.ExitStack() as stack:
759 if isinstance(file_or_filename, io.BufferedIOBase):
760 file = file_or_filename
761 elif isinstance(file_or_filename, io.RawIOBase):
762 file = io.BufferedWriter(file_or_filename)
763 # Keep the original file open when the BufferedWriter is
764 # destroyed
765 stack.callback(file.detach)
766 else:
767 # This is to handle passed objects that aren't in the
768 # IOBase hierarchy, but just have a write method
769 file = io.BufferedIOBase()
770 file.writable = lambda: True
771 file.write = write
772 try:
773 # TextIOWrapper uses this methods to determine
774 # if BOM (for UTF-16, etc) should be added
775 file.seekable = file_or_filename.seekable
776 file.tell = file_or_filename.tell
777 except AttributeError:
778 pass
779 file = io.TextIOWrapper(file,
780 encoding=encoding,
781 errors="xmlcharrefreplace",
782 newline="\n")
783 # Keep the original file open when the TextIOWrapper is
784 # destroyed
785 stack.callback(file.detach)
786 yield file.write, encoding
787
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used in a tree.

    Every element reachable through elem.iter() is inspected: its tag, its
    attribute names, and any attribute values or text that are QName
    objects.

    Returns a (qnames, namespaces) pair.  *qnames* maps each name to the
    "prefix:local" (or bare) form to serialize; *namespaces* maps each
    namespace URI to its prefix.  A non-empty *default_namespace* is
    registered with the empty prefix.

    ValueError is raised when an unqualified name is met while
    *default_namespace* is set.

    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def register(qname):
        # calculate serialized qname representation
        try:
            if not qname[:1] == "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                # Not seen in this tree yet: prefer a registered prefix,
                # otherwise generate "ns<N>".
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local  # default element
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                register(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                register(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)

        for name, value in node.items():
            if isinstance(name, QName):
                name = name.text
            if name not in qnames:
                register(name)
            if isinstance(value, QName) and value.text not in qnames:
                register(value.text)

        content = node.text
        if isinstance(content, QName) and content.text not in qnames:
            register(content.text)

    return qnames, namespaces
848
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML through *write*.

    *qnames* maps tags and attribute names to their serialized form.
    *namespaces* maps namespace URIs to prefixes and is emitted as xmlns
    attributes on this element only; subelements are serialized with None.
    When *short_empty_elements* is true, an element with neither text nor
    children is written as a self-closed tag.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % text)
    elif tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        name = qnames[tag]
        if name is None:
            # No tag of its own: only the content is written.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                declarations = sorted(namespaces.items(),
                                      key=lambda x: x[1])  # sort on prefix
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], value))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
            else:
                write(" />")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
898
# Lower-case tag names for which _serialize_html writes no end tag.
HTML_EMPTY = {
    "area",
    "base",
    "basefont",
    "br",
    "col",
    "embed",
    "frame",
    "hr",
    "img",
    "input",
    "isindex",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
}
902
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML through *write*.

    *qnames* maps tags and attribute names to their serialized form.
    *namespaces* maps namespace URIs to prefixes and is emitted as xmlns
    attributes on this element only; subelements are serialized with None.
    The text of "script" and "style" elements is written unescaped, and no
    end tag is written for the tags listed in HTML_EMPTY.

    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        name = qnames[tag]
        if name is None:
            # No tag of its own: only the content is written.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attributes = list(elem.items())
            if namespaces:
                declarations = sorted(namespaces.items(),
                                      key=lambda x: x[1])  # sort on prefix
                for uri, prefix in declarations:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attributes:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if text:
                if lowered in ("script", "style"):
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    tail = elem.tail
    if tail:
        write(_escape_cdata(tail))
952
953 def _serialize_text(write, elem):
954 for part in elem.itertext():
955 write(part)
956 if elem.tail:
957 write(elem.tail)
958
# Dispatch table: output method name -> serializer function.
# ElementTree.write() validates its *method* argument against these keys
# and looks the serializer up here.
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
966
967
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if prefix has the reserved form "ns<digits>".

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Drop every entry that clashes with the new pair before adding it.
    stale = [known_uri
             for known_uri, known_prefix in _namespace_map.items()
             if known_uri == uri or known_prefix == prefix]
    for known_uri in stale:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix

# Global registry mapping namespace URIs to their preferred prefixes.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# For tests and troubleshooting
register_namespace._namespace_map = _namespace_map
1001
1002 def _raise_serialization_error(text):
1003 raise TypeError(
1004 "cannot serialize %r (type %s)" % (text, type(text).__name__)
1005 )
1006
def _escape_cdata(text):
    """Escape ``&``, ``<`` and ``>`` in character data.

    Returns the escaped string.  If *text* does not support the string
    operations used here (TypeError/AttributeError), the failure is
    reported through _raise_serialization_error(), which raises TypeError.
    """
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 characters, or so.  assume that's, by far,
        # the most common case in most applications.
        # "&" is handled first so the entities produced below are not
        # escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1022
def _escape_attrib(text):
    """Escape a string for use as a double-quoted XML attribute value.

    ``&``, ``<``, ``>`` and ``"`` become named entities; CR, LF and TAB
    become numeric character references.  Returns the escaped string.  If
    *text* does not support the string operations used here, the failure
    is reported through _raise_serialization_error() (TypeError).
    """
    try:
        # "&" is handled first so the entities produced below are not
        # escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        # Although section 2.11 of the XML specification states that CR or
        # CR LF should be replaced with just LF, it applies only to EOLs
        # which take part in organizing the file into lines.  Within
        # attributes, we are replacing these with character references, so
        # they do not count.
        # http://www.w3.org/TR/REC-xml/#sec-line-ends
        # The current solution, contained in the following six lines, was
        # discussed in issue 17582 and 39011.
        if "\r" in text:
            text = text.replace("\r", "&#13;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        if "\t" in text:
            text = text.replace("\t", "&#09;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1050
def _escape_attrib_html(text):
    """Escape a string for use as a double-quoted HTML attribute value.

    Only ``&``, ``>`` and ``"`` are replaced; ``<`` is left untouched by
    this function.  Returns the escaped string.  If *text* does not
    support the string operations used here, the failure is reported
    through _raise_serialization_error() (TypeError).
    """
    try:
        # "&" first, so the entities produced below are not re-escaped.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1063
1064 # --------------------------------------------------------------------
1065
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  If encoding is "unicode", a string
    is returned.  Otherwise a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output which can
    be one of "xml" (default), "html", "text" or "c14n", *default_namespace*
    sets the default XML namespace (for "xmlns").

    Returns an (optionally) encoded string containing the XML data.

    """
    # Text buffer for "unicode", byte buffer for every other encoding.
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    document = ElementTree(element)
    document.write(
        buffer,
        encoding,
        xml_declaration=xml_declaration,
        default_namespace=default_namespace,
        method=method,
        short_empty_elements=short_empty_elements,
    )
    return buffer.getvalue()
1089
class _ListDataStream(io.BufferedIOBase):
    """An auxiliary stream accumulating into a list reference.

    Every object passed to write() is appended, unchanged, to the list
    given to the constructor; the caller keeps that list and reads the
    collected chunks from it.
    """
    def __init__(self, lst):
        # Caller-owned list that receives the written chunks.
        self.lst = lst

    def writable(self):
        """Report that the stream accepts writes."""
        return True

    def seekable(self):
        """Report the stream as seekable (tell() is provided)."""
        return True

    def write(self, b):
        """Append chunk *b* to the backing list."""
        self.lst.append(b)

    def tell(self):
        """Return the number of chunks collected so far."""
        return len(self.lst)
1106
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Serialize *element* and return the written chunks as a list.

    Takes the same arguments as tostring().  The output is collected
    piece by piece through a _ListDataStream instead of being joined
    into a single string.
    """
    chunks = []
    document = ElementTree(element)
    document.write(
        _ListDataStream(chunks),
        encoding,
        xml_declaration=xml_declaration,
        default_namespace=default_namespace,
        method=method,
        short_empty_elements=short_empty_elements,
    )
    return chunks
1118
1119
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    This function should be used for debugging only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    # Finish the output with a newline unless the root's tail already did.
    tail = tree.getroot().tail
    if not (tail and tail[-1] == "\n"):
        sys.stdout.write("\n")
1137
1138
def indent(tree, space="  ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level.  Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    Raises ValueError if *level* is negative.  Returns None; the tree is
    modified in place.
    """
    if isinstance(tree, ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if not len(tree):
        # No children: nothing to indent.
        return

    # Reduce the memory consumption by reusing indentation strings.
    # indentations[i] is the string for depth i below the starting element;
    # entry 0 already includes the initial *level* offset.
    indentations = ["\n" + level * space]

    def _indent_children(elem, level):
        # Start a new indentation level for the first child.
        child_level = level + 1
        try:
            child_indentation = indentations[child_level]
        except IndexError:
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)

        # Only whitespace-only (or missing) text is replaced; real text
        # content is left alone.
        if not elem.text or not elem.text.strip():
            elem.text = child_indentation

        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not child.tail or not child.tail.strip():
                child.tail = child_indentation

        # Dedent after the last child by overwriting the previous indentation.
        if not child.tail.strip():
            child.tail = indentations[level]

    _indent_children(tree, 0)
1187
1188
1189 # --------------------------------------------------------------------
1190 # parsing
1191
1192
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    Return an ElementTree instance.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1205
1206
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is initialized with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  The iterator has a
    ``root`` attribute that is None until *source* has been read completely,
    and a ``close()`` method that stops iteration and closes the file if it
    was opened by this function.

    """
    # Local import keeps this function self-contained.
    import weakref

    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    # Only close what we opened ourselves; caller-supplied file objects
    # stay under the caller's control.
    if not hasattr(source, "read"):
        source = open(source, "rb")
        close_source = True
    else:
        close_source = False

    def iterator(source):
        try:
            while True:
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            # Reach the iterator object through a weak reference so the
            # generator does not keep it alive (no reference cycle).
            it = wr()
            if it is not None:
                it.root = root
        finally:
            if close_source:
                source.close()

    gen = iterator(source)

    class IterParseIterator(collections.abc.Iterator):
        __next__ = gen.__next__

        def close(self):
            """Close the self-opened source (if any) and stop the generator."""
            if close_source:
                source.close()
            gen.close()

        def __del__(self):
            # Release a self-opened file promptly when the iterator is
            # dropped without having been exhausted or closed.
            if close_source:
                source.close()

    it = IterParseIterator()
    it.root = None
    wr = weakref.ref(it)
    return it
1255
1256
class XMLPullParser:
    """Feed-driven parser that queues parse events for later retrieval.

    Data is pushed in with feed(); the resulting events are collected in an
    internal queue and handed out by read_events().

    *events* is a sequence of event names to report; when omitted only
    "end" events are reported.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code.  It will be removed in a future release.
        # See https://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser.

        Empty *data* is ignored.  A SyntaxError raised by the underlying
        parser is not propagated here; it is queued and raised later from
        read_events().  Raises ValueError if the parser was already closed.
        """
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if data:
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        # Mark the parser as finished; feed() checks for None.
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element.  Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  A queued exception is raised when it
        is reached.
        """
        events = self._events_queue
        while events:
            event = events.popleft()
            if isinstance(event, Exception):
                raise event
            else:
                yield event
1308
1309
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    return active_parser.close()
1325
1326
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    active_parser.feed(text)
    root = active_parser.close()
    # Elements with a missing or empty "id" attribute are skipped; when an
    # id occurs more than once, the element visited last wins.
    ids = {
        ident: node
        for node in root.iter()
        if (ident := node.get("id"))
    }
    return root, ids
1347
# Parse XML document from string constant.  Alias for XML(): both names are
# bound to the same function object.
fromstring = XML
1350
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an Element instance.

    """
    active_parser = parser if parser else XMLParser(target=TreeBuilder())
    for fragment in sequence:
        active_parser.feed(fragment)
    return active_parser.close()
1365
1366 # --------------------------------------------------------------------
1367
1368
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """
    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._root = None # root element
        self._tail = None # true if we're after an end tag
        if comment_factory is None:
            comment_factory = Comment
        self._comment_factory = comment_factory
        self.insert_comments = insert_comments
        if pi_factory is None:
            pi_factory = ProcessingInstruction
        self._pi_factory = pi_factory
        self.insert_pis = insert_pis
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the collected character data to the most recent element:
        # as its tail when we are after an end tag, otherwise as its text.
        # Data collected before any element exists is discarded.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            self._elem[-1].append(elem)
        elif self._root is None:
            # First element seen with an empty stack becomes the root.
            self._root = elem
        self._elem.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Build a childless node (comment / PI) with *factory*.  It is only
        # linked into the tree when *insert* is true; the node is returned
        # either way.
        elem = factory(*args)
        if insert:
            self._flush()
            self._last = elem
            if self._elem:
                self._elem[-1].append(elem)
            self._tail = 1
        return elem
1488
1489
1490 # also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator: expat then reports qualified
        # names as "uri}local", which _fixname() completes to "{uri}local".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        # Only callbacks the target actually implements are installed.
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        self._doctype = None
        # Mapping of entity name -> replacement text, consulted by
        # _default() for entities expat does not know.
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler binds its collaborators as default arguments so they
        # are captured per event rather than looked up at call time.
        # Raises ValueError for an unrecognised event name.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over its code and
        # (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        # None prefix/uri are passed to the target as ''.
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Default expat handler.  Deals with two things: entity references
        # (looked up in self.entity) and the pieces of a <!DOCTYPE ...>
        # declaration, which are collected until they can be reported.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                type = self._doctype[1]
                if type == "PUBLIC" and n == 4:
                    name, type, pubid, system = self._doctype
                    if pubid:
                        # strip the surrounding quote characters
                        pubid = pubid[1:-1]
                elif type == "SYSTEM" and n == 3:
                    name, type, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored. "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser.

        expat errors are re-raised through _raiseerror() as ParseError.
        """
        try:
            self.parser.Parse(data, False)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure.

        Returns whatever the target's close() returns, or None if the
        target has no close() method.  The parser and target references
        are deleted afterwards, so the instance cannot be reused.
        """
        try:
            self.parser.Parse(b"", True) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
1714
1715
1716 # --------------------------------------------------------------------
1717 # C14N 2.0
1718
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    If *out* is provided, it must be a file or file-like object that receives
    the serialised canonical XML output (text, not bytes) through its ``.write()``
    method.  To write to a file, open it in text mode with encoding "utf-8".
    If *out* is not provided, this function returns the output as text string.

    Either *xml_data* (an XML string) or *from_file* (a file path or
    file-like object) must be provided as input; *xml_data* takes
    precedence when both are given.  ValueError is raised if neither is.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Capture into an in-memory buffer only when the caller gave no sink.
    capture = None
    if out is None:
        capture = out = io.StringIO()

    parser = XMLParser(target=C14NWriterTarget(out.write, **options))

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    else:
        # The guard above guarantees from_file is set on this path.
        parse(from_file, parser=parser)

    if capture is None:
        return None
    return capture.getvalue()
1747
1748
# Bound matcher: returns a match object when the whole string has the form
# "<word chars>:<word chars>" (exactly one colon), otherwise None.
_looks_like_prefix_name = re.compile(
    r'^\w+:\w+$',
    flags=re.UNICODE,
).match
1750
1751
1752 class ESC[4;38;5;81mC14NWriterTarget:
1753 """
1754 Canonicalization writer target for the XMLParser.
1755
1756 Serialises parse events to XML C14N 2.0.
1757
1758 The *write* function is used for writing out the resulting data stream
1759 as text (not bytes). To write to a file, open it in text mode with encoding
1760 "utf-8" and pass its ``.write`` method.
1761
1762 Configuration options:
1763
1764 - *with_comments*: set to true to include comments
1765 - *strip_text*: set to true to strip whitespace before and after text content
1766 - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
1767 - *qname_aware_tags*: a set of qname aware tag names in which prefixes
1768 should be replaced in text content
1769 - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
1770 should be replaced in text content
1771 - *exclude_attrs*: a set of attribute names that should not be serialised
1772 - *exclude_tags*: a set of tag names that should not be serialised
1773 """
1774 def __init__(self, write, *,
1775 with_comments=False, strip_text=False, rewrite_prefixes=False,
1776 qname_aware_tags=None, qname_aware_attrs=None,
1777 exclude_attrs=None, exclude_tags=None):
1778 self._write = write
1779 self._data = []
1780 self._with_comments = with_comments
1781 self._strip_text = strip_text
1782 self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
1783 self._exclude_tags = set(exclude_tags) if exclude_tags else None
1784
1785 self._rewrite_prefixes = rewrite_prefixes
1786 if qname_aware_tags:
1787 self._qname_aware_tags = set(qname_aware_tags)
1788 else:
1789 self._qname_aware_tags = None
1790 if qname_aware_attrs:
1791 self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
1792 else:
1793 self._find_qname_aware_attrs = None
1794
1795 # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
1796 self._declared_ns_stack = [[
1797 ("http://www.w3.org/XML/1998/namespace", "xml"),
1798 ]]
1799 # Stack with user declared namespace prefixes as (uri, prefix) pairs.
1800 self._ns_stack = []
1801 if not rewrite_prefixes:
1802 self._ns_stack.append(list(_namespace_map.items()))
1803 self._ns_stack.append([])
1804 self._prefix_map = {}
1805 self._preserve_space = [False]
1806 self._pending_start = None
1807 self._root_seen = False
1808 self._root_done = False
1809 self._ignored_depth = 0
1810
1811 def _iter_namespaces(self, ns_stack, _reversed=reversed):
1812 for namespaces in _reversed(ns_stack):
1813 if namespaces: # almost no element declares new namespaces
1814 yield from namespaces
1815
1816 def _resolve_prefix_name(self, prefixed_name):
1817 prefix, name = prefixed_name.split(':', 1)
1818 for uri, p in self._iter_namespaces(self._ns_stack):
1819 if p == prefix:
1820 return f'{{{uri}}}{name}'
1821 raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')
1822
1823 def _qname(self, qname, uri=None):
1824 if uri is None:
1825 uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
1826 else:
1827 tag = qname
1828
1829 prefixes_seen = set()
1830 for u, prefix in self._iter_namespaces(self._declared_ns_stack):
1831 if u == uri and prefix not in prefixes_seen:
1832 return f'{prefix}:{tag}' if prefix else tag, tag, uri
1833 prefixes_seen.add(prefix)
1834
1835 # Not declared yet => add new declaration.
1836 if self._rewrite_prefixes:
1837 if uri in self._prefix_map:
1838 prefix = self._prefix_map[uri]
1839 else:
1840 prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
1841 self._declared_ns_stack[-1].append((uri, prefix))
1842 return f'{prefix}:{tag}', tag, uri
1843
1844 if not uri and '' not in prefixes_seen:
1845 # No default namespace declared => no prefix needed.
1846 return tag, tag, uri
1847
1848 for u, prefix in self._iter_namespaces(self._ns_stack):
1849 if u == uri:
1850 self._declared_ns_stack[-1].append((uri, prefix))
1851 return f'{prefix}:{tag}' if prefix else tag, tag, uri
1852
1853 if not uri:
1854 # As soon as a default namespace is defined,
1855 # anything that has no namespace (and thus, no prefix) goes there.
1856 return tag, tag, uri
1857
1858 raise ValueError(f'Namespace "{uri}" is not declared in scope')
1859
1860 def data(self, data):
1861 if not self._ignored_depth:
1862 self._data.append(data)
1863
1864 def _flush(self, _join_text=''.join):
1865 data = _join_text(self._data)
1866 del self._data[:]
1867 if self._strip_text and not self._preserve_space[-1]:
1868 data = data.strip()
1869 if self._pending_start is not None:
1870 args, self._pending_start = self._pending_start, None
1871 qname_text = data if data and _looks_like_prefix_name(data) else None
1872 self._start(*args, qname_text)
1873 if qname_text is not None:
1874 return
1875 if data and self._root_seen:
1876 self._write(_escape_cdata_c14n(data))
1877
1878 def start_ns(self, prefix, uri):
1879 if self._ignored_depth:
1880 return
1881 # we may have to resolve qnames in text content
1882 if self._data:
1883 self._flush()
1884 self._ns_stack[-1].append((uri, prefix))
1885
1886 def start(self, tag, attrs):
1887 if self._exclude_tags is not None and (
1888 self._ignored_depth or tag in self._exclude_tags):
1889 self._ignored_depth += 1
1890 return
1891 if self._data:
1892 self._flush()
1893
1894 new_namespaces = []
1895 self._declared_ns_stack.append(new_namespaces)
1896
1897 if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
1898 # Need to parse text first to see if it requires a prefix declaration.
1899 self._pending_start = (tag, attrs, new_namespaces)
1900 return
1901 self._start(tag, attrs, new_namespaces)
1902
1903 def _start(self, tag, attrs, new_namespaces, qname_text=None):
1904 if self._exclude_attrs is not None and attrs:
1905 attrs = {k: v for k, v in attrs.items() if k not in self._exclude_attrs}
1906
1907 qnames = {tag, *attrs}
1908 resolved_names = {}
1909
1910 # Resolve prefixes in attribute and tag text.
1911 if qname_text is not None:
1912 qname = resolved_names[qname_text] = self._resolve_prefix_name(qname_text)
1913 qnames.add(qname)
1914 if self._find_qname_aware_attrs is not None and attrs:
1915 qattrs = self._find_qname_aware_attrs(attrs)
1916 if qattrs:
1917 for attr_name in qattrs:
1918 value = attrs[attr_name]
1919 if _looks_like_prefix_name(value):
1920 qname = resolved_names[value] = self._resolve_prefix_name(value)
1921 qnames.add(qname)
1922 else:
1923 qattrs = None
1924 else:
1925 qattrs = None
1926
1927 # Assign prefixes in lexicographical order of used URIs.
1928 parse_qname = self._qname
1929 parsed_qnames = {n: parse_qname(n) for n in sorted(
1930 qnames, key=lambda n: n.split('}', 1))}
1931
1932 # Write namespace declarations in prefix order ...
1933 if new_namespaces:
1934 attr_list = [
1935 ('xmlns:' + prefix if prefix else 'xmlns', uri)
1936 for uri, prefix in new_namespaces
1937 ]
1938 attr_list.sort()
1939 else:
1940 # almost always empty
1941 attr_list = []
1942
1943 # ... followed by attributes in URI+name order
1944 if attrs:
1945 for k, v in sorted(attrs.items()):
1946 if qattrs is not None and k in qattrs and v in resolved_names:
1947 v = parsed_qnames[resolved_names[v]][0]
1948 attr_qname, attr_name, uri = parsed_qnames[k]
1949 # No prefix for attributes in default ('') namespace.
1950 attr_list.append((attr_qname if uri else attr_name, v))
1951
1952 # Honour xml:space attributes.
1953 space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
1954 self._preserve_space.append(
1955 space_behaviour == 'preserve' if space_behaviour
1956 else self._preserve_space[-1])
1957
1958 # Write the tag.
1959 write = self._write
1960 write('<' + parsed_qnames[tag][0])
1961 if attr_list:
1962 write(''.join([f' {k}="{_escape_attrib_c14n(v)}"' for k, v in attr_list]))
1963 write('>')
1964
1965 # Write the resolved qname text content.
1966 if qname_text is not None:
1967 write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))
1968
1969 self._root_seen = True
1970 self._ns_stack.append([])
1971
def end(self, tag):
    """Handle the end of element *tag*.

    While ``_ignored_depth`` is non-zero, the call only decrements that
    counter and writes nothing.  Otherwise any buffered ``_data`` is
    flushed, the closing tag is written using the first item returned
    by ``_qname(tag)``, and one entry is popped from each of the
    ``_preserve_space``, ``_declared_ns_stack`` and ``_ns_stack`` stacks.
    """
    if self._ignored_depth:
        self._ignored_depth -= 1
    else:
        if self._data:
            self._flush()
        emit = self._write
        closing_name = self._qname(tag)[0]
        emit(f'</{closing_name}>')

        space_stack = self._preserve_space
        space_stack.pop()
        # _root_done becomes true once a single entry is left on the
        # xml:space stack after popping this element's entry.
        self._root_done = (len(space_stack) == 1)

        self._declared_ns_stack.pop()
        self._ns_stack.pop()
1983
def comment(self, text):
    """Write a comment as ``<!--text-->`` with *text* escaped.

    Nothing is written when ``_with_comments`` is false or while
    ``_ignored_depth`` is non-zero.  When ``_root_done`` is set, a
    newline is written before the comment; when ``_root_seen`` is not
    yet set, a newline is written after it.  Between those two states,
    buffered ``_data`` is flushed first.
    """
    if not self._with_comments or self._ignored_depth:
        return

    emit = self._write
    if self._root_done:
        emit('\n')
    elif self._root_seen and self._data:
        self._flush()

    escaped = _escape_cdata_c14n(text)
    emit(f'<!--{escaped}-->')

    if not self._root_seen:
        emit('\n')
1996
def pi(self, target, data):
    """Write a processing instruction.

    The output is ``<?target data?>`` with *data* escaped, or
    ``<?target?>`` when *data* is empty or None.  Nothing is written
    while ``_ignored_depth`` is non-zero.  When ``_root_done`` is set,
    a newline is written before the instruction; when ``_root_seen``
    is not yet set, a newline is written after it.  Between those two
    states, buffered ``_data`` is flushed first.
    """
    if self._ignored_depth:
        return

    emit = self._write
    if self._root_done:
        emit('\n')
    elif self._root_seen and self._data:
        self._flush()

    # Only escape (and emit the separating space) when there is data.
    if data:
        node = f'<?{target} {_escape_cdata_c14n(data)}?>'
    else:
        node = f'<?{target}?>'
    emit(node)

    if not self._root_seen:
        emit('\n')
2008
2009
2010 def _escape_cdata_c14n(text):
2011 # escape character data
2012 try:
2013 # it's worth avoiding do-nothing calls for strings that are
2014 # shorter than 500 character, or so. assume that's, by far,
2015 # the most common case in most applications.
2016 if '&' in text:
2017 text = text.replace('&', '&')
2018 if '<' in text:
2019 text = text.replace('<', '<')
2020 if '>' in text:
2021 text = text.replace('>', '>')
2022 if '\r' in text:
2023 text = text.replace('\r', '
')
2024 return text
2025 except (TypeError, AttributeError):
2026 _raise_serialization_error(text)
2027
2028
2029 def _escape_attrib_c14n(text):
2030 # escape attribute value
2031 try:
2032 if '&' in text:
2033 text = text.replace('&', '&')
2034 if '<' in text:
2035 text = text.replace('<', '<')
2036 if '"' in text:
2037 text = text.replace('"', '"')
2038 if '\t' in text:
2039 text = text.replace('\t', '	')
2040 if '\n' in text:
2041 text = text.replace('\n', '
')
2042 if '\r' in text:
2043 text = text.replace('\r', '
')
2044 return text
2045 except (TypeError, AttributeError):
2046 _raise_serialization_error(text)
2047
2048
2049 # --------------------------------------------------------------------
2050
2051 # Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests)
    _Element_Py = Element

    # Names expected from the star import:
    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
    from _elementtree import *
    # Imported explicitly as well; presumably the star import skips this
    # underscore-prefixed name -- TODO confirm against _elementtree.
    from _elementtree import _set_factories
except ImportError:
    # The C accelerator could not be imported: nothing is shadowed and the
    # pure-Python definitions in this module stay in effect.
    pass
else:
    # Runs only when both imports succeeded.  Presumably this lets the C
    # implementation use this module's Comment and ProcessingInstruction
    # factories -- TODO confirm against _elementtree.
    _set_factories(Comment, ProcessingInstruction)