1 """Lightweight XML support for Python.
2
3 XML is an inherently hierarchical data format, and the most natural way to
4 represent it is with a tree. This module has two classes for this purpose:
5
6 1. ElementTree represents the whole XML document as a tree and
7
8 2. Element represents a single node in this tree.
9
10 Interactions with the whole document (reading and writing to/from files) are
11 usually done on the ElementTree level. Interactions with a single XML element
12 and its sub-elements are done on the Element level.
13
14 Element is a flexible container object designed to store hierarchical data
15 structures in memory. It can be described as a cross between a list and a
16 dictionary. Each Element has a number of properties associated with it:
17
18 'tag' - a string containing the element's name.
19
20 'attributes' - a Python dictionary storing the element's attributes.
21
22 'text' - a string containing the element's text content.
23
24 'tail' - an optional string containing text after the element's end tag.
25
26 And a number of child elements stored in a Python sequence.
27
28 To create an element instance, use the Element constructor,
29 or the SubElement factory function.
30
31 You can also use the ElementTree class to wrap an element structure
32 and convert it to and from XML.
33
34 """
35
36 #---------------------------------------------------------------------
37 # Licensed to PSF under a Contributor Agreement.
38 # See https://www.python.org/psf/license for licensing details.
39 #
40 # ElementTree
41 # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
42 #
43 # fredrik@pythonware.com
44 # http://www.pythonware.com
45 # --------------------------------------------------------------------
46 # The ElementTree toolkit is
47 #
48 # Copyright (c) 1999-2008 by Fredrik Lundh
49 #
50 # By obtaining, using, and/or copying this software and/or its
51 # associated documentation, you agree that you have read, understood,
52 # and will comply with the following terms and conditions:
53 #
54 # Permission to use, copy, modify, and distribute this software and
55 # its associated documentation for any purpose and without fee is
56 # hereby granted, provided that the above copyright notice appears in
57 # all copies, and that both that copyright notice and this permission
58 # notice appear in supporting documentation, and that the name of
59 # Secret Labs AB or the author not be used in advertising or publicity
60 # pertaining to distribution of the software without specific, written
61 # prior permission.
62 #
63 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
64 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
65 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
66 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
67 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
68 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
69 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
70 # OF THIS SOFTWARE.
71 # --------------------------------------------------------------------
72
# Public names exported by a star-import of this module.
__all__ = [
    "Comment",
    "dump",
    "Element",
    "ElementTree",
    "fromstring",
    "fromstringlist",
    "indent",
    "iselement",
    "iterparse",
    "parse",
    "ParseError",
    "PI",
    "ProcessingInstruction",
    "QName",
    "SubElement",
    "tostring",
    "tostringlist",
    "TreeBuilder",
    "VERSION",
    "XML",
    "XMLID",
    "XMLParser",
    "XMLPullParser",
    "register_namespace",
    "canonicalize",
    "C14NWriterTarget",
]

# Version string exposed through the public "VERSION" name.
VERSION = "1.3.0"
94
95 import sys
96 import re
97 import warnings
98 import io
99 import collections
100 import collections.abc
101 import contextlib
102
103 from . import ElementPath
104
105
class ParseError(SyntaxError):
    """Raised when an XML document cannot be parsed.

    Besides its exception value, a ParseError is documented to carry
    two extra attributes:
        'code' - the specific exception code
        'position' - the line and column of the error

    The class itself adds nothing to SyntaxError; the attributes are
    attached by whoever raises the error.

    """
116
117 # --------------------------------------------------------------------
118
119
def iselement(element):
    """Return True if *element* appears to be an Element.

    The check is duck-typed: any object that exposes a 'tag' attribute
    qualifies.

    """
    missing = object()
    return getattr(element, 'tag', missing) is not missing
123
124
class Element:
    """An XML element.

    This class is the reference implementation of the Element interface.

    An element's length is its number of subelements.  That means if you
    want to check if an element is truly empty, you should check BOTH
    its length AND its text attribute.

    The element tag, attribute names, and attribute values can be either
    bytes or strings.

    *tag* is the element name.  *attrib* is an optional dictionary containing
    element attributes.  *extra* are additional element attributes given as
    keyword arguments.

    Example form:
        <tag attrib>text<child/>...</tag>tail

    """

    tag = None
    """The element's name."""

    attrib = None
    """Dictionary of the element's attributes."""

    text = None
    """
    Text before first subelement.  This is either a string or the value None.
    Note that if there is no text, this attribute may be either
    None or the empty string, depending on the parser.

    """

    tail = None
    """
    Text after this element's end tag, but before the next sibling element's
    start tag.  This is either a string or the value None.  Note that if there
    was no text, this attribute may be either None or an empty string,
    depending on the parser.

    """

    def __init__(self, tag, attrib={}, **extra):
        """Create an element named *tag*.

        *attrib* must be a dict (TypeError otherwise); its entries and the
        *extra* keyword arguments are merged into a new attribute dict.

        """
        if not isinstance(attrib, dict):
            raise TypeError("attrib must be dict, not %s" % (
                attrib.__class__.__name__,))
        self.tag = tag
        # A fresh dict is built, so neither the shared default argument nor
        # the caller's dictionary is ever mutated through this element.
        self.attrib = {**attrib, **extra}
        self._children = []

    def __repr__(self):
        return "<%s %r at %#x>" % (self.__class__.__name__, self.tag, id(self))

    def makeelement(self, tag, attrib):
        """Create a new element with the same type.

        *tag* is a string containing the element name.
        *attrib* is a dictionary containing the element attributes.

        Do not call this method, use the SubElement factory function instead.

        """
        return self.__class__(tag, attrib)

    def copy(self):
        """Return copy of current element.

        This creates a shallow copy.  Subelements will be shared with the
        original tree.

        """
        warnings.warn(
            "elem.copy() is deprecated. Use copy.copy(elem) instead.",
            DeprecationWarning,
            # Attribute the warning to the code calling copy(), not to
            # this method.
            stacklevel=2
        )
        return self.__copy__()

    def __copy__(self):
        # Shallow copy: same tag/text/tail, a new attribute dict (built by
        # the constructor), and the very same child objects.
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        return len(self._children)

    def __bool__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
        )
        return len(self._children) != 0 # emulate old behaviour, for now

    def __getitem__(self, index):
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the child (or slice of children) at *index*.

        Every new child must be an Element; TypeError is raised otherwise
        and the existing children are left untouched.

        """
        if isinstance(index, slice):
            # Materialize first: validating a one-shot iterator would
            # exhaust it, and the assignment below would then store nothing.
            element = list(element)
            for elt in element:
                self._assert_is_element(elt)
        else:
            self._assert_is_element(element)
        self._children[index] = element

    def __delitem__(self, index):
        del self._children[index]

    def append(self, subelement):
        """Add *subelement* to the end of this element.

        The new element will appear in document order after the last existing
        subelement (or directly after the text, if it's the first subelement),
        but before the end tag for this element.

        """
        self._assert_is_element(subelement)
        self._children.append(subelement)

    def extend(self, elements):
        """Append subelements from a sequence.

        *elements* is a sequence with zero or more elements.  Items are
        checked and appended one at a time, so a TypeError raised for a
        later item leaves the earlier ones appended.

        """
        for element in elements:
            self._assert_is_element(element)
            self._children.append(element)

    def insert(self, index, subelement):
        """Insert *subelement* at position *index*."""
        self._assert_is_element(subelement)
        self._children.insert(index, subelement)

    def _assert_is_element(self, e):
        # Need to refer to the actual Python implementation, not the
        # shadowing C implementation.
        if not isinstance(e, _Element_Py):
            raise TypeError('expected an Element, not %s' % type(e).__name__)

    def remove(self, subelement):
        """Remove matching subelement.

        The call is delegated to list.remove() on the child list.  To remove
        subelements by other means, the easiest way is to use a list
        comprehension to select what elements to keep, and then use slice
        assignment to update the parent element.

        ValueError is raised if a matching element could not be found.

        """
        self._children.remove(subelement)

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return text content of first matching element, or default value if
        none was found.  Note that if an element is found having no text
        content, the empty string is returned.

        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Returns list containing all matching elements in document order.

        """
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Reset element.

        This function removes all subelements, clears all attributes, and sets
        the text and tail attributes to None.

        """
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Get element attribute.

        Equivalent to attrib.get.  *key* is what attribute to look for, and
        *default* is what to return if the attribute was not found.

        Return the attribute value, or the default if the attribute was
        not found.

        """
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set element attribute.

        Equivalent to attrib[key] = value.  *key* is what attribute to set,
        and *value* is the attribute value to set it to.

        """
        self.attrib[key] = value

    def keys(self):
        """Get the attribute names.

        Equivalent to attrib.keys(): the result is a dictionary view, not
        a list.

        """
        return self.attrib.keys()

    def items(self):
        """Get element attributes as (name, value) pairs.

        Equivalent to attrib.items(): the result is a dictionary view, not
        a list.

        """
        return self.attrib.items()

    def iter(self, tag=None):
        """Create tree iterator.

        The iterator loops over the element and all subelements in document
        order, returning all elements with a matching tag.

        If the tree structure is modified during iteration, new or removed
        elements may or may not be included.  To get a stable set, use the
        list() function on the iterator, and loop over the resulting list.

        *tag* is what tags to look for (default is to return all elements);
        "*" is treated like None.

        Return an iterator containing all the matching elements.

        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for e in self._children:
            yield from e.iter(tag)

    def itertext(self):
        """Create text iterator.

        The iterator loops over the element and all subelements in document
        order, returning all inner text.  Elements whose tag is neither a
        string nor None yield nothing.

        """
        tag = self.tag
        if not isinstance(tag, str) and tag is not None:
            return
        t = self.text
        if t:
            yield t
        for e in self:
            yield from e.itertext()
            t = e.tail
            if t:
                yield t
424
425
def SubElement(parent, tag, attrib={}, **extra):
    """Create a child element and attach it to *parent*.

    The element tag, attribute names, and attribute values can be either
    bytes or Unicode strings.

    *parent* is the parent element, *tag* is the subelement's name, *attrib*
    is an optional dictionary containing element attributes, *extra* are
    additional attributes given as keyword arguments.

    The child is built with parent.makeelement(), appended to *parent*,
    and returned.

    """
    child = parent.makeelement(tag, {**attrib, **extra})
    parent.append(child)
    return child
442
443
def Comment(text=None):
    """Comment element factory.

    Build a special element, tagged with this function itself, which the
    standard serializer writes out as an XML comment.

    *text* is a string containing the comment string.

    """
    node = Element(Comment)
    node.text = text
    return node
456
457
def ProcessingInstruction(target, text=None):
    """Processing Instruction element factory.

    Build a special element, tagged with this function itself, which the
    XML serializer in this module writes out as "<?...?>".

    *target* is a string containing the processing instruction, *text* is a
    string containing the processing instruction contents, if any.  When
    *text* is given, the element text is "<target> <text>".

    """
    node = Element(ProcessingInstruction)
    if text:
        node.text = target + " " + text
    else:
        node.text = target
    return node

# Short alias for ProcessingInstruction.
PI = ProcessingInstruction
475
476
class QName:
    """Qualified name wrapper.

    This class can be used to wrap a QName attribute value in order to get
    proper namespace handling on output.

    *text_or_uri* is a string containing the QName value either in the form
    {uri}local, or if the tag argument is given, the URI part of a QName.

    *tag* is an optional argument which if given, will make the first
    argument (text_or_uri) be interpreted as a URI, and this argument (tag)
    be interpreted as a local name.

    Comparisons and hashing operate on the stored text; another QName is
    compared by its text, anything else is compared to the text directly.

    """

    def __init__(self, text_or_uri, tag=None):
        self.text = "{%s}%s" % (text_or_uri, tag) if tag else text_or_uri

    def __str__(self):
        return self.text

    def __repr__(self):
        return '<%s %r>' % (self.__class__.__name__, self.text)

    def __hash__(self):
        return hash(self.text)

    def __le__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text <= rhs

    def __lt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text < rhs

    def __ge__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text >= rhs

    def __gt__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text > rhs

    def __eq__(self, other):
        rhs = other.text if isinstance(other, QName) else other
        return self.text == rhs
521
522 # --------------------------------------------------------------------
523
524
class ElementTree:
    """An XML element hierarchy.

    This class also provides support for serialization to and from
    standard XML.

    *element* is an optional root element node,
    *file* is an optional file handle or file name of an XML file whose
    contents will be used to initialize the tree with.

    """
    def __init__(self, element=None, file=None):
        self._root = element # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace root element of this tree.

        This will discard the current contents of the tree and replace it
        with the given element.  Use with care!

        """
        self._root = element

    def parse(self, source, parser=None):
        """Load external XML document into element tree.

        *source* is a file name or file object, *parser* is an optional parser
        instance that defaults to XMLParser.

        ParseError is raised if the parser fails to parse the document.

        Returns the root element of the given source document.

        """
        close_source = False
        if not hasattr(source, "read"):
            # Anything without read() is treated as a file name; a file
            # opened here is closed again in the finally clause.
            source = open(source, "rb")
            close_source = True
        try:
            if parser is None:
                # If no parser was specified, create a default XMLParser
                parser = XMLParser()
                if hasattr(parser, '_parse_whole'):
                    # The default XMLParser, when it comes from an accelerator,
                    # can define an internal _parse_whole API for efficiency.
                    # It can be used to parse the whole source without feeding
                    # it with chunks.
                    self._root = parser._parse_whole(source)
                    return self._root
            while True:
                data = source.read(65536)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if close_source:
                source.close()

    def iter(self, tag=None):
        """Create and return tree iterator for the root element.

        The iterator loops over all elements in this tree, in document order.

        *tag* is a string with the tag name to iterate over
        (default is to return all elements).

        """
        return self._root.iter(tag)

    @staticmethod
    def _relative_path(path):
        """Return *path* with a leading "/" rewritten to "./".

        Shared by find(), findtext(), findall() and iterfind().  A
        FutureWarning is issued when the path is rewritten; stacklevel=3
        attributes it to the caller of the public method (one frame for this
        helper, one for the public method).

        """
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=3
            )
        return path

    def find(self, path, namespaces=None):
        """Find first matching element by tag name or path.

        Same as getroot().find(path), which is Element.find()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return the first matching element, or None if no element was found.

        """
        path = self._relative_path(path)
        return self._root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Find text for first matching element by tag name or path.

        Same as getroot().findtext(path), which is Element.findtext()

        *path* is a string having either an element tag or an XPath,
        *default* is the value to return if the element was not found,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return whatever the root element's findtext() returns for the
        (possibly rewritten) path.

        """
        path = self._relative_path(path)
        return self._root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().findall(path), which is Element.findall().

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return list containing all matching elements in document order.

        """
        path = self._relative_path(path)
        return self._root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Find all matching subelements by tag name or path.

        Same as getroot().iterfind(path), which is element.iterfind()

        *path* is a string having either an element tag or an XPath,
        *namespaces* is an optional mapping from namespace prefix to full name.

        Return an iterable yielding all matching elements in document order.

        """
        path = self._relative_path(path)
        return self._root.iterfind(path, namespaces)

    def write(self, file_or_filename,
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None, *,
              short_empty_elements=True):
        """Write element tree to a file as XML.

        Arguments:
          *file_or_filename* -- file name or a file object opened for writing

          *encoding* -- the output encoding (default: US-ASCII, or UTF-8
                        when method is "c14n")

          *xml_declaration* -- bool indicating if an XML declaration should be
                               added to the output. If None, an XML declaration
                               is added if encoding IS NOT either of:
                               US-ASCII, UTF-8, or Unicode

          *default_namespace* -- sets the default XML namespace (for "xmlns")

          *method* -- either "xml" (default), "html", "text", or "c14n"

          *short_empty_elements* -- controls the formatting of elements
                                    that contain no content. If True (default)
                                    they are emitted as a single self-closed
                                    tag, otherwise they are emitted as a pair
                                    of start/end tags

        ValueError is raised for an unknown *method*.

        """
        if not method:
            method = "xml"
        elif method not in _serialize:
            raise ValueError("unknown method %r" % method)
        if not encoding:
            if method == "c14n":
                encoding = "utf-8"
            else:
                encoding = "us-ascii"
        with _get_writer(file_or_filename, encoding) as (write, declared_encoding):
            # The declaration is only ever written for the "xml" method:
            # always when explicitly requested, and by default only for
            # encodings other than unicode / utf-8 / us-ascii.
            if method == "xml" and (xml_declaration or
                    (xml_declaration is None and
                     encoding.lower() != "unicode" and
                     declared_encoding.lower() not in ("utf-8", "us-ascii"))):
                write("<?xml version='1.0' encoding='%s'?>\n" % (
                    declared_encoding,))
            if method == "text":
                _serialize_text(write, self._root)
            else:
                qnames, namespaces = _namespaces(self._root, default_namespace)
                serialize = _serialize[method]
                serialize(write, self._root, qnames, namespaces,
                          short_empty_elements=short_empty_elements)

    def write_c14n(self, file):
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")
749
750 # --------------------------------------------------------------------
751 # serialization support
752
@contextlib.contextmanager
def _get_writer(file_or_filename, encoding):
    """Yield a (text write callable, declared encoding) pair.

    *file_or_filename* is either an object with a write() method or
    something to be passed to open().  *encoding* is the requested output
    encoding; "unicode" (case-insensitive) means the target accepts text.
    Whatever is opened or wrapped here is released when the context exits.

    """
    try:
        sink_write = file_or_filename.write
    except AttributeError:
        # No write(): treat the argument as a file name.  There is no
        # "unicode" codec for a file on disk, so fall back to utf-8.
        if encoding.lower() == "unicode":
            encoding = "utf-8"
        with open(file_or_filename, "w", encoding=encoding,
                  errors="xmlcharrefreplace") as out:
            yield out.write, encoding
    else:
        # A file-like object; the encoding tells text and binary writers
        # apart.
        if encoding.lower() == "unicode":
            # Text writer: use it untouched and report its own encoding
            # when it has one.
            declared = getattr(file_or_filename, "encoding", None)
            yield sink_write, declared or "utf-8"
            return
        # Binary writer: layer a TextIOWrapper on top of it.
        with contextlib.ExitStack() as cleanup:
            if isinstance(file_or_filename, io.BufferedIOBase):
                binary = file_or_filename
            elif isinstance(file_or_filename, io.RawIOBase):
                binary = io.BufferedWriter(file_or_filename)
                # Detach on exit so the caller's raw file stays open once
                # the BufferedWriter goes away.
                cleanup.callback(binary.detach)
            else:
                # Not part of the IOBase hierarchy, merely has write():
                # build a minimal buffered facade around it.
                binary = io.BufferedIOBase()
                binary.writable = lambda: True
                binary.write = sink_write
                try:
                    # TextIOWrapper consults these to decide whether a
                    # BOM (for UTF-16, etc) should be emitted.
                    binary.seekable = file_or_filename.seekable
                    binary.tell = file_or_filename.tell
                except AttributeError:
                    pass
            text = io.TextIOWrapper(binary,
                                    encoding=encoding,
                                    errors="xmlcharrefreplace",
                                    newline="\n")
            # Detach on exit so the underlying binary object stays open
            # once the TextIOWrapper goes away.
            cleanup.callback(text.detach)
            yield text.write, encoding
802
def _namespaces(elem, default_namespace=None):
    """Collect the qualified names and namespaces used below *elem*.

    Return a (qnames, namespaces) pair: *qnames* maps each name found in
    the tree to its serialized "prefix:local" form, *namespaces* maps
    namespace URIs to prefixes.  When *default_namespace* is given it is
    registered with an empty prefix, and ValueError is raised for any
    unqualified name.

    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def add_qname(qname):
        # work out (and record) the serialized form of one qname
        try:
            if qname[:1] != "{":
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = qname
                return
            uri, local = qname[1:].rsplit("}", 1)
            prefix = namespaces.get(uri)
            if prefix is None:
                # Prefer a registered prefix, otherwise invent "nsN".
                prefix = _namespace_map.get(uri)
                if prefix is None:
                    prefix = "ns%d" % len(namespaces)
                if prefix != "xml":
                    namespaces[uri] = prefix
            if prefix:
                qnames[qname] = "%s:%s" % (prefix, local)
            else:
                qnames[qname] = local # default element
        except TypeError:
            _raise_serialization_error(qname)

    # walk the tree, filling in the qname and namespace tables
    for node in elem.iter():
        tag = node.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, str):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in node.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = node.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
863
def _serialize_xml(write, elem, qnames, namespaces,
                   short_empty_elements, **kwargs):
    """Write *elem* and its subtree as XML through *write*.

    *qnames* maps tag and attribute names to their serialized form.
    *namespaces* (URI -> prefix) is declared on this element only;
    children are serialized with None.  *short_empty_elements* selects
    "<tag />" over "<tag></tag>" for elements without text or children.

    """
    raw_tag = elem.tag
    text = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % text)
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % text)
    else:
        name = qnames[raw_tag]
        if name is None:
            # No tag of its own: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_xml(write, child, qnames, None,
                               short_empty_elements=short_empty_elements)
        else:
            write("<" + name)
            attrs = list(elem.items())
            if namespaces:
                by_prefix = sorted(namespaces.items(),
                                   key=lambda pair: pair[1])  # sort on prefix
                for uri, prefix in by_prefix:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attrs:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib(value)
                write(" %s=\"%s\"" % (qnames[key], value))
            if text or len(elem) or not short_empty_elements:
                write(">")
                if text:
                    write(_escape_cdata(text))
                for child in elem:
                    _serialize_xml(write, child, qnames, None,
                                   short_empty_elements=short_empty_elements)
                write("</" + name + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail))
913
# Lower-cased tag names for which the HTML serializer writes no end tag.
HTML_EMPTY = {
    "area",
    "base",
    "basefont",
    "br",
    "col",
    "embed",
    "frame",
    "hr",
    "img",
    "input",
    "isindex",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
}
917
def _serialize_html(write, elem, qnames, namespaces, **kwargs):
    """Write *elem* and its subtree as HTML through *write*.

    *qnames* maps tag and attribute names to their serialized form.
    *namespaces* (URI -> prefix) is declared on this element only;
    children are serialized with None.  Text inside "script" and "style"
    is written unescaped, and tags listed in HTML_EMPTY get no end tag.

    """
    raw_tag = elem.tag
    text = elem.text
    if raw_tag is Comment:
        write("<!--%s-->" % _escape_cdata(text))
    elif raw_tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text))
    else:
        name = qnames[raw_tag]
        if name is None:
            # No tag of its own: emit only the text and the children.
            if text:
                write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
        else:
            write("<" + name)
            attrs = list(elem.items())
            if namespaces:
                by_prefix = sorted(namespaces.items(),
                                   key=lambda pair: pair[1])  # sort on prefix
                for uri, prefix in by_prefix:
                    if prefix:
                        prefix = ":" + prefix
                    write(" xmlns%s=\"%s\"" % (
                        prefix,
                        _escape_attrib(uri)
                        ))
            for key, value in attrs:
                if isinstance(key, QName):
                    key = key.text
                if isinstance(value, QName):
                    value = qnames[value.text]
                else:
                    value = _escape_attrib_html(value)
                # FIXME: handle boolean attributes
                write(" %s=\"%s\"" % (qnames[key], value))
            write(">")
            lowered = name.lower()
            if text:
                if lowered in ("script", "style"):
                    write(text)
                else:
                    write(_escape_cdata(text))
            for child in elem:
                _serialize_html(write, child, qnames, None)
            if lowered not in HTML_EMPTY:
                write("</" + name + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail))
967
def _serialize_text(write, elem):
    """Write the text content of *elem*, then its tail, through *write*.

    Every string produced by elem.itertext() is written as is; the tail
    is written only when it is non-empty.

    """
    for chunk in elem.itertext():
        write(chunk)
    trailing = elem.tail
    if trailing:
        write(trailing)
973
# Serializer lookup table, keyed by output method name.
_serialize = dict(
    xml=_serialize_xml,
    html=_serialize_html,
    text=_serialize_text,
# this optional method is imported at the end of the module
#   c14n=_serialize_c14n,
)
981
982
def register_namespace(prefix, uri):
    """Register a namespace prefix.

    The registry is global, and any existing mapping for either the
    given prefix or the namespace URI will be removed.

    *prefix* is the namespace prefix, *uri* is a namespace uri. Tags and
    attributes in this namespace will be serialized with prefix if possible.

    ValueError is raised if *prefix* has the reserved form "ns<digits>".

    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Collect the clashing entries first so the registry is not modified
    # while it is being scanned.
    stale = [known_uri
             for known_uri, known_prefix in _namespace_map.items()
             if known_uri == uri or known_prefix == prefix]
    for known_uri in stale:
        del _namespace_map[known_uri]
    _namespace_map[uri] = prefix

# Global registry: namespace URI -> preferred prefix.
_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
# Expose the registry on the function, for tests and troubleshooting.
register_namespace._namespace_map = _namespace_map
1016
def _raise_serialization_error(text):
    """Raise TypeError naming the value (and its type) that cannot be serialized."""
    type_name = type(text).__name__
    message = "cannot serialize %r (type %s)" % (text, type_name)
    raise TypeError(message)
1021
def _escape_cdata(text):
    """Return *text* with XML character-data markup characters escaped.

    "&", "<" and ">" are replaced by the entity references "&amp;",
    "&lt;" and "&gt;".  A value that does not support the string
    operations used here is reported through _raise_serialization_error().

    """
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 characters, or so.  assume that's, by far,
        # the most common case in most applications.
        # "&" must be handled first so the ampersands introduced by the
        # other replacements are not escaped a second time.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1037
def _escape_attrib(text):
    """Return *text* escaped for use inside a double-quoted XML attribute.

    ``&``, ``<``, ``>`` and ``"`` become entity references; carriage
    return, line feed and tab become numeric character references.

    *text* must be a string; for any value on which the string operations
    fail with TypeError or AttributeError, _raise_serialization_error()
    is called with that value.
    """
    # escape attribute value
    try:
        # "&" has to be handled first: every later replacement introduces
        # an ampersand that must not be escaped again.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        # Although section 2.11 of the XML specification states that CR or
        # CR LF should be replaced with just LF, it applies only to EOLs
        # which take part in organizing the file into lines.  Within
        # attributes, we are replacing these with entity numbers, so they
        # do not count.
        # http://www.w3.org/TR/REC-xml/#sec-line-ends
        # The current solution, contained in following six lines, was
        # discussed in issue 17582 and 39011.
        if "\r" in text:
            text = text.replace("\r", "&#13;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        if "\t" in text:
            text = text.replace("\t", "&#09;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1065
def _escape_attrib_html(text):
    """Return *text* escaped for use inside a double-quoted HTML attribute.

    Only ``&``, ``>`` and ``"`` are replaced; ``<`` is left as it is.

    *text* must be a string; for any value on which the string operations
    fail with TypeError or AttributeError, _raise_serialization_error()
    is called with that value.
    """
    # escape attribute value
    try:
        # "&" has to be handled first: the later replacements introduce
        # ampersands that must not be escaped again.
        if "&" in text:
            text = text.replace("&", "&amp;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1078
1079 # --------------------------------------------------------------------
1080
def tostring(element, encoding=None, method=None, *,
             xml_declaration=None, default_namespace=None,
             short_empty_elements=True):
    """Generate string representation of XML element.

    All subelements are included.  A str is returned when *encoding* is
    "unicode"; for every other value a bytestring is returned.

    *element* is an Element instance, *encoding* is an optional output
    encoding defaulting to US-ASCII, *method* is an optional output method
    which can be one of "xml" (default), "html", "text" or "c14n",
    *default_namespace* sets the default XML namespace (for "xmlns").
    *xml_declaration* and *short_empty_elements* are handed unchanged to
    ElementTree.write().

    Returns an (optionally) encoded string containing the XML data.

    """
    # A text buffer is only appropriate when no encoding step takes place.
    if encoding == 'unicode':
        buffer = io.StringIO()
    else:
        buffer = io.BytesIO()
    document = ElementTree(element)
    document.write(buffer, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return buffer.getvalue()
1104
class _ListDataStream(io.BufferedIOBase):
    """An auxiliary stream accumulating into a list reference."""

    def __init__(self, lst):
        # *lst* is kept by reference: each object passed to write() is
        # appended to the caller's own list.
        self.lst = lst

    def writable(self):
        return True

    def seekable(self):
        return True

    def write(self, b):
        self.lst.append(b)

    def tell(self):
        # The reported position is the number of chunks written so far,
        # not a byte offset.
        return len(self.lst)
1121
def tostringlist(element, encoding=None, method=None, *,
                 xml_declaration=None, default_namespace=None,
                 short_empty_elements=True):
    """Serialize *element* and return the written chunks as a list.

    The arguments are the same as for tostring() and are handed to
    ElementTree.write() in the same way.  Instead of one joined value, the
    list of objects that the writer passed to the stream is returned.
    """
    chunks = []
    sink = _ListDataStream(chunks)
    document = ElementTree(element)
    document.write(sink, encoding,
                   xml_declaration=xml_declaration,
                   default_namespace=default_namespace,
                   method=method,
                   short_empty_elements=short_empty_elements)
    return chunks
1133
1134
def dump(elem):
    """Write element tree or element structure to sys.stdout.

    Intended as a debugging aid only.

    *elem* is either an ElementTree, or a single Element.  The exact output
    format is implementation dependent.  In this version, it's written as an
    ordinary XML file, and a newline is added unless the root element's tail
    already ends with one.

    """
    # debugging
    tree = elem if isinstance(elem, ElementTree) else ElementTree(elem)
    tree.write(sys.stdout, encoding="unicode")
    root_tail = tree.getroot().tail
    if not (root_tail and root_tail.endswith("\n")):
        sys.stdout.write("\n")
1152
1153
def indent(tree, space="  ", level=0):
    """Indent an XML document by inserting newlines and indentation space
    after elements.

    *tree* is the ElementTree or Element to modify.  The (root) element
    itself will not be changed, but the tail text of all elements in its
    subtree will be adapted.

    *space* is the whitespace to insert for each indentation level, two
    space characters by default.

    *level* is the initial indentation level.  Setting this to a higher
    value than 0 can be used for indenting subtrees that are more deeply
    nested inside of a document.

    ValueError is raised if *level* is negative.  Text and tail values
    that contain anything other than whitespace are left untouched.
    """
    if isinstance(tree, ElementTree):
        tree = tree.getroot()
    if level < 0:
        raise ValueError(f"Initial indentation level must be >= 0, got {level}")
    if not len(tree):
        # Nothing to do for an element without children.
        return

    # Reduce the memory consumption by reusing indentation strings.
    # Entry i holds the indentation for nesting depth i below *tree*;
    # entry 0 already includes the initial *level*.
    indentations = ["\n" + level * space]

    def _indent_children(elem, level):
        # Start a new indentation level for the first child.
        child_level = level + 1
        try:
            child_indentation = indentations[child_level]
        except IndexError:
            child_indentation = indentations[level] + space
            indentations.append(child_indentation)

        if not elem.text or not elem.text.strip():
            elem.text = child_indentation

        for child in elem:
            if len(child):
                _indent_children(child, child_level)
            if not child.tail or not child.tail.strip():
                child.tail = child_indentation

        # Dedent after the last child by overwriting the previous indentation.
        # (This helper is only called for elements that have children, so
        # *child* is always bound here.)
        if not child.tail.strip():
            child.tail = indentations[level]

    _indent_children(tree, 0)
1202
1203
1204 # --------------------------------------------------------------------
1205 # parsing
1206
1207
def parse(source, parser=None):
    """Parse XML document into element tree.

    *source* is a filename or file object containing XML data,
    *parser* is an optional parser instance defaulting to XMLParser.

    A fresh ElementTree is created, loaded from *source* and returned.

    """
    document = ElementTree()
    document.parse(source, parser)
    return document
1220
1221
def iterparse(source, events=None, parser=None):
    """Incrementally parse XML document into ElementTree.

    This function also reports what's going on to the user based on the
    *events* it is called with.  The supported events are the strings
    "start", "end", "start-ns" and "end-ns" (the "ns" events are used to get
    detailed namespace information).  If *events* is omitted, only
    "end" events are reported.

    *source* is a filename or file object containing XML data, *events* is
    a list of events to report back, *parser* is an optional parser instance.

    Returns an iterator providing (event, elem) pairs.  The iterator has a
    ``root`` attribute which is None until *source* has been read to the
    end, and the root element afterwards.

    """
    # Use the internal, undocumented _parser argument for now; When the
    # parser argument of iterparse is removed, this can be killed.
    pullparser = XMLPullParser(events=events, _parser=parser)

    def iterator(source):
        close_source = False
        try:
            if not hasattr(source, "read"):
                source = open(source, "rb")
                close_source = True
            # Dummy first item: it is consumed by the next(it) call below,
            # so that the file is opened (and any error from open() is
            # raised) before iterparse() returns.
            yield None
            while True:
                yield from pullparser.read_events()
                # load event buffer
                data = source.read(16 * 1024)
                if not data:
                    break
                pullparser.feed(data)
            root = pullparser._close_and_return_root()
            yield from pullparser.read_events()
            it.root = root
        finally:
            # Only close what was opened here, never a caller's file object.
            if close_source:
                source.close()

    class IterParseIterator(collections.abc.Iterator):
        __next__ = iterator(source).__next__
    it = IterParseIterator()
    it.root = None
    del iterator, IterParseIterator

    next(it)
    return it
1270
1271
class XMLPullParser:
    """Parser whose results are pulled by the caller instead of pushed.

    Data is supplied with feed(); the (event, elem) pairs produced so far
    are collected in an internal queue and retrieved with read_events().

    *events* is a sequence of events to report back; if omitted, only
    "end" events are reported.
    """

    def __init__(self, events=None, *, _parser=None):
        # The _parser argument is for internal use only and must not be relied
        # upon in user code. It will be removed in a future release.
        # See https://bugs.python.org/issue17741 for more details.

        self._events_queue = collections.deque()
        self._parser = _parser or XMLParser(target=TreeBuilder())
        # wire up the parser for event reporting
        if events is None:
            events = ("end",)
        self._parser._setevents(self._events_queue, events)

    def feed(self, data):
        """Feed encoded data to parser."""
        if self._parser is None:
            raise ValueError("feed() called after end of stream")
        if data:
            try:
                self._parser.feed(data)
            except SyntaxError as exc:
                # Queue the error so that it is raised by read_events(),
                # after the events that were produced before it.
                self._events_queue.append(exc)

    def _close_and_return_root(self):
        # iterparse needs this to set its root attribute properly :(
        root = self._parser.close()
        # Marks the end of the stream; feed() checks for this.
        self._parser = None
        return root

    def close(self):
        """Finish feeding data to parser.

        Unlike XMLParser, does not return the root element. Use
        read_events() to consume elements from XMLPullParser.
        """
        self._close_and_return_root()

    def read_events(self):
        """Return an iterator over currently available (event, elem) pairs.

        Events are consumed from the internal event queue as they are
        retrieved from the iterator.  An exception stored in the queue by
        feed() is raised when it is reached.
        """
        events = self._events_queue
        while events:
            event = events.popleft()
            if isinstance(event, Exception):
                raise event
            else:
                yield event
1323
1324
def XML(text, parser=None):
    """Parse XML document from string constant.

    This function can be used to embed "XML Literals" in Python code.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns whatever the parser's close() returns, an Element instance
    for the standard parser.

    """
    builder = parser if parser else XMLParser(target=TreeBuilder())
    builder.feed(text)
    return builder.close()
1340
1341
def XMLID(text, parser=None):
    """Parse XML document from string constant for its IDs.

    *text* is a string containing XML data, *parser* is an
    optional parser instance, defaulting to the standard XMLParser.

    Returns an (Element, dict) tuple, in which the
    dict maps element id:s to elements.

    """
    builder = parser if parser else XMLParser(target=TreeBuilder())
    builder.feed(text)
    root = builder.close()
    # Elements whose "id" attribute is missing or empty are left out; when
    # an id value occurs more than once, the last element in iteration
    # order is the one that is kept.
    by_id = {
        key: node
        for node in root.iter()
        if (key := node.get("id"))
    }
    return root, by_id
1362
# Parse XML document from string constant.  fromstring is an alias for
# XML(): both names refer to the same function object.
fromstring = XML
1365
def fromstringlist(sequence, parser=None):
    """Parse XML document from sequence of string fragments.

    *sequence* is a list or other sequence of fragments, which are fed to
    the parser one after another.  *parser* is an optional parser
    instance, defaulting to the standard XMLParser.

    Returns whatever the parser's close() returns, an Element instance
    for the standard parser.

    """
    builder = parser if parser else XMLParser(target=TreeBuilder())
    feed = builder.feed
    for fragment in sequence:
        feed(fragment)
    return builder.close()
1380
1381 # --------------------------------------------------------------------
1382
1383
class TreeBuilder:
    """Generic element structure builder.

    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.

    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.

    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.

    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.

    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """
    def __init__(self, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._root = None # root element
        self._tail = None # true if we're after an end tag
        if comment_factory is None:
            comment_factory = Comment
        self._comment_factory = comment_factory
        self.insert_comments = insert_comments
        if pi_factory is None:
            pi_factory = ProcessingInstruction
        self._pi_factory = pi_factory
        self.insert_pis = insert_pis
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        # Attach the text collected by data() to the most recent element:
        # as its tail when that element has already been closed, as its
        # text otherwise.  Text collected before any element is dropped.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    def data(self, data):
        """Add text to current element."""
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.

        *tag* is the element name, *attrs* is a dict containing element
        attributes.

        """
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            self._elem[-1].append(elem)
        elif self._root is None:
            # The first element opened at top level becomes the root.
            self._root = elem
        self._elem.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.

        *tag* is the element name.

        """
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last

    def comment(self, text):
        """Create a comment using the comment_factory.

        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.

        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        # Shared implementation of comment() and pi(): the node is always
        # created and returned, but only becomes part of the tree when
        # *insert* is true.
        elem = factory(*args)
        if insert:
            self._flush()
            self._last = elem
            if self._elem:
                self._elem[-1].append(elem)
            # Text that follows belongs to the tail of the new node.
            self._tail = 1
        return elem
1503
1504
1505 # also see ElementTree and TreeBuilder
class XMLParser:
    """Element structure builder for XML source data based on the expat parser.

    *target* is an optional target object which defaults to an instance of the
    standard TreeBuilder class, *encoding* is an optional encoding string
    which if given, overrides the encoding specified in the XML file:
    http://www.iana.org/assignments/character-sets

    """

    def __init__(self, *, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator: expat then reports a qualified
        # name as "uri}local", which _fixname() turns into "{uri}local".
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # main callbacks
        # Only the handlers that the target actually implements are set.
        parser.DefaultHandlerExpand = self._default
        if hasattr(target, 'start'):
            parser.StartElementHandler = self._start
        if hasattr(target, 'end'):
            parser.EndElementHandler = self._end
        if hasattr(target, 'start_ns'):
            parser.StartNamespaceDeclHandler = self._start_ns
        if hasattr(target, 'end_ns'):
            parser.EndNamespaceDeclHandler = self._end_ns
        if hasattr(target, 'data'):
            parser.CharacterDataHandler = target.data
        # miscellaneous callbacks
        if hasattr(target, 'comment'):
            parser.CommentHandler = target.comment
        if hasattr(target, 'pi'):
            parser.ProcessingInstructionHandler = target.pi
        # Configure pyexpat: buffering, new-style attribute handling.
        parser.buffer_text = 1
        parser.ordered_attributes = 1
        # None, or the list of doctype tokens collected so far while inside
        # a <!DOCTYPE ...> declaration (see _default()).
        self._doctype = None
        # Replacement text for entities, keyed by entity name; consulted by
        # _default() for "&name;" references passed to it.
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _setevents(self, events_queue, events_to_report):
        # Internal API for XMLPullParser
        # events_to_report: a list of events to report during parsing (same as
        # the *events* of XMLPullParser's constructor.
        # events_queue: a list of actual parsing events that will be populated
        # by the underlying parser.
        #
        # Each handler below calls the regular handler or target method and
        # appends (event_name, result) to the queue.  Values are bound as
        # default arguments when the handler is defined.
        parser = self._parser
        append = events_queue.append
        for event_name in events_to_report:
            if event_name == "start":
                parser.ordered_attributes = 1
                def handler(tag, attrib_in, event=event_name, append=append,
                            start=self._start):
                    append((event, start(tag, attrib_in)))
                parser.StartElementHandler = handler
            elif event_name == "end":
                def handler(tag, event=event_name, append=append,
                            end=self._end):
                    append((event, end(tag)))
                parser.EndElementHandler = handler
            elif event_name == "start-ns":
                # TreeBuilder does not implement .start_ns()
                if hasattr(self.target, "start_ns"):
                    def handler(prefix, uri, event=event_name, append=append,
                                start_ns=self._start_ns):
                        append((event, start_ns(prefix, uri)))
                else:
                    def handler(prefix, uri, event=event_name, append=append):
                        append((event, (prefix or '', uri or '')))
                parser.StartNamespaceDeclHandler = handler
            elif event_name == "end-ns":
                # TreeBuilder does not implement .end_ns()
                if hasattr(self.target, "end_ns"):
                    def handler(prefix, event=event_name, append=append,
                                end_ns=self._end_ns):
                        append((event, end_ns(prefix)))
                else:
                    def handler(prefix, event=event_name, append=append):
                        append((event, None))
                parser.EndNamespaceDeclHandler = handler
            elif event_name == 'comment':
                def handler(text, event=event_name, append=append, self=self):
                    append((event, self.target.comment(text)))
                parser.CommentHandler = handler
            elif event_name == 'pi':
                def handler(pi_target, data, event=event_name, append=append,
                            self=self):
                    append((event, self.target.pi(pi_target, data)))
                parser.ProcessingInstructionHandler = handler
            else:
                raise ValueError("unknown event %r" % event_name)

    def _raiseerror(self, value):
        # Re-raise an expat error as ParseError, carrying over the error
        # code and the (line, column) position.
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixname(self, key):
        # expand qname: a name containing "}" gets a leading "{".  Results
        # are memoized in self._names.
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name
        return name

    def _start_ns(self, prefix, uri):
        return self.target.start_ns(prefix or '', uri or '')

    def _end_ns(self, prefix):
        return self.target.end_ns(prefix or '')

    def _start(self, tag, attr_list):
        # Handler for expat's StartElementHandler. Since ordered_attributes
        # is set, the attributes are reported as a list of alternating
        # attribute name,value.
        fixname = self._fixname
        tag = fixname(tag)
        attrib = {}
        if attr_list:
            for i in range(0, len(attr_list), 2):
                attrib[fixname(attr_list[i])] = attr_list[i+1]
        return self.target.start(tag, attrib)

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _default(self, text):
        # Installed as the parser's DefaultHandlerExpand.  Handles entity
        # references passed to it and collects the pieces of a doctype
        # declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                data_handler = self.target.data
            except AttributeError:
                return
            try:
                # text is "&name;": look the bare name up in self.entity.
                data_handler(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self.parser.ErrorLineNumber,
                    self.parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self.parser.ErrorLineNumber
                err.offset = self.parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                    if pubid:
                        # strip the surrounding quote characters
                        pubid = pubid[1:-1]
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif hasattr(self, "doctype"):
                    warnings.warn(
                        "The doctype() method of XMLParser is ignored. "
                        "Define doctype() method on the TreeBuilder target.",
                        RuntimeWarning)

                self._doctype = None

    def feed(self, data):
        """Feed encoded data to parser."""
        try:
            self.parser.Parse(data, False)
        except self._error as v:
            self._raiseerror(v)

    def close(self):
        """Finish feeding data to parser and return element structure.

        The return value is whatever the target's close() returns, or None
        when the target has no close() method.
        """
        try:
            self.parser.Parse(b"", True) # end of data
        except self._error as v:
            self._raiseerror(v)
        try:
            close_handler = self.target.close
        except AttributeError:
            pass
        else:
            return close_handler()
        finally:
            # get rid of circular references
            del self.parser, self._parser
            del self.target, self._target
1729
1730
1731 # --------------------------------------------------------------------
1732 # C14N 2.0
1733
def canonicalize(xml_data=None, *, out=None, from_file=None, **options):
    """Convert XML to its C14N 2.0 serialised form.

    If *out* is provided, it must be a file or file-like object that receives
    the serialised canonical XML output (text, not bytes) through its ``.write()``
    method.  To write to a file, open it in text mode with encoding "utf-8".
    In that case None is returned.  If *out* is not provided, the output is
    returned as a text string.

    Either *xml_data* (an XML string) or *from_file* (a file path or
    file-like object) must be provided as input; *xml_data* takes
    precedence when both are given.  ValueError is raised when both are
    missing.

    The configuration options are the same as for the ``C14NWriterTarget``.
    """
    if xml_data is None and from_file is None:
        raise ValueError("Either 'xml_data' or 'from_file' must be provided as input")

    # Collect into a private buffer only when the caller supplied no sink.
    buffer = io.StringIO() if out is None else None
    sink = out if buffer is None else buffer

    writer = C14NWriterTarget(sink.write, **options)
    parser = XMLParser(target=writer)

    if xml_data is not None:
        parser.feed(xml_data)
        parser.close()
    else:
        # from_file cannot be None here because of the check above.
        parse(from_file, parser=parser)

    if buffer is None:
        return None
    return buffer.getvalue()
1762
1763
# Bound match() of a pattern accepting text of the form "prefix:name":
# one or more word characters, a single colon, one or more word characters,
# and nothing else.
_looks_like_prefix_name = re.compile(r'^\w+:\w+$', re.UNICODE).match
1765
1766
1767 class ESC[4;38;5;81mC14NWriterTarget:
1768 """
1769 Canonicalization writer target for the XMLParser.
1770
1771 Serialises parse events to XML C14N 2.0.
1772
1773 The *write* function is used for writing out the resulting data stream
1774 as text (not bytes). To write to a file, open it in text mode with encoding
1775 "utf-8" and pass its ``.write`` method.
1776
1777 Configuration options:
1778
1779 - *with_comments*: set to true to include comments
1780 - *strip_text*: set to true to strip whitespace before and after text content
1781 - *rewrite_prefixes*: set to true to replace namespace prefixes by "n{number}"
1782 - *qname_aware_tags*: a set of qname aware tag names in which prefixes
1783 should be replaced in text content
1784 - *qname_aware_attrs*: a set of qname aware attribute names in which prefixes
1785 should be replaced in text content
1786 - *exclude_attrs*: a set of attribute names that should not be serialised
1787 - *exclude_tags*: a set of tag names that should not be serialised
1788 """
1789 def __init__(self, write, *,
1790 with_comments=False, strip_text=False, rewrite_prefixes=False,
1791 qname_aware_tags=None, qname_aware_attrs=None,
1792 exclude_attrs=None, exclude_tags=None):
1793 self._write = write
1794 self._data = []
1795 self._with_comments = with_comments
1796 self._strip_text = strip_text
1797 self._exclude_attrs = set(exclude_attrs) if exclude_attrs else None
1798 self._exclude_tags = set(exclude_tags) if exclude_tags else None
1799
1800 self._rewrite_prefixes = rewrite_prefixes
1801 if qname_aware_tags:
1802 self._qname_aware_tags = set(qname_aware_tags)
1803 else:
1804 self._qname_aware_tags = None
1805 if qname_aware_attrs:
1806 self._find_qname_aware_attrs = set(qname_aware_attrs).intersection
1807 else:
1808 self._find_qname_aware_attrs = None
1809
1810 # Stack with globally and newly declared namespaces as (uri, prefix) pairs.
1811 self._declared_ns_stack = [[
1812 ("http://www.w3.org/XML/1998/namespace", "xml"),
1813 ]]
1814 # Stack with user declared namespace prefixes as (uri, prefix) pairs.
1815 self._ns_stack = []
1816 if not rewrite_prefixes:
1817 self._ns_stack.append(list(_namespace_map.items()))
1818 self._ns_stack.append([])
1819 self._prefix_map = {}
1820 self._preserve_space = [False]
1821 self._pending_start = None
1822 self._root_seen = False
1823 self._root_done = False
1824 self._ignored_depth = 0
1825
1826 def _iter_namespaces(self, ns_stack, _reversed=reversed):
1827 for namespaces in _reversed(ns_stack):
1828 if namespaces: # almost no element declares new namespaces
1829 yield from namespaces
1830
1831 def _resolve_prefix_name(self, prefixed_name):
1832 prefix, name = prefixed_name.split(':', 1)
1833 for uri, p in self._iter_namespaces(self._ns_stack):
1834 if p == prefix:
1835 return f'{{{uri}}}{name}'
1836 raise ValueError(f'Prefix {prefix} of QName "{prefixed_name}" is not declared in scope')
1837
1838 def _qname(self, qname, uri=None):
1839 if uri is None:
1840 uri, tag = qname[1:].rsplit('}', 1) if qname[:1] == '{' else ('', qname)
1841 else:
1842 tag = qname
1843
1844 prefixes_seen = set()
1845 for u, prefix in self._iter_namespaces(self._declared_ns_stack):
1846 if u == uri and prefix not in prefixes_seen:
1847 return f'{prefix}:{tag}' if prefix else tag, tag, uri
1848 prefixes_seen.add(prefix)
1849
1850 # Not declared yet => add new declaration.
1851 if self._rewrite_prefixes:
1852 if uri in self._prefix_map:
1853 prefix = self._prefix_map[uri]
1854 else:
1855 prefix = self._prefix_map[uri] = f'n{len(self._prefix_map)}'
1856 self._declared_ns_stack[-1].append((uri, prefix))
1857 return f'{prefix}:{tag}', tag, uri
1858
1859 if not uri and '' not in prefixes_seen:
1860 # No default namespace declared => no prefix needed.
1861 return tag, tag, uri
1862
1863 for u, prefix in self._iter_namespaces(self._ns_stack):
1864 if u == uri:
1865 self._declared_ns_stack[-1].append((uri, prefix))
1866 return f'{prefix}:{tag}' if prefix else tag, tag, uri
1867
1868 if not uri:
1869 # As soon as a default namespace is defined,
1870 # anything that has no namespace (and thus, no prefix) goes there.
1871 return tag, tag, uri
1872
1873 raise ValueError(f'Namespace "{uri}" is not declared in scope')
1874
1875 def data(self, data):
1876 if not self._ignored_depth:
1877 self._data.append(data)
1878
1879 def _flush(self, _join_text=''.join):
1880 data = _join_text(self._data)
1881 del self._data[:]
1882 if self._strip_text and not self._preserve_space[-1]:
1883 data = data.strip()
1884 if self._pending_start is not None:
1885 args, self._pending_start = self._pending_start, None
1886 qname_text = data if data and _looks_like_prefix_name(data) else None
1887 self._start(*args, qname_text)
1888 if qname_text is not None:
1889 return
1890 if data and self._root_seen:
1891 self._write(_escape_cdata_c14n(data))
1892
1893 def start_ns(self, prefix, uri):
1894 if self._ignored_depth:
1895 return
1896 # we may have to resolve qnames in text content
1897 if self._data:
1898 self._flush()
1899 self._ns_stack[-1].append((uri, prefix))
1900
1901 def start(self, tag, attrs):
1902 if self._exclude_tags is not None and (
1903 self._ignored_depth or tag in self._exclude_tags):
1904 self._ignored_depth += 1
1905 return
1906 if self._data:
1907 self._flush()
1908
1909 new_namespaces = []
1910 self._declared_ns_stack.append(new_namespaces)
1911
1912 if self._qname_aware_tags is not None and tag in self._qname_aware_tags:
1913 # Need to parse text first to see if it requires a prefix declaration.
1914 self._pending_start = (tag, attrs, new_namespaces)
1915 return
1916 self._start(tag, attrs, new_namespaces)
1917
def _start(self, tag, attrs, new_namespaces, qname_text=None):
    """Write the start tag of an element in canonical form.

    *tag* is the element name, *attrs* a mapping of attribute names to
    values, and *new_namespaces* an iterable of ``(uri, prefix)`` pairs to
    declare on this element (may be empty or None).  When *qname_text* is
    given, it is resolved as a prefixed name and written as text content
    right after the start tag.
    """
    excluded = self._exclude_attrs
    if excluded is not None and attrs:
        attrs = {
            name: value for name, value in attrs.items()
            if name not in excluded
        }

    # Collect every name that has to be run through self._qname(): the tag,
    # the attribute names and any resolved QNames from text/attribute values.
    qnames = {tag}
    qnames.update(attrs)
    resolved_names = {}

    # Resolve prefixes in tag text first, then in attribute values.
    if qname_text is not None:
        resolved = self._resolve_prefix_name(qname_text)
        resolved_names[qname_text] = resolved
        qnames.add(resolved)

    qattrs = None
    find_qattrs = self._find_qname_aware_attrs
    if find_qattrs is not None and attrs:
        qattrs = find_qattrs(attrs)
        if not qattrs:
            qattrs = None
        else:
            for attr_name in qattrs:
                value = attrs[attr_name]
                if _looks_like_prefix_name(value):
                    resolved = self._resolve_prefix_name(value)
                    resolved_names[value] = resolved
                    qnames.add(resolved)

    # Assign prefixes in lexicographical order of used URIs.
    parse_qname = self._qname
    parsed_qnames = {}
    for name in sorted(qnames, key=lambda n: n.split('}', 1)):
        parsed_qnames[name] = parse_qname(name)

    # Namespace declarations go first, in prefix order ...
    attr_list = []  # almost always stays empty at this point
    if new_namespaces:
        attr_list = sorted(
            ('xmlns:' + prefix if prefix else 'xmlns', uri)
            for uri, prefix in new_namespaces
        )

    # ... followed by attributes in URI+name order.
    if attrs:
        for key, value in sorted(attrs.items()):
            if qattrs is not None and key in qattrs and value in resolved_names:
                # Replace the QName value by its re-prefixed form.
                value = parsed_qnames[resolved_names[value]][0]
            attr_qname, attr_name, uri = parsed_qnames[key]
            # No prefix for attributes in default ('') namespace.
            attr_list.append((attr_qname if uri else attr_name, value))

    # Honour xml:space attributes; without one, repeat the enclosing setting.
    space_behaviour = attrs.get('{http://www.w3.org/XML/1998/namespace}space')
    if space_behaviour:
        preserve = space_behaviour == 'preserve'
    else:
        preserve = self._preserve_space[-1]
    self._preserve_space.append(preserve)

    # Write the tag.
    write = self._write
    write('<' + parsed_qnames[tag][0])
    if attr_list:
        write(''.join(
            f' {name}="{_escape_attrib_c14n(value)}"'
            for name, value in attr_list))
    write('>')

    # Write the resolved qname text content.
    if qname_text is not None:
        write(_escape_cdata_c14n(parsed_qnames[resolved_names[qname_text]][0]))

    self._root_seen = True
    self._ns_stack.append([])
1986
def end(self, tag):
    """Write the end tag for *tag* and pop the per-element state."""
    if self._ignored_depth:
        # Still inside an ignored subtree: just step one level out.
        self._ignored_depth -= 1
        return

    # Pending character data has to be written before the closing tag.
    if self._data:
        self._flush()

    prefixed_name = self._qname(tag)[0]
    self._write(f'</{prefixed_name}>')

    space_stack = self._preserve_space
    space_stack.pop()
    # The root is finished once a single entry remains on the stack.
    self._root_done = len(space_stack) == 1
    self._declared_ns_stack.pop()
    self._ns_stack.pop()
1998
def comment(self, text):
    """Write a comment, unless comments are disabled or being ignored."""
    if not self._with_comments or self._ignored_depth:
        return

    if self._root_done:
        # A comment after the root element goes on a line of its own.
        self._write('\n')
    elif self._root_seen and self._data:
        # Inside the root: write out pending character data first.
        self._flush()

    self._write(f'<!--{_escape_cdata_c14n(text)}-->')

    if not self._root_seen:
        # A comment before the root element is followed by a newline.
        self._write('\n')
2011
def pi(self, target, data):
    """Write a processing instruction, unless it is being ignored."""
    if self._ignored_depth:
        return

    if self._root_done:
        # A PI after the root element goes on a line of its own.
        self._write('\n')
    elif self._root_seen and self._data:
        # Inside the root: write out pending character data first.
        self._flush()

    if data:
        markup = f'<?{target} {_escape_cdata_c14n(data)}?>'
    else:
        markup = f'<?{target}?>'
    self._write(markup)

    if not self._root_seen:
        # A PI before the root element is followed by a newline.
        self._write('\n')
2023
2024
def _escape_cdata_c14n(text):
    """Escape character data for canonical XML output.

    Replaces ``&``, ``<``, ``>`` and carriage return by ``&amp;``,
    ``&lt;``, ``&gt;`` and ``&#xD;`` and returns the result.  A value that
    does not behave like a string (TypeError/AttributeError) is handed to
    _raise_serialization_error().
    """
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 characters, or so.  assume that's, by far,
        # the most common case in most applications.
        if '&' in text:
            # Must run first, otherwise the '&' of the entities inserted
            # below would be escaped a second time.
            text = text.replace('&', '&amp;')
        if '<' in text:
            text = text.replace('<', '&lt;')
        if '>' in text:
            text = text.replace('>', '&gt;')
        if '\r' in text:
            text = text.replace('\r', '&#xD;')
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
2042
2043
def _escape_attrib_c14n(text):
    """Escape an attribute value for canonical XML output.

    Replaces ``&``, ``<`` and ``"`` by ``&amp;``, ``&lt;`` and ``&quot;``,
    and tab, newline and carriage return by the character references
    ``&#x9;``, ``&#xA;`` and ``&#xD;``.  A value that does not behave like
    a string (TypeError/AttributeError) is handed to
    _raise_serialization_error().
    """
    try:
        if '&' in text:
            # Must run first, otherwise the '&' of the entities inserted
            # below would be escaped a second time.
            text = text.replace('&', '&amp;')
        if '<' in text:
            text = text.replace('<', '&lt;')
        if '"' in text:
            text = text.replace('"', '&quot;')
        if '\t' in text:
            text = text.replace('\t', '&#x9;')
        if '\n' in text:
            text = text.replace('\n', '&#xA;')
        if '\r' in text:
            text = text.replace('\r', '&#xD;')
        return text
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
2062
2063
2064 # --------------------------------------------------------------------
2065
# Import the C accelerators
try:
    # Element is going to be shadowed by the C implementation. We need to keep
    # the Python version of it accessible for some "creative" use by external
    # code (see tests).
    _Element_Py = Element

    # Element, SubElement, ParseError, TreeBuilder, XMLParser, _set_factories
    from _elementtree import *
    # Imported explicitly as well: a star import skips underscore-prefixed
    # names unless the module lists them in __all__.
    from _elementtree import _set_factories
except ImportError:
    # No C accelerator available: the pure-Python definitions stay in place.
    pass
else:
    # NOTE(review): presumably hands the Python-level Comment and
    # ProcessingInstruction factories to the C module -- confirm against
    # _elementtree.
    _set_factories(Comment, ProcessingInstruction)