1  #
       2  # ElementTree
       3  # $Id: ElementPath.py 3375 2008-02-13 08:05:08Z fredrik $
       4  #
       5  # limited xpath support for element trees
       6  #
       7  # history:
       8  # 2003-05-23 fl   created
       9  # 2003-05-28 fl   added support for // etc
      10  # 2003-08-27 fl   fixed parsing of periods in element names
      11  # 2007-09-10 fl   new selection engine
      12  # 2007-09-12 fl   fixed parent selector
      13  # 2007-09-13 fl   added iterfind; changed findall to return a list
      14  # 2007-11-30 fl   added namespaces support
      15  # 2009-10-30 fl   added child element value filter
      16  #
      17  # Copyright (c) 2003-2009 by Fredrik Lundh.  All rights reserved.
      18  #
      19  # fredrik@pythonware.com
      20  # http://www.pythonware.com
      21  #
      22  # --------------------------------------------------------------------
      23  # The ElementTree toolkit is
      24  #
      25  # Copyright (c) 1999-2009 by Fredrik Lundh
      26  #
      27  # By obtaining, using, and/or copying this software and/or its
      28  # associated documentation, you agree that you have read, understood,
      29  # and will comply with the following terms and conditions:
      30  #
      31  # Permission to use, copy, modify, and distribute this software and
      32  # its associated documentation for any purpose and without fee is
      33  # hereby granted, provided that the above copyright notice appears in
      34  # all copies, and that both that copyright notice and this permission
      35  # notice appear in supporting documentation, and that the name of
      36  # Secret Labs AB or the author not be used in advertising or publicity
      37  # pertaining to distribution of the software without specific, written
      38  # prior permission.
      39  #
      40  # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
      41  # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
      42  # ABILITY AND FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
      43  # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
      44  # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
      45  # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
      46  # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
      47  # OF THIS SOFTWARE.
      48  # --------------------------------------------------------------------
      49  
      50  # Licensed to PSF under a Contributor Agreement.
      51  # See https://www.python.org/psf/license for licensing details.
      52  
      53  ##
      54  # Implementation module for XPath support.  There's usually no reason
      55  # to import this module directly; the <b>ElementTree</b> does this for
      56  # you, if needed.
      57  ##
      58  
      59  import re
      60  
# Tokenizer for the supported XPath subset.  findall() returns one
# (group 1, group 2) pair per match: group 1 holds an operator or a quoted
# string, group 2 holds a name (optionally prefixed by "{...}").  A run of
# whitespace matches the last alternative and leaves both groups empty.
xpath_tokenizer_re = re.compile(
    r"("
    r"'[^']*'|\"[^\"]*\"|"  # single- or double-quoted string literal
    r"::|"
    r"//?|"  # "/" or "//"
    r"\.\.|"  # ".."
    r"\(\)|"  # "()"
    r"!=|"
    r"[/.*:\[\]\(\)@=])|"  # any remaining single-character operator
    r"((?:\{[^}]+\})?[^/\[\]\(\)@!=\s]+)|"  # name, optional "{...}" prefix
    r"\s+"  # whitespace (neither group captures)
    )
      73  
      74  def xpath_tokenizer(pattern, namespaces=None):
      75      default_namespace = namespaces.get('') if namespaces else None
      76      parsing_attribute = False
      77      for token in xpath_tokenizer_re.findall(pattern):
      78          ttype, tag = token
      79          if tag and tag[0] != "{":
      80              if ":" in tag:
      81                  prefix, uri = tag.split(":", 1)
      82                  try:
      83                      if not namespaces:
      84                          raise KeyError
      85                      yield ttype, "{%s}%s" % (namespaces[prefix], uri)
      86                  except KeyError:
      87                      raise SyntaxError("prefix %r not found in prefix map" % prefix) from None
      88              elif default_namespace and not parsing_attribute:
      89                  yield ttype, "{%s}%s" % (default_namespace, tag)
      90              else:
      91                  yield token
      92              parsing_attribute = False
      93          else:
      94              yield token
      95              parsing_attribute = ttype == '@'
      96  
      97  
      98  def get_parent_map(context):
      99      parent_map = context.parent_map
     100      if parent_map is None:
     101          context.parent_map = parent_map = {}
     102          for p in context.root.iter():
     103              for e in p:
     104                  parent_map[e] = p
     105      return parent_map
     106  
     107  
     108  def _is_wildcard_tag(tag):
     109      return tag[:3] == '{*}' or tag[-2:] == '}*'
     110  
     111  
     112  def _prepare_tag(tag):
     113      _isinstance, _str = isinstance, str
     114      if tag == '{*}*':
     115          # Same as '*', but no comments or processing instructions.
     116          # It can be a surprise that '*' includes those, but there is no
     117          # justification for '{*}*' doing the same.
     118          def select(context, result):
     119              for elem in result:
     120                  if _isinstance(elem.tag, _str):
     121                      yield elem
     122      elif tag == '{}*':
     123          # Any tag that is not in a namespace.
     124          def select(context, result):
     125              for elem in result:
     126                  el_tag = elem.tag
     127                  if _isinstance(el_tag, _str) and el_tag[0] != '{':
     128                      yield elem
     129      elif tag[:3] == '{*}':
     130          # The tag in any (or no) namespace.
     131          suffix = tag[2:]  # '}name'
     132          no_ns = slice(-len(suffix), None)
     133          tag = tag[3:]
     134          def select(context, result):
     135              for elem in result:
     136                  el_tag = elem.tag
     137                  if el_tag == tag or _isinstance(el_tag, _str) and el_tag[no_ns] == suffix:
     138                      yield elem
     139      elif tag[-2:] == '}*':
     140          # Any tag in the given namespace.
     141          ns = tag[:-1]
     142          ns_only = slice(None, len(ns))
     143          def select(context, result):
     144              for elem in result:
     145                  el_tag = elem.tag
     146                  if _isinstance(el_tag, _str) and el_tag[ns_only] == ns:
     147                      yield elem
     148      else:
     149          raise RuntimeError(f"internal parser error, got {tag}")
     150      return select
     151  
     152  
     153  def prepare_child(next, token):
     154      tag = token[1]
     155      if _is_wildcard_tag(tag):
     156          select_tag = _prepare_tag(tag)
     157          def select(context, result):
     158              def select_child(result):
     159                  for elem in result:
     160                      yield from elem
     161              return select_tag(context, select_child(result))
     162      else:
     163          if tag[:2] == '{}':
     164              tag = tag[2:]  # '{}tag' == 'tag'
     165          def select(context, result):
     166              for elem in result:
     167                  for e in elem:
     168                      if e.tag == tag:
     169                          yield e
     170      return select
     171  
     172  def prepare_star(next, token):
     173      def select(context, result):
     174          for elem in result:
     175              yield from elem
     176      return select
     177  
     178  def prepare_self(next, token):
     179      def select(context, result):
     180          yield from result
     181      return select
     182  
     183  def prepare_descendant(next, token):
     184      try:
     185          token = next()
     186      except StopIteration:
     187          return
     188      if token[0] == "*":
     189          tag = "*"
     190      elif not token[0]:
     191          tag = token[1]
     192      else:
     193          raise SyntaxError("invalid descendant")
     194  
     195      if _is_wildcard_tag(tag):
     196          select_tag = _prepare_tag(tag)
     197          def select(context, result):
     198              def select_child(result):
     199                  for elem in result:
     200                      for e in elem.iter():
     201                          if e is not elem:
     202                              yield e
     203              return select_tag(context, select_child(result))
     204      else:
     205          if tag[:2] == '{}':
     206              tag = tag[2:]  # '{}tag' == 'tag'
     207          def select(context, result):
     208              for elem in result:
     209                  for e in elem.iter(tag):
     210                      if e is not elem:
     211                          yield e
     212      return select
     213  
     214  def prepare_parent(next, token):
     215      def select(context, result):
     216          # FIXME: raise error if .. is applied at toplevel?
     217          parent_map = get_parent_map(context)
     218          result_map = {}
     219          for elem in result:
     220              if elem in parent_map:
     221                  parent = parent_map[elem]
     222                  if parent not in result_map:
     223                      result_map[parent] = None
     224                      yield parent
     225      return select
     226  
     227  def prepare_predicate(next, token):
     228      # FIXME: replace with real parser!!! refs:
     229      # http://javascript.crockford.com/tdop/tdop.html
     230      signature = []
     231      predicate = []
     232      while 1:
     233          try:
     234              token = next()
     235          except StopIteration:
     236              return
     237          if token[0] == "]":
     238              break
     239          if token == ('', ''):
     240              # ignore whitespace
     241              continue
     242          if token[0] and token[0][:1] in "'\"":
     243              token = "'", token[0][1:-1]
     244          signature.append(token[0] or "-")
     245          predicate.append(token[1])
     246      signature = "".join(signature)
     247      # use signature to determine predicate type
     248      if signature == "@-":
     249          # [@attribute] predicate
     250          key = predicate[1]
     251          def select(context, result):
     252              for elem in result:
     253                  if elem.get(key) is not None:
     254                      yield elem
     255          return select
     256      if signature == "@-='" or signature == "@-!='":
     257          # [@attribute='value'] or [@attribute!='value']
     258          key = predicate[1]
     259          value = predicate[-1]
     260          def select(context, result):
     261              for elem in result:
     262                  if elem.get(key) == value:
     263                      yield elem
     264          def select_negated(context, result):
     265              for elem in result:
     266                  if (attr_value := elem.get(key)) is not None and attr_value != value:
     267                      yield elem
     268          return select_negated if '!=' in signature else select
     269      if signature == "-" and not re.match(r"\-?\d+$", predicate[0]):
     270          # [tag]
     271          tag = predicate[0]
     272          def select(context, result):
     273              for elem in result:
     274                  if elem.find(tag) is not None:
     275                      yield elem
     276          return select
     277      if signature == ".='" or signature == ".!='" or (
     278              (signature == "-='" or signature == "-!='")
     279              and not re.match(r"\-?\d+$", predicate[0])):
     280          # [.='value'] or [tag='value'] or [.!='value'] or [tag!='value']
     281          tag = predicate[0]
     282          value = predicate[-1]
     283          if tag:
     284              def select(context, result):
     285                  for elem in result:
     286                      for e in elem.findall(tag):
     287                          if "".join(e.itertext()) == value:
     288                              yield elem
     289                              break
     290              def select_negated(context, result):
     291                  for elem in result:
     292                      for e in elem.iterfind(tag):
     293                          if "".join(e.itertext()) != value:
     294                              yield elem
     295                              break
     296          else:
     297              def select(context, result):
     298                  for elem in result:
     299                      if "".join(elem.itertext()) == value:
     300                          yield elem
     301              def select_negated(context, result):
     302                  for elem in result:
     303                      if "".join(elem.itertext()) != value:
     304                          yield elem
     305          return select_negated if '!=' in signature else select
     306      if signature == "-" or signature == "-()" or signature == "-()-":
     307          # [index] or [last()] or [last()-index]
     308          if signature == "-":
     309              # [index]
     310              index = int(predicate[0]) - 1
     311              if index < 0:
     312                  raise SyntaxError("XPath position >= 1 expected")
     313          else:
     314              if predicate[0] != "last":
     315                  raise SyntaxError("unsupported function")
     316              if signature == "-()-":
     317                  try:
     318                      index = int(predicate[2]) - 1
     319                  except ValueError:
     320                      raise SyntaxError("unsupported expression")
     321                  if index > -2:
     322                      raise SyntaxError("XPath offset from last() must be negative")
     323              else:
     324                  index = -1
     325          def select(context, result):
     326              parent_map = get_parent_map(context)
     327              for elem in result:
     328                  try:
     329                      parent = parent_map[elem]
     330                      # FIXME: what if the selector is "*" ?
     331                      elems = list(parent.findall(elem.tag))
     332                      if elems[index] is elem:
     333                          yield elem
     334                  except (IndexError, KeyError):
     335                      pass
     336          return select
     337      raise SyntaxError("invalid predicate")
     338  
# Dispatch table used by iterfind(): maps the operator part of a token
# (token[0]; "" for a plain name) to the factory that builds the selector
# for that path step.
ops = {
    "": prepare_child,
    "*": prepare_star,
    ".": prepare_self,
    "..": prepare_parent,
    "//": prepare_descendant,
    "[": prepare_predicate,
    }

# Compiled selector lists, keyed by iterfind() on the path plus the sorted
# namespace items.  iterfind() empties it once it holds more than 100
# entries.
_cache = {}
     349  
     350  class ESC[4;38;5;81m_SelectorContext:
     351      parent_map = None
     352      def __init__(self, root):
     353          self.root = root
     354  
     355  # --------------------------------------------------------------------
     356  
     357  ##
     358  # Generate all matching objects.
     359  
     360  def iterfind(elem, path, namespaces=None):
     361      # compile selector pattern
     362      if path[-1:] == "/":
     363          path = path + "*" # implicit all (FIXME: keep this?)
     364  
     365      cache_key = (path,)
     366      if namespaces:
     367          cache_key += tuple(sorted(namespaces.items()))
     368  
     369      try:
     370          selector = _cache[cache_key]
     371      except KeyError:
     372          if len(_cache) > 100:
     373              _cache.clear()
     374          if path[:1] == "/":
     375              raise SyntaxError("cannot use absolute path on element")
     376          next = iter(xpath_tokenizer(path, namespaces)).__next__
     377          try:
     378              token = next()
     379          except StopIteration:
     380              return
     381          selector = []
     382          while 1:
     383              try:
     384                  selector.append(ops[token[0]](next, token))
     385              except StopIteration:
     386                  raise SyntaxError("invalid path") from None
     387              try:
     388                  token = next()
     389                  if token[0] == "/":
     390                      token = next()
     391              except StopIteration:
     392                  break
     393          _cache[cache_key] = selector
     394      # execute selector pattern
     395      result = [elem]
     396      context = _SelectorContext(elem)
     397      for select in selector:
     398          result = select(context, result)
     399      return result
     400  
     401  ##
     402  # Find first matching object.
     403  
     404  def find(elem, path, namespaces=None):
     405      return next(iterfind(elem, path, namespaces), None)
     406  
     407  ##
     408  # Find all matching objects.
     409  
     410  def findall(elem, path, namespaces=None):
     411      return list(iterfind(elem, path, namespaces))
     412  
     413  ##
     414  # Find text for first matching object.
     415  
     416  def findtext(elem, path, default=None, namespaces=None):
     417      try:
     418          elem = next(iterfind(elem, path, namespaces))
     419          if elem.text is None:
     420              return ""
     421          return elem.text
     422      except StopIteration:
     423          return default