(root)/
libxml2-2.12.3/
python/
drv_libxml2.py
       1  # -*- coding: iso-8859-1 -*-
       2  """ A SAX2 driver for libxml2, on top of its XmlReader API
       3  
       4  USAGE
       5      # put this file (drv_libxml2.py) in PYTHONPATH
       6      import xml.sax
       7      reader = xml.sax.make_parser(["drv_libxml2"])
       8      # ...and the rest is standard python sax.
       9  
      10  CAVEATS
      11      - Lexical handlers are supported, except for start/endEntity
      12        (waiting for XmlReader.ResolveEntity) and start/endDTD
      13      - Error callbacks are not exactly synchronous, they tend
      14        to be invoked before the corresponding content callback,
      15        because the underlying reader interface parses
      16        data by chunks of 512 bytes
      17      
      18  TODO
      19      - search for TODO
      20      - some ErrorHandler events (warning)
      21      - some ContentHandler events (setDocumentLocator, skippedEntity)
      22      - EntityResolver (using libxml2.?)
      23      - DTDHandler (if/when libxml2 exposes such node types)
      24      - DeclHandler (if/when libxml2 exposes such node types)
      25      - property_xml_string?
      26      - feature_string_interning?
      27      - Incremental parser
      28      - additional performance tuning:
      29        - one might cache callbacks to avoid some name lookups
      30        - one might implement a smarter way to pass attributes to startElement
      31          (some kind of lazy evaluation?)
      32        - there might be room for improvement in start/endPrefixMapping
      33        - other?
      34  
      35  """
      36  
      37  __author__  = "St�phane Bidoul <sbi@skynet.be>"
      38  __version__ = "0.3"
      39  
      40  import sys
      41  import codecs
      42  
      43  if sys.version_info[0] < 3:
      44      __author__  = codecs.unicode_escape_decode(__author__)[0]
      45  
      46      StringTypes = (str, unicode)
      47      # libxml2 returns strings as UTF8
      48      _decoder = codecs.lookup("utf8")[1]
      49      def _d(s):
      50          if s is None:
      51              return s
      52          else:
      53              return _decoder(s)[0]
      54  else:
      55      StringTypes = str
      56      # s is Unicode `str` already
      57      def _d(s):
      58          return s
      59  
      60  from xml.sax._exceptions import *
      61  from xml.sax import xmlreader, saxutils
      62  from xml.sax.handler import \
      63       feature_namespaces, \
      64       feature_namespace_prefixes, \
      65       feature_string_interning, \
      66       feature_validation, \
      67       feature_external_ges, \
      68       feature_external_pes, \
      69       property_lexical_handler, \
      70       property_declaration_handler, \
      71       property_dom_node, \
      72       property_xml_string
      73  
      74  try:
      75      import libxml2
      76  except ImportError:
      77      raise SAXReaderNotAvailable("libxml2 not available: " \
      78                                  "import error was: %s" % sys.exc_info()[1])
      79  
      80  class ESC[4;38;5;81mLocator(ESC[4;38;5;149mxmlreaderESC[4;38;5;149m.ESC[4;38;5;149mLocator):
      81      """SAX Locator adapter for libxml2.xmlTextReaderLocator"""
      82  
      83      def __init__(self,locator):
      84          self.__locator = locator
      85  
      86      def getColumnNumber(self):
      87          "Return the column number where the current event ends."
      88          return -1
      89  
      90      def getLineNumber(self):
      91          "Return the line number where the current event ends."
      92          return self.__locator.LineNumber()
      93  
      94      def getPublicId(self):
      95          "Return the public identifier for the current event."
      96          return None
      97  
      98      def getSystemId(self):
      99          "Return the system identifier for the current event."
     100          return self.__locator.BaseURI()
     101  
     102  class ESC[4;38;5;81mLibXml2Reader(ESC[4;38;5;149mxmlreaderESC[4;38;5;149m.ESC[4;38;5;149mXMLReader):
     103  
     104      def __init__(self):
     105          xmlreader.XMLReader.__init__(self)
     106          # features
     107          self.__ns = 0
     108          self.__nspfx = 0
     109          self.__validate = 0
     110          self.__extparams = 1
     111          # parsing flag
     112          self.__parsing = 0
     113          # additional handlers
     114          self.__lex_handler = None
     115          self.__decl_handler = None
     116          # error messages accumulator
     117          self.__errors = None
     118  
     119      def _errorHandler(self,arg,msg,severity,locator):
     120          if self.__errors is None:
     121              self.__errors = []
     122          self.__errors.append((severity,
     123                                SAXParseException(msg,None,
     124                                                  Locator(locator))))
     125  
     126      def _reportErrors(self,fatal):
     127          for severity,exception in self.__errors:
     128              if severity in (libxml2.PARSER_SEVERITY_VALIDITY_WARNING,
     129                              libxml2.PARSER_SEVERITY_WARNING):
     130                  self._err_handler.warning(exception)
     131              else:
     132                  # when fatal is set, the parse will stop;
     133                  # we consider that the last error reported
     134                  # is the fatal one.
     135                  if fatal and exception is self.__errors[-1][1]:
     136                      self._err_handler.fatalError(exception)
     137                  else:
     138                      self._err_handler.error(exception)
     139          self.__errors = None
     140  
     141      def parse(self, source):
     142          self.__parsing = 1
     143          try:
     144              # prepare source and create reader
     145              if isinstance(source, StringTypes):
     146                  reader = libxml2.newTextReaderFilename(source)
     147              else:
     148                  source = saxutils.prepare_input_source(source)
     149                  input = libxml2.inputBuffer(source.getByteStream())
     150                  reader = input.newTextReader(source.getSystemId())
     151              reader.SetErrorHandler(self._errorHandler,None)
     152              # configure reader
     153              if self.__extparams:
     154                  reader.SetParserProp(libxml2.PARSER_LOADDTD,1)
     155                  reader.SetParserProp(libxml2.PARSER_DEFAULTATTRS,1)
     156                  reader.SetParserProp(libxml2.PARSER_SUBST_ENTITIES,1)
     157                  reader.SetParserProp(libxml2.PARSER_VALIDATE,self.__validate)
     158              else:
     159                  reader.SetParserProp(libxml2.PARSER_LOADDTD, 0)
     160              # we reuse attribute maps (for a slight performance gain)
     161              if self.__ns:
     162                  attributesNSImpl = xmlreader.AttributesNSImpl({},{})
     163              else:
     164                  attributesImpl = xmlreader.AttributesImpl({})
     165              # prefixes to pop (for endPrefixMapping)
     166              prefixes = []
     167              # start loop
     168              self._cont_handler.startDocument()
     169              while 1:
     170                  r = reader.Read()
     171                  # check for errors
     172                  if r == 1:
     173                      if not self.__errors is None:
     174                          self._reportErrors(0)
     175                  elif r == 0:
     176                      if not self.__errors is None:
     177                          self._reportErrors(0)
     178                      break # end of parse
     179                  else:
     180                      if not self.__errors is None:
     181                          self._reportErrors(1)
     182                      else:
     183                          self._err_handler.fatalError(\
     184                              SAXException("Read failed (no details available)"))
     185                      break # fatal parse error
     186                  # get node type
     187                  nodeType = reader.NodeType()
     188                  # Element
     189                  if nodeType == 1: 
     190                      if self.__ns:
     191                          eltName = (_d(reader.NamespaceUri()),\
     192                                     _d(reader.LocalName()))
     193                          eltQName = _d(reader.Name())
     194                          attributesNSImpl._attrs = attrs = {}
     195                          attributesNSImpl._qnames = qnames = {}
     196                          newPrefixes = []
     197                          while reader.MoveToNextAttribute():
     198                              qname = _d(reader.Name())
     199                              value = _d(reader.Value())
     200                              if qname.startswith("xmlns"):
     201                                  if len(qname) > 5:
     202                                      newPrefix = qname[6:]
     203                                  else:
     204                                      newPrefix = None
     205                                  newPrefixes.append(newPrefix)
     206                                  self._cont_handler.startPrefixMapping(\
     207                                      newPrefix,value)
     208                                  if not self.__nspfx:
     209                                      continue # don't report xmlns attribute
     210                              attName = (_d(reader.NamespaceUri()),
     211                                         _d(reader.LocalName()))
     212                              qnames[attName] = qname
     213                              attrs[attName] = value
     214                          reader.MoveToElement()
     215                          self._cont_handler.startElementNS( \
     216                              eltName,eltQName,attributesNSImpl) 
     217                          if reader.IsEmptyElement():
     218                              self._cont_handler.endElementNS(eltName,eltQName)
     219                              for newPrefix in newPrefixes:
     220                                  self._cont_handler.endPrefixMapping(newPrefix)
     221                          else:
     222                              prefixes.append(newPrefixes)
     223                      else:
     224                          eltName = _d(reader.Name())
     225                          attributesImpl._attrs = attrs = {}
     226                          while reader.MoveToNextAttribute():
     227                              attName = _d(reader.Name())
     228                              attrs[attName] = _d(reader.Value())
     229                          reader.MoveToElement()
     230                          self._cont_handler.startElement( \
     231                              eltName,attributesImpl)
     232                          if reader.IsEmptyElement():
     233                              self._cont_handler.endElement(eltName)
     234                  # EndElement
     235                  elif nodeType == 15: 
     236                      if self.__ns:
     237                          self._cont_handler.endElementNS( \
     238                               (_d(reader.NamespaceUri()),_d(reader.LocalName())),
     239                               _d(reader.Name()))
     240                          for prefix in prefixes.pop():
     241                              self._cont_handler.endPrefixMapping(prefix)
     242                      else:
     243                          self._cont_handler.endElement(_d(reader.Name()))
     244                  # Text
     245                  elif nodeType == 3: 
     246                      self._cont_handler.characters(_d(reader.Value()))
     247                  # Whitespace
     248                  elif nodeType == 13: 
     249                      self._cont_handler.ignorableWhitespace(_d(reader.Value()))
     250                  # SignificantWhitespace
     251                  elif nodeType == 14:
     252                      self._cont_handler.characters(_d(reader.Value()))
     253                  # CDATA
     254                  elif nodeType == 4:
     255                      if not self.__lex_handler is None:
     256                          self.__lex_handler.startCDATA()
     257                      self._cont_handler.characters(_d(reader.Value()))
     258                      if not self.__lex_handler is None:
     259                          self.__lex_handler.endCDATA()
     260                  # EntityReference
     261                  elif nodeType == 5:
     262                      if not self.__lex_handler is None:
     263                          self.startEntity(_d(reader.Name()))
     264                      reader.ResolveEntity()
     265                  # EndEntity
     266                  elif nodeType == 16:
     267                      if not self.__lex_handler is None:
     268                          self.endEntity(_d(reader.Name()))
     269                  # ProcessingInstruction
     270                  elif nodeType == 7: 
     271                      self._cont_handler.processingInstruction( \
     272                          _d(reader.Name()),_d(reader.Value()))
     273                  # Comment
     274                  elif nodeType == 8:
     275                      if not self.__lex_handler is None:
     276                          self.__lex_handler.comment(_d(reader.Value()))
     277                  # DocumentType
     278                  elif nodeType == 10:
     279                      #if not self.__lex_handler is None:
     280                      #    self.__lex_handler.startDTD()
     281                      pass # TODO (how to detect endDTD? on first non-dtd event?)
     282                  # XmlDeclaration
     283                  elif nodeType == 17:
     284                      pass # TODO
     285                  # Entity
     286                  elif nodeType == 6:
     287                      pass # TODO (entity decl)
     288                  # Notation (decl)
     289                  elif nodeType == 12:
     290                      pass # TODO
     291                  # Attribute (never in this loop)
     292                  #elif nodeType == 2: 
     293                  #    pass
     294                  # Document (not exposed)
     295                  #elif nodeType == 9: 
     296                  #    pass
     297                  # DocumentFragment (never returned by XmlReader)
     298                  #elif nodeType == 11:
     299                  #    pass
     300                  # None
     301                  #elif nodeType == 0:
     302                  #    pass
     303                  # -
     304                  else:
     305                      raise SAXException("Unexpected node type %d" % nodeType)
     306              if r == 0:
     307                  self._cont_handler.endDocument()
     308              reader.Close()
     309          finally:
     310              self.__parsing = 0
     311  
     312      def setDTDHandler(self, handler):
     313          # TODO (when supported, the inherited method works just fine)
     314          raise SAXNotSupportedException("DTDHandler not supported")
     315  
     316      def setEntityResolver(self, resolver):
     317          # TODO (when supported, the inherited method works just fine)
     318          raise SAXNotSupportedException("EntityResolver not supported")
     319  
     320      def getFeature(self, name):
     321          if name == feature_namespaces:
     322              return self.__ns
     323          elif name == feature_namespace_prefixes:
     324              return self.__nspfx
     325          elif name == feature_validation:
     326              return self.__validate
     327          elif name == feature_external_ges:
     328              return 1 # TODO (does that relate to PARSER_LOADDTD)?
     329          elif name == feature_external_pes:
     330              return self.__extparams
     331          else:
     332              raise SAXNotRecognizedException("Feature '%s' not recognized" % \
     333                                              name)
     334  
     335      def setFeature(self, name, state):
     336          if self.__parsing:
     337              raise SAXNotSupportedException("Cannot set feature %s " \
     338                                             "while parsing" % name)
     339          if name == feature_namespaces:
     340              self.__ns = state
     341          elif name == feature_namespace_prefixes:
     342              self.__nspfx = state
     343          elif name == feature_validation:
     344              self.__validate = state
     345          elif name == feature_external_ges:
     346              if state == 0:
     347                  # TODO (does that relate to PARSER_LOADDTD)?
     348                  raise SAXNotSupportedException("Feature '%s' not supported" % \
     349                                                 name)
     350          elif name == feature_external_pes:
     351              self.__extparams = state
     352          else:
     353              raise SAXNotRecognizedException("Feature '%s' not recognized" % \
     354                                              name)
     355  
     356      def getProperty(self, name):
     357          if name == property_lexical_handler:
     358              return self.__lex_handler
     359          elif name == property_declaration_handler:
     360              return self.__decl_handler
     361          else:
     362              raise SAXNotRecognizedException("Property '%s' not recognized" % \
     363                                              name)
     364  
     365      def setProperty(self, name, value):     
     366          if name == property_lexical_handler:
     367              self.__lex_handler = value
     368          elif name == property_declaration_handler:
     369              # TODO: remove if/when libxml2 supports dtd events
     370              raise SAXNotSupportedException("Property '%s' not supported" % \
     371                                             name)
     372              self.__decl_handler = value
     373          else:
     374              raise SAXNotRecognizedException("Property '%s' not recognized" % \
     375                                              name)
     376  
     377  def create_parser():
     378      return LibXml2Reader()
     379