1  /*
       2   * HTMLtree.c : implementation of access function for an HTML tree.
       3   *
       4   * See Copyright for the status of this software.
       5   *
       6   * daniel@veillard.com
       7   */
       8  
       9  
      10  #define IN_LIBXML
      11  #include "libxml.h"
      12  #ifdef LIBXML_HTML_ENABLED
      13  
      14  #include <string.h> /* for memset() only ! */
      15  #include <ctype.h>
      16  #include <stdlib.h>
      17  
      18  #include <libxml/xmlmemory.h>
      19  #include <libxml/HTMLparser.h>
      20  #include <libxml/HTMLtree.h>
      21  #include <libxml/entities.h>
      22  #include <libxml/xmlerror.h>
      23  #include <libxml/parserInternals.h>
      24  #include <libxml/uri.h>
      25  
      26  #include "private/buf.h"
      27  #include "private/error.h"
      28  #include "private/io.h"
      29  #include "private/save.h"
      30  
      31  /************************************************************************
      32   *									*
      33   *		Getting/Setting encoding meta tags			*
      34   *									*
      35   ************************************************************************/
      36  
      37  /**
      38   * htmlGetMetaEncoding:
      39   * @doc:  the document
      40   *
      41   * Encoding definition lookup in the Meta tags
      42   *
      43   * Returns the current encoding as flagged in the HTML source
      44   */
      45  const xmlChar *
      46  htmlGetMetaEncoding(htmlDocPtr doc) {
      47      htmlNodePtr cur;
      48      const xmlChar *content;
      49      const xmlChar *encoding;
      50  
      51      if (doc == NULL)
      52  	return(NULL);
      53      cur = doc->children;
      54  
      55      /*
      56       * Search the html
      57       */
      58      while (cur != NULL) {
      59  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
      60  	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
      61  		break;
      62  	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
      63  		goto found_head;
      64  	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
      65  		goto found_meta;
      66  	}
      67  	cur = cur->next;
      68      }
      69      if (cur == NULL)
      70  	return(NULL);
      71      cur = cur->children;
      72  
      73      /*
      74       * Search the head
      75       */
      76      while (cur != NULL) {
      77  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
      78  	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
      79  		break;
      80  	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
      81  		goto found_meta;
      82  	}
      83  	cur = cur->next;
      84      }
      85      if (cur == NULL)
      86  	return(NULL);
      87  found_head:
      88      cur = cur->children;
      89  
      90      /*
      91       * Search the meta elements
      92       */
      93  found_meta:
      94      while (cur != NULL) {
      95  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
      96  	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
      97  		xmlAttrPtr attr = cur->properties;
      98  		int http;
      99  		const xmlChar *value;
     100  
     101  		content = NULL;
     102  		http = 0;
     103  		while (attr != NULL) {
     104  		    if ((attr->children != NULL) &&
     105  		        (attr->children->type == XML_TEXT_NODE) &&
     106  		        (attr->children->next == NULL)) {
     107  			value = attr->children->content;
     108  			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
     109  			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
     110  			    http = 1;
     111  			else if ((value != NULL)
     112  			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
     113  			    content = value;
     114  			if ((http != 0) && (content != NULL))
     115  			    goto found_content;
     116  		    }
     117  		    attr = attr->next;
     118  		}
     119  	    }
     120  	}
     121  	cur = cur->next;
     122      }
     123      return(NULL);
     124  
     125  found_content:
     126      encoding = xmlStrstr(content, BAD_CAST"charset=");
     127      if (encoding == NULL)
     128  	encoding = xmlStrstr(content, BAD_CAST"Charset=");
     129      if (encoding == NULL)
     130  	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
     131      if (encoding != NULL) {
     132  	encoding += 8;
     133      } else {
     134  	encoding = xmlStrstr(content, BAD_CAST"charset =");
     135  	if (encoding == NULL)
     136  	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
     137  	if (encoding == NULL)
     138  	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
     139  	if (encoding != NULL)
     140  	    encoding += 9;
     141      }
     142      if (encoding != NULL) {
     143  	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
     144      }
     145      return(encoding);
     146  }
     147  
     148  /**
     149   * htmlSetMetaEncoding:
     150   * @doc:  the document
     151   * @encoding:  the encoding string
     152   *
     153   * Sets the current encoding in the Meta tags
     154   * NOTE: this will not change the document content encoding, just
     155   * the META flag associated.
     156   *
     157   * Returns 0 in case of success and -1 in case of error
     158   */
     159  int
     160  htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
     161      htmlNodePtr cur, meta = NULL, head = NULL;
     162      const xmlChar *content = NULL;
     163      char newcontent[100];
     164  
     165      newcontent[0] = 0;
     166  
     167      if (doc == NULL)
     168  	return(-1);
     169  
     170      /* html isn't a real encoding it's just libxml2 way to get entities */
     171      if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
     172          return(-1);
     173  
     174      if (encoding != NULL) {
     175  	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
     176                  (char *)encoding);
     177  	newcontent[sizeof(newcontent) - 1] = 0;
     178      }
     179  
     180      cur = doc->children;
     181  
     182      /*
     183       * Search the html
     184       */
     185      while (cur != NULL) {
     186  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
     187  	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
     188  		break;
     189  	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
     190  		goto found_head;
     191  	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
     192  		goto found_meta;
     193  	}
     194  	cur = cur->next;
     195      }
     196      if (cur == NULL)
     197  	return(-1);
     198      cur = cur->children;
     199  
     200      /*
     201       * Search the head
     202       */
     203      while (cur != NULL) {
     204  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
     205  	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
     206  		break;
     207  	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
     208                  head = cur->parent;
     209  		goto found_meta;
     210              }
     211  	}
     212  	cur = cur->next;
     213      }
     214      if (cur == NULL)
     215  	return(-1);
     216  found_head:
     217      head = cur;
     218      if (cur->children == NULL)
     219          goto create;
     220      cur = cur->children;
     221  
     222  found_meta:
     223      /*
     224       * Search and update all the remaining the meta elements carrying
     225       * encoding information
     226       */
     227      while (cur != NULL) {
     228  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
     229  	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
     230  		xmlAttrPtr attr = cur->properties;
     231  		int http;
     232  		const xmlChar *value;
     233  
     234  		content = NULL;
     235  		http = 0;
     236  		while (attr != NULL) {
     237  		    if ((attr->children != NULL) &&
     238  		        (attr->children->type == XML_TEXT_NODE) &&
     239  		        (attr->children->next == NULL)) {
     240  			value = attr->children->content;
     241  			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
     242  			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
     243  			    http = 1;
     244  			else
     245                          {
     246                             if ((value != NULL) &&
     247                                 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
     248  			       content = value;
     249                          }
     250  		        if ((http != 0) && (content != NULL))
     251  			    break;
     252  		    }
     253  		    attr = attr->next;
     254  		}
     255  		if ((http != 0) && (content != NULL)) {
     256  		    meta = cur;
     257  		    break;
     258  		}
     259  
     260  	    }
     261  	}
     262  	cur = cur->next;
     263      }
     264  create:
     265      if (meta == NULL) {
     266          if ((encoding != NULL) && (head != NULL)) {
     267              /*
     268               * Create a new Meta element with the right attributes
     269               */
     270  
     271              meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
     272              if (head->children == NULL)
     273                  xmlAddChild(head, meta);
     274              else
     275                  xmlAddPrevSibling(head->children, meta);
     276              xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
     277              xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
     278          }
     279      } else {
     280          /* remove the meta tag if NULL is passed */
     281          if (encoding == NULL) {
     282              xmlUnlinkNode(meta);
     283              xmlFreeNode(meta);
     284          }
     285          /* change the document only if there is a real encoding change */
     286          else if (xmlStrcasestr(content, encoding) == NULL) {
     287              xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
     288          }
     289      }
     290  
     291  
     292      return(0);
     293  }
     294  
     295  /**
     296   * booleanHTMLAttrs:
     297   *
     298   * These are the HTML attributes which will be output
     299   * in minimized form, i.e. <option selected="selected"> will be
     300   * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
     301   *
     302   */
     303  static const char* const htmlBooleanAttrs[] = {
     304    "checked", "compact", "declare", "defer", "disabled", "ismap",
     305    "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
     306    "selected", NULL
     307  };
     308  
     309  
     310  /**
     311   * htmlIsBooleanAttr:
     312   * @name:  the name of the attribute to check
     313   *
     314   * Determine if a given attribute is a boolean attribute.
     315   *
     316   * returns: false if the attribute is not boolean, true otherwise.
     317   */
     318  int
     319  htmlIsBooleanAttr(const xmlChar *name)
     320  {
     321      int i = 0;
     322  
     323      while (htmlBooleanAttrs[i] != NULL) {
     324          if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
     325              return 1;
     326          i++;
     327      }
     328      return 0;
     329  }
     330  
     331  #ifdef LIBXML_OUTPUT_ENABLED
     332  /************************************************************************
     333   *									*
     334   *			Output error handlers				*
     335   *									*
     336   ************************************************************************/
     337  /**
     338   * htmlSaveErrMemory:
     339   * @extra:  extra information
     340   *
     341   * Handle an out of memory condition
     342   */
     343  static void
     344  htmlSaveErrMemory(const char *extra)
     345  {
     346      __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
     347  }
     348  
     349  /**
     350   * htmlSaveErr:
     351   * @code:  the error number
     352   * @node:  the location of the error.
     353   * @extra:  extra information
     354   *
     355   * Handle an out of memory condition
     356   */
     357  static void
     358  htmlSaveErr(int code, xmlNodePtr node, const char *extra)
     359  {
     360      const char *msg = NULL;
     361  
     362      switch(code) {
     363          case XML_SAVE_NOT_UTF8:
     364  	    msg = "string is not in UTF-8\n";
     365  	    break;
     366  	case XML_SAVE_CHAR_INVALID:
     367  	    msg = "invalid character value\n";
     368  	    break;
     369  	case XML_SAVE_UNKNOWN_ENCODING:
     370  	    msg = "unknown encoding %s\n";
     371  	    break;
     372  	case XML_SAVE_NO_DOCTYPE:
     373  	    msg = "HTML has no DOCTYPE\n";
     374  	    break;
     375  	default:
     376  	    msg = "unexpected error number\n";
     377      }
     378      __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
     379  }
     380  
     381  /************************************************************************
     382   *									*
     383   *		Dumping HTML tree content to a simple buffer		*
     384   *									*
     385   ************************************************************************/
     386  
     387  /**
     388   * htmlBufNodeDumpFormat:
     389   * @buf:  the xmlBufPtr output
     390   * @doc:  the document
     391   * @cur:  the current node
     392   * @format:  should formatting spaces been added
     393   *
     394   * Dump an HTML node, recursive behaviour,children are printed too.
     395   *
     396   * Returns the number of byte written or -1 in case of error
     397   */
     398  static size_t
     399  htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
     400  	           int format) {
     401      size_t use;
     402      int ret;
     403      xmlOutputBufferPtr outbuf;
     404  
     405      if (cur == NULL) {
     406  	return (-1);
     407      }
     408      if (buf == NULL) {
     409  	return (-1);
     410      }
     411      outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
     412      if (outbuf == NULL) {
     413          htmlSaveErrMemory("allocating HTML output buffer");
     414  	return (-1);
     415      }
     416      memset(outbuf, 0, sizeof(xmlOutputBuffer));
     417      outbuf->buffer = buf;
     418      outbuf->encoder = NULL;
     419      outbuf->writecallback = NULL;
     420      outbuf->closecallback = NULL;
     421      outbuf->context = NULL;
     422      outbuf->written = 0;
     423  
     424      use = xmlBufUse(buf);
     425      htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
     426      xmlFree(outbuf);
     427      ret = xmlBufUse(buf) - use;
     428      return (ret);
     429  }
     430  
     431  /**
     432   * htmlNodeDump:
     433   * @buf:  the HTML buffer output
     434   * @doc:  the document
     435   * @cur:  the current node
     436   *
     437   * Dump an HTML node, recursive behaviour,children are printed too,
     438   * and formatting returns are added.
     439   *
     440   * Returns the number of byte written or -1 in case of error
     441   */
     442  int
     443  htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
     444      xmlBufPtr buffer;
     445      size_t ret;
     446  
     447      if ((buf == NULL) || (cur == NULL))
     448          return(-1);
     449  
     450      xmlInitParser();
     451      buffer = xmlBufFromBuffer(buf);
     452      if (buffer == NULL)
     453          return(-1);
     454  
     455      ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
     456  
     457      xmlBufBackToBuffer(buffer);
     458  
     459      if (ret > INT_MAX)
     460          return(-1);
     461      return((int) ret);
     462  }
     463  
     464  /**
     465   * htmlNodeDumpFileFormat:
     466   * @out:  the FILE pointer
     467   * @doc:  the document
     468   * @cur:  the current node
     469   * @encoding: the document encoding
     470   * @format:  should formatting spaces been added
     471   *
     472   * Dump an HTML node, recursive behaviour,children are printed too.
     473   *
     474   * TODO: if encoding == NULL try to save in the doc encoding
     475   *
     476   * returns: the number of byte written or -1 in case of failure.
     477   */
     478  int
     479  htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
     480  	               xmlNodePtr cur, const char *encoding, int format) {
     481      xmlOutputBufferPtr buf;
     482      xmlCharEncodingHandlerPtr handler = NULL;
     483      int ret;
     484  
     485      xmlInitParser();
     486  
     487      if (encoding != NULL) {
     488  	xmlCharEncoding enc;
     489  
     490  	enc = xmlParseCharEncoding(encoding);
     491  	if (enc != XML_CHAR_ENCODING_UTF8) {
     492  	    handler = xmlFindCharEncodingHandler(encoding);
     493  	    if (handler == NULL)
     494  		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
     495  	}
     496      } else {
     497          /*
     498           * Fallback to HTML or ASCII when the encoding is unspecified
     499           */
     500          if (handler == NULL)
     501              handler = xmlFindCharEncodingHandler("HTML");
     502          if (handler == NULL)
     503              handler = xmlFindCharEncodingHandler("ascii");
     504      }
     505  
     506      /*
     507       * save the content to a temp buffer.
     508       */
     509      buf = xmlOutputBufferCreateFile(out, handler);
     510      if (buf == NULL) return(0);
     511  
     512      htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
     513  
     514      ret = xmlOutputBufferClose(buf);
     515      return(ret);
     516  }
     517  
     518  /**
     519   * htmlNodeDumpFile:
     520   * @out:  the FILE pointer
     521   * @doc:  the document
     522   * @cur:  the current node
     523   *
     524   * Dump an HTML node, recursive behaviour,children are printed too,
     525   * and formatting returns are added.
     526   */
     527  void
     528  htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
     529      htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
     530  }
     531  
     532  /**
     533   * htmlDocDumpMemoryFormat:
     534   * @cur:  the document
     535   * @mem:  OUT: the memory pointer
     536   * @size:  OUT: the memory length
     537   * @format:  should formatting spaces been added
     538   *
     539   * Dump an HTML document in memory and return the xmlChar * and it's size.
     540   * It's up to the caller to free the memory.
     541   */
     542  void
     543  htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
     544      xmlOutputBufferPtr buf;
     545      xmlCharEncodingHandlerPtr handler = NULL;
     546      const char *encoding;
     547  
     548      xmlInitParser();
     549  
     550      if ((mem == NULL) || (size == NULL))
     551          return;
     552      if (cur == NULL) {
     553  	*mem = NULL;
     554  	*size = 0;
     555  	return;
     556      }
     557  
     558      encoding = (const char *) htmlGetMetaEncoding(cur);
     559  
     560      if (encoding != NULL) {
     561  	xmlCharEncoding enc;
     562  
     563  	enc = xmlParseCharEncoding(encoding);
     564  	if (enc != XML_CHAR_ENCODING_UTF8) {
     565  	    handler = xmlFindCharEncodingHandler(encoding);
     566  	    if (handler == NULL)
     567                  htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
     568  
     569  	}
     570      } else {
     571          /*
     572           * Fallback to HTML or ASCII when the encoding is unspecified
     573           */
     574          if (handler == NULL)
     575              handler = xmlFindCharEncodingHandler("HTML");
     576          if (handler == NULL)
     577              handler = xmlFindCharEncodingHandler("ascii");
     578      }
     579  
     580      buf = xmlAllocOutputBufferInternal(handler);
     581      if (buf == NULL) {
     582  	*mem = NULL;
     583  	*size = 0;
     584  	return;
     585      }
     586  
     587      htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
     588  
     589      xmlOutputBufferFlush(buf);
     590      if (buf->conv != NULL) {
     591  	*size = xmlBufUse(buf->conv);
     592  	*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
     593      } else {
     594  	*size = xmlBufUse(buf->buffer);
     595  	*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
     596      }
     597      (void)xmlOutputBufferClose(buf);
     598  }
     599  
     600  /**
     601   * htmlDocDumpMemory:
     602   * @cur:  the document
     603   * @mem:  OUT: the memory pointer
     604   * @size:  OUT: the memory length
     605   *
     606   * Dump an HTML document in memory and return the xmlChar * and it's size.
     607   * It's up to the caller to free the memory.
     608   */
     609  void
     610  htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
     611  	htmlDocDumpMemoryFormat(cur, mem, size, 1);
     612  }
     613  
     614  
     615  /************************************************************************
     616   *									*
     617   *		Dumping HTML tree content to an I/O output buffer	*
     618   *									*
     619   ************************************************************************/
     620  
     621  /**
     622   * htmlDtdDumpOutput:
     623   * @buf:  the HTML buffer output
     624   * @doc:  the document
     625   * @encoding:  the encoding string
     626   *
     627   * TODO: check whether encoding is needed
     628   *
     629   * Dump the HTML document DTD, if any.
     630   */
     631  static void
     632  htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
     633  	          const char *encoding ATTRIBUTE_UNUSED) {
     634      xmlDtdPtr cur = doc->intSubset;
     635  
     636      if (cur == NULL) {
     637  	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
     638  	return;
     639      }
     640      xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
     641      xmlOutputBufferWriteString(buf, (const char *)cur->name);
     642      if (cur->ExternalID != NULL) {
     643  	xmlOutputBufferWriteString(buf, " PUBLIC ");
     644  	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
     645  	if (cur->SystemID != NULL) {
     646  	    xmlOutputBufferWriteString(buf, " ");
     647  	    xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
     648  	}
     649      } else if (cur->SystemID != NULL &&
     650  	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
     651  	xmlOutputBufferWriteString(buf, " SYSTEM ");
     652  	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
     653      }
     654      xmlOutputBufferWriteString(buf, ">\n");
     655  }
     656  
     657  /**
     658   * htmlAttrDumpOutput:
     659   * @buf:  the HTML buffer output
     660   * @doc:  the document
     661   * @cur:  the attribute pointer
     662   *
     663   * Dump an HTML attribute
     664   */
     665  static void
     666  htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
     667      xmlChar *value;
     668  
     669      /*
     670       * The html output method should not escape a & character
     671       * occurring in an attribute value immediately followed by
     672       * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
     673       * This is implemented in xmlEncodeEntitiesReentrant
     674       */
     675  
     676      if (cur == NULL) {
     677  	return;
     678      }
     679      xmlOutputBufferWriteString(buf, " ");
     680      if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
     681          xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
     682  	xmlOutputBufferWriteString(buf, ":");
     683      }
     684      xmlOutputBufferWriteString(buf, (const char *)cur->name);
     685      if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
     686  	value = xmlNodeListGetString(doc, cur->children, 0);
     687  	if (value) {
     688  	    xmlOutputBufferWriteString(buf, "=");
     689  	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
     690  		(cur->parent->ns == NULL) &&
     691  		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
     692  	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
     693  		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
     694  		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
     695  		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
     696  		xmlChar *escaped;
     697  		xmlChar *tmp = value;
     698  
     699  		while (IS_BLANK_CH(*tmp)) tmp++;
     700  
     701  		/*
     702                   * Angle brackets are technically illegal in URIs, but they're
     703                   * used in server side includes, for example. Curly brackets
     704                   * are illegal as well and often used in templates.
     705                   * Don't escape non-whitespace, printable ASCII chars for
     706                   * improved interoperability. Only escape space, control
     707                   * and non-ASCII chars.
     708  		 */
     709  		escaped = xmlURIEscapeStr(tmp,
     710                          BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{|}");
     711  		if (escaped != NULL) {
     712  		    xmlBufWriteQuotedString(buf->buffer, escaped);
     713  		    xmlFree(escaped);
     714  		} else {
     715  		    xmlBufWriteQuotedString(buf->buffer, value);
     716  		}
     717  	    } else {
     718  		xmlBufWriteQuotedString(buf->buffer, value);
     719  	    }
     720  	    xmlFree(value);
     721  	} else  {
     722  	    xmlOutputBufferWriteString(buf, "=\"\"");
     723  	}
     724      }
     725  }
     726  
     727  /**
     728   * htmlNodeDumpFormatOutput:
     729   * @buf:  the HTML buffer output
     730   * @doc:  the document
     731   * @cur:  the current node
     732   * @encoding:  the encoding string (unused)
     733   * @format:  should formatting spaces been added
     734   *
     735   * Dump an HTML node, recursive behaviour,children are printed too.
     736   */
     737  void
     738  htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
     739  	                 xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
     740                           int format) {
     741      xmlNodePtr root, parent;
     742      xmlAttrPtr attr;
     743      const htmlElemDesc * info;
     744  
     745      xmlInitParser();
     746  
     747      if ((cur == NULL) || (buf == NULL)) {
     748  	return;
     749      }
     750  
     751      root = cur;
     752      parent = cur->parent;
     753      while (1) {
     754          switch (cur->type) {
     755          case XML_HTML_DOCUMENT_NODE:
     756          case XML_DOCUMENT_NODE:
     757              if (((xmlDocPtr) cur)->intSubset != NULL) {
     758                  htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
     759              }
     760              if (cur->children != NULL) {
     761                  /* Always validate cur->parent when descending. */
     762                  if (cur->parent == parent) {
     763                      parent = cur;
     764                      cur = cur->children;
     765                      continue;
     766                  }
     767              } else {
     768                  xmlOutputBufferWriteString(buf, "\n");
     769              }
     770              break;
     771  
     772          case XML_ELEMENT_NODE:
     773              /*
     774               * Some users like lxml are known to pass nodes with a corrupted
     775               * tree structure. Fall back to a recursive call to handle this
     776               * case.
     777               */
     778              if ((cur->parent != parent) && (cur->children != NULL)) {
     779                  htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
     780                  break;
     781              }
     782  
     783              /*
     784               * Get specific HTML info for that node.
     785               */
     786              if (cur->ns == NULL)
     787                  info = htmlTagLookup(cur->name);
     788              else
     789                  info = NULL;
     790  
     791              xmlOutputBufferWriteString(buf, "<");
     792              if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
     793                  xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
     794                  xmlOutputBufferWriteString(buf, ":");
     795              }
     796              xmlOutputBufferWriteString(buf, (const char *)cur->name);
     797              if (cur->nsDef)
     798                  xmlNsListDumpOutput(buf, cur->nsDef);
     799              attr = cur->properties;
     800              while (attr != NULL) {
     801                  htmlAttrDumpOutput(buf, doc, attr);
     802                  attr = attr->next;
     803              }
     804  
     805              if ((info != NULL) && (info->empty)) {
     806                  xmlOutputBufferWriteString(buf, ">");
     807              } else if (cur->children == NULL) {
     808                  if ((info != NULL) && (info->saveEndTag != 0) &&
     809                      (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
     810                      (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
     811                      xmlOutputBufferWriteString(buf, ">");
     812                  } else {
     813                      xmlOutputBufferWriteString(buf, "></");
     814                      if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
     815                          xmlOutputBufferWriteString(buf,
     816                                  (const char *)cur->ns->prefix);
     817                          xmlOutputBufferWriteString(buf, ":");
     818                      }
     819                      xmlOutputBufferWriteString(buf, (const char *)cur->name);
     820                      xmlOutputBufferWriteString(buf, ">");
     821                  }
     822              } else {
     823                  xmlOutputBufferWriteString(buf, ">");
     824                  if ((format) && (info != NULL) && (!info->isinline) &&
     825                      (cur->children->type != HTML_TEXT_NODE) &&
     826                      (cur->children->type != HTML_ENTITY_REF_NODE) &&
     827                      (cur->children != cur->last) &&
     828                      (cur->name != NULL) &&
     829                      (cur->name[0] != 'p')) /* p, pre, param */
     830                      xmlOutputBufferWriteString(buf, "\n");
     831                  parent = cur;
     832                  cur = cur->children;
     833                  continue;
     834              }
     835  
     836              if ((format) && (cur->next != NULL) &&
     837                  (info != NULL) && (!info->isinline)) {
     838                  if ((cur->next->type != HTML_TEXT_NODE) &&
     839                      (cur->next->type != HTML_ENTITY_REF_NODE) &&
     840                      (parent != NULL) &&
     841                      (parent->name != NULL) &&
     842                      (parent->name[0] != 'p')) /* p, pre, param */
     843                      xmlOutputBufferWriteString(buf, "\n");
     844              }
     845  
     846              break;
     847  
     848          case XML_ATTRIBUTE_NODE:
     849              htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
     850              break;
     851  
     852          case HTML_TEXT_NODE:
     853              if (cur->content == NULL)
     854                  break;
     855              if (((cur->name == (const xmlChar *)xmlStringText) ||
     856                   (cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
     857                  ((parent == NULL) ||
     858                   ((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
     859                    (xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
     860                  xmlChar *buffer;
     861  
     862                  buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
     863                  if (buffer != NULL) {
     864                      xmlOutputBufferWriteString(buf, (const char *)buffer);
     865                      xmlFree(buffer);
     866                  }
     867              } else {
     868                  xmlOutputBufferWriteString(buf, (const char *)cur->content);
     869              }
     870              break;
     871  
     872          case HTML_COMMENT_NODE:
     873              if (cur->content != NULL) {
     874                  xmlOutputBufferWriteString(buf, "<!--");
     875                  xmlOutputBufferWriteString(buf, (const char *)cur->content);
     876                  xmlOutputBufferWriteString(buf, "-->");
     877              }
     878              break;
     879  
     880          case HTML_PI_NODE:
     881              if (cur->name != NULL) {
     882                  xmlOutputBufferWriteString(buf, "<?");
     883                  xmlOutputBufferWriteString(buf, (const char *)cur->name);
     884                  if (cur->content != NULL) {
     885                      xmlOutputBufferWriteString(buf, " ");
     886                      xmlOutputBufferWriteString(buf,
     887                              (const char *)cur->content);
     888                  }
     889                  xmlOutputBufferWriteString(buf, ">");
     890              }
     891              break;
     892  
     893          case HTML_ENTITY_REF_NODE:
     894              xmlOutputBufferWriteString(buf, "&");
     895              xmlOutputBufferWriteString(buf, (const char *)cur->name);
     896              xmlOutputBufferWriteString(buf, ";");
     897              break;
     898  
     899          case HTML_PRESERVE_NODE:
     900              if (cur->content != NULL) {
     901                  xmlOutputBufferWriteString(buf, (const char *)cur->content);
     902              }
     903              break;
     904  
     905          default:
     906              break;
     907          }
     908  
     909          while (1) {
     910              if (cur == root)
     911                  return;
     912              if (cur->next != NULL) {
     913                  cur = cur->next;
     914                  break;
     915              }
     916  
     917              cur = parent;
     918              /* cur->parent was validated when descending. */
     919              parent = cur->parent;
     920  
     921              if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
     922                  (cur->type == XML_DOCUMENT_NODE)) {
     923                  xmlOutputBufferWriteString(buf, "\n");
     924              } else {
     925                  if ((format) && (cur->ns == NULL))
     926                      info = htmlTagLookup(cur->name);
     927                  else
     928                      info = NULL;
     929  
     930                  if ((format) && (info != NULL) && (!info->isinline) &&
     931                      (cur->last->type != HTML_TEXT_NODE) &&
     932                      (cur->last->type != HTML_ENTITY_REF_NODE) &&
     933                      (cur->children != cur->last) &&
     934                      (cur->name != NULL) &&
     935                      (cur->name[0] != 'p')) /* p, pre, param */
     936                      xmlOutputBufferWriteString(buf, "\n");
     937  
     938                  xmlOutputBufferWriteString(buf, "</");
     939                  if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
     940                      xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
     941                      xmlOutputBufferWriteString(buf, ":");
     942                  }
     943                  xmlOutputBufferWriteString(buf, (const char *)cur->name);
     944                  xmlOutputBufferWriteString(buf, ">");
     945  
     946                  if ((format) && (info != NULL) && (!info->isinline) &&
     947                      (cur->next != NULL)) {
     948                      if ((cur->next->type != HTML_TEXT_NODE) &&
     949                          (cur->next->type != HTML_ENTITY_REF_NODE) &&
     950                          (parent != NULL) &&
     951                          (parent->name != NULL) &&
     952                          (parent->name[0] != 'p')) /* p, pre, param */
     953                          xmlOutputBufferWriteString(buf, "\n");
     954                  }
     955              }
     956          }
     957      }
     958  }
     959  
     960  /**
     961   * htmlNodeDumpOutput:
     962   * @buf:  the HTML buffer output
     963   * @doc:  the document
     964   * @cur:  the current node
     965   * @encoding:  the encoding string (unused)
     966   *
     967   * Dump an HTML node, recursive behaviour,children are printed too,
     968   * and formatting returns/spaces are added.
     969   */
     970  void
     971  htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
     972  	           xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
     973      htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
     974  }
     975  
     976  /**
     977   * htmlDocContentDumpFormatOutput:
     978   * @buf:  the HTML buffer output
     979   * @cur:  the document
     980   * @encoding:  the encoding string (unused)
     981   * @format:  should formatting spaces been added
     982   *
     983   * Dump an HTML document.
     984   */
     985  void
     986  htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
     987  	                       const char *encoding ATTRIBUTE_UNUSED,
     988                                 int format) {
     989      int type = 0;
     990      if (cur) {
     991          type = cur->type;
     992          cur->type = XML_HTML_DOCUMENT_NODE;
     993      }
     994      htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
     995      if (cur)
     996          cur->type = (xmlElementType) type;
     997  }
     998  
     999  /**
    1000   * htmlDocContentDumpOutput:
    1001   * @buf:  the HTML buffer output
    1002   * @cur:  the document
    1003   * @encoding:  the encoding string (unused)
    1004   *
    1005   * Dump an HTML document. Formatting return/spaces are added.
    1006   */
    1007  void
    1008  htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
    1009  	                 const char *encoding ATTRIBUTE_UNUSED) {
    1010      htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
    1011  }
    1012  
    1013  /************************************************************************
    1014   *									*
    1015   *		Saving functions front-ends				*
    1016   *									*
    1017   ************************************************************************/
    1018  
    1019  /**
    1020   * htmlDocDump:
    1021   * @f:  the FILE*
    1022   * @cur:  the document
    1023   *
    1024   * Dump an HTML document to an open FILE.
    1025   *
    1026   * returns: the number of byte written or -1 in case of failure.
    1027   */
    1028  int
    1029  htmlDocDump(FILE *f, xmlDocPtr cur) {
    1030      xmlOutputBufferPtr buf;
    1031      xmlCharEncodingHandlerPtr handler = NULL;
    1032      const char *encoding;
    1033      int ret;
    1034  
    1035      xmlInitParser();
    1036  
    1037      if ((cur == NULL) || (f == NULL)) {
    1038  	return(-1);
    1039      }
    1040  
    1041      encoding = (const char *) htmlGetMetaEncoding(cur);
    1042  
    1043      if (encoding != NULL) {
    1044  	xmlCharEncoding enc;
    1045  
    1046  	enc = xmlParseCharEncoding(encoding);
    1047  	if (enc != XML_CHAR_ENCODING_UTF8) {
    1048  	    handler = xmlFindCharEncodingHandler(encoding);
    1049  	    if (handler == NULL)
    1050  		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
    1051  	}
    1052      } else {
    1053          /*
    1054           * Fallback to HTML or ASCII when the encoding is unspecified
    1055           */
    1056          if (handler == NULL)
    1057              handler = xmlFindCharEncodingHandler("HTML");
    1058          if (handler == NULL)
    1059              handler = xmlFindCharEncodingHandler("ascii");
    1060      }
    1061  
    1062      buf = xmlOutputBufferCreateFile(f, handler);
    1063      if (buf == NULL) return(-1);
    1064      htmlDocContentDumpOutput(buf, cur, NULL);
    1065  
    1066      ret = xmlOutputBufferClose(buf);
    1067      return(ret);
    1068  }
    1069  
    1070  /**
    1071   * htmlSaveFile:
    1072   * @filename:  the filename (or URL)
    1073   * @cur:  the document
    1074   *
    1075   * Dump an HTML document to a file. If @filename is "-" the stdout file is
    1076   * used.
    1077   * returns: the number of byte written or -1 in case of failure.
    1078   */
    1079  int
    1080  htmlSaveFile(const char *filename, xmlDocPtr cur) {
    1081      xmlOutputBufferPtr buf;
    1082      xmlCharEncodingHandlerPtr handler = NULL;
    1083      const char *encoding;
    1084      int ret;
    1085  
    1086      if ((cur == NULL) || (filename == NULL))
    1087          return(-1);
    1088  
    1089      xmlInitParser();
    1090  
    1091      encoding = (const char *) htmlGetMetaEncoding(cur);
    1092  
    1093      if (encoding != NULL) {
    1094  	xmlCharEncoding enc;
    1095  
    1096  	enc = xmlParseCharEncoding(encoding);
    1097  	if (enc != XML_CHAR_ENCODING_UTF8) {
    1098  	    handler = xmlFindCharEncodingHandler(encoding);
    1099  	    if (handler == NULL)
    1100  		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
    1101  	}
    1102      } else {
    1103          /*
    1104           * Fallback to HTML or ASCII when the encoding is unspecified
    1105           */
    1106          if (handler == NULL)
    1107              handler = xmlFindCharEncodingHandler("HTML");
    1108          if (handler == NULL)
    1109              handler = xmlFindCharEncodingHandler("ascii");
    1110      }
    1111  
    1112      /*
    1113       * save the content to a temp buffer.
    1114       */
    1115      buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
    1116      if (buf == NULL) return(0);
    1117  
    1118      htmlDocContentDumpOutput(buf, cur, NULL);
    1119  
    1120      ret = xmlOutputBufferClose(buf);
    1121      return(ret);
    1122  }
    1123  
    1124  /**
    1125   * htmlSaveFileFormat:
    1126   * @filename:  the filename
    1127   * @cur:  the document
    1128   * @format:  should formatting spaces been added
    1129   * @encoding: the document encoding
    1130   *
    1131   * Dump an HTML document to a file using a given encoding.
    1132   *
    1133   * returns: the number of byte written or -1 in case of failure.
    1134   */
    1135  int
    1136  htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
    1137  	           const char *encoding, int format) {
    1138      xmlOutputBufferPtr buf;
    1139      xmlCharEncodingHandlerPtr handler = NULL;
    1140      int ret;
    1141  
    1142      if ((cur == NULL) || (filename == NULL))
    1143          return(-1);
    1144  
    1145      xmlInitParser();
    1146  
    1147      if (encoding != NULL) {
    1148  	xmlCharEncoding enc;
    1149  
    1150  	enc = xmlParseCharEncoding(encoding);
    1151  	if (enc != XML_CHAR_ENCODING_UTF8) {
    1152  	    handler = xmlFindCharEncodingHandler(encoding);
    1153  	    if (handler == NULL)
    1154  		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
    1155  	}
    1156          htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
    1157      } else {
    1158  	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
    1159  
    1160          /*
    1161           * Fallback to HTML or ASCII when the encoding is unspecified
    1162           */
    1163          if (handler == NULL)
    1164              handler = xmlFindCharEncodingHandler("HTML");
    1165          if (handler == NULL)
    1166              handler = xmlFindCharEncodingHandler("ascii");
    1167      }
    1168  
    1169      /*
    1170       * save the content to a temp buffer.
    1171       */
    1172      buf = xmlOutputBufferCreateFilename(filename, handler, 0);
    1173      if (buf == NULL) return(0);
    1174  
    1175      htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
    1176  
    1177      ret = xmlOutputBufferClose(buf);
    1178      return(ret);
    1179  }
    1180  
    1181  /**
    1182   * htmlSaveFileEnc:
    1183   * @filename:  the filename
    1184   * @cur:  the document
    1185   * @encoding: the document encoding
    1186   *
    1187   * Dump an HTML document to a file using a given encoding
    1188   * and formatting returns/spaces are added.
    1189   *
    1190   * returns: the number of byte written or -1 in case of failure.
    1191   */
    1192  int
    1193  htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
    1194      return(htmlSaveFileFormat(filename, cur, encoding, 1));
    1195  }
    1196  
    1197  #endif /* LIBXML_OUTPUT_ENABLED */
    1198  
    1199  #endif /* LIBXML_HTML_ENABLED */