(root)/
libxml2-2.12.3/
include/
libxml/
HTMLparser.h
       1  /*
       2   * Summary: interface for an HTML 4.0 non-verifying parser
       3   * Description: this module implements an HTML 4.0 non-verifying parser
       4   *              with API compatible with the XML parser ones. It should
       5   *              be able to parse "real world" HTML, even if severely
       6   *              broken from a specification point of view.
       7   *
       8   * Copy: See Copyright for the status of this software.
       9   *
      10   * Author: Daniel Veillard
      11   */
      12  
      13  #ifndef __HTML_PARSER_H__
      14  #define __HTML_PARSER_H__
      15  #include <libxml/xmlversion.h>
      16  #include <libxml/parser.h>
      17  
      18  #ifdef LIBXML_HTML_ENABLED
      19  
      20  #ifdef __cplusplus
      21  extern "C" {
      22  #endif
      23  
      24  /*
      25   * Most of the back-end structures from XML and HTML are shared.
      26   */
      27  typedef xmlParserCtxt htmlParserCtxt;
      28  typedef xmlParserCtxtPtr htmlParserCtxtPtr;
      29  typedef xmlParserNodeInfo htmlParserNodeInfo;
      30  typedef xmlSAXHandler htmlSAXHandler;
      31  typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
      32  typedef xmlParserInput htmlParserInput;
      33  typedef xmlParserInputPtr htmlParserInputPtr;
      34  typedef xmlDocPtr htmlDocPtr;
      35  typedef xmlNodePtr htmlNodePtr;
      36  
      37  /*
      38   * Internal description of an HTML element, representing HTML 4.01
      39   * and XHTML 1.0 (which share the same structure).
      40   */
      41  typedef struct _htmlElemDesc htmlElemDesc;
      42  typedef htmlElemDesc *htmlElemDescPtr;
      43  struct _htmlElemDesc {
      44      const char *name;	/* The tag name of the element */
      45      char startTag;      /* Whether the start tag can be implied */
      46      char endTag;        /* Whether the end tag can be implied */
      47      char saveEndTag;    /* Whether the end tag should be saved */
      48      char empty;         /* Whether this is an empty element */
      49      char depr;          /* Whether this is a deprecated element */
      50      char dtd;           /* 1: only in the Loose DTD, 2: only in the Frameset DTD */
      51      char isinline;      /* 0: block element, 1: inline element */
      52      const char *desc;   /* The description of the element */
      53  
      54  /* NRK Jan.2003
      55   * New fields encapsulating HTML structure.
      56   *
      57   * Bugs:
      58   *	This is a very limited representation.  It fails to tell us when
      59   *	an element *requires* subelements (we only know whether they are
      60   *	allowed or not), and it doesn't tell us where CDATA and PCDATA
      61   *	are allowed.  Some element relationships are not fully represented:
      62   *	these are flagged with the word MODIFIER.
      63   */
      64      const char** subelts;		/* Allowed sub-elements of this element */
      65      const char* defaultsubelt;	/* Sub-element suggested for auto-repair
      66  					   if necessary, or NULL */
      67      const char** attrs_opt;		/* Optional attributes */
      68      const char** attrs_depr;		/* Additional deprecated attributes */
      69      const char** attrs_req;		/* Required attributes */
      70  };
      71  
      72  /*
      73   * Internal description of an HTML entity.
      74   */
      75  typedef struct _htmlEntityDesc htmlEntityDesc;
      76  typedef htmlEntityDesc *htmlEntityDescPtr;
      77  struct _htmlEntityDesc {
      78      unsigned int value;	/* The Unicode value of the character */
      79      const char *name;	/* The entity name */
      80      const char *desc;   /* The description of the entity */
      81  };
      82  
      83  /** DOC_DISABLE */
      84  #ifdef LIBXML_SAX1_ENABLED
      85    #define XML_GLOBALS_HTML \
      86      XML_OP(htmlDefaultSAXHandler, xmlSAXHandlerV1, XML_DEPRECATED)
      87  #else
      88    #define XML_GLOBALS_HTML
      89  #endif
      90  
      91  #define XML_OP XML_DECLARE_GLOBAL
      92  XML_GLOBALS_HTML
      93  #undef XML_OP
      94  
      95  #if defined(LIBXML_THREAD_ENABLED) && !defined(XML_GLOBALS_NO_REDEFINITION)
      96    #define htmlDefaultSAXHandler XML_GLOBAL_MACRO(htmlDefaultSAXHandler)
      97  #endif
      98  /** DOC_ENABLE */
      99  
     100  /*
     101   * There are only a few public functions.
     102   */
     103  XML_DEPRECATED
     104  XMLPUBFUN void
     105  			htmlInitAutoClose	(void);
     106  XMLPUBFUN const htmlElemDesc *
     107  			htmlTagLookup	(const xmlChar *tag);
     108  XMLPUBFUN const htmlEntityDesc *
     109  			htmlEntityLookup(const xmlChar *name);
     110  XMLPUBFUN const htmlEntityDesc *
     111  			htmlEntityValueLookup(unsigned int value);
     112  
     113  XMLPUBFUN int
     114  			htmlIsAutoClosed(htmlDocPtr doc,
     115  					 htmlNodePtr elem);
     116  XMLPUBFUN int
     117  			htmlAutoCloseTag(htmlDocPtr doc,
     118  					 const xmlChar *name,
     119  					 htmlNodePtr elem);
     120  XML_DEPRECATED
     121  XMLPUBFUN const htmlEntityDesc *
     122  			htmlParseEntityRef(htmlParserCtxtPtr ctxt,
     123  					 const xmlChar **str);
     124  XML_DEPRECATED
     125  XMLPUBFUN int
     126  			htmlParseCharRef(htmlParserCtxtPtr ctxt);
     127  XML_DEPRECATED
     128  XMLPUBFUN void
     129  			htmlParseElement(htmlParserCtxtPtr ctxt);
     130  
     131  XMLPUBFUN htmlParserCtxtPtr
     132  			htmlNewParserCtxt(void);
     133  XMLPUBFUN htmlParserCtxtPtr
     134  			htmlNewSAXParserCtxt(const htmlSAXHandler *sax,
     135  					     void *userData);
     136  
     137  XMLPUBFUN htmlParserCtxtPtr
     138  			htmlCreateMemoryParserCtxt(const char *buffer,
     139  						   int size);
     140  
     141  XMLPUBFUN int
     142  			htmlParseDocument(htmlParserCtxtPtr ctxt);
     143  XML_DEPRECATED
     144  XMLPUBFUN htmlDocPtr
     145  			htmlSAXParseDoc	(const xmlChar *cur,
     146  					 const char *encoding,
     147  					 htmlSAXHandlerPtr sax,
     148  					 void *userData);
     149  XMLPUBFUN htmlDocPtr
     150  			htmlParseDoc	(const xmlChar *cur,
     151  					 const char *encoding);
     152  XMLPUBFUN htmlParserCtxtPtr
     153  			htmlCreateFileParserCtxt(const char *filename,
     154  	                                         const char *encoding);
     155  XML_DEPRECATED
     156  XMLPUBFUN htmlDocPtr
     157  			htmlSAXParseFile(const char *filename,
     158  					 const char *encoding,
     159  					 htmlSAXHandlerPtr sax,
     160  					 void *userData);
     161  XMLPUBFUN htmlDocPtr
     162  			htmlParseFile	(const char *filename,
     163  					 const char *encoding);
     164  XMLPUBFUN int
     165  			UTF8ToHtml	(unsigned char *out,
     166  					 int *outlen,
     167  					 const unsigned char *in,
     168  					 int *inlen);
     169  XMLPUBFUN int
     170  			htmlEncodeEntities(unsigned char *out,
     171  					 int *outlen,
     172  					 const unsigned char *in,
     173  					 int *inlen, int quoteChar);
     174  XMLPUBFUN int
     175  			htmlIsScriptAttribute(const xmlChar *name);
     176  XMLPUBFUN int
     177  			htmlHandleOmittedElem(int val);
     178  
     179  #ifdef LIBXML_PUSH_ENABLED
     180  /**
     181   * Interfaces for the Push mode.
     182   */
     183  XMLPUBFUN htmlParserCtxtPtr
     184  			htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
     185  						 void *user_data,
     186  						 const char *chunk,
     187  						 int size,
     188  						 const char *filename,
     189  						 xmlCharEncoding enc);
     190  XMLPUBFUN int
     191  			htmlParseChunk		(htmlParserCtxtPtr ctxt,
     192  						 const char *chunk,
     193  						 int size,
     194  						 int terminate);
     195  #endif /* LIBXML_PUSH_ENABLED */
     196  
     197  XMLPUBFUN void
     198  			htmlFreeParserCtxt	(htmlParserCtxtPtr ctxt);
     199  
     200  /*
     201   * New set of simpler/more flexible APIs
     202   */
     203  /**
     204   * htmlParserOption:
     205   *
     206   * This is the set of HTML parser options that can be passed down
     207   * to the htmlReadDoc() and similar calls.
     208   */
     209  typedef enum {
     210      HTML_PARSE_RECOVER  = 1<<0, /* Relaxed parsing */
     211      HTML_PARSE_NODEFDTD = 1<<2, /* Do not add a default doctype if none is found */
     212      HTML_PARSE_NOERROR	= 1<<5,	/* Suppress error reports */
     213      HTML_PARSE_NOWARNING= 1<<6,	/* Suppress warning reports */
     214      HTML_PARSE_PEDANTIC	= 1<<7,	/* Pedantic error reporting */
     215      HTML_PARSE_NOBLANKS	= 1<<8,	/* Remove blank nodes */
     216      HTML_PARSE_NONET	= 1<<11,/* Forbid network access */
     217      HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */
     218      HTML_PARSE_COMPACT  = 1<<16,/* Compact small text nodes */
     219      HTML_PARSE_IGNORE_ENC=1<<21 /* Ignore internal document encoding hint */
     220  } htmlParserOption;
     221  
     222  XMLPUBFUN void
     223  		htmlCtxtReset		(htmlParserCtxtPtr ctxt);
     224  XMLPUBFUN int
     225  		htmlCtxtUseOptions	(htmlParserCtxtPtr ctxt,
     226  					 int options);
     227  XMLPUBFUN htmlDocPtr
     228  		htmlReadDoc		(const xmlChar *cur,
     229  					 const char *URL,
     230  					 const char *encoding,
     231  					 int options);
     232  XMLPUBFUN htmlDocPtr
     233  		htmlReadFile		(const char *URL,
     234  					 const char *encoding,
     235  					 int options);
     236  XMLPUBFUN htmlDocPtr
     237  		htmlReadMemory		(const char *buffer,
     238  					 int size,
     239  					 const char *URL,
     240  					 const char *encoding,
     241  					 int options);
     242  XMLPUBFUN htmlDocPtr
     243  		htmlReadFd		(int fd,
     244  					 const char *URL,
     245  					 const char *encoding,
     246  					 int options);
     247  XMLPUBFUN htmlDocPtr
     248  		htmlReadIO		(xmlInputReadCallback ioread,
     249  					 xmlInputCloseCallback ioclose,
     250  					 void *ioctx,
     251  					 const char *URL,
     252  					 const char *encoding,
     253  					 int options);
     254  XMLPUBFUN htmlDocPtr
     255  		htmlCtxtReadDoc		(xmlParserCtxtPtr ctxt,
     256  					 const xmlChar *cur,
     257  					 const char *URL,
     258  					 const char *encoding,
     259  					 int options);
     260  XMLPUBFUN htmlDocPtr
     261  		htmlCtxtReadFile		(xmlParserCtxtPtr ctxt,
     262  					 const char *filename,
     263  					 const char *encoding,
     264  					 int options);
     265  XMLPUBFUN htmlDocPtr
     266  		htmlCtxtReadMemory		(xmlParserCtxtPtr ctxt,
     267  					 const char *buffer,
     268  					 int size,
     269  					 const char *URL,
     270  					 const char *encoding,
     271  					 int options);
     272  XMLPUBFUN htmlDocPtr
     273  		htmlCtxtReadFd		(xmlParserCtxtPtr ctxt,
     274  					 int fd,
     275  					 const char *URL,
     276  					 const char *encoding,
     277  					 int options);
     278  XMLPUBFUN htmlDocPtr
     279  		htmlCtxtReadIO		(xmlParserCtxtPtr ctxt,
     280  					 xmlInputReadCallback ioread,
     281  					 xmlInputCloseCallback ioclose,
     282  					 void *ioctx,
     283  					 const char *URL,
     284  					 const char *encoding,
     285  					 int options);
     286  
     287  /* NRK/Jan2003: further knowledge of HTML structure
     288   */
     289  typedef enum {
     290    HTML_NA = 0 ,		/* something we don't check at all */
     291    HTML_INVALID = 0x1 ,
     292    HTML_DEPRECATED = 0x2 ,
     293    HTML_VALID = 0x4 ,	/* this bit is also contained in HTML_REQUIRED */
     294    HTML_REQUIRED = 0xc /* 0x8 | HTML_VALID, so ( status & HTML_VALID ) is non-zero */
     295  } htmlStatus ;
     296  
     297  /* Using htmlElemDesc rather than name here, to emphasise the fact
     298     that otherwise there's a lookup overhead
     299  */
     300  XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
     301  XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
     302  XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
     303  XMLPUBFUN htmlStatus htmlNodeStatus(const htmlNodePtr, int) ;
     304  /**
     305   * htmlDefaultSubelement:
     306   * @elt: pointer to an HTML element description (any pointer expression)
     307   *
     308   * Returns the defaultsubelt field of the element description @elt
     309   */
     310  #define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
     311  /**
     312   * htmlElementAllowedHereDesc:
     313   * @parent: description of the HTML parent element
     314   * @elt: description of the candidate child element
     315   *
     316   * Calls htmlElementAllowedHere() with the name of @elt; the name is
     317   * cast from char to xmlChar so the call is also valid in C++.
     318   *
     319   * Returns 1 if allowed; 0 otherwise.
     320   */
     321  #define htmlElementAllowedHereDesc(parent,elt) \
     322  	htmlElementAllowedHere((parent), (const xmlChar *) (elt)->name)
     323  /**
     324   * htmlRequiredAttrs:
     325   * @elt: pointer to an HTML element description
     326   *
     327   * Returns the attrs_req field (required attributes) of @elt.
     328   */
     329  #define htmlRequiredAttrs(elt) ((elt)->attrs_req)
     330  
     331  
     332  #ifdef __cplusplus
     333  }
     334  #endif
     335  
     336  #else /* LIBXML_HTML_ENABLED */
     337  
     338  /** DOC_DISABLE */
     339  #define XML_GLOBALS_HTML
     340  /** DOC_ENABLE */
     341  
     342  #endif /* LIBXML_HTML_ENABLED */
     343  #endif /* __HTML_PARSER_H__ */