1 /*
2 * Summary: interface for an HTML 4.0 non-verifying parser
3 * Description: this module implements an HTML 4.0 non-verifying parser
4 * with API compatible with the XML parser ones. It should
5 * be able to parse "real world" HTML, even if severely
6 * broken from a specification point of view.
7 *
8 * Copy: See Copyright for the status of this software.
9 *
10 * Author: Daniel Veillard
11 */
12
13 #ifndef __HTML_PARSER_H__
14 #define __HTML_PARSER_H__
15 #include <libxml/xmlversion.h>
16 #include <libxml/parser.h>
17
18 #ifdef LIBXML_HTML_ENABLED
19
20 #ifdef __cplusplus
21 extern "C" {
22 #endif
23
24 /*
25 * Most of the back-end structures from XML and HTML are shared.
26 */
27 typedef xmlParserCtxt htmlParserCtxt;
28 typedef xmlParserCtxtPtr htmlParserCtxtPtr;
29 typedef xmlParserNodeInfo htmlParserNodeInfo;
30 typedef xmlSAXHandler htmlSAXHandler;
31 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
32 typedef xmlParserInput htmlParserInput;
33 typedef xmlParserInputPtr htmlParserInputPtr;
34 typedef xmlDocPtr htmlDocPtr;
35 typedef xmlNodePtr htmlNodePtr;
36
/*
 * Internal description of an HTML element. One layout serves both
 * HTML 4.01 and XHTML 1.0, which share the same structure.
 */
typedef struct _htmlElemDesc htmlElemDesc;
typedef htmlElemDesc *htmlElemDescPtr;
struct _htmlElemDesc {
    const char *name;           /* tag name */
    char        startTag;       /* can the start tag be implied? */
    char        endTag;         /* can the end tag be implied? */
    char        saveEndTag;     /* should the end tag be saved? */
    char        empty;          /* is this an empty element? */
    char        depr;           /* is this a deprecated element? */
    char        dtd;            /* 1: Loose DTD only, 2: Frameset DTD only */
    char        isinline;       /* 0: block element, 1: inline element */
    const char *desc;           /* human-readable description */

    /*
     * NRK Jan.2003 -- fields encapsulating HTML structure.
     *
     * Known limitations:
     *   - only records whether sub-elements are allowed, never whether
     *     an element *requires* them;
     *   - does not record where CDATA and PCDATA are allowed;
     *   - some element relationships are not fully represented; those
     *     are flagged with the word MODIFIER.
     */
    const char **subelts;       /* allowed sub-elements of this element */
    const char  *defaultsubelt; /* sub-element suggested for auto-repair
                                   if necessary, or NULL */
    const char **attrs_opt;     /* optional attributes */
    const char **attrs_depr;    /* additional deprecated attributes */
    const char **attrs_req;     /* required attributes */
};
71
/*
 * Internal description of an HTML entity: its Unicode value together
 * with the entity name and a textual description.
 */
typedef struct _htmlEntityDesc htmlEntityDesc;
typedef htmlEntityDesc *htmlEntityDescPtr;
struct _htmlEntityDesc {
    unsigned int  value;    /* Unicode value of the character */
    const char   *name;     /* entity name */
    const char   *desc;     /* human-readable description */
};
82
/** DOC_DISABLE */
/*
 * XML_GLOBALS_HTML lists this module's global variables as XML_OP(...)
 * entries. The list holds htmlDefaultSAXHandler only when SAX1 support
 * is compiled in; otherwise it is empty.
 */
#if defined(LIBXML_SAX1_ENABLED)
#define XML_GLOBALS_HTML \
    XML_OP(htmlDefaultSAXHandler, xmlSAXHandlerV1, XML_DEPRECATED)
#else
#define XML_GLOBALS_HTML
#endif

/*
 * Expand the list once with XML_OP temporarily bound to
 * XML_DECLARE_GLOBAL (defined outside this header), then unbind it so
 * the helper name does not leak to includers.
 */
#define XML_OP XML_DECLARE_GLOBAL
XML_GLOBALS_HTML
#undef XML_OP

/*
 * In threaded builds, route the plain variable name through
 * XML_GLOBAL_MACRO unless the includer opted out by defining
 * XML_GLOBALS_NO_REDEFINITION.
 */
#if defined(LIBXML_THREAD_ENABLED) && !defined(XML_GLOBALS_NO_REDEFINITION)
#define htmlDefaultSAXHandler XML_GLOBAL_MACRO(htmlDefaultSAXHandler)
#endif
/** DOC_ENABLE */
99
100 /*
101 * There is only few public functions.
102 */
103 XML_DEPRECATED
104 XMLPUBFUN void
105 htmlInitAutoClose (void);
106 XMLPUBFUN const htmlElemDesc *
107 htmlTagLookup (const xmlChar *tag);
108 XMLPUBFUN const htmlEntityDesc *
109 htmlEntityLookup(const xmlChar *name);
110 XMLPUBFUN const htmlEntityDesc *
111 htmlEntityValueLookup(unsigned int value);
112
113 XMLPUBFUN int
114 htmlIsAutoClosed(htmlDocPtr doc,
115 htmlNodePtr elem);
116 XMLPUBFUN int
117 htmlAutoCloseTag(htmlDocPtr doc,
118 const xmlChar *name,
119 htmlNodePtr elem);
120 XML_DEPRECATED
121 XMLPUBFUN const htmlEntityDesc *
122 htmlParseEntityRef(htmlParserCtxtPtr ctxt,
123 const xmlChar **str);
124 XML_DEPRECATED
125 XMLPUBFUN int
126 htmlParseCharRef(htmlParserCtxtPtr ctxt);
127 XML_DEPRECATED
128 XMLPUBFUN void
129 htmlParseElement(htmlParserCtxtPtr ctxt);
130
131 XMLPUBFUN htmlParserCtxtPtr
132 htmlNewParserCtxt(void);
133 XMLPUBFUN htmlParserCtxtPtr
134 htmlNewSAXParserCtxt(const htmlSAXHandler *sax,
135 void *userData);
136
137 XMLPUBFUN htmlParserCtxtPtr
138 htmlCreateMemoryParserCtxt(const char *buffer,
139 int size);
140
141 XMLPUBFUN int
142 htmlParseDocument(htmlParserCtxtPtr ctxt);
143 XML_DEPRECATED
144 XMLPUBFUN htmlDocPtr
145 htmlSAXParseDoc (const xmlChar *cur,
146 const char *encoding,
147 htmlSAXHandlerPtr sax,
148 void *userData);
149 XMLPUBFUN htmlDocPtr
150 htmlParseDoc (const xmlChar *cur,
151 const char *encoding);
152 XMLPUBFUN htmlParserCtxtPtr
153 htmlCreateFileParserCtxt(const char *filename,
154 const char *encoding);
155 XML_DEPRECATED
156 XMLPUBFUN htmlDocPtr
157 htmlSAXParseFile(const char *filename,
158 const char *encoding,
159 htmlSAXHandlerPtr sax,
160 void *userData);
161 XMLPUBFUN htmlDocPtr
162 htmlParseFile (const char *filename,
163 const char *encoding);
164 XMLPUBFUN int
165 UTF8ToHtml (unsigned char *out,
166 int *outlen,
167 const unsigned char *in,
168 int *inlen);
169 XMLPUBFUN int
170 htmlEncodeEntities(unsigned char *out,
171 int *outlen,
172 const unsigned char *in,
173 int *inlen, int quoteChar);
174 XMLPUBFUN int
175 htmlIsScriptAttribute(const xmlChar *name);
176 XMLPUBFUN int
177 htmlHandleOmittedElem(int val);
178
#ifdef LIBXML_PUSH_ENABLED
/**
 * Interfaces for the Push mode.
 *
 * Only declared when the library is built with LIBXML_PUSH_ENABLED.
 */
XMLPUBFUN htmlParserCtxtPtr htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
                                                     void *user_data,
                                                     const char *chunk,
                                                     int size,
                                                     const char *filename,
                                                     xmlCharEncoding enc);
XMLPUBFUN int htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk,
                             int size, int terminate);
#endif /* LIBXML_PUSH_ENABLED */
196
197 XMLPUBFUN void
198 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt);
199
/*
 * New set of simpler/more flexible APIs
 */
/**
 * htmlParserOption:
 *
 * This is the set of HTML parser options that can be passed down
 * to the htmlReadDoc() and similar calls.
 *
 * Each option is a distinct bit, so options may be OR-ed together into
 * the int `options` argument. The bit positions are intentionally not
 * contiguous and must not be renumbered.
 */
typedef enum {
    HTML_PARSE_RECOVER    = 1 << 0,  /* Relaxed parsing */
    HTML_PARSE_NODEFDTD   = 1 << 2,  /* do not default a doctype if not found */
    HTML_PARSE_NOERROR    = 1 << 5,  /* suppress error reports */
    HTML_PARSE_NOWARNING  = 1 << 6,  /* suppress warning reports */
    HTML_PARSE_PEDANTIC   = 1 << 7,  /* pedantic error reporting */
    HTML_PARSE_NOBLANKS   = 1 << 8,  /* remove blank nodes */
    HTML_PARSE_NONET      = 1 << 11, /* Forbid network access */
    HTML_PARSE_NOIMPLIED  = 1 << 13, /* Do not add implied html/body... elements */
    HTML_PARSE_COMPACT    = 1 << 16, /* compact small text nodes */
    HTML_PARSE_IGNORE_ENC = 1 << 21  /* ignore internal document encoding hint */
} htmlParserOption;
221
222 XMLPUBFUN void
223 htmlCtxtReset (htmlParserCtxtPtr ctxt);
224 XMLPUBFUN int
225 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt,
226 int options);
227 XMLPUBFUN htmlDocPtr
228 htmlReadDoc (const xmlChar *cur,
229 const char *URL,
230 const char *encoding,
231 int options);
232 XMLPUBFUN htmlDocPtr
233 htmlReadFile (const char *URL,
234 const char *encoding,
235 int options);
236 XMLPUBFUN htmlDocPtr
237 htmlReadMemory (const char *buffer,
238 int size,
239 const char *URL,
240 const char *encoding,
241 int options);
242 XMLPUBFUN htmlDocPtr
243 htmlReadFd (int fd,
244 const char *URL,
245 const char *encoding,
246 int options);
247 XMLPUBFUN htmlDocPtr
248 htmlReadIO (xmlInputReadCallback ioread,
249 xmlInputCloseCallback ioclose,
250 void *ioctx,
251 const char *URL,
252 const char *encoding,
253 int options);
254 XMLPUBFUN htmlDocPtr
255 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt,
256 const xmlChar *cur,
257 const char *URL,
258 const char *encoding,
259 int options);
260 XMLPUBFUN htmlDocPtr
261 htmlCtxtReadFile (xmlParserCtxtPtr ctxt,
262 const char *filename,
263 const char *encoding,
264 int options);
265 XMLPUBFUN htmlDocPtr
266 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt,
267 const char *buffer,
268 int size,
269 const char *URL,
270 const char *encoding,
271 int options);
272 XMLPUBFUN htmlDocPtr
273 htmlCtxtReadFd (xmlParserCtxtPtr ctxt,
274 int fd,
275 const char *URL,
276 const char *encoding,
277 int options);
278 XMLPUBFUN htmlDocPtr
279 htmlCtxtReadIO (xmlParserCtxtPtr ctxt,
280 xmlInputReadCallback ioread,
281 xmlInputCloseCallback ioclose,
282 void *ioctx,
283 const char *URL,
284 const char *encoding,
285 int options);
286
/*
 * NRK/Jan2003: further knowledge of HTML structure.
 *
 * Result of a status check. HTML_REQUIRED (0xc) includes the
 * HTML_VALID bit (0x4), so (status & HTML_VALID) is also true for
 * required items.
 */
typedef enum {
    HTML_NA         = 0,    /* something we don't check at all */
    HTML_INVALID    = 0x1,
    HTML_DEPRECATED = 0x2,
    HTML_VALID      = 0x4,
    HTML_REQUIRED   = 0xc   /* VALID bit set so ( & HTML_VALID ) is TRUE */
} htmlStatus;
296
297 /* Using htmlElemDesc rather than name here, to emphasise the fact
298 that otherwise there's a lookup overhead
299 */
300 XMLPUBFUN htmlStatus htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
301 XMLPUBFUN int htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
302 XMLPUBFUN htmlStatus htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
303 XMLPUBFUN htmlStatus htmlNodeStatus(const htmlNodePtr, int) ;
/**
 * htmlDefaultSubelement:
 * @elt: HTML element
 *
 * Returns the default subelement for this element
 *
 * The argument and the whole expansion are parenthesised so that any
 * pointer expression (e.g. `p + 1` or a conditional) can be passed.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
/**
 * htmlElementAllowedHereDesc:
 * @parent: HTML parent element
 * @elt: HTML element
 *
 * Checks whether an HTML element description may be a
 * direct child of the specified element.
 *
 * The element name is stored as `const char *` but
 * htmlElementAllowedHere() takes `const xmlChar *`, so the name is
 * cast explicitly. This avoids a pointer-signedness diagnostic in C
 * and a conversion error in C++.
 *
 * Returns 1 if allowed; 0 otherwise.
 */
#define htmlElementAllowedHereDesc(parent,elt) \
    htmlElementAllowedHere((parent), (const xmlChar *) (elt)->name)
/**
 * htmlRequiredAttrs:
 * @elt: HTML element
 *
 * Returns the attributes required for the specified element.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
330
331
332 #ifdef __cplusplus
333 }
334 #endif
335
336 #else /* LIBXML_HTML_ENABLED */
337
338 /** DOC_DISABLE */
339 #define XML_GLOBALS_HTML
340 /** DOC_ENABLE */
341
342 #endif /* LIBXML_HTML_ENABLED */
343 #endif /* __HTML_PARSER_H__ */