/ lib / lxml / includes / libxml / HTMLparser.h
HTMLparser.h
  1  /*
  2   * Summary: interface for an HTML 4.0 non-verifying parser
  3   * Description: this module implements an HTML 4.0 non-verifying parser
  4   *              with API compatible with the XML parser ones. It should
  5   *              be able to parse "real world" HTML, even if severely
  6   *              broken from a specification point of view.
  7   *
  8   * Copy: See Copyright for the status of this software.
  9   *
 10   * Author: Daniel Veillard
 11   */
 12  
 13  #ifndef __HTML_PARSER_H__
 14  #define __HTML_PARSER_H__
 15  #include <libxml/xmlversion.h>
 16  #include <libxml/parser.h>
 17  
 18  #ifdef LIBXML_HTML_ENABLED
 19  
 20  #ifdef __cplusplus
 21  extern "C" {
 22  #endif
 23  
 24  /*
 25   * Most of the back-end structures from XML and HTML are shared.
 26   */
 27  typedef xmlParserCtxt htmlParserCtxt;
 28  typedef xmlParserCtxtPtr htmlParserCtxtPtr;
 29  typedef xmlParserNodeInfo htmlParserNodeInfo;
 30  typedef xmlSAXHandler htmlSAXHandler;
 31  typedef xmlSAXHandlerPtr htmlSAXHandlerPtr;
 32  typedef xmlParserInput htmlParserInput;
 33  typedef xmlParserInputPtr htmlParserInputPtr;
 34  typedef xmlDocPtr htmlDocPtr;
 35  typedef xmlNodePtr htmlNodePtr;
 36  
 37  /*
 38   * Internal description of an HTML element, representing HTML 4.01
 39   * and XHTML 1.0 (which share the same structure).
 40   */
 41  typedef struct _htmlElemDesc htmlElemDesc;
 42  typedef htmlElemDesc *htmlElemDescPtr;
 43  struct _htmlElemDesc {
 44      const char *name;	/* The tag name */
 45      char startTag;      /* Whether the start tag can be implied */
 46      char endTag;        /* Whether the end tag can be implied */
 47      char saveEndTag;    /* Whether the end tag should be saved */
 48      char empty;         /* Is this an empty element ? */
 49      char depr;          /* Is this a deprecated element ? */
 50      char dtd;           /* 1: only in Loose DTD, 2: only Frameset one */
 51      char isinline;      /* is this a block 0 or inline 1 element */
 52      const char *desc;   /* the description */
 53  
 54  /* NRK Jan.2003
 55   * New fields encapsulating HTML structure
 56   *
 57   * Bugs:
 58   *	This is a very limited representation.  It fails to tell us when
 59   *	an element *requires* subelements (we only have whether they're
 60   *	allowed or not), and it doesn't tell us where CDATA and PCDATA
 61   *	are allowed.  Some element relationships are not fully represented:
 62   *	these are flagged with the word MODIFIER
 63   */
 64      const char** subelts;		/* allowed sub-elements of this element */
 65      const char* defaultsubelt;	/* subelement for suggested auto-repair
 66  					   if necessary or NULL */
 67      const char** attrs_opt;		/* Optional Attributes */
 68      const char** attrs_depr;		/* Additional deprecated attributes */
 69      const char** attrs_req;		/* Required attributes */
 70  };
 71  
 72  /*
 73   * Internal description of an HTML entity.
 74   */
 75  typedef struct _htmlEntityDesc htmlEntityDesc;
 76  typedef htmlEntityDesc *htmlEntityDescPtr;
 77  struct _htmlEntityDesc {
 78      unsigned int value;	/* the UNICODE value for the character */
 79      const char *name;	/* The entity name */
 80      const char *desc;   /* the description */
 81  };
 82  
 83  /*
 84   * There is only few public functions.
 85   */
 86  XMLPUBFUN const htmlElemDesc * XMLCALL
 87  			htmlTagLookup	(const xmlChar *tag);
 88  XMLPUBFUN const htmlEntityDesc * XMLCALL
 89  			htmlEntityLookup(const xmlChar *name);
 90  XMLPUBFUN const htmlEntityDesc * XMLCALL
 91  			htmlEntityValueLookup(unsigned int value);
 92  
 93  XMLPUBFUN int XMLCALL
 94  			htmlIsAutoClosed(htmlDocPtr doc,
 95  					 htmlNodePtr elem);
 96  XMLPUBFUN int XMLCALL
 97  			htmlAutoCloseTag(htmlDocPtr doc,
 98  					 const xmlChar *name,
 99  					 htmlNodePtr elem);
100  XMLPUBFUN const htmlEntityDesc * XMLCALL
101  			htmlParseEntityRef(htmlParserCtxtPtr ctxt,
102  					 const xmlChar **str);
103  XMLPUBFUN int XMLCALL
104  			htmlParseCharRef(htmlParserCtxtPtr ctxt);
105  XMLPUBFUN void XMLCALL
106  			htmlParseElement(htmlParserCtxtPtr ctxt);
107  
108  XMLPUBFUN htmlParserCtxtPtr XMLCALL
109  			htmlNewParserCtxt(void);
110  
111  XMLPUBFUN htmlParserCtxtPtr XMLCALL
112  			htmlCreateMemoryParserCtxt(const char *buffer,
113  						   int size);
114  
115  XMLPUBFUN int XMLCALL
116  			htmlParseDocument(htmlParserCtxtPtr ctxt);
117  XMLPUBFUN htmlDocPtr XMLCALL
118  			htmlSAXParseDoc	(const xmlChar *cur,
119  					 const char *encoding,
120  					 htmlSAXHandlerPtr sax,
121  					 void *userData);
122  XMLPUBFUN htmlDocPtr XMLCALL
123  			htmlParseDoc	(const xmlChar *cur,
124  					 const char *encoding);
125  XMLPUBFUN htmlDocPtr XMLCALL
126  			htmlSAXParseFile(const char *filename,
127  					 const char *encoding,
128  					 htmlSAXHandlerPtr sax,
129  					 void *userData);
130  XMLPUBFUN htmlDocPtr XMLCALL
131  			htmlParseFile	(const char *filename,
132  					 const char *encoding);
133  XMLPUBFUN int XMLCALL
134  			UTF8ToHtml	(unsigned char *out,
135  					 int *outlen,
136  					 const unsigned char *in,
137  					 int *inlen);
138  XMLPUBFUN int XMLCALL
139  			htmlEncodeEntities(unsigned char *out,
140  					 int *outlen,
141  					 const unsigned char *in,
142  					 int *inlen, int quoteChar);
143  XMLPUBFUN int XMLCALL
144  			htmlIsScriptAttribute(const xmlChar *name);
145  XMLPUBFUN int XMLCALL
146  			htmlHandleOmittedElem(int val);
147  
#ifdef LIBXML_PUSH_ENABLED
/**
 * Push-mode interfaces, only declared when LIBXML_PUSH_ENABLED is
 * defined: create a push context, then feed it data chunk by chunk.
 */
XMLPUBFUN htmlParserCtxtPtr XMLCALL
    htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax,
                             void *user_data,
                             const char *chunk,
                             int size,
                             const char *filename,
                             xmlCharEncoding enc);
XMLPUBFUN int XMLCALL
    htmlParseChunk(htmlParserCtxtPtr ctxt,
                   const char *chunk,
                   int size,
                   int terminate);
#endif /* LIBXML_PUSH_ENABLED */
165  
166  XMLPUBFUN void XMLCALL
167  			htmlFreeParserCtxt	(htmlParserCtxtPtr ctxt);
168  
169  /*
170   * New set of simpler/more flexible APIs
171   */
/**
 * htmlParserOption:
 *
 * This is the set of HTML parser options that can be passed down
 * to the htmlReadDoc() and similar calls.
 *
 * The values are bit flags and may be OR-ed together into the
 * "int options" argument of those calls.
 * NOTE(review): the bit positions are not contiguous; presumably they
 * are kept aligned with the matching xmlParserOption flags - confirm
 * against <libxml/parser.h> before adding new values.
 */
typedef enum {
    HTML_PARSE_RECOVER    = 1 << 0,  /* Relaxed parsing */
    HTML_PARSE_NODEFDTD   = 1 << 2,  /* do not default a doctype if not found */
    HTML_PARSE_NOERROR    = 1 << 5,  /* suppress error reports */
    HTML_PARSE_NOWARNING  = 1 << 6,  /* suppress warning reports */
    HTML_PARSE_PEDANTIC   = 1 << 7,  /* pedantic error reporting */
    HTML_PARSE_NOBLANKS   = 1 << 8,  /* remove blank nodes */
    HTML_PARSE_NONET      = 1 << 11, /* Forbid network access */
    HTML_PARSE_NOIMPLIED  = 1 << 13, /* Do not add implied html/body... elements */
    HTML_PARSE_COMPACT    = 1 << 16, /* compact small text nodes */
    HTML_PARSE_IGNORE_ENC = 1 << 21  /* ignore internal document encoding hint */
} htmlParserOption;
190  
191  XMLPUBFUN void XMLCALL
192  		htmlCtxtReset		(htmlParserCtxtPtr ctxt);
193  XMLPUBFUN int XMLCALL
194  		htmlCtxtUseOptions	(htmlParserCtxtPtr ctxt,
195  					 int options);
196  XMLPUBFUN htmlDocPtr XMLCALL
197  		htmlReadDoc		(const xmlChar *cur,
198  					 const char *URL,
199  					 const char *encoding,
200  					 int options);
201  XMLPUBFUN htmlDocPtr XMLCALL
202  		htmlReadFile		(const char *URL,
203  					 const char *encoding,
204  					 int options);
205  XMLPUBFUN htmlDocPtr XMLCALL
206  		htmlReadMemory		(const char *buffer,
207  					 int size,
208  					 const char *URL,
209  					 const char *encoding,
210  					 int options);
211  XMLPUBFUN htmlDocPtr XMLCALL
212  		htmlReadFd		(int fd,
213  					 const char *URL,
214  					 const char *encoding,
215  					 int options);
216  XMLPUBFUN htmlDocPtr XMLCALL
217  		htmlReadIO		(xmlInputReadCallback ioread,
218  					 xmlInputCloseCallback ioclose,
219  					 void *ioctx,
220  					 const char *URL,
221  					 const char *encoding,
222  					 int options);
223  XMLPUBFUN htmlDocPtr XMLCALL
224  		htmlCtxtReadDoc		(xmlParserCtxtPtr ctxt,
225  					 const xmlChar *cur,
226  					 const char *URL,
227  					 const char *encoding,
228  					 int options);
229  XMLPUBFUN htmlDocPtr XMLCALL
230  		htmlCtxtReadFile		(xmlParserCtxtPtr ctxt,
231  					 const char *filename,
232  					 const char *encoding,
233  					 int options);
234  XMLPUBFUN htmlDocPtr XMLCALL
235  		htmlCtxtReadMemory		(xmlParserCtxtPtr ctxt,
236  					 const char *buffer,
237  					 int size,
238  					 const char *URL,
239  					 const char *encoding,
240  					 int options);
241  XMLPUBFUN htmlDocPtr XMLCALL
242  		htmlCtxtReadFd		(xmlParserCtxtPtr ctxt,
243  					 int fd,
244  					 const char *URL,
245  					 const char *encoding,
246  					 int options);
247  XMLPUBFUN htmlDocPtr XMLCALL
248  		htmlCtxtReadIO		(xmlParserCtxtPtr ctxt,
249  					 xmlInputReadCallback ioread,
250  					 xmlInputCloseCallback ioclose,
251  					 void *ioctx,
252  					 const char *URL,
253  					 const char *encoding,
254  					 int options);
255  
/*
 * NRK/Jan2003: further knowledge of HTML structure.
 *
 * Status codes are bit patterns: HTML_REQUIRED (0xc) deliberately
 * includes the HTML_VALID bit (0x4), so testing (status & HTML_VALID)
 * is true for both valid and required items.
 */
typedef enum {
    HTML_NA         = 0,    /* something we don't check at all */
    HTML_INVALID    = 0x1,
    HTML_DEPRECATED = 0x2,
    HTML_VALID      = 0x4,
    HTML_REQUIRED   = 0xc   /* VALID bit set so ( & HTML_VALID ) is TRUE */
} htmlStatus;
265  
266  /* Using htmlElemDesc rather than name here, to emphasise the fact
267     that otherwise there's a lookup overhead
268  */
269  XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ;
270  XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ;
271  XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ;
272  XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ;
/**
 * htmlDefaultSubelement:
 * @elt: HTML element
 *
 * Returns the default subelement for this element
 *
 * The argument is parenthesized so that any pointer expression
 * (e.g. "descs + 1") can be passed, not just a plain identifier.
 */
#define htmlDefaultSubelement(elt) ((elt)->defaultsubelt)
/**
 * htmlElementAllowedHereDesc:
 * @parent: HTML parent element
 * @elt: HTML element
 *
 * Convenience wrapper around htmlElementAllowedHere() that takes the
 * child as an element description and passes its name on.
 *
 * Returns 1 if @elt may be a direct child of @parent; 0 otherwise.
 */
#define htmlElementAllowedHereDesc(parent, elt) \
    htmlElementAllowedHere((parent), (elt)->name)
/**
 * htmlRequiredAttrs:
 * @elt: HTML element
 *
 * Returns the required attributes (the attrs_req field) of the
 * specified element.
 */
#define htmlRequiredAttrs(elt) ((elt)->attrs_req)
299  
300  
301  #ifdef __cplusplus
302  }
303  #endif
304  
305  #endif /* LIBXML_HTML_ENABLED */
306  #endif /* __HTML_PARSER_H__ */