HTMLparser.h
1 /* 2 * Summary: interface for an HTML 4.0 non-verifying parser 3 * Description: this module implements an HTML 4.0 non-verifying parser 4 * with API compatible with the XML parser ones. It should 5 * be able to parse "real world" HTML, even if severely 6 * broken from a specification point of view. 7 * 8 * Copy: See Copyright for the status of this software. 9 * 10 * Author: Daniel Veillard 11 */ 12 13 #ifndef __HTML_PARSER_H__ 14 #define __HTML_PARSER_H__ 15 #include <libxml/xmlversion.h> 16 #include <libxml/encoding.h> 17 #include <libxml/parser.h> 18 #include <libxml/xmlIO.h> 19 #include <libxml/xmlstring.h> 20 21 #ifdef LIBXML_HTML_ENABLED 22 23 #ifdef __cplusplus 24 extern "C" { 25 #endif 26 27 /* 28 * Most of the back-end structures from XML and HTML are shared. 29 */ 30 typedef xmlParserCtxt htmlParserCtxt; 31 typedef xmlParserCtxtPtr htmlParserCtxtPtr; 32 typedef xmlParserNodeInfo htmlParserNodeInfo; 33 typedef xmlSAXHandler htmlSAXHandler; 34 typedef xmlSAXHandlerPtr htmlSAXHandlerPtr; 35 typedef xmlParserInput htmlParserInput; 36 typedef xmlParserInputPtr htmlParserInputPtr; 37 typedef xmlDocPtr htmlDocPtr; 38 typedef xmlNodePtr htmlNodePtr; 39 40 /* 41 * Internal description of an HTML element, representing HTML 4.01 42 * and XHTML 1.0 (which share the same structure). 43 */ 44 typedef struct _htmlElemDesc htmlElemDesc; 45 typedef htmlElemDesc *htmlElemDescPtr; 46 struct _htmlElemDesc { 47 const char *name; /* The tag name */ 48 char startTag; /* Whether the start tag can be implied */ 49 char endTag; /* Whether the end tag can be implied */ 50 char saveEndTag; /* Whether the end tag should be saved */ 51 char empty; /* Is this an empty element ? */ 52 char depr; /* Is this a deprecated element ? */ 53 char dtd; /* 1: only in Loose DTD, 2: only Frameset one */ 54 char isinline; /* is this a block 0 or inline 1 element */ 55 const char *desc; /* the description */ 56 57 /* NRK Jan.2003 58 * New fields encapsulating HTML structure 59 * 60 * Bugs: 61 * This is a very limited representation. It fails to tell us when 62 * an element *requires* subelements (we only have whether they're 63 * allowed or not), and it doesn't tell us where CDATA and PCDATA 64 * are allowed. Some element relationships are not fully represented: 65 * these are flagged with the word MODIFIER 66 */ 67 const char** subelts; /* allowed sub-elements of this element */ 68 const char* defaultsubelt; /* subelement for suggested auto-repair 69 if necessary or NULL */ 70 const char** attrs_opt; /* Optional Attributes */ 71 const char** attrs_depr; /* Additional deprecated attributes */ 72 const char** attrs_req; /* Required attributes */ 73 }; 74 75 /* 76 * Internal description of an HTML entity. 77 */ 78 typedef struct _htmlEntityDesc htmlEntityDesc; 79 typedef htmlEntityDesc *htmlEntityDescPtr; 80 struct _htmlEntityDesc { 81 unsigned int value; /* the UNICODE value for the character */ 82 const char *name; /* The entity name */ 83 const char *desc; /* the description */ 84 }; 85 86 /* 87 * There is only few public functions. 88 */ 89 XMLPUBFUN const htmlElemDesc * XMLCALL 90 htmlTagLookup (const xmlChar *tag); 91 XMLPUBFUN const htmlEntityDesc * XMLCALL 92 htmlEntityLookup(const xmlChar *name); 93 XMLPUBFUN const htmlEntityDesc * XMLCALL 94 htmlEntityValueLookup(unsigned int value); 95 96 XMLPUBFUN int XMLCALL 97 htmlIsAutoClosed(htmlDocPtr doc, 98 htmlNodePtr elem); 99 XMLPUBFUN int XMLCALL 100 htmlAutoCloseTag(htmlDocPtr doc, 101 const xmlChar *name, 102 htmlNodePtr elem); 103 XMLPUBFUN const htmlEntityDesc * XMLCALL 104 htmlParseEntityRef(htmlParserCtxtPtr ctxt, 105 const xmlChar **str); 106 XMLPUBFUN int XMLCALL 107 htmlParseCharRef(htmlParserCtxtPtr ctxt); 108 XMLPUBFUN void XMLCALL 109 htmlParseElement(htmlParserCtxtPtr ctxt); 110 111 XMLPUBFUN htmlParserCtxtPtr XMLCALL 112 htmlNewParserCtxt(void); 113 114 XMLPUBFUN htmlParserCtxtPtr XMLCALL 115 htmlCreateMemoryParserCtxt(const char *buffer, 116 int size); 117 118 XMLPUBFUN int XMLCALL 119 htmlParseDocument(htmlParserCtxtPtr ctxt); 120 XMLPUBFUN htmlDocPtr XMLCALL 121 htmlSAXParseDoc (const xmlChar *cur, 122 const char *encoding, 123 htmlSAXHandlerPtr sax, 124 void *userData); 125 XMLPUBFUN htmlDocPtr XMLCALL 126 htmlParseDoc (const xmlChar *cur, 127 const char *encoding); 128 XMLPUBFUN htmlDocPtr XMLCALL 129 htmlSAXParseFile(const char *filename, 130 const char *encoding, 131 htmlSAXHandlerPtr sax, 132 void *userData); 133 XMLPUBFUN htmlDocPtr XMLCALL 134 htmlParseFile (const char *filename, 135 const char *encoding); 136 XMLPUBFUN int XMLCALL 137 UTF8ToHtml (unsigned char *out, 138 int *outlen, 139 const unsigned char *in, 140 int *inlen); 141 XMLPUBFUN int XMLCALL 142 htmlEncodeEntities(unsigned char *out, 143 int *outlen, 144 const unsigned char *in, 145 int *inlen, int quoteChar); 146 XMLPUBFUN int XMLCALL 147 htmlIsScriptAttribute(const xmlChar *name); 148 XMLPUBFUN int XMLCALL 149 htmlHandleOmittedElem(int val); 150 151 #ifdef LIBXML_PUSH_ENABLED 152 /** 153 * Interfaces for the Push mode. 154 */ 155 XMLPUBFUN htmlParserCtxtPtr XMLCALL 156 htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, 157 void *user_data, 158 const char *chunk, 159 int size, 160 const char *filename, 161 xmlCharEncoding enc); 162 XMLPUBFUN int XMLCALL 163 htmlParseChunk (htmlParserCtxtPtr ctxt, 164 const char *chunk, 165 int size, 166 int terminate); 167 #endif /* LIBXML_PUSH_ENABLED */ 168 169 XMLPUBFUN void XMLCALL 170 htmlFreeParserCtxt (htmlParserCtxtPtr ctxt); 171 172 /* 173 * New set of simpler/more flexible APIs 174 */ 175 /** 176 * xmlParserOption: 177 * 178 * This is the set of XML parser options that can be passed down 179 * to the xmlReadDoc() and similar calls. 180 */ 181 typedef enum { 182 HTML_PARSE_RECOVER = 1<<0, /* Relaxed parsing */ 183 HTML_PARSE_NODEFDTD = 1<<2, /* do not default a doctype if not found */ 184 HTML_PARSE_NOERROR = 1<<5, /* suppress error reports */ 185 HTML_PARSE_NOWARNING= 1<<6, /* suppress warning reports */ 186 HTML_PARSE_PEDANTIC = 1<<7, /* pedantic error reporting */ 187 HTML_PARSE_NOBLANKS = 1<<8, /* remove blank nodes */ 188 HTML_PARSE_NONET = 1<<11,/* Forbid network access */ 189 HTML_PARSE_NOIMPLIED= 1<<13,/* Do not add implied html/body... elements */ 190 HTML_PARSE_COMPACT = 1<<16,/* compact small text nodes */ 191 HTML_PARSE_IGNORE_ENC=1<<21 /* ignore internal document encoding hint */ 192 } htmlParserOption; 193 194 XMLPUBFUN void XMLCALL 195 htmlCtxtReset (htmlParserCtxtPtr ctxt); 196 XMLPUBFUN int XMLCALL 197 htmlCtxtUseOptions (htmlParserCtxtPtr ctxt, 198 int options); 199 XMLPUBFUN htmlDocPtr XMLCALL 200 htmlReadDoc (const xmlChar *cur, 201 const char *URL, 202 const char *encoding, 203 int options); 204 XMLPUBFUN htmlDocPtr XMLCALL 205 htmlReadFile (const char *URL, 206 const char *encoding, 207 int options); 208 XMLPUBFUN htmlDocPtr XMLCALL 209 htmlReadMemory (const char *buffer, 210 int size, 211 const char *URL, 212 const char *encoding, 213 int options); 214 XMLPUBFUN htmlDocPtr XMLCALL 215 htmlReadFd (int fd, 216 const char *URL, 217 const char *encoding, 218 int options); 219 XMLPUBFUN htmlDocPtr XMLCALL 220 htmlReadIO (xmlInputReadCallback ioread, 221 xmlInputCloseCallback ioclose, 222 void *ioctx, 223 const char *URL, 224 const char *encoding, 225 int options); 226 XMLPUBFUN htmlDocPtr XMLCALL 227 htmlCtxtReadDoc (xmlParserCtxtPtr ctxt, 228 const xmlChar *cur, 229 const char *URL, 230 const char *encoding, 231 int options); 232 XMLPUBFUN htmlDocPtr XMLCALL 233 htmlCtxtReadFile (xmlParserCtxtPtr ctxt, 234 const char *filename, 235 const char *encoding, 236 int options); 237 XMLPUBFUN htmlDocPtr XMLCALL 238 htmlCtxtReadMemory (xmlParserCtxtPtr ctxt, 239 const char *buffer, 240 int size, 241 const char *URL, 242 const char *encoding, 243 int options); 244 XMLPUBFUN htmlDocPtr XMLCALL 245 htmlCtxtReadFd (xmlParserCtxtPtr ctxt, 246 int fd, 247 const char *URL, 248 const char *encoding, 249 int options); 250 XMLPUBFUN htmlDocPtr XMLCALL 251 htmlCtxtReadIO (xmlParserCtxtPtr ctxt, 252 xmlInputReadCallback ioread, 253 xmlInputCloseCallback ioclose, 254 void *ioctx, 255 const char *URL, 256 const char *encoding, 257 int options); 258 259 /* NRK/Jan2003: further knowledge of HTML structure 260 */ 261 typedef enum { 262 HTML_NA = 0 , /* something we don't check at all */ 263 HTML_INVALID = 0x1 , 264 HTML_DEPRECATED = 0x2 , 265 HTML_VALID = 0x4 , 266 HTML_REQUIRED = 0xc /* VALID bit set so ( & HTML_VALID ) is TRUE */ 267 } htmlStatus ; 268 269 /* Using htmlElemDesc rather than name here, to emphasise the fact 270 that otherwise there's a lookup overhead 271 */ 272 XMLPUBFUN htmlStatus XMLCALL htmlAttrAllowed(const htmlElemDesc*, const xmlChar*, int) ; 273 XMLPUBFUN int XMLCALL htmlElementAllowedHere(const htmlElemDesc*, const xmlChar*) ; 274 XMLPUBFUN htmlStatus XMLCALL htmlElementStatusHere(const htmlElemDesc*, const htmlElemDesc*) ; 275 XMLPUBFUN htmlStatus XMLCALL htmlNodeStatus(const htmlNodePtr, int) ; 276 /** 277 * htmlDefaultSubelement: 278 * @elt: HTML element 279 * 280 * Returns the default subelement for this element 281 */ 282 #define htmlDefaultSubelement(elt) elt->defaultsubelt 283 /** 284 * htmlElementAllowedHereDesc: 285 * @parent: HTML parent element 286 * @elt: HTML element 287 * 288 * Checks whether an HTML element description may be a 289 * direct child of the specified element. 290 * 291 * Returns 1 if allowed; 0 otherwise. 292 */ 293 #define htmlElementAllowedHereDesc(parent,elt) \ 294 htmlElementAllowedHere((parent), (elt)->name) 295 /** 296 * htmlRequiredAttrs: 297 * @elt: HTML element 298 * 299 * Returns the attributes required for the specified element. 300 */ 301 #define htmlRequiredAttrs(elt) (elt)->attrs_req 302 303 304 #ifdef __cplusplus 305 } 306 #endif 307 308 #endif /* LIBXML_HTML_ENABLED */ 309 #endif /* __HTML_PARSER_H__ */