HTMLtree.c
1 /* 2 * HTMLtree.c : implementation of access function for an HTML tree. 3 * 4 * See Copyright for the status of this software. 5 * 6 * daniel@veillard.com 7 */ 8 9 10 #define IN_LIBXML 11 #include "libxml.h" 12 #ifdef LIBXML_HTML_ENABLED 13 14 #include <string.h> /* for memset() only ! */ 15 16 #ifdef HAVE_CTYPE_H 17 #include <ctype.h> 18 #endif 19 #ifdef HAVE_STDLIB_H 20 #include <stdlib.h> 21 #endif 22 23 #include <libxml/xmlmemory.h> 24 #include <libxml/HTMLparser.h> 25 #include <libxml/HTMLtree.h> 26 #include <libxml/entities.h> 27 #include <libxml/valid.h> 28 #include <libxml/xmlerror.h> 29 #include <libxml/parserInternals.h> 30 #include <libxml/globals.h> 31 #include <libxml/uri.h> 32 33 #include "buf.h" 34 35 /************************************************************************ 36 * * 37 * Getting/Setting encoding meta tags * 38 * * 39 ************************************************************************/ 40 41 /** 42 * htmlGetMetaEncoding: 43 * @doc: the document 44 * 45 * Encoding definition lookup in the Meta tags 46 * 47 * Returns the current encoding as flagged in the HTML source 48 */ 49 const xmlChar * 50 htmlGetMetaEncoding(htmlDocPtr doc) { 51 htmlNodePtr cur; 52 const xmlChar *content; 53 const xmlChar *encoding; 54 55 if (doc == NULL) 56 return(NULL); 57 cur = doc->children; 58 59 /* 60 * Search the html 61 */ 62 while (cur != NULL) { 63 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 64 if (xmlStrEqual(cur->name, BAD_CAST"html")) 65 break; 66 if (xmlStrEqual(cur->name, BAD_CAST"head")) 67 goto found_head; 68 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 69 goto found_meta; 70 } 71 cur = cur->next; 72 } 73 if (cur == NULL) 74 return(NULL); 75 cur = cur->children; 76 77 /* 78 * Search the head 79 */ 80 while (cur != NULL) { 81 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 82 if (xmlStrEqual(cur->name, BAD_CAST"head")) 83 break; 84 if (xmlStrEqual(cur->name, BAD_CAST"meta")) 85 goto found_meta; 86 } 87 cur = cur->next; 88 } 89 if (cur == NULL) 90 return(NULL); 91 found_head: 92 cur = cur->children; 93 94 /* 95 * Search the meta elements 96 */ 97 found_meta: 98 while (cur != NULL) { 99 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 100 if (xmlStrEqual(cur->name, BAD_CAST"meta")) { 101 xmlAttrPtr attr = cur->properties; 102 int http; 103 const xmlChar *value; 104 105 content = NULL; 106 http = 0; 107 while (attr != NULL) { 108 if ((attr->children != NULL) && 109 (attr->children->type == XML_TEXT_NODE) && 110 (attr->children->next == NULL)) { 111 value = attr->children->content; 112 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 113 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 114 http = 1; 115 else if ((value != NULL) 116 && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 117 content = value; 118 if ((http != 0) && (content != NULL)) 119 goto found_content; 120 } 121 attr = attr->next; 122 } 123 } 124 } 125 cur = cur->next; 126 } 127 return(NULL); 128 129 found_content: 130 encoding = xmlStrstr(content, BAD_CAST"charset="); 131 if (encoding == NULL) 132 encoding = xmlStrstr(content, BAD_CAST"Charset="); 133 if (encoding == NULL) 134 encoding = xmlStrstr(content, BAD_CAST"CHARSET="); 135 if (encoding != NULL) { 136 encoding += 8; 137 } else { 138 encoding = xmlStrstr(content, BAD_CAST"charset ="); 139 if (encoding == NULL) 140 encoding = xmlStrstr(content, BAD_CAST"Charset ="); 141 if (encoding == NULL) 142 encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); 143 if (encoding != NULL) 144 encoding += 9; 145 } 146 if (encoding != NULL) { 147 while ((*encoding == ' ') || (*encoding == '\t')) encoding++; 148 } 149 return(encoding); 150 } 151 152 /** 153 * htmlSetMetaEncoding: 154 * @doc: the document 155 * @encoding: the encoding string 156 * 157 * Sets the current encoding in the Meta tags 158 * NOTE: this will not change the document content encoding, just 159 * the META flag associated. 160 * 161 * Returns 0 in case of success and -1 in case of error 162 */ 163 int 164 htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { 165 htmlNodePtr cur, meta = NULL, head = NULL; 166 const xmlChar *content = NULL; 167 char newcontent[100]; 168 169 newcontent[0] = 0; 170 171 if (doc == NULL) 172 return(-1); 173 174 /* html isn't a real encoding it's just libxml2 way to get entities */ 175 if (!xmlStrcasecmp(encoding, BAD_CAST "html")) 176 return(-1); 177 178 if (encoding != NULL) { 179 snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", 180 (char *)encoding); 181 newcontent[sizeof(newcontent) - 1] = 0; 182 } 183 184 cur = doc->children; 185 186 /* 187 * Search the html 188 */ 189 while (cur != NULL) { 190 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 191 if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) 192 break; 193 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 194 goto found_head; 195 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) 196 goto found_meta; 197 } 198 cur = cur->next; 199 } 200 if (cur == NULL) 201 return(-1); 202 cur = cur->children; 203 204 /* 205 * Search the head 206 */ 207 while (cur != NULL) { 208 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 209 if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) 210 break; 211 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 212 head = cur->parent; 213 goto found_meta; 214 } 215 } 216 cur = cur->next; 217 } 218 if (cur == NULL) 219 return(-1); 220 found_head: 221 head = cur; 222 if (cur->children == NULL) 223 goto create; 224 cur = cur->children; 225 226 found_meta: 227 /* 228 * Search and update all the remaining the meta elements carrying 229 * encoding informations 230 */ 231 while (cur != NULL) { 232 if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { 233 if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { 234 xmlAttrPtr attr = cur->properties; 235 int http; 236 const xmlChar *value; 237 238 content = NULL; 239 http = 0; 240 while (attr != NULL) { 241 if ((attr->children != NULL) && 242 (attr->children->type == XML_TEXT_NODE) && 243 (attr->children->next == NULL)) { 244 value = attr->children->content; 245 if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) 246 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) 247 http = 1; 248 else 249 { 250 if ((value != NULL) && 251 (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) 252 content = value; 253 } 254 if ((http != 0) && (content != NULL)) 255 break; 256 } 257 attr = attr->next; 258 } 259 if ((http != 0) && (content != NULL)) { 260 meta = cur; 261 break; 262 } 263 264 } 265 } 266 cur = cur->next; 267 } 268 create: 269 if (meta == NULL) { 270 if ((encoding != NULL) && (head != NULL)) { 271 /* 272 * Create a new Meta element with the right attributes 273 */ 274 275 meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); 276 if (head->children == NULL) 277 xmlAddChild(head, meta); 278 else 279 xmlAddPrevSibling(head->children, meta); 280 xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); 281 xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); 282 } 283 } else { 284 /* remove the meta tag if NULL is passed */ 285 if (encoding == NULL) { 286 xmlUnlinkNode(meta); 287 xmlFreeNode(meta); 288 } 289 /* change the document only if there is a real encoding change */ 290 else if (xmlStrcasestr(content, encoding) == NULL) { 291 xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent); 292 } 293 } 294 295 296 return(0); 297 } 298 299 /** 300 * booleanHTMLAttrs: 301 * 302 * These are the HTML attributes which will be output 303 * in minimized form, i.e. <option selected="selected"> will be 304 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" 305 * 306 */ 307 static const char* const htmlBooleanAttrs[] = { 308 "checked", "compact", "declare", "defer", "disabled", "ismap", 309 "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", 310 "selected", NULL 311 }; 312 313 314 /** 315 * htmlIsBooleanAttr: 316 * @name: the name of the attribute to check 317 * 318 * Determine if a given attribute is a boolean attribute. 319 * 320 * returns: false if the attribute is not boolean, true otherwise. 321 */ 322 int 323 htmlIsBooleanAttr(const xmlChar *name) 324 { 325 int i = 0; 326 327 while (htmlBooleanAttrs[i] != NULL) { 328 if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) 329 return 1; 330 i++; 331 } 332 return 0; 333 } 334 335 #ifdef LIBXML_OUTPUT_ENABLED 336 /* 337 * private routine exported from xmlIO.c 338 */ 339 xmlOutputBufferPtr 340 xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder); 341 /************************************************************************ 342 * * 343 * Output error handlers * 344 * * 345 ************************************************************************/ 346 /** 347 * htmlSaveErrMemory: 348 * @extra: extra informations 349 * 350 * Handle an out of memory condition 351 */ 352 static void 353 htmlSaveErrMemory(const char *extra) 354 { 355 __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); 356 } 357 358 /** 359 * htmlSaveErr: 360 * @code: the error number 361 * @node: the location of the error. 362 * @extra: extra informations 363 * 364 * Handle an out of memory condition 365 */ 366 static void 367 htmlSaveErr(int code, xmlNodePtr node, const char *extra) 368 { 369 const char *msg = NULL; 370 371 switch(code) { 372 case XML_SAVE_NOT_UTF8: 373 msg = "string is not in UTF-8\n"; 374 break; 375 case XML_SAVE_CHAR_INVALID: 376 msg = "invalid character value\n"; 377 break; 378 case XML_SAVE_UNKNOWN_ENCODING: 379 msg = "unknown encoding %s\n"; 380 break; 381 case XML_SAVE_NO_DOCTYPE: 382 msg = "HTML has no DOCTYPE\n"; 383 break; 384 default: 385 msg = "unexpected error number\n"; 386 } 387 #pragma clang diagnostic push 388 #pragma clang diagnostic ignored "-Wformat-nonliteral" 389 __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); 390 #pragma clang diagnostic pop 391 } 392 393 /************************************************************************ 394 * * 395 * Dumping HTML tree content to a simple buffer * 396 * * 397 ************************************************************************/ 398 399 /** 400 * htmlBufNodeDumpFormat: 401 * @buf: the xmlBufPtr output 402 * @doc: the document 403 * @cur: the current node 404 * @format: should formatting spaces been added 405 * 406 * Dump an HTML node, recursive behaviour,children are printed too. 407 * 408 * Returns the number of byte written or -1 in case of error 409 */ 410 static size_t 411 htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur, 412 int format) { 413 size_t use; 414 int ret; 415 xmlOutputBufferPtr outbuf; 416 417 if (cur == NULL) { 418 return (-1); 419 } 420 if (buf == NULL) { 421 return (-1); 422 } 423 outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); 424 if (outbuf == NULL) { 425 htmlSaveErrMemory("allocating HTML output buffer"); 426 return (-1); 427 } 428 memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); 429 outbuf->buffer = buf; 430 outbuf->encoder = NULL; 431 outbuf->writecallback = NULL; 432 outbuf->closecallback = NULL; 433 outbuf->context = NULL; 434 outbuf->written = 0; 435 436 use = xmlBufUse(buf); 437 htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); 438 xmlFree(outbuf); 439 ret = xmlBufUse(buf) - use; 440 return (ret); 441 } 442 443 /** 444 * htmlNodeDump: 445 * @buf: the HTML buffer output 446 * @doc: the document 447 * @cur: the current node 448 * 449 * Dump an HTML node, recursive behaviour,children are printed too, 450 * and formatting returns are added. 451 * 452 * Returns the number of byte written or -1 in case of error 453 */ 454 int 455 htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { 456 xmlBufPtr buffer; 457 size_t ret; 458 459 if ((buf == NULL) || (cur == NULL)) 460 return(-1); 461 462 xmlInitParser(); 463 buffer = xmlBufFromBuffer(buf); 464 if (buffer == NULL) 465 return(-1); 466 467 ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1); 468 469 xmlBufBackToBuffer(buffer); 470 471 if (ret > INT_MAX) 472 return(-1); 473 return((int) ret); 474 } 475 476 /** 477 * htmlNodeDumpFileFormat: 478 * @out: the FILE pointer 479 * @doc: the document 480 * @cur: the current node 481 * @encoding: the document encoding 482 * @format: should formatting spaces been added 483 * 484 * Dump an HTML node, recursive behaviour,children are printed too. 485 * 486 * TODO: if encoding == NULL try to save in the doc encoding 487 * 488 * returns: the number of byte written or -1 in case of failure. 489 */ 490 int 491 htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, 492 xmlNodePtr cur, const char *encoding, int format) { 493 xmlOutputBufferPtr buf; 494 xmlCharEncodingHandlerPtr handler = NULL; 495 int ret; 496 497 xmlInitParser(); 498 499 if (encoding != NULL) { 500 xmlCharEncoding enc; 501 502 enc = xmlParseCharEncoding(encoding); 503 if (enc != XML_CHAR_ENCODING_UTF8) { 504 handler = xmlFindCharEncodingHandler(encoding); 505 if (handler == NULL) 506 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 507 } 508 } 509 510 /* 511 * Fallback to HTML or ASCII when the encoding is unspecified 512 */ 513 if (handler == NULL) 514 handler = xmlFindCharEncodingHandler("HTML"); 515 if (handler == NULL) 516 handler = xmlFindCharEncodingHandler("ascii"); 517 518 /* 519 * save the content to a temp buffer. 520 */ 521 buf = xmlOutputBufferCreateFile(out, handler); 522 if (buf == NULL) return(0); 523 524 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 525 526 ret = xmlOutputBufferClose(buf); 527 return(ret); 528 } 529 530 /** 531 * htmlNodeDumpFile: 532 * @out: the FILE pointer 533 * @doc: the document 534 * @cur: the current node 535 * 536 * Dump an HTML node, recursive behaviour,children are printed too, 537 * and formatting returns are added. 538 */ 539 void 540 htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { 541 htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); 542 } 543 544 /** 545 * htmlDocDumpMemoryFormat: 546 * @cur: the document 547 * @mem: OUT: the memory pointer 548 * @size: OUT: the memory length 549 * @format: should formatting spaces been added 550 * 551 * Dump an HTML document in memory and return the xmlChar * and it's size. 552 * It's up to the caller to free the memory. 553 */ 554 void 555 htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) { 556 xmlOutputBufferPtr buf; 557 xmlCharEncodingHandlerPtr handler = NULL; 558 const char *encoding; 559 560 xmlInitParser(); 561 562 if ((mem == NULL) || (size == NULL)) 563 return; 564 if (cur == NULL) { 565 *mem = NULL; 566 *size = 0; 567 return; 568 } 569 570 encoding = (const char *) htmlGetMetaEncoding(cur); 571 572 if (encoding != NULL) { 573 xmlCharEncoding enc; 574 575 enc = xmlParseCharEncoding(encoding); 576 if (enc != cur->charset) { 577 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 578 /* 579 * Not supported yet 580 */ 581 *mem = NULL; 582 *size = 0; 583 return; 584 } 585 586 handler = xmlFindCharEncodingHandler(encoding); 587 if (handler == NULL) 588 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 589 590 } else { 591 handler = xmlFindCharEncodingHandler(encoding); 592 } 593 } 594 595 /* 596 * Fallback to HTML or ASCII when the encoding is unspecified 597 */ 598 if (handler == NULL) 599 handler = xmlFindCharEncodingHandler("HTML"); 600 if (handler == NULL) 601 handler = xmlFindCharEncodingHandler("ascii"); 602 603 buf = xmlAllocOutputBufferInternal(handler); 604 if (buf == NULL) { 605 *mem = NULL; 606 *size = 0; 607 return; 608 } 609 610 htmlDocContentDumpFormatOutput(buf, cur, NULL, format); 611 612 xmlOutputBufferFlush(buf); 613 if (buf->conv != NULL) { 614 *size = xmlBufUse(buf->conv); 615 *mem = xmlStrndup(xmlBufContent(buf->conv), *size); 616 } else { 617 *size = xmlBufUse(buf->buffer); 618 *mem = xmlStrndup(xmlBufContent(buf->buffer), *size); 619 } 620 (void)xmlOutputBufferClose(buf); 621 } 622 623 /** 624 * htmlDocDumpMemory: 625 * @cur: the document 626 * @mem: OUT: the memory pointer 627 * @size: OUT: the memory length 628 * 629 * Dump an HTML document in memory and return the xmlChar * and it's size. 630 * It's up to the caller to free the memory. 631 */ 632 void 633 htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { 634 htmlDocDumpMemoryFormat(cur, mem, size, 1); 635 } 636 637 638 /************************************************************************ 639 * * 640 * Dumping HTML tree content to an I/O output buffer * 641 * * 642 ************************************************************************/ 643 644 void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); 645 646 /** 647 * htmlDtdDumpOutput: 648 * @buf: the HTML buffer output 649 * @doc: the document 650 * @encoding: the encoding string 651 * 652 * TODO: check whether encoding is needed 653 * 654 * Dump the HTML document DTD, if any. 655 */ 656 static void 657 htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 658 const char *encoding ATTRIBUTE_UNUSED) { 659 xmlDtdPtr cur = doc->intSubset; 660 661 if (cur == NULL) { 662 htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); 663 return; 664 } 665 xmlOutputBufferWriteString(buf, "<!DOCTYPE "); 666 xmlOutputBufferWriteString(buf, (const char *)cur->name); 667 if (cur->ExternalID != NULL) { 668 xmlOutputBufferWriteString(buf, " PUBLIC "); 669 xmlBufWriteQuotedString(buf->buffer, cur->ExternalID); 670 if (cur->SystemID != NULL) { 671 xmlOutputBufferWriteString(buf, " "); 672 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 673 } 674 } else if (cur->SystemID != NULL && 675 xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) { 676 xmlOutputBufferWriteString(buf, " SYSTEM "); 677 xmlBufWriteQuotedString(buf->buffer, cur->SystemID); 678 } 679 xmlOutputBufferWriteString(buf, ">\n"); 680 } 681 682 /** 683 * htmlAttrDumpOutput: 684 * @buf: the HTML buffer output 685 * @doc: the document 686 * @cur: the attribute pointer 687 * @encoding: the encoding string 688 * 689 * Dump an HTML attribute 690 */ 691 static void 692 htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, 693 const char *encoding ATTRIBUTE_UNUSED) { 694 xmlChar *value; 695 696 /* 697 * The html output method should not escape a & character 698 * occurring in an attribute value immediately followed by 699 * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). 700 * This is implemented in xmlEncodeEntitiesReentrant 701 */ 702 703 if (cur == NULL) { 704 return; 705 } 706 xmlOutputBufferWriteString(buf, " "); 707 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 708 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 709 xmlOutputBufferWriteString(buf, ":"); 710 } 711 xmlOutputBufferWriteString(buf, (const char *)cur->name); 712 if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { 713 value = xmlNodeListGetString(doc, cur->children, 0); 714 if (value) { 715 xmlOutputBufferWriteString(buf, "="); 716 if ((cur->ns == NULL) && (cur->parent != NULL) && 717 (cur->parent->ns == NULL) && 718 ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || 719 (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || 720 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) || 721 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) && 722 (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) { 723 xmlChar *tmp = value; 724 /* xmlURIEscapeStr() escapes '"' so it can be safely used. */ 725 xmlBufCCat(buf->buffer, "\""); 726 727 while (IS_BLANK_CH(*tmp)) tmp++; 728 729 /* URI Escape everything, except server side includes. */ 730 for ( ; ; ) { 731 xmlChar *escaped; 732 xmlChar endChar; 733 xmlChar *end = NULL; 734 xmlChar *start = (xmlChar *)xmlStrstr(tmp, BAD_CAST "<!--"); 735 if (start != NULL) { 736 end = (xmlChar *)xmlStrstr(tmp, BAD_CAST "-->"); 737 if (end != NULL) { 738 *start = '\0'; 739 } 740 } 741 742 /* Escape the whole string, or until start (set to '\0'). */ 743 escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+"); 744 if (escaped != NULL) { 745 xmlBufCat(buf->buffer, escaped); 746 xmlFree(escaped); 747 } else { 748 xmlBufCat(buf->buffer, tmp); 749 } 750 751 if (end == NULL) { /* Everything has been written. */ 752 break; 753 } 754 755 /* Do not escape anything within server side includes. */ 756 *start = '<'; /* Restore the first character of "<!--". */ 757 end += 3; /* strlen("-->") */ 758 endChar = *end; 759 *end = '\0'; 760 xmlBufCat(buf->buffer, start); 761 *end = endChar; 762 tmp = end; 763 } 764 765 xmlBufCCat(buf->buffer, "\""); 766 } else { 767 xmlBufWriteQuotedString(buf->buffer, value); 768 } 769 xmlFree(value); 770 } else { 771 xmlOutputBufferWriteString(buf, "=\"\""); 772 } 773 } 774 } 775 776 /** 777 * htmlAttrListDumpOutput: 778 * @buf: the HTML buffer output 779 * @doc: the document 780 * @cur: the first attribute pointer 781 * @encoding: the encoding string 782 * 783 * Dump a list of HTML attributes 784 */ 785 static void 786 htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) { 787 if (cur == NULL) { 788 return; 789 } 790 while (cur != NULL) { 791 htmlAttrDumpOutput(buf, doc, cur, encoding); 792 cur = cur->next; 793 } 794 } 795 796 797 798 /** 799 * htmlNodeListDumpOutput: 800 * @buf: the HTML buffer output 801 * @doc: the document 802 * @cur: the first node 803 * @encoding: the encoding string 804 * @format: should formatting spaces been added 805 * 806 * Dump an HTML node list, recursive behaviour,children are printed too. 807 */ 808 static void 809 htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 810 xmlNodePtr cur, const char *encoding, int format) { 811 if (cur == NULL) { 812 return; 813 } 814 while (cur != NULL) { 815 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); 816 cur = cur->next; 817 } 818 } 819 820 /** 821 * htmlNodeDumpFormatOutput: 822 * @buf: the HTML buffer output 823 * @doc: the document 824 * @cur: the current node 825 * @encoding: the encoding string 826 * @format: should formatting spaces been added 827 * 828 * Dump an HTML node, recursive behaviour,children are printed too. 829 */ 830 void 831 htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 832 xmlNodePtr cur, const char *encoding, int format) { 833 const htmlElemDesc * info; 834 835 xmlInitParser(); 836 837 if ((cur == NULL) || (buf == NULL)) { 838 return; 839 } 840 /* 841 * Special cases. 842 */ 843 if (cur->type == XML_DTD_NODE) 844 return; 845 if ((cur->type == XML_HTML_DOCUMENT_NODE) || 846 (cur->type == XML_DOCUMENT_NODE)){ 847 htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding); 848 return; 849 } 850 if (cur->type == XML_ATTRIBUTE_NODE) { 851 htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding); 852 return; 853 } 854 if (cur->type == HTML_TEXT_NODE) { 855 if (cur->content != NULL) { 856 if (((cur->name == (const xmlChar *)xmlStringText) || 857 (cur->name != (const xmlChar *)xmlStringTextNoenc)) && 858 ((cur->parent == NULL) || 859 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && 860 (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { 861 xmlChar *buffer; 862 863 buffer = xmlEncodeEntitiesReentrant(doc, cur->content); 864 if (buffer != NULL) { 865 xmlOutputBufferWriteString(buf, (const char *)buffer); 866 xmlFree(buffer); 867 } 868 } else { 869 xmlOutputBufferWriteString(buf, (const char *)cur->content); 870 } 871 } 872 return; 873 } 874 if (cur->type == HTML_COMMENT_NODE) { 875 if (cur->content != NULL) { 876 xmlOutputBufferWriteString(buf, "<!--"); 877 xmlOutputBufferWriteString(buf, (const char *)cur->content); 878 xmlOutputBufferWriteString(buf, "-->"); 879 } 880 return; 881 } 882 if (cur->type == HTML_PI_NODE) { 883 if (cur->name == NULL) 884 return; 885 xmlOutputBufferWriteString(buf, "<?"); 886 xmlOutputBufferWriteString(buf, (const char *)cur->name); 887 if (cur->content != NULL) { 888 xmlOutputBufferWriteString(buf, " "); 889 xmlOutputBufferWriteString(buf, (const char *)cur->content); 890 } 891 xmlOutputBufferWriteString(buf, ">"); 892 return; 893 } 894 if (cur->type == HTML_ENTITY_REF_NODE) { 895 xmlOutputBufferWriteString(buf, "&"); 896 xmlOutputBufferWriteString(buf, (const char *)cur->name); 897 xmlOutputBufferWriteString(buf, ";"); 898 return; 899 } 900 if (cur->type == HTML_PRESERVE_NODE) { 901 if (cur->content != NULL) { 902 xmlOutputBufferWriteString(buf, (const char *)cur->content); 903 } 904 return; 905 } 906 907 /* 908 * Get specific HTML info for that node. 909 */ 910 if (cur->ns == NULL) 911 info = htmlTagLookup(cur->name); 912 else 913 info = NULL; 914 915 xmlOutputBufferWriteString(buf, "<"); 916 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 917 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 918 xmlOutputBufferWriteString(buf, ":"); 919 } 920 xmlOutputBufferWriteString(buf, (const char *)cur->name); 921 if (cur->nsDef) 922 xmlNsListDumpOutput(buf, cur->nsDef); 923 if (cur->properties != NULL) 924 htmlAttrListDumpOutput(buf, doc, cur->properties, encoding); 925 926 if ((info != NULL) && (info->empty)) { 927 xmlOutputBufferWriteString(buf, ">"); 928 if ((format) && (!info->isinline) && (cur->next != NULL)) { 929 if ((cur->next->type != HTML_TEXT_NODE) && 930 (cur->next->type != HTML_ENTITY_REF_NODE) && 931 (cur->parent != NULL) && 932 (cur->parent->name != NULL) && 933 (cur->parent->name[0] != 'p')) /* p, pre, param */ 934 xmlOutputBufferWriteString(buf, "\n"); 935 } 936 return; 937 } 938 if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) && 939 (cur->children == NULL)) { 940 if ((info != NULL) && (info->saveEndTag != 0) && 941 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) && 942 (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { 943 xmlOutputBufferWriteString(buf, ">"); 944 } else { 945 xmlOutputBufferWriteString(buf, "></"); 946 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 947 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 948 xmlOutputBufferWriteString(buf, ":"); 949 } 950 xmlOutputBufferWriteString(buf, (const char *)cur->name); 951 xmlOutputBufferWriteString(buf, ">"); 952 } 953 if ((format) && (cur->next != NULL) && 954 (info != NULL) && (!info->isinline)) { 955 if ((cur->next->type != HTML_TEXT_NODE) && 956 (cur->next->type != HTML_ENTITY_REF_NODE) && 957 (cur->parent != NULL) && 958 (cur->parent->name != NULL) && 959 (cur->parent->name[0] != 'p')) /* p, pre, param */ 960 xmlOutputBufferWriteString(buf, "\n"); 961 } 962 return; 963 } 964 xmlOutputBufferWriteString(buf, ">"); 965 if ((cur->type != XML_ELEMENT_NODE) && 966 (cur->content != NULL)) { 967 /* 968 * Uses the OutputBuffer property to automatically convert 969 * invalids to charrefs 970 */ 971 972 xmlOutputBufferWriteString(buf, (const char *) cur->content); 973 } 974 if (cur->children != NULL) { 975 if ((format) && (info != NULL) && (!info->isinline) && 976 (cur->children->type != HTML_TEXT_NODE) && 977 (cur->children->type != HTML_ENTITY_REF_NODE) && 978 (cur->children != cur->last) && 979 (cur->name != NULL) && 980 (cur->name[0] != 'p')) /* p, pre, param */ 981 xmlOutputBufferWriteString(buf, "\n"); 982 htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format); 983 if ((format) && (info != NULL) && (!info->isinline) && 984 (cur->last->type != HTML_TEXT_NODE) && 985 (cur->last->type != HTML_ENTITY_REF_NODE) && 986 (cur->children != cur->last) && 987 (cur->name != NULL) && 988 (cur->name[0] != 'p')) /* p, pre, param */ 989 xmlOutputBufferWriteString(buf, "\n"); 990 } 991 xmlOutputBufferWriteString(buf, "</"); 992 if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { 993 xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); 994 xmlOutputBufferWriteString(buf, ":"); 995 } 996 xmlOutputBufferWriteString(buf, (const char *)cur->name); 997 xmlOutputBufferWriteString(buf, ">"); 998 if ((format) && (info != NULL) && (!info->isinline) && 999 (cur->next != NULL)) { 1000 if ((cur->next->type != HTML_TEXT_NODE) && 1001 (cur->next->type != HTML_ENTITY_REF_NODE) && 1002 (cur->parent != NULL) && 1003 (cur->parent->name != NULL) && 1004 (cur->parent->name[0] != 'p')) /* p, pre, param */ 1005 xmlOutputBufferWriteString(buf, "\n"); 1006 } 1007 } 1008 1009 /** 1010 * htmlNodeDumpOutput: 1011 * @buf: the HTML buffer output 1012 * @doc: the document 1013 * @cur: the current node 1014 * @encoding: the encoding string 1015 * 1016 * Dump an HTML node, recursive behaviour,children are printed too, 1017 * and formatting returns/spaces are added. 1018 */ 1019 void 1020 htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, 1021 xmlNodePtr cur, const char *encoding) { 1022 htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1); 1023 } 1024 1025 /** 1026 * htmlDocContentDumpFormatOutput: 1027 * @buf: the HTML buffer output 1028 * @cur: the document 1029 * @encoding: the encoding string 1030 * @format: should formatting spaces been added 1031 * 1032 * Dump an HTML document. 1033 */ 1034 void 1035 htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1036 const char *encoding, int format) { 1037 int type; 1038 1039 xmlInitParser(); 1040 1041 if ((buf == NULL) || (cur == NULL)) 1042 return; 1043 1044 /* 1045 * force to output the stuff as HTML, especially for entities 1046 */ 1047 type = cur->type; 1048 cur->type = XML_HTML_DOCUMENT_NODE; 1049 if (cur->intSubset != NULL) { 1050 htmlDtdDumpOutput(buf, cur, NULL); 1051 } 1052 if (cur->children != NULL) { 1053 htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format); 1054 } 1055 xmlOutputBufferWriteString(buf, "\n"); 1056 cur->type = (xmlElementType) type; 1057 } 1058 1059 /** 1060 * htmlDocContentDumpOutput: 1061 * @buf: the HTML buffer output 1062 * @cur: the document 1063 * @encoding: the encoding string 1064 * 1065 * Dump an HTML document. Formating return/spaces are added. 1066 */ 1067 void 1068 htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, 1069 const char *encoding) { 1070 htmlDocContentDumpFormatOutput(buf, cur, encoding, 1); 1071 } 1072 1073 /************************************************************************ 1074 * * 1075 * Saving functions front-ends * 1076 * * 1077 ************************************************************************/ 1078 1079 /** 1080 * htmlDocDump: 1081 * @f: the FILE* 1082 * @cur: the document 1083 * 1084 * Dump an HTML document to an open FILE. 1085 * 1086 * returns: the number of byte written or -1 in case of failure. 1087 */ 1088 int 1089 htmlDocDump(FILE *f, xmlDocPtr cur) { 1090 xmlOutputBufferPtr buf; 1091 xmlCharEncodingHandlerPtr handler = NULL; 1092 const char *encoding; 1093 int ret; 1094 1095 xmlInitParser(); 1096 1097 if ((cur == NULL) || (f == NULL)) { 1098 return(-1); 1099 } 1100 1101 encoding = (const char *) htmlGetMetaEncoding(cur); 1102 1103 if (encoding != NULL) { 1104 xmlCharEncoding enc; 1105 1106 enc = xmlParseCharEncoding(encoding); 1107 if (enc != cur->charset) { 1108 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1109 /* 1110 * Not supported yet 1111 */ 1112 return(-1); 1113 } 1114 1115 handler = xmlFindCharEncodingHandler(encoding); 1116 if (handler == NULL) 1117 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1118 } else { 1119 handler = xmlFindCharEncodingHandler(encoding); 1120 } 1121 } 1122 1123 /* 1124 * Fallback to HTML or ASCII when the encoding is unspecified 1125 */ 1126 if (handler == NULL) 1127 handler = xmlFindCharEncodingHandler("HTML"); 1128 if (handler == NULL) 1129 handler = xmlFindCharEncodingHandler("ascii"); 1130 1131 buf = xmlOutputBufferCreateFile(f, handler); 1132 if (buf == NULL) return(-1); 1133 htmlDocContentDumpOutput(buf, cur, NULL); 1134 1135 ret = xmlOutputBufferClose(buf); 1136 return(ret); 1137 } 1138 1139 /** 1140 * htmlSaveFile: 1141 * @filename: the filename (or URL) 1142 * @cur: the document 1143 * 1144 * Dump an HTML document to a file. If @filename is "-" the stdout file is 1145 * used. 1146 * returns: the number of byte written or -1 in case of failure. 1147 */ 1148 int 1149 htmlSaveFile(const char *filename, xmlDocPtr cur) { 1150 xmlOutputBufferPtr buf; 1151 xmlCharEncodingHandlerPtr handler = NULL; 1152 const char *encoding; 1153 int ret; 1154 1155 if ((cur == NULL) || (filename == NULL)) 1156 return(-1); 1157 1158 xmlInitParser(); 1159 1160 encoding = (const char *) htmlGetMetaEncoding(cur); 1161 1162 if (encoding != NULL) { 1163 xmlCharEncoding enc; 1164 1165 enc = xmlParseCharEncoding(encoding); 1166 if (enc != cur->charset) { 1167 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1168 /* 1169 * Not supported yet 1170 */ 1171 return(-1); 1172 } 1173 1174 handler = xmlFindCharEncodingHandler(encoding); 1175 if (handler == NULL) 1176 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1177 } 1178 } 1179 1180 /* 1181 * Fallback to HTML or ASCII when the encoding is unspecified 1182 */ 1183 if (handler == NULL) 1184 handler = xmlFindCharEncodingHandler("HTML"); 1185 if (handler == NULL) 1186 handler = xmlFindCharEncodingHandler("ascii"); 1187 1188 /* 1189 * save the content to a temp buffer. 1190 */ 1191 buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); 1192 if (buf == NULL) return(0); 1193 1194 htmlDocContentDumpOutput(buf, cur, NULL); 1195 1196 ret = xmlOutputBufferClose(buf); 1197 return(ret); 1198 } 1199 1200 /** 1201 * htmlSaveFileFormat: 1202 * @filename: the filename 1203 * @cur: the document 1204 * @format: should formatting spaces been added 1205 * @encoding: the document encoding 1206 * 1207 * Dump an HTML document to a file using a given encoding. 1208 * 1209 * returns: the number of byte written or -1 in case of failure. 1210 */ 1211 int 1212 htmlSaveFileFormat(const char *filename, xmlDocPtr cur, 1213 const char *encoding, int format) { 1214 xmlOutputBufferPtr buf; 1215 xmlCharEncodingHandlerPtr handler = NULL; 1216 int ret; 1217 1218 if ((cur == NULL) || (filename == NULL)) 1219 return(-1); 1220 1221 xmlInitParser(); 1222 1223 if (encoding != NULL) { 1224 xmlCharEncoding enc; 1225 1226 enc = xmlParseCharEncoding(encoding); 1227 if (enc != cur->charset) { 1228 if (cur->charset != XML_CHAR_ENCODING_UTF8) { 1229 /* 1230 * Not supported yet 1231 */ 1232 return(-1); 1233 } 1234 1235 handler = xmlFindCharEncodingHandler(encoding); 1236 if (handler == NULL) 1237 htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding); 1238 } 1239 htmlSetMetaEncoding(cur, (const xmlChar *) encoding); 1240 } else { 1241 htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); 1242 } 1243 1244 /* 1245 * Fallback to HTML or ASCII when the encoding is unspecified 1246 */ 1247 if (handler == NULL) 1248 handler = xmlFindCharEncodingHandler("HTML"); 1249 if (handler == NULL) 1250 handler = xmlFindCharEncodingHandler("ascii"); 1251 1252 /* 1253 * save the content to a temp buffer. 1254 */ 1255 buf = xmlOutputBufferCreateFilename(filename, handler, 0); 1256 if (buf == NULL) return(0); 1257 1258 htmlDocContentDumpFormatOutput(buf, cur, encoding, format); 1259 1260 ret = xmlOutputBufferClose(buf); 1261 return(ret); 1262 } 1263 1264 /** 1265 * htmlSaveFileEnc: 1266 * @filename: the filename 1267 * @cur: the document 1268 * @encoding: the document encoding 1269 * 1270 * Dump an HTML document to a file using a given encoding 1271 * and formatting returns/spaces are added. 1272 * 1273 * returns: the number of byte written or -1 in case of failure. 1274 */ 1275 int 1276 htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { 1277 return(htmlSaveFileFormat(filename, cur, encoding, 1)); 1278 } 1279 1280 #endif /* LIBXML_OUTPUT_ENABLED */ 1281 1282 #define bottom_HTMLtree 1283 #include "elfgcchack.h" 1284 #endif /* LIBXML_HTML_ENABLED */