/ libxml2 / HTMLtree.c
HTMLtree.c
   1  /*
   2   * HTMLtree.c : implementation of access function for an HTML tree.
   3   *
   4   * See Copyright for the status of this software.
   5   *
   6   * daniel@veillard.com
   7   */
   8  
   9  
  10  #define IN_LIBXML
  11  #include "libxml.h"
  12  #ifdef LIBXML_HTML_ENABLED
  13  
  14  #include <string.h> /* for memset() only ! */
  15  
  16  #ifdef HAVE_CTYPE_H
  17  #include <ctype.h>
  18  #endif
  19  #ifdef HAVE_STDLIB_H
  20  #include <stdlib.h>
  21  #endif
  22  
  23  #include <libxml/xmlmemory.h>
  24  #include <libxml/HTMLparser.h>
  25  #include <libxml/HTMLtree.h>
  26  #include <libxml/entities.h>
  27  #include <libxml/valid.h>
  28  #include <libxml/xmlerror.h>
  29  #include <libxml/parserInternals.h>
  30  #include <libxml/globals.h>
  31  #include <libxml/uri.h>
  32  
  33  #include "buf.h"
  34  
  35  /************************************************************************
  36   *									*
  37   *		Getting/Setting encoding meta tags			*
  38   *									*
  39   ************************************************************************/
  40  
  41  /**
  42   * htmlGetMetaEncoding:
  43   * @doc:  the document
  44   *
  45   * Encoding definition lookup in the Meta tags
  46   *
  47   * Returns the current encoding as flagged in the HTML source
  48   */
  49  const xmlChar *
  50  htmlGetMetaEncoding(htmlDocPtr doc) {
  51      htmlNodePtr cur;
  52      const xmlChar *content;
  53      const xmlChar *encoding;
  54  
  55      if (doc == NULL)
  56  	return(NULL);
  57      cur = doc->children;
  58  
  59      /*
  60       * Search the html
  61       */
  62      while (cur != NULL) {
  63  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
  64  	    if (xmlStrEqual(cur->name, BAD_CAST"html"))
  65  		break;
  66  	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
  67  		goto found_head;
  68  	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
  69  		goto found_meta;
  70  	}
  71  	cur = cur->next;
  72      }
  73      if (cur == NULL)
  74  	return(NULL);
  75      cur = cur->children;
  76  
  77      /*
  78       * Search the head
  79       */
  80      while (cur != NULL) {
  81  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
  82  	    if (xmlStrEqual(cur->name, BAD_CAST"head"))
  83  		break;
  84  	    if (xmlStrEqual(cur->name, BAD_CAST"meta"))
  85  		goto found_meta;
  86  	}
  87  	cur = cur->next;
  88      }
  89      if (cur == NULL)
  90  	return(NULL);
  91  found_head:
  92      cur = cur->children;
  93  
  94      /*
  95       * Search the meta elements
  96       */
  97  found_meta:
  98      while (cur != NULL) {
  99  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
 100  	    if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
 101  		xmlAttrPtr attr = cur->properties;
 102  		int http;
 103  		const xmlChar *value;
 104  
 105  		content = NULL;
 106  		http = 0;
 107  		while (attr != NULL) {
 108  		    if ((attr->children != NULL) &&
 109  		        (attr->children->type == XML_TEXT_NODE) &&
 110  		        (attr->children->next == NULL)) {
 111  			value = attr->children->content;
 112  			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
 113  			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
 114  			    http = 1;
 115  			else if ((value != NULL)
 116  			 && (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
 117  			    content = value;
 118  			if ((http != 0) && (content != NULL))
 119  			    goto found_content;
 120  		    }
 121  		    attr = attr->next;
 122  		}
 123  	    }
 124  	}
 125  	cur = cur->next;
 126      }
 127      return(NULL);
 128  
 129  found_content:
 130      encoding = xmlStrstr(content, BAD_CAST"charset=");
 131      if (encoding == NULL)
 132  	encoding = xmlStrstr(content, BAD_CAST"Charset=");
 133      if (encoding == NULL)
 134  	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
 135      if (encoding != NULL) {
 136  	encoding += 8;
 137      } else {
 138  	encoding = xmlStrstr(content, BAD_CAST"charset =");
 139  	if (encoding == NULL)
 140  	    encoding = xmlStrstr(content, BAD_CAST"Charset =");
 141  	if (encoding == NULL)
 142  	    encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
 143  	if (encoding != NULL)
 144  	    encoding += 9;
 145      }
 146      if (encoding != NULL) {
 147  	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
 148      }
 149      return(encoding);
 150  }
 151  
 152  /**
 153   * htmlSetMetaEncoding:
 154   * @doc:  the document
 155   * @encoding:  the encoding string
 156   *
 157   * Sets the current encoding in the Meta tags
 158   * NOTE: this will not change the document content encoding, just
 159   * the META flag associated.
 160   *
 161   * Returns 0 in case of success and -1 in case of error
 162   */
 163  int
 164  htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
 165      htmlNodePtr cur, meta = NULL, head = NULL;
 166      const xmlChar *content = NULL;
 167      char newcontent[100];
 168  
 169      newcontent[0] = 0;
 170  
 171      if (doc == NULL)
 172  	return(-1);
 173  
 174      /* html isn't a real encoding it's just libxml2 way to get entities */
 175      if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
 176          return(-1);
 177  
 178      if (encoding != NULL) {
 179  	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
 180                  (char *)encoding);
 181  	newcontent[sizeof(newcontent) - 1] = 0;
 182      }
 183  
 184      cur = doc->children;
 185  
 186      /*
 187       * Search the html
 188       */
 189      while (cur != NULL) {
 190  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
 191  	    if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
 192  		break;
 193  	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
 194  		goto found_head;
 195  	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
 196  		goto found_meta;
 197  	}
 198  	cur = cur->next;
 199      }
 200      if (cur == NULL)
 201  	return(-1);
 202      cur = cur->children;
 203  
 204      /*
 205       * Search the head
 206       */
 207      while (cur != NULL) {
 208  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
 209  	    if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
 210  		break;
 211  	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
 212                  head = cur->parent;
 213  		goto found_meta;
 214              }
 215  	}
 216  	cur = cur->next;
 217      }
 218      if (cur == NULL)
 219  	return(-1);
 220  found_head:
 221      head = cur;
 222      if (cur->children == NULL)
 223          goto create;
 224      cur = cur->children;
 225  
 226  found_meta:
 227      /*
 228       * Search and update all the remaining the meta elements carrying
 229       * encoding informations
 230       */
 231      while (cur != NULL) {
 232  	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
 233  	    if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
 234  		xmlAttrPtr attr = cur->properties;
 235  		int http;
 236  		const xmlChar *value;
 237  
 238  		content = NULL;
 239  		http = 0;
 240  		while (attr != NULL) {
 241  		    if ((attr->children != NULL) &&
 242  		        (attr->children->type == XML_TEXT_NODE) &&
 243  		        (attr->children->next == NULL)) {
 244  			value = attr->children->content;
 245  			if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
 246  			 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
 247  			    http = 1;
 248  			else
 249                          {
 250                             if ((value != NULL) &&
 251                                 (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
 252  			       content = value;
 253                          }
 254  		        if ((http != 0) && (content != NULL))
 255  			    break;
 256  		    }
 257  		    attr = attr->next;
 258  		}
 259  		if ((http != 0) && (content != NULL)) {
 260  		    meta = cur;
 261  		    break;
 262  		}
 263  
 264  	    }
 265  	}
 266  	cur = cur->next;
 267      }
 268  create:
 269      if (meta == NULL) {
 270          if ((encoding != NULL) && (head != NULL)) {
 271              /*
 272               * Create a new Meta element with the right attributes
 273               */
 274  
 275              meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
 276              if (head->children == NULL)
 277                  xmlAddChild(head, meta);
 278              else
 279                  xmlAddPrevSibling(head->children, meta);
 280              xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
 281              xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
 282          }
 283      } else {
 284          /* remove the meta tag if NULL is passed */
 285          if (encoding == NULL) {
 286              xmlUnlinkNode(meta);
 287              xmlFreeNode(meta);
 288          }
 289          /* change the document only if there is a real encoding change */
 290          else if (xmlStrcasestr(content, encoding) == NULL) {
 291              xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
 292          }
 293      }
 294  
 295  
 296      return(0);
 297  }
 298  
 299  /**
 300   * booleanHTMLAttrs:
 301   *
 302   * These are the HTML attributes which will be output
 303   * in minimized form, i.e. <option selected="selected"> will be
 304   * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 305   *
 306   */
 307  static const char* const htmlBooleanAttrs[] = {
 308    "checked", "compact", "declare", "defer", "disabled", "ismap",
 309    "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly",
 310    "selected", NULL
 311  };
 312  
 313  
 314  /**
 315   * htmlIsBooleanAttr:
 316   * @name:  the name of the attribute to check
 317   *
 318   * Determine if a given attribute is a boolean attribute.
 319   *
 320   * returns: false if the attribute is not boolean, true otherwise.
 321   */
 322  int
 323  htmlIsBooleanAttr(const xmlChar *name)
 324  {
 325      int i = 0;
 326  
 327      while (htmlBooleanAttrs[i] != NULL) {
 328          if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
 329              return 1;
 330          i++;
 331      }
 332      return 0;
 333  }
 334  
 335  #ifdef LIBXML_OUTPUT_ENABLED
 336  /*
 337   * Private routine exported from xmlIO.c; prototyped locally because htmlDocDumpMemoryFormat() below calls it.
 338   */
 339  xmlOutputBufferPtr
 340  xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
 341  /************************************************************************
 342   *									*
 343   *			Output error handlers				*
 344   *									*
 345   ************************************************************************/
 346  /**
 347   * htmlSaveErrMemory:
 348   * @extra:  extra information, passed through to the error report
 349   *
 350   * Report an out of memory condition (XML_ERR_NO_MEMORY) for the output module via __xmlSimpleError()
 351   */
 352  static void
 353  htmlSaveErrMemory(const char *extra)
 354  {
 355      __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); /* no node, no message format */
 356  }
 357  
 358  /**
 359   * htmlSaveErr:
 360   * @code:  the error number
 361   * @node:  the location of the error.
 362   * @extra:  extra informations
 363   *
 364   * Handle an out of memory condition
 365   */
 366  static void
 367  htmlSaveErr(int code, xmlNodePtr node, const char *extra)
 368  {
 369      const char *msg = NULL;
 370  
 371      switch(code) {
 372          case XML_SAVE_NOT_UTF8:
 373  	    msg = "string is not in UTF-8\n";
 374  	    break;
 375  	case XML_SAVE_CHAR_INVALID:
 376  	    msg = "invalid character value\n";
 377  	    break;
 378  	case XML_SAVE_UNKNOWN_ENCODING:
 379  	    msg = "unknown encoding %s\n";
 380  	    break;
 381  	case XML_SAVE_NO_DOCTYPE:
 382  	    msg = "HTML has no DOCTYPE\n";
 383  	    break;
 384  	default:
 385  	    msg = "unexpected error number\n";
 386      }
 387  #pragma clang diagnostic push
 388  #pragma clang diagnostic ignored "-Wformat-nonliteral"
 389      __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
 390  #pragma clang diagnostic pop
 391  }
 392  
 393  /************************************************************************
 394   *									*
 395   *		Dumping HTML tree content to a simple buffer		*
 396   *									*
 397   ************************************************************************/
 398  
 399  /**
 400   * htmlBufNodeDumpFormat:
 401   * @buf:  the xmlBufPtr output
 402   * @doc:  the document
 403   * @cur:  the current node
 404   * @format:  should formatting spaces been added
 405   *
 406   * Dump an HTML node, recursive behaviour,children are printed too.
 407   *
 408   * Returns the number of byte written or -1 in case of error
 409   */
 410  static size_t
 411  htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
 412  	           int format) {
 413      size_t use;
 414      int ret;
 415      xmlOutputBufferPtr outbuf;
 416  
 417      if (cur == NULL) {
 418  	return (-1);
 419      }
 420      if (buf == NULL) {
 421  	return (-1);
 422      }
 423      outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
 424      if (outbuf == NULL) {
 425          htmlSaveErrMemory("allocating HTML output buffer");
 426  	return (-1);
 427      }
 428      memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
 429      outbuf->buffer = buf;
 430      outbuf->encoder = NULL;
 431      outbuf->writecallback = NULL;
 432      outbuf->closecallback = NULL;
 433      outbuf->context = NULL;
 434      outbuf->written = 0;
 435  
 436      use = xmlBufUse(buf);
 437      htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
 438      xmlFree(outbuf);
 439      ret = xmlBufUse(buf) - use;
 440      return (ret);
 441  }
 442  
 443  /**
 444   * htmlNodeDump:
 445   * @buf:  the HTML buffer output
 446   * @doc:  the document
 447   * @cur:  the current node
 448   *
 449   * Dump an HTML node, recursive behaviour,children are printed too,
 450   * and formatting returns are added.
 451   *
 452   * Returns the number of byte written or -1 in case of error
 453   */
 454  int
 455  htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
 456      xmlBufPtr buffer;
 457      size_t ret;
 458  
 459      if ((buf == NULL) || (cur == NULL))
 460          return(-1);
 461  
 462      xmlInitParser();
 463      buffer = xmlBufFromBuffer(buf);
 464      if (buffer == NULL)
 465          return(-1);
 466  
 467      ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
 468  
 469      xmlBufBackToBuffer(buffer);
 470  
 471      if (ret > INT_MAX)
 472          return(-1);
 473      return((int) ret);
 474  }
 475  
 476  /**
 477   * htmlNodeDumpFileFormat:
 478   * @out:  the FILE pointer
 479   * @doc:  the document
 480   * @cur:  the current node
 481   * @encoding: the document encoding
 482   * @format:  should formatting spaces been added
 483   *
 484   * Dump an HTML node, recursive behaviour,children are printed too.
 485   *
 486   * TODO: if encoding == NULL try to save in the doc encoding
 487   *
 488   * returns: the number of byte written or -1 in case of failure.
 489   */
 490  int
 491  htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
 492  	               xmlNodePtr cur, const char *encoding, int format) {
 493      xmlOutputBufferPtr buf;
 494      xmlCharEncodingHandlerPtr handler = NULL;
 495      int ret;
 496  
 497      xmlInitParser();
 498  
 499      if (encoding != NULL) {
 500  	xmlCharEncoding enc;
 501  
 502  	enc = xmlParseCharEncoding(encoding);
 503  	if (enc != XML_CHAR_ENCODING_UTF8) {
 504  	    handler = xmlFindCharEncodingHandler(encoding);
 505  	    if (handler == NULL)
 506  		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
 507  	}
 508      }
 509  
 510      /*
 511       * Fallback to HTML or ASCII when the encoding is unspecified
 512       */
 513      if (handler == NULL)
 514  	handler = xmlFindCharEncodingHandler("HTML");
 515      if (handler == NULL)
 516  	handler = xmlFindCharEncodingHandler("ascii");
 517  
 518      /*
 519       * save the content to a temp buffer.
 520       */
 521      buf = xmlOutputBufferCreateFile(out, handler);
 522      if (buf == NULL) return(0);
 523  
 524      htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
 525  
 526      ret = xmlOutputBufferClose(buf);
 527      return(ret);
 528  }
 529  
 530  /**
 531   * htmlNodeDumpFile:
 532   * @out:  the FILE pointer
 533   * @doc:  the document
 534   * @cur:  the current node
 535   *
 536   * Dump an HTML node and its children to @out with formatting enabled, by calling
 537   * htmlNodeDumpFileFormat() with a NULL encoding; that call's return value is discarded.
 538   */
 539  void
 540  htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
 541      htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); /* encoding = NULL, format = 1 */
 542  }
 543  
 544  /**
 545   * htmlDocDumpMemoryFormat:
 546   * @cur:  the document
 547   * @mem:  OUT: the memory pointer
 548   * @size:  OUT: the memory length
 549   * @format:  should formatting spaces been added
 550   *
 551   * Dump an HTML document in memory and return the xmlChar * and it's size.
 552   * It's up to the caller to free the memory.
 553   */
 554  void
 555  htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar**mem, int *size, int format) {
 556      xmlOutputBufferPtr buf;
 557      xmlCharEncodingHandlerPtr handler = NULL;
 558      const char *encoding;
 559  
 560      xmlInitParser();
 561  
 562      if ((mem == NULL) || (size == NULL))
 563          return;
 564      if (cur == NULL) {
 565  	*mem = NULL;
 566  	*size = 0;
 567  	return;
 568      }
 569  
 570      encoding = (const char *) htmlGetMetaEncoding(cur);
 571  
 572      if (encoding != NULL) {
 573  	xmlCharEncoding enc;
 574  
 575  	enc = xmlParseCharEncoding(encoding);
 576  	if (enc != cur->charset) {
 577  	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
 578  		/*
 579  		 * Not supported yet
 580  		 */
 581  		*mem = NULL;
 582  		*size = 0;
 583  		return;
 584  	    }
 585  
 586  	    handler = xmlFindCharEncodingHandler(encoding);
 587  	    if (handler == NULL)
 588                  htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
 589  
 590  	} else {
 591  	    handler = xmlFindCharEncodingHandler(encoding);
 592  	}
 593      }
 594  
 595      /*
 596       * Fallback to HTML or ASCII when the encoding is unspecified
 597       */
 598      if (handler == NULL)
 599  	handler = xmlFindCharEncodingHandler("HTML");
 600      if (handler == NULL)
 601  	handler = xmlFindCharEncodingHandler("ascii");
 602  
 603      buf = xmlAllocOutputBufferInternal(handler);
 604      if (buf == NULL) {
 605  	*mem = NULL;
 606  	*size = 0;
 607  	return;
 608      }
 609  
 610      htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
 611  
 612      xmlOutputBufferFlush(buf);
 613      if (buf->conv != NULL) {
 614  	*size = xmlBufUse(buf->conv);
 615  	*mem = xmlStrndup(xmlBufContent(buf->conv), *size);
 616      } else {
 617  	*size = xmlBufUse(buf->buffer);
 618  	*mem = xmlStrndup(xmlBufContent(buf->buffer), *size);
 619      }
 620      (void)xmlOutputBufferClose(buf);
 621  }
 622  
 623  /**
 624   * htmlDocDumpMemory:
 625   * @cur:  the document
 626   * @mem:  OUT: the memory pointer
 627   * @size:  OUT: the memory length
 628   *
 629   * Dump an HTML document in memory with formatting enabled (htmlDocDumpMemoryFormat() with format set to 1)
 630   * and return the xmlChar * and its size. It's up to the caller to free the memory.
 631   */
 632  void
 633  htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) {
 634  	htmlDocDumpMemoryFormat(cur, mem, size, 1); /* format = 1 */
 635  }
 636  
 637  
 638  /************************************************************************
 639   *									*
 640   *		Dumping HTML tree content to an I/O output buffer	*
 641   *									*
 642   ************************************************************************/
 643  
 644  void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); /* prototyped locally; htmlNodeDumpFormatOutput() calls it on an element's nsDef list */
 645  
 646  /**
 647   * htmlDtdDumpOutput:
 648   * @buf:  the HTML buffer output
 649   * @doc:  the document
 650   * @encoding:  the encoding string
 651   *
 652   * TODO: check whether encoding is needed
 653   *
 654   * Dump the HTML document DTD, if any.
 655   */
 656  static void
 657  htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 658  	          const char *encoding ATTRIBUTE_UNUSED) {
 659      xmlDtdPtr cur = doc->intSubset;
 660  
 661      if (cur == NULL) {
 662  	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
 663  	return;
 664      }
 665      xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
 666      xmlOutputBufferWriteString(buf, (const char *)cur->name);
 667      if (cur->ExternalID != NULL) {
 668  	xmlOutputBufferWriteString(buf, " PUBLIC ");
 669  	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
 670  	if (cur->SystemID != NULL) {
 671  	    xmlOutputBufferWriteString(buf, " ");
 672  	    xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
 673  	}
 674      } else if (cur->SystemID != NULL &&
 675  	       xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
 676  	xmlOutputBufferWriteString(buf, " SYSTEM ");
 677  	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
 678      }
 679      xmlOutputBufferWriteString(buf, ">\n");
 680  }
 681  
 682  /**
 683   * htmlAttrDumpOutput:
 684   * @buf:  the HTML buffer output
 685   * @doc:  the document
 686   * @cur:  the attribute pointer
 687   * @encoding:  the encoding string
 688   *
 689   * Dump an HTML attribute
 690   */
 691  static void
 692  htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur,
 693  	           const char *encoding ATTRIBUTE_UNUSED) {
 694      xmlChar *value;
 695  
 696      /*
 697       * The html output method should not escape a & character
 698       * occurring in an attribute value immediately followed by
 699       * a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
 700       * This is implemented in xmlEncodeEntitiesReentrant
 701       */
 702  
 703      if (cur == NULL) {
 704  	return;
 705      }
 706      xmlOutputBufferWriteString(buf, " ");
 707      if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 708          xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
 709  	xmlOutputBufferWriteString(buf, ":");
 710      }
 711      xmlOutputBufferWriteString(buf, (const char *)cur->name);
 712      if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
 713  	value = xmlNodeListGetString(doc, cur->children, 0);
 714  	if (value) {
 715  	    xmlOutputBufferWriteString(buf, "=");
 716  	    if ((cur->ns == NULL) && (cur->parent != NULL) &&
 717  		(cur->parent->ns == NULL) &&
 718  		((!xmlStrcasecmp(cur->name, BAD_CAST "href")) ||
 719  	         (!xmlStrcasecmp(cur->name, BAD_CAST "action")) ||
 720  		 (!xmlStrcasecmp(cur->name, BAD_CAST "src")) ||
 721  		 ((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
 722  		  (!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
 723  		xmlChar *tmp = value;
 724  		/* xmlURIEscapeStr() escapes '"' so it can be safely used. */
 725  		xmlBufCCat(buf->buffer, "\"");
 726  
 727  		while (IS_BLANK_CH(*tmp)) tmp++;
 728  
 729  		/* URI Escape everything, except server side includes. */
 730  		for ( ; ; ) {
 731  		    xmlChar *escaped;
 732  		    xmlChar endChar;
 733  		    xmlChar *end = NULL;
 734  		    xmlChar *start = (xmlChar *)xmlStrstr(tmp, BAD_CAST "<!--");
 735  		    if (start != NULL) {
 736  			end = (xmlChar *)xmlStrstr(tmp, BAD_CAST "-->");
 737  			if (end != NULL) {
 738  			    *start = '\0';
 739  			}
 740  		    }
 741  
 742  		    /* Escape the whole string, or until start (set to '\0'). */
 743  		    escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+");
 744  		    if (escaped != NULL) {
 745  		        xmlBufCat(buf->buffer, escaped);
 746  		        xmlFree(escaped);
 747  		    } else {
 748  		        xmlBufCat(buf->buffer, tmp);
 749  		    }
 750  
 751  		    if (end == NULL) { /* Everything has been written. */
 752  			break;
 753  		    }
 754  
 755  		    /* Do not escape anything within server side includes. */
 756  		    *start = '<'; /* Restore the first character of "<!--". */
 757  		    end += 3; /* strlen("-->") */
 758  		    endChar = *end;
 759  		    *end = '\0';
 760  		    xmlBufCat(buf->buffer, start);
 761  		    *end = endChar;
 762  		    tmp = end;
 763  		}
 764  
 765  		xmlBufCCat(buf->buffer, "\"");
 766  	    } else {
 767  		xmlBufWriteQuotedString(buf->buffer, value);
 768  	    }
 769  	    xmlFree(value);
 770  	} else  {
 771  	    xmlOutputBufferWriteString(buf, "=\"\"");
 772  	}
 773      }
 774  }
 775  
 776  /**
 777   * htmlAttrListDumpOutput:
 778   * @buf:  the HTML buffer output
 779   * @doc:  the document
 780   * @cur:  the first attribute pointer
 781   * @encoding:  the encoding string
 782   *
 783   * Dump a list of HTML attributes
 784   */
 785  static void
 786  htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) {
 787      if (cur == NULL) {
 788  	return;
 789      }
 790      while (cur != NULL) {
 791          htmlAttrDumpOutput(buf, doc, cur, encoding);
 792  	cur = cur->next;
 793      }
 794  }
 795  
 796  
 797  
 798  /**
 799   * htmlNodeListDumpOutput:
 800   * @buf:  the HTML buffer output
 801   * @doc:  the document
 802   * @cur:  the first node
 803   * @encoding:  the encoding string
 804   * @format:  should formatting spaces been added
 805   *
 806   * Dump an HTML node list, recursive behaviour,children are printed too.
 807   */
 808  static void
 809  htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 810  	               xmlNodePtr cur, const char *encoding, int format) {
 811      if (cur == NULL) {
 812  	return;
 813      }
 814      while (cur != NULL) {
 815          htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
 816  	cur = cur->next;
 817      }
 818  }
 819  
 820  /**
 821   * htmlNodeDumpFormatOutput:
 822   * @buf:  the HTML buffer output
 823   * @doc:  the document
 824   * @cur:  the current node
 825   * @encoding:  the encoding string
 826   * @format:  should formatting newlines be added
 827   *
 828   * Dump an HTML node to @buf, recursive behaviour, children are printed too; does nothing when @cur or @buf is NULL.
 829   */
 830  void
 831  htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
 832  	                 xmlNodePtr cur, const char *encoding, int format) {
 833      const htmlElemDesc * info; /* HTML description of the element, NULL if namespaced */
 834  
 835      xmlInitParser();
 836  
 837      if ((cur == NULL) || (buf == NULL)) {
 838  	return;
 839      }
 840      /*
 841       * Special cases: every node type except elements is written and returned from here.
 842       */
 843      if (cur->type == XML_DTD_NODE) /* DTD nodes produce no output here */
 844  	return;
 845      if ((cur->type == XML_HTML_DOCUMENT_NODE) ||
 846          (cur->type == XML_DOCUMENT_NODE)){
 847  	htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding); /* note: format is not forwarded */
 848  	return;
 849      }
 850      if (cur->type == XML_ATTRIBUTE_NODE) {
 851          htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur, encoding);
 852  	return;
 853      }
 854      if (cur->type == HTML_TEXT_NODE) {
 855  	if (cur->content != NULL) {
 856  	    if (((cur->name == (const xmlChar *)xmlStringText) ||
 857  		 (cur->name != (const xmlChar *)xmlStringTextNoenc)) && /* i.e. not a "noenc" text node, assuming the two markers are distinct pointers */
 858  		((cur->parent == NULL) ||
 859  		 ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) &&
 860  		  (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { /* text inside script/style is written raw */
 861  		xmlChar *buffer;
 862  
 863  		buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
 864  		if (buffer != NULL) {
 865  		    xmlOutputBufferWriteString(buf, (const char *)buffer);
 866  		    xmlFree(buffer);
 867  		}
 868  	    } else {
 869  		xmlOutputBufferWriteString(buf, (const char *)cur->content);
 870  	    }
 871  	}
 872  	return;
 873      }
 874      if (cur->type == HTML_COMMENT_NODE) {
 875  	if (cur->content != NULL) {
 876  	    xmlOutputBufferWriteString(buf, "<!--");
 877  	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
 878  	    xmlOutputBufferWriteString(buf, "-->");
 879  	}
 880  	return;
 881      }
 882      if (cur->type == HTML_PI_NODE) {
 883  	if (cur->name == NULL)
 884  	    return;
 885  	xmlOutputBufferWriteString(buf, "<?");
 886  	xmlOutputBufferWriteString(buf, (const char *)cur->name);
 887  	if (cur->content != NULL) {
 888  	    xmlOutputBufferWriteString(buf, " ");
 889  	    xmlOutputBufferWriteString(buf, (const char *)cur->content);
 890  	}
 891  	xmlOutputBufferWriteString(buf, ">"); /* closed with ">" only, not "?>" */
 892  	return;
 893      }
 894      if (cur->type == HTML_ENTITY_REF_NODE) {
 895          xmlOutputBufferWriteString(buf, "&");
 896  	xmlOutputBufferWriteString(buf, (const char *)cur->name);
 897          xmlOutputBufferWriteString(buf, ";");
 898  	return;
 899      }
 900      if (cur->type == HTML_PRESERVE_NODE) {
 901  	if (cur->content != NULL) {
 902  	    xmlOutputBufferWriteString(buf, (const char *)cur->content); /* written as is */
 903  	}
 904  	return;
 905      }
 906  
 907      /*
 908       * Get specific HTML info for that node.
 909       */
 910      if (cur->ns == NULL)
 911  	info = htmlTagLookup(cur->name);
 912      else
 913  	info = NULL;
 914  
 915      xmlOutputBufferWriteString(buf, "<"); /* start tag: <prefix:name, namespace declarations, attributes */
 916      if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 917          xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
 918  	xmlOutputBufferWriteString(buf, ":");
 919      }
 920      xmlOutputBufferWriteString(buf, (const char *)cur->name);
 921      if (cur->nsDef)
 922  	xmlNsListDumpOutput(buf, cur->nsDef);
 923      if (cur->properties != NULL)
 924          htmlAttrListDumpOutput(buf, doc, cur->properties, encoding);
 925  
 926      if ((info != NULL) && (info->empty)) { /* element flagged empty: start tag only, children are not dumped */
 927          xmlOutputBufferWriteString(buf, ">");
 928  	if ((format) && (!info->isinline) && (cur->next != NULL)) {
 929  	    if ((cur->next->type != HTML_TEXT_NODE) &&
 930  		(cur->next->type != HTML_ENTITY_REF_NODE) &&
 931  		(cur->parent != NULL) &&
 932  		(cur->parent->name != NULL) &&
 933  		(cur->parent->name[0] != 'p')) /* p, pre, param */
 934  		xmlOutputBufferWriteString(buf, "\n");
 935  	}
 936  	return;
 937      }
 938      if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) &&
 939  	(cur->children == NULL)) { /* nothing to write between the tags */
 940          if ((info != NULL) && (info->saveEndTag != 0) &&
 941  	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
 942  	    (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { /* saveEndTag set and not html/body: end tag left out */
 943  	    xmlOutputBufferWriteString(buf, ">");
 944  	} else {
 945  	    xmlOutputBufferWriteString(buf, "></");
 946              if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 947                  xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
 948                  xmlOutputBufferWriteString(buf, ":");
 949              }
 950  	    xmlOutputBufferWriteString(buf, (const char *)cur->name);
 951  	    xmlOutputBufferWriteString(buf, ">");
 952  	}
 953  	if ((format) && (cur->next != NULL) &&
 954              (info != NULL) && (!info->isinline)) {
 955  	    if ((cur->next->type != HTML_TEXT_NODE) &&
 956  		(cur->next->type != HTML_ENTITY_REF_NODE) &&
 957  		(cur->parent != NULL) &&
 958  		(cur->parent->name != NULL) &&
 959  		(cur->parent->name[0] != 'p')) /* p, pre, param */
 960  		xmlOutputBufferWriteString(buf, "\n");
 961  	}
 962  	return;
 963      }
 964      xmlOutputBufferWriteString(buf, ">");
 965      if ((cur->type != XML_ELEMENT_NODE) &&
 966  	(cur->content != NULL)) {
 967  	    /*
 968  	     * Uses the OutputBuffer property to automatically convert
 969  	     * invalids to charrefs
 970  	     */
 971  
 972              xmlOutputBufferWriteString(buf, (const char *) cur->content);
 973      }
 974      if (cur->children != NULL) {
 975          if ((format) && (info != NULL) && (!info->isinline) &&
 976  	    (cur->children->type != HTML_TEXT_NODE) &&
 977  	    (cur->children->type != HTML_ENTITY_REF_NODE) &&
 978  	    (cur->children != cur->last) && /* more than one child */
 979  	    (cur->name != NULL) &&
 980  	    (cur->name[0] != 'p')) /* p, pre, param */
 981  	    xmlOutputBufferWriteString(buf, "\n");
 982  	htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format); /* recursion happens here */
 983          if ((format) && (info != NULL) && (!info->isinline) &&
 984  	    (cur->last->type != HTML_TEXT_NODE) &&
 985  	    (cur->last->type != HTML_ENTITY_REF_NODE) &&
 986  	    (cur->children != cur->last) && /* more than one child */
 987  	    (cur->name != NULL) &&
 988  	    (cur->name[0] != 'p')) /* p, pre, param */
 989  	    xmlOutputBufferWriteString(buf, "\n");
 990      }
 991      xmlOutputBufferWriteString(buf, "</"); /* end tag */
 992      if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
 993          xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
 994  	xmlOutputBufferWriteString(buf, ":");
 995      }
 996      xmlOutputBufferWriteString(buf, (const char *)cur->name);
 997      xmlOutputBufferWriteString(buf, ">");
 998      if ((format) && (info != NULL) && (!info->isinline) &&
 999  	(cur->next != NULL)) {
1000          if ((cur->next->type != HTML_TEXT_NODE) &&
1001  	    (cur->next->type != HTML_ENTITY_REF_NODE) &&
1002  	    (cur->parent != NULL) &&
1003  	    (cur->parent->name != NULL) &&
1004  	    (cur->parent->name[0] != 'p')) /* p, pre, param */
1005  	    xmlOutputBufferWriteString(buf, "\n");
1006      }
1007  }
1008  
1009  /**
1010   * htmlNodeDumpOutput:
1011   * @buf:  the HTML buffer output
1012   * @doc:  the document
1013   * @cur:  the current node
1014   * @encoding:  the encoding string
1015   *
1016   * Dump an HTML node, recursive behaviour,children are printed too,
1017   * and formatting returns/spaces are added.
1018   */
1019  void
1020  htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
1021  	           xmlNodePtr cur, const char *encoding) {
1022      htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1);
1023  }
1024  
1025  /**
1026   * htmlDocContentDumpFormatOutput:
1027   * @buf:  the HTML buffer output
1028   * @cur:  the document
1029   * @encoding:  the encoding string
1030   * @format:  should formatting spaces been added
1031   *
1032   * Dump an HTML document.
1033   */
1034  void
1035  htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1036  	                       const char *encoding, int format) {
1037      int type;
1038  
1039      xmlInitParser();
1040  
1041      if ((buf == NULL) || (cur == NULL))
1042          return;
1043  
1044      /*
1045       * force to output the stuff as HTML, especially for entities
1046       */
1047      type = cur->type;
1048      cur->type = XML_HTML_DOCUMENT_NODE;
1049      if (cur->intSubset != NULL) {
1050          htmlDtdDumpOutput(buf, cur, NULL);
1051      }
1052      if (cur->children != NULL) {
1053          htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format);
1054      }
1055      xmlOutputBufferWriteString(buf, "\n");
1056      cur->type = (xmlElementType) type;
1057  }
1058  
1059  /**
1060   * htmlDocContentDumpOutput:
1061   * @buf:  the HTML buffer output
1062   * @cur:  the document
1063   * @encoding:  the encoding string
1064   *
1065   * Dump an HTML document. Formating return/spaces are added.
1066   */
1067  void
1068  htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1069  	                 const char *encoding) {
1070      htmlDocContentDumpFormatOutput(buf, cur, encoding, 1);
1071  }
1072  
1073  /************************************************************************
1074   *									*
1075   *		Saving functions front-ends				*
1076   *									*
1077   ************************************************************************/
1078  
1079  /**
1080   * htmlDocDump:
1081   * @f:  the FILE*
1082   * @cur:  the document
1083   *
1084   * Dump an HTML document to an open FILE.
1085   *
1086   * returns: the number of byte written or -1 in case of failure.
1087   */
1088  int
1089  htmlDocDump(FILE *f, xmlDocPtr cur) {
1090      xmlOutputBufferPtr buf;
1091      xmlCharEncodingHandlerPtr handler = NULL;
1092      const char *encoding;
1093      int ret;
1094  
1095      xmlInitParser();
1096  
1097      if ((cur == NULL) || (f == NULL)) {
1098  	return(-1);
1099      }
1100  
1101      encoding = (const char *) htmlGetMetaEncoding(cur);
1102  
1103      if (encoding != NULL) {
1104  	xmlCharEncoding enc;
1105  
1106  	enc = xmlParseCharEncoding(encoding);
1107  	if (enc != cur->charset) {
1108  	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1109  		/*
1110  		 * Not supported yet
1111  		 */
1112  		return(-1);
1113  	    }
1114  
1115  	    handler = xmlFindCharEncodingHandler(encoding);
1116  	    if (handler == NULL)
1117  		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1118  	} else {
1119  	    handler = xmlFindCharEncodingHandler(encoding);
1120  	}
1121      }
1122  
1123      /*
1124       * Fallback to HTML or ASCII when the encoding is unspecified
1125       */
1126      if (handler == NULL)
1127  	handler = xmlFindCharEncodingHandler("HTML");
1128      if (handler == NULL)
1129  	handler = xmlFindCharEncodingHandler("ascii");
1130  
1131      buf = xmlOutputBufferCreateFile(f, handler);
1132      if (buf == NULL) return(-1);
1133      htmlDocContentDumpOutput(buf, cur, NULL);
1134  
1135      ret = xmlOutputBufferClose(buf);
1136      return(ret);
1137  }
1138  
1139  /**
1140   * htmlSaveFile:
1141   * @filename:  the filename (or URL)
1142   * @cur:  the document
1143   *
1144   * Dump an HTML document to a file. If @filename is "-" the stdout file is
1145   * used.
1146   * returns: the number of byte written or -1 in case of failure.
1147   */
1148  int
1149  htmlSaveFile(const char *filename, xmlDocPtr cur) {
1150      xmlOutputBufferPtr buf;
1151      xmlCharEncodingHandlerPtr handler = NULL;
1152      const char *encoding;
1153      int ret;
1154  
1155      if ((cur == NULL) || (filename == NULL))
1156          return(-1);
1157  
1158      xmlInitParser();
1159  
1160      encoding = (const char *) htmlGetMetaEncoding(cur);
1161  
1162      if (encoding != NULL) {
1163  	xmlCharEncoding enc;
1164  
1165  	enc = xmlParseCharEncoding(encoding);
1166  	if (enc != cur->charset) {
1167  	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1168  		/*
1169  		 * Not supported yet
1170  		 */
1171  		return(-1);
1172  	    }
1173  
1174  	    handler = xmlFindCharEncodingHandler(encoding);
1175  	    if (handler == NULL)
1176  		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1177  	}
1178      }
1179  
1180      /*
1181       * Fallback to HTML or ASCII when the encoding is unspecified
1182       */
1183      if (handler == NULL)
1184  	handler = xmlFindCharEncodingHandler("HTML");
1185      if (handler == NULL)
1186  	handler = xmlFindCharEncodingHandler("ascii");
1187  
1188      /*
1189       * save the content to a temp buffer.
1190       */
1191      buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1192      if (buf == NULL) return(0);
1193  
1194      htmlDocContentDumpOutput(buf, cur, NULL);
1195  
1196      ret = xmlOutputBufferClose(buf);
1197      return(ret);
1198  }
1199  
1200  /**
1201   * htmlSaveFileFormat:
1202   * @filename:  the filename
1203   * @cur:  the document
1204   * @format:  should formatting spaces been added
1205   * @encoding: the document encoding
1206   *
1207   * Dump an HTML document to a file using a given encoding.
1208   *
1209   * returns: the number of byte written or -1 in case of failure.
1210   */
1211  int
1212  htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1213  	           const char *encoding, int format) {
1214      xmlOutputBufferPtr buf;
1215      xmlCharEncodingHandlerPtr handler = NULL;
1216      int ret;
1217  
1218      if ((cur == NULL) || (filename == NULL))
1219          return(-1);
1220  
1221      xmlInitParser();
1222  
1223      if (encoding != NULL) {
1224  	xmlCharEncoding enc;
1225  
1226  	enc = xmlParseCharEncoding(encoding);
1227  	if (enc != cur->charset) {
1228  	    if (cur->charset != XML_CHAR_ENCODING_UTF8) {
1229  		/*
1230  		 * Not supported yet
1231  		 */
1232  		return(-1);
1233  	    }
1234  
1235  	    handler = xmlFindCharEncodingHandler(encoding);
1236  	    if (handler == NULL)
1237  		htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1238  	}
1239          htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1240      } else {
1241  	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1242      }
1243  
1244      /*
1245       * Fallback to HTML or ASCII when the encoding is unspecified
1246       */
1247      if (handler == NULL)
1248  	handler = xmlFindCharEncodingHandler("HTML");
1249      if (handler == NULL)
1250  	handler = xmlFindCharEncodingHandler("ascii");
1251  
1252      /*
1253       * save the content to a temp buffer.
1254       */
1255      buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1256      if (buf == NULL) return(0);
1257  
1258      htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1259  
1260      ret = xmlOutputBufferClose(buf);
1261      return(ret);
1262  }
1263  
1264  /**
1265   * htmlSaveFileEnc:
1266   * @filename:  the filename
1267   * @cur:  the document
1268   * @encoding: the document encoding
1269   *
1270   * Dump an HTML document to a file using a given encoding
1271   * and formatting returns/spaces are added.
1272   *
1273   * returns: the number of byte written or -1 in case of failure.
1274   */
1275  int
1276  htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) {
1277      return(htmlSaveFileFormat(filename, cur, encoding, 1));
1278  }
1279  
1280  #endif /* LIBXML_OUTPUT_ENABLED */
1281  
1282  #define bottom_HTMLtree
1283  #include "elfgcchack.h"
1284  #endif /* LIBXML_HTML_ENABLED */