/ libxml2 / xmlstring.c
xmlstring.c
/*
 * xmlstring.c : an XML string utilities module
 *
 * This module provides various utility functions for manipulating
 * the xmlChar* type. All functions named xmlStr* have been moved here
 * from the parser.c file (their original home).
 *
 * See Copyright for the status of this software.
 *
 * UTF8 string routines from:
 * William Brack <wbrack@mmm.com.hk>
 *
 * daniel@veillard.com
 */
  15  
#define IN_LIBXML
#include "libxml.h"

#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <libxml/xmlmemory.h>
#include <libxml/parserInternals.h>
#include <libxml/xmlstring.h>
  24  
  25  /************************************************************************
  26   *                                                                      *
  27   *                Commodity functions to handle xmlChars                *
  28   *                                                                      *
  29   ************************************************************************/
  30  
  31  /**
  32   * xmlStrndup:
  33   * @cur:  the input xmlChar *
  34   * @len:  the len of @cur
  35   *
  36   * a strndup for array of xmlChar's
  37   *
  38   * Returns a new xmlChar * or NULL
  39   */
  40  xmlChar *
  41  xmlStrndup(const xmlChar *cur, int len) {
  42      xmlChar *ret;
  43  
  44      if ((cur == NULL) || (len < 0)) return(NULL);
  45      ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
  46      if (ret == NULL) {
  47          xmlErrMemory(NULL, NULL);
  48          return(NULL);
  49      }
  50      memcpy(ret, cur, len * sizeof(xmlChar));
  51      ret[len] = 0;
  52      return(ret);
  53  }
  54  
  55  /**
  56   * xmlStrdup:
  57   * @cur:  the input xmlChar *
  58   *
  59   * a strdup for array of xmlChar's. Since they are supposed to be
  60   * encoded in UTF-8 or an encoding with 8bit based chars, we assume
  61   * a termination mark of '0'.
  62   *
  63   * Returns a new xmlChar * or NULL
  64   */
  65  xmlChar *
  66  xmlStrdup(const xmlChar *cur) {
  67      const xmlChar *p = cur;
  68  
  69      if (cur == NULL) return(NULL);
  70      while (*p != 0) p++; /* non input consuming */
  71      return(xmlStrndup(cur, p - cur));
  72  }
  73  
  74  /**
  75   * xmlCharStrndup:
  76   * @cur:  the input char *
  77   * @len:  the len of @cur
  78   *
  79   * a strndup for char's to xmlChar's
  80   *
  81   * Returns a new xmlChar * or NULL
  82   */
  83  
  84  xmlChar *
  85  xmlCharStrndup(const char *cur, int len) {
  86      int i;
  87      xmlChar *ret;
  88  
  89      if ((cur == NULL) || (len < 0)) return(NULL);
  90      ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
  91      if (ret == NULL) {
  92          xmlErrMemory(NULL, NULL);
  93          return(NULL);
  94      }
  95      for (i = 0;i < len;i++) {
  96          ret[i] = (xmlChar) cur[i];
  97          if (ret[i] == 0) return(ret);
  98      }
  99      ret[len] = 0;
 100      return(ret);
 101  }
 102  
 103  /**
 104   * xmlCharStrdup:
 105   * @cur:  the input char *
 106   *
 107   * a strdup for char's to xmlChar's
 108   *
 109   * Returns a new xmlChar * or NULL
 110   */
 111  
 112  xmlChar *
 113  xmlCharStrdup(const char *cur) {
 114      const char *p = cur;
 115  
 116      if (cur == NULL) return(NULL);
 117      while (*p != '\0') p++; /* non input consuming */
 118      return(xmlCharStrndup(cur, p - cur));
 119  }
 120  
 121  /**
 122   * xmlStrcmp:
 123   * @str1:  the first xmlChar *
 124   * @str2:  the second xmlChar *
 125   *
 126   * a strcmp for xmlChar's
 127   *
 128   * Returns the integer result of the comparison
 129   */
 130  
 131  int
 132  xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
 133      register int tmp;
 134  
 135      if (str1 == str2) return(0);
 136      if (str1 == NULL) return(-1);
 137      if (str2 == NULL) return(1);
 138      do {
 139          tmp = *str1++ - *str2;
 140          if (tmp != 0) return(tmp);
 141      } while (*str2++ != 0);
 142      return 0;
 143  }
 144  
 145  /**
 146   * xmlStrEqual:
 147   * @str1:  the first xmlChar *
 148   * @str2:  the second xmlChar *
 149   *
 150   * Check if both strings are equal of have same content.
 151   * Should be a bit more readable and faster than xmlStrcmp()
 152   *
 153   * Returns 1 if they are equal, 0 if they are different
 154   */
 155  
 156  int
 157  xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
 158      if (str1 == str2) return(1);
 159      if (str1 == NULL) return(0);
 160      if (str2 == NULL) return(0);
 161      do {
 162          if (*str1++ != *str2) return(0);
 163      } while (*str2++);
 164      return(1);
 165  }
 166  
 167  /**
 168   * xmlStrQEqual:
 169   * @pref:  the prefix of the QName
 170   * @name:  the localname of the QName
 171   * @str:  the second xmlChar *
 172   *
 173   * Check if a QName is Equal to a given string
 174   *
 175   * Returns 1 if they are equal, 0 if they are different
 176   */
 177  
 178  int
 179  xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
 180      if (pref == NULL) return(xmlStrEqual(name, str));
 181      if (name == NULL) return(0);
 182      if (str == NULL) return(0);
 183  
 184      do {
 185          if (*pref++ != *str) return(0);
 186      } while ((*str++) && (*pref));
 187      if (*str++ != ':') return(0);
 188      do {
 189          if (*name++ != *str) return(0);
 190      } while (*str++);
 191      return(1);
 192  }
 193  
 194  /**
 195   * xmlStrncmp:
 196   * @str1:  the first xmlChar *
 197   * @str2:  the second xmlChar *
 198   * @len:  the max comparison length
 199   *
 200   * a strncmp for xmlChar's
 201   *
 202   * Returns the integer result of the comparison
 203   */
 204  
 205  int
 206  xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
 207      register int tmp;
 208  
 209      if (len <= 0) return(0);
 210      if (str1 == str2) return(0);
 211      if (str1 == NULL) return(-1);
 212      if (str2 == NULL) return(1);
 213  #ifdef __GNUC__
 214      tmp = strncmp((const char *)str1, (const char *)str2, len);
 215      return tmp;
 216  #else
 217      do {
 218          tmp = *str1++ - *str2;
 219          if (tmp != 0 || --len == 0) return(tmp);
 220      } while (*str2++ != 0);
 221      return 0;
 222  #endif
 223  }
 224  
 225  static const xmlChar casemap[256] = {
 226      0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
 227      0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
 228      0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
 229      0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
 230      0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
 231      0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
 232      0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
 233      0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
 234      0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
 235      0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
 236      0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
 237      0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
 238      0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
 239      0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
 240      0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
 241      0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
 242      0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
 243      0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
 244      0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
 245      0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
 246      0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
 247      0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
 248      0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
 249      0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
 250      0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
 251      0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
 252      0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
 253      0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
 254      0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
 255      0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
 256      0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
 257      0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
 258  };
 259  
 260  /**
 261   * xmlStrcasecmp:
 262   * @str1:  the first xmlChar *
 263   * @str2:  the second xmlChar *
 264   *
 265   * a strcasecmp for xmlChar's
 266   *
 267   * Returns the integer result of the comparison
 268   */
 269  
 270  int
 271  xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
 272      register int tmp;
 273  
 274      if (str1 == str2) return(0);
 275      if (str1 == NULL) return(-1);
 276      if (str2 == NULL) return(1);
 277      do {
 278          tmp = casemap[*str1++] - casemap[*str2];
 279          if (tmp != 0) return(tmp);
 280      } while (*str2++ != 0);
 281      return 0;
 282  }
 283  
 284  /**
 285   * xmlStrncasecmp:
 286   * @str1:  the first xmlChar *
 287   * @str2:  the second xmlChar *
 288   * @len:  the max comparison length
 289   *
 290   * a strncasecmp for xmlChar's
 291   *
 292   * Returns the integer result of the comparison
 293   */
 294  
 295  int
 296  xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
 297      register int tmp;
 298  
 299      if (len <= 0) return(0);
 300      if (str1 == str2) return(0);
 301      if (str1 == NULL) return(-1);
 302      if (str2 == NULL) return(1);
 303      do {
 304          tmp = casemap[*str1++] - casemap[*str2];
 305          if (tmp != 0 || --len == 0) return(tmp);
 306      } while (*str2++ != 0);
 307      return 0;
 308  }
 309  
 310  /**
 311   * xmlStrchr:
 312   * @str:  the xmlChar * array
 313   * @val:  the xmlChar to search
 314   *
 315   * a strchr for xmlChar's
 316   *
 317   * Returns the xmlChar * for the first occurrence or NULL.
 318   */
 319  
 320  const xmlChar *
 321  xmlStrchr(const xmlChar *str, xmlChar val) {
 322      if (str == NULL) return(NULL);
 323      while (*str != 0) { /* non input consuming */
 324          if (*str == val) return((xmlChar *) str);
 325          str++;
 326      }
 327      return(NULL);
 328  }
 329  
 330  /**
 331   * xmlStrstr:
 332   * @str:  the xmlChar * array (haystack)
 333   * @val:  the xmlChar to search (needle)
 334   *
 335   * a strstr for xmlChar's
 336   *
 337   * Returns the xmlChar * for the first occurrence or NULL.
 338   */
 339  
 340  const xmlChar *
 341  xmlStrstr(const xmlChar *str, const xmlChar *val) {
 342      int n;
 343  
 344      if (str == NULL) return(NULL);
 345      if (val == NULL) return(NULL);
 346      n = xmlStrlen(val);
 347  
 348      if (n == 0) return(str);
 349      while (*str != 0) { /* non input consuming */
 350          if (*str == *val) {
 351              if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
 352          }
 353          str++;
 354      }
 355      return(NULL);
 356  }
 357  
 358  /**
 359   * xmlStrcasestr:
 360   * @str:  the xmlChar * array (haystack)
 361   * @val:  the xmlChar to search (needle)
 362   *
 363   * a case-ignoring strstr for xmlChar's
 364   *
 365   * Returns the xmlChar * for the first occurrence or NULL.
 366   */
 367  
 368  const xmlChar *
 369  xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
 370      int n;
 371  
 372      if (str == NULL) return(NULL);
 373      if (val == NULL) return(NULL);
 374      n = xmlStrlen(val);
 375  
 376      if (n == 0) return(str);
 377      while (*str != 0) { /* non input consuming */
 378          if (casemap[*str] == casemap[*val])
 379              if (!xmlStrncasecmp(str, val, n)) return(str);
 380          str++;
 381      }
 382      return(NULL);
 383  }
 384  
 385  /**
 386   * xmlStrsub:
 387   * @str:  the xmlChar * array (haystack)
 388   * @start:  the index of the first char (zero based)
 389   * @len:  the length of the substring
 390   *
 391   * Extract a substring of a given string
 392   *
 393   * Returns the xmlChar * for the first occurrence or NULL.
 394   */
 395  
 396  xmlChar *
 397  xmlStrsub(const xmlChar *str, int start, int len) {
 398      int i;
 399  
 400      if (str == NULL) return(NULL);
 401      if (start < 0) return(NULL);
 402      if (len < 0) return(NULL);
 403  
 404      for (i = 0;i < start;i++) {
 405          if (*str == 0) return(NULL);
 406          str++;
 407      }
 408      if (*str == 0) return(NULL);
 409      return(xmlStrndup(str, len));
 410  }
 411  
 412  /**
 413   * xmlStrlen:
 414   * @str:  the xmlChar * array
 415   *
 416   * length of a xmlChar's string
 417   *
 418   * Returns the number of xmlChar contained in the ARRAY.
 419   */
 420  
 421  int
 422  xmlStrlen(const xmlChar *str) {
 423      int len = 0;
 424  
 425      if (str == NULL) return(0);
 426      while (*str != 0) { /* non input consuming */
 427          str++;
 428          len++;
 429      }
 430      return(len);
 431  }
 432  
 433  /**
 434   * xmlStrncat:
 435   * @cur:  the original xmlChar * array
 436   * @add:  the xmlChar * array added
 437   * @len:  the length of @add
 438   *
 439   * a strncat for array of xmlChar's, it will extend @cur with the len
 440   * first bytes of @add. Note that if @len < 0 then this is an API error
 441   * and NULL will be returned.
 442   *
 443   * Returns a new xmlChar *, the original @cur is reallocated if needed
 444   * and should not be freed
 445   */
 446  
 447  xmlChar *
 448  xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
 449      int size;
 450      xmlChar *ret;
 451  
 452      if ((add == NULL) || (len == 0))
 453          return(cur);
 454      if (len < 0)
 455  	return(NULL);
 456      if (cur == NULL)
 457          return(xmlStrndup(add, len));
 458  
 459      size = xmlStrlen(cur);
 460      if (size < 0)
 461          return(NULL);
 462      ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
 463      if (ret == NULL) {
 464          xmlErrMemory(NULL, NULL);
 465          return(cur);
 466      }
 467      memcpy(&ret[size], add, len * sizeof(xmlChar));
 468      ret[size + len] = 0;
 469      return(ret);
 470  }
 471  
 472  /**
 473   * xmlStrncatNew:
 474   * @str1:  first xmlChar string
 475   * @str2:  second xmlChar string
 476   * @len:  the len of @str2 or < 0
 477   *
 478   * same as xmlStrncat, but creates a new string.  The original
 479   * two strings are not freed. If @len is < 0 then the length
 480   * will be calculated automatically.
 481   *
 482   * Returns a new xmlChar * or NULL
 483   */
 484  xmlChar *
 485  xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
 486      int size;
 487      xmlChar *ret;
 488  
 489      if (len < 0) {
 490          len = xmlStrlen(str2);
 491          if (len < 0)
 492              return(NULL);
 493      }
 494      if ((str2 == NULL) || (len == 0))
 495          return(xmlStrdup(str1));
 496      if (str1 == NULL)
 497          return(xmlStrndup(str2, len));
 498  
 499      size = xmlStrlen(str1);
 500      if (size < 0)
 501          return(NULL);
 502      ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
 503      if (ret == NULL) {
 504          xmlErrMemory(NULL, NULL);
 505          return(xmlStrndup(str1, size));
 506      }
 507      memcpy(ret, str1, size * sizeof(xmlChar));
 508      memcpy(&ret[size], str2, len * sizeof(xmlChar));
 509      ret[size + len] = 0;
 510      return(ret);
 511  }
 512  
 513  /**
 514   * xmlStrcat:
 515   * @cur:  the original xmlChar * array
 516   * @add:  the xmlChar * array added
 517   *
 518   * a strcat for array of xmlChar's. Since they are supposed to be
 519   * encoded in UTF-8 or an encoding with 8bit based chars, we assume
 520   * a termination mark of '0'.
 521   *
 522   * Returns a new xmlChar * containing the concatenated string.
 523   */
 524  xmlChar *
 525  xmlStrcat(xmlChar *cur, const xmlChar *add) {
 526      const xmlChar *p = add;
 527  
 528      if (add == NULL) return(cur);
 529      if (cur == NULL)
 530          return(xmlStrdup(add));
 531  
 532      while (*p != 0) p++; /* non input consuming */
 533      return(xmlStrncat(cur, add, p - add));
 534  }
 535  
 536  /**
 537   * xmlStrPrintf:
 538   * @buf:   the result buffer.
 539   * @len:   the result buffer length.
 540   * @msg:   the message with printf formatting.
 541   * @...:   extra parameters for the message.
 542   *
 543   * Formats @msg and places result into @buf.
 544   *
 545   * Returns the number of characters written to @buf or -1 if an error occurs.
 546   */
 547  int XMLCDECL
 548  xmlStrPrintf(xmlChar *buf, int len, const char *msg, ...) {
 549      va_list args;
 550      int ret;
 551  
 552      if((buf == NULL) || (msg == NULL)) {
 553          return(-1);
 554      }
 555  
 556      va_start(args, msg);
 557      ret = vsnprintf((char *) buf, len, (const char *) msg, args);
 558      va_end(args);
 559      buf[len - 1] = 0; /* be safe ! */
 560  
 561      return(ret);
 562  }
 563  
 564  /**
 565   * xmlStrVPrintf:
 566   * @buf:   the result buffer.
 567   * @len:   the result buffer length.
 568   * @msg:   the message with printf formatting.
 569   * @ap:    extra parameters for the message.
 570   *
 571   * Formats @msg and places result into @buf.
 572   *
 573   * Returns the number of characters written to @buf or -1 if an error occurs.
 574   */
 575  int
 576  xmlStrVPrintf(xmlChar *buf, int len, const char *msg, va_list ap) {
 577      int ret;
 578  
 579      if((buf == NULL) || (msg == NULL)) {
 580          return(-1);
 581      }
 582  
 583      ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
 584      buf[len - 1] = 0; /* be safe ! */
 585  
 586      return(ret);
 587  }
 588  
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * The original note here hoped values > 0xFFFF would not be needed     *
 * anytime soon; the routines below do decode 4-byte sequences.         *
 *                                                                      *
 ************************************************************************/
 603  
 604  
 605  /**
 606   * xmlUTF8Size:
 607   * @utf: pointer to the UTF8 character
 608   *
 609   * calculates the internal size of a UTF8 character
 610   *
 611   * returns the numbers of bytes in the character, -1 on format error
 612   */
 613  int
 614  xmlUTF8Size(const xmlChar *utf) {
 615      xmlChar mask;
 616      int len;
 617  
 618      if (utf == NULL)
 619          return -1;
 620      if (*utf < 0x80)
 621          return 1;
 622      /* check valid UTF8 character */
 623      if (!(*utf & 0x40))
 624          return -1;
 625      /* determine number of bytes in char */
 626      len = 2;
 627      for (mask=0x20; mask != 0; mask>>=1) {
 628          if (!(*utf & mask))
 629              return len;
 630          len++;
 631      }
 632      return -1;
 633  }
 634  
 635  /**
 636   * xmlUTF8Charcmp:
 637   * @utf1: pointer to first UTF8 char
 638   * @utf2: pointer to second UTF8 char
 639   *
 640   * compares the two UCS4 values
 641   *
 642   * returns result of the compare as with xmlStrncmp
 643   */
 644  int
 645  xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
 646  
 647      if (utf1 == NULL ) {
 648          if (utf2 == NULL)
 649              return 0;
 650          return -1;
 651      }
 652      return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
 653  }
 654  
 655  /**
 656   * xmlUTF8Strlen:
 657   * @utf:  a sequence of UTF-8 encoded bytes
 658   *
 659   * compute the length of an UTF8 string, it doesn't do a full UTF8
 660   * checking of the content of the string.
 661   *
 662   * Returns the number of characters in the string or -1 in case of error
 663   */
 664  int
 665  xmlUTF8Strlen(const xmlChar *utf) {
 666      int ret = 0;
 667  
 668      if (utf == NULL)
 669          return(-1);
 670  
 671      while (*utf != 0) {
 672          if (utf[0] & 0x80) {
 673              if ((utf[1] & 0xc0) != 0x80)
 674                  return(-1);
 675              if ((utf[0] & 0xe0) == 0xe0) {
 676                  if ((utf[2] & 0xc0) != 0x80)
 677                      return(-1);
 678                  if ((utf[0] & 0xf0) == 0xf0) {
 679                      if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
 680                          return(-1);
 681                      utf += 4;
 682                  } else {
 683                      utf += 3;
 684                  }
 685              } else {
 686                  utf += 2;
 687              }
 688          } else {
 689              utf++;
 690          }
 691          ret++;
 692      }
 693      return(ret);
 694  }
 695  
 696  /**
 697   * xmlGetUTF8Char:
 698   * @utf:  a sequence of UTF-8 encoded bytes
 699   * @len:  a pointer to the minimum number of bytes present in
 700   *        the sequence.  This is used to assure the next character
 701   *        is completely contained within the sequence.
 702   *
 703   * Read the first UTF8 character from @utf
 704   *
 705   * Returns the char value or -1 in case of error, and sets *len to
 706   *        the actual number of bytes consumed (0 in case of error)
 707   */
 708  int
 709  xmlGetUTF8Char(const unsigned char *utf, int *len) {
 710      unsigned int c;
 711  
 712      if (utf == NULL)
 713          goto error;
 714      if (len == NULL)
 715          goto error;
 716      if (*len < 1)
 717          goto error;
 718  
 719      c = utf[0];
 720      if (c & 0x80) {
 721          if (*len < 2)
 722              goto error;
 723          if ((utf[1] & 0xc0) != 0x80)
 724              goto error;
 725          if ((c & 0xe0) == 0xe0) {
 726              if (*len < 3)
 727                  goto error;
 728              if ((utf[2] & 0xc0) != 0x80)
 729                  goto error;
 730              if ((c & 0xf0) == 0xf0) {
 731                  if (*len < 4)
 732                      goto error;
 733                  if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
 734                      goto error;
 735                  *len = 4;
 736                  /* 4-byte code */
 737                  c = (utf[0] & 0x7) << 18;
 738                  c |= (utf[1] & 0x3f) << 12;
 739                  c |= (utf[2] & 0x3f) << 6;
 740                  c |= utf[3] & 0x3f;
 741              } else {
 742                /* 3-byte code */
 743                  *len = 3;
 744                  c = (utf[0] & 0xf) << 12;
 745                  c |= (utf[1] & 0x3f) << 6;
 746                  c |= utf[2] & 0x3f;
 747              }
 748          } else {
 749            /* 2-byte code */
 750              *len = 2;
 751              c = (utf[0] & 0x1f) << 6;
 752              c |= utf[1] & 0x3f;
 753          }
 754      } else {
 755          /* 1-byte code */
 756          *len = 1;
 757      }
 758      return(c);
 759  
 760  error:
 761      if (len != NULL)
 762  	*len = 0;
 763      return(-1);
 764  }
 765  
 766  /**
 767   * xmlCheckUTF8:
 768   * @utf: Pointer to putative UTF-8 encoded string.
 769   *
 770   * Checks @utf for being valid UTF-8. @utf is assumed to be
 771   * null-terminated. This function is not super-strict, as it will
 772   * allow longer UTF-8 sequences than necessary. Note that Java is
 773   * capable of producing these sequences if provoked. Also note, this
 774   * routine checks for the 4-byte maximum size, but does not check for
 775   * 0x10ffff maximum value.
 776   *
 777   * Return value: true if @utf is valid.
 778   **/
 779  int
 780  xmlCheckUTF8(const unsigned char *utf)
 781  {
 782      int ix;
 783      unsigned char c;
 784  
 785      if (utf == NULL)
 786          return(0);
 787      /*
 788       * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
 789       * are as follows (in "bit format"):
 790       *    0xxxxxxx                                      valid 1-byte
 791       *    110xxxxx 10xxxxxx                             valid 2-byte
 792       *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
 793       *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
 794       */
 795      for (ix = 0; (c = utf[ix]);) {      /* string is 0-terminated */
 796          if ((c & 0x80) == 0x00) {	/* 1-byte code, starts with 10 */
 797              ix++;
 798  	} else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
 799  	    if ((utf[ix+1] & 0xc0 ) != 0x80)
 800  	        return 0;
 801  	    ix += 2;
 802  	} else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
 803  	    if (((utf[ix+1] & 0xc0) != 0x80) ||
 804  	        ((utf[ix+2] & 0xc0) != 0x80))
 805  		    return 0;
 806  	    ix += 3;
 807  	} else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
 808  	    if (((utf[ix+1] & 0xc0) != 0x80) ||
 809  	        ((utf[ix+2] & 0xc0) != 0x80) ||
 810  		((utf[ix+3] & 0xc0) != 0x80))
 811  		    return 0;
 812  	    ix += 4;
 813  	} else				/* unknown encoding */
 814  	    return 0;
 815        }
 816        return(1);
 817  }
 818  
 819  /**
 820   * xmlUTF8Strsize:
 821   * @utf:  a sequence of UTF-8 encoded bytes
 822   * @len:  the number of characters in the array
 823   *
 824   * storage size of an UTF8 string
 825   * the behaviour is not guaranteed if the input string is not UTF-8
 826   *
 827   * Returns the storage size of
 828   * the first 'len' characters of ARRAY
 829   */
 830  
 831  int
 832  xmlUTF8Strsize(const xmlChar *utf, int len) {
 833      const xmlChar   *ptr=utf;
 834      xmlChar         ch;
 835  
 836      if (utf == NULL)
 837          return(0);
 838  
 839      if (len <= 0)
 840          return(0);
 841  
 842      while ( len-- > 0) {
 843          if ( !*ptr )
 844              break;
 845          if ( (ch = *ptr++) & 0x80)
 846              while ((ch<<=1) & 0x80 ) {
 847  		if (*ptr == 0) break;
 848                  ptr++;
 849  	    }
 850      }
 851      return (ptr - utf);
 852  }
 853  
 854  
 855  /**
 856   * xmlUTF8Strndup:
 857   * @utf:  the input UTF8 *
 858   * @len:  the len of @utf (in chars)
 859   *
 860   * a strndup for array of UTF8's
 861   *
 862   * Returns a new UTF8 * or NULL
 863   */
 864  xmlChar *
 865  xmlUTF8Strndup(const xmlChar *utf, int len) {
 866      xmlChar *ret;
 867      int i;
 868  
 869      if ((utf == NULL) || (len < 0)) return(NULL);
 870      i = xmlUTF8Strsize(utf, len);
 871      ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
 872      if (ret == NULL) {
 873          xmlGenericError(xmlGenericErrorContext,
 874                  "malloc of %ld byte failed\n",
 875                  (len + 1) * (long)sizeof(xmlChar));
 876          return(NULL);
 877      }
 878      memcpy(ret, utf, i * sizeof(xmlChar));
 879      ret[i] = 0;
 880      return(ret);
 881  }
 882  
 883  /**
 884   * xmlUTF8Strpos:
 885   * @utf:  the input UTF8 *
 886   * @pos:  the position of the desired UTF8 char (in chars)
 887   *
 888   * a function to provide the equivalent of fetching a
 889   * character from a string array
 890   *
 891   * Returns a pointer to the UTF8 character or NULL
 892   */
 893  const xmlChar *
 894  xmlUTF8Strpos(const xmlChar *utf, int pos) {
 895      xmlChar ch;
 896  
 897      if (utf == NULL) return(NULL);
 898      if (pos < 0)
 899          return(NULL);
 900      while (pos--) {
 901          if ((ch=*utf++) == 0) return(NULL);
 902          if ( ch & 0x80 ) {
 903              /* if not simple ascii, verify proper format */
 904              if ( (ch & 0xc0) != 0xc0 )
 905                  return(NULL);
 906              /* then skip over remaining bytes for this char */
 907              while ( (ch <<= 1) & 0x80 )
 908                  if ( (*utf++ & 0xc0) != 0x80 )
 909                      return(NULL);
 910          }
 911      }
 912      return((xmlChar *)utf);
 913  }
 914  
 915  /**
 916   * xmlUTF8Strloc:
 917   * @utf:  the input UTF8 *
 918   * @utfchar:  the UTF8 character to be found
 919   *
 920   * a function to provide the relative location of a UTF8 char
 921   *
 922   * Returns the relative character position of the desired char
 923   * or -1 if not found
 924   */
 925  int
 926  xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
 927      int i, size;
 928      xmlChar ch;
 929  
 930      if (utf==NULL || utfchar==NULL) return -1;
 931      size = xmlUTF8Strsize(utfchar, 1);
 932          for(i=0; (ch=*utf) != 0; i++) {
 933              if (xmlStrncmp(utf, utfchar, size)==0)
 934                  return(i);
 935              utf++;
 936              if ( ch & 0x80 ) {
 937                  /* if not simple ascii, verify proper format */
 938                  if ( (ch & 0xc0) != 0xc0 )
 939                      return(-1);
 940                  /* then skip over remaining bytes for this char */
 941                  while ( (ch <<= 1) & 0x80 )
 942                      if ( (*utf++ & 0xc0) != 0x80 )
 943                          return(-1);
 944              }
 945          }
 946  
 947      return(-1);
 948  }
 949  /**
 950   * xmlUTF8Strsub:
 951   * @utf:  a sequence of UTF-8 encoded bytes
 952   * @start: relative pos of first char
 953   * @len:   total number to copy
 954   *
 955   * Create a substring from a given UTF-8 string
 956   * Note:  positions are given in units of UTF-8 chars
 957   *
 958   * Returns a pointer to a newly created string
 959   * or NULL if any problem
 960   */
 961  
 962  xmlChar *
 963  xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
 964      int            i;
 965      xmlChar ch;
 966  
 967      if (utf == NULL) return(NULL);
 968      if (start < 0) return(NULL);
 969      if (len < 0) return(NULL);
 970  
 971      /*
 972       * Skip over any leading chars
 973       */
 974      for (i = 0;i < start;i++) {
 975          if ((ch=*utf++) == 0) return(NULL);
 976          if ( ch & 0x80 ) {
 977              /* if not simple ascii, verify proper format */
 978              if ( (ch & 0xc0) != 0xc0 )
 979                  return(NULL);
 980              /* then skip over remaining bytes for this char */
 981              while ( (ch <<= 1) & 0x80 )
 982                  if ( (*utf++ & 0xc0) != 0x80 )
 983                      return(NULL);
 984          }
 985      }
 986  
 987      return(xmlUTF8Strndup(utf, len));
 988  }
 989  
 990  /**
 991   * xmlEscapeFormatString:
 992   * @msg:  a pointer to the string in which to escape '%' characters.
 993   * Must be a heap-allocated buffer created by libxml2 that may be
 994   * returned, or that may be freed and replaced.
 995   *
 996   * Replaces the string pointed to by 'msg' with an escaped string.
 997   * Returns the same string with all '%' characters escaped.
 998   */
 999  xmlChar *
1000  xmlEscapeFormatString(xmlChar **msg)
1001  {
1002      xmlChar *msgPtr = NULL;
1003      xmlChar *result = NULL;
1004      xmlChar *resultPtr = NULL;
1005      size_t count = 0;
1006      size_t msgLen = 0;
1007      size_t resultLen = 0;
1008  
1009      if (!msg || !*msg)
1010          return(NULL);
1011  
1012      for (msgPtr = *msg; *msgPtr != '\0'; ++msgPtr) {
1013          ++msgLen;
1014          if (*msgPtr == '%')
1015              ++count;
1016      }
1017  
1018      if (count == 0)
1019          return(*msg);
1020  
1021      resultLen = msgLen + count + 1;
1022      result = (xmlChar *) xmlMallocAtomic(resultLen * sizeof(xmlChar));
1023      if (result == NULL) {
1024          /* Clear *msg to prevent format string vulnerabilities in
1025             out-of-memory situations. */
1026          xmlFree(*msg);
1027          *msg = NULL;
1028          xmlErrMemory(NULL, NULL);
1029          return(NULL);
1030      }
1031  
1032      for (msgPtr = *msg, resultPtr = result; *msgPtr != '\0'; ++msgPtr, ++resultPtr) {
1033          *resultPtr = *msgPtr;
1034          if (*msgPtr == '%')
1035              *(++resultPtr) = '%';
1036      }
1037      result[resultLen - 1] = '\0';
1038  
1039      xmlFree(*msg);
1040      *msg = result;
1041  
1042      return *msg;
1043  }
1044  
1045  #define bottom_xmlstring
1046  #include "elfgcchack.h"