/ CFXMLInputStream.c
CFXMLInputStream.c
  1  /*
  2   * Copyright (c) 2015 Apple Inc. All rights reserved.
  3   *
  4   * @APPLE_LICENSE_HEADER_START@
  5   *
  6   * This file contains Original Code and/or Modifications of Original Code
  7   * as defined in and that are subject to the Apple Public Source License
  8   * Version 2.0 (the 'License'). You may not use this file except in
  9   * compliance with the License. Please obtain a copy of the License at
 10   * http://www.opensource.apple.com/apsl/ and read it before using this
 11   * file.
 12   *
 13   * The Original Code and all software distributed under the License are
 14   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 15   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 16   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 17   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 18   * Please see the License for the specific language governing rights and
 19   * limitations under the License.
 20   *
 21   * @APPLE_LICENSE_HEADER_END@
 22   */
 23  
 24  /*	CFXMLInputStream.c
 25  	Copyright (c) 1999-2014, Apple Inc. All rights reserved.
 26  	Responsibility: David Smith
 27  */
 28  
 29  #include "CFXMLInputStream.h"
 30  #include <CoreFoundation/CFCharacterSet.h>
 31  #include <string.h>
 32  #include "CFStringEncodingConverter.h"
 33  #include "CFUniChar.h"
 34  
 35  /* Utility functions used in parsing */
 36  static Boolean determineEncoding(_CFXMLInputStream *stream) {
 37      const uint8_t *bytes = (uint8_t *)CFDataGetBytePtr(stream->data);
 38      UInt32 length = CFDataGetLength(stream->data);
 39      const uint8_t *idx = 0L, *end = 0L;
 40      const uint8_t *base = 0L;
 41      char quote = ' ';
 42      Boolean useUTF8 = false;
 43      
 44      // Check for the byte order mark first
 45      if (length > 2) {
 46          // This clause checks for the unicode byte order mark, or a Unicode sequence lacking the BOM; technically an error, but this check is recommended by the XML spec
 47          if ((*bytes == 0xFF && *(bytes+1) == 0xFE) ||*(bytes+1) == 0x00) {
 48  #if __BIG_ENDIAN__
 49              stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
 50  #else
 51              stream->flags |= ENCODING_IS_UNICODE_NATURAL;
 52  #endif
 53              if (*bytes == 0xFF) {
 54                  stream->currentByte = bytes + 2;
 55              }
 56              stream->encoding = kCFStringEncodingUnicode;
 57              return true;
 58          } else if ((*bytes == 0xFE && *(bytes+1) == 0xFF) || *bytes == 0x00) {
 59  #if __BIG_ENDIAN__
 60              stream->flags |= ENCODING_IS_UNICODE_NATURAL;
 61  #else
 62              stream->flags |= ENCODING_IS_UNICODE_SWAPPED;
 63  #endif
 64              if (*bytes == 0xFE) {
 65                  stream->currentByte = bytes + 2;
 66              }
 67              stream->encoding = kCFStringEncodingUnicode;
 68              return true;
 69          } else if(*bytes == 0xEF && *(bytes+1) == 0xBB && *(bytes+2) == 0xBF) {
 70              if(*bytes == 0xEF) {
 71                  stream->currentByte = bytes + 3;
 72              }
 73              stream->encoding = kCFStringEncodingUTF8;
 74              stream->flags |= ENCODING_MATCHES_ASCII;
 75              return true;
 76          }
 77      }
 78      // Scan for the <?xml.... ?> opening
 79      if (length < 5 || strncmp((char const *) bytes, "<?xml", 5) != 0) {
 80          useUTF8 = true;
 81      }
 82      if (!useUTF8) {
 83          idx = bytes + 5;
 84          end = bytes + length;
 85          // Found "<?xml"; now we scan for "encoding"
 86          while (idx < end) {
 87              uint8_t ch = *idx;
 88              const uint8_t *scan;
 89              if ( ch == '?' || ch == '>') {
 90                  useUTF8 = true;
 91                  break;
 92              }
 93              idx ++;
 94              scan = idx;
 95              if (ch == 'e' && *scan++ == 'n' && *scan++ == 'c' && *scan++ == 'o' && *scan++ == 'd' && *scan++ == 'i' && *scan++ == 'n' && *scan++ == 'g' && *scan++ == '=') {
 96                  idx = scan;
 97                  break;
 98              }
 99          }
100          if (!useUTF8 && idx >= end) {
101              useUTF8 = true;
102          }
103      }
104      if (!useUTF8) {
105          // Found "encoding="; see if we've got an honest-to-goodness encoding name
106          quote = *idx;
107          if (quote != '\'' && quote != '\"') {
108              useUTF8 = true;
109          }
110      }
111      if (!useUTF8) {
112          base = idx + 1; // Move past the quote character
113          idx ++;
114          while (idx < end && *idx != quote) idx ++;
115          if (idx >= end) {
116              useUTF8 = true;
117          }
118      }
119      if (!useUTF8) {
120          UInt32 len = idx - base;
121          if (len == 5 && (*base == 'u' || *base == 'U') && (base[1] == 't' || base[1] == 'T') && (base[2] == 'f' || base[2] == 'F') && (base[3] == '-') && (base[4] == '8')) {
122              useUTF8 = true;
123          } else {
124              CFStringRef encodingName = CFStringCreateWithBytes(stream->allocator, base, len, kCFStringEncodingISOLatin1, false);
125              stream->encoding = CFStringConvertIANACharSetNameToEncoding(encodingName);
126              CFRelease(encodingName);
127          }
128      }
129      if (useUTF8) {
130          stream->encoding = kCFStringEncodingUTF8;
131          stream->flags |= ENCODING_MATCHES_ASCII;
132          return true;
133      } else if (stream->encoding == kCFStringEncodingInvalidId) {
134          return false;
135      } else if (__CFStringEncodingIsSupersetOfASCII(stream->encoding)) {
136          stream->flags |= ENCODING_MATCHES_ASCII;
137      }
138      return true;
139  }
140  
141  CF_INLINE void _fillStringWithCharacters(CFMutableStringRef string, UniChar *characters, CFIndex numChars) {
142      CFStringDelete(string, CFRangeMake(0, CFStringGetLength(string)));
143      if (numChars) {
144          CFStringAppendCharacters(string, characters, numChars);
145      }
146  }
147  
148  CF_PRIVATE Boolean _openInputStream(_CFXMLInputStream *stream) {
149      if (NULL == stream->data) {
150          return false;
151      } else {
152          stream->currentByte = CFDataGetBytePtr(stream->data);
153          if (determineEncoding(stream)) {
154              stream->flags |= STREAM_OPEN;
155              return true;
156          } else {
157              return false;
158          }
159      }
160  }
161  
162  CF_PRIVATE void _initializeInputStream(_CFXMLInputStream *stream, CFAllocatorRef alloc, CFURLRef dataSource, CFDataRef xmlData) {
163      stream->data = xmlData ? (CFDataRef)CFRetain(xmlData) : NULL;
164      stream->url = dataSource ? (CFURLRef)CFRetain(dataSource) : NULL;
165      stream->encoding = kCFStringEncodingInvalidId;
166      stream->currentByte = NULL;
167      
168      stream->allocator = (CFAllocatorRef)CFRetain(alloc);
169      stream->charBuffer = NULL;
170      stream->currentChar = NULL;
171      stream->mark = NULL;
172      stream->parserMark = NULL;
173      stream->bufferLength = 0;
174      stream->bufferCapacity = 0;
175      
176      stream->charIndex = 1;
177      stream->lineNum = 1;
178  
179      stream->flags = 0;
180      stream->nameSet = NULL;
181      stream->tempString = NULL;
182  }
183  
184  
185  CF_PRIVATE void _freeInputStream(_CFXMLInputStream *stream) {
186      if (stream->data) CFRelease(stream->data);
187      if (stream->url) CFRelease(stream->url);
188      if (stream->charBuffer) CFAllocatorDeallocate(stream->allocator, stream->charBuffer);
189      if (stream->nameSet) CFRelease(stream->nameSet);
190      if (stream->tempString) CFRelease(stream->tempString);
191      CFRelease(stream->allocator);
192  }
193  
194  CF_PRIVATE CFStringEncoding _inputStreamGetEncoding(_CFXMLInputStream *stream) {
195      return stream->encoding;
196  }
197  
198  CF_PRIVATE CFIndex _inputStreamCurrentLocation(_CFXMLInputStream *stream) {
199      return stream->charIndex;
200  }
201  
202  CF_PRIVATE CFIndex _inputStreamCurrentLine(_CFXMLInputStream *stream) {
203      return stream->lineNum;
204  }
205  
206  CF_PRIVATE Boolean _inputStreamAtEOF(_CFXMLInputStream *stream) {
207      if (!(stream->flags & STREAM_OPEN)) return false;
208      if (stream->currentChar) return false;
209      if (stream->currentByte - CFDataGetBytePtr(stream->data) < CFDataGetLength(stream->data)) return false;
210      return true;
211  }
212  
213  CF_PRIVATE Boolean _inputStreamComposingErrorOccurred(_CFXMLInputStream *stream) {
214      return stream->flags & ENCODING_COMPOSITION_ERROR;
215  }
216  
217  #define INITIAL_BUFFER_SIZE 64
218  static void growCharacterBuffer(_CFXMLInputStream *stream) {
219      if (!stream->charBuffer) {
220          stream->charBuffer = (UniChar *)CFAllocatorAllocate(stream->allocator, INITIAL_BUFFER_SIZE*sizeof(UniChar), 0);
221          stream->bufferCapacity = INITIAL_BUFFER_SIZE;
222      } else {
223          CFIndex currCharDelta = stream->currentChar ? stream->currentChar - stream->charBuffer : -1;
224          CFIndex markDelta = stream->mark ? stream->mark - stream->charBuffer: -1;
225          CFIndex parserMarkDelta = stream->parserMark ? stream->parserMark - stream->charBuffer: -1;
226          UniChar *newBuffer = (UniChar *)CFAllocatorReallocate(stream->allocator, stream->charBuffer, stream->bufferCapacity * 2 * sizeof(UniChar), 0);
227          stream->bufferCapacity *= 2;
228          if (newBuffer != stream->charBuffer) {
229              stream->charBuffer = newBuffer;
230              if (currCharDelta != -1) {
231                  stream->currentChar = newBuffer + currCharDelta;
232              }
233              if (markDelta != -1) {
234                  stream->mark = newBuffer + markDelta;
235              }
236              if (parserMarkDelta != -1) {
237                  stream->parserMark = newBuffer + parserMarkDelta;
238              }
239          }
240      }
241  }
242  
243  static CFIndex loadCharacters(UniChar *base, CFIndex maxLength, _CFXMLInputStream *stream) {
244      const uint8_t *dataEnd = CFDataGetBytePtr(stream->data) + CFDataGetLength(stream->data);
245      if (stream->flags & (ENCODING_IS_UNICODE_NATURAL|ENCODING_IS_UNICODE_SWAPPED) ) {
246          CFIndex charsToTranslate = (dataEnd - stream->currentByte) / sizeof(UniChar);
247          if (charsToTranslate > maxLength) {
248              charsToTranslate = maxLength;
249          }
250          if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
251              memmove(base, stream->currentByte, charsToTranslate * sizeof(UniChar));
252              stream->currentByte += (charsToTranslate * sizeof(UniChar));
253          } else {
254              CFIndex i;
255              uint8_t *baseBytePtr = (uint8_t *)base;
256              for (i = 0; i < charsToTranslate; i ++) {
257                  *(baseBytePtr + 1) = *stream->currentByte;
258                  *baseBytePtr = *(stream->currentByte + 1);
259                  baseBytePtr += 2;
260                  stream->currentByte += 2;
261              }
262          }
263          return charsToTranslate;
264      } else {
265          CFIndex lengthConsumed = 0;
266          CFIndex usedByteLength, usedCharLength;
267          UInt32 conversionResult;
268          if (stream->flags & ENCODING_MATCHES_ASCII) {
269              while (stream->currentByte < dataEnd && lengthConsumed < maxLength) {
270                  if (*stream->currentByte > 0x7f) break;
271                  *base = *stream->currentByte;
272                  base ++;
273                  stream->currentByte ++;
274                  lengthConsumed ++;
275              }
276              if (stream->currentByte == dataEnd || lengthConsumed == maxLength) {
277                  return lengthConsumed;
278              }
279          }
280          conversionResult = CFStringEncodingBytesToUnicode(stream->encoding, 0, stream->currentByte, dataEnd - stream->currentByte, &usedByteLength, base, maxLength-lengthConsumed, &usedCharLength);
281          if(kCFStringEncodingConversionSuccess != conversionResult) {
282              switch(conversionResult) {
283                  case kCFStringEncodingConverterUnavailable:
284                  case kCFStringEncodingInvalidInputStream:
285                      stream->flags |= ENCODING_COMPOSITION_ERROR;
286                      break;
287                  case kCFStringEncodingInsufficientOutputBufferLength:
288                  default:
289                      break;
290              }
291          }
292          if (usedByteLength > 0) {
293              stream->currentByte += usedByteLength;
294              lengthConsumed += usedCharLength;
295          }
296          return lengthConsumed;
297      }
298  }
299  
300  // returns number of characters filled
301  CF_INLINE CFIndex fillToCapacity(_CFXMLInputStream *stream) {
302      CFIndex numFilled;
303      if (stream->bufferLength >= stream->bufferCapacity) return 0;
304      // Try and fill in the remaining characters
305      numFilled = loadCharacters(stream->charBuffer+stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
306      if (numFilled != 0) {
307          stream->currentChar = stream->charBuffer + stream->bufferLength;
308          stream->bufferLength += numFilled;
309      }
310      return numFilled;
311  }
312  
313  // we are expected to move mark & parserMark relative to any moved characters, set currentChar to the first new character fetched, update bufferLength, and advance currentByte as appropriate.  Does not check for EOF; it is the caller's responsibility to verify this.
314  static void fillCharacterBuffer(_CFXMLInputStream *stream) {
315      if (!stream->charBuffer) {
316          growCharacterBuffer(stream);
317      }
318      if (!stream->mark && !stream->parserMark) {
319          // This is the easy case; we can freely overwrite the buffer; if either mark or parserMark is set, we must not remove any characters from those marks and the end of the buffer
320          CFIndex fillLength = stream->bufferCapacity-5; // We leave a few characters at the end, b/c we don't want to reallocate (doubling the amount of memory used) just b/c we're matching a small string near the end of the filled buffer
321          stream->bufferLength = loadCharacters(stream->charBuffer, fillLength, stream);
322          CFAssert(stream->bufferLength != 0, __kCFLogAssertion, "CF internal error: XML parser input stream corruption");
323          stream->currentChar = stream->charBuffer;
324      } else {
325          // We do everything we can not to allocate; first we fill any remaining characters.  If that doesn't work, we try shifting the characters starting at the earlier of mark or parserMark to the beginning of buffer, then filling the newly-freed characters.
326          Boolean done;
327  
328          // First try just filling the remaining capacity
329          done = (fillToCapacity(stream) != 0);
330          if (!done) {
331              const UniChar *leftMostMark;
332              if (stream->mark && !stream->parserMark) {
333                  leftMostMark = stream->mark;
334              } else if (stream->parserMark && !stream->mark) {
335                  leftMostMark = stream->parserMark;
336              } else if (stream->parserMark < stream->mark) {
337                  leftMostMark = stream->parserMark;
338              } else {
339                  leftMostMark = stream->mark;
340              }
341              if (leftMostMark > stream->charBuffer) {
342                  CFIndex delta = leftMostMark - stream->charBuffer;
343                  memmove(stream->charBuffer, leftMostMark, (stream->bufferLength - delta) * sizeof(UniChar));
344                  stream->bufferLength -= delta;
345                  if (stream->mark) {
346                      stream->mark -= delta;
347                  }
348                  if (stream->parserMark) {
349                      stream->parserMark -= delta;
350                  }
351                  // Now try to fill the newly-opened space
352                  done = (fillToCapacity(stream) != 0);
353                  delta = loadCharacters(stream->charBuffer + stream->bufferLength, stream->bufferCapacity - stream->bufferLength, stream);
354              }
355          }
356          if (!done) {
357              // No help for it; now we must allocate
358              growCharacterBuffer(stream);
359              fillToCapacity(stream); // If this doesn't work, we give up.
360          }
361      }
362  }
363  
364  /* The guts of getCharacter() have been separated in order to allow getCharacter() to be small and more easily inline-able. Note that a check late in the 10.3 development cycle indicates that getCharacter() should call getCharacterGuts() less than 2% of the time.  (In 29000 calls, less than 400 called this. Note that a majority of calls have advanceStream set, so that was left in the inline version.  Also note that some calls to getCharacter() were changed to go through the functions _inputStreamGetCharacter() or _inputStreamPeekCharacter(), as the expansion in using the inline version didn't seem worthwhile. See 3275503 for some data supporting this.
365  */
366  static Boolean getCharacterGuts(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
367      if (stream->currentByte - CFDataGetBytePtr(stream->data) >= CFDataGetLength(stream->data)) {
368          return false; // EOF
369      } else if (!((stream->mark || stream->parserMark) && advanceStream) &&
370                 (((stream->flags & ENCODING_MATCHES_ASCII) && *(stream->currentByte) < 0x7F) ||
371                  (stream->flags & (ENCODING_IS_UNICODE_NATURAL | ENCODING_IS_UNICODE_SWAPPED)))) {
372          // We can only perform optimizations if neither mark is set (if the mark is set, we must fill the character buffer so we can retrieve the characters later), and the encoding is Unicode, or the encoding matches ASCII and we're looking at a low-byte character.
373          if (stream->flags & ENCODING_MATCHES_ASCII) {
374              *ch = (UniChar)*(stream->currentByte);
375              if (advanceStream) {
376                  stream->currentByte ++;
377              }
378          } else if (stream->flags & ENCODING_IS_UNICODE_NATURAL) {
379              *ch = *(UniChar *)(stream->currentByte);
380              if (advanceStream) {
381                  stream->currentByte += 2;
382              }
383          } else {
384              // Unicode with swapped bytes
385              *ch = CFSwapInt16(*(UniChar *)(stream->currentByte));
386              if (advanceStream) {
387                  stream->currentByte += 2;
388              }
389          }
390      } else {
391          fillCharacterBuffer(stream); // this takes into account markIsSet to make sure and do the right thing
392          if (!stream->charBuffer || !stream->currentChar) {
393              return false;
394          } else {
395              *ch = *(stream->currentChar);
396              if (advanceStream) {
397                  stream->currentChar ++;
398                  if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
399                      stream->currentChar = NULL;
400                  }
401              }
402          }
403      }
404      return true;
405  }
406  
407  /* See comments above getCharacterGuts()
408  */
409  CF_INLINE Boolean getCharacter(_CFXMLInputStream *stream, UniChar *ch, Boolean advanceStream) {
410      if (!(stream->flags & STREAM_OPEN)) {
411          return false;
412      } else if (stream->currentChar) {
413          *ch = *stream->currentChar;
414          if (advanceStream) {
415              stream->currentChar ++;
416              if (stream->currentChar == stream->charBuffer + stream->bufferLength) {
417                  stream->currentChar = NULL;
418              }
419          }
420      } else {
421          if (!getCharacterGuts(stream, ch, advanceStream)) return false;
422      }
423      if (advanceStream) {
424          UniChar nextChar;
425          stream->charIndex ++;
426          if ((*ch == '\n') || ((*ch == '\r') && (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n'))) stream->lineNum ++;
427      }
428      return true;
429  }
430  
431  CF_PRIVATE Boolean _inputStreamPeekCharacter(_CFXMLInputStream *stream, UniChar *ch) {
432      return getCharacter(stream, ch, false);
433  }
434  
435  CF_PRIVATE Boolean _inputStreamGetCharacter(_CFXMLInputStream *stream, UniChar *ch) {
436      return getCharacter(stream, ch, true);
437  }
438  
439  CF_PRIVATE Boolean _inputStreamReturnCharacter(_CFXMLInputStream *stream, UniChar ch) {
440      Boolean decrementLineNum = false;
441      if (ch == '\n') {
442          decrementLineNum = true;
443      } else if (ch == '\r') {
444          UniChar nextChar;
445          if (!_inputStreamPeekCharacter(stream, &nextChar) || nextChar != '\n') {
446              decrementLineNum = true;
447          }
448      }
449  
450      if (!(stream->flags & STREAM_OPEN)) {
451          return false;
452      } else if (stream->currentChar) {
453          if (stream->currentChar != stream->charBuffer) {
454              stream->currentChar --;
455          } else {
456              // Yuck; we're unlucky and are returning a character _before_ the first character in charBuffer
457              if (stream->bufferLength >= stream->bufferCapacity) {
458                  growCharacterBuffer(stream);
459              }
460              memmove(stream->charBuffer + 1, stream->charBuffer, stream->bufferLength * sizeof(UniChar));
461              *stream->charBuffer = ch;
462              stream->bufferLength ++;
463              if (stream->mark) {
464                  stream->mark ++;
465              }
466              if (stream->parserMark) {
467                  stream->parserMark ++;
468              }
469          }
470      } else if ((stream->mark || stream->parserMark) && stream->bufferLength) {
471          // We've been collecting characters in charBuffer; the only reason stream->currentChar is NULL is that we've processed the last character thusfar translated from data.  That last character is the one being returned.
472          stream->currentChar = stream->charBuffer + stream->bufferLength - 1;
473      } else if (stream->charBuffer) {
474          // We have processed all the meaningful characters from charBuffer and have no reason to preserve them.  We use charBuffer to hold this one character that has been returned to us.
475          *stream->charBuffer = ch;
476          stream->currentChar = stream->charBuffer;
477          stream->bufferLength = 1;
478          if (stream->mark) {
479              stream->mark ++;
480          }
481          if (stream->parserMark) {
482              stream->parserMark ++;
483          } 
484      } else if (stream->currentByte > CFDataGetBytePtr(stream->data)) {
485          // We have no character buffer available, so that means one of two things - either we've never needed a character buffer because all the characters could come directly out of the byte stream, or we've not yet processed the first character.  The former means we can just back up the byte pointer; the latter means Bad Things have happened.
486          if (stream->flags & ENCODING_MATCHES_ASCII) {
487              stream->currentByte --;
488          } else {  // Must be Unicode
489              stream->currentByte -= 2;
490          }
491      } else {
492          return false;
493      }
494      stream->charIndex --;
495      if (decrementLineNum) {
496          stream->lineNum --;
497      }
498      return true;
499  }
500  
501  // Returns the pointer to hold as the mark
502  static UniChar *dropMark(_CFXMLInputStream *stream) {
503      if (stream->currentChar) {
504          return stream->currentChar;
505      } else if (stream->mark || stream->parserMark) {
506          return stream->charBuffer + stream->bufferLength;
507      } else {
508          if (!stream->charBuffer) {
509              growCharacterBuffer(stream);
510          }
511          stream->bufferLength = 0; // This will be sufficient to force a fetch into the buffer when the next character is requested
512          return stream->charBuffer;
513      }
514  
515  }
516  
517  CF_PRIVATE void _inputStreamSetMark(_CFXMLInputStream *stream) {
518      CFAssert(stream->mark == NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
519      stream->mark = dropMark(stream);
520  }
521  
522  CF_PRIVATE void _inputStreamClearMark(_CFXMLInputStream *stream) {
523      CFAssert(stream->mark != NULL, __kCFLogAssertion, "CF internal error: parser input stream malformed");
524      stream->mark = NULL;
525  }
526  
527  CF_PRIVATE void _inputStreamGetCharactersFromMark(_CFXMLInputStream *stream, CFMutableStringRef string) {
528      UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
529      CFIndex numChars = end - stream->mark;
530      CFAssert(stream->mark, __kCFLogAssertion, "CF internal error: malformed XML input stream");
531      _fillStringWithCharacters(string, stream->mark, numChars);
532  }
533  
534  static void restoreToMark(_CFXMLInputStream *stream, UniChar *mark) {
535      UniChar *end = stream->currentChar ? stream->currentChar : stream->charBuffer + stream->bufferLength;
536      if (end > mark) {
537          CFIndex numChars = end - mark;
538          stream->charIndex -= numChars;
539          stream->currentChar = mark;
540  
541          // This is annoying; to keep the line count accurate, if the first character we are returning is a CR, we must decrement the line count iff the next character is NOT a LF
542          if (*(end - 1) == '\r') {
543              UniChar nextChar;
544              if (_inputStreamPeekCharacter(stream, &nextChar) && nextChar == '\n') {
545                  end --;
546              }
547          }
548          while (end != mark) {
549              end --;
550              if (*end == '\r') {
551                  stream->lineNum --;
552              } else if (*end == '\n') {
553                  stream->lineNum --;
554                  if (end != mark && *(end - 1) == '\r') {
555                      end --;
556                  }
557              }
558          }
559      }
560  }
561  
562  CF_PRIVATE void _inputStreamBackUpToMark(_CFXMLInputStream *stream) {
563      CFAssert(stream->mark != NULL || stream->charBuffer == NULL, __kCFLogAssertion, "CF internal error: malformed XML input stream");
564      restoreToMark(stream, stream->mark);
565  }
566  
567  CF_INLINE Boolean isWhitespaceChar(UniChar ch) {
568      return (ch == '\n' || ch == '\r' || ch == ' ' || ch == '\t');
569  }
570  
571  CF_PRIVATE CFIndex _inputStreamSkipWhitespace(_CFXMLInputStream *stream, CFMutableStringRef str) {
572      UniChar ch;
573      CFIndex len = 0;
574      if (str) {
575          stream->parserMark = dropMark(stream);
576      }
577      while (getCharacter(stream, &ch, true) && isWhitespaceChar(ch)) {
578          len ++;
579      }
580      if (!isWhitespaceChar(ch)) {
581          _inputStreamReturnCharacter(stream, ch);
582      }
583      if (str) {
584          _fillStringWithCharacters(str, stream->parserMark, len);
585          stream->parserMark = NULL;
586      }
587      return len;
588  }
589  
590  // false return means EOF was encountered without finding scanChars
591  CF_PRIVATE Boolean _inputStreamScanToCharacters(_CFXMLInputStream *stream, const UniChar *scanChars, CFIndex numChars, CFMutableStringRef str) {
592      Boolean done = false;
593      CFIndex firstRepeatIndex = -1;
594      CFIndex len = 0;
595      stream->parserMark = dropMark(stream);
596      do {
597          UniChar ch;
598          while (_inputStreamGetCharacter(stream, &ch) && ch != scanChars[0]) {
599              len ++;
600          }
601          if (ch != scanChars[0]) {
602              restoreToMark(stream, stream->parserMark);
603              stream->parserMark = NULL;
604              return false;
605          } else {
606              CFIndex i;
607              for (i = 1; i < numChars; i ++) {
608                  if (!_inputStreamGetCharacter(stream, &ch)) break;
609                  if (ch != scanChars[i]) break;
610              }
611              if (i == numChars) {
612                  done = true;
613              } else {
614                  if (firstRepeatIndex == -1) {
615                      CFIndex j;
616                      for (j = 1; j < numChars; j ++) {
617                          if (scanChars[0] == scanChars[j]) {
618                              break;
619                          }
620                      }
621                      firstRepeatIndex = j;
622                  }
623                  _inputStreamReturnCharacter(stream, ch);
624                  while (i > firstRepeatIndex) {
625                      i --;
626                      _inputStreamReturnCharacter(stream, scanChars[i]);
627                  }
628                  len += i;
629              }
630          }
631      } while (!done);
632      if (str) {
633          _fillStringWithCharacters(str, stream->parserMark, len);
634      }
635      stream->parserMark = NULL;
636      return true;
637  }
638  
639  CF_PRIVATE Boolean _inputStreamMatchString(_CFXMLInputStream *stream, const UniChar *stringToMatch, CFIndex length) {
640      const UniChar *end = stringToMatch+length;
641      const UniChar *sPtr=stringToMatch;
642      stream->parserMark = dropMark(stream);
643      while (sPtr < end) {
644          UniChar ch;
645          if (!_inputStreamGetCharacter(stream, &ch)) break;
646          if (ch != *sPtr) break;
647          sPtr ++;
648      }
649      if (sPtr != end) {
650          restoreToMark(stream, stream->parserMark);
651          stream->parserMark = NULL;
652          return false;
653      } else {
654          stream->parserMark = NULL;
655          return true;
656      }
657  }
658  
659  CF_PRIVATE Boolean _inputStreamScanQuotedString(_CFXMLInputStream *stream, CFMutableStringRef str) {
660      UniChar ch;
661      if (!_inputStreamPeekCharacter(stream, &ch)) return false;
662      if (ch != '\'' && ch != '\"')  return false;
663  
664      _inputStreamGetCharacter(stream, &ch);
665      if (!_inputStreamScanToCharacters(stream, &ch, 1, str)) {
666          return false;
667      }
668      return true;
669  }
670  
671  /*
672   [4]  NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' | CombiningChar | Extender
673   [5]  Name ::= (Letter | '_' | ':') (NameChar)*
674   [7]  Nmtoken ::= (NameChar)+
675   [84] Letter ::= BaseChar | Ideographic
676  
677   We don't do this quite right; we rely on the Unicode charsets to do this analysis.  While
678   the productions in the XML spec are based on the Unicode character sets, the definitions
679   differ slightly to avoid those areas where the Unicode standard is still being resolved.
680   At any rate, I'd lay money that using the Unicode charsets, we will be more correct than
681   the vast majority of parsers out there.
682  
683   Letter == kCFUniCharLetterCharacterSet
684   Digit == kCFUniCharDecimalDigitCharacterSet
685   CombiningChar == kCFUniCharNonBaseCharacterSet
686   Extender - complex, and not represented by a uniform character set.
687   */
688  CF_PRIVATE Boolean _inputStreamScanXMLName(_CFXMLInputStream *stream, Boolean isNMToken, CFStringRef *str) {
689      UniChar ch;
690      Boolean success = true;
691      stream->parserMark = dropMark(stream);
692      if (!isNMToken) {
693          // Only difference between an NMToken and a Name is Names have a stricter condition on the first character
694          if (!getCharacter(stream, &ch, false)) {
695              success = false;
696          } else if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && ch != '_' && ch != ':') {
697              success = false;
698          } else {
699              getCharacter(stream, &ch, true);
700          }
701      }
702      if (success) {
703          while (getCharacter(stream, &ch, true)) {
704              if (!CFUniCharIsMemberOf(ch, kCFUniCharLetterCharacterSet) && !CFUniCharIsMemberOf(ch, kCFUniCharDecimalDigitCharacterSet)  && ch != '.' && ch != '-' && ch != '_' && ch != ':' && !CFUniCharIsMemberOf(ch, kCFUniCharNonBaseCharacterSet)) {
705                  _inputStreamReturnCharacter(stream, ch);
706                  break;
707              }
708          }
709          if (NULL == stream->currentChar || stream->currentChar == stream->parserMark) {
710              success = false; // Must have processed at least one character
711          }
712      }
713      if (success) {
714          if (str) {
715              if (!stream->nameSet) {
716                  stream->nameSet = CFSetCreateMutable(stream->allocator, 0, &kCFTypeSetCallBacks);
717                  stream->tempString = CFStringCreateMutableWithExternalCharactersNoCopy(stream->allocator, NULL, 0, 0, kCFAllocatorNull);
718              }
719              CFStringSetExternalCharactersNoCopy(stream->tempString, stream->parserMark, stream->currentChar-stream->parserMark, stream->currentChar-stream->parserMark);
720              if (!CFSetGetValueIfPresent(stream->nameSet, stream->tempString, (const void **)str)) {
721                  *str = (CFStringRef)CFStringCreateCopy(stream->allocator, stream->tempString);
722                  CFSetAddValue(stream->nameSet, *str);
723                  CFRelease(*str);
724              }
725          }
726      } else {
727          restoreToMark(stream, stream->parserMark);
728      }
729      stream->parserMark = NULL;
730      return success;
731  }
732  
733