/ src / common / convert_UTF.cc
convert_UTF.cc
  1  /*
  2   * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
  3   * Distributed under the Terms of Use in 
  4   * http://www.unicode.org/copyright.html.
  5   *
  6   * Permission is hereby granted, free of charge, to any person obtaining
  7   * a copy of the Unicode data files and any associated documentation
  8   * (the "Data Files") or Unicode software and any associated documentation
  9   * (the "Software") to deal in the Data Files or Software
 10   * without restriction, including without limitation the rights to use,
 11   * copy, modify, merge, publish, distribute, and/or sell copies of
 12   * the Data Files or Software, and to permit persons to whom the Data Files
 13   * or Software are furnished to do so, provided that
 14   * (a) this copyright and permission notice appear with all copies 
 15   * of the Data Files or Software,
 16   * (b) this copyright and permission notice appear in associated 
 17   * documentation, and
 18   * (c) there is clear notice in each modified Data File or in the Software
 19   * as well as in the documentation associated with the Data File(s) or
 20   * Software that the data or software has been modified.
 21   *
 22   * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
 23   * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
 24   * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 25   * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
 26   * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
 27   * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
 28   * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
 29   * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
 30   * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 31   * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
 32   *
 33   * Except as contained in this notice, the name of a copyright holder
 34   * shall not be used in advertising or otherwise to promote the sale,
 35   * use or other dealings in these Data Files or Software without prior
 36   * written authorization of the copyright holder.
 37   */
 38  
 39  /* ---------------------------------------------------------------------
 40  
 41  Conversions between UTF32, UTF-16, and UTF-8. Source code file.
 42  Author: Mark E. Davis, 1994.
 43  Rev History: Rick McGowan, fixes & updates May 2001.
 44  Sept 2001: fixed const & error conditions per
 45  mods suggested by S. Parent & A. Lillich.
 46  June 2002: Tim Dodd added detection and handling of incomplete
 47  source sequences, enhanced error detection, added casts
 48  to eliminate compiler warnings.
 49  July 2003: slight mods to back out aggressive FFFE detection.
 50  Jan 2004: updated switches in from-UTF8 conversions.
 51  Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
 52  
See the header file "convert_UTF.h" for complete documentation.
 54  
 55  ------------------------------------------------------------------------ */
 56  
 57  
 58  #ifdef HAVE_CONFIG_H
 59  #include <config.h>  // Must come first
 60  #endif
 61  
 62  #include "convert_UTF.h"
 63  #ifdef CVTUTF_DEBUG
 64  #include <stdio.h>
 65  #endif
 66  
 67  #include "common/macros.h"
 68  
 69  namespace google_breakpad {
 70  
 71  namespace {
 72  
 73  const int halfShift  = 10; /* used for shifting by 10 bits */
 74  
 75  const UTF32 halfBase = 0x0010000UL;
 76  const UTF32 halfMask = 0x3FFUL;
 77  
 78  }  // namespace
 79  
 80  #define UNI_SUR_HIGH_START  (UTF32)0xD800
 81  #define UNI_SUR_HIGH_END    (UTF32)0xDBFF
 82  #define UNI_SUR_LOW_START   (UTF32)0xDC00
 83  #define UNI_SUR_LOW_END     (UTF32)0xDFFF
 84  
 85  /* --------------------------------------------------------------------- */
 86  
 87  ConversionResult ConvertUTF32toUTF16 (const UTF32** sourceStart, const UTF32* sourceEnd,
 88                                        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
 89    ConversionResult result = conversionOK;
 90    const UTF32* source = *sourceStart;
 91    UTF16* target = *targetStart;
 92    while (source < sourceEnd) {
 93      UTF32 ch;
 94      if (target >= targetEnd) {
 95  	    result = targetExhausted; break;
 96      }
 97      ch = *source++;
 98      if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
 99  	    /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
100  	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
101          if (flags == strictConversion) {
102            --source; /* return to the illegal value itself */
103            result = sourceIllegal;
104            break;
105          } else {
106            *target++ = UNI_REPLACEMENT_CHAR;
107          }
108  	    } else {
109          *target++ = (UTF16)ch; /* normal case */
110  	    }
111      } else if (ch > UNI_MAX_LEGAL_UTF32) {
112  	    if (flags == strictConversion) {
113          result = sourceIllegal;
114  	    } else {
115          *target++ = UNI_REPLACEMENT_CHAR;
116  	    }
117      } else {
118  	    /* target is a character in range 0xFFFF - 0x10FFFF. */
119  	    if (target + 1 >= targetEnd) {
120          --source; /* Back up source pointer! */
121          result = targetExhausted; break;
122  	    }
123  	    ch -= halfBase;
124  	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
125  	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
126      }
127    }
128  *sourceStart = source;
129  *targetStart = target;
130  return result;
131  }
132  
133  /* --------------------------------------------------------------------- */
134  
135  ConversionResult ConvertUTF16toUTF32 (const UTF16** sourceStart, const UTF16* sourceEnd,
136                                        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
137    ConversionResult result = conversionOK;
138    const UTF16* source = *sourceStart;
139    UTF32* target = *targetStart;
140    UTF32 ch, ch2;
141    while (source < sourceEnd) {
142      const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
143      ch = *source++;
144      /* If we have a surrogate pair, convert to UTF32 first. */
145      if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
146  	    /* If the 16 bits following the high surrogate are in the source buffer... */
147  	    if (source < sourceEnd) {
148          ch2 = *source;
149          /* If it's a low surrogate, convert to UTF32. */
150          if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
151            ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
152            + (ch2 - UNI_SUR_LOW_START) + halfBase;
153            ++source;
154          } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
155            --source; /* return to the illegal value itself */
156            result = sourceIllegal;
157            break;
158          }
159  	    } else { /* We don't have the 16 bits following the high surrogate. */
160          --source; /* return to the high surrogate */
161          result = sourceExhausted;
162          break;
163  	    }
164      } else if (flags == strictConversion) {
165  	    /* UTF-16 surrogate values are illegal in UTF-32 */
166  	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
167          --source; /* return to the illegal value itself */
168          result = sourceIllegal;
169          break;
170  	    }
171      }
172      if (target >= targetEnd) {
173  	    source = oldSource; /* Back up source pointer! */
174  	    result = targetExhausted; break;
175      }
176      *target++ = ch;
177    }
178    *sourceStart = source;
179    *targetStart = target;
180  #ifdef CVTUTF_DEBUG
181    if (result == sourceIllegal) {
182      fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
183      fflush(stderr);
184    }
185  #endif
186    return result;
187  }
188  
189  /* --------------------------------------------------------------------- */
190  
191  namespace {
192  
/*
 * Lookup table keyed by the first byte of a UTF-8 sequence; the entry is the
 * number of trailing bytes that are supposed to follow that byte.
 * Entries of 4 and 5 describe the 5- and 6-byte forms allowed by earlier
 * definitions of UTF-8. Those forms are not *legal* UTF-8 today; the entries
 * are kept for anyone who wants to handle such input.
 */
const char trailingBytesForUTF8[256] = {
  /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
210  
211  /*
212   * Magic values subtracted from a buffer value during UTF8 conversion.
213   * This table contains as many values as there might be trailing bytes
214   * in a UTF-8 sequence.
215   */
216  const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
217    0x03C82080UL, 0xFA082080UL, 0x82082080UL };
218  
219  /*
220   * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
221   * into the first byte, depending on how many bytes follow.  There are
222   * as many entries in this table as there are UTF-8 sequence types.
223   * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
224   * for *legal* UTF-8 will be 4 or fewer bytes total.
225   */
226  const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
227  
228  /* --------------------------------------------------------------------- */
229  
230  /* The interface converts a whole buffer to avoid function-call overhead.
231  * Constants have been gathered. Loops & conditionals have been removed as
232  * much as possible for efficiency, in favor of drop-through switches.
233  * (See "Note A" at the bottom of the file for equivalent code.)
234  * If your compiler supports it, the "isLegalUTF8" call can be turned
235  * into an inline function.
236  */
237  
238  }  // namespace
239  
240  /* --------------------------------------------------------------------- */
241  
242  ConversionResult ConvertUTF16toUTF8 (const UTF16** sourceStart, const UTF16* sourceEnd,
243                                       UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
244    ConversionResult result = conversionOK;
245    const UTF16* source = *sourceStart;
246    UTF8* target = *targetStart;
247    while (source < sourceEnd) {
248      UTF32 ch;
249      unsigned short bytesToWrite = 0;
250      const UTF32 byteMask = 0xBF;
251      const UTF32 byteMark = 0x80;
252      const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
253      ch = *source++;
254      /* If we have a surrogate pair, convert to UTF32 first. */
255      if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
256  	    /* If the 16 bits following the high surrogate are in the source buffer... */
257  	    if (source < sourceEnd) {
258          UTF32 ch2 = *source;
259          /* If it's a low surrogate, convert to UTF32. */
260          if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
261            ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
262            + (ch2 - UNI_SUR_LOW_START) + halfBase;
263            ++source;
264          } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
265            --source; /* return to the illegal value itself */
266            result = sourceIllegal;
267            break;
268          }
269  	    } else { /* We don't have the 16 bits following the high surrogate. */
270          --source; /* return to the high surrogate */
271          result = sourceExhausted;
272          break;
273  	    }
274      } else if (flags == strictConversion) {
275  	    /* UTF-16 surrogate values are illegal in UTF-32 */
276  	    if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
277          --source; /* return to the illegal value itself */
278          result = sourceIllegal;
279          break;
280  	    }
281      }
282      /* Figure out how many bytes the result will require */
283      if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
284      } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
285      } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
286      } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
287      } else {			    bytesToWrite = 3;
288        ch = UNI_REPLACEMENT_CHAR;
289      }
290  
291      target += bytesToWrite;
292      if (target > targetEnd) {
293  	    source = oldSource; /* Back up source pointer! */
294  	    target -= bytesToWrite; result = targetExhausted; break;
295      }
296      switch (bytesToWrite) { /* note: everything falls through. */
297        case 4:
298          *--target = (UTF8)((ch | byteMark) & byteMask);
299          ch >>= 6;
300          BP_FALLTHROUGH;
301        case 3:
302          *--target = (UTF8)((ch | byteMark) & byteMask);
303          ch >>= 6;
304          BP_FALLTHROUGH;
305        case 2:
306          *--target = (UTF8)((ch | byteMark) & byteMask);
307          ch >>= 6;
308          BP_FALLTHROUGH;
309        case 1:
310          *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
311      }
312      target += bytesToWrite;
313    }
314  *sourceStart = source;
315  *targetStart = target;
316  return result;
317  }
318  
319  /* --------------------------------------------------------------------- */
320  
321  namespace {
322  
323  /*
324   * Utility routine to tell whether a sequence of bytes is legal UTF-8.
325   * This must be called with the length pre-determined by the first byte.
326   * If not calling this from ConvertUTF8to*, then the length can be set by:
327   *  length = trailingBytesForUTF8[*source]+1;
328   * and the sequence is illegal right away if there aren't that many bytes
329   * available.
330   * If presented with a length > 4, this returns false.  The Unicode
331   * definition of UTF-8 goes up to 4-byte sequences.
332   */
333  Boolean isLegalUTF8(const UTF8 *source, int length) {
334    UTF8 a;
335    const UTF8 *srcptr = source+length;
336    switch (length) {
337      default: return false;
338        /* Everything else falls through when "true"... */
339      case 4:
340        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
341        BP_FALLTHROUGH;
342      case 3:
343        if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
344        BP_FALLTHROUGH;
345      case 2:
346        if ((a = (*--srcptr)) > 0xBF) return false;
347  
348        switch (*source) {
349          /* no fall-through in this inner switch */
350          case 0xE0: if (a < 0xA0) return false; break;
351          case 0xED: if (a > 0x9F) return false; break;
352          case 0xF0: if (a < 0x90) return false; break;
353          case 0xF4: if (a > 0x8F) return false; break;
354          default:   if (a < 0x80) return false;
355        }
356        BP_FALLTHROUGH;
357      case 1: if (*source >= 0x80 && *source < 0xC2) return false;
358    }
359    if (*source > 0xF4) return false;
360    return true;
361  }
362  
363  }  // namespace
364  
365  /* --------------------------------------------------------------------- */
366  
367  /*
368   * Exported function to return whether a UTF-8 sequence is legal or not.
369   * This is not used here; it's just exported.
370   */
371  Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
372    int length = trailingBytesForUTF8[*source]+1;
373    if (source+length > sourceEnd) {
374      return false;
375    }
376    return isLegalUTF8(source, length);
377  }
378  
379  /* --------------------------------------------------------------------- */
380  
381  ConversionResult ConvertUTF8toUTF16 (const UTF8** sourceStart, const UTF8* sourceEnd,
382                                       UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
383    ConversionResult result = conversionOK;
384    const UTF8* source = *sourceStart;
385    UTF16* target = *targetStart;
386    while (source < sourceEnd) {
387      UTF32 ch = 0;
388      unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
389      if (source + extraBytesToRead >= sourceEnd) {
390  	    result = sourceExhausted; break;
391      }
392      /* Do this check whether lenient or strict */
393      if (! isLegalUTF8(source, extraBytesToRead+1)) {
394  	    result = sourceIllegal;
395  	    break;
396      }
397      /*
398       * The cases all fall through. See "Note A" below.
399       */
400      switch (extraBytesToRead) {
401        /* remember, illegal UTF-8 */
402        case 5: ch += *source++; ch <<= 6; BP_FALLTHROUGH;
403        /* remember, illegal UTF-8 */
404        case 4: ch += *source++; ch <<= 6; BP_FALLTHROUGH;
405        case 3: ch += *source++; ch <<= 6; BP_FALLTHROUGH;
406        case 2: ch += *source++; ch <<= 6; BP_FALLTHROUGH;
407        case 1: ch += *source++; ch <<= 6; BP_FALLTHROUGH;
408        case 0: ch += *source++;
409      }
410      ch -= offsetsFromUTF8[extraBytesToRead];
411  
412      if (target >= targetEnd) {
413  	    source -= (extraBytesToRead+1); /* Back up source pointer! */
414  	    result = targetExhausted; break;
415      }
416      if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
417  	    /* UTF-16 surrogate values are illegal in UTF-32 */
418  	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
419          if (flags == strictConversion) {
420            source -= (extraBytesToRead+1); /* return to the illegal value itself */
421            result = sourceIllegal;
422            break;
423          } else {
424            *target++ = UNI_REPLACEMENT_CHAR;
425          }
426  	    } else {
427          *target++ = (UTF16)ch; /* normal case */
428  	    }
429      } else if (ch > UNI_MAX_UTF16) {
430  	    if (flags == strictConversion) {
431          result = sourceIllegal;
432          source -= (extraBytesToRead+1); /* return to the start */
433          break; /* Bail out; shouldn't continue */
434  	    } else {
435          *target++ = UNI_REPLACEMENT_CHAR;
436  	    }
437      } else {
438  	    /* target is a character in range 0xFFFF - 0x10FFFF. */
439  	    if (target + 1 >= targetEnd) {
440          source -= (extraBytesToRead+1); /* Back up source pointer! */
441          result = targetExhausted; break;
442  	    }
443  	    ch -= halfBase;
444  	    *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
445  	    *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
446      }
447    }
448  *sourceStart = source;
449  *targetStart = target;
450  return result;
451  }
452  
453  /* --------------------------------------------------------------------- */
454  
455  ConversionResult ConvertUTF32toUTF8 (const UTF32** sourceStart, const UTF32* sourceEnd,
456                                       UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
457    ConversionResult result = conversionOK;
458    const UTF32* source = *sourceStart;
459    UTF8* target = *targetStart;
460    while (source < sourceEnd) {
461      UTF32 ch;
462      unsigned short bytesToWrite = 0;
463      const UTF32 byteMask = 0xBF;
464      const UTF32 byteMark = 0x80;
465      ch = *source++;
466      if (flags == strictConversion ) {
467  	    /* UTF-16 surrogate values are illegal in UTF-32 */
468  	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
469          --source; /* return to the illegal value itself */
470          result = sourceIllegal;
471          break;
472  	    }
473      }
474      /*
475       * Figure out how many bytes the result will require. Turn any
476       * illegally large UTF32 things (> Plane 17) into replacement chars.
477       */
478      if (ch < (UTF32)0x80) {	     bytesToWrite = 1;
479      } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
480      } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
481      } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
482      } else {			    bytesToWrite = 3;
483        ch = UNI_REPLACEMENT_CHAR;
484        result = sourceIllegal;
485      }
486  
487      target += bytesToWrite;
488      if (target > targetEnd) {
489  	    --source; /* Back up source pointer! */
490  	    target -= bytesToWrite; result = targetExhausted; break;
491      }
492      switch (bytesToWrite) { /* note: everything falls through. */
493        case 4:
494          *--target = (UTF8)((ch | byteMark) & byteMask);
495          ch >>= 6;
496          BP_FALLTHROUGH;
497        case 3:
498          *--target = (UTF8)((ch | byteMark) & byteMask);
499          ch >>= 6;
500          BP_FALLTHROUGH;
501        case 2:
502          *--target = (UTF8)((ch | byteMark) & byteMask);
503          ch >>= 6;
504          BP_FALLTHROUGH;
505        case 1:
506          *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
507      }
508      target += bytesToWrite;
509    }
510  *sourceStart = source;
511  *targetStart = target;
512  return result;
513  }
514  
515  /* --------------------------------------------------------------------- */
516  
517  ConversionResult ConvertUTF8toUTF32 (const UTF8** sourceStart, const UTF8* sourceEnd,
518                                       UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
519    ConversionResult result = conversionOK;
520    const UTF8* source = *sourceStart;
521    UTF32* target = *targetStart;
522    while (source < sourceEnd) {
523      UTF32 ch = 0;
524      unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
525      if (source + extraBytesToRead >= sourceEnd) {
526  	    result = sourceExhausted; break;
527      }
528      /* Do this check whether lenient or strict */
529      if (! isLegalUTF8(source, extraBytesToRead+1)) {
530  	    result = sourceIllegal;
531  	    break;
532      }
533      /*
534       * The cases all fall through. See "Note A" below.
535       */
536      switch (extraBytesToRead) {
537        case 5: ch += *source++; ch <<= 6; BP_FALLTHROUGH;
538        case 4: ch += *source++; ch <<= 6; BP_FALLTHROUGH;
539        case 3: ch += *source++; ch <<= 6; BP_FALLTHROUGH;
540        case 2: ch += *source++; ch <<= 6; BP_FALLTHROUGH;
541        case 1: ch += *source++; ch <<= 6; BP_FALLTHROUGH;
542        case 0: ch += *source++;
543      }
544      ch -= offsetsFromUTF8[extraBytesToRead];
545  
546      if (target >= targetEnd) {
547  	    source -= (extraBytesToRead+1); /* Back up the source pointer! */
548  	    result = targetExhausted; break;
549      }
550      if (ch <= UNI_MAX_LEGAL_UTF32) {
551  	    /*
552  	     * UTF-16 surrogate values are illegal in UTF-32, and anything
553  	     * over Plane 17 (> 0x10FFFF) is illegal.
554  	     */
555  	    if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
556          if (flags == strictConversion) {
557            source -= (extraBytesToRead+1); /* return to the illegal value itself */
558            result = sourceIllegal;
559            break;
560          } else {
561            *target++ = UNI_REPLACEMENT_CHAR;
562          }
563  	    } else {
564          *target++ = ch;
565  	    }
566      } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
567  	    result = sourceIllegal;
568  	    *target++ = UNI_REPLACEMENT_CHAR;
569      }
570    }
571    *sourceStart = source;
572    *targetStart = target;
573    return result;
574  }
575  
576  /* ---------------------------------------------------------------------
577  
578  Note A.
579  The fall-through switches in UTF-8 reading code save a
580  temp variable, some decrements & conditionals.  The switches
581  are equivalent to the following loop:
582  {
583    int tmpBytesToRead = extraBytesToRead+1;
584    do {
585  		ch += *source++;
586  		--tmpBytesToRead;
587  		if (tmpBytesToRead) ch <<= 6;
588    } while (tmpBytesToRead > 0);
589  }
590  In UTF-8 writing code, the switches on "bytesToWrite" are
591  similarly unrolled loops.
592  
593  --------------------------------------------------------------------- */
594  
595  }  // namespace google_breakpad