/ CFBuiltinConverters.c
CFBuiltinConverters.c
   1  /*
   2   * Copyright (c) 2015 Apple Inc. All rights reserved.
   3   *
   4   * @APPLE_LICENSE_HEADER_START@
   5   *
   6   * This file contains Original Code and/or Modifications of Original Code
   7   * as defined in and that are subject to the Apple Public Source License
   8   * Version 2.0 (the 'License'). You may not use this file except in
   9   * compliance with the License. Please obtain a copy of the License at
  10   * http://www.opensource.apple.com/apsl/ and read it before using this
  11   * file.
  12   *
  13   * The Original Code and all software distributed under the License are
  14   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  15   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  16   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  17   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  18   * Please see the License for the specific language governing rights and
  19   * limitations under the License.
  20   *
  21   * @APPLE_LICENSE_HEADER_END@
  22   */
  23  
  24  /*	CFBuiltinConverters.c
  25  	Copyright (c) 1999-2014, Apple Inc. All rights reserved.
  26  	Responsibility: Aki Inoue
  27  */
  28  
  29  #include "CFStringEncodingConverterExt.h"
  30  #include "CFUniChar.h"
  31  #include "CFUnicodeDecomposition.h"
  32  #include "CFUnicodePrecomposition.h"
  33  #include "CFStringEncodingConverterPriv.h"
  34  #include "CFInternal.h"
  35  
  36  #define ParagraphSeparator 0x2029
  37  #define ASCIINewLine 0x0a
  38  static int8_t __CFMapsParagraphSeparator = -1;
  39  
  40  CF_INLINE bool __CFIsParagraphSeparator(UTF16Char character) {
  41      if (-1 == __CFMapsParagraphSeparator) __CFMapsParagraphSeparator = (1 ? false : true);
  42  
  43      return ((__CFMapsParagraphSeparator && (ParagraphSeparator == character)) ? true : false);
  44  }
  45  
  46  /* Precomposition */
  47  static const uint32_t __CFLatin1CombiningCharBitmap[] = { // 0x300 ~ 0x35FF
  48      0xFBB94010, 0x01800000, 0x0000000,
  49  };
  50  
  51  bool CFStringEncodingIsValidCombiningCharacterForLatin1(UniChar character) {
  52      return ((character >= 0x300) && (character < 0x360) && (__CFLatin1CombiningCharBitmap[(character - 0x300) / 32] & (1 << (31 - ((character - 0x300) % 32)))) ? true : false);
  53  }
  54  
  55  UniChar CFStringEncodingPrecomposeLatinCharacter(const UniChar *character, CFIndex numChars, CFIndex *usedChars) {
  56      if (numChars > 0) {
  57          UTF32Char ch = *(character++), nextCh, composedChar;
  58          CFIndex usedCharLen = 1;
  59  
  60          if (CFUniCharIsSurrogateHighCharacter(ch) || CFUniCharIsSurrogateLowCharacter(ch)) {
  61              if (usedChars) (*usedChars) = usedCharLen;
  62              return ch;
  63          }
  64  
  65          while (usedCharLen < numChars) {
  66              nextCh = *(character++);
  67  
  68              if (CFUniCharIsSurrogateHighCharacter(nextCh) || CFUniCharIsSurrogateLowCharacter(nextCh)) break;
  69  
  70              if (CFUniCharIsMemberOf(nextCh, kCFUniCharNonBaseCharacterSet) && ((composedChar = CFUniCharPrecomposeCharacter(ch, nextCh)) != 0xFFFD)) {
  71                  if (composedChar > 0xFFFF) { // Non-base
  72                      break;
  73                  } else {
  74                      ch = composedChar;
  75                  }
  76              } else {
  77                  break;
  78              }
  79              ++usedCharLen;
  80          }
  81          if (usedChars) (*usedChars) = usedCharLen;
  82          if (usedCharLen > 1) return ch;
  83      }
  84      return 0xFFFD;
  85  }
  86  
  87  /* ASCII */
  88  static bool __CFToASCII(uint32_t flags, UniChar character, uint8_t *byte) {
  89      if (character < 0x80) {
  90          *byte = (uint8_t)character;
  91      } else if (__CFIsParagraphSeparator(character)) {
  92          *byte = ASCIINewLine;
  93      } else {
  94          return false;
  95      }
  96      return true;
  97  }
  98  
  99  static bool __CFFromASCII(uint32_t flags, uint8_t byte, UniChar *character) {
 100      if (byte < 0x80) {
 101          *character = (UniChar)byte;
 102          return true;
 103      } else {
 104          return false;
 105      }
 106  }
 107  
 108  
 109  CF_PRIVATE const CFStringEncodingConverter __CFConverterASCII = {
 110      __CFToASCII, __CFFromASCII, 1, 1, kCFStringEncodingConverterCheapEightBit,
 111      NULL, NULL, NULL, NULL, NULL, NULL,
 112  };
 113  
 114  /* ISO Latin 1 (8859-1) */
 115  static bool __CFToISOLatin1(uint32_t flags, UniChar character, uint8_t *byte) {
 116      if (character <= 0xFF) {
 117          *byte = (uint8_t)character;
 118      } else if (__CFIsParagraphSeparator(character)) {
 119          *byte = ASCIINewLine;
 120      } else {
 121          return false;
 122      }
 123  
 124      return true;
 125  }
 126  
 127  static bool __CFFromISOLatin1(uint32_t flags, uint8_t byte, UniChar *character) {
 128      *character = (UniChar)byte;
 129      return true;
 130  }
 131  
 132  static CFIndex __CFToISOLatin1Precompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 133      uint8_t byte;
 134      CFIndex usedCharLen;
 135  
 136      if (__CFToISOLatin1(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
 137          if (maxByteLen) *bytes = byte;
 138          *usedByteLen = 1;
 139          return usedCharLen;
 140      } else {
 141          return 0;
 142      }
 143  }
 144  
 145  CF_PRIVATE const CFStringEncodingConverter __CFConverterISOLatin1 = {
 146      __CFToISOLatin1, __CFFromISOLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit,
 147      NULL, NULL, NULL, NULL, __CFToISOLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
 148  };
 149  
 150  /* Mac Roman */
 151  #define NUM_MACROMAN_FROM_UNI 129
 152  static const CFStringEncodingUnicodeTo8BitCharMap macRoman_from_uni[NUM_MACROMAN_FROM_UNI] = {
 153      { 0x00A0, 0xCA }, /* NO-BREAK SPACE */
 154      { 0x00A1, 0xC1 }, /* INVERTED EXCLAMATION MARK */
 155      { 0x00A2, 0xA2 }, /* CENT SIGN */
 156      { 0x00A3, 0xA3 }, /* POUND SIGN */
 157      { 0x00A5, 0xB4 }, /* YEN SIGN */
 158      { 0x00A7, 0xA4 }, /* SECTION SIGN */
 159      { 0x00A8, 0xAC }, /* DIAERESIS */
 160      { 0x00A9, 0xA9 }, /* COPYRIGHT SIGN */
 161      { 0x00AA, 0xBB }, /* FEMININE ORDINAL INDICATOR */
 162      { 0x00AB, 0xC7 }, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
 163      { 0x00AC, 0xC2 }, /* NOT SIGN */
 164      { 0x00AE, 0xA8 }, /* REGISTERED SIGN */
 165      { 0x00AF, 0xF8 }, /* MACRON */
 166      { 0x00B0, 0xA1 }, /* DEGREE SIGN */
 167      { 0x00B1, 0xB1 }, /* PLUS-MINUS SIGN */
 168      { 0x00B4, 0xAB }, /* ACUTE ACCENT */
 169      { 0x00B5, 0xB5 }, /* MICRO SIGN */
 170      { 0x00B6, 0xA6 }, /* PILCROW SIGN */
 171      { 0x00B7, 0xE1 }, /* MIDDLE DOT */
 172      { 0x00B8, 0xFC }, /* CEDILLA */
 173      { 0x00BA, 0xBC }, /* MASCULINE ORDINAL INDICATOR */
 174      { 0x00BB, 0xC8 }, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
 175      { 0x00BF, 0xC0 }, /* INVERTED QUESTION MARK */
 176      { 0x00C0, 0xCB }, /* LATIN CAPITAL LETTER A WITH GRAVE */
 177      { 0x00C1, 0xE7 }, /* LATIN CAPITAL LETTER A WITH ACUTE */
 178      { 0x00C2, 0xE5 }, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
 179      { 0x00C3, 0xCC }, /* LATIN CAPITAL LETTER A WITH TILDE */
 180      { 0x00C4, 0x80 }, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
 181      { 0x00C5, 0x81 }, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
 182      { 0x00C6, 0xAE }, /* LATIN CAPITAL LIGATURE AE */
 183      { 0x00C7, 0x82 }, /* LATIN CAPITAL LETTER C WITH CEDILLA */
 184      { 0x00C8, 0xE9 }, /* LATIN CAPITAL LETTER E WITH GRAVE */
 185      { 0x00C9, 0x83 }, /* LATIN CAPITAL LETTER E WITH ACUTE */
 186      { 0x00CA, 0xE6 }, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
 187      { 0x00CB, 0xE8 }, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
 188      { 0x00CC, 0xED }, /* LATIN CAPITAL LETTER I WITH GRAVE */
 189      { 0x00CD, 0xEA }, /* LATIN CAPITAL LETTER I WITH ACUTE */
 190      { 0x00CE, 0xEB }, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
 191      { 0x00CF, 0xEC }, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
 192      { 0x00D1, 0x84 }, /* LATIN CAPITAL LETTER N WITH TILDE */
 193      { 0x00D2, 0xF1 }, /* LATIN CAPITAL LETTER O WITH GRAVE */
 194      { 0x00D3, 0xEE }, /* LATIN CAPITAL LETTER O WITH ACUTE */
 195      { 0x00D4, 0xEF }, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
 196      { 0x00D5, 0xCD }, /* LATIN CAPITAL LETTER O WITH TILDE */
 197      { 0x00D6, 0x85 }, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
 198      { 0x00D8, 0xAF }, /* LATIN CAPITAL LETTER O WITH STROKE */
 199      { 0x00D9, 0xF4 }, /* LATIN CAPITAL LETTER U WITH GRAVE */
 200      { 0x00DA, 0xF2 }, /* LATIN CAPITAL LETTER U WITH ACUTE */
 201      { 0x00DB, 0xF3 }, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
 202      { 0x00DC, 0x86 }, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
 203      { 0x00DF, 0xA7 }, /* LATIN SMALL LETTER SHARP S */
 204      { 0x00E0, 0x88 }, /* LATIN SMALL LETTER A WITH GRAVE */
 205      { 0x00E1, 0x87 }, /* LATIN SMALL LETTER A WITH ACUTE */
 206      { 0x00E2, 0x89 }, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
 207      { 0x00E3, 0x8B }, /* LATIN SMALL LETTER A WITH TILDE */
 208      { 0x00E4, 0x8A }, /* LATIN SMALL LETTER A WITH DIAERESIS */
 209      { 0x00E5, 0x8C }, /* LATIN SMALL LETTER A WITH RING ABOVE */
 210      { 0x00E6, 0xBE }, /* LATIN SMALL LIGATURE AE */
 211      { 0x00E7, 0x8D }, /* LATIN SMALL LETTER C WITH CEDILLA */
 212      { 0x00E8, 0x8F }, /* LATIN SMALL LETTER E WITH GRAVE */
 213      { 0x00E9, 0x8E }, /* LATIN SMALL LETTER E WITH ACUTE */
 214      { 0x00EA, 0x90 }, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
 215      { 0x00EB, 0x91 }, /* LATIN SMALL LETTER E WITH DIAERESIS */
 216      { 0x00EC, 0x93 }, /* LATIN SMALL LETTER I WITH GRAVE */
 217      { 0x00ED, 0x92 }, /* LATIN SMALL LETTER I WITH ACUTE */
 218      { 0x00EE, 0x94 }, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
 219      { 0x00EF, 0x95 }, /* LATIN SMALL LETTER I WITH DIAERESIS */
 220      { 0x00F1, 0x96 }, /* LATIN SMALL LETTER N WITH TILDE */
 221      { 0x00F2, 0x98 }, /* LATIN SMALL LETTER O WITH GRAVE */
 222      { 0x00F3, 0x97 }, /* LATIN SMALL LETTER O WITH ACUTE */
 223      { 0x00F4, 0x99 }, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
 224      { 0x00F5, 0x9B }, /* LATIN SMALL LETTER O WITH TILDE */
 225      { 0x00F6, 0x9A }, /* LATIN SMALL LETTER O WITH DIAERESIS */
 226      { 0x00F7, 0xD6 }, /* DIVISION SIGN */
 227      { 0x00F8, 0xBF }, /* LATIN SMALL LETTER O WITH STROKE */
 228      { 0x00F9, 0x9D }, /* LATIN SMALL LETTER U WITH GRAVE */
 229      { 0x00FA, 0x9C }, /* LATIN SMALL LETTER U WITH ACUTE */
 230      { 0x00FB, 0x9E }, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
 231      { 0x00FC, 0x9F }, /* LATIN SMALL LETTER U WITH DIAERESIS */
 232      { 0x00FF, 0xD8 }, /* LATIN SMALL LETTER Y WITH DIAERESIS */
 233      { 0x0131, 0xF5 }, /* LATIN SMALL LETTER DOTLESS I */
 234      { 0x0152, 0xCE }, /* LATIN CAPITAL LIGATURE OE */
 235      { 0x0153, 0xCF }, /* LATIN SMALL LIGATURE OE */
 236      { 0x0178, 0xD9 }, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
 237      { 0x0192, 0xC4 }, /* LATIN SMALL LETTER F WITH HOOK */
 238      { 0x02C6, 0xF6 }, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
 239      { 0x02C7, 0xFF }, /* CARON */
 240      { 0x02D8, 0xF9 }, /* BREVE */
 241      { 0x02D9, 0xFA }, /* DOT ABOVE */
 242      { 0x02DA, 0xFB }, /* RING ABOVE */
 243      { 0x02DB, 0xFE }, /* OGONEK */
 244      { 0x02DC, 0xF7 }, /* SMALL TILDE */
 245      { 0x02DD, 0xFD }, /* DOUBLE ACUTE ACCENT */
 246      { 0x03A9, 0xBD }, /* OHM SIGN (Canonical ?) */
 247      { 0x03C0, 0xB9 }, /* GREEK SMALL LETTER PI */
 248      { 0x2013, 0xD0 }, /* EN DASH */
 249      { 0x2014, 0xD1 }, /* EM DASH */
 250      { 0x2018, 0xD4 }, /* LEFT SINGLE QUOTATION MARK */
 251      { 0x2019, 0xD5 }, /* RIGHT SINGLE QUOTATION MARK */
 252      { 0x201A, 0xE2 }, /* SINGLE LOW-9 QUOTATION MARK */
 253      { 0x201C, 0xD2 }, /* LEFT DOUBLE QUOTATION MARK */
 254      { 0x201D, 0xD3 }, /* RIGHT DOUBLE QUOTATION MARK */
 255      { 0x201E, 0xE3 }, /* DOUBLE LOW-9 QUOTATION MARK */
 256      { 0x2020, 0xA0 }, /* DAGGER */
 257      { 0x2021, 0xE0 }, /* DOUBLE DAGGER */
 258      { 0x2022, 0xA5 }, /* BULLET */
 259      { 0x2026, 0xC9 }, /* HORIZONTAL ELLIPSIS */
 260      { 0x2030, 0xE4 }, /* PER MILLE SIGN */
 261      { 0x2039, 0xDC }, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
 262      { 0x203A, 0xDD }, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
 263      { 0x2044, 0xDA }, /* FRACTION SLASH */
 264      { 0x20AC, 0xDB }, /* EURO SIGN */
 265      { 0x2122, 0xAA }, /* TRADE MARK SIGN */
 266      { 0x2126, 0xBD }, /* OHM SIGN */
 267      { 0x2202, 0xB6 }, /* PARTIAL DIFFERENTIAL */
 268      { 0x2206, 0xC6 }, /* INCREMENT */
 269      { 0x220F, 0xB8 }, /* N-ARY PRODUCT */
 270      { 0x2211, 0xB7 }, /* N-ARY SUMMATION */
 271      { 0x221A, 0xC3 }, /* SQUARE ROOT */
 272      { 0x221E, 0xB0 }, /* INFINITY */
 273      { 0x222B, 0xBA }, /* INTEGRAL */
 274      { 0x2248, 0xC5 }, /* ALMOST EQUAL TO */
 275      { 0x2260, 0xAD }, /* NOT EQUAL TO */
 276      { 0x2264, 0xB2 }, /* LESS-THAN OR EQUAL TO */
 277      { 0x2265, 0xB3 }, /* GREATER-THAN OR EQUAL TO */
 278      { 0x25CA, 0xD7 }, /* LOZENGE */
 279      { 0xF8FF, 0xF0 }, /* Apple logo */
 280      { 0xFB01, 0xDE }, /* LATIN SMALL LIGATURE FI */
 281      { 0xFB02, 0xDF }, /* LATIN SMALL LIGATURE FL */
 282  };
 283  
 284  static bool __CFToMacRoman(uint32_t flags, UniChar character, uint8_t *byte) {
 285      if (character < 0x80) {
 286          *byte = (uint8_t)character;
 287          return true;
 288      } else {
 289          return CFStringEncodingUnicodeTo8BitEncoding(macRoman_from_uni, NUM_MACROMAN_FROM_UNI, character, byte);
 290      }
 291  }
 292  
 293  static const UniChar macRoman_to_uni[128] = {
 294      0x00C4, /* LATIN CAPITAL LETTER A WITH DIAERESIS */
 295      0x00C5, /* LATIN CAPITAL LETTER A WITH RING ABOVE */
 296      0x00C7, /* LATIN CAPITAL LETTER C WITH CEDILLA */
 297      0x00C9, /* LATIN CAPITAL LETTER E WITH ACUTE */
 298      0x00D1, /* LATIN CAPITAL LETTER N WITH TILDE */
 299      0x00D6, /* LATIN CAPITAL LETTER O WITH DIAERESIS */
 300      0x00DC, /* LATIN CAPITAL LETTER U WITH DIAERESIS */
 301      0x00E1, /* LATIN SMALL LETTER A WITH ACUTE */
 302      0x00E0, /* LATIN SMALL LETTER A WITH GRAVE */
 303      0x00E2, /* LATIN SMALL LETTER A WITH CIRCUMFLEX */
 304      0x00E4, /* LATIN SMALL LETTER A WITH DIAERESIS */
 305      0x00E3, /* LATIN SMALL LETTER A WITH TILDE */
 306      0x00E5, /* LATIN SMALL LETTER A WITH RING ABOVE */
 307      0x00E7, /* LATIN SMALL LETTER C WITH CEDILLA */
 308      0x00E9, /* LATIN SMALL LETTER E WITH ACUTE */
 309      0x00E8, /* LATIN SMALL LETTER E WITH GRAVE */
 310      0x00EA, /* LATIN SMALL LETTER E WITH CIRCUMFLEX */
 311      0x00EB, /* LATIN SMALL LETTER E WITH DIAERESIS */
 312      0x00ED, /* LATIN SMALL LETTER I WITH ACUTE */
 313      0x00EC, /* LATIN SMALL LETTER I WITH GRAVE */
 314      0x00EE, /* LATIN SMALL LETTER I WITH CIRCUMFLEX */
 315      0x00EF, /* LATIN SMALL LETTER I WITH DIAERESIS */
 316      0x00F1, /* LATIN SMALL LETTER N WITH TILDE */
 317      0x00F3, /* LATIN SMALL LETTER O WITH ACUTE */
 318      0x00F2, /* LATIN SMALL LETTER O WITH GRAVE */
 319      0x00F4, /* LATIN SMALL LETTER O WITH CIRCUMFLEX */
 320      0x00F6, /* LATIN SMALL LETTER O WITH DIAERESIS */
 321      0x00F5, /* LATIN SMALL LETTER O WITH TILDE */
 322      0x00FA, /* LATIN SMALL LETTER U WITH ACUTE */
 323      0x00F9, /* LATIN SMALL LETTER U WITH GRAVE */
 324      0x00FB, /* LATIN SMALL LETTER U WITH CIRCUMFLEX */
 325      0x00FC, /* LATIN SMALL LETTER U WITH DIAERESIS */
 326      0x2020, /* DAGGER */
 327      0x00B0, /* DEGREE SIGN */
 328      0x00A2, /* CENT SIGN */
 329      0x00A3, /* POUND SIGN */
 330      0x00A7, /* SECTION SIGN */
 331      0x2022, /* BULLET */
 332      0x00B6, /* PILCROW SIGN */
 333      0x00DF, /* LATIN SMALL LETTER SHARP S */
 334      0x00AE, /* REGISTERED SIGN */
 335      0x00A9, /* COPYRIGHT SIGN */
 336      0x2122, /* TRADE MARK SIGN */
 337      0x00B4, /* ACUTE ACCENT */
 338      0x00A8, /* DIAERESIS */
 339      0x2260, /* NOT EQUAL TO */
 340      0x00C6, /* LATIN CAPITAL LIGATURE AE */
 341      0x00D8, /* LATIN CAPITAL LETTER O WITH STROKE */
 342      0x221E, /* INFINITY */
 343      0x00B1, /* PLUS-MINUS SIGN */
 344      0x2264, /* LESS-THAN OR EQUAL TO */
 345      0x2265, /* GREATER-THAN OR EQUAL TO */
 346      0x00A5, /* YEN SIGN */
 347      0x00B5, /* MICRO SIGN */
 348      0x2202, /* PARTIAL DIFFERENTIAL */
 349      0x2211, /* N-ARY SUMMATION */
 350      0x220F, /* N-ARY PRODUCT */
 351      0x03C0, /* GREEK SMALL LETTER PI */
 352      0x222B, /* INTEGRAL */
 353      0x00AA, /* FEMININE ORDINAL INDICATOR */
 354      0x00BA, /* MASCULINE ORDINAL INDICATOR */
 355      0x03A9, /* OHM SIGN (Canonical mapping) */
 356      0x00E6, /* LATIN SMALL LIGATURE AE */
 357      0x00F8, /* LATIN SMALL LETTER O WITH STROKE */
 358      0x00BF, /* INVERTED QUESTION MARK */
 359      0x00A1, /* INVERTED EXCLAMATION MARK */
 360      0x00AC, /* NOT SIGN */
 361      0x221A, /* SQUARE ROOT */
 362      0x0192, /* LATIN SMALL LETTER F WITH HOOK */
 363      0x2248, /* ALMOST EQUAL TO */
 364      0x2206, /* INCREMENT */
 365      0x00AB, /* LEFT-POINTING DOUBLE ANGLE QUOTATION MARK */
 366      0x00BB, /* RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK */
 367      0x2026, /* HORIZONTAL ELLIPSIS */
 368      0x00A0, /* NO-BREAK SPACE */
 369      0x00C0, /* LATIN CAPITAL LETTER A WITH GRAVE */
 370      0x00C3, /* LATIN CAPITAL LETTER A WITH TILDE */
 371      0x00D5, /* LATIN CAPITAL LETTER O WITH TILDE */
 372      0x0152, /* LATIN CAPITAL LIGATURE OE */
 373      0x0153, /* LATIN SMALL LIGATURE OE */
 374      0x2013, /* EN DASH */
 375      0x2014, /* EM DASH */
 376      0x201C, /* LEFT DOUBLE QUOTATION MARK */
 377      0x201D, /* RIGHT DOUBLE QUOTATION MARK */
 378      0x2018, /* LEFT SINGLE QUOTATION MARK */
 379      0x2019, /* RIGHT SINGLE QUOTATION MARK */
 380      0x00F7, /* DIVISION SIGN */
 381      0x25CA, /* LOZENGE */
 382      0x00FF, /* LATIN SMALL LETTER Y WITH DIAERESIS */
 383      0x0178, /* LATIN CAPITAL LETTER Y WITH DIAERESIS */
 384      0x2044, /* FRACTION SLASH */
 385      0x20AC, /* EURO SIGN */
 386      0x2039, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
 387      0x203A, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
 388      0xFB01, /* LATIN SMALL LIGATURE FI */
 389      0xFB02, /* LATIN SMALL LIGATURE FL */
 390      0x2021, /* DOUBLE DAGGER */
 391      0x00B7, /* MIDDLE DOT */
 392      0x201A, /* SINGLE LOW-9 QUOTATION MARK */
 393      0x201E, /* DOUBLE LOW-9 QUOTATION MARK */
 394      0x2030, /* PER MILLE SIGN */
 395      0x00C2, /* LATIN CAPITAL LETTER A WITH CIRCUMFLEX */
 396      0x00CA, /* LATIN CAPITAL LETTER E WITH CIRCUMFLEX */
 397      0x00C1, /* LATIN CAPITAL LETTER A WITH ACUTE */
 398      0x00CB, /* LATIN CAPITAL LETTER E WITH DIAERESIS */
 399      0x00C8, /* LATIN CAPITAL LETTER E WITH GRAVE */
 400      0x00CD, /* LATIN CAPITAL LETTER I WITH ACUTE */
 401      0x00CE, /* LATIN CAPITAL LETTER I WITH CIRCUMFLEX */
 402      0x00CF, /* LATIN CAPITAL LETTER I WITH DIAERESIS */
 403      0x00CC, /* LATIN CAPITAL LETTER I WITH GRAVE */
 404      0x00D3, /* LATIN CAPITAL LETTER O WITH ACUTE */
 405      0x00D4, /* LATIN CAPITAL LETTER O WITH CIRCUMFLEX */
 406      0xF8FF, /* Apple logo */
 407      0x00D2, /* LATIN CAPITAL LETTER O WITH GRAVE */
 408      0x00DA, /* LATIN CAPITAL LETTER U WITH ACUTE */
 409      0x00DB, /* LATIN CAPITAL LETTER U WITH CIRCUMFLEX */
 410      0x00D9, /* LATIN CAPITAL LETTER U WITH GRAVE */
 411      0x0131, /* LATIN SMALL LETTER DOTLESS I */
 412      0x02C6, /* MODIFIER LETTER CIRCUMFLEX ACCENT */
 413      0x02DC, /* SMALL TILDE */
 414      0x00AF, /* MACRON */
 415      0x02D8, /* BREVE */
 416      0x02D9, /* DOT ABOVE */
 417      0x02DA, /* RING ABOVE */
 418      0x00B8, /* CEDILLA */
 419      0x02DD, /* DOUBLE ACUTE ACCENT */
 420      0x02DB, /* OGONEK */
 421      0x02C7, /* CARON */
 422  };
 423  
 424  static bool __CFFromMacRoman(uint32_t flags, uint8_t byte, UniChar *character) {
 425      *character = (byte < 0x80 ? (UniChar)byte : macRoman_to_uni[byte - 0x80]);
 426      return true;
 427  }
 428  
 429  static CFIndex __CFToMacRomanPrecompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 430      uint8_t byte;
 431      CFIndex usedCharLen;
 432  
 433      if (__CFToMacRoman(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
 434          if (maxByteLen) *bytes = byte;
 435          *usedByteLen = 1;
 436          return usedCharLen;
 437      } else {
 438          return 0;
 439      }
 440  }
 441  
 442  CF_PRIVATE const CFStringEncodingConverter __CFConverterMacRoman = {
 443      __CFToMacRoman, __CFFromMacRoman, 1, 1, kCFStringEncodingConverterCheapEightBit,
 444      NULL, NULL, NULL, NULL, __CFToMacRomanPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
 445  };
 446  
 447  /* Win Latin1 (ANSI CodePage 1252) */
 448  #define NUM_1252_FROM_UNI 27
 449  static const CFStringEncodingUnicodeTo8BitCharMap cp1252_from_uni[NUM_1252_FROM_UNI] = {
 450      {0x0152, 0x8C}, // LATIN CAPITAL LIGATURE OE
 451      {0x0153, 0x9C}, // LATIN SMALL LIGATURE OE
 452      {0x0160, 0x8A}, // LATIN CAPITAL LETTER S WITH CARON
 453      {0x0161, 0x9A}, // LATIN SMALL LETTER S WITH CARON
 454      {0x0178, 0x9F}, // LATIN CAPITAL LETTER Y WITH DIAERESIS
 455      {0x017D, 0x8E}, // LATIN CAPITAL LETTER Z WITH CARON
 456      {0x017E, 0x9E}, // LATIN SMALL LETTER Z WITH CARON
 457      {0x0192, 0x83}, // LATIN SMALL LETTER F WITH HOOK
 458      {0x02C6, 0x88}, // MODIFIER LETTER CIRCUMFLEX ACCENT
 459      {0x02DC, 0x98}, // SMALL TILDE
 460      {0x2013, 0x96}, // EN DASH
 461      {0x2014, 0x97}, // EM DASH
 462      {0x2018, 0x91}, // LEFT SINGLE QUOTATION MARK
 463      {0x2019, 0x92}, // RIGHT SINGLE QUOTATION MARK
 464      {0x201A, 0x82}, // SINGLE LOW-9 QUOTATION MARK
 465      {0x201C, 0x93}, // LEFT DOUBLE QUOTATION MARK
 466      {0x201D, 0x94}, // RIGHT DOUBLE QUOTATION MARK
 467      {0x201E, 0x84}, // DOUBLE LOW-9 QUOTATION MARK
 468      {0x2020, 0x86}, // DAGGER
 469      {0x2021, 0x87}, // DOUBLE DAGGER
 470      {0x2022, 0x95}, // BULLET
 471      {0x2026, 0x85}, // HORIZONTAL ELLIPSIS
 472      {0x2030, 0x89}, // PER MILLE SIGN
 473      {0x2039, 0x8B}, // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
 474      {0x203A, 0x9B}, // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
 475      {0x20AC, 0x80}, // EURO SIGN
 476      {0x2122, 0x99}, // TRADE MARK SIGN
 477  };
 478  
 479  static bool __CFToWinLatin1(uint32_t flags, UniChar character, uint8_t *byte) {
 480      if ((character < 0x80) || ((character > 0x9F) && (character <= 0x00FF))) {
 481          *byte = (uint8_t)character;
 482          return true;
 483      }
 484      return CFStringEncodingUnicodeTo8BitEncoding(cp1252_from_uni, NUM_1252_FROM_UNI, character, byte);
 485  }
 486  
 487  static const uint16_t cp1252_to_uni[32] = {
 488      0x20AC, //  EURO SIGN
 489      0xFFFD, //  NOT USED
 490      0x201A, //  SINGLE LOW-9 QUOTATION MARK
 491      0x0192, //  LATIN SMALL LETTER F WITH HOOK
 492      0x201E, //  DOUBLE LOW-9 QUOTATION MARK
 493      0x2026, //  HORIZONTAL ELLIPSIS
 494      0x2020, //  DAGGER
 495      0x2021, //  DOUBLE DAGGER
 496      0x02C6, //  MODIFIER LETTER CIRCUMFLEX ACCENT
 497      0x2030, //  PER MILLE SIGN
 498      0x0160, //  LATIN CAPITAL LETTER S WITH CARON
 499      0x2039, //  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
 500      0x0152, //  LATIN CAPITAL LIGATURE OE
 501      0xFFFD, //  NOT USED
 502      0x017D, //  LATIN CAPITAL LETTER Z WITH CARON
 503      0xFFFD, //  NOT USED
 504      0xFFFD, //  NOT USED
 505      0x2018, //  LEFT SINGLE QUOTATION MARK
 506      0x2019, //  RIGHT SINGLE QUOTATION MARK
 507      0x201C, //  LEFT DOUBLE QUOTATION MARK
 508      0x201D, //  RIGHT DOUBLE QUOTATION MARK
 509      0x2022, //  BULLET
 510      0x2013, //  EN DASH
 511      0x2014, //  EM DASH
 512      0x02DC, //  SMALL TILDE
 513      0x2122, //  TRADE MARK SIGN
 514      0x0161, //  LATIN SMALL LETTER S WITH CARON
 515      0x203A, //  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
 516      0x0153, //  LATIN SMALL LIGATURE OE
 517      0xFFFD, //  NOT USED
 518      0x017E, //  LATIN SMALL LETTER Z WITH CARON
 519      0x0178, //  LATIN CAPITAL LETTER Y WITH DIAERESIS
 520  };
 521  
 522  static bool __CFFromWinLatin1(uint32_t flags, uint8_t byte, UniChar *character) {
 523      *character = (byte < 0x80 || byte > 0x9F ? (UniChar)byte : cp1252_to_uni[byte - 0x80]);
 524      return (*character != 0xFFFD);
 525  }
 526  
 527  static CFIndex __CFToWinLatin1Precompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 528      uint8_t byte;
 529      CFIndex usedCharLen;
 530  
 531      if (__CFToWinLatin1(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
 532          if (maxByteLen) *bytes = byte;
 533          *usedByteLen = 1;
 534          return usedCharLen;
 535      } else {
 536          return 0;
 537      }
 538  }
 539  
 540  CF_PRIVATE const CFStringEncodingConverter __CFConverterWinLatin1 = {
 541      __CFToWinLatin1, __CFFromWinLatin1, 1, 1, kCFStringEncodingConverterCheapEightBit,
 542      NULL, NULL, NULL, NULL, __CFToWinLatin1Precompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
 543  };
 544  
 545  /* NEXTSTEP Encoding */
 546  #define NUM_NEXTSTEP_FROM_UNI	127
 547  
 548  static const CFStringEncodingUnicodeTo8BitCharMap nextstep_from_tab[NUM_NEXTSTEP_FROM_UNI] = {
 549          { 0x00a0, 0x80 },
 550          { 0x00a1, 0xa1 },
 551          { 0x00a2, 0xa2 },
 552          { 0x00a3, 0xa3 },
 553          { 0x00a4, 0xa8 },
 554          { 0x00a5, 0xa5 },
 555          { 0x00a6, 0xb5 },
 556          { 0x00a7, 0xa7 },
 557          { 0x00a8, 0xc8 },
 558          { 0x00a9, 0xa0 },
 559          { 0x00aa, 0xe3 },
 560          { 0x00ab, 0xab },
 561          { 0x00ac, 0xbe },
 562  /*	{ 0x00ad, 0x2d }, <= 96/10/25 rick removed; converts soft-hyphen to hyphen! */
 563          { 0x00ae, 0xb0 },
 564          { 0x00af, 0xc5 },
 565          { 0x00b1, 0xd1 },
 566          { 0x00b2, 0xc9 },
 567          { 0x00b3, 0xcc },
 568          { 0x00b4, 0xc2 },
 569          { 0x00b5, 0x9d },
 570          { 0x00b6, 0xb6 },
 571          { 0x00b7, 0xb4 },
 572          { 0x00b8, 0xcb },
 573          { 0x00b9, 0xc0 },
 574          { 0x00ba, 0xeb },
 575          { 0x00bb, 0xbb },
 576          { 0x00bc, 0xd2 },
 577          { 0x00bd, 0xd3 },
 578          { 0x00be, 0xd4 },
 579          { 0x00bf, 0xbf },
 580          { 0x00c0, 0x81 },
 581          { 0x00c1, 0x82 },
 582          { 0x00c2, 0x83 },
 583          { 0x00c3, 0x84 },
 584          { 0x00c4, 0x85 },
 585          { 0x00c5, 0x86 },
 586          { 0x00c6, 0xe1 },
 587          { 0x00c7, 0x87 },
 588          { 0x00c8, 0x88 },
 589          { 0x00c9, 0x89 },
 590          { 0x00ca, 0x8a },
 591          { 0x00cb, 0x8b },
 592          { 0x00cc, 0x8c },
 593          { 0x00cd, 0x8d },
 594          { 0x00ce, 0x8e },
 595          { 0x00cf, 0x8f },
 596          { 0x00d0, 0x90 },
 597          { 0x00d1, 0x91 },
 598          { 0x00d2, 0x92 },
 599          { 0x00d3, 0x93 },
 600          { 0x00d4, 0x94 },
 601          { 0x00d5, 0x95 },
 602          { 0x00d6, 0x96 },
 603          { 0x00d7, 0x9e },
 604          { 0x00d8, 0xe9 },
 605          { 0x00d9, 0x97 },
 606          { 0x00da, 0x98 },
 607          { 0x00db, 0x99 },
 608          { 0x00dc, 0x9a },
 609          { 0x00dd, 0x9b },
 610          { 0x00de, 0x9c },
 611          { 0x00df, 0xfb },
 612          { 0x00e0, 0xd5 },
 613          { 0x00e1, 0xd6 },
 614          { 0x00e2, 0xd7 },
 615          { 0x00e3, 0xd8 },
 616          { 0x00e4, 0xd9 },
 617          { 0x00e5, 0xda },
 618          { 0x00e6, 0xf1 },
 619          { 0x00e7, 0xdb },
 620          { 0x00e8, 0xdc },
 621          { 0x00e9, 0xdd },
 622          { 0x00ea, 0xde },
 623          { 0x00eb, 0xdf },
 624          { 0x00ec, 0xe0 },
 625          { 0x00ed, 0xe2 },
 626          { 0x00ee, 0xe4 },
 627          { 0x00ef, 0xe5 },
 628          { 0x00f0, 0xe6 },
 629          { 0x00f1, 0xe7 },
 630          { 0x00f2, 0xec },
 631          { 0x00f3, 0xed },
 632          { 0x00f4, 0xee },
 633          { 0x00f5, 0xef },
 634          { 0x00f6, 0xf0 },
 635          { 0x00f7, 0x9f },
 636          { 0x00f8, 0xf9 },
 637          { 0x00f9, 0xf2 },
 638          { 0x00fa, 0xf3 },
 639          { 0x00fb, 0xf4 },
 640          { 0x00fc, 0xf6 },
 641          { 0x00fd, 0xf7 },
 642          { 0x00fe, 0xfc },
 643          { 0x00ff, 0xfd },
 644          { 0x0131, 0xf5 },
 645          { 0x0141, 0xe8 },
 646          { 0x0142, 0xf8 },
 647          { 0x0152, 0xea },
 648          { 0x0153, 0xfa },
 649          { 0x0192, 0xa6 },
 650          { 0x02c6, 0xc3 },
 651          { 0x02c7, 0xcf },
 652          { 0x02cb, 0xc1 },
 653          { 0x02d8, 0xc6 },
 654          { 0x02d9, 0xc7 },
 655          { 0x02da, 0xca },
 656          { 0x02db, 0xce },
 657          { 0x02dc, 0xc4 },
 658          { 0x02dd, 0xcd },
 659          { 0x2013, 0xb1 },
 660          { 0x2014, 0xd0 },
 661          { 0x2019, 0xa9 },
 662          { 0x201a, 0xb8 },
 663          { 0x201c, 0xaa },
 664          { 0x201d, 0xba },
 665          { 0x201e, 0xb9 },
 666          { 0x2020, 0xb2 },
 667          { 0x2021, 0xb3 },
 668          { 0x2022, 0xb7 },
 669          { 0x2026, 0xbc },
 670          { 0x2030, 0xbd },
 671          { 0x2039, 0xac },
 672          { 0x203a, 0xad },
 673          { 0x2044, 0xa4 },
 674          { 0xfb01, 0xae },
 675          { 0xfb02, 0xaf },
 676          { 0xfffd, 0xff },
 677  };
 678  
 679  static bool __CFToNextStepLatin(uint32_t flags, UniChar character, uint8_t *byte) {
 680      if (character < 0x80) {
 681          *byte = (uint8_t)character;
 682          return true;
 683      } else if (__CFIsParagraphSeparator(character)) {
 684          *byte = ASCIINewLine;
 685          return true;
 686      } else {
 687          return CFStringEncodingUnicodeTo8BitEncoding(nextstep_from_tab, NUM_NEXTSTEP_FROM_UNI, character, byte);
 688      }
 689  };
 690  
/* Decode table for the upper half of the NextStep Latin encoding.
 * Entry i holds the Unicode value for byte (0x80 + i); __CFFromNextStepLatin
 * indexes it with (byte - 0x80).  An entry of 0xFFFD marks a byte without a
 * mapping, and __CFFromNextStepLatin reports failure for such bytes.
 * The leading comment on each row gives the decimal byte value and glyph name.
 */
static const UniChar NSToPrecompUnicodeTable[128] = {
        /* NextStep Encoding	Unicode */
        /*  128	figspace */	0x00a0,		/* 0x2007 is fig space */
        /*  129	Agrave */	0x00c0,
        /*  130	Aacute */	0x00c1,
        /*  131	Acircumflex */	0x00c2,
        /*  132	Atilde */	0x00c3,
        /*  133	Adieresis */	0x00c4,
        /*  134	Aring */	0x00c5,
        /*  135	Ccedilla */	0x00c7,
        /*  136	Egrave */	0x00c8,
        /*  137	Eacute */	0x00c9,
        /*  138	Ecircumflex */	0x00ca,
        /*  139	Edieresis */	0x00cb,
        /*  140	Igrave */	0x00cc,
        /*  141	Iacute */	0x00cd,
        /*  142	Icircumflex */	0x00ce,
        /*  143	Idieresis */	0x00cf,
        /*  144	Eth */		0x00d0,
        /*  145	Ntilde */	0x00d1,
        /*  146	Ograve */	0x00d2,
        /*  147	Oacute */	0x00d3,
        /*  148	Ocircumflex */	0x00d4,
        /*  149	Otilde */	0x00d5,
        /*  150	Odieresis */	0x00d6,
        /*  151	Ugrave */	0x00d9,
        /*  152	Uacute */	0x00da,
        /*  153	Ucircumflex */	0x00db,
        /*  154	Udieresis */	0x00dc,
        /*  155	Yacute */	0x00dd,
        /*  156	Thorn */	0x00de,
        /*  157	mu */		0x00b5,
        /*  158	multiply */	0x00d7,
        /*  159	divide */	0x00f7,
        /*  160	copyright */	0x00a9,
        /*  161	exclamdown */	0x00a1,
        /*  162	cent */		0x00a2,
        /*  163	sterling */	0x00a3,
        /*  164	fraction */	0x2044,
        /*  165	yen */		0x00a5,
        /*  166	florin */	0x0192,
        /*  167	section */	0x00a7,
        /*  168	currency */	0x00a4,
        /*  169	quotesingle */	0x2019,
        /*  170	quotedblleft */	0x201c,
        /*  171	guillemotleft */ 0x00ab,
        /*  172	guilsinglleft */ 0x2039,
        /*  173	guilsinglright */ 0x203a,
        /*  174	fi */		0xFB01,
        /*  175	fl */		0xFB02,
        /*  176	registered */	0x00ae,
        /*  177	endash */	0x2013,
        /*  178	dagger */	0x2020,
        /*  179	daggerdbl */	0x2021,
        /*  180	periodcentered */ 0x00b7,
        /*  181	brokenbar */	0x00a6,
        /*  182	paragraph */	0x00b6,
        /*  183	bullet */	0x2022,
        /*  184	quotesinglbase */ 0x201a,
        /*  185	quotedblbase */	0x201e,
        /*  186	quotedblright */ 0x201d,
        /*  187	guillemotright */ 0x00bb,
        /*  188	ellipsis */	0x2026,
        /*  189	perthousand */	0x2030,
        /*  190	logicalnot */	0x00ac,
        /*  191	questiondown */	0x00bf,
        /*  192	onesuperior */	0x00b9,
        /*  193	grave */	0x02cb,
        /*  194	acute */	0x00b4,
        /*  195	circumflex */	0x02c6,
        /*  196	tilde */	0x02dc,
        /*  197	macron */	0x00af,
        /*  198	breve */	0x02d8,
        /*  199	dotaccent */	0x02d9,
        /*  200	dieresis */	0x00a8,
        /*  201	twosuperior */	0x00b2,
        /*  202	ring */		0x02da,
        /*  203	cedilla */	0x00b8,
        /*  204	threesuperior */ 0x00b3,
        /*  205	hungarumlaut */	0x02dd,
        /*  206	ogonek */	0x02db,
        /*  207	caron */	0x02c7,
        /*  208	emdash */	0x2014,
        /*  209	plusminus */	0x00b1,
        /*  210	onequarter */	0x00bc,
        /*  211	onehalf */	0x00bd,
        /*  212	threequarters */ 0x00be,
        /*  213	agrave */	0x00e0,
        /*  214	aacute */	0x00e1,
        /*  215	acircumflex */	0x00e2,
        /*  216	atilde */	0x00e3,
        /*  217	adieresis */	0x00e4,
        /*  218	aring */	0x00e5,
        /*  219	ccedilla */	0x00e7,
        /*  220	egrave */	0x00e8,
        /*  221	eacute */	0x00e9,
        /*  222	ecircumflex */	0x00ea,
        /*  223	edieresis */	0x00eb,
        /*  224	igrave */	0x00ec,
        /*  225	AE */		0x00c6,
        /*  226	iacute */	0x00ed,
        /*  227	ordfeminine */	0x00aa,
        /*  228	icircumflex */	0x00ee,
        /*  229	idieresis */	0x00ef,
        /*  230	eth */		0x00f0,
        /*  231	ntilde */	0x00f1,
        /*  232	Lslash */	0x0141,
        /*  233	Oslash */	0x00d8,
        /*  234	OE */		0x0152,
        /*  235	ordmasculine */	0x00ba,
        /*  236	ograve */	0x00f2,
        /*  237	oacute */	0x00f3,
        /*  238	ocircumflex */	0x00f4,
        /*  239	otilde */	0x00f5,
        /*  240	odieresis */	0x00f6,
        /*  241	ae */		0x00e6,
        /*  242	ugrave */	0x00f9,
        /*  243	uacute */	0x00fa,
        /*  244	ucircumflex */	0x00fb,
        /*  245	dotlessi */	0x0131,
        /*  246	udieresis */	0x00fc,
        /*  247	yacute */	0x00fd,
        /*  248	lslash */	0x0142,
        /*  249	oslash */	0x00f8,
        /*  250	oe */		0x0153,
        /*  251	germandbls */	0x00df,
        /*  252	thorn */	0x00fe,
        /*  253	ydieresis */	0x00ff,
        /*  254	.notdef */	0xFFFD,
        /*  255	.notdef */	0xFFFD
};
 822  
 823  static bool __CFFromNextStepLatin(uint32_t flags, uint8_t byte, UniChar *character) {
 824      return ((*character = (byte < 0x80 ? (UniChar)byte : NSToPrecompUnicodeTable[byte - 0x80])) != 0xFFFD);
 825  }
 826  
 827  static CFIndex __CFToNextStepLatinPrecompose(uint32_t flags, const UniChar *character, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 828      uint8_t byte;
 829      CFIndex usedCharLen;
 830  
 831      if (__CFToNextStepLatin(flags, CFStringEncodingPrecomposeLatinCharacter(character, numChars, &usedCharLen), &byte) && byte && (usedCharLen > 1)) {
 832          if (maxByteLen) *bytes = byte;
 833          *usedByteLen = 1;
 834          return usedCharLen;
 835      } else {
 836          return 0;
 837      }
 838  }
 839  
/* Converter descriptor for the NextStep Latin encoding.
 * The initializer is positional: the first two slots are the per-character
 * encode/decode callbacks defined above, and the last two are the
 * precomposition callback and the Latin-1 combining-character test.
 * NOTE(review): the two `1` values are presumably the maximum bytes per
 * character and characters per byte, and the NULL slots optional callbacks;
 * confirm against the CFStringEncodingConverter declaration, which is not
 * part of this file.
 */
CF_PRIVATE const CFStringEncodingConverter __CFConverterNextStepLatin = {
    __CFToNextStepLatin, __CFFromNextStepLatin, 1, 1, kCFStringEncodingConverterCheapEightBit,
    NULL, NULL, NULL, NULL, __CFToNextStepLatinPrecompose, CFStringEncodingIsValidCombiningCharacterForLatin1,
};
 844  
 845  /* UTF8 */
 846  /*
 847   * Copyright 2001 Unicode, Inc.
 848   * 
 849   * Disclaimer
 850   * 
 851   * This source code is provided as is by Unicode, Inc. No claims are
 852   * made as to fitness for any particular purpose. No warranties of any
 853   * kind are expressed or implied. The recipient agrees to determine
 854   * applicability of information provided. If this file has been
 855   * purchased on magnetic or optical media from Unicode, Inc., the
 856   * sole remedy for any claim will be exchange of defective media
 857   * within 90 days of receipt.
 858   * 
 859   * Limitations on Rights to Redistribute This Code
 860   * 
 861   * Unicode, Inc. hereby grants the right to freely use the information
 862   * supplied in this file in the creation of products supporting the
 863   * Unicode Standard, and to make copies of this file in any form
 864   * for internal or external distribution as long as this notice
 865   * remains attached.
 866   */
 867  
/* Value limits used by the UTF-8 converters below. */
static const uint32_t kReplacementCharacter =   0x0000FFFDUL;	/* U+FFFD, emitted in place of unconvertible input */
static const uint32_t kMaximumUCS2 =		0x0000FFFFUL;	/* largest value stored as a single UTF-16 unit */
static const uint32_t kMaximumUTF16 =		0x0010FFFFUL;	/* largest value representable as a surrogate pair */
static const uint32_t kMaximumUCS4 =		0x7FFFFFFFUL;	/* largest value the 6-byte UTF-8 form can carry */

/* Surrogate-pair arithmetic: a value above kMaximumUCS2 has halfBase
 * subtracted and is then split into a high part (>> halfShift) and a low part
 * (& halfMask), which are added to the respective surrogate range starts.
 */
static const int halfShift			= 10;
static const uint32_t halfBase		= 0x0010000UL;
static const uint32_t halfMask		= 0x3FFUL;
static const uint32_t kSurrogateHighStart	= 0xD800UL;
static const uint32_t kSurrogateHighEnd	= 0xDBFFUL;
static const uint32_t kSurrogateLowStart	= 0xDC00UL;
static const uint32_t kSurrogateLowEnd	= 0xDFFFUL;
 880  
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 *
 * Bytes 0x00-0xBF (ASCII and bare continuation bytes) map to 0, 0xC0-0xDF
 * to 1, 0xE0-0xEF to 2 and 0xF0-0xF7 to 3.  Bytes 0xF8-0xFF map to 4 or 5;
 * the decoders in this file refuse any sequence with more than 3 trailing
 * bytes.
 */
static const char trailingBytesForUTF8[256] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
 895  
/*
 * Magic values subtracted from a buffer value during UTF8 conversion.
 * This table contains as many values as there might be trailing bytes
 * in a UTF-8 sequence.
 *
 * The decoders add the raw bytes of a sequence together, shifting left by 6
 * between bytes, and then subtract the entry indexed by the trailing-byte
 * count.  Each entry is the sum of the marker bits that the accumulation
 * picks up, e.g. entry 1 is (0xC0 << 6) + 0x80 and entry 2 is
 * (0xE0 << 12) + (0x80 << 6) + 0x80.
 */
static const UTF32Char offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, 
					 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
 903  
/* Marker bits OR-ed into the first byte of an encoded sequence, indexed by
 * the total sequence length in bytes (see __CFToUTF8Core).  Index 0 is unused
 * and index 1 (ASCII) adds no bits.
 */
static const uint8_t firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
 905  
 906  /* This code is similar in effect to making successive calls on the mbtowc and wctomb routines in FSS-UTF. However, it is considerably different in code:
 907          * it is adapted to be consistent with UTF16,
 908          * constants have been gathered.
 909          * loops & conditionals have been removed as much as possible for
 910          * efficiency, in favor of drop-through switch statements.
 911  */
 912  
 913  CF_INLINE uint16_t __CFUTF8BytesToWriteForCharacter(uint32_t ch) {
 914      if (ch < 0x80) return  1;
 915      else if (ch < 0x800) return 2;
 916      else if (ch < 0x10000) return 3;
 917      else if (ch < 0x200000) return 4;
 918      else if (ch < 0x4000000) return 5;
 919      else if (ch <= kMaximumUCS4) return 6;
 920      else return 0;
 921  }
 922  
 923  CF_INLINE uint16_t __CFToUTF8Core(uint32_t ch, uint8_t *bytes, uint32_t maxByteLen) {
 924      uint16_t bytesToWrite = __CFUTF8BytesToWriteForCharacter(ch);
 925      const uint32_t byteMask = 0xBF;
 926      const uint32_t byteMark = 0x80;
 927  
 928      if (!bytesToWrite) {
 929          bytesToWrite = 2;
 930          ch = kReplacementCharacter;
 931      }
 932  
 933      if (maxByteLen < bytesToWrite) return 0;
 934  
 935      switch (bytesToWrite) {	/* note: code falls through cases! */
 936          case 6: bytes[5] = (ch | byteMark) & byteMask; ch >>= 6;
 937          case 5: bytes[4] = (ch | byteMark) & byteMask; ch >>= 6;
 938          case 4: bytes[3] = (ch | byteMark) & byteMask; ch >>= 6;
 939          case 3: bytes[2] = (ch | byteMark) & byteMask; ch >>= 6;
 940          case 2: bytes[1] = (ch | byteMark) & byteMask; ch >>= 6;
 941          case 1: bytes[0] =  ch | firstByteMark[bytesToWrite];
 942      }
 943      return bytesToWrite;
 944  }
 945  
 946  static CFIndex __CFToUTF8(uint32_t flags, const UniChar *characters, CFIndex numChars, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
 947      uint16_t bytesWritten;
 948      uint32_t ch;
 949      const UniChar *beginCharacter = characters;
 950      const UniChar *endCharacter = characters + numChars;
 951      const uint8_t *beginBytes = bytes;
 952      const uint8_t *endBytes = bytes + maxByteLen;
 953      bool isStrict = (flags & kCFStringEncodingUseHFSPlusCanonical ? false : true);
 954  
 955      while ((characters < endCharacter) && (!maxByteLen || (bytes < endBytes))) {
 956          ch = *(characters++);
 957  
 958          if (ch < 0x80) { // ASCII
 959              if (maxByteLen) *bytes = ch;
 960              ++bytes;
 961          } else {
 962              if (ch >= kSurrogateHighStart) {
 963                  if (ch <= kSurrogateHighEnd) {
 964                      if ((characters < endCharacter) && ((*characters >= kSurrogateLowStart) && (*characters <= kSurrogateLowEnd))) {
 965                          ch = ((ch - kSurrogateHighStart) << halfShift) + (*(characters++) - kSurrogateLowStart) + halfBase;
 966                      } else if (isStrict) {
 967                          --characters;
 968                          break;
 969                      }
 970                  } else if (isStrict && (ch <= kSurrogateLowEnd)) {
 971                      --characters;
 972                      break;
 973                  }
 974              }
 975      
 976              if (!(bytesWritten = (maxByteLen ? __CFToUTF8Core(ch, bytes, endBytes - bytes) : __CFUTF8BytesToWriteForCharacter(ch)))) {
 977                  characters -= (ch < 0x10000 ? 1 : 2);
 978                  break;
 979              }
 980              bytes += bytesWritten;
 981          }
 982      }
 983  
 984      if (usedByteLen) *usedByteLen = bytes - beginBytes;
 985      return characters - beginCharacter;
 986  }
 987  
 988  /*
 989   * Utility routine to tell whether a sequence of bytes is legal UTF-8.
 990   * This must be called with the length pre-determined by the first byte.
 991   * If not calling this from ConvertUTF8to*, then the length can be set by:
 992   *	length = trailingBytesForUTF8[*source]+1;
 993   * and the sequence is illegal right away if there aren't that many bytes
 994   * available.
 995   * If presented with a length > 4, this returns false.  The Unicode
 996   * definition of UTF-8 goes up to 4-byte sequences.
 997   */
 998  
 999  CF_INLINE bool __CFIsLegalUTF8(const uint8_t *source, CFIndex length) {
1000      if (length > 4) return false;
1001  
1002      const uint8_t *srcptr = source+length;
1003      uint8_t head = *source;
1004  
1005      while (--srcptr > source) if ((*srcptr & 0xC0) != 0x80) return false;
1006  
1007      if (((head >= 0x80) && (head < 0xC2)) || (head > 0xF4)) return false;
1008  
1009      if (((head == 0xE0) && (*(source + 1) < 0xA0)) || ((head == 0xED) && (*(source + 1) > 0x9F)) || ((head == 0xF0) && (*(source + 1) < 0x90)) || ((head == 0xF4) && (*(source + 1) > 0x8F))) return false;
1010      return true;
1011  }
1012  
1013  static CFIndex __CFFromUTF8(uint32_t flags, const uint8_t *bytes, CFIndex numBytes, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
1014      const uint8_t *source = bytes;
1015      uint16_t extraBytesToRead;
1016      CFIndex theUsedCharLen = 0;
1017      uint32_t ch;
1018      bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
1019      bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
1020      bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
1021      UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
1022      CFIndex decompLength;
1023      bool isStrict = !isHFSPlus;
1024  
1025      while (numBytes && (!maxCharLen || (theUsedCharLen < maxCharLen))) {
1026          extraBytesToRead = trailingBytesForUTF8[*source];
1027  
1028          if (extraBytesToRead > --numBytes) break;
1029          numBytes -= extraBytesToRead;
1030  
1031          /* Do this check whether lenient or strict */
1032          // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
1033          // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
1034          if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) {
1035              if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
1036                  numBytes += extraBytesToRead;
1037                  ++source;
1038                  if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
1039                  ++theUsedCharLen;
1040                  continue;
1041              } else {
1042                  break;
1043              }
1044          }
1045  
1046          ch = 0;
1047          /*
1048           * The cases all fall through. See "Note A" below.
1049           */
1050          switch (extraBytesToRead) {
1051              case 3:	ch += *source++; ch <<= 6;
1052              case 2:	ch += *source++; ch <<= 6;
1053              case 1:	ch += *source++; ch <<= 6;
1054              case 0:	ch += *source++;
1055          }
1056          ch -= offsetsFromUTF8[extraBytesToRead];
1057  
1058          if (ch <= kMaximumUCS2) {
1059              if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
1060                  source -= (extraBytesToRead + 1);
1061                  break;
1062              }
1063              if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1064                  decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1065  
1066                  if (maxCharLen) {
1067                      if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) break;
1068                  } else {
1069                      theUsedCharLen += decompLength;
1070                  }
1071              } else {
1072                  if (maxCharLen) *(characters++) = (UTF16Char)ch;
1073                  ++theUsedCharLen;
1074              }
1075          } else if (ch > kMaximumUTF16) {
1076              if (isStrict) {
1077                  source -= (extraBytesToRead + 1);
1078                  break;
1079              }
1080              if (maxCharLen) *(characters++) = (UTF16Char)kReplacementCharacter;
1081              ++theUsedCharLen;
1082          } else {
1083              if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1084                  decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1085  
1086                  if (maxCharLen) {
1087                      if (!CFUniCharFillDestinationBuffer(decomposed, decompLength, (void **)&characters, maxCharLen, &theUsedCharLen, kCFUniCharUTF16Format)) break;
1088                  } else {
1089                      while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
1090                  }
1091              } else {
1092                  if (maxCharLen) {
1093                      if ((theUsedCharLen + 2) > maxCharLen) break;
1094                      ch -= halfBase;
1095                      *(characters++) = (ch >> halfShift) + kSurrogateHighStart;
1096                      *(characters++) = (ch & halfMask) + kSurrogateLowStart;
1097                  }
1098                  theUsedCharLen += 2;
1099              }
1100          }
1101      }
1102  
1103      if (usedCharLen) *usedCharLen = theUsedCharLen;
1104  
1105      return source - bytes;
1106  }
1107  
1108  static CFIndex __CFToUTF8Len(uint32_t flags, const UniChar *characters, CFIndex numChars) {
1109      uint32_t bytesToWrite = 0;
1110      uint32_t ch;
1111  
1112      while (numChars) {
1113          ch = *characters++;
1114          numChars--;
1115          if ((ch >= kSurrogateHighStart && ch <= kSurrogateHighEnd) && numChars && (*characters >= kSurrogateLowStart && *characters <= kSurrogateLowEnd)) {
1116              ch = ((ch - kSurrogateHighStart) << halfShift) + (*characters++ - kSurrogateLowStart) + halfBase;
1117              numChars--;
1118          }
1119          bytesToWrite += __CFUTF8BytesToWriteForCharacter(ch);
1120      }
1121  
1122      return bytesToWrite;
1123  }
1124  
1125  static CFIndex __CFFromUTF8Len(uint32_t flags, const uint8_t *source, CFIndex numBytes) {
1126      uint16_t extraBytesToRead;
1127      CFIndex theUsedCharLen = 0;
1128      uint32_t ch;
1129      bool isHFSPlus = (flags & kCFStringEncodingUseHFSPlusCanonical ? true : false);
1130      bool needsToDecompose = (flags & kCFStringEncodingUseCanonical || isHFSPlus ? true : false);
1131      bool strictUTF8 = (flags & kCFStringEncodingLenientUTF8Conversion ? false : true);
1132      UTF32Char decomposed[MAX_DECOMPOSED_LENGTH];
1133      CFIndex decompLength;
1134      bool isStrict = !isHFSPlus;
1135  
1136      while (numBytes) {
1137          extraBytesToRead = trailingBytesForUTF8[*source];
1138  
1139          if (extraBytesToRead > --numBytes) break;
1140          numBytes -= extraBytesToRead;
1141  
1142          /* Do this check whether lenient or strict */
1143          // We need to allow 0xA9 (copyright in MacRoman and Unicode) not to break existing apps
1144          // Will use a flag passed in from upper layers to switch restriction mode for this case in the next release
1145          if ((extraBytesToRead > 3) || (strictUTF8 && !__CFIsLegalUTF8(source, extraBytesToRead + 1))) {
1146              if ((*source == 0xA9) || (flags & kCFStringEncodingAllowLossyConversion)) {
1147                  numBytes += extraBytesToRead;
1148                  ++source;
1149                  ++theUsedCharLen;
1150                  continue;
1151              } else {
1152                  break;
1153              }
1154          }
1155  
1156  
1157          ch = 0;
1158          /*
1159           * The cases all fall through. See "Note A" below.
1160           */
1161          switch (extraBytesToRead) {
1162              case 3:	ch += *source++; ch <<= 6;
1163              case 2:	ch += *source++; ch <<= 6;
1164              case 1:	ch += *source++; ch <<= 6;
1165              case 0:	ch += *source++;
1166          }
1167          ch -= offsetsFromUTF8[extraBytesToRead];
1168  
1169          if (ch <= kMaximumUCS2) {
1170              if (isStrict && (ch >= kSurrogateHighStart && ch <= kSurrogateLowEnd)) {
1171                  break;
1172              }
1173              if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1174                  decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1175                  theUsedCharLen += decompLength;
1176              } else {
1177                  ++theUsedCharLen;
1178              }
1179          } else if (ch > kMaximumUTF16) {
1180              ++theUsedCharLen;
1181          } else {
1182              if (needsToDecompose && CFUniCharIsDecomposableCharacter(ch, isHFSPlus)) {
1183                  decompLength = CFUniCharDecomposeCharacter(ch, decomposed, MAX_DECOMPOSED_LENGTH);
1184                  while (--decompLength >= 0) theUsedCharLen += (decomposed[decompLength] < 0x10000 ? 1 : 2);
1185              } else {
1186                  theUsedCharLen += 2;
1187              }
1188          }
1189      }
1190  
1191      return theUsedCharLen;
1192  }
1193  
/* Converter descriptor for UTF-8.
 * The initializer is positional: the first two slots are the buffer
 * conversion callbacks and the first two slots of the second row the
 * matching length-estimation callbacks, all defined above.
 * NOTE(review): the values 3 and 2 are presumably the maximum bytes per
 * UTF-16 unit and the maximum UTF-16 units per decoded character, and the
 * NULL slots optional callbacks; confirm against the
 * CFStringEncodingConverter declaration, which is not part of this file.
 */
CF_PRIVATE const CFStringEncodingConverter __CFConverterUTF8 = {
    __CFToUTF8, __CFFromUTF8, 3, 2, kCFStringEncodingConverterStandard,
    __CFToUTF8Len, __CFFromUTF8Len, NULL, NULL, NULL, NULL,
};