/*
 * Copyright (c) 2015 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/*	CFStringUtilities.c
	Copyright (c) 1999-2014, Apple Inc. All rights reserved.
26 Responsibility: Aki Inoue 27 */ 28 29 #include "CFInternal.h" 30 #include <CoreFoundation/CFStringEncodingConverterExt.h> 31 #include <CoreFoundation/CFUniChar.h> 32 #include <CoreFoundation/CFStringEncodingExt.h> 33 #include "CFStringEncodingDatabase.h" 34 #include "CFICUConverters.h" 35 #include <limits.h> 36 #include <stdlib.h> 37 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 38 #include <unicode/ucol.h> 39 #include <unicode/ucoleitr.h> 40 #endif 41 #include <string.h> 42 43 #if DEPLOYMENT_TARGET_WINDOWS 44 #include <tchar.h> 45 #endif 46 47 48 Boolean CFStringIsEncodingAvailable(CFStringEncoding theEncoding) { 49 switch (theEncoding) { 50 case kCFStringEncodingASCII: // Built-in encodings 51 case kCFStringEncodingMacRoman: 52 case kCFStringEncodingUTF8: 53 case kCFStringEncodingNonLossyASCII: 54 case kCFStringEncodingWindowsLatin1: 55 case kCFStringEncodingNextStepLatin: 56 case kCFStringEncodingUTF16: 57 case kCFStringEncodingUTF16BE: 58 case kCFStringEncodingUTF16LE: 59 case kCFStringEncodingUTF32: 60 case kCFStringEncodingUTF32BE: 61 case kCFStringEncodingUTF32LE: 62 return true; 63 64 default: 65 return CFStringEncodingIsValidEncoding(theEncoding); 66 } 67 } 68 69 const CFStringEncoding* CFStringGetListOfAvailableEncodings() { 70 return (const CFStringEncoding *)CFStringEncodingListOfAvailableEncodings(); 71 } 72 73 CFStringRef CFStringGetNameOfEncoding(CFStringEncoding theEncoding) { 74 static CFMutableDictionaryRef mappingTable = NULL; 75 static OSSpinLock mappingTableLock = OS_SPINLOCK_INIT; 76 77 CFStringRef theName = NULL; 78 79 if (mappingTable) { 80 OSSpinLockLock(&mappingTableLock); 81 theName = (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)(uintptr_t)theEncoding); 82 OSSpinLockUnlock(&mappingTableLock); 83 } 84 85 if (!theName) { 86 const char *encodingName = __CFStringEncodingGetName(theEncoding); 87 88 if (encodingName) { 89 theName = 
CFStringCreateWithCString(kCFAllocatorSystemDefault, encodingName, kCFStringEncodingASCII); 90 } 91 92 if (theName) { 93 OSSpinLockLock(&mappingTableLock); 94 95 CFStringRef result = NULL; 96 if (!mappingTable) { 97 mappingTable = CFDictionaryCreateMutable(kCFAllocatorSystemDefault, 0, (const CFDictionaryKeyCallBacks *)NULL, &kCFTypeDictionaryValueCallBacks); 98 } else { // Check to see if this got in the dictionary in the meantime 99 result = (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)(uintptr_t)theEncoding); 100 } 101 if (!result) { // If not, add it in 102 CFDictionaryAddValue(mappingTable, (const void*)(uintptr_t)theEncoding, (const void*)theName); 103 OSSpinLockUnlock(&mappingTableLock); 104 CFRelease(theName); 105 } else { // Otherwise use the one already in there 106 OSSpinLockUnlock(&mappingTableLock); 107 CFRelease(theName); 108 theName = result; 109 } 110 } 111 } 112 113 return theName; 114 } 115 116 CFStringEncoding CFStringConvertIANACharSetNameToEncoding(CFStringRef charsetName) { 117 CFStringEncoding encoding = kCFStringEncodingInvalidId; 118 #define BUFFER_SIZE (100) 119 char buffer[BUFFER_SIZE]; 120 const char *name = CFStringGetCStringPtr(charsetName, __CFStringGetEightBitStringEncoding()); 121 122 if (NULL == name) { 123 if (false == CFStringGetCString(charsetName, buffer, BUFFER_SIZE, __CFStringGetEightBitStringEncoding())) return kCFStringEncodingInvalidId; 124 125 name = buffer; 126 } 127 128 encoding = __CFStringEncodingGetFromCanonicalName(name); 129 130 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 131 if (kCFStringEncodingInvalidId == encoding) encoding = __CFStringEncodingGetFromICUName(name); 132 #endif 133 134 135 // handling Java name variant for MS codepages 136 if ((kCFStringEncodingInvalidId == encoding) && !strncasecmp(name, "ms950", strlen("ms950"))) { // <rdar://problem/12903398> “MS950” is not recognized 137 encoding = 
__CFStringEncodingGetFromCanonicalName("cp950"); 138 } 139 140 return encoding; 141 } 142 143 CFStringRef CFStringConvertEncodingToIANACharSetName(CFStringEncoding encoding) { 144 CFStringRef name = NULL; 145 CFIndex value = encoding; 146 static CFMutableDictionaryRef mappingTable = NULL; 147 static CFLock_t lock = CFLockInit; 148 149 __CFLock(&lock); 150 name = ((NULL == mappingTable) ? NULL : (CFStringRef)CFDictionaryGetValue(mappingTable, (const void*)value)); 151 152 if (NULL == name) { 153 #define STACK_BUFFER_SIZE (100) 154 char buffer[STACK_BUFFER_SIZE]; 155 156 if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) name = CFStringCreateWithCString(NULL, buffer, kCFStringEncodingASCII); 157 158 159 if (NULL != name) { 160 CFIndex value = encoding; 161 162 if (NULL == mappingTable) mappingTable = CFDictionaryCreateMutable(NULL, 0, NULL, &kCFTypeDictionaryValueCallBacks); 163 164 CFDictionaryAddValue(mappingTable, (const void*)value, (const void*)name); 165 CFRelease(name); 166 } 167 } 168 __CFUnlock(&lock); 169 170 return name; 171 } 172 173 #ifndef __OBJC__ 174 enum { 175 NSASCIIStringEncoding = 1, /* 0..127 only */ 176 NSNEXTSTEPStringEncoding = 2, 177 NSJapaneseEUCStringEncoding = 3, 178 NSUTF8StringEncoding = 4, 179 NSISOLatin1StringEncoding = 5, 180 NSSymbolStringEncoding = 6, 181 NSNonLossyASCIIStringEncoding = 7, 182 NSShiftJISStringEncoding = 8, 183 NSISOLatin2StringEncoding = 9, 184 NSUnicodeStringEncoding = 10, 185 NSWindowsCP1251StringEncoding = 11, /* Cyrillic; same as AdobeStandardCyrillic */ 186 NSWindowsCP1252StringEncoding = 12, /* WinLatin1 */ 187 NSWindowsCP1253StringEncoding = 13, /* Greek */ 188 NSWindowsCP1254StringEncoding = 14, /* Turkish */ 189 NSWindowsCP1250StringEncoding = 15, /* WinLatin2 */ 190 NSISO2022JPStringEncoding = 21, /* ISO 2022 Japanese encoding for e-mail */ 191 NSMacOSRomanStringEncoding = 30, 192 193 NSProprietaryStringEncoding = 65536 /* Installation-specific encoding */ 194 }; 195 #endif 196 
197 #define NSENCODING_MASK (1 << 31) 198 199 unsigned long CFStringConvertEncodingToNSStringEncoding(CFStringEncoding theEncoding) { 200 switch (theEncoding & 0xFFF) { 201 case kCFStringEncodingUnicode: 202 if (theEncoding == kCFStringEncodingUTF16) return NSUnicodeStringEncoding; 203 else if (theEncoding == kCFStringEncodingUTF8) return NSUTF8StringEncoding; 204 break; 205 206 case kCFStringEncodingWindowsLatin1: return NSWindowsCP1252StringEncoding; 207 case kCFStringEncodingMacRoman: return NSMacOSRomanStringEncoding; 208 209 case kCFStringEncodingASCII: return NSASCIIStringEncoding; 210 211 case kCFStringEncodingDOSJapanese: return NSShiftJISStringEncoding; 212 case kCFStringEncodingWindowsCyrillic: return NSWindowsCP1251StringEncoding; 213 case kCFStringEncodingWindowsGreek: return NSWindowsCP1253StringEncoding; 214 case kCFStringEncodingWindowsLatin5: return NSWindowsCP1254StringEncoding; 215 case kCFStringEncodingWindowsLatin2: return NSWindowsCP1250StringEncoding; 216 case kCFStringEncodingISOLatin1: return NSISOLatin1StringEncoding; 217 218 case kCFStringEncodingNonLossyASCII: return NSNonLossyASCIIStringEncoding; 219 case kCFStringEncodingEUC_JP: return NSJapaneseEUCStringEncoding; 220 case kCFStringEncodingMacSymbol: return NSSymbolStringEncoding; 221 case kCFStringEncodingISOLatin2: return NSISOLatin2StringEncoding; 222 case kCFStringEncodingISO_2022_JP: return NSISO2022JPStringEncoding; 223 case kCFStringEncodingNextStepLatin: return NSNEXTSTEPStringEncoding; 224 } 225 226 return NSENCODING_MASK | theEncoding; 227 } 228 229 CFStringEncoding CFStringConvertNSStringEncodingToEncoding(unsigned long theEncoding) { 230 const uint16_t encodings[] = { 231 kCFStringEncodingASCII, 232 kCFStringEncodingNextStepLatin, 233 kCFStringEncodingEUC_JP, 234 0, 235 kCFStringEncodingISOLatin1, 236 kCFStringEncodingMacSymbol, 237 kCFStringEncodingNonLossyASCII, 238 kCFStringEncodingDOSJapanese, 239 kCFStringEncodingISOLatin2, 240 kCFStringEncodingUTF16, 241 
kCFStringEncodingWindowsCyrillic, 242 kCFStringEncodingWindowsLatin1, 243 kCFStringEncodingWindowsGreek, 244 kCFStringEncodingWindowsLatin5, 245 kCFStringEncodingWindowsLatin2 246 }; 247 248 if (NSUTF8StringEncoding == theEncoding) return kCFStringEncodingUTF8; 249 250 if ((theEncoding > 0) && (theEncoding <= NSWindowsCP1250StringEncoding)) return encodings[theEncoding - 1]; 251 252 switch (theEncoding) { 253 case NSMacOSRomanStringEncoding: return kCFStringEncodingMacRoman; 254 case NSISO2022JPStringEncoding: return kCFStringEncodingISO_2022_JP; 255 256 default: 257 return ((theEncoding & NSENCODING_MASK) ? theEncoding & ~NSENCODING_MASK : kCFStringEncodingInvalidId); 258 } 259 } 260 261 UInt32 CFStringConvertEncodingToWindowsCodepage(CFStringEncoding theEncoding) { 262 uint16_t codepage = __CFStringEncodingGetWindowsCodePage(theEncoding); 263 264 return ((0 == codepage) ? kCFStringEncodingInvalidId : codepage); 265 } 266 267 CFStringEncoding CFStringConvertWindowsCodepageToEncoding(UInt32 theEncoding) { 268 return __CFStringEncodingGetFromWindowsCodePage(theEncoding); 269 } 270 271 CFStringEncoding CFStringGetMostCompatibleMacStringEncoding(CFStringEncoding encoding) { 272 CFStringEncoding macEncoding = __CFStringEncodingGetMostCompatibleMacScript(encoding); 273 274 275 return macEncoding; 276 } 277 278 #define kCFStringCompareAllocationIncrement (128) 279 280 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 281 282 // ------------------------------------------------------------------------------------------------- 283 // CompareSpecials - ignore case & diacritic differences 284 // 285 // Decomposed have 2nd-4th chars of type Mn or Mc, or in range 1160-11FF (jamo) 286 // Fullwidth & halfwidth are in range FF00-FFEF 287 // Parenthesized & circled are in range 3200-32FF 288 // ------------------------------------------------------------------------------------------------- 289 290 enum { 291 
kUpperCaseWeightMin = 0x80 | 0x0F, 292 kUpperCaseWeightMax = 0x80 | 0x17, 293 kUpperToLowerDelta = 0x80 | 0x0A, // 0x0A = 0x0F - 0x05 294 kMaskPrimarySecondary = 0xFFFFFF00, 295 kMaskPrimaryOnly = 0xFFFF0000, 296 kMaskSecondaryOnly = 0x0000FF00, 297 kMaskCaseTertiary = 0x000000FF // 2 hi bits case, 6 lo bits tertiary 298 }; 299 300 static SInt32 __CompareSpecials(const UCollator *collator, CFOptionFlags options, const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length) { 301 UErrorCode icuStatus = U_ZERO_ERROR; 302 SInt32 orderWidth = 0; 303 SInt32 orderCompos = 0; 304 305 UCollationElements * collElems1 = ucol_openElements(collator, (const UChar *)text1Ptr, text1Length, &icuStatus); 306 UCollationElements * collElems2 = ucol_openElements(collator, (const UChar *)text2Ptr, text2Length, &icuStatus); 307 if (U_SUCCESS(icuStatus)) { 308 int32_t startOffset1 = 0; 309 int32_t startOffset2 = 0; 310 311 while (true) { 312 int32_t elemOrder1, elemOrder2; 313 int32_t offset1, offset2; 314 315 elemOrder1 = ucol_next(collElems1, &icuStatus); 316 elemOrder2 = ucol_next(collElems2, &icuStatus); 317 if ( U_FAILURE(icuStatus) || elemOrder1 == (int32_t)UCOL_NULLORDER || elemOrder2 == (int32_t)UCOL_NULLORDER ) { 318 break; 319 } 320 321 offset1 = ucol_getOffset(collElems1); 322 offset2 = ucol_getOffset(collElems2); 323 if ( (elemOrder1 & kMaskPrimarySecondary) == (elemOrder2 & kMaskPrimarySecondary) ) { 324 if ( (elemOrder1 & kMaskPrimaryOnly) != 0 ) { 325 // keys may differ in case, width, circling, etc. 
326 327 int32_t tertiary1 = (elemOrder1 & kMaskCaseTertiary); 328 int32_t tertiary2 = (elemOrder2 & kMaskCaseTertiary); 329 // fold upper to lower case 330 if (tertiary1 >= kUpperCaseWeightMin && tertiary1 <= kUpperCaseWeightMax) { 331 tertiary1 -= kUpperToLowerDelta; 332 } 333 if (tertiary2 >= kUpperCaseWeightMin && tertiary2 <= kUpperCaseWeightMax) { 334 tertiary2 -= kUpperToLowerDelta; 335 } 336 // now compare 337 if (tertiary1 != tertiary2) { 338 orderWidth = (tertiary1 < tertiary2)? -1: 1; 339 break; 340 } 341 342 } else if ( (elemOrder1 & kMaskSecondaryOnly) != 0 ) { 343 // primary weights are both zero, but secondaries are not. 344 if ( orderCompos == 0 && (options & kCFCompareNonliteral) == 0 ) { 345 // We have a code element which is a diacritic. 346 // It may have come from a composed char or a combining char. 347 // If it came from a combining char (longer element length) it sorts first. 348 // This is only an approximation to what the Mac OS 9 code did, but this is an 349 // unusual case anyway. 350 int32_t elem1Length = offset1 - startOffset1; 351 int32_t elem2Length = offset2 - startOffset2; 352 if (elem1Length != elem2Length) { 353 orderCompos = (elem1Length > elem2Length)? -1: 1; 354 } 355 } 356 } 357 } 358 359 startOffset1 = offset1; 360 startOffset2 = offset2; 361 } 362 ucol_closeElements(collElems1); 363 ucol_closeElements(collElems2); 364 } 365 366 return (orderWidth != 0)? orderWidth: orderCompos; 367 } 368 369 static SInt32 __CompareCodePoints(const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length ) { 370 const UniChar * text1P = text1Ptr; 371 const UniChar * text2P = text2Ptr; 372 UInt32 textLimit = (text1Length <= text2Length)? text1Length: text2Length; 373 UInt32 textCounter; 374 SInt32 orderResult = 0; 375 376 // Loop through either string...the first difference differentiates this. 
377 for (textCounter = 0; textCounter < textLimit && *text1P == *text2P; textCounter++) { 378 text1P++; 379 text2P++; 380 } 381 if (textCounter < textLimit) { 382 // code point difference 383 orderResult = (*text1P < *text2P) ? -1 : 1; 384 } else if (text1Length != text2Length) { 385 // one string has extra stuff at end 386 orderResult = (text1Length < text2Length) ? -1 : 1; 387 } 388 return orderResult; 389 } 390 391 392 extern const CFStringRef __kCFLocaleCollatorID; 393 394 static UCollator *__CFStringCreateCollator(CFLocaleRef compareLocale) { 395 CFStringRef canonLocaleCFStr = (CFStringRef)CFLocaleGetValue(compareLocale, __kCFLocaleCollatorID); 396 char icuLocaleStr[128] = {0}; 397 CFStringGetCString(canonLocaleCFStr, icuLocaleStr, sizeof(icuLocaleStr), kCFStringEncodingASCII); 398 UErrorCode icuStatus = U_ZERO_ERROR; 399 UCollator * collator = ucol_open(icuLocaleStr, &icuStatus); 400 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus); 401 ucol_setAttribute(collator, UCOL_ALTERNATE_HANDLING, UCOL_NON_IGNORABLE, &icuStatus); 402 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus); 403 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus); 404 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus); 405 return collator; 406 } 407 408 #define kCFMaxCachedDefaultCollators (8) 409 static UCollator *__CFDefaultCollators[kCFMaxCachedDefaultCollators]; 410 static CFIndex __CFDefaultCollatorsCount = 0; 411 static const void *__CFDefaultCollatorLocale = NULL; 412 static CFLock_t __CFDefaultCollatorLock = CFLockInit; 413 414 static UCollator *__CFStringCopyDefaultCollator(CFLocaleRef compareLocale) { 415 CFLocaleRef currentLocale = NULL; 416 UCollator * collator = NULL; 417 418 if (compareLocale != __CFDefaultCollatorLocale) { 419 currentLocale = CFLocaleCopyCurrent(); 420 if (compareLocale != currentLocale) { 421 CFRelease(currentLocale); 422 return NULL; 423 } 424 } 425 426 
__CFLock(&__CFDefaultCollatorLock); 427 if ((NULL != currentLocale) && (__CFDefaultCollatorLocale != currentLocale)) { 428 while (__CFDefaultCollatorsCount > 0) ucol_close(__CFDefaultCollators[--__CFDefaultCollatorsCount]); 429 __CFDefaultCollatorLocale = CFRetain(currentLocale); 430 } 431 432 if (__CFDefaultCollatorsCount > 0) collator = __CFDefaultCollators[--__CFDefaultCollatorsCount]; 433 __CFUnlock(&__CFDefaultCollatorLock); 434 435 if (NULL == collator) { 436 collator = __CFStringCreateCollator(compareLocale); 437 } 438 439 if (NULL != currentLocale) CFRelease(currentLocale); 440 441 return collator; 442 } 443 444 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 445 static void __collatorFinalize(UCollator *collator) { 446 CFLocaleRef locale = _CFGetTSD(__CFTSDKeyCollatorLocale); 447 _CFSetTSD(__CFTSDKeyCollatorUCollator, NULL, NULL); 448 _CFSetTSD(__CFTSDKeyCollatorLocale, NULL, NULL); 449 __CFLock(&__CFDefaultCollatorLock); 450 if ((__CFDefaultCollatorLocale == locale) && (__CFDefaultCollatorsCount < kCFMaxCachedDefaultCollators)) { 451 __CFDefaultCollators[__CFDefaultCollatorsCount++] = collator; 452 collator = NULL; 453 } 454 __CFUnlock(&__CFDefaultCollatorLock); 455 if (NULL != collator) ucol_close(collator); 456 if (locale) CFRelease(locale); 457 } 458 #endif 459 460 // ------------------------------------------------------------------------------------------------- 461 // __CompareTextDefault 462 // 463 // A primary difference is denoted by values 2/-2 in orderP. Other differences are indicated with a -1/1. 464 // A negative value indicates that text1 sorts before text2. 
465 // ------------------------------------------------------------------------------------------------- 466 static OSStatus __CompareTextDefault(UCollator *collator, CFOptionFlags options, const UniChar *text1Ptr, UniCharCount text1Length, const UniChar *text2Ptr, UniCharCount text2Length, Boolean *equivalentP, SInt32 *orderP) { 467 468 // collator must have default settings restored on exit from this function 469 470 *equivalentP = true; 471 *orderP = 0; 472 473 if (options & kCFCompareNumerically) { 474 UErrorCode icuStatus = U_ZERO_ERROR; 475 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_ON, &icuStatus); 476 } 477 478 // Most string differences are Primary. Do a primary check first, then if there 479 // are no differences do a comparison with the options in the collator. 480 UCollationResult icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length); 481 if (icuResult != UCOL_EQUAL) { 482 *orderP = (icuResult == UCOL_LESS) ? -2 : 2; 483 } 484 if (*orderP == 0) { 485 UErrorCode icuStatus = U_ZERO_ERROR; 486 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &icuStatus); 487 ucol_setAttribute(collator, UCOL_STRENGTH, (options & kCFCompareDiacriticInsensitive) ? UCOL_PRIMARY : UCOL_SECONDARY, &icuStatus); 488 ucol_setAttribute(collator, UCOL_CASE_LEVEL, (options & kCFCompareCaseInsensitive) ? UCOL_OFF : UCOL_ON, &icuStatus); 489 if (!U_SUCCESS(icuStatus)) { 490 icuStatus = U_ZERO_ERROR; 491 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus); 492 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus); 493 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus); 494 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus); 495 return 666; 496 } 497 498 // We don't have a primary difference. Recompare with standard collator. 
499 icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length); 500 if (icuResult != UCOL_EQUAL) { 501 *orderP = (icuResult == UCOL_LESS) ? -1 : 1; 502 } 503 icuStatus = U_ZERO_ERROR; 504 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus); 505 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus); 506 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus); 507 } 508 if (*orderP == 0 && (options & kCFCompareNonliteral) == 0) { 509 *orderP = __CompareSpecials(collator, options, text1Ptr, text1Length, text2Ptr, text2Length); 510 } 511 512 *equivalentP = (*orderP == 0); 513 514 // If strings are equivalent but we care about order and have not yet checked 515 // to the level of code point order, then do some more checks for order 516 if (*orderP == 0) { 517 UErrorCode icuStatus = U_ZERO_ERROR; 518 // First try to see if ICU can find any differences above code point level 519 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_ON, &icuStatus); 520 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_TERTIARY, &icuStatus); 521 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_ON, &icuStatus); 522 if (!U_SUCCESS(icuStatus)) { 523 icuStatus = U_ZERO_ERROR; 524 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus); 525 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus); 526 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus); 527 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus); 528 return 666; 529 } 530 icuResult = ucol_strcoll(collator, (const UChar *)text1Ptr, text1Length, (const UChar *)text2Ptr, text2Length); 531 if (icuResult != UCOL_EQUAL) { 532 *orderP = (icuResult == UCOL_LESS) ? 
-1 : 1; 533 } else { 534 // no ICU differences above code point level, compare code points 535 *orderP = __CompareCodePoints( text1Ptr, text1Length, text2Ptr, text2Length ); 536 } 537 icuStatus = U_ZERO_ERROR; 538 ucol_setAttribute(collator, UCOL_NORMALIZATION_MODE, UCOL_OFF, &icuStatus); 539 ucol_setAttribute(collator, UCOL_STRENGTH, UCOL_PRIMARY, &icuStatus); 540 ucol_setAttribute(collator, UCOL_CASE_LEVEL, UCOL_OFF, &icuStatus); 541 } 542 543 if (options & kCFCompareNumerically) { 544 UErrorCode icuStatus = U_ZERO_ERROR; 545 ucol_setAttribute(collator, UCOL_NUMERIC_COLLATION, UCOL_OFF, &icuStatus); 546 } 547 return 0; // noErr 548 } 549 550 #endif // DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 551 552 static inline CFIndex __extendLocationBackward(CFIndex location, CFStringInlineBuffer *str, const uint8_t *nonBaseBMP, const uint8_t *punctBMP) { 553 while (location > 0) { 554 UTF32Char ch = CFStringGetCharacterFromInlineBuffer(str, location); 555 UTF32Char otherChar; 556 if (CFUniCharIsSurrogateLowCharacter(ch) && CFUniCharIsSurrogateHighCharacter((otherChar = CFStringGetCharacterFromInlineBuffer(str, location - 1)))) { 557 ch = CFUniCharGetLongCharacterForSurrogatePair(ch, otherChar); 558 uint8_t planeNo = (ch >> 16); 559 if ((planeNo > 1) || (!CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, planeNo)))) break; 560 location -= 2; 561 } else { 562 if ((!CFUniCharIsMemberOfBitmap(ch, nonBaseBMP) && !CFUniCharIsMemberOfBitmap(ch, punctBMP)) || ((ch >= 0x2E80) && (ch < 0xAC00))) break; 563 --location; 564 } 565 } 566 567 return location; 568 } 569 570 static inline CFIndex __extendLocationForward(CFIndex location, CFStringInlineBuffer *str, const uint8_t *alnumBMP, const uint8_t *punctBMP, const uint8_t *controlBMP, CFIndex strMax) { 571 do { 572 
UTF32Char ch = CFStringGetCharacterFromInlineBuffer(str, location); 573 UTF32Char otherChar; 574 if (CFUniCharIsSurrogateHighCharacter(ch) && CFUniCharIsSurrogateLowCharacter((otherChar = CFStringGetCharacterFromInlineBuffer(str, location + 1)))) { 575 ch = CFUniCharGetLongCharacterForSurrogatePair(ch, otherChar); 576 location += 2; 577 uint8_t planeNo = (ch >> 16); 578 if (!CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharAlphaNumericCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, planeNo)) && !CFUniCharIsMemberOfBitmap(ch, CFUniCharGetBitmapPtrForPlane(kCFUniCharControlAndFormatterCharacterSet, planeNo))) break; 579 } else { 580 ++location; 581 if ((!CFUniCharIsMemberOfBitmap(ch, alnumBMP) && !CFUniCharIsMemberOfBitmap(ch, punctBMP) && !CFUniCharIsMemberOfBitmap(ch, controlBMP)) || ((ch >= 0x2E80) && (ch < 0xAC00))) break; 582 } 583 } while (location < strMax); 584 return location; 585 } 586 587 CF_PRIVATE CFComparisonResult _CFCompareStringsWithLocale(CFStringInlineBuffer *str1, CFRange str1Range, CFStringInlineBuffer *str2, CFRange str2Range, CFOptionFlags options, const void *compareLocale) { 588 const UniChar *characters1; 589 const UniChar *characters2; 590 CFComparisonResult compResult = kCFCompareEqualTo; 591 CFRange range1 = str1Range; 592 CFRange range2 = str2Range; 593 SInt32 order; 594 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 595 Boolean isEqual; 596 bool forcedOrdering = ((options & kCFCompareForcedOrdering) ? 
true : false); 597 598 UCollator *collator = NULL; 599 bool defaultCollator = true; 600 #endif 601 static const uint8_t *alnumBMP = NULL; 602 static const uint8_t *nonBaseBMP = NULL; 603 static const uint8_t *punctBMP = NULL; 604 static const uint8_t *controlBMP = NULL; 605 606 if (NULL == alnumBMP) { 607 alnumBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharAlphaNumericCharacterSet, 0); 608 nonBaseBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0); 609 punctBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharPunctuationCharacterSet, 0); 610 controlBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharControlAndFormatterCharacterSet, 0); 611 } 612 613 // Determine the range of characters surrodiing the current index significant for localized comparison. The range is extended backward and forward as long as they are contextual. Contextual characters include all letters and punctuations. Since most control/format characters are ignorable in localized comparison, we also include them extending forward. 614 615 range1.location = str1Range.location; 616 range2.location = str2Range.location; 617 618 // go backward 619 // The characters upto the current index are already determined to be equal by the CFString's standard character folding algorithm. Extend as long as truly contextual (all letters and punctuations). 620 if (range1.location > 0) { 621 range1.location = __extendLocationBackward(range1.location - 1, str1, nonBaseBMP, punctBMP); 622 } 623 624 if (range2.location > 0) { 625 range2.location = __extendLocationBackward(range2.location - 1, str2, nonBaseBMP, punctBMP); 626 } 627 628 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 629 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 630 // First we try to use the last one used on this thread, if the locale is the same, 631 // otherwise we try to check out a default one, or then we create one. 
632 UCollator *threadCollator = _CFGetTSD(__CFTSDKeyCollatorUCollator); 633 CFLocaleRef threadLocale = _CFGetTSD(__CFTSDKeyCollatorLocale); 634 if (compareLocale == threadLocale) { 635 collator = threadCollator; 636 } else { 637 #endif 638 collator = __CFStringCopyDefaultCollator((CFLocaleRef)compareLocale); 639 defaultCollator = true; 640 if (NULL == collator) { 641 collator = __CFStringCreateCollator((CFLocaleRef)compareLocale); 642 defaultCollator = false; 643 } 644 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 645 } 646 #endif 647 #endif 648 649 characters1 = CFStringGetCharactersPtrFromInlineBuffer(str1, range1); 650 characters2 = CFStringGetCharactersPtrFromInlineBuffer(str2, range2); 651 652 if ((NULL != characters1) && (NULL != characters2)) { // do fast 653 range1.length = (str1Range.location + str1Range.length) - range1.location; 654 range2.length = (str2Range.location + str2Range.length) - range2.location; 655 656 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 657 if ((NULL != collator) && (__CompareTextDefault(collator, options, characters1, range1.length, characters2, range2.length, &isEqual, &order) == 0 /* noErr */)) { 658 compResult = ((isEqual && !forcedOrdering) ? kCFCompareEqualTo : ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan)); 659 } else 660 #endif 661 { 662 compResult = ((memcmp(characters1, characters2, sizeof(UniChar) * range1.length) < 0) ? kCFCompareLessThan : kCFCompareGreaterThan); 663 } 664 } else { 665 UniChar *buffer1 = NULL; 666 UniChar *buffer2 = NULL; 667 UTF16Char sBuffer1[kCFStringCompareAllocationIncrement]; 668 UTF16Char sBuffer2[kCFStringCompareAllocationIncrement]; 669 CFIndex buffer1Len = 0, buffer2Len = 0; 670 CFIndex str1Max = str1Range.location + str1Range.length; 671 CFIndex str2Max = str2Range.location + str2Range.length; 672 CFIndex bufferSize; 673 674 // Extend forward and compare until the result is deterministic. 
The result is indeterministic if the differences are weak and can be resolved by character folding. For example, comparision between "abc" and "ABC" is considered to be indeterministic. 675 do { 676 if (str1Range.location < str1Max) { 677 str1Range.location = __extendLocationForward(str1Range.location, str1, alnumBMP, punctBMP, controlBMP, str1Max); 678 range1.length = (str1Range.location - range1.location); 679 characters1 = CFStringGetCharactersPtrFromInlineBuffer(str1, range1); 680 681 if (NULL == characters1) { 682 if ((0 > buffer1Len) || (range1.length > kCFStringCompareAllocationIncrement)) { 683 if (buffer1Len < range1.length) { 684 bufferSize = range1.length + (kCFStringCompareAllocationIncrement - (range1.length % kCFStringCompareAllocationIncrement)); 685 if (0 == buffer1Len) { 686 buffer1 = (UniChar *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF16Char) * bufferSize, 0); 687 } else if (buffer1Len < range1.length) { 688 buffer1 = (UniChar *)CFAllocatorReallocate(kCFAllocatorSystemDefault, buffer1, sizeof(UTF16Char) * bufferSize, 0); 689 } 690 buffer1Len = bufferSize; 691 } 692 } else { 693 buffer1 = sBuffer1; 694 } 695 696 CFStringGetCharactersFromInlineBuffer(str1, range1, buffer1); 697 characters1 = buffer1; 698 } 699 } 700 701 if (str2Range.location < str2Max) { 702 str2Range.location = __extendLocationForward(str2Range.location, str2, alnumBMP, punctBMP, controlBMP, str2Max); 703 range2.length = (str2Range.location - range2.location); 704 characters2 = CFStringGetCharactersPtrFromInlineBuffer(str2, range2); 705 706 if (NULL == characters2) { 707 if ((0 > buffer2Len) || (range2.length > kCFStringCompareAllocationIncrement)) { 708 if (buffer2Len < range2.length) { 709 bufferSize = range2.length + (kCFStringCompareAllocationIncrement - (range2.length % kCFStringCompareAllocationIncrement)); 710 if (0 == buffer2Len) { 711 buffer2 = (UniChar *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF16Char) * bufferSize, 0); 712 } else if 
(buffer2Len < range2.length) { 713 buffer2 = (UniChar *)CFAllocatorReallocate(kCFAllocatorSystemDefault, buffer2, sizeof(UTF16Char) * bufferSize, 0); 714 } 715 buffer2Len = bufferSize; 716 } 717 } else { 718 buffer2 = sBuffer2; 719 } 720 721 CFStringGetCharactersFromInlineBuffer(str2, range2, buffer2); 722 characters2 = buffer2; 723 } 724 } 725 726 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED || DEPLOYMENT_TARGET_WINDOWS || DEPLOYMENT_TARGET_LINUX 727 if ((NULL != collator) && (__CompareTextDefault(collator, options, characters1, range1.length, characters2, range2.length, &isEqual, &order) == 0 /* noErr */)) { 728 if (isEqual) { 729 if (forcedOrdering && (kCFCompareEqualTo == compResult) && (0 != order)) compResult = ((order < 0) ? kCFCompareLessThan : kCFCompareGreaterThan); 730 order = 0; 731 } 732 } else 733 #endif 734 { 735 order = memcmp(characters1, characters2, sizeof(UTF16Char) * ((range1.length < range2.length) ? range1.length : range2.length)); 736 if (0 == order) { 737 if (range1.length < range2.length) { 738 order = -2; 739 } else if (range2.length < range1.length) { 740 order = 2; 741 } 742 } else if (order < 0) { 743 --order; 744 } else if (order > 0) { 745 ++order; 746 } 747 } 748 749 if ((order < -1) || (order > 1)) break; // the result is deterministic 750 751 if (0 == order) { 752 range1.location = str1Range.location; 753 range2.location = str2Range.location; 754 } 755 } while ((str1Range.location < str1Max) || (str2Range.location < str2Max)); 756 757 if (0 != order) compResult = ((order < 0) ? 
kCFCompareLessThan : kCFCompareGreaterThan); 758 759 if (buffer1Len > 0) CFAllocatorDeallocate(kCFAllocatorSystemDefault, buffer1); 760 if (buffer2Len > 0) CFAllocatorDeallocate(kCFAllocatorSystemDefault, buffer2); 761 } 762 763 #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED 764 if (collator == threadCollator) { 765 // do nothing, already cached 766 } else { 767 if (threadLocale) __collatorFinalize((UCollator *)_CFGetTSD(__CFTSDKeyCollatorUCollator)); // need to dealloc collators 768 769 _CFSetTSD(__CFTSDKeyCollatorUCollator, collator, (void *)__collatorFinalize); 770 _CFSetTSD(__CFTSDKeyCollatorLocale, (void *)CFRetain(compareLocale), NULL); 771 } 772 #endif 773 774 return compResult; 775 } 776