/ CFICUConverters.c
CFICUConverters.c
  1  /*
  2   * Copyright (c) 2015 Apple Inc. All rights reserved.
  3   *
  4   * @APPLE_LICENSE_HEADER_START@
  5   *
  6   * This file contains Original Code and/or Modifications of Original Code
  7   * as defined in and that are subject to the Apple Public Source License
  8   * Version 2.0 (the 'License'). You may not use this file except in
  9   * compliance with the License. Please obtain a copy of the License at
 10   * http://www.opensource.apple.com/apsl/ and read it before using this
 11   * file.
 12   *
 13   * The Original Code and all software distributed under the License are
 14   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 15   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 16   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 17   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 18   * Please see the License for the specific language governing rights and
 19   * limitations under the License.
 20   *
 21   * @APPLE_LICENSE_HEADER_END@
 22   */
 23  
 24  /*	CFICUConverters.c
 25  	Copyright (c) 2004-2014, Apple Inc. All rights reserved.
 26  	Responsibility: Aki Inoue
 27  */
 28  
 29  #include "CFStringEncodingDatabase.h"
 30  #include "CFStringEncodingConverterPriv.h"
 31  #include "CFICUConverters.h"
 32  #include <CoreFoundation/CFStringEncodingExt.h>
 33  #include <CoreFoundation/CFUniChar.h>
 34  #include <unicode/ucnv.h>
 35  #include <unicode/uversion.h>
 36  #include "CFInternal.h"
 37  #include <stdio.h>
 38  
 39  // Thread data support
// Per-thread bookkeeping for ICU converters that are kept open between calls
// so that a streaming conversion can continue with the same converter state.
typedef struct {
    uint8_t _numSlots; // number of entries currently allocated in _converters
    uint8_t _nextSlot; // index tried first when a converter needs to be parked
    UConverter **_converters; // slot table; an unused entry is NULL. Stream ID n refers to index n - 1
} __CFICUThreadData;
 45  
 46  static void __CFICUThreadDataDestructor(void *context) {
 47      __CFICUThreadData * data = (__CFICUThreadData *)context;
 48      
 49      if (NULL != data->_converters) { // scan to make sure deallocation
 50          UConverter **converter = data->_converters;
 51          UConverter **limit = converter + data->_numSlots;
 52          
 53          while (converter < limit) {
 54              if (NULL != converter) ucnv_close(*converter);
 55              ++converter;
 56          }
 57          CFAllocatorDeallocate(NULL, data->_converters);
 58      }
 59      
 60      CFAllocatorDeallocate(NULL, data);
 61  }
 62  
 63  CF_INLINE __CFICUThreadData *__CFStringEncodingICUGetThreadData() {
 64      __CFICUThreadData * data;
 65  
 66      data = (__CFICUThreadData *)_CFGetTSD(__CFTSDKeyICUConverter);
 67  
 68      if (NULL == data) {
 69          data = (__CFICUThreadData *)CFAllocatorAllocate(NULL, sizeof(__CFICUThreadData), 0);
 70          memset(data, 0, sizeof(__CFICUThreadData));
 71          _CFSetTSD(__CFTSDKeyICUConverter, (void *)data, __CFICUThreadDataDestructor);
 72      }
 73  
 74      return data;
 75  }
 76  
 77  CF_PRIVATE const char *__CFStringEncodingGetICUName(CFStringEncoding encoding) {
 78  #define STACK_BUFFER_SIZE (60)
 79      char buffer[STACK_BUFFER_SIZE];
 80      const char *result = NULL;
 81      UErrorCode errorCode = U_ZERO_ERROR;
 82      uint32_t codepage = 0;
 83  
 84      if (kCFStringEncodingUTF7_IMAP == encoding) return "IMAP-mailbox-name";
 85  
 86      if (kCFStringEncodingUnicode != (encoding & 0x0F00)) codepage = __CFStringEncodingGetWindowsCodePage(encoding); // we don't use codepage for UTF to avoid little endian weirdness of Windows
 87  
 88      if ((0 != codepage) && (snprintf(buffer, STACK_BUFFER_SIZE, "windows-%d", codepage) < STACK_BUFFER_SIZE) && (NULL != (result = ucnv_getAlias(buffer, 0, &errorCode)))) return result;
 89  
 90      if (__CFStringEncodingGetCanonicalName(encoding, buffer, STACK_BUFFER_SIZE)) result = ucnv_getAlias(buffer, 0, &errorCode);
 91  
 92      return result;
 93  #undef STACK_BUFFER_SIZE
 94  }
 95  
 96  CF_PRIVATE CFStringEncoding __CFStringEncodingGetFromICUName(const char *icuName) {
 97      uint32_t codepage;
 98      char *endPtr;
 99      UErrorCode errorCode = U_ZERO_ERROR;
100  
101      if ((0 == strncasecmp_l(icuName, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(icuName + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage);
102  
103      if (0 != ucnv_countAliases(icuName, &errorCode)) {
104          CFStringEncoding encoding;
105          const char *name;
106  
107          // Try WINDOWS platform
108          name = ucnv_getStandardName(icuName, "WINDOWS", &errorCode);
109          
110          if (NULL != name) {
111              if ((0 == strncasecmp_l(name, "windows-", strlen("windows-"), NULL)) && (0 != (codepage = strtol(name + strlen("windows-"), &endPtr, 10))) && (*endPtr == '\0')) return __CFStringEncodingGetFromWindowsCodePage(codepage);
112              
113              if (strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
114          }
115  
116          // Try JAVA platform
117          name = ucnv_getStandardName(icuName, "JAVA", &errorCode);
118          if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
119  
120          // Try MIME platform
121          name = ucnv_getStandardName(icuName, "MIME", &errorCode);
122          if ((NULL != name) && strncasecmp_l(icuName, name, strlen(name), NULL) && (kCFStringEncodingInvalidId != (encoding = __CFStringEncodingGetFromCanonicalName(name)))) return encoding;
123      }
124  
125      return kCFStringEncodingInvalidId;
126  }
127  
128  CF_INLINE UConverter *__CFStringEncodingConverterCreateICUConverter(const char *icuName, uint32_t flags, bool toUnicode) {
129      UConverter *converter;
130      UErrorCode errorCode = U_ZERO_ERROR;
131      uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
132  
133      if (0 != streamID) { // this is a part of streaming previously created
134          __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
135  
136          --streamID; // map to array index
137  
138          if ((streamID < data->_numSlots) && (NULL != data->_converters[streamID])) return data->_converters[streamID];
139      }
140  
141      converter = ucnv_open(icuName, &errorCode);
142  
143      if (NULL != converter) {
144          char lossyByte = CFStringEncodingMaskToLossyByte(flags);
145  
146          if ((0 == lossyByte) && (0 != (flags & kCFStringEncodingAllowLossyConversion))) lossyByte = '?';
147  
148          if (0 ==lossyByte) {
149              if (toUnicode) {
150                  ucnv_setToUCallBack(converter, &UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
151              } else {
152                  ucnv_setFromUCallBack(converter, &UCNV_FROM_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode);
153              }
154          } else {
155              ucnv_setSubstChars(converter, &lossyByte, 1, &errorCode);
156          }
157      }
158  
159      return converter;
160  }
161  
162  #define ICU_CONVERTER_SLOT_INCREMENT (10)
163  #define ICU_CONVERTER_MAX_SLOT (255)
164  
165  static CFIndex __CFStringEncodingConverterReleaseICUConverter(UConverter *converter, uint32_t flags, CFIndex status) {
166      uint8_t streamID = CFStringEncodingStreamIDFromMask(flags);
167  
168      if ((kCFStringEncodingInvalidInputStream != status) && ((0 != (flags & kCFStringEncodingPartialInput)) || ((kCFStringEncodingInsufficientOutputBufferLength == status) && (0 != (flags & kCFStringEncodingPartialOutput))))) {
169          if (0 == streamID) {
170              __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
171  
172              if (NULL == data->_converters) {
173                  data->_converters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT, 0);
174                  memset(data->_converters, 0, sizeof(UConverter *) * ICU_CONVERTER_SLOT_INCREMENT);
175                  data->_numSlots = ICU_CONVERTER_SLOT_INCREMENT;
176                  data->_nextSlot = 0;
177              } else if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) { // Need to find one
178                  CFIndex index;
179  
180                  for (index = 0;index < data->_numSlots;index++) {
181                      if (NULL == data->_converters[index]) {
182                          data->_nextSlot = index;
183                          break;
184                      }
185                  }
186  
187                  if (index >= data->_numSlots) { // we're full
188                      UConverter **newConverters;
189                      CFIndex newSize = data->_numSlots + ICU_CONVERTER_SLOT_INCREMENT;
190  
191                      if (newSize > ICU_CONVERTER_MAX_SLOT) { // something is terribly wrong
192                          CFLog(kCFLogLevelError, CFSTR("Per-thread streaming ID for ICU converters exhausted. Ignoring..."));
193                          ucnv_close(converter);
194                          return 0;
195                      }
196  
197                      newConverters = (UConverter **)CFAllocatorAllocate(NULL, sizeof(UConverter *) * newSize, 0);
198                      memset(newConverters, 0, sizeof(UConverter *) * newSize);
199                      memcpy(newConverters, data->_converters, sizeof(UConverter *) * data->_numSlots);
200                      CFAllocatorDeallocate(NULL, data->_converters);
201                      data->_converters = newConverters;
202                      data->_nextSlot = data->_numSlots;
203                      data->_numSlots = newSize;
204                  }
205              }
206  
207              data->_converters[data->_nextSlot] = converter;
208              streamID = data->_nextSlot + 1;
209  
210              // now find next slot
211              ++data->_nextSlot;
212  
213              if ((data->_nextSlot >= data->_numSlots) || (NULL != data->_converters[data->_nextSlot])) {
214                  data->_nextSlot = 0;
215  
216                  while ((data->_nextSlot < data->_numSlots) && (NULL != data->_converters[data->_nextSlot])) ++data->_nextSlot;
217              }
218          }
219  
220          return CFStringEncodingStreamIDToMask(streamID);
221      }
222  
223      if (0 != streamID) {
224          __CFICUThreadData *data = __CFStringEncodingICUGetThreadData();
225  
226          --streamID; // map to array index
227  
228          if ((streamID < data->_numSlots) && (converter == data->_converters[streamID])) {
229              data->_converters[streamID] = NULL;
230              if (data->_nextSlot > streamID) data->_nextSlot = streamID;
231          }
232      }
233  
234      ucnv_close(converter);
235  
236      return 0;
237  }
238  
239  #define MAX_BUFFER_SIZE (1000)
240  
241  #if DEPLOYMENT_TARGET_MACOSX || DEPLOYMENT_TARGET_EMBEDDED
242  #if 0
243  // we're no longer doing this check. Revive when the status in the bug changed.
244  #if (U_ICU_VERSION_MAJOR_NUM > 49)
245  #warning Unknown ICU version. Check binary compatibility issues for rdar://problem/6024743
246  #endif
247  #endif
248  #endif
249  #define HAS_ICU_BUG_6024743 (1)
250  #define HAS_ICU_BUG_6025527 (1)
251  
252  CF_PRIVATE CFIndex __CFStringEncodingICUToBytes(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars, CFIndex *usedCharLen, uint8_t *bytes, CFIndex maxByteLen, CFIndex *usedByteLen) {
253      UConverter *converter;
254      UErrorCode errorCode = U_ZERO_ERROR;
255      const UTF16Char *source = characters;
256      const UTF16Char *sourceLimit = source + numChars;
257      char *destination = (char *)bytes;
258      const char *destinationLimit = destination + maxByteLen;
259      bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
260      CFIndex status;
261  
262      if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, false))) return kCFStringEncodingConverterUnavailable;
263  
264      if (0 == maxByteLen) {
265          char buffer[MAX_BUFFER_SIZE];
266          CFIndex totalLength = 0;
267  
268          while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
269              destination = buffer;
270              destinationLimit = destination + MAX_BUFFER_SIZE;
271  
272              ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
273  
274              totalLength += (destination - buffer);
275  
276              if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
277          }
278  
279          if (NULL != usedByteLen) *usedByteLen = totalLength;
280      } else {
281          ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
282  
283  #if HAS_ICU_BUG_6024743
284  /* Another critical ICU design issue. Similar to conversion error, source pointer returned from U_BUFFER_OVERFLOW_ERROR is already beyond the last valid character position. It renders the returned value from source entirely unusable. We have to manually back up until succeeding <rdar://problem/7183045> Intrestingly, this issue doesn't apply to ucnv_toUnicode. The asynmmetric nature makes this more dangerous */
285          if (U_BUFFER_OVERFLOW_ERROR == errorCode) {
286              const uint8_t *bitmap = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, 0);
287              const uint8_t *nonBase;
288              UTF32Char character;
289  
290              do {
291                  // Since the output buffer is filled, we can assume no invalid chars (including stray surrogates)
292                  do {
293                      sourceLimit = (source - 1);
294                      character = *sourceLimit;
295                      nonBase = bitmap;
296  
297                      if (CFUniCharIsSurrogateLowCharacter(character)) {
298                          --sourceLimit;
299                          character = CFUniCharGetLongCharacterForSurrogatePair(*sourceLimit, character);
300                          nonBase = CFUniCharGetBitmapPtrForPlane(kCFUniCharNonBaseCharacterSet, (character >> 16) & 0x000F);
301                          character &= 0xFFFF;
302                      }
303                  } while ((sourceLimit > characters) && CFUniCharIsMemberOfBitmap(character, nonBase));
304  
305                  if (sourceLimit > characters) {
306                      source = characters;
307                      destination = (char *)bytes;
308                      errorCode = U_ZERO_ERROR;
309  
310                      ucnv_resetFromUnicode(converter);
311  
312                      ucnv_fromUnicode(converter, &destination, destinationLimit, (const UChar **)&source, (const UChar *)sourceLimit, NULL, flush, &errorCode);
313                  }
314              } while (U_BUFFER_OVERFLOW_ERROR == errorCode);
315  
316              errorCode = U_BUFFER_OVERFLOW_ERROR;
317          }
318  #endif
319          if (NULL != usedByteLen) *usedByteLen = destination - (const char *)bytes;
320      }
321  
322      status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
323  
324      if (NULL != usedCharLen) {
325  #if HAS_ICU_BUG_6024743
326  /* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_fromUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
327  	if (kCFStringEncodingInvalidInputStream == status) {
328  #define MAX_ERROR_BUFFER_LEN (32)
329  	    UTF16Char errorBuffer[MAX_ERROR_BUFFER_LEN];
330  	    int8_t errorLength = MAX_ERROR_BUFFER_LEN;
331  #undef MAX_ERROR_BUFFER_LEN
332  
333  	    errorCode = U_ZERO_ERROR;
334  
335  	    ucnv_getInvalidUChars(converter, (UChar *)errorBuffer, &errorLength, &errorCode);
336  
337  	    if (U_ZERO_ERROR == errorCode) {
338  		source -= errorLength;
339  	    } else {
340  		// Gah, something is terribly wrong. Reset everything
341  		source = characters; // 0 length
342  		if (NULL != usedByteLen) *usedByteLen = 0;
343  	    }
344  	}
345  #endif
346  	*usedCharLen = source - characters;
347      }
348  
349      status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
350  
351      return status;
352  }
353  
354  CF_PRIVATE CFIndex __CFStringEncodingICUToUnicode(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes, CFIndex *usedByteLen, UniChar *characters, CFIndex maxCharLen, CFIndex *usedCharLen) {
355      UConverter *converter;
356      UErrorCode errorCode = U_ZERO_ERROR;
357      const char *source = (const char *)bytes;
358      const char *sourceLimit = source + numBytes;
359      UTF16Char *destination = characters;
360      const UTF16Char *destinationLimit = destination + maxCharLen;
361      bool flush = ((0 == (flags & kCFStringEncodingPartialInput)) ? true : false);
362      CFIndex status;
363  
364      if (NULL == (converter = __CFStringEncodingConverterCreateICUConverter(icuName, flags, true))) return kCFStringEncodingConverterUnavailable;
365  
366      if (0 == maxCharLen) {
367          UTF16Char buffer[MAX_BUFFER_SIZE];
368          CFIndex totalLength = 0;
369          
370          while ((source < sourceLimit) && (U_ZERO_ERROR == errorCode)) {
371              destination = buffer;
372              destinationLimit = destination + MAX_BUFFER_SIZE;
373              
374              ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
375              
376              totalLength += (destination - buffer);
377              
378              if (U_BUFFER_OVERFLOW_ERROR == errorCode) errorCode = U_ZERO_ERROR;
379          }
380          
381          if (NULL != usedCharLen) *usedCharLen = totalLength;
382      } else {
383          ucnv_toUnicode(converter, (UChar **)&destination, (const UChar *)destinationLimit, &source, sourceLimit, NULL, flush, &errorCode);
384  
385          if (NULL != usedCharLen) *usedCharLen = destination - characters;
386      }
387  
388      status = ((U_ZERO_ERROR == errorCode) ? kCFStringEncodingConversionSuccess : ((U_BUFFER_OVERFLOW_ERROR == errorCode) ? kCFStringEncodingInsufficientOutputBufferLength : kCFStringEncodingInvalidInputStream));
389  
390      if (NULL != usedByteLen) {
391  #if HAS_ICU_BUG_6024743
392  	/* ICU has a serious behavioral inconsistency issue that the source pointer returned from ucnv_toUnicode() is after illegal input. We have to keep track of any changes in this area in order to prevent future binary compatiibility issues */
393  	if (kCFStringEncodingInvalidInputStream == status) {
394  #define MAX_ERROR_BUFFER_LEN (32)
395  	    char errorBuffer[MAX_ERROR_BUFFER_LEN];
396  	    int8_t errorLength = MAX_ERROR_BUFFER_LEN;
397  #undef MAX_ERROR_BUFFER_LEN
398  
399  	    errorCode = U_ZERO_ERROR;
400  	    
401  	    ucnv_getInvalidChars(converter, errorBuffer, &errorLength, &errorCode);
402  	    
403  	    if (U_ZERO_ERROR == errorCode) {
404  #if HAS_ICU_BUG_6025527
405                  // Another ICU oddness here. ucnv_getInvalidUChars() writes the '\0' terminator, and errorLength includes the extra byte.
406                  if ((errorLength > 0) && ('\0' == errorBuffer[errorLength - 1])) --errorLength;
407  #endif
408  		source -= errorLength;
409  	    } else {
410  		// Gah, something is terribly wrong. Reset everything
411  		source = (const char *)bytes; // 0 length
412  		if (NULL != usedCharLen) *usedCharLen = 0;
413  	    }
414  	}
415  #endif
416  
417  	*usedByteLen = source - (const char *)bytes;
418      }
419      
420      status |= __CFStringEncodingConverterReleaseICUConverter(converter, flags, status);
421  
422      return status;
423  }
424  
425  CF_PRIVATE CFIndex __CFStringEncodingICUCharLength(const char *icuName, uint32_t flags, const uint8_t *bytes, CFIndex numBytes) {
426      CFIndex usedCharLen;
427      return (__CFStringEncodingICUToUnicode(icuName, flags, bytes, numBytes, NULL, NULL, 0, &usedCharLen) == kCFStringEncodingConversionSuccess ? usedCharLen : 0);
428  }
429  
430  CF_PRIVATE CFIndex __CFStringEncodingICUByteLength(const char *icuName, uint32_t flags, const UniChar *characters, CFIndex numChars) {
431      CFIndex usedByteLen;
432      return (__CFStringEncodingICUToBytes(icuName, flags, characters, numChars, NULL, NULL, 0, &usedByteLen) == kCFStringEncodingConversionSuccess ? usedByteLen : 0);
433  }
434  
435  CF_PRIVATE CFStringEncoding *__CFStringEncodingCreateICUEncodings(CFAllocatorRef allocator, CFIndex *numberOfIndex) {
436      CFIndex count = ucnv_countAvailable();
437      CFIndex numEncodings = 0;
438      CFStringEncoding *encodings;
439      CFStringEncoding encoding;
440      CFIndex index;
441  
442      if (0 == count) return NULL;
443  
444      encodings = (CFStringEncoding *)CFAllocatorAllocate(NULL, sizeof(CFStringEncoding) * count, 0);
445  
446      for (index = 0;index < count;index++) {
447          encoding = __CFStringEncodingGetFromICUName(ucnv_getAvailableName(index));
448  
449          if (kCFStringEncodingInvalidId != encoding) encodings[numEncodings++] = encoding;
450      }
451  
452      if (0 == numEncodings) {
453          CFAllocatorDeallocate(allocator, encodings);
454          encodings = NULL;
455      }
456  
457      *numberOfIndex = numEncodings;
458  
459      return encodings;
460  }