/ CFUnicodeDecomposition.c
CFUnicodeDecomposition.c
  1  /*
  2   * Copyright (c) 2015 Apple Inc. All rights reserved.
  3   *
  4   * @APPLE_LICENSE_HEADER_START@
  5   *
  6   * This file contains Original Code and/or Modifications of Original Code
  7   * as defined in and that are subject to the Apple Public Source License
  8   * Version 2.0 (the 'License'). You may not use this file except in
  9   * compliance with the License. Please obtain a copy of the License at
 10   * http://www.opensource.apple.com/apsl/ and read it before using this
 11   * file.
 12   *
 13   * The Original Code and all software distributed under the License are
 14   * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 15   * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 16   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 17   * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 18   * Please see the License for the specific language governing rights and
 19   * limitations under the License.
 20   *
 21   * @APPLE_LICENSE_HEADER_END@
 22   */
 23  
 24  /*    CFUnicodeDecomposition.c
 25      Copyright (c) 1999-2014, Apple Inc. All rights reserved.
 26      Responsibility: Aki Inoue
 27  */
 28  
 29  #include <string.h>
 30  #include <CoreFoundation/CFBase.h>
 31  #include <CoreFoundation/CFCharacterSet.h>
 32  #include <CoreFoundation/CFUniChar.h>
 33  #include <CoreFoundation/CFUnicodeDecomposition.h>
 34  #include "CFInternal.h"
 35  #include "CFUniCharPriv.h"
 36  
// Canonical Decomposition
//
// All of the state below is filled in lazily by __CFUniCharLoadDecompositionTable().

// Sorted array of {key, value} uint32_t pairs (see __CFUniCharDecomposeMappings); points just past the leading length word of the mapping data.
static UTF32Char *__CFUniCharDecompositionTable = NULL;
// Number of {key, value} pairs in __CFUniCharDecompositionTable.
static uint32_t __CFUniCharDecompositionTableLength = 0;
// Storage for multi-character mappings; located immediately after the pair table and indexed by the low 24 bits of a mapped value.
static UTF32Char *__CFUniCharMultipleDecompositionTable = NULL;

// Plane 0 bitmaps of the canonical and HFS+ decomposable character sets.
static const uint8_t *__CFUniCharDecomposableBitmapForBMP = NULL;
static const uint8_t *__CFUniCharHFSPlusDecomposableBitmapForBMP = NULL;

// Serializes the lazy initialization performed by __CFUniCharLoadDecompositionTable().
static CFLock_t __CFUniCharDecompositionTableLock = CFLockInit;

// Per-plane combining property data, indexed by (character >> 16); valid indices are below __CFUniCharCombiningPriorityTableNumPlane.
static const uint8_t **__CFUniCharCombiningPriorityTable = NULL;
static uint8_t __CFUniCharCombiningPriorityTableNumPlane = 0;
 49  
 50  static void __CFUniCharLoadDecompositionTable(void) {
 51  
 52      __CFLock(&__CFUniCharDecompositionTableLock);
 53  
 54      if (NULL == __CFUniCharDecompositionTable) {
 55          const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCanonicalDecompMapping);
 56  
 57          if (NULL == bytes) {
 58              __CFUnlock(&__CFUniCharDecompositionTableLock);
 59              return;
 60          }
 61  
 62          __CFUniCharDecompositionTableLength = *(bytes++);
 63          __CFUniCharDecompositionTable = (UTF32Char *)bytes;
 64          __CFUniCharMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharDecompositionTableLength);
 65  
 66          __CFUniCharDecompositionTableLength /= (sizeof(uint32_t) * 2);
 67          __CFUniCharDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, 0);
 68          __CFUniCharHFSPlusDecomposableBitmapForBMP = CFUniCharGetBitmapPtrForPlane(kCFUniCharHFSPlusDecomposableCharacterSet, 0);
 69  
 70          CFIndex idx;
 71  
 72          __CFUniCharCombiningPriorityTableNumPlane = CFUniCharGetNumberOfPlanesForUnicodePropertyData(kCFUniCharCombiningProperty);
 73          __CFUniCharCombiningPriorityTable = (const uint8_t **)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(uint8_t *) * __CFUniCharCombiningPriorityTableNumPlane, 0);
 74          for (idx = 0;idx < __CFUniCharCombiningPriorityTableNumPlane;idx++) __CFUniCharCombiningPriorityTable[idx] = (const uint8_t *)CFUniCharGetUnicodePropertyDataForPlane(kCFUniCharCombiningProperty, idx);
 75      }
 76  
 77      __CFUnlock(&__CFUniCharDecompositionTableLock);
 78  }
 79  
// Compatibility decomposition state, filled in lazily by __CFUniCharLoadCompatibilityDecompositionTable().

// Serializes the lazy initialization of the compatibility tables.
static CFLock_t __CFUniCharCompatibilityDecompositionTableLock = CFLockInit;
// Sorted array of {key, value} uint32_t pairs; points just past the leading length word of the mapping data.
static UTF32Char *__CFUniCharCompatibilityDecompositionTable = NULL;
// Number of {key, value} pairs in __CFUniCharCompatibilityDecompositionTable.
static uint32_t __CFUniCharCompatibilityDecompositionTableLength = 0;
// Storage for multi-character mappings; located immediately after the pair table.
static UTF32Char *__CFUniCharCompatibilityMultipleDecompositionTable = NULL;
 84  
 85  static void __CFUniCharLoadCompatibilityDecompositionTable(void) {
 86  
 87      __CFLock(&__CFUniCharCompatibilityDecompositionTableLock);
 88  
 89      if (NULL == __CFUniCharCompatibilityDecompositionTable) {
 90          const uint32_t *bytes = (uint32_t *)CFUniCharGetMappingData(kCFUniCharCompatibilityDecompMapping);
 91  
 92          if (NULL == bytes) {
 93              __CFUnlock(&__CFUniCharCompatibilityDecompositionTableLock);
 94              return;
 95          }
 96  
 97          __CFUniCharCompatibilityDecompositionTableLength = *(bytes++);
 98          __CFUniCharCompatibilityDecompositionTable = (UTF32Char *)bytes;
 99          __CFUniCharCompatibilityMultipleDecompositionTable = (UTF32Char *)((intptr_t)bytes + __CFUniCharCompatibilityDecompositionTableLength);
100  
101          __CFUniCharCompatibilityDecompositionTableLength /= (sizeof(uint32_t) * 2);
102      }
103  
104      __CFUnlock(&__CFUniCharCompatibilityDecompositionTableLock);
105  }
106  
107  CF_INLINE bool __CFUniCharIsDecomposableCharacterWithFlag(UTF32Char character, bool isHFSPlus) {
108      return CFUniCharIsMemberOfBitmap(character, (character < 0x10000 ? (isHFSPlus ? __CFUniCharHFSPlusDecomposableBitmapForBMP : __CFUniCharDecomposableBitmapForBMP) : CFUniCharGetBitmapPtrForPlane(kCFUniCharCanonicalDecomposableCharacterSet, ((character >> 16) & 0xFF))));
109  }
110  
111  CF_INLINE uint8_t __CFUniCharGetCombiningPropertyForCharacter(UTF32Char character) { return CFUniCharGetCombiningPropertyForCharacter(character, (((character) >> 16) < __CFUniCharCombiningPriorityTableNumPlane ? __CFUniCharCombiningPriorityTable[(character) >> 16] : NULL)); }
112  
113  CF_INLINE bool __CFUniCharIsNonBaseCharacter(UTF32Char character) { return ((0 == __CFUniCharGetCombiningPropertyForCharacter(character)) ? false : true); } // the notion of non-base in normalization is characters with non-0 combining class
114  
// One entry of a decomposition table, as searched by __CFUniCharGetMappedValue().
typedef struct {
    uint32_t _key;   // character the entry applies to; the table is binary-searched on this field
    uint32_t _value; // mapped value: low 24 bits are a character or an index into the multi-character storage, upper bits carry the count and flags
} __CFUniCharDecomposeMappings;
119  
120  static uint32_t __CFUniCharGetMappedValue(const __CFUniCharDecomposeMappings *theTable, uint32_t numElem, UTF32Char character) {
121      const __CFUniCharDecomposeMappings *p, *q, *divider;
122  
123      if ((character < theTable[0]._key) || (character > theTable[numElem-1]._key)) {
124          return 0;
125      }
126      p = theTable;
127      q = p + (numElem-1);
128      while (p <= q) {
129          divider = p + ((q - p) >> 1);    /* divide by 2 */
130          if (character < divider->_key) { q = divider - 1; }
131          else if (character > divider->_key) { p = divider + 1; }
132          else { return divider->_value; }
133      }
134      return 0;
135  }
136  
137  static void __CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) {
138      UTF32Char *end = characters + length;
139  
140      while ((characters < end) && (0 == __CFUniCharGetCombiningPropertyForCharacter(*characters))) ++characters;
141  
142      if ((end - characters) > 1) {
143          uint32_t p1, p2;
144          UTF32Char *ch1, *ch2;
145          bool changes = true;
146  
147          do {
148              changes = false;
149              ch1 = characters; ch2 = characters + 1;
150              p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch1);
151              while (ch2 < end) {
152                  p1 = p2; p2 = __CFUniCharGetCombiningPropertyForCharacter(*ch2);
153                  if (p1 > p2) {
154                      UTF32Char tmp = *ch1; *ch1 = *ch2; *ch2 = tmp;
155                      changes = true;
156                  }
157                  ++ch1; ++ch2;
158              }
159          } while (changes);
160      }
161  }
162  
163  static CFIndex __CFUniCharRecursivelyDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) {
164      uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharDecompositionTable, __CFUniCharDecompositionTableLength, character);
165      CFIndex length = CFUniCharConvertFlagToCount(value);
166      UTF32Char firstChar = value & 0xFFFFFF;
167      UTF32Char *mappings = (length > 1 ? __CFUniCharMultipleDecompositionTable + firstChar : &firstChar);
168      CFIndex usedLength = 0;
169  
170      if (maxBufferLength < length) return 0;
171  
172      if (value & kCFUniCharRecursiveDecompositionFlag) {
173          usedLength = __CFUniCharRecursivelyDecomposeCharacter(*mappings, convertedChars, maxBufferLength - length);
174  
175          --length; // Decrement for the first char
176          if (!usedLength || usedLength + length > maxBufferLength) return 0;
177          ++mappings;
178          convertedChars += usedLength;
179      }
180  
181      usedLength += length;
182  
183      while (length--) *(convertedChars++) = *(mappings++);
184  
185      return usedLength;
186  }
187      
// Constants for the arithmetic decomposition of precomposed Hangul syllables
// used by CFUniCharDecomposeCharacter().
#define HANGUL_SBASE 0xAC00   // first precomposed syllable
#define HANGUL_LBASE 0x1100   // first leading consonant jamo
#define HANGUL_VBASE 0x1161   // first vowel jamo
#define HANGUL_TBASE 0x11A7   // one before the first trailing consonant jamo (index 0 means "no trailing consonant")
#define HANGUL_SCOUNT 11172   // number of precomposed syllables
#define HANGUL_LCOUNT 19      // number of leading consonants
#define HANGUL_VCOUNT 21      // number of vowels
#define HANGUL_TCOUNT 28      // number of trailing consonant slots, including "none"
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT) // syllables per leading consonant
197  
198  CFIndex CFUniCharDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars, CFIndex maxBufferLength) {
199      if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable();
200      if (character >= HANGUL_SBASE && character <= (HANGUL_SBASE + HANGUL_SCOUNT)) {
201          CFIndex length;
202  
203          character -= HANGUL_SBASE;
204  
205          length = (character % HANGUL_TCOUNT ? 3 : 2);
206  
207          if (maxBufferLength < length) return 0;
208  
209          *(convertedChars++) = character / HANGUL_NCOUNT + HANGUL_LBASE;
210          *(convertedChars++) = (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
211          if (length > 2) *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
212          return length;
213      } else {
214          return __CFUniCharRecursivelyDecomposeCharacter(character, convertedChars, maxBufferLength);
215      }
216  }
217  
218  CF_INLINE bool __CFProcessReorderBuffer(UTF32Char *buffer, CFIndex length, void **dst, CFIndex dstLength, CFIndex *filledLength, uint32_t dstFormat) {
219      if (length > 1) __CFUniCharPrioritySort(buffer, length);
220      return CFUniCharFillDestinationBuffer(buffer, length, dst, dstLength, filledLength, dstFormat);
221  }
222  
// Capacity of the on-stack reorder buffer, and the increment used when it has to grow.
#define MAX_BUFFER_LENGTH (32)
/*
 * Decomposes a run of UTF-16 text into `dst`.
 *
 * src            - source UTF-16 units.
 * length         - number of units available in src.
 * consumedLength - if non-NULL, receives the number of source units consumed.
 * dst            - destination buffer; elements are written according to dstFormat.
 * maxLength      - capacity of dst in destination elements. For ASCII input a value of 0 skips
 *                  the store but still counts the element. NOTE(review): the same "count only"
 *                  behavior for non-ASCII input depends on CFUniCharFillDestinationBuffer - confirm.
 * filledLength   - if non-NULL, receives the number of destination elements produced.
 * needToReorder  - when true, a base character and the non-base characters that follow it are
 *                  collected and sorted by combining priority before being written.
 * dstFormat      - one of kCFUniCharUTF8Format, kCFUniCharUTF16Format, kCFUniCharUTF32Format.
 * isHFSPlus      - selects the HFS+ decomposable set instead of the canonical one for BMP characters.
 *
 * Returns true when every source unit was consumed, false when processing stopped early
 * (destination full, or a stray surrogate with a non-UTF-16 destination).
 */
bool CFUniCharDecompose(const UTF16Char *src, CFIndex length, CFIndex *consumedLength, void *dst, CFIndex maxLength, CFIndex *filledLength, bool needToReorder, uint32_t dstFormat, bool isHFSPlus) {
    CFIndex usedLength = 0;
    CFIndex originalLength = length;
    UTF32Char buffer[MAX_BUFFER_LENGTH];
    UTF32Char *decompBuffer = buffer; // switches to a heap block once the stack buffer is outgrown
    CFIndex decompBufferSize = MAX_BUFFER_LENGTH;
    CFIndex decompBufferLen = 0;
    CFIndex segmentLength = 0; // source units represented by the pending buffer and not yet subtracted from length
    UTF32Char currentChar;

    if (NULL == __CFUniCharDecompositionTable) __CFUniCharLoadDecompositionTable();

    // `length` only shrinks once the corresponding output has actually been written, so that
    // *consumedLength never covers input whose output was dropped.
    while ((length - segmentLength) > 0) {
        currentChar = *(src++);

        if (currentChar < 0x80) {
            // ASCII is copied through as-is; it also terminates any pending segment.
            if (decompBufferLen > 0) {
                if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break;
                length -= segmentLength;
                segmentLength = 0;
                decompBufferLen = 0;
            }

            if (maxLength > 0) {
                if (usedLength >= maxLength) break;
                switch (dstFormat) {
                case kCFUniCharUTF8Format: *(uint8_t *)dst = currentChar; dst = (uint8_t *)dst + sizeof(uint8_t); break;
                case kCFUniCharUTF16Format: *(UTF16Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF16Char); break;
                case kCFUniCharUTF32Format: *(UTF32Char *)dst = currentChar; dst = (uint8_t *)dst + sizeof(UTF32Char); break;
                }
            }

            --length;
            ++usedLength;
        } else {
            // Unpaired surrogates are passed through only when the destination is UTF-16.
            if (CFUniCharIsSurrogateLowCharacter(currentChar)) { // Stray surrogate
                if (dstFormat != kCFUniCharUTF16Format) break;
            } else if (CFUniCharIsSurrogateHighCharacter(currentChar)) {
                if (((length - segmentLength) > 1) && CFUniCharIsSurrogateLowCharacter(*src)) {
                    currentChar = CFUniCharGetLongCharacterForSurrogatePair(currentChar, *(src++));
                } else {
                    if (dstFormat != kCFUniCharUTF16Format) break;
                }
            }

            if (needToReorder && __CFUniCharIsNonBaseCharacter(currentChar)) {
                // Non-base character: append it (or its decomposition) to the pending segment.
                if ((decompBufferLen + 1) >= decompBufferSize) {
                    UTF32Char *newBuffer;

                    // Grow by one more MAX_BUFFER_LENGTH chunk and carry the old contents over.
                    decompBufferSize += MAX_BUFFER_LENGTH;
                    newBuffer = (UTF32Char *)CFAllocatorAllocate(kCFAllocatorSystemDefault, sizeof(UTF32Char) * decompBufferSize, 0);
                    memmove(newBuffer, decompBuffer, (decompBufferSize - MAX_BUFFER_LENGTH) * sizeof(UTF32Char));
                    if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer);
                    decompBuffer = newBuffer;
                }

                if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) { // Vietnamese accent, etc.
                    decompBufferLen += CFUniCharDecomposeCharacter(currentChar, decompBuffer + decompBufferLen, decompBufferSize - decompBufferLen);
                } else {
                    decompBuffer[decompBufferLen++] = currentChar;
                }
            } else {
                // Base character (or reordering disabled): flush the previous segment first.
                if (decompBufferLen > 0) {
                    if (!__CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break;
                    length -= segmentLength;
                    segmentLength = 0;
                }

                // decompBufferLen is overwritten here, which is why the flush above does not reset it.
                if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, isHFSPlus)) {
                    decompBufferLen = CFUniCharDecomposeCharacter(currentChar, decompBuffer, MAX_BUFFER_LENGTH);
                } else {
                    decompBufferLen = 1;
                    *decompBuffer = currentChar;
                }

                // Nothing to sort: write the result immediately instead of starting a segment.
                if (!needToReorder || (decompBufferLen == 1)) {
                    if (!CFUniCharFillDestinationBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) break;
                    length -= ((currentChar > 0xFFFF) ? 2 : 1);
                    decompBufferLen = 0;
                    continue;
                }
            }

            // A character above 0xFFFF came from a surrogate pair, i.e. two source units.
            segmentLength += ((currentChar > 0xFFFF) ? 2 : 1);
        }
    }

    // Flush whatever segment is still pending; its source units count as consumed only on success.
    if ((decompBufferLen > 0) && __CFProcessReorderBuffer(decompBuffer, decompBufferLen, &dst, maxLength, &usedLength, dstFormat)) length -= segmentLength;

    if (decompBuffer != buffer) CFAllocatorDeallocate(kCFAllocatorSystemDefault, decompBuffer);

    if (consumedLength) *consumedLength = originalLength - length;
    if (filledLength) *filledLength = usedLength;

    return ((length > 0) ? false : true);
}
320  
321  #define MAX_COMP_DECOMP_LEN (32)
322  
323  static CFIndex __CFUniCharRecursivelyCompatibilityDecomposeCharacter(UTF32Char character, UTF32Char *convertedChars) {
324      uint32_t value = __CFUniCharGetMappedValue((const __CFUniCharDecomposeMappings *)__CFUniCharCompatibilityDecompositionTable, __CFUniCharCompatibilityDecompositionTableLength, character);
325      CFIndex length = CFUniCharConvertFlagToCount(value);
326      UTF32Char firstChar = value & 0xFFFFFF;
327      const UTF32Char *mappings = (length > 1 ? __CFUniCharCompatibilityMultipleDecompositionTable + firstChar : &firstChar);
328      CFIndex usedLength = length;
329      UTF32Char currentChar;
330      CFIndex currentLength;
331  
332      while (length-- > 0) {
333          currentChar = *(mappings++);
334          if (__CFUniCharIsDecomposableCharacterWithFlag(currentChar, false)) {
335              currentLength = __CFUniCharRecursivelyDecomposeCharacter(currentChar, convertedChars, MAX_COMP_DECOMP_LEN - length);
336              convertedChars += currentLength;
337              usedLength += (currentLength - 1);
338          } else if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) {
339              currentLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, convertedChars);
340              convertedChars += currentLength;
341              usedLength += (currentLength - 1);
342          } else {
343              *(convertedChars++) = currentChar;
344          }
345      }
346  
347      return usedLength;
348  }
349  
350  CF_INLINE void __CFUniCharMoveBufferFromEnd1(UTF32Char *convertedChars, CFIndex length, CFIndex delta) {
351      const UTF32Char *limit = convertedChars;
352      UTF32Char *dstP;
353  
354      convertedChars += length;
355      dstP = convertedChars + delta;
356  
357      while (convertedChars > limit) *(--dstP) = *(--convertedChars);
358  }
359  
360  CF_PRIVATE CFIndex CFUniCharCompatibilityDecompose(UTF32Char *convertedChars, CFIndex length, CFIndex maxBufferLength) {
361      UTF32Char currentChar;
362      UTF32Char buffer[MAX_COMP_DECOMP_LEN];
363      const UTF32Char *bufferP;
364      const UTF32Char *limit = convertedChars + length;
365      CFIndex filledLength;
366  
367      if (NULL == __CFUniCharCompatibilityDecompositionTable) __CFUniCharLoadCompatibilityDecompositionTable();
368  
369      while (convertedChars < limit) {
370          currentChar = *convertedChars;
371  
372          if (CFUniCharIsMemberOf(currentChar, kCFUniCharCompatibilityDecomposableCharacterSet)) {
373              filledLength = __CFUniCharRecursivelyCompatibilityDecomposeCharacter(currentChar, buffer);
374  
375              if (filledLength + length - 1 > maxBufferLength) return 0;
376  
377              if (filledLength > 1) __CFUniCharMoveBufferFromEnd1(convertedChars + 1, limit - convertedChars - 1, filledLength - 1);
378  
379              bufferP = buffer;
380              length += (filledLength - 1);
381              while (filledLength-- > 0) *(convertedChars++) = *(bufferP++);
382          } else {
383              ++convertedChars;
384          }
385      }
386      
387      return length;
388  }
389  
390  CF_EXPORT void CFUniCharPrioritySort(UTF32Char *characters, CFIndex length) {
391      __CFUniCharPrioritySort(characters, length);
392  }
393  
// Drop the file-local helper macros so that they do not leak into anything compiled after this point.
#undef MAX_BUFFER_LENGTH
#undef MAX_COMP_DECOMP_LEN
#undef HANGUL_SBASE
#undef HANGUL_LBASE
#undef HANGUL_VBASE
#undef HANGUL_TBASE
#undef HANGUL_SCOUNT
#undef HANGUL_LCOUNT
#undef HANGUL_VCOUNT
#undef HANGUL_TCOUNT
#undef HANGUL_NCOUNT
405