/ CFStringTokenizer.c
CFStringTokenizer.c
  1  //
  2  //  CFStringTokenizer.c
  3  //  CoreFoundation
  4  //
  5  //  Copyright (c) 2014 Apportable. All rights reserved.
  6  //
  7  
  8  #include "CFBase.h"
  9  #include "CFRuntime.h"
 10  #include "CFStringTokenizer.h"
 11  #include <unicode/ubrk.h>
 12  
 13  #define TYPE_MASK 0x000000FF
 14  
 15  struct __CFStringTokenizer {
 16      CFRuntimeBase _base;
 17      CFStringRef _string;
 18      CFRange _range;
 19      CFOptionFlags _options;
 20      CFLocaleRef _locale;
 21      UBreakIterator *_break_itr;
 22  };
 23  
 24  static void __CFStringTokenizerDeallocate(CFTypeRef cf) {
 25      struct __CFStringTokenizer *tokenizer = (struct __CFStringTokenizer *)cf;
 26      if (tokenizer->_string) {
 27          CFRelease(tokenizer->_string);
 28      }
 29  
 30      if (tokenizer->_locale) {
 31          CFRelease(tokenizer->_locale);
 32      }
 33  
 34      if (tokenizer->_break_itr) {
 35          ubrk_close(tokenizer->_break_itr);
 36      }
 37  }
 38  
 39  
 40  static CFTypeID __kCFStringTokenizerTypeID = _kCFRuntimeNotATypeID;
 41  
 42  static const CFRuntimeClass __CFStringTokenizerClass = {
 43      _kCFRuntimeScannedObject,
 44      "CFStringTokenizer",
 45      NULL,   // init
 46      NULL,   // copy
 47      __CFStringTokenizerDeallocate,
 48      NULL,   // __CFStringTokenizerEqual,
 49      NULL,   // __CFStringTokenizerHash,
 50      NULL,   // 
 51      NULL
 52  };
 53  
 54  void __CFStringTokenizerInitialize(void) {
 55      __kCFStringTokenizerTypeID = _CFRuntimeRegisterClass(&__CFStringTokenizerClass);
 56  }
 57  
 58  
 59  CFTypeID CFStringTokenizerGetTypeID(void) {
 60      if (__kCFStringTokenizerTypeID == _kCFRuntimeNotATypeID) {
 61          __CFStringTokenizerInitialize();
 62      }
 63      return __kCFStringTokenizerTypeID;
 64  }
 65  
 66  #define BUFFER_SIZE 768
 67  
 68  CFStringTokenizerRef CFStringTokenizerCreate(CFAllocatorRef allocator, CFStringRef string, CFRange range, CFOptionFlags options, CFLocaleRef locale) {
 69      CFIndex size = sizeof(struct __CFStringTokenizer) - sizeof(CFRuntimeBase);
 70      struct __CFStringTokenizer *tokenizer = (struct __CFStringTokenizer *)_CFRuntimeCreateInstance(allocator, CFStringTokenizerGetTypeID(), size, NULL);
 71      tokenizer->_string = CFStringCreateCopy(allocator, string);
 72      tokenizer->_range = range;
 73      tokenizer->_options = options;
 74      if (locale == NULL) {
 75          tokenizer->_locale = CFLocaleCopyCurrent();
 76      } else {
 77          tokenizer->_locale = CFRetain(locale);
 78      }
 79  
 80  
 81      CFStringRef localeName = locale ? CFLocaleGetIdentifier(locale) : CFSTR("");
 82      char buffer[BUFFER_SIZE];
 83      const char *cstr = CFStringGetCStringPtr(localeName, kCFStringEncodingASCII);
 84      if (NULL == cstr) {
 85          if (CFStringGetCString(localeName, buffer, BUFFER_SIZE, kCFStringEncodingASCII)) {
 86              cstr = buffer;
 87          }
 88      }
 89  
 90      if (NULL == cstr) {
 91          CFRelease((CFTypeRef)tokenizer);
 92          return NULL;
 93      }
 94  
 95      UBreakIteratorType type;
 96      // UBRK_CHARACTER, UBRK_WORD, UBRK_LINE, UBRK_SENTENCE
 97      switch (options & TYPE_MASK) { // mask off the high bits since they can be options
 98          case kCFStringTokenizerUnitWord:
 99          case kCFStringTokenizerUnitWordBoundary:
100              type = UBRK_WORD;
101              break;
102          case kCFStringTokenizerUnitSentence:
103          case kCFStringTokenizerUnitParagraph: // this seems incorrect.
104              type = UBRK_SENTENCE;
105              break;
106          case kCFStringTokenizerUnitLineBreak:
107              type = UBRK_LINE;
108              break;
109      }
110  
111      UChar stack_text[BUFFER_SIZE] = {0};
112      UChar *text = &stack_text[0];
113      CFIndex len = CFStringGetLength(string);
114      if (len > BUFFER_SIZE) {
115          text = malloc(len * sizeof(UChar));
116          if (text == NULL) {
117              CFRelease(tokenizer);
118              return NULL;
119          }
120      }
121      CFStringGetCharacters(string, CFRangeMake(0, len), (UniChar *)text);
122      UErrorCode err = 0;
123      tokenizer->_break_itr = ubrk_open(type, cstr, text, len, &err);
124      if (text != &stack_text[0]) {
125          free(text);
126      }
127  
128      if (tokenizer->_break_itr == NULL) {
129          CFRelease(tokenizer);
130          return NULL;
131      }
132  
133      return tokenizer;
134  }
135  
136  /*
137  This requires a fairly massive database and heuristic modeling of langauge.
138  CFStringRef CFStringTokenizerCopyBestStringLanguage(CFStringRef string, CFRange range) {
139  
140  }
141  */
142  
143  void CFStringTokenizerSetString(CFStringTokenizerRef tokenizer, CFStringRef string, CFRange range) {
144  
145  #warning TODO: range is not handled currently
146  
147      UChar stack_text[BUFFER_SIZE] = { 0 };
148      UChar *text = &stack_text[0];
149      CFIndex len = CFStringGetLength(string);
150      if (len > BUFFER_SIZE) {
151          text = malloc(len * sizeof(UChar));
152          if (text == NULL) {
153              return;
154          }
155      }
156      CFStringGetCharacters(string, CFRangeMake(0, len), (UniChar *)text);
157      UErrorCode err = 0;
158      ubrk_setText(tokenizer->_break_itr, text, len, &err);
159      if (text != &stack_text[0]) {
160          free(text);
161      }  
162  }
163  
164  CFStringTokenizerTokenType CFStringTokenizerGoToTokenAtIndex(CFStringTokenizerRef tokenizer, CFIndex index) {
165      int32_t res = ubrk_following(tokenizer->_break_itr, index);
166      if (res == UBRK_DONE) {
167          return kCFStringTokenizerTokenNone;
168      } else {
169          return kCFStringTokenizerTokenNormal;
170      }
171  }
172  
173  CFStringTokenizerTokenType CFStringTokenizerAdvanceToNextToken(CFStringTokenizerRef tokenizer) {
174      int32_t type = ubrk_next(tokenizer->_break_itr);
175      return type == UBRK_DONE ? kCFStringTokenizerTokenNone : kCFStringTokenizerTokenNormal;
176  }
177  
178  CFRange CFStringTokenizerGetCurrentTokenRange(CFStringTokenizerRef tokenizer) {
179      int32_t prev = ubrk_previous(tokenizer->_break_itr);
180      int32_t curr = ubrk_next(tokenizer->_break_itr);
181      if (curr == UBRK_DONE) {
182          return CFRangeMake(0, -1);
183      } else {
184          return CFRangeMake(prev, curr - prev);
185      }
186  }
187  
188  CFTypeRef CFStringTokenizerCopyCurrentTokenAttribute(CFStringTokenizerRef tokenizer, CFOptionFlags attribute) {
189      // docs says this can validly return null, seems reasonable...
190      return NULL;
191  }
192  
193  /*
194  This requires linguistic databases for compound words.
195  CFIndex CFStringTokenizerGetCurrentSubTokens(CFStringTokenizerRef tokenizer, CFRange *ranges, CFIndex maxRangeLength, CFMutableArrayRef derivedSubTokens) {
196  
197  }
198  */
199