// CFStringTokenizer.c
1 // 2 // CFStringTokenizer.c 3 // CoreFoundation 4 // 5 // Copyright (c) 2014 Apportable. All rights reserved. 6 // 7 8 #include "CFBase.h" 9 #include "CFRuntime.h" 10 #include "CFStringTokenizer.h" 11 #include <unicode/ubrk.h> 12 13 #define TYPE_MASK 0x000000FF 14 15 struct __CFStringTokenizer { 16 CFRuntimeBase _base; 17 CFStringRef _string; 18 CFRange _range; 19 CFOptionFlags _options; 20 CFLocaleRef _locale; 21 UBreakIterator *_break_itr; 22 }; 23 24 static void __CFStringTokenizerDeallocate(CFTypeRef cf) { 25 struct __CFStringTokenizer *tokenizer = (struct __CFStringTokenizer *)cf; 26 if (tokenizer->_string) { 27 CFRelease(tokenizer->_string); 28 } 29 30 if (tokenizer->_locale) { 31 CFRelease(tokenizer->_locale); 32 } 33 34 if (tokenizer->_break_itr) { 35 ubrk_close(tokenizer->_break_itr); 36 } 37 } 38 39 40 static CFTypeID __kCFStringTokenizerTypeID = _kCFRuntimeNotATypeID; 41 42 static const CFRuntimeClass __CFStringTokenizerClass = { 43 _kCFRuntimeScannedObject, 44 "CFStringTokenizer", 45 NULL, // init 46 NULL, // copy 47 __CFStringTokenizerDeallocate, 48 NULL, // __CFStringTokenizerEqual, 49 NULL, // __CFStringTokenizerHash, 50 NULL, // 51 NULL 52 }; 53 54 void __CFStringTokenizerInitialize(void) { 55 __kCFStringTokenizerTypeID = _CFRuntimeRegisterClass(&__CFStringTokenizerClass); 56 } 57 58 59 CFTypeID CFStringTokenizerGetTypeID(void) { 60 if (__kCFStringTokenizerTypeID == _kCFRuntimeNotATypeID) { 61 __CFStringTokenizerInitialize(); 62 } 63 return __kCFStringTokenizerTypeID; 64 } 65 66 #define BUFFER_SIZE 768 67 68 CFStringTokenizerRef CFStringTokenizerCreate(CFAllocatorRef allocator, CFStringRef string, CFRange range, CFOptionFlags options, CFLocaleRef locale) { 69 CFIndex size = sizeof(struct __CFStringTokenizer) - sizeof(CFRuntimeBase); 70 struct __CFStringTokenizer *tokenizer = (struct __CFStringTokenizer *)_CFRuntimeCreateInstance(allocator, CFStringTokenizerGetTypeID(), size, NULL); 71 tokenizer->_string = CFStringCreateCopy(allocator, string); 72 
tokenizer->_range = range; 73 tokenizer->_options = options; 74 if (locale == NULL) { 75 tokenizer->_locale = CFLocaleCopyCurrent(); 76 } else { 77 tokenizer->_locale = CFRetain(locale); 78 } 79 80 81 CFStringRef localeName = locale ? CFLocaleGetIdentifier(locale) : CFSTR(""); 82 char buffer[BUFFER_SIZE]; 83 const char *cstr = CFStringGetCStringPtr(localeName, kCFStringEncodingASCII); 84 if (NULL == cstr) { 85 if (CFStringGetCString(localeName, buffer, BUFFER_SIZE, kCFStringEncodingASCII)) { 86 cstr = buffer; 87 } 88 } 89 90 if (NULL == cstr) { 91 CFRelease((CFTypeRef)tokenizer); 92 return NULL; 93 } 94 95 UBreakIteratorType type; 96 // UBRK_CHARACTER, UBRK_WORD, UBRK_LINE, UBRK_SENTENCE 97 switch (options & TYPE_MASK) { // mask off the high bits since they can be options 98 case kCFStringTokenizerUnitWord: 99 case kCFStringTokenizerUnitWordBoundary: 100 type = UBRK_WORD; 101 break; 102 case kCFStringTokenizerUnitSentence: 103 case kCFStringTokenizerUnitParagraph: // this seems incorrect. 104 type = UBRK_SENTENCE; 105 break; 106 case kCFStringTokenizerUnitLineBreak: 107 type = UBRK_LINE; 108 break; 109 } 110 111 UChar stack_text[BUFFER_SIZE] = {0}; 112 UChar *text = &stack_text[0]; 113 CFIndex len = CFStringGetLength(string); 114 if (len > BUFFER_SIZE) { 115 text = malloc(len * sizeof(UChar)); 116 if (text == NULL) { 117 CFRelease(tokenizer); 118 return NULL; 119 } 120 } 121 CFStringGetCharacters(string, CFRangeMake(0, len), (UniChar *)text); 122 UErrorCode err = 0; 123 tokenizer->_break_itr = ubrk_open(type, cstr, text, len, &err); 124 if (text != &stack_text[0]) { 125 free(text); 126 } 127 128 if (tokenizer->_break_itr == NULL) { 129 CFRelease(tokenizer); 130 return NULL; 131 } 132 133 return tokenizer; 134 } 135 136 /* 137 This requires a fairly massive database and heuristic modeling of langauge. 
138 CFStringRef CFStringTokenizerCopyBestStringLanguage(CFStringRef string, CFRange range) { 139 140 } 141 */ 142 143 void CFStringTokenizerSetString(CFStringTokenizerRef tokenizer, CFStringRef string, CFRange range) { 144 145 #warning TODO: range is not handled currently 146 147 UChar stack_text[BUFFER_SIZE] = { 0 }; 148 UChar *text = &stack_text[0]; 149 CFIndex len = CFStringGetLength(string); 150 if (len > BUFFER_SIZE) { 151 text = malloc(len * sizeof(UChar)); 152 if (text == NULL) { 153 return; 154 } 155 } 156 CFStringGetCharacters(string, CFRangeMake(0, len), (UniChar *)text); 157 UErrorCode err = 0; 158 ubrk_setText(tokenizer->_break_itr, text, len, &err); 159 if (text != &stack_text[0]) { 160 free(text); 161 } 162 } 163 164 CFStringTokenizerTokenType CFStringTokenizerGoToTokenAtIndex(CFStringTokenizerRef tokenizer, CFIndex index) { 165 int32_t res = ubrk_following(tokenizer->_break_itr, index); 166 if (res == UBRK_DONE) { 167 return kCFStringTokenizerTokenNone; 168 } else { 169 return kCFStringTokenizerTokenNormal; 170 } 171 } 172 173 CFStringTokenizerTokenType CFStringTokenizerAdvanceToNextToken(CFStringTokenizerRef tokenizer) { 174 int32_t type = ubrk_next(tokenizer->_break_itr); 175 return type == UBRK_DONE ? kCFStringTokenizerTokenNone : kCFStringTokenizerTokenNormal; 176 } 177 178 CFRange CFStringTokenizerGetCurrentTokenRange(CFStringTokenizerRef tokenizer) { 179 int32_t prev = ubrk_previous(tokenizer->_break_itr); 180 int32_t curr = ubrk_next(tokenizer->_break_itr); 181 if (curr == UBRK_DONE) { 182 return CFRangeMake(0, -1); 183 } else { 184 return CFRangeMake(prev, curr - prev); 185 } 186 } 187 188 CFTypeRef CFStringTokenizerCopyCurrentTokenAttribute(CFStringTokenizerRef tokenizer, CFOptionFlags attribute) { 189 // docs says this can validly return null, seems reasonable... 190 return NULL; 191 } 192 193 /* 194 This requires linguistic databases for compound words. 
195 CFIndex CFStringTokenizerGetCurrentSubTokens(CFStringTokenizerRef tokenizer, CFRange *ranges, CFIndex maxRangeLength, CFMutableArrayRef derivedSubTokens) { 196 197 } 198 */ 199