ScriptDetector.cpp
1 #include "ScriptDetector.h" 2 3 #include <Utf8.h> 4 5 namespace ScriptDetector { 6 7 bool isCjkCodepoint(uint32_t cp) { 8 // CJK Unified Ideographs 9 if (cp >= 0x4E00 && cp <= 0x9FFF) return true; 10 // CJK Extension A 11 if (cp >= 0x3400 && cp <= 0x4DBF) return true; 12 // CJK Compatibility Ideographs 13 if (cp >= 0xF900 && cp <= 0xFAFF) return true; 14 // Hiragana 15 if (cp >= 0x3040 && cp <= 0x309F) return true; 16 // Katakana 17 if (cp >= 0x30A0 && cp <= 0x30FF) return true; 18 // Hangul Syllables 19 if (cp >= 0xAC00 && cp <= 0xD7AF) return true; 20 // CJK Extension B and beyond (Plane 2) 21 if (cp >= 0x20000 && cp <= 0x2A6DF) return true; 22 // Fullwidth ASCII variants (often used in CJK context) 23 if (cp >= 0xFF00 && cp <= 0xFFEF) return true; 24 return false; 25 } 26 27 bool containsThai(const char* text) { 28 if (text == nullptr) return false; 29 30 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(text); 31 uint32_t cp; 32 33 while ((cp = utf8NextCodepoint(&ptr))) { 34 if (isThaiCodepoint(cp)) { 35 return true; 36 } 37 } 38 return false; 39 } 40 41 bool containsCjk(const char* text) { 42 if (text == nullptr) return false; 43 44 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(text); 45 uint32_t cp; 46 47 while ((cp = utf8NextCodepoint(&ptr))) { 48 if (isCjkCodepoint(cp)) { 49 return true; 50 } 51 } 52 return false; 53 } 54 55 Script classify(const char* word) { 56 if (word == nullptr || *word == '\0') { 57 return Script::OTHER; 58 } 59 60 const unsigned char* ptr = reinterpret_cast<const unsigned char*>(word); 61 uint32_t cp; 62 63 while ((cp = utf8NextCodepoint(&ptr))) { 64 // Skip ASCII - continue to find first non-ASCII character 65 if (cp < 0x80) { 66 continue; 67 } 68 69 // Check Thai first (smaller range, fast check) 70 if (isThaiCodepoint(cp)) { 71 return Script::THAI; 72 } 73 74 // Check CJK ranges 75 if (isCjkCodepoint(cp)) { 76 return Script::CJK; 77 } 78 79 // Extended Latin, Cyrillic, Greek, etc. - treat as Latin 80 // Latin Extended: U+0080-U+024F 81 // Latin Extended Additional: U+1E00-U+1EFF 82 // Cyrillic: U+0400-U+04FF 83 // Greek: U+0370-U+03FF 84 if ((cp >= 0x0080 && cp <= 0x024F) || // Latin Extended 85 (cp >= 0x1E00 && cp <= 0x1EFF) || // Latin Extended Additional 86 (cp >= 0x0400 && cp <= 0x04FF) || // Cyrillic 87 (cp >= 0x0370 && cp <= 0x03FF)) { // Greek 88 return Script::LATIN; 89 } 90 91 // Unknown non-ASCII - classify as OTHER 92 return Script::OTHER; 93 } 94 95 // All ASCII - classify as LATIN 96 return Script::LATIN; 97 } 98 99 } // namespace ScriptDetector