/ lib / ScriptDetector / ScriptDetector.cpp
ScriptDetector.cpp
 1  #include "ScriptDetector.h"
 2  
 3  #include <Utf8.h>
 4  
 5  namespace ScriptDetector {
 6  
 7  bool isCjkCodepoint(uint32_t cp) {
 8    // CJK Unified Ideographs
 9    if (cp >= 0x4E00 && cp <= 0x9FFF) return true;
10    // CJK Extension A
11    if (cp >= 0x3400 && cp <= 0x4DBF) return true;
12    // CJK Compatibility Ideographs
13    if (cp >= 0xF900 && cp <= 0xFAFF) return true;
14    // Hiragana
15    if (cp >= 0x3040 && cp <= 0x309F) return true;
16    // Katakana
17    if (cp >= 0x30A0 && cp <= 0x30FF) return true;
18    // Hangul Syllables
19    if (cp >= 0xAC00 && cp <= 0xD7AF) return true;
20    // CJK Extension B and beyond (Plane 2)
21    if (cp >= 0x20000 && cp <= 0x2A6DF) return true;
22    // Fullwidth ASCII variants (often used in CJK context)
23    if (cp >= 0xFF00 && cp <= 0xFFEF) return true;
24    return false;
25  }
26  
27  bool containsThai(const char* text) {
28    if (text == nullptr) return false;
29  
30    const unsigned char* ptr = reinterpret_cast<const unsigned char*>(text);
31    uint32_t cp;
32  
33    while ((cp = utf8NextCodepoint(&ptr))) {
34      if (isThaiCodepoint(cp)) {
35        return true;
36      }
37    }
38    return false;
39  }
40  
41  bool containsCjk(const char* text) {
42    if (text == nullptr) return false;
43  
44    const unsigned char* ptr = reinterpret_cast<const unsigned char*>(text);
45    uint32_t cp;
46  
47    while ((cp = utf8NextCodepoint(&ptr))) {
48      if (isCjkCodepoint(cp)) {
49        return true;
50      }
51    }
52    return false;
53  }
54  
55  Script classify(const char* word) {
56    if (word == nullptr || *word == '\0') {
57      return Script::OTHER;
58    }
59  
60    const unsigned char* ptr = reinterpret_cast<const unsigned char*>(word);
61    uint32_t cp;
62  
63    while ((cp = utf8NextCodepoint(&ptr))) {
64      // Skip ASCII - continue to find first non-ASCII character
65      if (cp < 0x80) {
66        continue;
67      }
68  
69      // Check Thai first (smaller range, fast check)
70      if (isThaiCodepoint(cp)) {
71        return Script::THAI;
72      }
73  
74      // Check CJK ranges
75      if (isCjkCodepoint(cp)) {
76        return Script::CJK;
77      }
78  
79      // Extended Latin, Cyrillic, Greek, etc. - treat as Latin
80      // Latin Extended: U+0080-U+024F
81      // Latin Extended Additional: U+1E00-U+1EFF
82      // Cyrillic: U+0400-U+04FF
83      // Greek: U+0370-U+03FF
84      if ((cp >= 0x0080 && cp <= 0x024F) ||  // Latin Extended
85          (cp >= 0x1E00 && cp <= 0x1EFF) ||  // Latin Extended Additional
86          (cp >= 0x0400 && cp <= 0x04FF) ||  // Cyrillic
87          (cp >= 0x0370 && cp <= 0x03FF)) {  // Greek
88        return Script::LATIN;
89      }
90  
91      // Unknown non-ASCII - classify as OTHER
92      return Script::OTHER;
93    }
94  
95    // All ASCII - classify as LATIN
96    return Script::LATIN;
97  }
98  
99  }  // namespace ScriptDetector