ScriptDetector.h
#pragma once

#include <cstdint>

/**
 * Script Detection Utility
 *
 * Provides fast detection of script types for text rendering decisions.
 * Used to determine spacing rules and rendering paths for multi-script text.
 *
 * All text parameters are UTF-8 encoded C strings.
 * NOTE(review): presumably NUL-terminated; how a null pointer is handled is
 * decided by the implementation, which is not visible from this header - confirm.
 */

namespace ScriptDetector {

// Script classification for rendering decisions
enum class Script : uint8_t {
  LATIN,  // Latin, Cyrillic, Greek, and other space-separated scripts
  CJK,    // Chinese, Japanese, Korean (no spaces between characters)
  THAI,   // Thai script (requires shaping, no word spaces)
  OTHER   // Symbols, digits, punctuation, unknown
};

/**
 * Classify a word's primary script based on first significant codepoint.
 * For mixed content, returns the script of the first non-ASCII character.
 *
 * @param word UTF-8 encoded word
 * @return Detected script type
 */
Script classify(const char* word);

/**
 * Check if a codepoint is a CJK ideograph (allows line break before/after).
 * Based on UAX #14 Line Break Class ID.
 *
 * Note: besides ideographs proper, the ranges listed below also take in
 * kana, Hangul syllables and fullwidth forms.
 *
 * Ranges covered:
 * - CJK Unified Ideographs: U+4E00-U+9FFF
 * - CJK Extension A: U+3400-U+4DBF
 * - CJK Compatibility Ideographs: U+F900-U+FAFF
 * - Hiragana: U+3040-U+309F
 * - Katakana: U+30A0-U+30FF
 * - Hangul Syllables: U+AC00-U+D7AF
 * - CJK Extension B+: U+20000-U+2A6DF
 * - Fullwidth forms: U+FF00-U+FFEF
 *
 * @param cp Unicode codepoint to test
 * @return true if the codepoint falls in one of the ranges above
 */
bool isCjkCodepoint(uint32_t cp);

/**
 * Check if a codepoint is in the Thai Unicode block (U+0E00-U+0E7F).
 *
 * Both bounds are inclusive. The function is constexpr so the check can be
 * used in constant expressions (static_assert, compile-time tables) as well
 * as at run time.
 *
 * @param cp Unicode codepoint to test
 * @return true if cp lies within U+0E00..U+0E7F
 */
inline constexpr bool isThaiCodepoint(uint32_t cp) noexcept { return cp >= 0x0E00 && cp <= 0x0E7F; }

/**
 * Check if text contains any Thai codepoints (for fast-path detection).
 *
 * @param text UTF-8 encoded text
 * @return true if any Thai character found
 */
bool containsThai(const char* text);

/**
 * Check if text contains any CJK codepoints (for fast-path detection).
 *
 * @param text UTF-8 encoded text
 * @return true if any CJK character found
 */
bool containsCjk(const char* text);

}  // namespace ScriptDetector