/ lib / ScriptDetector / ScriptDetector.h
ScriptDetector.h
 1  #pragma once
 2  
 3  #include <cstdint>
 4  
 5  /**
 6   * Script Detection Utility
 7   *
 8   * Provides fast detection of script types for text rendering decisions.
 9   * Used to determine spacing rules and rendering paths for multi-script text.
10   */
11  
namespace ScriptDetector {

/**
 * Coarse script classification used for rendering decisions.
 *
 * Stored in a single byte (underlying type std::uint8_t). Enumerator order is
 * part of the interface: LATIN is 0 and the rest follow in declaration order.
 */
enum class Script : std::uint8_t {
  LATIN,  ///< Latin, Cyrillic, Greek, and other space-separated scripts
  CJK,    ///< Chinese, Japanese, Korean (no spaces between characters)
  THAI,   ///< Thai script (requires shaping, no word spaces)
  OTHER   ///< Symbols, digits, punctuation, unknown
};

/**
 * Classify a word's primary script based on its first significant codepoint.
 * For mixed content, returns the script of the first non-ASCII character.
 *
 * Declaration only; the definition is not part of this header.
 * NOTE(review): the result for nullptr, an empty string, or malformed UTF-8
 * is not specified here - confirm against the implementation.
 *
 * @param word UTF-8 encoded word (presumably NUL-terminated, since no length
 *             is passed - confirm against the implementation)
 * @return Detected script type
 */
Script classify(const char* word);

/**
 * Check if a codepoint falls in one of the CJK ranges listed below, i.e. a
 * character that allows a line break before/after it.
 * Based on UAX #14 Line Break Class ID.
 *
 * Ranges covered:
 * - CJK Unified Ideographs: U+4E00-U+9FFF
 * - CJK Extension A: U+3400-U+4DBF
 * - CJK Compatibility Ideographs: U+F900-U+FAFF
 * - Hiragana: U+3040-U+309F
 * - Katakana: U+30A0-U+30FF
 * - Hangul Syllables: U+AC00-U+D7AF
 * - CJK Extension B: U+20000-U+2A6DF
 * - Fullwidth forms: U+FF00-U+FFEF
 *
 * Declaration only; the definition is not part of this header.
 * NOTE(review): despite the "ideograph" wording of the original summary, the
 * list also contains kana, Hangul syllables and fullwidth forms. The listed
 * supplementary-plane range is Extension B only; confirm whether the
 * implementation also accepts later extensions.
 *
 * @param cp Unicode codepoint (a decoded scalar value, not a UTF-8 byte)
 * @return true if cp lies in one of the ranges above
 */
bool isCjkCodepoint(std::uint32_t cp);

/**
 * Check if a codepoint is in the Thai Unicode block (U+0E00-U+0E7F).
 *
 * Both bounds are inclusive. The function is a pure range test, so it is
 * constexpr (usable in static_assert / constant expressions) and noexcept.
 *
 * @param cp Unicode codepoint
 * @return true if 0x0E00 <= cp <= 0x0E7F
 */
constexpr bool isThaiCodepoint(std::uint32_t cp) noexcept {
  return cp >= 0x0E00 && cp <= 0x0E7F;
}

/**
 * Check if text contains any Thai codepoints (for fast-path detection).
 *
 * Declaration only; the definition is not part of this header.
 * NOTE(review): handling of nullptr and malformed UTF-8 is not specified
 * here - confirm against the implementation.
 *
 * @param text UTF-8 encoded text
 * @return true if any Thai character found
 */
bool containsThai(const char* text);

/**
 * Check if text contains any CJK codepoints (for fast-path detection).
 *
 * Declaration only; the definition is not part of this header.
 * NOTE(review): handling of nullptr and malformed UTF-8 is not specified
 * here - confirm against the implementation.
 *
 * @param text UTF-8 encoded text
 * @return true if any CJK character found
 */
bool containsCjk(const char* text);

}  // namespace ScriptDetector