/ lib / Utf8 / Utf8.cpp
Utf8.cpp
 1  #include "Utf8.h"
 2  
 3  int utf8CodepointLen(const unsigned char c) {
 4    if (c < 0x80) return 1;          // 0xxxxxxx
 5    if ((c >> 5) == 0x6) return 2;   // 110xxxxx
 6    if ((c >> 4) == 0xE) return 3;   // 1110xxxx
 7    if ((c >> 3) == 0x1E) return 4;  // 11110xxx
 8    return 1;                        // fallback for invalid
 9  }
10  
11  uint32_t utf8NextCodepoint(const unsigned char** string) {
12    if (**string == 0) {
13      return 0;
14    }
15  
16    const int bytes = utf8CodepointLen(**string);
17    const uint8_t* chr = *string;
18    *string += bytes;
19  
20    if (bytes == 1) {
21      return chr[0];
22    }
23  
24    uint32_t cp = chr[0] & ((1 << (7 - bytes)) - 1);  // mask header bits
25  
26    for (int i = 1; i < bytes; i++) {
27      cp = (cp << 6) | (chr[i] & 0x3F);
28    }
29  
30    return cp;
31  }
32  
// Removes the last UTF-8 encoded character from `str` in place.
// Scans backwards from the final byte over continuation bytes (10xxxxxx) until
// it reaches a byte that is not one, or index 0, and cuts the string there.
// Returns the new size of the string (0 if it was already empty).
size_t utf8RemoveLastChar(std::string& str) {
  if (str.empty()) {
    return 0;
  }

  size_t cut = str.size() - 1;
  for (; cut > 0; --cut) {
    const unsigned char byte = static_cast<unsigned char>(str[cut]);
    const bool isContinuation = (byte & 0xC0) == 0x80;
    if (!isContinuation) {
      break;  // found the byte that starts the last character
    }
  }

  str.resize(cut);
  return cut;
}
44  
45  void utf8TruncateChars(std::string& str, size_t numChars) {
46    for (size_t i = 0; i < numChars && !str.empty(); ++i) {
47      utf8RemoveLastChar(str);
48    }
49  }