Utf8.cpp
1 #include "Utf8.h" 2 3 int utf8CodepointLen(const unsigned char c) { 4 if (c < 0x80) return 1; // 0xxxxxxx 5 if ((c >> 5) == 0x6) return 2; // 110xxxxx 6 if ((c >> 4) == 0xE) return 3; // 1110xxxx 7 if ((c >> 3) == 0x1E) return 4; // 11110xxx 8 return 1; // fallback for invalid 9 } 10 11 uint32_t utf8NextCodepoint(const unsigned char** string) { 12 if (**string == 0) { 13 return 0; 14 } 15 16 const int bytes = utf8CodepointLen(**string); 17 const uint8_t* chr = *string; 18 *string += bytes; 19 20 if (bytes == 1) { 21 return chr[0]; 22 } 23 24 uint32_t cp = chr[0] & ((1 << (7 - bytes)) - 1); // mask header bits 25 26 for (int i = 1; i < bytes; i++) { 27 cp = (cp << 6) | (chr[i] & 0x3F); 28 } 29 30 return cp; 31 } 32 33 size_t utf8RemoveLastChar(std::string& str) { 34 if (str.empty()) return 0; 35 size_t pos = str.size() - 1; 36 // Walk back to find the start of the last UTF-8 character 37 // UTF-8 continuation bytes start with 10xxxxxx (0x80-0xBF) 38 while (pos > 0 && (static_cast<unsigned char>(str[pos]) & 0xC0) == 0x80) { 39 --pos; 40 } 41 str.resize(pos); 42 return pos; 43 } 44 45 void utf8TruncateChars(std::string& str, size_t numChars) { 46 for (size_t i = 0; i < numChars && !str.empty(); ++i) { 47 utf8RemoveLastChar(str); 48 } 49 }