FontIconGlyphClassifier.cpp
1 #include "pch.h" 2 #include "FontIconGlyphClassifier.h" 3 #include "FontIconGlyphClassifier.g.cpp" 4 5 #include <icu.h> 6 #include <utility> 7 8 namespace winrt::Microsoft::Terminal::UI::implementation 9 { 10 namespace 11 { 12 // Check if the code point is in the Private Use Area range used by Fluent UI icons. 13 [[nodiscard]] constexpr bool _isFluentIconPua(const UChar32 cp) noexcept 14 { 15 constexpr UChar32 fluentIconsPrivateUseAreaStart = 0xE700; 16 constexpr UChar32 fluentIconsPrivateUseAreaEnd = 0xF8FF; 17 return cp >= fluentIconsPrivateUseAreaStart && cp <= fluentIconsPrivateUseAreaEnd; 18 } 19 20 // Determine if the given text (as a sequence of UChar code units) is emoji 21 [[nodiscard]] bool _isEmoji(const UChar* p, const int32_t length) noexcept 22 { 23 if (!p || length < 1) 24 { 25 return false; 26 } 27 28 // https://www.unicode.org/reports/tr51/#Emoji_Variation_Selector_Notes 29 constexpr UChar32 vs15CodePoint = 0xFE0E; // Variation Selectors 15: text variation selector 30 constexpr UChar32 vs16CodePoint = 0xFE0F; // Variation Selectors: 16 emoji variation selector 31 32 // Decode the first code point correctly (surrogate-safe) 33 int32_t i0{ 0 }; 34 UChar32 first{ 0 }; 35 U16_NEXT(p, i0, length, first); 36 37 for (int32_t i = 0; i < length;) 38 { 39 UChar32 cp{ 0 }; 40 U16_NEXT(p, i, length, cp); 41 42 if (cp == vs16CodePoint) { return true; } 43 if (cp == vs15CodePoint) { return false; } 44 } 45 46 return !U_IS_SURROGATE(first) && u_hasBinaryProperty(first, UCHAR_EMOJI_PRESENTATION); 47 } 48 } 49 50 bool FontIconGlyphClassifier::IsLikelyToBeEmojiOrSymbolIcon(const hstring& text) 51 { 52 if (text.empty()) 53 { 54 return false; 55 } 56 57 if (text.size() == 1 && !IS_HIGH_SURROGATE(text[0])) 58 { 59 // If it's a single code unit, it's definitely either zero or one grapheme clusters. 60 // If it turns out to be illegal Unicode, we don't really care. 61 return true; 62 } 63 64 if (text.size() >= 2 && text[0] <= 0x7F && text[1] <= 0x7F) 65 { 66 // Two adjacent ASCII characters (as seen in most file paths) aren't a single 67 // grapheme cluster. 68 return false; 69 } 70 71 // Use ICU to determine whether text is composed of a single grapheme cluster. 72 int32_t off{ 0 }; 73 UErrorCode status{ U_ZERO_ERROR }; 74 75 UBreakIterator* const bi{ ubrk_open(UBRK_CHARACTER, 76 nullptr, 77 reinterpret_cast<const UChar*>(text.data()), 78 static_cast<int>(text.size()), 79 &status) }; 80 if (bi) 81 { 82 if (U_SUCCESS(status)) 83 { 84 off = ubrk_next(bi); 85 } 86 ubrk_close(bi); 87 } 88 return std::cmp_equal(off, text.size()); 89 } 90 91 FontIconGlyphKind FontIconGlyphClassifier::Classify(hstring const& text) noexcept 92 { 93 if (text.empty()) 94 { 95 return FontIconGlyphKind::None; 96 } 97 98 const size_t textSize{ text.size() }; 99 const auto* buffer{ reinterpret_cast<const UChar*>(text.c_str()) }; 100 101 // Fast path 1: Single UTF-16 code unit (most common case) 102 if (textSize == 1) 103 { 104 const UChar ch{ buffer[0] }; 105 106 if (IS_HIGH_SURROGATE(ch)) 107 { 108 return FontIconGlyphKind::Invalid; 109 } 110 111 if (_isFluentIconPua(ch)) 112 { 113 return FontIconGlyphKind::FluentSymbol; 114 } 115 116 if (_isEmoji(&ch, 1)) 117 { 118 return FontIconGlyphKind::Emoji; 119 } 120 121 return FontIconGlyphKind::Other; 122 } 123 124 // Fast path 2: Common file path pattern - two ASCII printable characters 125 if (textSize >= 2 && buffer[0] <= 0x7F && buffer[1] <= 0x7F) 126 { 127 // Definitely multiple graphemes 128 return FontIconGlyphKind::Invalid; 129 } 130 131 // Expensive path: Use ICU to determine grapheme boundaries 132 UErrorCode status{ U_ZERO_ERROR }; 133 134 UBreakIterator* bi{ ubrk_open(UBRK_CHARACTER, 135 nullptr, 136 buffer, 137 static_cast<int32_t>(textSize), 138 &status) }; 139 140 if (U_FAILURE(status) || !bi) 141 { 142 return FontIconGlyphKind::Invalid; 143 } 144 145 const int32_t start{ ubrk_first(bi) }; 146 const int32_t end{ ubrk_next(bi) }; // end of first grapheme 147 ubrk_close(bi); 148 149 // No graphemes found 150 if (end == UBRK_DONE || end <= start) 151 { 152 return FontIconGlyphKind::None; 153 } 154 155 // If there's more than one grapheme, it's not a valid icon glyph 156 if (std::cmp_not_equal(end, textSize)) 157 { 158 return FontIconGlyphKind::Invalid; 159 } 160 161 // Exactly one grapheme: classify 162 const UChar* grapheme = buffer + start; 163 const int32_t graphemeLength = end - start; 164 165 if (graphemeLength == 1 && _isFluentIconPua(grapheme[0])) 166 { 167 return FontIconGlyphKind::FluentSymbol; 168 } 169 170 if (_isEmoji(grapheme, graphemeLength)) 171 { 172 return FontIconGlyphKind::Emoji; 173 } 174 175 return FontIconGlyphKind::Other; 176 } 177 }