/ src / modules / cmdpal / Microsoft.Terminal.UI / FontIconGlyphClassifier.cpp
FontIconGlyphClassifier.cpp
  1  #include "pch.h"
  2  #include "FontIconGlyphClassifier.h"
  3  #include "FontIconGlyphClassifier.g.cpp"
  4  
  5  #include <icu.h>
  6  #include <utility>
  7  
  8  namespace winrt::Microsoft::Terminal::UI::implementation
  9  {
 10      namespace
 11      {
 12          // Check if the code point is in the Private Use Area range used by Fluent UI icons.
 13          [[nodiscard]] constexpr bool _isFluentIconPua(const UChar32 cp) noexcept
 14          {
 15              constexpr UChar32 fluentIconsPrivateUseAreaStart = 0xE700;
 16              constexpr UChar32 fluentIconsPrivateUseAreaEnd = 0xF8FF;
 17              return cp >= fluentIconsPrivateUseAreaStart && cp <= fluentIconsPrivateUseAreaEnd;
 18          }
 19  
 20          // Determine if the given text (as a sequence of UChar code units) is emoji
 21          [[nodiscard]] bool _isEmoji(const UChar* p, const int32_t length) noexcept
 22          {
 23              if (!p || length < 1)
 24              {
 25                  return false;
 26              }
 27  
 28              // https://www.unicode.org/reports/tr51/#Emoji_Variation_Selector_Notes
 29              constexpr UChar32 vs15CodePoint = 0xFE0E; // Variation Selectors 15: text variation selector
 30              constexpr UChar32 vs16CodePoint = 0xFE0F; // Variation Selectors: 16 emoji variation selector
 31  
 32              // Decode the first code point correctly (surrogate-safe)
 33              int32_t i0{ 0 };
 34              UChar32 first{ 0 };
 35              U16_NEXT(p, i0, length, first);
 36  
 37              for (int32_t i = 0; i < length;)
 38              {
 39                  UChar32 cp{ 0 };
 40                  U16_NEXT(p, i, length, cp);
 41  
 42                  if (cp == vs16CodePoint) { return true; }
 43                  if (cp == vs15CodePoint) { return false; }
 44              }
 45  
 46              return !U_IS_SURROGATE(first) && u_hasBinaryProperty(first, UCHAR_EMOJI_PRESENTATION);
 47          }
 48      }
 49  
 50      bool FontIconGlyphClassifier::IsLikelyToBeEmojiOrSymbolIcon(const hstring& text)
 51      {
 52          if (text.empty())
 53          {
 54              return false;
 55          }
 56  
 57          if (text.size() == 1 && !IS_HIGH_SURROGATE(text[0]))
 58          {
 59              // If it's a single code unit, it's definitely either zero or one grapheme clusters.
 60              // If it turns out to be illegal Unicode, we don't really care.
 61              return true;
 62          }
 63  
 64          if (text.size() >= 2 && text[0] <= 0x7F && text[1] <= 0x7F)
 65          {
 66              // Two adjacent ASCII characters (as seen in most file paths) aren't a single
 67              // grapheme cluster.
 68              return false;
 69          }
 70  
 71          // Use ICU to determine whether text is composed of a single grapheme cluster.
 72          int32_t off{ 0 };
 73          UErrorCode status{ U_ZERO_ERROR };
 74  
 75          UBreakIterator* const bi{ ubrk_open(UBRK_CHARACTER,
 76                                              nullptr,
 77                                              reinterpret_cast<const UChar*>(text.data()),
 78                                              static_cast<int>(text.size()),
 79                                              &status) };
 80          if (bi)
 81          {
 82              if (U_SUCCESS(status))
 83              {
 84                  off = ubrk_next(bi);
 85              }
 86              ubrk_close(bi);
 87          }
 88          return std::cmp_equal(off, text.size());
 89      }
 90  
 91      FontIconGlyphKind FontIconGlyphClassifier::Classify(hstring const& text) noexcept
 92      {
 93          if (text.empty())
 94          {
 95              return FontIconGlyphKind::None;
 96          }
 97  
 98          const size_t textSize{ text.size() };
 99          const auto* buffer{ reinterpret_cast<const UChar*>(text.c_str()) };
100  
101          // Fast path 1: Single UTF-16 code unit (most common case)
102          if (textSize == 1)
103          {
104              const UChar ch{ buffer[0] };
105  
106              if (IS_HIGH_SURROGATE(ch))
107              {
108                  return FontIconGlyphKind::Invalid;
109              }
110  
111              if (_isFluentIconPua(ch))
112              {
113                  return FontIconGlyphKind::FluentSymbol;
114              }
115  
116              if (_isEmoji(&ch, 1))
117              {
118                  return FontIconGlyphKind::Emoji;
119              }
120  
121              return FontIconGlyphKind::Other;
122          }
123  
124          // Fast path 2: Common file path pattern - two ASCII printable characters
125          if (textSize >= 2 && buffer[0] <= 0x7F && buffer[1] <= 0x7F)
126          {
127              // Definitely multiple graphemes
128              return FontIconGlyphKind::Invalid;
129          }
130  
131          // Expensive path: Use ICU to determine grapheme boundaries
132          UErrorCode status{ U_ZERO_ERROR };
133  
134          UBreakIterator* bi{ ubrk_open(UBRK_CHARACTER,
135                                        nullptr,
136                                        buffer,
137                                        static_cast<int32_t>(textSize),
138                                        &status) };
139  
140          if (U_FAILURE(status) || !bi)
141          {
142              return FontIconGlyphKind::Invalid;
143          }
144  
145          const int32_t start{ ubrk_first(bi) };
146          const int32_t end{ ubrk_next(bi) }; // end of first grapheme
147          ubrk_close(bi);
148  
149          // No graphemes found
150          if (end == UBRK_DONE || end <= start)
151          {
152              return FontIconGlyphKind::None;
153          }
154  
155          // If there's more than one grapheme, it's not a valid icon glyph
156          if (std::cmp_not_equal(end, textSize))
157          {
158              return FontIconGlyphKind::Invalid;
159          }
160  
161          // Exactly one grapheme: classify
162          const UChar* grapheme = buffer + start;
163          const int32_t graphemeLength = end - start;
164  
165          if (graphemeLength == 1 && _isFluentIconPua(grapheme[0]))
166          {
167              return FontIconGlyphKind::FluentSymbol;
168          }
169  
170          if (_isEmoji(grapheme, graphemeLength))
171          {
172              return FontIconGlyphKind::Emoji;
173          }
174  
175          return FontIconGlyphKind::Other;
176      }
177  }