til_string.h
  1  // Copyright (c) Microsoft Corporation.
  2  // Licensed under the MIT license.
  3  
  4  #pragma once
  5  
  6  namespace til // Terminal Implementation Library. Also: "Today I Learned"
  7  {
  8      // The at function declares that you've already sufficiently checked that your array access
  9      // is in range before retrieving an item inside it at an offset.
 10      // This is to save double/triple/quadruple testing in circumstances where you are already
 11      // pivoting on the length of a set and now want to pull elements out of it by offset
 12      // without checking again.
 13      // gsl::at will do the check again. As will .at(). And using [] will have a warning in audit.
 14      // This template is explicitly disabled if T is of type std::span, as it would interfere with
 15      // the overload below.
 16      template<typename T, typename I>
 17      constexpr auto at(T&& cont, const I i) noexcept -> decltype(auto)
 18      {
 19  #pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
 20  #pragma warning(suppress : 26482) // Suppress bounds.2 check for indexing with constant expressions
 21  #pragma warning(suppress : 26446) // Suppress bounds.4 check for subscript operator.
 22  #pragma warning(suppress : 26445) // Suppress lifetime check for a reference to std::span or std::string_view
 23          return cont[i];
 24      }
 25  
 26      _TIL_INLINEPREFIX std::wstring visualize_control_codes(std::wstring str) noexcept
 27      {
 28          for (auto& ch : str)
 29          {
 30              if (ch < 0x20)
 31              {
 32                  ch += 0x2400;
 33              }
 34              else if (ch == 0x20)
 35              {
 36                  ch = 0x2423; // replace space with ␣
 37              }
 38              else if (ch == 0x7f)
 39              {
 40                  ch = 0x2421; // replace del with ␡
 41              }
 42          }
 43          return str;
 44      }
 45      // The same as the above, but it doesn't visualize BS nor SPC.
 46      _TIL_INLINEPREFIX std::wstring visualize_nonspace_control_codes(std::wstring str) noexcept
 47      {
 48          for (auto& ch : str)
 49          {
 50              // NOT backspace!
 51              if (ch < 0x20 && ch != 0x08)
 52              {
 53                  ch += 0x2400;
 54              }
 55              // NOT space
 56              else if (ch == 0x7f)
 57              {
 58                  ch = 0x2421; // replace del with ␡
 59              }
 60          }
 61          return str;
 62      }
 63  
 64      _TIL_INLINEPREFIX std::wstring visualize_control_codes(std::wstring_view str)
 65      {
 66          return visualize_control_codes(std::wstring{ str });
 67      }
 68  
 69      namespace details
 70      {
 71          inline constexpr uint8_t __ = 0b00;
 72          inline constexpr uint8_t F_ = 0b10; // stripped in clean_filename
 73          inline constexpr uint8_t _P = 0b01; // stripped in clean_path
 74          inline constexpr uint8_t FP = 0b11; // stripped in clean_filename and clean_path
 75          inline constexpr std::array<uint8_t, 128> pathFilter{ {
 76              // clang-format off
 77              __ /* NUL */, __ /* SOH */, __ /* STX */, __ /* ETX */, __ /* EOT */, __ /* ENQ */, __ /* ACK */, __ /* BEL */, __ /* BS  */, __ /* HT  */, __ /* LF  */, __ /* VT  */, __ /* FF  */, __ /* CR  */, __ /* SO  */, __ /* SI  */,
 78              __ /* DLE */, __ /* DC1 */, __ /* DC2 */, __ /* DC3 */, __ /* DC4 */, __ /* NAK */, __ /* SYN */, __ /* ETB */, __ /* CAN */, __ /* EM  */, __ /* SUB */, __ /* ESC */, __ /* FS  */, __ /* GS  */, __ /* RS  */, __ /* US  */,
 79              __ /* SP  */, __ /* !   */, FP /* "   */, __ /* #   */, __ /* $   */, __ /* %   */, __ /* &   */, __ /* '   */, __ /* (   */, __ /* )   */, FP /* *   */, __ /* +   */, __ /* ,   */, __ /* -   */, __ /* .   */, F_ /* /   */,
 80              __ /* 0   */, __ /* 1   */, __ /* 2   */, __ /* 3   */, __ /* 4   */, __ /* 5   */, __ /* 6   */, __ /* 7   */, __ /* 8   */, __ /* 9   */, F_ /* :   */, __ /* ;   */, FP /* <   */, __ /* =   */, FP /* >   */, FP /* ?   */,
 81              __ /* @   */, __ /* A   */, __ /* B   */, __ /* C   */, __ /* D   */, __ /* E   */, __ /* F   */, __ /* G   */, __ /* H   */, __ /* I   */, __ /* J   */, __ /* K   */, __ /* L   */, __ /* M   */, __ /* N   */, __ /* O   */,
 82              __ /* P   */, __ /* Q   */, __ /* R   */, __ /* S   */, __ /* T   */, __ /* U   */, __ /* V   */, __ /* W   */, __ /* X   */, __ /* Y   */, __ /* Z   */, __ /* [   */, F_ /* \   */, __ /* ]   */, __ /* ^   */, __ /* _   */,
 83              __ /* `   */, __ /* a   */, __ /* b   */, __ /* c   */, __ /* d   */, __ /* e   */, __ /* f   */, __ /* g   */, __ /* h   */, __ /* i   */, __ /* j   */, __ /* k   */, __ /* l   */, __ /* m   */, __ /* n   */, __ /* o   */,
 84              __ /* p   */, __ /* q   */, __ /* r   */, __ /* s   */, __ /* t   */, __ /* u   */, __ /* v   */, __ /* w   */, __ /* x   */, __ /* y   */, __ /* z   */, __ /* {   */, FP /* |   */, __ /* }   */, __ /* ~   */, __ /* DEL */,
 85              // clang-format on
 86          } };
 87      }
 88  
 89      _TIL_INLINEPREFIX std::wstring clean_filename(std::wstring str) noexcept
 90      {
 91          using namespace til::details;
 92          std::erase_if(str, [](auto ch) {
 93              // This lookup is branchless: It always checks the filter, but throws
 94              // away the result if ch >= 128. This is faster than using `&&` (branchy).
 95              return ((til::at(details::pathFilter, ch & 127) & F_) != 0) & (ch < 128);
 96          });
 97          return str;
 98      }
 99  
100      _TIL_INLINEPREFIX std::wstring clean_path(std::wstring str) noexcept
101      {
102          using namespace til::details;
103          std::erase_if(str, [](auto ch) {
104              return ((til::at(details::pathFilter, ch & 127) & _P) != 0) & (ch < 128);
105          });
106          return str;
107      }
108  
109      // is_legal_path rules on whether a path contains any non-path characters.
110      // it **DOES NOT** rule on whether a path exists.
111      _TIL_INLINEPREFIX constexpr bool is_legal_path(const std::wstring_view str) noexcept
112      {
113          using namespace til::details;
114          return !std::any_of(std::begin(str), std::end(str), [](auto&& ch) {
115              return ((til::at(details::pathFilter, ch & 127) & _P) != 0) & (ch < 128);
116          });
117      }
118  
119      // std::string_view::starts_with support for C++17.
120      template<typename T, typename Traits>
121      constexpr bool starts_with(const std::basic_string_view<T, Traits>& str, const std::basic_string_view<T, Traits>& prefix) noexcept
122      {
123          return str.size() >= prefix.size() && __builtin_memcmp(str.data(), prefix.data(), prefix.size() * sizeof(T)) == 0;
124      }
125  
126      constexpr bool starts_with(const std::string_view& str, const std::string_view& prefix) noexcept
127      {
128          return starts_with<>(str, prefix);
129      }
130  
131      constexpr bool starts_with(const std::wstring_view& str, const std::wstring_view& prefix) noexcept
132      {
133          return starts_with<>(str, prefix);
134      }
135  
136      // std::string_view::ends_with support for C++17.
137      template<typename T, typename Traits>
138      constexpr bool ends_with(const std::basic_string_view<T, Traits>& str, const std::basic_string_view<T, Traits>& suffix) noexcept
139      {
140  #pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
141          return str.size() >= suffix.size() && __builtin_memcmp(str.data() + (str.size() - suffix.size()), suffix.data(), suffix.size() * sizeof(T)) == 0;
142      }
143  
144      constexpr bool ends_with(const std::string_view& str, const std::string_view& prefix) noexcept
145      {
146          return ends_with<>(str, prefix);
147      }
148  
149      constexpr bool ends_with(const std::wstring_view& str, const std::wstring_view& prefix) noexcept
150      {
151          return ends_with<>(str, prefix);
152      }
153  
154      inline constexpr unsigned long to_ulong_error = ULONG_MAX;
155      inline constexpr int to_int_error = INT_MAX;
156  
157      // Just like std::wcstoul, but without annoying locales and null-terminating strings.
158      // It has been fuzz-tested against clang's strtoul implementation.
159      template<typename T, typename Traits>
160      _TIL_INLINEPREFIX constexpr unsigned long to_ulong(const std::basic_string_view<T, Traits>& str, unsigned long base = 0) noexcept
161      {
162          static constexpr unsigned long maximumValue = ULONG_MAX / 16;
163  
164          // We don't have to test ptr for null value, as we only access it under either condition:
165          // * str.length() > 0, for determining the base
166          // * ptr != end, when parsing the characters; if ptr is null, length will be 0 and thus end == ptr
167  #pragma warning(push)
168  #pragma warning(disable : 26429) // Symbol 'ptr' is never tested for null value, it can be marked as not_null
169  #pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead
170          auto ptr = str.data();
171          const auto end = ptr + str.length();
172          unsigned long accumulator = 0;
173          unsigned long value = ULONG_MAX;
174  
175          if (!base)
176          {
177              base = 10;
178  
179              if (str.length() > 1 && *ptr == '0')
180              {
181                  base = 8;
182                  ++ptr;
183  
184                  if (str.length() > 2 && (*ptr == 'x' || *ptr == 'X'))
185                  {
186                      base = 16;
187                      ++ptr;
188                  }
189              }
190          }
191  
192          if (ptr == end)
193          {
194              return to_ulong_error;
195          }
196  
197          for (;; accumulator *= base)
198          {
199              value = ULONG_MAX;
200              if (*ptr >= '0' && *ptr <= '9')
201              {
202                  value = *ptr - '0';
203              }
204              else if (*ptr >= 'A' && *ptr <= 'F')
205              {
206                  value = *ptr - 'A' + 10;
207              }
208              else if (*ptr >= 'a' && *ptr <= 'f')
209              {
210                  value = *ptr - 'a' + 10;
211              }
212              else
213              {
214                  return to_ulong_error;
215              }
216  
217              accumulator += value;
218              if (accumulator >= maximumValue)
219              {
220                  return to_ulong_error;
221              }
222  
223              if (++ptr == end)
224              {
225                  return accumulator;
226              }
227          }
228  #pragma warning(pop)
229      }
230  
231      constexpr unsigned long to_ulong(const std::string_view& str, unsigned long base = 0) noexcept
232      {
233          return to_ulong<>(str, base);
234      }
235  
236      constexpr unsigned long to_ulong(const std::wstring_view& str, unsigned long base = 0) noexcept
237      {
238          return to_ulong<>(str, base);
239      }
240  
241      // Implement to_int in terms of to_ulong by negating its result. to_ulong does not expect
242      // to be passed signed numbers and will return an error accordingly. That error when
243      // compared against -1 evaluates to true. We account for that by returning to_int_error if to_ulong
244      // returns an error.
245      constexpr int to_int(const std::wstring_view& str, unsigned long base = 0) noexcept
246      {
247          auto result = to_ulong_error;
248          const auto signPosition = str.find(L"-");
249          const bool hasSign = signPosition != std::wstring_view::npos;
250          result = hasSign ? to_ulong(str.substr(signPosition + 1), base) : to_ulong(str, base);
251  
252          // Check that result is valid and will fit in an int.
253          if (result == to_ulong_error || (result > INT_MAX))
254          {
255              return to_int_error;
256          }
257  
258          return hasSign ? result * -1 : result;
259      }
260  
261      // Just like std::tolower, but without annoying locales.
262      template<typename T>
263      constexpr T tolower_ascii(T c)
264      {
265          if ((c >= 'A') && (c <= 'Z'))
266          {
267              c |= 0x20;
268          }
269  
270          return c;
271      }
272  
273      // Just like std::toupper, but without annoying locales.
274      template<typename T>
275      constexpr T toupper_ascii(T c)
276      {
277          if ((c >= 'a') && (c <= 'z'))
278          {
279              c &= ~0x20;
280          }
281  
282          return c;
283      }
284  
285      // Just like std::wstring_view::operator==().
286      //
287      // At the time of writing wmemcmp() is not an intrinsic for MSVC,
288      // but the STL uses it to implement wide string comparisons.
289      // This produces 3x the assembly _per_ comparison and increases
290      // runtime by 2-3x for strings of medium length (16 characters)
291      // and 5x or more for long strings (128 characters or more).
292      // See: https://github.com/microsoft/STL/issues/2289
293      template<typename T, typename Traits>
294      bool equals(const std::basic_string_view<T, Traits>& lhs, const std::basic_string_view<T, Traits>& rhs) noexcept
295      {
296          return lhs.size() == rhs.size() && __builtin_memcmp(lhs.data(), rhs.data(), lhs.size() * sizeof(T)) == 0;
297      }
298  
299      // Just like _memicmp, but without annoying locales.
300      template<typename T, typename Traits>
301      bool equals_insensitive_ascii(const std::basic_string_view<T, Traits>& str1, const std::basic_string_view<T, Traits>& str2) noexcept
302      {
303          if (str1.size() != str2.size())
304          {
305              return false;
306          }
307  
308  #pragma warning(push)
309  #pragma warning(disable : 26429) // Symbol 'data1' is never tested for null, it can be marked as not_null
310  #pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead
311          auto remaining = str1.size();
312          auto data1 = str1.data();
313          auto data2 = str2.data();
314          for (; remaining; --remaining, ++data1, ++data2)
315          {
316              if (*data1 != *data2 && tolower_ascii(*data1) != tolower_ascii(*data2))
317              {
318                  return false;
319              }
320          }
321  #pragma warning(pop)
322  
323          return true;
324      }
325  
326      inline bool equals_insensitive_ascii(const std::string_view& str1, const std::string_view& str2) noexcept
327      {
328          return equals_insensitive_ascii<>(str1, str2);
329      }
330  
331      inline bool equals_insensitive_ascii(const std::wstring_view& str1, const std::wstring_view& str2) noexcept
332      {
333          return equals_insensitive_ascii<>(str1, str2);
334      }
335  
336      template<typename T, typename Traits>
337      constexpr bool starts_with_insensitive_ascii(const std::basic_string_view<T, Traits>& str, const std::basic_string_view<T, Traits>& prefix) noexcept
338      {
339          return str.size() >= prefix.size() && equals_insensitive_ascii<>({ str.data(), prefix.size() }, prefix);
340      }
341  
342      constexpr bool starts_with_insensitive_ascii(const std::string_view& str, const std::string_view& prefix) noexcept
343      {
344          return starts_with_insensitive_ascii<>(str, prefix);
345      }
346  
347      constexpr bool starts_with_insensitive_ascii(const std::wstring_view& str, const std::wstring_view& prefix) noexcept
348      {
349          return starts_with_insensitive_ascii<>(str, prefix);
350      }
351  
352      template<typename T, typename Traits>
353      constexpr bool ends_with_insensitive_ascii(const std::basic_string_view<T, Traits>& str, const std::basic_string_view<T, Traits>& suffix) noexcept
354      {
355  #pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
356          return str.size() >= suffix.size() && equals_insensitive_ascii<>({ str.data() - suffix.size(), suffix.size() }, suffix);
357      }
358  
359      constexpr bool ends_with_insensitive_ascii(const std::string_view& str, const std::string_view& prefix) noexcept
360      {
361          return ends_with_insensitive_ascii<>(str, prefix);
362      }
363  
364      constexpr bool ends_with_insensitive_ascii(const std::wstring_view& str, const std::wstring_view& prefix) noexcept
365      {
366          return ends_with<>(str, prefix);
367      }
368  
369      // Give the arguments ("foo bar baz", " "), this method will
370      // * modify the first argument to "bar baz"
371      // * return "foo"
372      // If the needle cannot be found the "str" argument is returned as is.
373      template<typename T, typename Traits>
374      constexpr std::basic_string_view<T, Traits> prefix_split(std::basic_string_view<T, Traits>& str, const std::basic_string_view<T, Traits>& needle) noexcept
375      {
376          using view_type = std::basic_string_view<T, Traits>;
377  
378          const auto needleLen = needle.size();
379          const auto idx = needleLen == 0 ? str.size() : str.find(needle);
380          const auto prefixIdx = std::min(str.size(), idx);
381          const auto suffixIdx = std::min(str.size(), prefixIdx + needle.size());
382  
383          const view_type result{ str.data(), prefixIdx };
384  #pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead
385          str = { str.data() + suffixIdx, str.size() - suffixIdx };
386          return result;
387      }
388  
389      constexpr std::string_view prefix_split(std::string_view& str, const std::string_view& needle) noexcept
390      {
391          return prefix_split<>(str, needle);
392      }
393  
394      constexpr std::wstring_view prefix_split(std::wstring_view& str, const std::wstring_view& needle) noexcept
395      {
396          return prefix_split<>(str, needle);
397      }
398  
399      // Give the arguments ("foo bar baz", " "), this method will
400      // * modify the first argument to "bar baz"
401      // * return "foo"
402      // If the needle cannot be found the "str" argument is returned as is.
403      template<typename T, typename Traits>
404      constexpr std::basic_string_view<T, Traits> prefix_split(std::basic_string_view<T, Traits>& str, T ch) noexcept
405      {
406          using view_type = std::basic_string_view<T, Traits>;
407  
408          const auto idx = str.find(ch);
409          const auto prefixIdx = std::min(str.size(), idx);
410          const auto suffixIdx = std::min(str.size(), prefixIdx + 1);
411  
412          const view_type result{ str.data(), prefixIdx };
413  #pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead
414          str = { str.data() + suffixIdx, str.size() - suffixIdx };
415          return result;
416      }
417  
418      template<typename T, typename Traits>
419      constexpr std::basic_string_view<T, Traits> trim(const std::basic_string_view<T, Traits>& str, const T ch) noexcept
420      {
421          auto beg = str.data();
422          auto end = beg + str.size();
423  
424          for (; beg != end && *beg == ch; ++beg)
425          {
426          }
427  
428          for (; beg != end && end[-1] == ch; --end)
429          {
430          }
431  
432          return { beg, end };
433      }
434  
435      // Splits a font-family list into individual font-families. It loosely follows the CSS spec for font-family.
436      // It splits by comma, handles quotes and simple escape characters, and it cleans whitespace.
437      //
438      // This is not the right place to put this, because it's highly specialized towards font-family names.
439      // But this code is needed both, in our renderer and in our settings UI. At the time I couldn't find a better place for it.
440      void iterate_font_families(const std::wstring_view& families, auto&& callback)
441      {
442          std::wstring family;
443          bool escape = false;
444          bool delayedSpace = false;
445          wchar_t stringType = 0;
446  
447          for (const auto ch : families)
448          {
449              if (!escape)
450              {
451                  switch (ch)
452                  {
453                  case ' ':
454                      if (stringType)
455                      {
456                          // Spaces are treated literally inside strings.
457                          break;
458                      }
459                      delayedSpace = !family.empty();
460                      continue;
461                  case '"':
462                  case '\'':
463                      if (stringType && stringType != ch)
464                      {
465                          // Single quotes inside double quotes are treated literally and vice versa.
466                          break;
467                      }
468                      stringType = stringType == ch ? 0 : ch;
469                      continue;
470                  case ',':
471                      if (stringType)
472                      {
473                          // Commas are treated literally inside strings.
474                          break;
475                      }
476                      if (!family.empty())
477                      {
478                          callback(std::move(family));
479                          family.clear();
480                          delayedSpace = false;
481                      }
482                      continue;
483                  case '\\':
484                      escape = true;
485                      continue;
486                  default:
487                      break;
488                  }
489              }
490  
491              // The `delayedSpace` logic automatically takes care for us to
492              // strip leading and trailing spaces and deduplicate them too.
493              if (delayedSpace)
494              {
495                  delayedSpace = false;
496                  family.push_back(L' ');
497              }
498  
499              family.push_back(ch);
500              escape = false;
501          }
502  
503          // Just like the comma handler above.
504          if (!stringType && !family.empty())
505          {
506              callback(std::move(family));
507          }
508      }
509  
510      //// This function is appropriate for case-insensitive equivalence testing of file paths and other "system" strings.
511      //// Similar to memcmp, this returns <0, 0 or >0.
512      //inline int compare_ordinal_insensitive(const std::wstring_view& lhs, const std::wstring_view& rhs) noexcept
513      //{
514      //    const auto lhsLen = ::base::saturated_cast<int>(lhs.size());
515      //    const auto rhsLen = ::base::saturated_cast<int>(rhs.size());
516      //    // MSDN:
517      //    // > To maintain the C runtime convention of comparing strings,
518      //    // > the value 2 can be subtracted from a nonzero return value.
519      //    // > [...]
520      //    // > The function returns 0 if it does not succeed. [...] following error codes:
521      //    // > * ERROR_INVALID_PARAMETER. Any of the parameter values was invalid.
522      //    // -> We can just subtract 2.
523      //    return CompareStringOrdinal(lhs.data(), lhsLen, rhs.data(), rhsLen, TRUE) - 2;
524      //}
525  
526  //    // This function is appropriate for sorting strings primarily used for human consumption, like a list of file names.
527  //    // Similar to memcmp, this returns <0, 0 or >0.
528  //    inline int compare_linguistic_insensitive(const std::wstring_view& lhs, const std::wstring_view& rhs) noexcept
529  //    {
530  //        const auto lhsLen = ::base::saturated_cast<int>(lhs.size());
531  //        const auto rhsLen = ::base::saturated_cast<int>(rhs.size());
532  //        // MSDN:
533  //        // > To maintain the C runtime convention of comparing strings,
534  //        // > the value 2 can be subtracted from a nonzero return value.
535  //        // > [...]
536  //        // > The function returns 0 if it does not succeed. [...] following error codes:
537  //        // > * ERROR_INVALID_FLAGS. The values supplied for flags were invalid.
538  //        // > * ERROR_INVALID_PARAMETER. Any of the parameter values was invalid.
539  //        // -> We can just subtract 2.
540  //#pragma warning(suppress : 26477) // Use 'nullptr' rather than 0 or NULL (es.47).
541  //        return CompareStringEx(LOCALE_NAME_USER_DEFAULT, LINGUISTIC_IGNORECASE, lhs.data(), lhsLen, rhs.data(), rhsLen, nullptr, nullptr, 0) - 2;
542  //    }
543  //
544  //    // This function is appropriate for strings primarily used for human consumption, like a list of file names.
545  //    inline bool contains_linguistic_insensitive(const std::wstring_view& str, const std::wstring_view& needle) noexcept
546  //    {
547  //        const auto strLen = ::base::saturated_cast<int>(str.size());
548  //        const auto needleLen = ::base::saturated_cast<int>(needle.size());
549  //        // MSDN:
550  //        // > Returns a 0-based index into the source string indicated by lpStringSource if successful.
551  //        // > [...]
552  //        // > The function returns -1 if it does not succeed.
553  //        // > * ERROR_INVALID_FLAGS. The values supplied for flags were not valid.
554  //        // > * ERROR_INVALID_PARAMETER. Any of the parameter values was invalid.
555  //        // > * ERROR_SUCCESS. The action completed successfully but yielded no results.
556  //        // -> We can just check for -1.
557  //#pragma warning(suppress : 26477) // Use 'nullptr' rather than 0 or NULL (es.47).
558  //        return FindNLSStringEx(LOCALE_NAME_USER_DEFAULT, LINGUISTIC_IGNORECASE, str.data(), strLen, needle.data(), needleLen, nullptr, nullptr, nullptr, 0) != -1;
559  //    }
560  }