til_string.h
1 // Copyright (c) Microsoft Corporation. 2 // Licensed under the MIT license. 3 4 #pragma once 5 6 namespace til // Terminal Implementation Library. Also: "Today I Learned" 7 { 8 // The at function declares that you've already sufficiently checked that your array access 9 // is in range before retrieving an item inside it at an offset. 10 // This is to save double/triple/quadruple testing in circumstances where you are already 11 // pivoting on the length of a set and now want to pull elements out of it by offset 12 // without checking again. 13 // gsl::at will do the check again. As will .at(). And using [] will have a warning in audit. 14 // This template is explicitly disabled if T is of type std::span, as it would interfere with 15 // the overload below. 16 template<typename T, typename I> 17 constexpr auto at(T&& cont, const I i) noexcept -> decltype(auto) 18 { 19 #pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1). 20 #pragma warning(suppress : 26482) // Suppress bounds.2 check for indexing with constant expressions 21 #pragma warning(suppress : 26446) // Suppress bounds.4 check for subscript operator. 22 #pragma warning(suppress : 26445) // Suppress lifetime check for a reference to std::span or std::string_view 23 return cont[i]; 24 } 25 26 _TIL_INLINEPREFIX std::wstring visualize_control_codes(std::wstring str) noexcept 27 { 28 for (auto& ch : str) 29 { 30 if (ch < 0x20) 31 { 32 ch += 0x2400; 33 } 34 else if (ch == 0x20) 35 { 36 ch = 0x2423; // replace space with ␣ 37 } 38 else if (ch == 0x7f) 39 { 40 ch = 0x2421; // replace del with ␡ 41 } 42 } 43 return str; 44 } 45 // The same as the above, but it doesn't visualize BS nor SPC. 46 _TIL_INLINEPREFIX std::wstring visualize_nonspace_control_codes(std::wstring str) noexcept 47 { 48 for (auto& ch : str) 49 { 50 // NOT backspace! 
51 if (ch < 0x20 && ch != 0x08) 52 { 53 ch += 0x2400; 54 } 55 // NOT space 56 else if (ch == 0x7f) 57 { 58 ch = 0x2421; // replace del with ␡ 59 } 60 } 61 return str; 62 } 63 64 _TIL_INLINEPREFIX std::wstring visualize_control_codes(std::wstring_view str) 65 { 66 return visualize_control_codes(std::wstring{ str }); 67 } 68 69 namespace details 70 { 71 inline constexpr uint8_t __ = 0b00; 72 inline constexpr uint8_t F_ = 0b10; // stripped in clean_filename 73 inline constexpr uint8_t _P = 0b01; // stripped in clean_path 74 inline constexpr uint8_t FP = 0b11; // stripped in clean_filename and clean_path 75 inline constexpr std::array<uint8_t, 128> pathFilter{ { 76 // clang-format off 77 __ /* NUL */, __ /* SOH */, __ /* STX */, __ /* ETX */, __ /* EOT */, __ /* ENQ */, __ /* ACK */, __ /* BEL */, __ /* BS */, __ /* HT */, __ /* LF */, __ /* VT */, __ /* FF */, __ /* CR */, __ /* SO */, __ /* SI */, 78 __ /* DLE */, __ /* DC1 */, __ /* DC2 */, __ /* DC3 */, __ /* DC4 */, __ /* NAK */, __ /* SYN */, __ /* ETB */, __ /* CAN */, __ /* EM */, __ /* SUB */, __ /* ESC */, __ /* FS */, __ /* GS */, __ /* RS */, __ /* US */, 79 __ /* SP */, __ /* ! */, FP /* " */, __ /* # */, __ /* $ */, __ /* % */, __ /* & */, __ /* ' */, __ /* ( */, __ /* ) */, FP /* * */, __ /* + */, __ /* , */, __ /* - */, __ /* . */, F_ /* / */, 80 __ /* 0 */, __ /* 1 */, __ /* 2 */, __ /* 3 */, __ /* 4 */, __ /* 5 */, __ /* 6 */, __ /* 7 */, __ /* 8 */, __ /* 9 */, F_ /* : */, __ /* ; */, FP /* < */, __ /* = */, FP /* > */, FP /* ? 
*/, 81 __ /* @ */, __ /* A */, __ /* B */, __ /* C */, __ /* D */, __ /* E */, __ /* F */, __ /* G */, __ /* H */, __ /* I */, __ /* J */, __ /* K */, __ /* L */, __ /* M */, __ /* N */, __ /* O */, 82 __ /* P */, __ /* Q */, __ /* R */, __ /* S */, __ /* T */, __ /* U */, __ /* V */, __ /* W */, __ /* X */, __ /* Y */, __ /* Z */, __ /* [ */, F_ /* \ */, __ /* ] */, __ /* ^ */, __ /* _ */, 83 __ /* ` */, __ /* a */, __ /* b */, __ /* c */, __ /* d */, __ /* e */, __ /* f */, __ /* g */, __ /* h */, __ /* i */, __ /* j */, __ /* k */, __ /* l */, __ /* m */, __ /* n */, __ /* o */, 84 __ /* p */, __ /* q */, __ /* r */, __ /* s */, __ /* t */, __ /* u */, __ /* v */, __ /* w */, __ /* x */, __ /* y */, __ /* z */, __ /* { */, FP /* | */, __ /* } */, __ /* ~ */, __ /* DEL */, 85 // clang-format on 86 } }; 87 } 88 89 _TIL_INLINEPREFIX std::wstring clean_filename(std::wstring str) noexcept 90 { 91 using namespace til::details; 92 std::erase_if(str, [](auto ch) { 93 // This lookup is branchless: It always checks the filter, but throws 94 // away the result if ch >= 128. This is faster than using `&&` (branchy). 95 return ((til::at(details::pathFilter, ch & 127) & F_) != 0) & (ch < 128); 96 }); 97 return str; 98 } 99 100 _TIL_INLINEPREFIX std::wstring clean_path(std::wstring str) noexcept 101 { 102 using namespace til::details; 103 std::erase_if(str, [](auto ch) { 104 return ((til::at(details::pathFilter, ch & 127) & _P) != 0) & (ch < 128); 105 }); 106 return str; 107 } 108 109 // is_legal_path rules on whether a path contains any non-path characters. 110 // it **DOES NOT** rule on whether a path exists. 111 _TIL_INLINEPREFIX constexpr bool is_legal_path(const std::wstring_view str) noexcept 112 { 113 using namespace til::details; 114 return !std::any_of(std::begin(str), std::end(str), [](auto&& ch) { 115 return ((til::at(details::pathFilter, ch & 127) & _P) != 0) & (ch < 128); 116 }); 117 } 118 119 // std::string_view::starts_with support for C++17. 
template<typename T, typename Traits>
constexpr bool starts_with(const std::basic_string_view<T, Traits>& str, const std::basic_string_view<T, Traits>& prefix) noexcept
{
    // The size check comes first, so the byte-wise comparison never reads past the end of `str`.
    return str.size() >= prefix.size() && __builtin_memcmp(str.data(), prefix.data(), prefix.size() * sizeof(T)) == 0;
}

constexpr bool starts_with(const std::string_view& str, const std::string_view& prefix) noexcept
{
    return starts_with<>(str, prefix);
}

constexpr bool starts_with(const std::wstring_view& str, const std::wstring_view& prefix) noexcept
{
    return starts_with<>(str, prefix);
}

// std::string_view::ends_with support for C++17.
template<typename T, typename Traits>
constexpr bool ends_with(const std::basic_string_view<T, Traits>& str, const std::basic_string_view<T, Traits>& suffix) noexcept
{
#pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
    return str.size() >= suffix.size() && __builtin_memcmp(str.data() + (str.size() - suffix.size()), suffix.data(), suffix.size() * sizeof(T)) == 0;
}

constexpr bool ends_with(const std::string_view& str, const std::string_view& suffix) noexcept
{
    return ends_with<>(str, suffix);
}

constexpr bool ends_with(const std::wstring_view& str, const std::wstring_view& suffix) noexcept
{
    return ends_with<>(str, suffix);
}

// Sentinel values returned by to_ulong / to_int when the input cannot be parsed.
inline constexpr unsigned long to_ulong_error = ULONG_MAX;
inline constexpr int to_int_error = INT_MAX;

// Just like std::wcstoul, but without annoying locales and null-terminating strings.
// It has been fuzz-tested against clang's strtoul implementation.
//
// * str:  The digits to parse. The entire string must consist of valid digits;
//         whitespace, signs and trailing characters are rejected.
// * base: The numeric base. If 0, the base is derived from the prefix:
//         "0x"/"0X" -> 16, a leading "0" -> 8, anything else -> 10.
//         Only the digits 0-9, A-F and a-f are recognized.
// Returns the parsed value, or to_ulong_error if the string is empty, contains a character
// that is not a digit of the chosen base, or if the value reaches ULONG_MAX / 16.
template<typename T, typename Traits>
_TIL_INLINEPREFIX constexpr unsigned long to_ulong(const std::basic_string_view<T, Traits>& str, unsigned long base = 0) noexcept
{
    // Deliberately not `static`: static locals inside constexpr functions are a C++23 feature.
    // Staying below ULONG_MAX / 16 guarantees that `accumulator *= base` cannot overflow
    // for any base up to 16.
    constexpr unsigned long maximumValue = ULONG_MAX / 16;

    // We don't have to test ptr for null value, as we only access it under either condition:
    // * str.length() > 0, for determining the base
    // * ptr != end, when parsing the characters; if ptr is null, length will be 0 and thus end == ptr
#pragma warning(push)
#pragma warning(disable : 26429) // Symbol 'ptr' is never tested for null value, it can be marked as not_null
#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead
    auto ptr = str.data();
    const auto end = ptr + str.length();
    unsigned long accumulator = 0;

    if (!base)
    {
        base = 10;

        if (str.length() > 1 && *ptr == '0')
        {
            base = 8;
            ++ptr;

            if (str.length() > 2 && (*ptr == 'x' || *ptr == 'X'))
            {
                base = 16;
                ++ptr;
            }
        }
    }

    if (ptr == end)
    {
        return to_ulong_error;
    }

    for (;; accumulator *= base)
    {
        unsigned long value = ULONG_MAX;
        if (*ptr >= '0' && *ptr <= '9')
        {
            value = *ptr - '0';
        }
        else if (*ptr >= 'A' && *ptr <= 'F')
        {
            value = *ptr - 'A' + 10;
        }
        else if (*ptr >= 'a' && *ptr <= 'f')
        {
            value = *ptr - 'a' + 10;
        }
        else
        {
            return to_ulong_error;
        }

        // A digit must be smaller than the base. Without this check "1F" would
        // parse as 25 in base 10 and "09" as 9 in (auto-detected) base 8.
        if (value >= base)
        {
            return to_ulong_error;
        }

        accumulator += value;
        if (accumulator >= maximumValue)
        {
            return to_ulong_error;
        }

        if (++ptr == end)
        {
            return accumulator;
        }
    }
#pragma warning(pop)
}

constexpr unsigned long to_ulong(const std::string_view& str, unsigned long base = 0) noexcept
{
    return to_ulong<>(str, base);
}

constexpr unsigned long to_ulong(const std::wstring_view& str, unsigned long base = 0) noexcept
{
    return to_ulong<>(str, base);
}

// Implement to_int in terms of to_ulong by negating its result. to_ulong does not expect
// to be passed signed numbers and will return an error accordingly, which is why an optional
// minus sign is stripped here first. Only a minus sign in the very first position counts as
// a sign; one anywhere else makes the input invalid.
// Returns to_int_error if to_ulong fails or if the magnitude does not fit into an int.
// Note that to_int_error is INT_MAX, so a successfully parsed INT_MAX is indistinguishable
// from an error.
constexpr int to_int(const std::wstring_view& str, unsigned long base = 0) noexcept
{
    const bool hasSign = !str.empty() && str.front() == L'-';
    const auto result = to_ulong(hasSign ? str.substr(1) : str, base);

    // Check that result is valid and will fit in an int.
    if (result == to_ulong_error || (result > INT_MAX))
    {
        return to_int_error;
    }

    // Negate as a signed int. result <= INT_MAX was verified above, so the cast is lossless.
    const auto magnitude = static_cast<int>(result);
    return hasSign ? -magnitude : magnitude;
}

// Just like std::tolower, but without annoying locales.
// Only 'A'-'Z' are changed; every other value is returned as is.
template<typename T>
constexpr T tolower_ascii(T c) noexcept
{
    if ((c >= 'A') && (c <= 'Z'))
    {
        c |= 0x20;
    }

    return c;
}

// Just like std::toupper, but without annoying locales.
// Only 'a'-'z' are changed; every other value is returned as is.
template<typename T>
constexpr T toupper_ascii(T c) noexcept
{
    if ((c >= 'a') && (c <= 'z'))
    {
        c &= ~0x20;
    }

    return c;
}

// Just like std::wstring_view::operator==().
//
// At the time of writing wmemcmp() is not an intrinsic for MSVC,
// but the STL uses it to implement wide string comparisons.
// This produces 3x the assembly _per_ comparison and increases
// runtime by 2-3x for strings of medium length (16 characters)
// and 5x or more for long strings (128 characters or more).
// See: https://github.com/microsoft/STL/issues/2289
template<typename T, typename Traits>
bool equals(const std::basic_string_view<T, Traits>& lhs, const std::basic_string_view<T, Traits>& rhs) noexcept
{
    return lhs.size() == rhs.size() && __builtin_memcmp(lhs.data(), rhs.data(), lhs.size() * sizeof(T)) == 0;
}

// Just like _memicmp, but without annoying locales.
// Returns true if both strings have the same length and are equal once 'A'-'Z' are folded
// to lowercase. Characters outside of that range have to match exactly.
template<typename T, typename Traits>
constexpr bool equals_insensitive_ascii(const std::basic_string_view<T, Traits>& str1, const std::basic_string_view<T, Traits>& str2) noexcept
{
    if (str1.size() != str2.size())
    {
        return false;
    }

#pragma warning(push)
#pragma warning(disable : 26429) // Symbol 'data1' is never tested for null, it can be marked as not_null
#pragma warning(disable : 26481) // Don't use pointer arithmetic. Use span instead
    auto remaining = str1.size();
    auto data1 = str1.data();
    auto data2 = str2.data();
    for (; remaining; --remaining, ++data1, ++data2)
    {
        // The exact comparison comes first so that identical characters skip the case folding.
        if (*data1 != *data2 && tolower_ascii(*data1) != tolower_ascii(*data2))
        {
            return false;
        }
    }
#pragma warning(pop)

    return true;
}

constexpr bool equals_insensitive_ascii(const std::string_view& str1, const std::string_view& str2) noexcept
{
    return equals_insensitive_ascii<>(str1, str2);
}

constexpr bool equals_insensitive_ascii(const std::wstring_view& str1, const std::wstring_view& str2) noexcept
{
    return equals_insensitive_ascii<>(str1, str2);
}

// Returns true if `str` begins with `prefix`, ignoring the case of ASCII letters.
template<typename T, typename Traits>
constexpr bool starts_with_insensitive_ascii(const std::basic_string_view<T, Traits>& str, const std::basic_string_view<T, Traits>& prefix) noexcept
{
    return str.size() >= prefix.size() && equals_insensitive_ascii<>({ str.data(), prefix.size() }, prefix);
}

constexpr bool starts_with_insensitive_ascii(const std::string_view& str, const std::string_view& prefix) noexcept
{
    return starts_with_insensitive_ascii<>(str, prefix);
}

constexpr bool starts_with_insensitive_ascii(const std::wstring_view& str, const std::wstring_view& prefix) noexcept
{
    return starts_with_insensitive_ascii<>(str, prefix);
}

// Returns true if `str` ends with `suffix`, ignoring the case of ASCII letters.
template<typename T, typename Traits>
constexpr bool ends_with_insensitive_ascii(const std::basic_string_view<T, Traits>& str, const std::basic_string_view<T, Traits>& suffix) noexcept
{
    // The tail of `str` starts (size - suffix.size()) characters after data(). The size check
    // short-circuits first, so the subtraction cannot underflow.
#pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead (bounds.1).
    return str.size() >= suffix.size() && equals_insensitive_ascii<>({ str.data() + (str.size() - suffix.size()), suffix.size() }, suffix);
}

constexpr bool ends_with_insensitive_ascii(const std::string_view& str, const std::string_view& suffix) noexcept
{
    return ends_with_insensitive_ascii<>(str, suffix);
}

constexpr bool ends_with_insensitive_ascii(const std::wstring_view& str, const std::wstring_view& suffix) noexcept
{
    return ends_with_insensitive_ascii<>(str, suffix);
}

// Give the arguments ("foo bar baz", " "), this method will
// * modify the first argument to "bar baz"
// * return "foo"
// If the needle cannot be found (or is empty), the entire "str" argument
// is returned as is and "str" itself is left empty.
template<typename T, typename Traits>
constexpr std::basic_string_view<T, Traits> prefix_split(std::basic_string_view<T, Traits>& str, const std::basic_string_view<T, Traits>& needle) noexcept
{
    using view_type = std::basic_string_view<T, Traits>;

    const auto needleLen = needle.size();
    const auto idx = needleLen == 0 ? str.size() : str.find(needle);
    // find() returns npos on failure; clamping to size() turns that into "everything is prefix".
    const auto prefixIdx = std::min(str.size(), idx);
    const auto suffixIdx = std::min(str.size(), prefixIdx + needleLen);

    const view_type result{ str.data(), prefixIdx };
#pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead
    str = { str.data() + suffixIdx, str.size() - suffixIdx };
    return result;
}

constexpr std::string_view prefix_split(std::string_view& str, const std::string_view& needle) noexcept
{
    return prefix_split<>(str, needle);
}

constexpr std::wstring_view prefix_split(std::wstring_view& str, const std::wstring_view& needle) noexcept
{
    return prefix_split<>(str, needle);
}

// Give the arguments ("foo bar baz", ' '), this method will
// * modify the first argument to "bar baz"
// * return "foo"
// If the character cannot be found, the entire "str" argument
// is returned as is and "str" itself is left empty.
template<typename T, typename Traits>
constexpr std::basic_string_view<T, Traits> prefix_split(std::basic_string_view<T, Traits>& str, T ch) noexcept
{
    using view_type = std::basic_string_view<T, Traits>;

    const auto idx = str.find(ch);
    const auto prefixIdx = std::min(str.size(), idx);
    const auto suffixIdx = std::min(str.size(), prefixIdx + 1);

    const view_type result{ str.data(), prefixIdx };
#pragma warning(suppress : 26481) // Don't use pointer arithmetic. Use span instead
    str = { str.data() + suffixIdx, str.size() - suffixIdx };
    return result;
}

// Returns the part of `str` that remains after removing every leading
// and every trailing occurrence of `ch`. The result is a view into `str`.
template<typename T, typename Traits>
constexpr std::basic_string_view<T, Traits> trim(const std::basic_string_view<T, Traits>& str, const T ch) noexcept
{
    auto beg = str.data();
    auto end = beg + str.size();

    for (; beg != end && *beg == ch; ++beg)
    {
    }

    for (; beg != end && end[-1] == ch; --end)
    {
    }

    // Built from pointer + length: the (first, last) constructor only exists since C++20.
    return { beg, static_cast<size_t>(end - beg) };
}

// Splits a font-family list into individual font-families. It loosely follows the CSS spec for font-family.
// It splits by comma, handles quotes and simple escape characters, and it cleans whitespace.
//
// This is not the right place to put this, because it's highly specialized towards font-family names.
// But this code is needed both, in our renderer and in our settings UI. At the time I couldn't find a better place for it.
//
// * families: The comma separated list of font-family names.
// * callback: Invoked once per non-empty family name with an rvalue std::wstring.
// A name that is still inside an unterminated quote at the end of the input is dropped.
void iterate_font_families(const std::wstring_view& families, auto&& callback)
{
    std::wstring current;
    wchar_t openQuote = 0; // The quote character of the string we're inside of, or 0 if we're outside of one.
    bool escaped = false; // Set by a backslash: the next character is taken literally.
    bool pendingSpace = false;

    for (const auto ch : families)
    {
        // Each branch below either fully consumes the character (`continue`),
        // or falls out of the chain so that the character is appended literally.
        if (!escaped)
        {
            if (ch == ' ')
            {
                // Spaces are treated literally inside strings.
                if (!openQuote)
                {
                    pendingSpace = !current.empty();
                    continue;
                }
            }
            else if (ch == '"' || ch == '\'')
            {
                // Single quotes inside double quotes are treated literally and vice versa.
                if (!openQuote || openQuote == ch)
                {
                    openQuote = openQuote == ch ? 0 : ch;
                    continue;
                }
            }
            else if (ch == ',')
            {
                // Commas are treated literally inside strings.
                if (!openQuote)
                {
                    if (!current.empty())
                    {
                        callback(std::move(current));
                        current.clear();
                        pendingSpace = false;
                    }
                    continue;
                }
            }
            else if (ch == '\\')
            {
                escaped = true;
                continue;
            }
        }

        // Spaces are only written out once another character follows them. This strips
        // leading and trailing spaces for us and collapses runs of spaces into one.
        if (pendingSpace)
        {
            pendingSpace = false;
            current.push_back(L' ');
        }

        current.push_back(ch);
        escaped = false;
    }

    // Flush the last family, just like the comma handler above does.
    if (!openQuote && !current.empty())
    {
        callback(std::move(current));
    }
}

//// This function is appropriate for case-insensitive equivalence testing of file paths and other "system" strings.
//// Similar to memcmp, this returns <0, 0 or >0.
512 //inline int compare_ordinal_insensitive(const std::wstring_view& lhs, const std::wstring_view& rhs) noexcept 513 //{ 514 // const auto lhsLen = ::base::saturated_cast<int>(lhs.size()); 515 // const auto rhsLen = ::base::saturated_cast<int>(rhs.size()); 516 // // MSDN: 517 // // > To maintain the C runtime convention of comparing strings, 518 // // > the value 2 can be subtracted from a nonzero return value. 519 // // > [...] 520 // // > The function returns 0 if it does not succeed. [...] following error codes: 521 // // > * ERROR_INVALID_PARAMETER. Any of the parameter values was invalid. 522 // // -> We can just subtract 2. 523 // return CompareStringOrdinal(lhs.data(), lhsLen, rhs.data(), rhsLen, TRUE) - 2; 524 //} 525 526 // // This function is appropriate for sorting strings primarily used for human consumption, like a list of file names. 527 // // Similar to memcmp, this returns <0, 0 or >0. 528 // inline int compare_linguistic_insensitive(const std::wstring_view& lhs, const std::wstring_view& rhs) noexcept 529 // { 530 // const auto lhsLen = ::base::saturated_cast<int>(lhs.size()); 531 // const auto rhsLen = ::base::saturated_cast<int>(rhs.size()); 532 // // MSDN: 533 // // > To maintain the C runtime convention of comparing strings, 534 // // > the value 2 can be subtracted from a nonzero return value. 535 // // > [...] 536 // // > The function returns 0 if it does not succeed. [...] following error codes: 537 // // > * ERROR_INVALID_FLAGS. The values supplied for flags were invalid. 538 // // > * ERROR_INVALID_PARAMETER. Any of the parameter values was invalid. 539 // // -> We can just subtract 2. 540 //#pragma warning(suppress : 26477) // Use 'nullptr' rather than 0 or NULL (es.47). 
541 // return CompareStringEx(LOCALE_NAME_USER_DEFAULT, LINGUISTIC_IGNORECASE, lhs.data(), lhsLen, rhs.data(), rhsLen, nullptr, nullptr, 0) - 2; 542 // } 543 // 544 // // This function is appropriate for strings primarily used for human consumption, like a list of file names. 545 // inline bool contains_linguistic_insensitive(const std::wstring_view& str, const std::wstring_view& needle) noexcept 546 // { 547 // const auto strLen = ::base::saturated_cast<int>(str.size()); 548 // const auto needleLen = ::base::saturated_cast<int>(needle.size()); 549 // // MSDN: 550 // // > Returns a 0-based index into the source string indicated by lpStringSource if successful. 551 // // > [...] 552 // // > The function returns -1 if it does not succeed. 553 // // > * ERROR_INVALID_FLAGS. The values supplied for flags were not valid. 554 // // > * ERROR_INVALID_PARAMETER. Any of the parameter values was invalid. 555 // // > * ERROR_SUCCESS. The action completed successfully but yielded no results. 556 // // -> We can just check for -1. 557 //#pragma warning(suppress : 26477) // Use 'nullptr' rather than 0 or NULL (es.47). 558 // return FindNLSStringEx(LOCALE_NAME_USER_DEFAULT, LINGUISTIC_IGNORECASE, str.data(), strLen, needle.data(), needleLen, nullptr, nullptr, nullptr, 0) != -1; 559 // } 560 }