// strencodings.h
1 // Copyright (c) 2009-2010 Satoshi Nakamoto 2 // Copyright (c) 2009-present The Bitcoin Core developers 3 // Distributed under the MIT software license, see the accompanying 4 // file COPYING or http://www.opensource.org/licenses/mit-license.php. 5 6 /** 7 * Utilities for converting data from/to strings. 8 */ 9 #ifndef BITCOIN_UTIL_STRENCODINGS_H 10 #define BITCOIN_UTIL_STRENCODINGS_H 11 12 #include <crypto/hex_base.h> // IWYU pragma: export 13 #include <span.h> 14 #include <util/string.h> 15 16 #include <array> 17 #include <bit> 18 #include <charconv> 19 #include <cstddef> 20 #include <cstdint> 21 #include <limits> 22 #include <optional> 23 #include <string> // IWYU pragma: export 24 #include <string_view> // IWYU pragma: export 25 #include <system_error> 26 #include <type_traits> 27 #include <vector> 28 29 /** Used by SanitizeString() */ 30 enum SafeChars 31 { 32 SAFE_CHARS_DEFAULT, //!< The full set of allowed chars 33 SAFE_CHARS_UA_COMMENT, //!< BIP-0014 subset 34 SAFE_CHARS_FILENAME, //!< Chars allowed in filenames 35 SAFE_CHARS_URI, //!< Chars allowed in URIs (RFC 3986) 36 }; 37 38 /** 39 * Used by ParseByteUnits() 40 * Lowercase base 1000 41 * Uppercase base 1024 42 */ 43 enum class ByteUnit : uint64_t { 44 NOOP = 1ULL, 45 k = 1000ULL, 46 K = 1024ULL, 47 m = 1'000'000ULL, 48 M = 1ULL << 20, 49 g = 1'000'000'000ULL, 50 G = 1ULL << 30, 51 t = 1'000'000'000'000ULL, 52 T = 1ULL << 40, 53 }; 54 55 /** 56 * Remove unsafe chars. Safe chars chosen to allow simple messages/URLs/email 57 * addresses, but avoid anything even possibly remotely dangerous like & or > 58 * @param[in] str The string to sanitize 59 * @param[in] rule The set of safe chars to choose (default: least restrictive) 60 * @return A new string without unsafe chars 61 */ 62 std::string SanitizeString(std::string_view str, int rule = SAFE_CHARS_DEFAULT); 63 /** Parse the hex string into bytes (uint8_t or std::byte). Ignores whitespace. Returns nullopt on invalid input. 
*/ 64 template <typename Byte = std::byte> 65 std::optional<std::vector<Byte>> TryParseHex(std::string_view str); 66 /** Like TryParseHex, but returns an empty vector on invalid input. */ 67 template <typename Byte = uint8_t> 68 std::vector<Byte> ParseHex(std::string_view hex_str) 69 { 70 return TryParseHex<Byte>(hex_str).value_or(std::vector<Byte>{}); 71 } 72 /* Returns true if each character in str is a hex character, and has an even 73 * number of hex digits.*/ 74 bool IsHex(std::string_view str); 75 std::optional<std::vector<unsigned char>> DecodeBase64(std::string_view str); 76 std::string EncodeBase64(std::span<const unsigned char> input); 77 inline std::string EncodeBase64(std::span<const std::byte> input) { return EncodeBase64(MakeUCharSpan(input)); } 78 inline std::string EncodeBase64(std::string_view str) { return EncodeBase64(MakeUCharSpan(str)); } 79 std::optional<std::vector<unsigned char>> DecodeBase32(std::string_view str); 80 81 /** 82 * Base32 encode. 83 * If `pad` is true, then the output will be padded with '=' so that its length 84 * is a multiple of 8. 85 */ 86 std::string EncodeBase32(std::span<const unsigned char> input, bool pad = true); 87 88 /** 89 * Base32 encode. 90 * If `pad` is true, then the output will be padded with '=' so that its length 91 * is a multiple of 8. 92 */ 93 std::string EncodeBase32(std::string_view str, bool pad = true); 94 95 /** 96 * Splits socket address string into host string and port value. 97 * Validates port value. 98 * 99 * @param[in] in The socket address string to split. 100 * @param[out] portOut Port-portion of the input, if found and parsable. 101 * @param[out] hostOut Host-portion of the input, if found. 102 * @return true if port-portion is absent or within its allowed range, otherwise false 103 */ 104 bool SplitHostPort(std::string_view in, uint16_t& portOut, std::string& hostOut); 105 106 // LocaleIndependentAtoi is provided for backwards compatibility reasons. 
107 // 108 // New code should use ToIntegral or the ParseInt* functions 109 // which provide parse error feedback. 110 // 111 // The goal of LocaleIndependentAtoi is to replicate the defined behaviour of 112 // std::atoi as it behaves under the "C" locale, and remove some undefined 113 // behavior. If the parsed value is bigger than the integer type's maximum 114 // value, or smaller than the integer type's minimum value, std::atoi has 115 // undefined behavior, while this function returns the maximum or minimum 116 // values, respectively. 117 template <typename T> 118 T LocaleIndependentAtoi(std::string_view str) 119 { 120 static_assert(std::is_integral_v<T>); 121 T result; 122 // Emulate atoi(...) handling of white space and leading +/-. 123 std::string_view s = util::TrimStringView(str); 124 if (!s.empty() && s[0] == '+') { 125 if (s.length() >= 2 && s[1] == '-') { 126 return 0; 127 } 128 s = s.substr(1); 129 } 130 auto [_, error_condition] = std::from_chars(s.data(), s.data() + s.size(), result); 131 if (error_condition == std::errc::result_out_of_range) { 132 if (s.length() >= 1 && s[0] == '-') { 133 // Saturate underflow, per strtoll's behavior. 134 return std::numeric_limits<T>::min(); 135 } else { 136 // Saturate overflow, per strtoll's behavior. 137 return std::numeric_limits<T>::max(); 138 } 139 } else if (error_condition != std::errc{}) { 140 return 0; 141 } 142 return result; 143 } 144 145 /** 146 * Tests if the given character is a decimal digit. 147 * @param[in] c character to test 148 * @return true if the argument is a decimal digit; otherwise false. 149 */ 150 constexpr bool IsDigit(char c) 151 { 152 return c >= '0' && c <= '9'; 153 } 154 155 /** 156 * Tests if the given character is a whitespace character. The whitespace characters 157 * are: space, form-feed ('\f'), newline ('\n'), carriage return ('\r'), horizontal 158 * tab ('\t'), and vertical tab ('\v'). 159 * 160 * This function is locale independent. 
Under the C locale this function gives the 161 * same result as std::isspace. 162 * 163 * @param[in] c character to test 164 * @return true if the argument is a whitespace character; otherwise false 165 */ 166 constexpr inline bool IsSpace(char c) noexcept { 167 return c == ' ' || c == '\f' || c == '\n' || c == '\r' || c == '\t' || c == '\v'; 168 } 169 170 /** 171 * Convert string to integral type T. Leading whitespace, a leading +, or any 172 * trailing character fail the parsing. The required format expressed as regex 173 * is `-?[0-9]+`. The minus sign is only permitted for signed integer types. 174 * 175 * @returns std::nullopt if the entire string could not be parsed, or if the 176 * parsed value is not in the range representable by the type T. 177 */ 178 template <typename T> 179 std::optional<T> ToIntegral(std::string_view str) 180 { 181 static_assert(std::is_integral_v<T>); 182 T result; 183 const auto [first_nonmatching, error_condition] = std::from_chars(str.data(), str.data() + str.size(), result); 184 if (first_nonmatching != str.data() + str.size() || error_condition != std::errc{}) { 185 return std::nullopt; 186 } 187 return result; 188 } 189 190 /** 191 * Convert string to signed 32-bit integer with strict parse error feedback. 192 * @returns true if the entire string could be parsed as valid integer, 193 * false if not the entire string could be parsed or when overflow or underflow occurred. 194 */ 195 [[nodiscard]] bool ParseInt32(std::string_view str, int32_t *out); 196 197 /** 198 * Convert string to signed 64-bit integer with strict parse error feedback. 199 * @returns true if the entire string could be parsed as valid integer, 200 * false if not the entire string could be parsed or when overflow or underflow occurred. 201 */ 202 [[nodiscard]] bool ParseInt64(std::string_view str, int64_t *out); 203 204 /** 205 * Convert decimal string to unsigned 8-bit integer with strict parse error feedback. 
206 * @returns true if the entire string could be parsed as valid integer, 207 * false if not the entire string could be parsed or when overflow or underflow occurred. 208 */ 209 [[nodiscard]] bool ParseUInt8(std::string_view str, uint8_t *out); 210 211 /** 212 * Convert decimal string to unsigned 16-bit integer with strict parse error feedback. 213 * @returns true if the entire string could be parsed as valid integer, 214 * false if the entire string could not be parsed or if overflow or underflow occurred. 215 */ 216 [[nodiscard]] bool ParseUInt16(std::string_view str, uint16_t* out); 217 218 /** 219 * Convert decimal string to unsigned 32-bit integer with strict parse error feedback. 220 * @returns true if the entire string could be parsed as valid integer, 221 * false if not the entire string could be parsed or when overflow or underflow occurred. 222 */ 223 [[nodiscard]] bool ParseUInt32(std::string_view str, uint32_t *out); 224 225 /** 226 * Convert decimal string to unsigned 64-bit integer with strict parse error feedback. 227 * @returns true if the entire string could be parsed as valid integer, 228 * false if not the entire string could be parsed or when overflow or underflow occurred. 229 */ 230 [[nodiscard]] bool ParseUInt64(std::string_view str, uint64_t *out); 231 232 /** 233 * Format a paragraph of text to a fixed width, adding spaces for 234 * indentation to any added line. 235 */ 236 std::string FormatParagraph(std::string_view in, size_t width = 79, size_t indent = 0); 237 238 /** 239 * Timing-attack-resistant comparison. 240 * Takes time proportional to length 241 * of first argument. 
242 */ 243 template <typename T> 244 bool TimingResistantEqual(const T& a, const T& b) 245 { 246 if (b.size() == 0) return a.size() == 0; 247 size_t accumulator = a.size() ^ b.size(); 248 for (size_t i = 0; i < a.size(); i++) 249 accumulator |= size_t(a[i] ^ b[i%b.size()]); 250 return accumulator == 0; 251 } 252 253 /** Parse number as fixed point according to JSON number syntax. 254 * @returns true on success, false on error. 255 * @note The result must be in the range (-10^18,10^18), otherwise an overflow error will trigger. 256 */ 257 [[nodiscard]] bool ParseFixedPoint(std::string_view, int decimals, int64_t *amount_out); 258 259 namespace { 260 /** Helper class for the default infn argument to ConvertBits (just returns the input). */ 261 struct IntIdentity 262 { 263 [[maybe_unused]] int operator()(int x) const { return x; } 264 }; 265 266 } // namespace 267 268 /** Convert from one power-of-2 number base to another. */ 269 template<int frombits, int tobits, bool pad, typename O, typename It, typename I = IntIdentity> 270 bool ConvertBits(O outfn, It it, It end, I infn = {}) { 271 size_t acc = 0; 272 size_t bits = 0; 273 constexpr size_t maxv = (1 << tobits) - 1; 274 constexpr size_t max_acc = (1 << (frombits + tobits - 1)) - 1; 275 while (it != end) { 276 int v = infn(*it); 277 if (v < 0) return false; 278 acc = ((acc << frombits) | v) & max_acc; 279 bits += frombits; 280 while (bits >= tobits) { 281 bits -= tobits; 282 outfn((acc >> bits) & maxv); 283 } 284 ++it; 285 } 286 if (pad) { 287 if (bits) outfn((acc << (tobits - bits)) & maxv); 288 } else if (bits >= frombits || ((acc << (tobits - bits)) & maxv)) { 289 return false; 290 } 291 return true; 292 } 293 294 /** 295 * Converts the given character to its lowercase equivalent. 296 * This function is locale independent. It only converts uppercase 297 * characters in the standard 7-bit ASCII range. 298 * This is a feature, not a limitation. 299 * 300 * @param[in] c the character to convert to lowercase. 
301 * @return the lowercase equivalent of c; or the argument 302 * if no conversion is possible. 303 */ 304 constexpr char ToLower(char c) 305 { 306 return (c >= 'A' && c <= 'Z' ? (c - 'A') + 'a' : c); 307 } 308 309 /** 310 * Returns the lowercase equivalent of the given string. 311 * This function is locale independent. It only converts uppercase 312 * characters in the standard 7-bit ASCII range. 313 * This is a feature, not a limitation. 314 * 315 * @param[in] str the string to convert to lowercase. 316 * @returns lowercased equivalent of str 317 */ 318 std::string ToLower(std::string_view str); 319 320 /** 321 * Converts the given character to its uppercase equivalent. 322 * This function is locale independent. It only converts lowercase 323 * characters in the standard 7-bit ASCII range. 324 * This is a feature, not a limitation. 325 * 326 * @param[in] c the character to convert to uppercase. 327 * @return the uppercase equivalent of c; or the argument 328 * if no conversion is possible. 329 */ 330 constexpr char ToUpper(char c) 331 { 332 return (c >= 'a' && c <= 'z' ? (c - 'a') + 'A' : c); 333 } 334 335 /** 336 * Returns the uppercase equivalent of the given string. 337 * This function is locale independent. It only converts lowercase 338 * characters in the standard 7-bit ASCII range. 339 * This is a feature, not a limitation. 340 * 341 * @param[in] str the string to convert to uppercase. 342 * @returns UPPERCASED EQUIVALENT OF str 343 */ 344 std::string ToUpper(std::string_view str); 345 346 /** 347 * Capitalizes the first character of the given string. 348 * This function is locale independent. It only converts lowercase 349 * characters in the standard 7-bit ASCII range. 350 * This is a feature, not a limitation. 351 * 352 * @param[in] str the string to capitalize. 353 * @returns string with the first letter capitalized. 354 */ 355 std::string Capitalize(std::string str); 356 357 /** 358 * Parse a string with suffix unit [k|K|m|M|g|G|t|T]. 
359 * Must be a whole integer, fractions not allowed (0.5t), no whitespace or +- 360 * Lowercase units are 1000 base. Uppercase units are 1024 base. 361 * Examples: 2m,27M,19g,41T 362 * 363 * @param[in] str the string to convert into bytes 364 * @param[in] default_multiplier if no unit is found in str use this unit 365 * @returns optional uint64_t bytes from str or nullopt 366 * if ToIntegral is false, str is empty, trailing whitespace or overflow 367 */ 368 std::optional<uint64_t> ParseByteUnits(std::string_view str, ByteUnit default_multiplier); 369 370 namespace util { 371 /** consteval version of HexDigit() without the lookup table. */ 372 consteval uint8_t ConstevalHexDigit(const char c) 373 { 374 if (c >= '0' && c <= '9') return c - '0'; 375 if (c >= 'a' && c <= 'f') return c - 'a' + 0xa; 376 377 throw "Only lowercase hex digits are allowed, for consistency"; 378 } 379 380 namespace detail { 381 template <size_t N> 382 struct Hex { 383 std::array<std::byte, N / 2> bytes{}; 384 consteval Hex(const char (&hex_str)[N]) 385 // 2 hex digits required per byte + implicit null terminator 386 requires(N % 2 == 1) 387 { 388 if (hex_str[N - 1]) throw "null terminator required"; 389 for (std::size_t i = 0; i < bytes.size(); ++i) { 390 bytes[i] = static_cast<std::byte>( 391 (ConstevalHexDigit(hex_str[2 * i]) << 4) | 392 ConstevalHexDigit(hex_str[2 * i + 1])); 393 } 394 } 395 }; 396 } // namespace detail 397 398 /** 399 * ""_hex is a compile-time user-defined literal returning a 400 * `std::array<std::byte>`, equivalent to ParseHex(). Variants provided: 401 * 402 * - ""_hex_v: Returns `std::vector<std::byte>`, useful for heap allocation or 403 * variable-length serialization. 404 * 405 * - ""_hex_u8: Returns `std::array<uint8_t>`, for cases where `std::byte` is 406 * incompatible. 407 * 408 * - ""_hex_v_u8: Returns `std::vector<uint8_t>`, combining heap allocation with 409 * `uint8_t`. 
410 * 411 * @warning It could be necessary to use vector instead of array variants when 412 * serializing, or vice versa, because vectors are assumed to be variable- 413 * length and serialized with a size prefix, while arrays are considered fixed 414 * length and serialized with no prefix. 415 * 416 * @warning It may be preferable to use vector variants to save stack space when 417 * declaring local variables if hex strings are large. Alternatively variables 418 * could be declared constexpr to avoid using stack space. 419 * 420 * @warning Avoid `uint8_t` variants when not necessary, as the codebase 421 * migrates to use `std::byte` instead of `unsigned char` and `uint8_t`. 422 * 423 * @note One reason ""_hex uses `std::array` instead of `std::vector` like 424 * ParseHex() does is because heap-based containers cannot cross the compile- 425 * time/runtime barrier. 426 */ 427 inline namespace hex_literals { 428 429 template <util::detail::Hex str> 430 constexpr auto operator""_hex() { return str.bytes; } 431 432 template <util::detail::Hex str> 433 constexpr auto operator""_hex_u8() { return std::bit_cast<std::array<uint8_t, str.bytes.size()>>(str.bytes); } 434 435 template <util::detail::Hex str> 436 constexpr auto operator""_hex_v() { return std::vector<std::byte>{str.bytes.begin(), str.bytes.end()}; } 437 438 template <util::detail::Hex str> 439 inline auto operator""_hex_v_u8() { return std::vector<uint8_t>{UCharCast(str.bytes.data()), UCharCast(str.bytes.data() + str.bytes.size())}; } 440 441 } // inline namespace hex_literals 442 } // namespace util 443 444 #endif // BITCOIN_UTIL_STRENCODINGS_H