/ src / util / strencodings.h
strencodings.h
  1  // Copyright (c) 2009-2010 Satoshi Nakamoto
  2  // Copyright (c) 2009-present The Bitcoin Core developers
  3  // Distributed under the MIT software license, see the accompanying
  4  // file COPYING or http://www.opensource.org/licenses/mit-license.php.
  5  
  6  /**
  7   * Utilities for converting data from/to strings.
  8   */
  9  #ifndef BITCOIN_UTIL_STRENCODINGS_H
 10  #define BITCOIN_UTIL_STRENCODINGS_H
 11  
 12  #include <crypto/hex_base.h> // IWYU pragma: export
 13  #include <span.h>
 14  #include <util/string.h>
 15  
 16  #include <array>
 17  #include <bit>
 18  #include <charconv>
 19  #include <cstddef>
 20  #include <cstdint>
 21  #include <limits>
 22  #include <optional>
 23  #include <string>      // IWYU pragma: export
 24  #include <string_view> // IWYU pragma: export
 25  #include <system_error>
 26  #include <type_traits>
 27  #include <vector>
 28  
 29  /** Used by SanitizeString() */
 30  enum SafeChars
 31  {
 32      SAFE_CHARS_DEFAULT, //!< The full set of allowed chars
 33      SAFE_CHARS_UA_COMMENT, //!< BIP-0014 subset
 34      SAFE_CHARS_FILENAME, //!< Chars allowed in filenames
 35      SAFE_CHARS_URI, //!< Chars allowed in URIs (RFC 3986)
 36  };
 37  
 38  /**
 39   * Used by ParseByteUnits()
 40   * Lowercase base 1000
 41   * Uppercase base 1024
 42  */
 43  enum class ByteUnit : uint64_t {
 44      NOOP = 1ULL,
 45      k = 1000ULL,
 46      K = 1024ULL,
 47      m = 1'000'000ULL,
 48      M = 1ULL << 20,
 49      g = 1'000'000'000ULL,
 50      G = 1ULL << 30,
 51      t = 1'000'000'000'000ULL,
 52      T = 1ULL << 40,
 53  };
 54  
 55  /**
 56  * Remove unsafe chars. Safe chars chosen to allow simple messages/URLs/email
 57  * addresses, but avoid anything even possibly remotely dangerous like & or >
 58  * @param[in] str    The string to sanitize
 59  * @param[in] rule   The set of safe chars to choose (default: least restrictive)
 60  * @return           A new string without unsafe chars
 61  */
 62  std::string SanitizeString(std::string_view str, int rule = SAFE_CHARS_DEFAULT);
 63  /** Parse the hex string into bytes (uint8_t or std::byte). Ignores whitespace. Returns nullopt on invalid input. */
 64  template <typename Byte = std::byte>
 65  std::optional<std::vector<Byte>> TryParseHex(std::string_view str);
 66  /** Like TryParseHex, but returns an empty vector on invalid input. */
 67  template <typename Byte = uint8_t>
 68  std::vector<Byte> ParseHex(std::string_view hex_str)
 69  {
 70      return TryParseHex<Byte>(hex_str).value_or(std::vector<Byte>{});
 71  }
/* Returns true if every character in str is a hex character and the number
 * of hex digits is even. */
 74  bool IsHex(std::string_view str);
 75  std::optional<std::vector<unsigned char>> DecodeBase64(std::string_view str);
 76  std::string EncodeBase64(std::span<const unsigned char> input);
 77  inline std::string EncodeBase64(std::span<const std::byte> input) { return EncodeBase64(MakeUCharSpan(input)); }
 78  inline std::string EncodeBase64(std::string_view str) { return EncodeBase64(MakeUCharSpan(str)); }
 79  std::optional<std::vector<unsigned char>> DecodeBase32(std::string_view str);
 80  
 81  /**
 82   * Base32 encode.
 83   * If `pad` is true, then the output will be padded with '=' so that its length
 84   * is a multiple of 8.
 85   */
 86  std::string EncodeBase32(std::span<const unsigned char> input, bool pad = true);
 87  
 88  /**
 89   * Base32 encode.
 90   * If `pad` is true, then the output will be padded with '=' so that its length
 91   * is a multiple of 8.
 92   */
 93  std::string EncodeBase32(std::string_view str, bool pad = true);
 94  
 95  /**
 96   * Splits socket address string into host string and port value.
 97   * Validates port value.
 98   *
 99   * @param[in] in        The socket address string to split.
100   * @param[out] portOut  Port-portion of the input, if found and parsable.
101   * @param[out] hostOut  Host-portion of the input, if found.
102   * @return              true if port-portion is absent or within its allowed range, otherwise false
103   */
104  bool SplitHostPort(std::string_view in, uint16_t& portOut, std::string& hostOut);
105  
106  // LocaleIndependentAtoi is provided for backwards compatibility reasons.
107  //
108  // New code should use ToIntegral or the ParseInt* functions
109  // which provide parse error feedback.
110  //
111  // The goal of LocaleIndependentAtoi is to replicate the defined behaviour of
112  // std::atoi as it behaves under the "C" locale, and remove some undefined
113  // behavior. If the parsed value is bigger than the integer type's maximum
114  // value, or smaller than the integer type's minimum value, std::atoi has
115  // undefined behavior, while this function returns the maximum or minimum
116  // values, respectively.
117  template <typename T>
118  T LocaleIndependentAtoi(std::string_view str)
119  {
120      static_assert(std::is_integral_v<T>);
121      T result;
122      // Emulate atoi(...) handling of white space and leading +/-.
123      std::string_view s = util::TrimStringView(str);
124      if (!s.empty() && s[0] == '+') {
125          if (s.length() >= 2 && s[1] == '-') {
126              return 0;
127          }
128          s = s.substr(1);
129      }
130      auto [_, error_condition] = std::from_chars(s.data(), s.data() + s.size(), result);
131      if (error_condition == std::errc::result_out_of_range) {
132          if (s.length() >= 1 && s[0] == '-') {
133              // Saturate underflow, per strtoll's behavior.
134              return std::numeric_limits<T>::min();
135          } else {
136              // Saturate overflow, per strtoll's behavior.
137              return std::numeric_limits<T>::max();
138          }
139      } else if (error_condition != std::errc{}) {
140          return 0;
141      }
142      return result;
143  }
144  
/**
 * Checks whether a character is one of the ASCII decimal digits '0'..'9'.
 * @param[in] c     character to test
 * @return          true if c is a decimal digit; otherwise false.
 */
constexpr bool IsDigit(char c)
{
    return !(c < '0' || c > '9');
}
154  
/**
 * Checks whether a character is whitespace. Exactly six characters qualify:
 * space (' '), form-feed ('\f'), newline ('\n'), carriage return ('\r'),
 * horizontal tab ('\t') and vertical tab ('\v').
 *
 * The check is locale independent. Under the C locale it gives the same
 * result as std::isspace.
 *
 * @param[in] c     character to test
 * @return          true if c is a whitespace character; otherwise false
 */
constexpr inline bool IsSpace(char c) noexcept {
    switch (c) {
    case ' ':
    case '\f':
    case '\n':
    case '\r':
    case '\t':
    case '\v':
        return true;
    default:
        return false;
    }
}
169  
/**
 * Strictly parse a decimal string into the integral type T. The whole input
 * must match the regex `-?[0-9]+`; leading whitespace, a leading +, or any
 * trailing character makes the parse fail. The minus sign is only permitted
 * for signed integer types.
 *
 * @returns the parsed value, or std::nullopt if the entire string could not
 *   be parsed or the value is not in the range representable by T.
 */
template <typename T>
std::optional<T> ToIntegral(std::string_view str)
{
    static_assert(std::is_integral_v<T>);
    const char* const first = str.data();
    const char* const last = first + str.size();
    T value;
    const std::from_chars_result parsed = std::from_chars(first, last, value);
    // Succeed only when there was no error and every character was consumed.
    if (parsed.ec == std::errc{} && parsed.ptr == last) {
        return value;
    }
    return std::nullopt;
}
189  
190  /**
191   * Convert string to signed 32-bit integer with strict parse error feedback.
192   * @returns true if the entire string could be parsed as valid integer,
 *   false if the entire string could not be parsed or if overflow or underflow occurred.
194   */
195  [[nodiscard]] bool ParseInt32(std::string_view str, int32_t *out);
196  
197  /**
198   * Convert string to signed 64-bit integer with strict parse error feedback.
199   * @returns true if the entire string could be parsed as valid integer,
 *   false if the entire string could not be parsed or if overflow or underflow occurred.
201   */
202  [[nodiscard]] bool ParseInt64(std::string_view str, int64_t *out);
203  
204  /**
205   * Convert decimal string to unsigned 8-bit integer with strict parse error feedback.
206   * @returns true if the entire string could be parsed as valid integer,
 *   false if the entire string could not be parsed or if overflow or underflow occurred.
208   */
209  [[nodiscard]] bool ParseUInt8(std::string_view str, uint8_t *out);
210  
211  /**
212   * Convert decimal string to unsigned 16-bit integer with strict parse error feedback.
213   * @returns true if the entire string could be parsed as valid integer,
214   *   false if the entire string could not be parsed or if overflow or underflow occurred.
215   */
216  [[nodiscard]] bool ParseUInt16(std::string_view str, uint16_t* out);
217  
218  /**
219   * Convert decimal string to unsigned 32-bit integer with strict parse error feedback.
220   * @returns true if the entire string could be parsed as valid integer,
 *   false if the entire string could not be parsed or if overflow or underflow occurred.
222   */
223  [[nodiscard]] bool ParseUInt32(std::string_view str, uint32_t *out);
224  
225  /**
226   * Convert decimal string to unsigned 64-bit integer with strict parse error feedback.
227   * @returns true if the entire string could be parsed as valid integer,
 *   false if the entire string could not be parsed or if overflow or underflow occurred.
229   */
230  [[nodiscard]] bool ParseUInt64(std::string_view str, uint64_t *out);
231  
232  /**
233   * Format a paragraph of text to a fixed width, adding spaces for
234   * indentation to any added line.
235   */
236  std::string FormatParagraph(std::string_view in, size_t width = 79, size_t indent = 0);
237  
/**
 * Equality comparison designed to resist timing attacks.
 * Every element of the first argument is always visited (no early exit on
 * the first mismatch), so the running time is proportional to the length
 * of the first argument.
 *
 * @return true if a and b have the same size and the same elements.
 */
template <typename T>
bool TimingResistantEqual(const T& a, const T& b)
{
    const size_t len_a = a.size();
    const size_t len_b = b.size();
    // An empty b can only equal an empty a; this also avoids a modulo by zero below.
    if (len_b == 0) return len_a == 0;
    // Any set bit in `diff` means the inputs differ (in length or content).
    size_t diff = len_a ^ len_b;
    for (size_t pos = 0; pos != len_a; ++pos) {
        // Wrap the index into b so the access is valid even when b is shorter.
        diff |= size_t(a[pos] ^ b[pos % len_b]);
    }
    return diff == 0;
}
252  
253  /** Parse number as fixed point according to JSON number syntax.
254   * @returns true on success, false on error.
255   * @note The result must be in the range (-10^18,10^18), otherwise an overflow error will trigger.
256   */
257  [[nodiscard]] bool ParseFixedPoint(std::string_view, int decimals, int64_t *amount_out);
258  
namespace {
/** Helper class for the default infn argument to ConvertBits (just returns the input). */
struct IntIdentity
{
    [[maybe_unused]] int operator()(int x) const { return x; }
};

} // namespace

/**
 * Convert a sequence of `frombits`-wide groups into `tobits`-wide groups
 * (i.e. convert from one power-of-2 number base to another).
 *
 * @tparam frombits  Number of bits carried by each input element.
 * @tparam tobits    Number of bits carried by each output element.
 * @tparam pad       If true, leftover bits are zero-padded on the right and
 *                   emitted as a final group. If false, leftover bits must be
 *                   fewer than `frombits` and all zero, otherwise the
 *                   conversion fails.
 * @param[in] outfn  Called once per produced output group (value passed as size_t).
 * @param[in] it     Start of the input range.
 * @param[in] end    End of the input range.
 * @param[in] infn   Maps each input element to an int; a negative result
 *                   marks the element as invalid. Defaults to the identity.
 * @return           false if infn returned a negative value or, when `pad` is
 *                   false, the trailing bits are not valid padding; true
 *                   otherwise. Groups already passed to outfn before a
 *                   failure are not retracted.
 */
template<int frombits, int tobits, bool pad, typename O, typename It, typename I = IntIdentity>
bool ConvertBits(O outfn, It it, It end, I infn = {}) {
    static_assert(frombits > 0 && tobits > 0, "bit widths must be positive");
    static_assert(frombits + tobits - 1 < std::numeric_limits<size_t>::digits,
                  "accumulator is too narrow for the requested bit widths");
    constexpr size_t in_width = frombits;
    constexpr size_t out_width = tobits;
    // Shift in size_t rather than int so wide groups do not overflow.
    constexpr size_t maxv = (size_t{1} << out_width) - 1;
    constexpr size_t max_acc = (size_t{1} << (in_width + out_width - 1)) - 1;
    size_t acc = 0;
    size_t bits = 0; // number of not-yet-emitted bits held in acc
    for (; it != end; ++it) {
        const int v = infn(*it);
        if (v < 0) return false;
        // Append the new group; the mask drops bits that were already emitted.
        acc = ((acc << in_width) | static_cast<size_t>(v)) & max_acc;
        bits += in_width;
        while (bits >= out_width) {
            bits -= out_width;
            outfn((acc >> bits) & maxv);
        }
    }
    if constexpr (pad) {
        if (bits) outfn((acc << (out_width - bits)) & maxv);
    } else {
        // Without padding, leftovers must be short and consist solely of zero bits.
        if (bits >= in_width || ((acc << (out_width - bits)) & maxv)) return false;
    }
    return true;
}
293  
/**
 * Maps an uppercase ASCII letter to its lowercase counterpart.
 * The conversion is locale independent and deliberately restricted to the
 * standard 7-bit ASCII range; this is a feature, not a limitation.
 *
 * @param[in] c     the character to convert to lowercase.
 * @return          the lowercase equivalent of c; or c itself
 *                  if no conversion is possible.
 */
constexpr char ToLower(char c)
{
    if (c < 'A' || c > 'Z') return c;
    return static_cast<char>(c + ('a' - 'A'));
}
308  
309  /**
310   * Returns the lowercase equivalent of the given string.
311   * This function is locale independent. It only converts uppercase
312   * characters in the standard 7-bit ASCII range.
313   * This is a feature, not a limitation.
314   *
315   * @param[in] str   the string to convert to lowercase.
316   * @returns         lowercased equivalent of str
317   */
318  std::string ToLower(std::string_view str);
319  
/**
 * Maps a lowercase ASCII letter to its uppercase counterpart.
 * The conversion is locale independent and deliberately restricted to the
 * standard 7-bit ASCII range; this is a feature, not a limitation.
 *
 * @param[in] c     the character to convert to uppercase.
 * @return          the uppercase equivalent of c; or c itself
 *                  if no conversion is possible.
 */
constexpr char ToUpper(char c)
{
    if (c < 'a' || c > 'z') return c;
    return static_cast<char>(c - ('a' - 'A'));
}
334  
335  /**
336   * Returns the uppercase equivalent of the given string.
337   * This function is locale independent. It only converts lowercase
338   * characters in the standard 7-bit ASCII range.
339   * This is a feature, not a limitation.
340   *
341   * @param[in] str   the string to convert to uppercase.
342   * @returns         UPPERCASED EQUIVALENT OF str
343   */
344  std::string ToUpper(std::string_view str);
345  
346  /**
347   * Capitalizes the first character of the given string.
348   * This function is locale independent. It only converts lowercase
349   * characters in the standard 7-bit ASCII range.
350   * This is a feature, not a limitation.
351   *
352   * @param[in] str   the string to capitalize.
353   * @returns         string with the first letter capitalized.
354   */
355  std::string Capitalize(std::string str);
356  
357  /**
358   * Parse a string with suffix unit [k|K|m|M|g|G|t|T].
359   * Must be a whole integer, fractions not allowed (0.5t), no whitespace or +-
360   * Lowercase units are 1000 base. Uppercase units are 1024 base.
361   * Examples: 2m,27M,19g,41T
362   *
363   * @param[in] str                  the string to convert into bytes
364   * @param[in] default_multiplier   if no unit is found in str use this unit
365   * @returns                        optional uint64_t bytes from str or nullopt
 *                                 if str is empty, has trailing whitespace, cannot be parsed by ToIntegral, or the result overflows
367   */
368  std::optional<uint64_t> ParseByteUnits(std::string_view str, ByteUnit default_multiplier);
369  
370  namespace util {
371  /** consteval version of HexDigit() without the lookup table. */
372  consteval uint8_t ConstevalHexDigit(const char c)
373  {
374      if (c >= '0' && c <= '9') return c - '0';
375      if (c >= 'a' && c <= 'f') return c - 'a' + 0xa;
376  
377      throw "Only lowercase hex digits are allowed, for consistency";
378  }
379  
380  namespace detail {
381  template <size_t N>
382  struct Hex {
383      std::array<std::byte, N / 2> bytes{};
384      consteval Hex(const char (&hex_str)[N])
385          // 2 hex digits required per byte + implicit null terminator
386          requires(N % 2 == 1)
387      {
388          if (hex_str[N - 1]) throw "null terminator required";
389          for (std::size_t i = 0; i < bytes.size(); ++i) {
390              bytes[i] = static_cast<std::byte>(
391                  (ConstevalHexDigit(hex_str[2 * i]) << 4) |
392                   ConstevalHexDigit(hex_str[2 * i + 1]));
393          }
394      }
395  };
396  } // namespace detail
397  
398  /**
399   * ""_hex is a compile-time user-defined literal returning a
400   * `std::array<std::byte>`, equivalent to ParseHex(). Variants provided:
401   *
402   * - ""_hex_v: Returns `std::vector<std::byte>`, useful for heap allocation or
403   *   variable-length serialization.
404   *
405   * - ""_hex_u8: Returns `std::array<uint8_t>`, for cases where `std::byte` is
406   *   incompatible.
407   *
408   * - ""_hex_v_u8: Returns `std::vector<uint8_t>`, combining heap allocation with
409   *   `uint8_t`.
410   *
411   * @warning It could be necessary to use vector instead of array variants when
412   *   serializing, or vice versa, because vectors are assumed to be variable-
413   *   length and serialized with a size prefix, while arrays are considered fixed
414   *   length and serialized with no prefix.
415   *
416   * @warning It may be preferable to use vector variants to save stack space when
417   *   declaring local variables if hex strings are large. Alternatively variables
418   *   could be declared constexpr to avoid using stack space.
419   *
420   * @warning Avoid `uint8_t` variants when not necessary, as the codebase
421   *   migrates to use `std::byte` instead of `unsigned char` and `uint8_t`.
422   *
423   * @note One reason ""_hex uses `std::array` instead of `std::vector` like
424   *   ParseHex() does is because heap-based containers cannot cross the compile-
425   *   time/runtime barrier.
426   */
427  inline namespace hex_literals {
428  
429  template <util::detail::Hex str>
430  constexpr auto operator""_hex() { return str.bytes; }
431  
432  template <util::detail::Hex str>
433  constexpr auto operator""_hex_u8() { return std::bit_cast<std::array<uint8_t, str.bytes.size()>>(str.bytes); }
434  
435  template <util::detail::Hex str>
436  constexpr auto operator""_hex_v() { return std::vector<std::byte>{str.bytes.begin(), str.bytes.end()}; }
437  
438  template <util::detail::Hex str>
439  inline auto operator""_hex_v_u8() { return std::vector<uint8_t>{UCharCast(str.bytes.data()), UCharCast(str.bytes.data() + str.bytes.size())}; }
440  
441  } // inline namespace hex_literals
442  } // namespace util
443  
444  #endif // BITCOIN_UTIL_STRENCODINGS_H