/ src / util / strencodings.h
strencodings.h
  1  // Copyright (c) 2009-2010 Satoshi Nakamoto
  2  // Copyright (c) 2009-present The Bitcoin Core developers
  3  // Distributed under the MIT software license, see the accompanying
  4  // file COPYING or http://www.opensource.org/licenses/mit-license.php.
  5  
  6  /**
  7   * Utilities for converting data from/to strings.
  8   */
  9  #ifndef BITCOIN_UTIL_STRENCODINGS_H
 10  #define BITCOIN_UTIL_STRENCODINGS_H
 11  
 12  #include <span.h>
 13  #include <util/string.h>
 14  
 15  #include <array>
 16  #include <bit>
 17  #include <charconv>
 18  #include <cstddef>
 19  #include <cstdint>
 20  #include <limits>
 21  #include <optional>
 22  #include <span>
 23  #include <string>
 24  #include <string_view>
 25  #include <system_error>
 26  #include <type_traits>
 27  #include <vector>
 28  
 29  /** Used by SanitizeString() */
 30  enum SafeChars
 31  {
 32      SAFE_CHARS_DEFAULT, //!< The full set of allowed chars
 33      SAFE_CHARS_UA_COMMENT, //!< BIP-0014 subset
 34      SAFE_CHARS_FILENAME, //!< Chars allowed in filenames
 35      SAFE_CHARS_URI, //!< Chars allowed in URIs (RFC 3986)
 36  };
 37  
 38  /**
 39   * Used by ParseByteUnits()
 40   * Lowercase base 1000
 41   * Uppercase base 1024
 42  */
 43  enum class ByteUnit : uint64_t {
 44      NOOP = 1ULL,
 45      k = 1000ULL,
 46      K = 1024ULL,
 47      m = 1'000'000ULL,
 48      M = 1ULL << 20,
 49      g = 1'000'000'000ULL,
 50      G = 1ULL << 30,
 51      t = 1'000'000'000'000ULL,
 52      T = 1ULL << 40,
 53  };
 54  
 55  /**
 56  * Remove unsafe chars. Safe chars chosen to allow simple messages/URLs/email
 57  * addresses, but avoid anything even possibly remotely dangerous like & or >
 58  * @param[in] str    The string to sanitize
 59  * @param[in] rule   The set of safe chars to choose (default: least restrictive)
 60  * @return           A new string without unsafe chars
 61  */
 62  std::string SanitizeString(std::string_view str, int rule = SAFE_CHARS_DEFAULT);
 63  /** Parse the hex string into bytes (uint8_t or std::byte). Ignores whitespace. Returns nullopt on invalid input. */
 64  template <typename Byte = std::byte>
 65  std::optional<std::vector<Byte>> TryParseHex(std::string_view str);
 66  /** Like TryParseHex, but returns an empty vector on invalid input. */
 67  template <typename Byte = uint8_t>
 68  std::vector<Byte> ParseHex(std::string_view hex_str)
 69  {
 70      return TryParseHex<Byte>(hex_str).value_or(std::vector<Byte>{});
 71  }
/* Returns true if every character in str is a hex digit and str contains an
 * even number of hex digits. */
 74  bool IsHex(std::string_view str);
 75  std::optional<std::vector<unsigned char>> DecodeBase64(std::string_view str);
 76  std::string EncodeBase64(std::span<const unsigned char> input);
 77  inline std::string EncodeBase64(std::span<const std::byte> input) { return EncodeBase64(MakeUCharSpan(input)); }
 78  inline std::string EncodeBase64(std::string_view str) { return EncodeBase64(MakeUCharSpan(str)); }
 79  std::optional<std::vector<unsigned char>> DecodeBase32(std::string_view str);
 80  
 81  /**
 82   * Base32 encode.
 83   * If `pad` is true, then the output will be padded with '=' so that its length
 84   * is a multiple of 8.
 85   */
 86  std::string EncodeBase32(std::span<const unsigned char> input, bool pad = true);
 87  
 88  /**
 89   * Base32 encode.
 90   * If `pad` is true, then the output will be padded with '=' so that its length
 91   * is a multiple of 8.
 92   */
 93  std::string EncodeBase32(std::string_view str, bool pad = true);
 94  
 95  /**
 96   * Splits socket address string into host string and port value.
 97   * Validates port value.
 98   *
 99   * @param[in] in        The socket address string to split.
100   * @param[out] portOut  Port-portion of the input, if found and parsable.
101   * @param[out] hostOut  Host-portion of the input, if found.
102   * @return              true if port-portion is absent or within its allowed range, otherwise false
103   */
104  bool SplitHostPort(std::string_view in, uint16_t& portOut, std::string& hostOut);
105  
106  // LocaleIndependentAtoi is provided for backwards compatibility reasons.
107  //
108  // New code should use ToIntegral.
109  //
110  // The goal of LocaleIndependentAtoi is to replicate the defined behaviour of
111  // std::atoi as it behaves under the "C" locale, and remove some undefined
112  // behavior. If the parsed value is bigger than the integer type's maximum
113  // value, or smaller than the integer type's minimum value, std::atoi has
114  // undefined behavior, while this function returns the maximum or minimum
115  // values, respectively.
116  template <typename T>
117  T LocaleIndependentAtoi(std::string_view str)
118  {
119      static_assert(std::is_integral_v<T>);
120      T result;
121      // Emulate atoi(...) handling of white space and leading +/-.
122      std::string_view s = util::TrimStringView(str);
123      if (!s.empty() && s[0] == '+') {
124          if (s.length() >= 2 && s[1] == '-') {
125              return 0;
126          }
127          s = s.substr(1);
128      }
129      auto [_, error_condition] = std::from_chars(s.data(), s.data() + s.size(), result);
130      if (error_condition == std::errc::result_out_of_range) {
131          if (s.length() >= 1 && s[0] == '-') {
132              // Saturate underflow, per strtoll's behavior.
133              return std::numeric_limits<T>::min();
134          } else {
135              // Saturate overflow, per strtoll's behavior.
136              return std::numeric_limits<T>::max();
137          }
138      } else if (error_condition != std::errc{}) {
139          return 0;
140      }
141      return result;
142  }
143  
/**
 * Checks whether a character is one of the decimal digits '0' through '9'.
 * @param[in] c     character to test
 * @return          true if c is a decimal digit; otherwise false.
 */
constexpr bool IsDigit(char c)
{
    return !(c < '0' || '9' < c);
}
153  
/**
 * Checks whether a character is whitespace. The characters counted as
 * whitespace are: space, form-feed ('\f'), newline ('\n'), carriage return
 * ('\r'), horizontal tab ('\t'), and vertical tab ('\v').
 *
 * This function does not depend on the locale. Under the C locale it gives
 * the same result as std::isspace.
 *
 * @param[in] c     character to test
 * @return          true if c is a whitespace character; otherwise false
 */
constexpr inline bool IsSpace(char c) noexcept {
    switch (c) {
    case ' ':
    case '\f':
    case '\n':
    case '\r':
    case '\t':
    case '\v':
        return true;
    default:
        return false;
    }
}
168  
/**
 * Convert string to integral type T. Leading whitespace, a leading +, or any
 * trailing character fail the parsing. The required format expressed as regex
 * is `-?[0-9]+` by default (or `-?[0-9a-fA-F]+` if base = 16).
 * The minus sign is only permitted for signed integer types.
 *
 * @param[in] str   the string to parse; it must be consumed entirely.
 * @param[in] base  the numeric base, which must lie in [2, 36] (the range
 *                  accepted by std::from_chars).
 *
 * @returns std::nullopt if the entire string could not be parsed, if the
 *   parsed value is not in the range representable by the type T, or if
 *   base is outside [2, 36].
 */
template <typename T>
std::optional<T> ToIntegral(std::string_view str, size_t base = 10)
{
    static_assert(std::is_integral_v<T>);
    // std::from_chars takes an int base and requires 2 <= base <= 36. Reject
    // anything else up front instead of narrowing an arbitrary size_t and
    // violating that precondition.
    if (base < 2 || base > 36) {
        return std::nullopt;
    }
    T result;
    const char* const end{str.data() + str.size()};
    const auto [first_nonmatching, error_condition] = std::from_chars(str.data(), end, result, static_cast<int>(base));
    // Succeed only if parsing raised no error and consumed every character.
    if (first_nonmatching != end || error_condition != std::errc{}) {
        return std::nullopt;
    }
    return result;
}
189  
190  /**
191   * Format a paragraph of text to a fixed width, adding spaces for
192   * indentation to any added line.
193   */
194  std::string FormatParagraph(std::string_view in, size_t width = 79, size_t indent = 0);
195  
/**
 * Equality check designed to resist timing attacks.
 * The loop visits every element of the first argument regardless of where
 * the inputs differ, so the running time is proportional to the length of
 * the first argument.
 */
template <typename T>
bool TimingResistantEqual(const T& a, const T& b)
{
    const size_t len_a = a.size();
    const size_t len_b = b.size();
    // An empty b cannot be indexed below; it only matches an empty a.
    if (len_b == 0) return len_a == 0;
    // Any length mismatch leaves non-zero bits in the accumulated difference.
    size_t diff = len_a ^ len_b;
    for (size_t pos = 0; pos != len_a; ++pos) {
        // b is indexed modulo its length so the access stays in range.
        diff |= size_t(a[pos] ^ b[pos % len_b]);
    }
    return diff == 0;
}
210  
211  /** Parse number as fixed point according to JSON number syntax.
212   * @returns true on success, false on error.
213   * @note The result must be in the range (-10^18,10^18), otherwise an overflow error will trigger.
214   */
215  [[nodiscard]] bool ParseFixedPoint(std::string_view, int decimals, int64_t *amount_out);
216  
namespace {
/** Default `infn` for ConvertBits: hands each input value back unchanged. */
struct IntIdentity
{
    [[maybe_unused]] int operator()(int value) const
    {
        return value;
    }
};

} // namespace

/**
 * Convert from one power-of-2 number base to another.
 *
 * Reads values of `frombits` bits each from [it, end), maps each through
 * `infn`, and passes the regrouped `tobits`-bit values to `outfn` in order.
 *
 * @returns false if `infn` yields a negative value. With `pad` false, also
 *   returns false when `frombits` or more bits are left over or when the
 *   left-over bits are not all zero. Returns true otherwise. With `pad` true,
 *   left-over bits are emitted as one final value, zero-filled on the right.
 */
template<int frombits, int tobits, bool pad, typename O, typename It, typename I = IntIdentity>
bool ConvertBits(O outfn, It it, It end, I infn = {}) {
    constexpr size_t out_mask = (1 << tobits) - 1;
    constexpr size_t buffer_mask = (1 << (frombits + tobits - 1)) - 1;
    size_t buffer = 0;  // most recently read bits, newest in the low positions
    size_t pending = 0; // number of bits in buffer not yet emitted
    for (; it != end; ++it) {
        const int digit = infn(*it);
        if (digit < 0) return false;
        buffer = ((buffer << frombits) | digit) & buffer_mask;
        pending += frombits;
        while (pending >= tobits) {
            pending -= tobits;
            outfn((buffer >> pending) & out_mask);
        }
    }
    // The left-over bits shifted into the top of one output value.
    const auto leftover = [&] { return (buffer << (tobits - pending)) & out_mask; };
    if (pad) {
        if (pending) outfn(leftover());
        return true;
    }
    return pending < frombits && !leftover();
}
251  
/**
 * Maps an uppercase ASCII letter to its lowercase counterpart.
 * This function does not depend on the locale. Only the uppercase
 * characters of the standard 7-bit ASCII range are converted.
 * This is a feature, not a limitation.
 *
 * @param[in] c     the character to convert to lowercase.
 * @return          the lowercase equivalent of c; or c itself
 *                  if no conversion is possible.
 */
constexpr char ToLower(char c)
{
    if (c < 'A' || c > 'Z') return c;
    return static_cast<char>(c + ('a' - 'A'));
}
266  
267  /**
268   * Returns the lowercase equivalent of the given string.
269   * This function is locale independent. It only converts uppercase
270   * characters in the standard 7-bit ASCII range.
271   * This is a feature, not a limitation.
272   *
273   * @param[in] str   the string to convert to lowercase.
274   * @returns         lowercased equivalent of str
275   */
276  std::string ToLower(std::string_view str);
277  
/**
 * Maps a lowercase ASCII letter to its uppercase counterpart.
 * This function does not depend on the locale. Only the lowercase
 * characters of the standard 7-bit ASCII range are converted.
 * This is a feature, not a limitation.
 *
 * @param[in] c     the character to convert to uppercase.
 * @return          the uppercase equivalent of c; or c itself
 *                  if no conversion is possible.
 */
constexpr char ToUpper(char c)
{
    if (c < 'a' || c > 'z') return c;
    return static_cast<char>(c - ('a' - 'A'));
}
292  
293  /**
294   * Returns the uppercase equivalent of the given string.
295   * This function is locale independent. It only converts lowercase
296   * characters in the standard 7-bit ASCII range.
297   * This is a feature, not a limitation.
298   *
299   * @param[in] str   the string to convert to uppercase.
300   * @returns         UPPERCASED EQUIVALENT OF str
301   */
302  std::string ToUpper(std::string_view str);
303  
304  /**
305   * Capitalizes the first character of the given string.
306   * This function is locale independent. It only converts lowercase
307   * characters in the standard 7-bit ASCII range.
308   * This is a feature, not a limitation.
309   *
310   * @param[in] str   the string to capitalize.
311   * @returns         string with the first letter capitalized.
312   */
313  std::string Capitalize(std::string str);
314  
315  /**
316   * Parse a string with suffix unit [k|K|m|M|g|G|t|T].
317   * Must be a whole integer, fractions not allowed (0.5t), no whitespace or +-
318   * Lowercase units are 1000 base. Uppercase units are 1024 base.
319   * Examples: 2m,27M,19g,41T
320   *
321   * @param[in] str                  the string to convert into bytes
322   * @param[in] default_multiplier   if no unit is found in str use this unit
 * @returns                        the number of bytes as an optional uint64_t, or std::nullopt
 *                                 if str is empty, cannot be parsed by ToIntegral, has trailing
 *                                 whitespace, or the result overflows
325   */
326  std::optional<uint64_t> ParseByteUnits(std::string_view str, ByteUnit default_multiplier);
327  
328  /**
329   *  Locale-independent, ASCII-only comparator
330   *  @param[in] s1 a string to compare
331   *  @param[in] s2 another string to compare
332   *  @returns true if s1 == s2 when both strings are converted to lowercase
333   */
334  bool CaseInsensitiveEqual(std::string_view s1, std::string_view s2);
335  
336  namespace util {
337  /** consteval version of HexDigit() without the lookup table. */
338  consteval uint8_t ConstevalHexDigit(const char c)
339  {
340      if (c >= '0' && c <= '9') return c - '0';
341      if (c >= 'a' && c <= 'f') return c - 'a' + 0xa;
342  
343      throw "Only lowercase hex digits are allowed, for consistency";
344  }
345  
346  namespace detail {
347  template <size_t N>
348  struct Hex {
349      std::array<std::byte, N / 2> bytes{};
350      consteval Hex(const char (&hex_str)[N])
351          // 2 hex digits required per byte + implicit null terminator
352          requires(N % 2 == 1)
353      {
354          if (hex_str[N - 1]) throw "null terminator required";
355          for (std::size_t i = 0; i < bytes.size(); ++i) {
356              bytes[i] = static_cast<std::byte>(
357                  (ConstevalHexDigit(hex_str[2 * i]) << 4) |
358                   ConstevalHexDigit(hex_str[2 * i + 1]));
359          }
360      }
361  };
362  } // namespace detail
363  
364  /**
365   * ""_hex is a compile-time user-defined literal returning a
366   * `std::array<std::byte>`, equivalent to ParseHex(). Variants provided:
367   *
368   * - ""_hex_v: Returns `std::vector<std::byte>`, useful for heap allocation or
369   *   variable-length serialization.
370   *
371   * - ""_hex_u8: Returns `std::array<uint8_t>`, for cases where `std::byte` is
372   *   incompatible.
373   *
374   * - ""_hex_v_u8: Returns `std::vector<uint8_t>`, combining heap allocation with
375   *   `uint8_t`.
376   *
377   * @warning It could be necessary to use vector instead of array variants when
378   *   serializing, or vice versa, because vectors are assumed to be variable-
379   *   length and serialized with a size prefix, while arrays are considered fixed
380   *   length and serialized with no prefix.
381   *
382   * @warning It may be preferable to use vector variants to save stack space when
383   *   declaring local variables if hex strings are large. Alternatively variables
384   *   could be declared constexpr to avoid using stack space.
385   *
386   * @warning Avoid `uint8_t` variants when not necessary, as the codebase
387   *   migrates to use `std::byte` instead of `unsigned char` and `uint8_t`.
388   *
389   * @note One reason ""_hex uses `std::array` instead of `std::vector` like
390   *   ParseHex() does is because heap-based containers cannot cross the compile-
391   *   time/runtime barrier.
392   */
393  inline namespace hex_literals {
394  
395  template <util::detail::Hex str>
396  constexpr auto operator""_hex() { return str.bytes; }
397  
398  template <util::detail::Hex str>
399  constexpr auto operator""_hex_u8() { return std::bit_cast<std::array<uint8_t, str.bytes.size()>>(str.bytes); }
400  
401  template <util::detail::Hex str>
402  constexpr auto operator""_hex_v() { return std::vector<std::byte>{str.bytes.begin(), str.bytes.end()}; }
403  
404  template <util::detail::Hex str>
405  inline auto operator""_hex_v_u8() { return std::vector<uint8_t>{UCharCast(str.bytes.data()), UCharCast(str.bytes.data() + str.bytes.size())}; }
406  
407  } // inline namespace hex_literals
408  } // namespace util
409  
410  #endif // BITCOIN_UTIL_STRENCODINGS_H