/ src / util / strencodings.cpp
strencodings.cpp
  1  // Copyright (c) 2009-2010 Satoshi Nakamoto
  2  // Copyright (c) 2009-2022 The Bitcoin Core developers
  3  // Distributed under the MIT software license, see the accompanying
  4  // file COPYING or http://www.opensource.org/licenses/mit-license.php.
  5  
  6  #include <span.h>
  7  #include <util/strencodings.h>
  8  
  9  #include <array>
 10  #include <cassert>
 11  #include <cstring>
 12  #include <limits>
 13  #include <optional>
 14  #include <ostream>
 15  #include <string>
 16  #include <vector>
 17  
/** Letters (both cases) and decimal digits; the common base of every whitelist below. */
static const std::string CHARS_ALPHA_NUM = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";

/**
 * Character whitelists, indexed by the `rule` argument of SanitizeString().
 * Each entry is CHARS_ALPHA_NUM plus the extra punctuation allowed by that rule.
 * NOTE(review): the trailing names presumably match enumerators declared in the
 * header, in this order -- confirm against util/strencodings.h.
 */
static const std::string SAFE_CHARS[] =
{
    CHARS_ALPHA_NUM + " .,;-_/:?@()", // SAFE_CHARS_DEFAULT
    CHARS_ALPHA_NUM + " .,;-_?@", // SAFE_CHARS_UA_COMMENT
    CHARS_ALPHA_NUM + ".-_", // SAFE_CHARS_FILENAME
    CHARS_ALPHA_NUM + "!*'();:@&=+$,/?#[]-_.~%", // SAFE_CHARS_URI
};
 27  
 28  std::string SanitizeString(std::string_view str, int rule)
 29  {
 30      std::string result;
 31      for (char c : str) {
 32          if (SAFE_CHARS[rule].find(c) != std::string::npos) {
 33              result.push_back(c);
 34          }
 35      }
 36      return result;
 37  }
 38  
/**
 * Lookup table from a byte value (0-255) to the value of the hexadecimal digit
 * it represents, or -1 when the byte is not a hexadecimal digit.
 * Each row below covers 16 consecutive byte values.
 */
const signed char p_util_hexdigit[256] =
{ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  0,1,2,3,4,5,6,7,8,9,-1,-1,-1,-1,-1,-1, // 0x30-0x3f: '0'..'9'
  -1,0xa,0xb,0xc,0xd,0xe,0xf,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0x40-0x4f: 'A'..'F'
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,0xa,0xb,0xc,0xd,0xe,0xf,-1,-1,-1,-1,-1,-1,-1,-1,-1, // 0x60-0x6f: 'a'..'f'
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1, };
 56  
 57  signed char HexDigit(char c)
 58  {
 59      return p_util_hexdigit[(unsigned char)c];
 60  }
 61  
 62  bool IsHex(std::string_view str)
 63  {
 64      for (char c : str) {
 65          if (HexDigit(c) < 0) return false;
 66      }
 67      return (str.size() > 0) && (str.size()%2 == 0);
 68  }
 69  
 70  bool IsHexNumber(std::string_view str)
 71  {
 72      if (str.substr(0, 2) == "0x") str.remove_prefix(2);
 73      for (char c : str) {
 74          if (HexDigit(c) < 0) return false;
 75      }
 76      // Return false for empty string or "0x".
 77      return str.size() > 0;
 78  }
 79  
 80  template <typename Byte>
 81  std::optional<std::vector<Byte>> TryParseHex(std::string_view str)
 82  {
 83      std::vector<Byte> vch;
 84      vch.reserve(str.size() / 2); // two hex characters form a single byte
 85  
 86      auto it = str.begin();
 87      while (it != str.end()) {
 88          if (IsSpace(*it)) {
 89              ++it;
 90              continue;
 91          }
 92          auto c1 = HexDigit(*(it++));
 93          if (it == str.end()) return std::nullopt;
 94          auto c2 = HexDigit(*(it++));
 95          if (c1 < 0 || c2 < 0) return std::nullopt;
 96          vch.push_back(Byte(c1 << 4) | Byte(c2));
 97      }
 98      return vch;
 99  }
100  template std::optional<std::vector<std::byte>> TryParseHex(std::string_view);
101  template std::optional<std::vector<uint8_t>> TryParseHex(std::string_view);
102  
103  bool SplitHostPort(std::string_view in, uint16_t& portOut, std::string& hostOut)
104  {
105      bool valid = false;
106      size_t colon = in.find_last_of(':');
107      // if a : is found, and it either follows a [...], or no other : is in the string, treat it as port separator
108      bool fHaveColon = colon != in.npos;
109      bool fBracketed = fHaveColon && (in[0] == '[' && in[colon - 1] == ']'); // if there is a colon, and in[0]=='[', colon is not 0, so in[colon-1] is safe
110      bool fMultiColon{fHaveColon && colon != 0 && (in.find_last_of(':', colon - 1) != in.npos)};
111      if (fHaveColon && (colon == 0 || fBracketed || !fMultiColon)) {
112          uint16_t n;
113          if (ParseUInt16(in.substr(colon + 1), &n)) {
114              in = in.substr(0, colon);
115              portOut = n;
116              valid = (portOut != 0);
117          }
118      } else {
119          valid = true;
120      }
121      if (in.size() > 0 && in[0] == '[' && in[in.size() - 1] == ']') {
122          hostOut = in.substr(1, in.size() - 2);
123      } else {
124          hostOut = in;
125      }
126  
127      return valid;
128  }
129  
130  std::string EncodeBase64(Span<const unsigned char> input)
131  {
132      static const char *pbase64 = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
133  
134      std::string str;
135      str.reserve(((input.size() + 2) / 3) * 4);
136      ConvertBits<8, 6, true>([&](int v) { str += pbase64[v]; }, input.begin(), input.end());
137      while (str.size() % 4) str += '=';
138      return str;
139  }
140  
141  std::optional<std::vector<unsigned char>> DecodeBase64(std::string_view str)
142  {
143      static const int8_t decode64_table[256]{
144          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
145          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
146          -1, -1, -1, 62, -1, -1, -1, 63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1,
147          -1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
148          15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1, 26, 27, 28,
149          29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
150          49, 50, 51, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
151          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
152          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
153          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
154          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
155          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
156          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
157      };
158  
159      if (str.size() % 4 != 0) return {};
160      /* One or two = characters at the end are permitted. */
161      if (str.size() >= 1 && str.back() == '=') str.remove_suffix(1);
162      if (str.size() >= 1 && str.back() == '=') str.remove_suffix(1);
163  
164      std::vector<unsigned char> ret;
165      ret.reserve((str.size() * 3) / 4);
166      bool valid = ConvertBits<6, 8, false>(
167          [&](unsigned char c) { ret.push_back(c); },
168          str.begin(), str.end(),
169          [](char c) { return decode64_table[uint8_t(c)]; }
170      );
171      if (!valid) return {};
172  
173      return ret;
174  }
175  
176  std::string EncodeBase32(Span<const unsigned char> input, bool pad)
177  {
178      static const char *pbase32 = "abcdefghijklmnopqrstuvwxyz234567";
179  
180      std::string str;
181      str.reserve(((input.size() + 4) / 5) * 8);
182      ConvertBits<8, 5, true>([&](int v) { str += pbase32[v]; }, input.begin(), input.end());
183      if (pad) {
184          while (str.size() % 8) {
185              str += '=';
186          }
187      }
188      return str;
189  }
190  
/**
 * Convenience overload: base32-encode the bytes of `str` by forwarding a view
 * of them (obtained with MakeUCharSpan) to the Span overload of EncodeBase32.
 *
 * @param str Text whose bytes are encoded.
 * @param pad Passed through unchanged to the Span overload.
 */
std::string EncodeBase32(std::string_view str, bool pad)
{
    return EncodeBase32(MakeUCharSpan(str), pad);
}
195  
196  std::optional<std::vector<unsigned char>> DecodeBase32(std::string_view str)
197  {
198      static const int8_t decode32_table[256]{
199          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
200          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
201          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 26, 27, 28, 29, 30, 31, -1, -1, -1, -1,
202          -1, -1, -1, -1, -1,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14,
203          15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, -1,  0,  1,  2,
204           3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
205          23, 24, 25, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
206          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
207          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
208          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
209          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
210          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
211          -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
212      };
213  
214      if (str.size() % 8 != 0) return {};
215      /* 1, 3, 4, or 6 padding '=' suffix characters are permitted. */
216      if (str.size() >= 1 && str.back() == '=') str.remove_suffix(1);
217      if (str.size() >= 2 && str.substr(str.size() - 2) == "==") str.remove_suffix(2);
218      if (str.size() >= 1 && str.back() == '=') str.remove_suffix(1);
219      if (str.size() >= 2 && str.substr(str.size() - 2) == "==") str.remove_suffix(2);
220  
221      std::vector<unsigned char> ret;
222      ret.reserve((str.size() * 5) / 8);
223      bool valid = ConvertBits<5, 8, false>(
224          [&](unsigned char c) { ret.push_back(c); },
225          str.begin(), str.end(),
226          [](char c) { return decode32_table[uint8_t(c)]; }
227      );
228  
229      if (!valid) return {};
230  
231      return ret;
232  }
233  
234  namespace {
235  template <typename T>
236  bool ParseIntegral(std::string_view str, T* out)
237  {
238      static_assert(std::is_integral<T>::value);
239      // Replicate the exact behavior of strtol/strtoll/strtoul/strtoull when
240      // handling leading +/- for backwards compatibility.
241      if (str.length() >= 2 && str[0] == '+' && str[1] == '-') {
242          return false;
243      }
244      const std::optional<T> opt_int = ToIntegral<T>((!str.empty() && str[0] == '+') ? str.substr(1) : str);
245      if (!opt_int) {
246          return false;
247      }
248      if (out != nullptr) {
249          *out = *opt_int;
250      }
251      return true;
252  }
253  }; // namespace
254  
/**
 * Parse `str` as an int32_t by delegating to ParseIntegral<int32_t>.
 *
 * @param str Text to parse.
 * @param out Receives the value on success; may be nullptr.
 * @return true on success, false otherwise.
 */
bool ParseInt32(std::string_view str, int32_t* out)
{
    return ParseIntegral<int32_t>(str, out);
}
259  
/**
 * Parse `str` as an int64_t by delegating to ParseIntegral<int64_t>.
 *
 * @param str Text to parse.
 * @param out Receives the value on success; may be nullptr.
 * @return true on success, false otherwise.
 */
bool ParseInt64(std::string_view str, int64_t* out)
{
    return ParseIntegral<int64_t>(str, out);
}
264  
/**
 * Parse `str` as a uint8_t by delegating to ParseIntegral<uint8_t>.
 *
 * @param str Text to parse.
 * @param out Receives the value on success; may be nullptr.
 * @return true on success, false otherwise.
 */
bool ParseUInt8(std::string_view str, uint8_t* out)
{
    return ParseIntegral<uint8_t>(str, out);
}
269  
/**
 * Parse `str` as a uint16_t by delegating to ParseIntegral<uint16_t>.
 *
 * @param str Text to parse.
 * @param out Receives the value on success; may be nullptr.
 * @return true on success, false otherwise.
 */
bool ParseUInt16(std::string_view str, uint16_t* out)
{
    return ParseIntegral<uint16_t>(str, out);
}
274  
/**
 * Parse `str` as a uint32_t by delegating to ParseIntegral<uint32_t>.
 *
 * @param str Text to parse.
 * @param out Receives the value on success; may be nullptr.
 * @return true on success, false otherwise.
 */
bool ParseUInt32(std::string_view str, uint32_t* out)
{
    return ParseIntegral<uint32_t>(str, out);
}
279  
/**
 * Parse `str` as a uint64_t by delegating to ParseIntegral<uint64_t>.
 *
 * @param str Text to parse.
 * @param out Receives the value on success; may be nullptr.
 * @return true on success, false otherwise.
 */
bool ParseUInt64(std::string_view str, uint64_t* out)
{
    return ParseIntegral<uint64_t>(str, out);
}
284  
/**
 * Word-wrap `in` so that lines break at a space or newline at or before column
 * `width`, indenting continuation lines of a wrapped line by `indent` spaces.
 *
 * Existing newlines are kept and reset the indentation. A word longer than the
 * remaining width is never split; it is emitted whole on its own line.
 *
 * The result is built directly in a std::string, so this function does not
 * depend on <sstream>.
 *
 * @param in     Text to format.
 * @param width  Maximum line width; must be >= indent (asserted).
 * @param indent Number of spaces prepended to continuation lines.
 * @return The formatted text.
 */
std::string FormatParagraph(std::string_view in, size_t width, size_t indent)
{
    assert(width >= indent);
    std::string out;
    out.reserve(in.size());
    size_t ptr = 0;      // start of the text not yet emitted
    size_t indented = 0; // columns already used by indentation on the current output line
    while (ptr < in.size()) {
        size_t lineend = in.find('\n', ptr);
        if (lineend == std::string_view::npos) {
            lineend = in.size();
        }
        const size_t linelen = lineend - ptr;
        const size_t rem_width = width - indented;
        if (linelen <= rem_width) {
            // The rest of this input line fits: copy it together with its newline (if any).
            out += in.substr(ptr, linelen + 1);
            ptr = lineend + 1;
            indented = 0;
        } else {
            // Find the last break opportunity that still fits in the remaining width.
            size_t finalspace = in.find_last_of(" \n", ptr + rem_width);
            if (finalspace == std::string_view::npos || finalspace < ptr) {
                // No place to break; just include the entire word and move on
                finalspace = in.find_first_of("\n ", ptr);
                if (finalspace == std::string_view::npos) {
                    // End of the string, just add it and break
                    out += in.substr(ptr);
                    break;
                }
            }
            out += in.substr(ptr, finalspace - ptr);
            out += '\n';
            if (in[finalspace] == '\n') {
                // Broke at a real newline: the next line starts a fresh paragraph line.
                indented = 0;
            } else if (indent) {
                out.append(indent, ' ');
                indented = indent;
            }
            ptr = finalspace + 1;
        }
    }
    return out;
}
326  
/** Upper bound for mantissa.
 * 10^18-1 is the largest arbitrary decimal that will fit in a signed 64-bit integer.
 * Larger integers cannot consist of arbitrary combinations of 0-9:
 *
 *   999999999999999999  10^18-1
 *  9223372036854775807  (1<<63)-1  (max int64_t)
 *  9999999999999999999  10^19-1    (would overflow)
 */
static const int64_t UPPER_BOUND = 1000000000000000000LL - 1LL;
336  
337  /** Helper function for ParseFixedPoint */
338  static inline bool ProcessMantissaDigit(char ch, int64_t &mantissa, int &mantissa_tzeros)
339  {
340      if(ch == '0')
341          ++mantissa_tzeros;
342      else {
343          for (int i=0; i<=mantissa_tzeros; ++i) {
344              if (mantissa > (UPPER_BOUND / 10LL))
345                  return false; /* overflow */
346              mantissa *= 10;
347          }
348          mantissa += ch - '0';
349          mantissa_tzeros = 0;
350      }
351      return true;
352  }
353  
/**
 * Parse a decimal number, optionally with a fraction and an exponent, into a
 * 64-bit fixed-point integer equal to the value multiplied by 10^decimals.
 *
 * Accepted syntax, as implemented below:
 *   - an optional leading '-' (a leading '+' is not accepted);
 *   - an integer part that is either a single '0' or digits starting with 1-9;
 *   - an optional '.' followed by at least one digit;
 *   - an optional 'e' or 'E', an optional '+' or '-', and at least one digit.
 * Any character left over after that makes the parse fail.
 *
 * @param val        Text to parse.
 * @param decimals   Number of decimal places carried by the fixed-point result.
 * @param amount_out Receives the result on success; may be null. Not written on failure.
 * @return true on success; false on a syntax error, when the value would need
 *         more than `decimals` fractional digits, or when it does not fit
 *         within +/- UPPER_BOUND.
 */
bool ParseFixedPoint(std::string_view val, int decimals, int64_t *amount_out)
{
    int64_t mantissa = 0;
    int64_t exponent = 0;
    int mantissa_tzeros = 0; // zeros seen but not yet applied to the mantissa
    bool mantissa_sign = false;
    bool exponent_sign = false;
    int ptr = 0;
    int end = val.size();
    int point_ofs = 0; // number of digits consumed after the decimal point

    if (ptr < end && val[ptr] == '-') {
        mantissa_sign = true;
        ++ptr;
    }
    /* integer part */
    if (ptr < end)
    {
        if (val[ptr] == '0') {
            /* pass single 0 */
            ++ptr;
        } else if (val[ptr] >= '1' && val[ptr] <= '9') {
            while (ptr < end && IsDigit(val[ptr])) {
                if (!ProcessMantissaDigit(val[ptr], mantissa, mantissa_tzeros))
                    return false; /* overflow */
                ++ptr;
            }
        } else return false; /* missing expected digit */
    } else return false; /* empty string or loose '-' */
    /* optional fractional part */
    if (ptr < end && val[ptr] == '.')
    {
        ++ptr;
        if (ptr < end && IsDigit(val[ptr]))
        {
            while (ptr < end && IsDigit(val[ptr])) {
                if (!ProcessMantissaDigit(val[ptr], mantissa, mantissa_tzeros))
                    return false; /* overflow */
                ++ptr;
                ++point_ofs;
            }
        } else return false; /* missing expected digit */
    }
    /* optional exponent */
    if (ptr < end && (val[ptr] == 'e' || val[ptr] == 'E'))
    {
        ++ptr;
        if (ptr < end && val[ptr] == '+')
            ++ptr;
        else if (ptr < end && val[ptr] == '-') {
            exponent_sign = true;
            ++ptr;
        }
        if (ptr < end && IsDigit(val[ptr])) {
            while (ptr < end && IsDigit(val[ptr])) {
                if (exponent > (UPPER_BOUND / 10LL))
                    return false; /* overflow */
                exponent = exponent * 10 + val[ptr] - '0';
                ++ptr;
            }
        } else return false; /* missing expected digit */
    }
    if (ptr != end)
        return false; /* trailing garbage */

    /* finalize exponent */
    if (exponent_sign)
        exponent = -exponent;
    /* fractional digits lower the exponent; zeros still pending in mantissa_tzeros raise it */
    exponent = exponent - point_ofs + mantissa_tzeros;

    /* finalize mantissa */
    if (mantissa_sign)
        mantissa = -mantissa;

    /* convert to one 64-bit fixed-point value */
    exponent += decimals;
    if (exponent < 0)
        return false; /* cannot represent values smaller than 10^-decimals */
    if (exponent >= 18)
        return false; /* cannot represent values larger than or equal to 10^(18-decimals) */

    for (int i=0; i < exponent; ++i) {
        if (mantissa > (UPPER_BOUND / 10LL) || mantissa < -(UPPER_BOUND / 10LL))
            return false; /* overflow */
        mantissa *= 10;
    }
    if (mantissa > UPPER_BOUND || mantissa < -UPPER_BOUND)
        return false; /* overflow */

    if (amount_out)
        *amount_out = mantissa;

    return true;
}
445  
446  std::string ToLower(std::string_view str)
447  {
448      std::string r;
449      r.reserve(str.size());
450      for (auto ch : str) r += ToLower(ch);
451      return r;
452  }
453  
454  std::string ToUpper(std::string_view str)
455  {
456      std::string r;
457      r.reserve(str.size());
458      for (auto ch : str) r += ToUpper(ch);
459      return r;
460  }
461  
462  std::string Capitalize(std::string str)
463  {
464      if (str.empty()) return str;
465      str[0] = ToUpper(str.front());
466      return str;
467  }
468  
namespace {

/** The two lowercase hexadecimal characters that represent one byte. */
using ByteAsHex = std::array<char, 2>;

/**
 * Build, at compile time, the table that maps every byte value to its
 * two-character lowercase hexadecimal form (high nibble first).
 */
constexpr std::array<ByteAsHex, 256> CreateByteToHexMap()
{
    constexpr char digits[] = "0123456789abcdef";

    std::array<ByteAsHex, 256> table{};
    for (size_t value = 0; value < table.size(); ++value) {
        ByteAsHex& entry = table[value];
        entry[0] = digits[value / 16]; // high nibble
        entry[1] = digits[value % 16]; // low nibble
    }
    return table;
}

} // namespace
486  
487  std::string HexStr(const Span<const uint8_t> s)
488  {
489      std::string rv(s.size() * 2, '\0');
490      static constexpr auto byte_to_hex = CreateByteToHexMap();
491      static_assert(sizeof(byte_to_hex) == 512);
492  
493      char* it = rv.data();
494      for (uint8_t v : s) {
495          std::memcpy(it, byte_to_hex[v].data(), 2);
496          it += 2;
497      }
498  
499      assert(it == rv.data() + rv.size());
500      return rv;
501  }
502  
503  std::optional<uint64_t> ParseByteUnits(std::string_view str, ByteUnit default_multiplier)
504  {
505      if (str.empty()) {
506          return std::nullopt;
507      }
508      auto multiplier = default_multiplier;
509      char unit = str.back();
510      switch (unit) {
511      case 'k':
512          multiplier = ByteUnit::k;
513          break;
514      case 'K':
515          multiplier = ByteUnit::K;
516          break;
517      case 'm':
518          multiplier = ByteUnit::m;
519          break;
520      case 'M':
521          multiplier = ByteUnit::M;
522          break;
523      case 'g':
524          multiplier = ByteUnit::g;
525          break;
526      case 'G':
527          multiplier = ByteUnit::G;
528          break;
529      case 't':
530          multiplier = ByteUnit::t;
531          break;
532      case 'T':
533          multiplier = ByteUnit::T;
534          break;
535      default:
536          unit = 0;
537          break;
538      }
539  
540      uint64_t unit_amount = static_cast<uint64_t>(multiplier);
541      auto parsed_num = ToIntegral<uint64_t>(unit ? str.substr(0, str.size() - 1) : str);
542      if (!parsed_num || parsed_num > std::numeric_limits<uint64_t>::max() / unit_amount) { // check overflow
543          return std::nullopt;
544      }
545      return *parsed_num * unit_amount;
546  }