/ src / univalue / include / univalue_utffilter.h
univalue_utffilter.h
  1  // Copyright 2016 Wladimir J. van der Laan
  2  // Distributed under the MIT software license, see the accompanying
  3  // file COPYING or https://opensource.org/licenses/mit-license.php.
  4  #ifndef BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
  5  #define BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
  6  
  7  #include <string>
  8  
  9  /**
 10   * Filter that generates and validates UTF-8, as well as collates UTF-16
 11   * surrogate pairs as specified in RFC4627.
 12   */
 13  class JSONUTF8StringFilter
 14  {
 15  public:
 16      explicit JSONUTF8StringFilter(std::string& s)
 17          : str(s)
 18      {
 19      }
 20      // Write single 8-bit char (may be part of UTF-8 sequence)
 21      void push_back(unsigned char ch)
 22      {
 23          if (state == 0) {
 24              if (ch < 0x80) // 7-bit ASCII, fast direct pass-through
 25                  str.push_back(ch);
 26              else if (ch < 0xc0) // Mid-sequence character, invalid in this state
 27                  is_valid = false;
 28              else if (ch < 0xe0) { // Start of 2-byte sequence
 29                  codepoint = (ch & 0x1f) << 6;
 30                  state = 6;
 31              } else if (ch < 0xf0) { // Start of 3-byte sequence
 32                  codepoint = (ch & 0x0f) << 12;
 33                  state = 12;
 34              } else if (ch < 0xf8) { // Start of 4-byte sequence
 35                  codepoint = (ch & 0x07) << 18;
 36                  state = 18;
 37              } else // Reserved, invalid
 38                  is_valid = false;
 39          } else {
 40              if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
 41                  is_valid = false;
 42              state -= 6;
 43              codepoint |= (ch & 0x3f) << state;
 44              if (state == 0)
 45                  push_back_u(codepoint);
 46          }
 47      }
 48      // Write codepoint directly, possibly collating surrogate pairs
 49      void push_back_u(unsigned int codepoint_)
 50      {
 51          if (state) // Only accept full codepoints in open state
 52              is_valid = false;
 53          if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) { // First half of surrogate pair
 54              if (surpair) // Two subsequent surrogate pair openers - fail
 55                  is_valid = false;
 56              else
 57                  surpair = codepoint_;
 58          } else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) { // Second half of surrogate pair
 59              if (surpair) { // Open surrogate pair, expect second half
 60                  // Compute code point from UTF-16 surrogate pair
 61                  append_codepoint(0x10000 | ((surpair - 0xD800)<<10) | (codepoint_ - 0xDC00));
 62                  surpair = 0;
 63              } else // Second half doesn't follow a first half - fail
 64                  is_valid = false;
 65          } else {
 66              if (surpair) // First half of surrogate pair not followed by second - fail
 67                  is_valid = false;
 68              else
 69                  append_codepoint(codepoint_);
 70          }
 71      }
 72      // Check that we're in a state where the string can be ended
 73      // No open sequences, no open surrogate pairs, etc
 74      bool finalize()
 75      {
 76          if (state || surpair)
 77              is_valid = false;
 78          return is_valid;
 79      }
 80  private:
 81      std::string &str;
 82      bool is_valid{true};
 83      // Current UTF-8 decoding state
 84      unsigned int codepoint{0};
 85      int state{0}; // Top bit to be filled in for next UTF-8 byte, or 0
 86  
 87      // Keep track of the following state to handle the following section of
 88      // RFC4627:
 89      //
 90      //    To escape an extended character that is not in the Basic Multilingual
 91      //    Plane, the character is represented as a twelve-character sequence,
 92      //    encoding the UTF-16 surrogate pair.  So, for example, a string
 93      //    containing only the G clef character (U+1D11E) may be represented as
 94      //    "\uD834\uDD1E".
 95      //
 96      //  Two subsequent \u.... may have to be replaced with one actual codepoint.
 97      unsigned int surpair{0}; // First half of open UTF-16 surrogate pair, or 0
 98  
 99      void append_codepoint(unsigned int codepoint_)
100      {
101          if (codepoint_ <= 0x7f)
102              str.push_back((char)codepoint_);
103          else if (codepoint_ <= 0x7FF) {
104              str.push_back((char)(0xC0 | (codepoint_ >> 6)));
105              str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
106          } else if (codepoint_ <= 0xFFFF) {
107              str.push_back((char)(0xE0 | (codepoint_ >> 12)));
108              str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F)));
109              str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
110          } else if (codepoint_ <= 0x1FFFFF) {
111              str.push_back((char)(0xF0 | (codepoint_ >> 18)));
112              str.push_back((char)(0x80 | ((codepoint_ >> 12) & 0x3F)));
113              str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F)));
114              str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
115          }
116      }
117  };
118  
119  #endif // BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H