|              Branch data     Line data    Source code 
       1                 :             : // Copyright 2016 Wladimir J. van der Laan
       2                 :             : // Distributed under the MIT software license, see the accompanying
       3                 :             : // file COPYING or https://opensource.org/licenses/mit-license.php.
       4                 :             : #ifndef BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
       5                 :             : #define BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
       6                 :             : 
       7                 :             : #include <string>
       8                 :             : 
       9                 :             : /**
      10                 :             :  * Filter that generates and validates UTF-8, as well as collates UTF-16
      11                 :             :  * surrogate pairs as specified in RFC4627.
      12                 :             :  */
      13                 :             : class JSONUTF8StringFilter
      14                 :             : {
      15                 :             : public:
      16                 :     1561793 :     explicit JSONUTF8StringFilter(std::string& s)
      17                 :     1561793 :         : str(s)
      18                 :             :     {
      19                 :     1561793 :     }
      20                 :             :     // Write single 8-bit char (may be part of UTF-8 sequence)
      21                 :  3451324883 :     void push_back(unsigned char ch)
      22                 :             :     {
      23         [ +  + ]:  3451324883 :         if (state == 0) {
      24         [ +  + ]:  3451324552 :             if (ch < 0x80) // 7-bit ASCII, fast direct pass-through
      25                 :  3451324416 :                 str.push_back(ch);
      26         [ +  + ]:         136 :             else if (ch < 0xc0) // Mid-sequence character, invalid in this state
      27                 :           3 :                 is_valid = false;
      28         [ +  + ]:         133 :             else if (ch < 0xe0) { // Start of 2-byte sequence
      29                 :           1 :                 codepoint = (ch & 0x1f) << 6;
      30                 :           1 :                 state = 6;
      31         [ +  + ]:         132 :             } else if (ch < 0xf0) { // Start of 3-byte sequence
      32                 :          65 :                 codepoint = (ch & 0x0f) << 12;
      33                 :          65 :                 state = 12;
      34         [ +  - ]:          67 :             } else if (ch < 0xf8) { // Start of 4-byte sequence
      35                 :          67 :                 codepoint = (ch & 0x07) << 18;
      36                 :          67 :                 state = 18;
      37                 :             :             } else // Reserved, invalid
      38                 :           0 :                 is_valid = false;
      39                 :             :         } else {
      40         [ -  + ]:         331 :             if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
      41                 :           0 :                 is_valid = false;
      42                 :         331 :             state -= 6;
      43                 :         331 :             codepoint |= (ch & 0x3f) << state;
      44         [ +  + ]:         331 :             if (state == 0)
      45                 :         132 :                 push_back_u(codepoint);
      46                 :             :         }
      47                 :  3451324883 :     }
      48                 :             :     // Write codepoint directly, possibly collating surrogate pairs
      49                 :         697 :     void push_back_u(unsigned int codepoint_)
      50                 :             :     {
      51         [ -  + ]:         697 :         if (state) // Only accept full codepoints in open state
      52                 :           0 :             is_valid = false;
      53         [ +  + ]:         697 :         if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) { // First half of surrogate pair
      54         [ -  + ]:         152 :             if (surpair) // Two subsequent surrogate pair openers - fail
      55                 :           0 :                 is_valid = false;
      56                 :             :             else
      57                 :         152 :                 surpair = codepoint_;
      58         [ +  + ]:         545 :         } else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) { // Second half of surrogate pair
      59         [ +  + ]:         152 :             if (surpair) { // Open surrogate pair, expect second half
      60                 :             :                 // Compute code point from UTF-16 surrogate pair
      61                 :         151 :                 append_codepoint(0x10000 | ((surpair - 0xD800)<<10) | (codepoint_ - 0xDC00));
      62                 :         151 :                 surpair = 0;
      63                 :             :             } else // Second half doesn't follow a first half - fail
      64                 :           1 :                 is_valid = false;
      65                 :             :         } else {
      66         [ -  + ]:         393 :             if (surpair) // First half of surrogate pair not followed by second - fail
      67                 :           0 :                 is_valid = false;
      68                 :             :             else
      69                 :         393 :                 append_codepoint(codepoint_);
      70                 :             :         }
      71                 :         697 :     }
      72                 :             :     // Check that we're in a state where the string can be ended
      73                 :             :     // No open sequences, no open surrogate pairs, etc
      74                 :     1561785 :     bool finalize()
      75                 :             :     {
      76   [ +  +  +  + ]:     1561785 :         if (state || surpair)
      77                 :           2 :             is_valid = false;
      78         [ +  + ]:     1561785 :         return is_valid;
      79                 :             :     }
      80                 :             : private:
      81                 :             :     std::string &str;
      82                 :             :     bool is_valid{true};
      83                 :             :     // Current UTF-8 decoding state
      84                 :             :     unsigned int codepoint{0};
      85                 :             :     int state{0}; // Top bit to be filled in for next UTF-8 byte, or 0
      86                 :             : 
      87                 :             :     // Keep track of the following state to handle the following section of
      88                 :             :     // RFC4627:
      89                 :             :     //
      90                 :             :     //    To escape an extended character that is not in the Basic Multilingual
      91                 :             :     //    Plane, the character is represented as a twelve-character sequence,
      92                 :             :     //    encoding the UTF-16 surrogate pair.  So, for example, a string
      93                 :             :     //    containing only the G clef character (U+1D11E) may be represented as
      94                 :             :     //    "\uD834\uDD1E".
      95                 :             :     //
      96                 :             :     //  Two subsequent \u.... may have to be replaced with one actual codepoint.
      97                 :             :     unsigned int surpair{0}; // First half of open UTF-16 surrogate pair, or 0
      98                 :             : 
      99                 :         544 :     void append_codepoint(unsigned int codepoint_)
     100                 :             :     {
     101         [ +  + ]:         544 :         if (codepoint_ <= 0x7f)
     102                 :          71 :             str.push_back((char)codepoint_);
     103         [ +  + ]:         473 :         else if (codepoint_ <= 0x7FF) {
     104                 :          11 :             str.push_back((char)(0xC0 | (codepoint_ >> 6)));
     105                 :          11 :             str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
     106         [ +  + ]:         462 :         } else if (codepoint_ <= 0xFFFF) {
     107                 :         245 :             str.push_back((char)(0xE0 | (codepoint_ >> 12)));
     108                 :         245 :             str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F)));
     109                 :         245 :             str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
     110         [ +  - ]:         217 :         } else if (codepoint_ <= 0x1FFFFF) {
     111                 :         217 :             str.push_back((char)(0xF0 | (codepoint_ >> 18)));
     112                 :         217 :             str.push_back((char)(0x80 | ((codepoint_ >> 12) & 0x3F)));
     113                 :         217 :             str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F)));
     114                 :         217 :             str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
     115                 :             :         }
     116                 :         544 :     }
     117                 :             : };
     118                 :             : 
     119                 :             : #endif // BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
         |