LCOV - code coverage report
Current view: top level - src/univalue/include - univalue_utffilter.h (source / functions) Coverage Total Hit
Test: fuzz_coverage.info Lines: 100.0 % 62 62
Test Date: 2025-01-22 04:09:46 Functions: 100.0 % 3 3
Branches: 97.5 % 40 39

             Branch data     Line data    Source code
       1                 :             : // Copyright 2016 Wladimir J. van der Laan
       2                 :             : // Distributed under the MIT software license, see the accompanying
       3                 :             : // file COPYING or https://opensource.org/licenses/mit-license.php.
       4                 :             : #ifndef BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
       5                 :             : #define BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
       6                 :             : 
       7                 :             : #include <string>
       8                 :             : 
       9                 :             : /**
      10                 :             :  * Filter that generates and validates UTF-8, as well as collates UTF-16
      11                 :             :  * surrogate pairs as specified in RFC4627.
      12                 :             :  */
      13                 :             : class JSONUTF8StringFilter
      14                 :             : {
      15                 :             : public:
      16                 :      609378 :     explicit JSONUTF8StringFilter(std::string& s)
      17                 :      609378 :         : str(s)
      18                 :             :     {
      19                 :      609378 :     }
      20                 :             :     // Write single 8-bit char (may be part of UTF-8 sequence)
      21                 :    63229733 :     void push_back(unsigned char ch)
      22                 :             :     {
      23         [ +  + ]:    63229733 :         if (state == 0) {
      24         [ +  + ]:    62498202 :             if (ch < 0x80) // 7-bit ASCII, fast direct pass-through
      25                 :    62108602 :                 str.push_back(ch);
      26         [ +  + ]:      389600 :             else if (ch < 0xc0) // Mid-sequence character, invalid in this state
      27                 :       11130 :                 is_valid = false;
      28         [ +  + ]:      378470 :             else if (ch < 0xe0) { // Start of 2-byte sequence
      29                 :      150002 :                 codepoint = (ch & 0x1f) << 6;
      30                 :      150002 :                 state = 6;
      31         [ +  + ]:      228468 :             } else if (ch < 0xf0) { // Start of 3-byte sequence
      32                 :       21042 :                 codepoint = (ch & 0x0f) << 12;
      33                 :       21042 :                 state = 12;
      34         [ +  + ]:      207426 :             } else if (ch < 0xf8) { // Start of 4-byte sequence
      35                 :      179871 :                 codepoint = (ch & 0x07) << 18;
      36                 :      179871 :                 state = 18;
      37                 :             :             } else // Reserved, invalid
      38                 :       27555 :                 is_valid = false;
      39                 :             :         } else {
      40         [ +  + ]:      731531 :             if ((ch & 0xc0) != 0x80) // Not a continuation, invalid
      41                 :      318305 :                 is_valid = false;
      42                 :      731531 :             state -= 6;
      43                 :      731531 :             codepoint |= (ch & 0x3f) << state;
      44         [ +  + ]:      731531 :             if (state == 0)
      45                 :      350794 :                 push_back_u(codepoint);
      46                 :             :         }
      47                 :    63229733 :     }
      48                 :             :     // Write codepoint directly, possibly collating surrogate pairs
      49                 :      373121 :     void push_back_u(unsigned int codepoint_)
      50                 :             :     {
      51         [ +  + ]:      373121 :         if (state) // Only accept full codepoints in open state
      52                 :        2610 :             is_valid = false;
      53         [ +  + ]:      373121 :         if (codepoint_ >= 0xD800 && codepoint_ < 0xDC00) { // First half of surrogate pair
      54         [ +  + ]:        8155 :             if (surpair) // Two subsequent surrogate pair openers - fail
      55                 :        4051 :                 is_valid = false;
      56                 :             :             else
      57                 :        4104 :                 surpair = codepoint_;
      58         [ +  + ]:      364966 :         } else if (codepoint_ >= 0xDC00 && codepoint_ < 0xE000) { // Second half of surrogate pair
      59         [ +  + ]:        7694 :             if (surpair) { // Open surrogate pair, expect second half
      60                 :             :                 // Compute code point from UTF-16 surrogate pair
      61                 :        3951 :                 append_codepoint(0x10000 | ((surpair - 0xD800)<<10) | (codepoint_ - 0xDC00));
      62                 :        3951 :                 surpair = 0;
      63                 :             :             } else // Second half doesn't follow a first half - fail
      64                 :        3743 :                 is_valid = false;
      65                 :             :         } else {
      66         [ +  + ]:      357272 :             if (surpair) // First half of surrogate pair not followed by second - fail
      67                 :        4679 :                 is_valid = false;
      68                 :             :             else
      69                 :      352593 :                 append_codepoint(codepoint_);
      70                 :             :         }
      71                 :      373121 :     }
      72                 :             :     // Check that we're in a state where the string can be ended
      73                 :             :     // No open sequences, no open surrogate pairs, etc
      74                 :      608609 :     bool finalize()
      75                 :             :     {
      76         [ +  + ]:      608609 :         if (state || surpair)
      77                 :          57 :             is_valid = false;
      78         [ +  + ]:      608609 :         return is_valid;
      79                 :             :     }
      80                 :             : private:
      81                 :             :     std::string &str;
      82                 :             :     bool is_valid{true};
      83                 :             :     // Current UTF-8 decoding state
      84                 :             :     unsigned int codepoint{0};
      85                 :             :     int state{0}; // Top bit to be filled in for next UTF-8 byte, or 0
      86                 :             : 
      87                 :             :     // Keep track of the following state to handle the following section of
      88                 :             :     // RFC4627:
      89                 :             :     //
      90                 :             :     //    To escape an extended character that is not in the Basic Multilingual
      91                 :             :     //    Plane, the character is represented as a twelve-character sequence,
      92                 :             :     //    encoding the UTF-16 surrogate pair.  So, for example, a string
      93                 :             :     //    containing only the G clef character (U+1D11E) may be represented as
      94                 :             :     //    "\uD834\uDD1E".
      95                 :             :     //
      96                 :             :     //  Two subsequent \u.... may have to be replaced with one actual codepoint.
      97                 :             :     unsigned int surpair{0}; // First half of open UTF-16 surrogate pair, or 0
      98                 :             : 
      99                 :      356544 :     void append_codepoint(unsigned int codepoint_)
     100                 :             :     {
     101         [ +  + ]:      356544 :         if (codepoint_ <= 0x7f)
     102                 :       15756 :             str.push_back((char)codepoint_);
     103         [ +  + ]:      340788 :         else if (codepoint_ <= 0x7FF) {
     104                 :      132779 :             str.push_back((char)(0xC0 | (codepoint_ >> 6)));
     105                 :      132779 :             str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
     106         [ +  + ]:      208009 :         } else if (codepoint_ <= 0xFFFF) {
     107                 :       24359 :             str.push_back((char)(0xE0 | (codepoint_ >> 12)));
     108                 :       24359 :             str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F)));
     109                 :       24359 :             str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
     110         [ +  - ]:      183650 :         } else if (codepoint_ <= 0x1FFFFF) {
     111                 :      183650 :             str.push_back((char)(0xF0 | (codepoint_ >> 18)));
     112                 :      183650 :             str.push_back((char)(0x80 | ((codepoint_ >> 12) & 0x3F)));
     113                 :      183650 :             str.push_back((char)(0x80 | ((codepoint_ >> 6) & 0x3F)));
     114                 :      183650 :             str.push_back((char)(0x80 | (codepoint_ & 0x3F)));
     115                 :             :         }
     116                 :      356544 :     }
     117                 :             : };
     118                 :             : 
     119                 :             : #endif // BITCOIN_UNIVALUE_INCLUDE_UNIVALUE_UTFFILTER_H
        

Generated by: LCOV version 2.0-1