1 #include "gmock/gmock.h" 2 3 #include "config.h" 4 5 #include "log.h" 6 7 #include "encodingspeculator.h" 8 9 using namespace std; 10 using namespace testing; 11 12 class EncodingSpeculatorBehaviour: public testing::Test { 13 public: 14 EncodingSpeculator speculator; 15 16 EncodingSpeculatorBehaviour() { 17 } 18 }; 19 20 TEST_F( EncodingSpeculatorBehaviour, DefaultAsPureAscii ) { 21 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII7 ) ); 22 } 23 24 TEST_F( EncodingSpeculatorBehaviour, RecognisePureAscii ) { 25 for ( uint8_t i = 0; i < 127; ++i ) 26 speculator.inject_byte( i ); 27 28 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII7 ) ); 29 } 30 31 TEST_F( EncodingSpeculatorBehaviour, RecogniseRandom8bitEncoding ) { 32 for ( uint8_t i = 0; i < 127; ++i ) 33 speculator.inject_byte( i ); 34 speculator.inject_byte( 0xFF ); 35 36 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) ); 37 } 38 39 pair<uint8_t,uint8_t> utf8encode2bytes( uint16_t code_point ) 40 { 41 uint8_t cp_low = static_cast<uint8_t>(code_point & 0xFF); 42 uint8_t cp_hi = static_cast<uint8_t>((code_point & 0xFF00) >> 8); 43 uint8_t first_byte = 0xC0 | ( ( cp_hi & 0x7F ) << 2 ) | ( ( cp_low & 0xC0 ) >> 6 ); 44 uint8_t second_byte = 0x80 | ( cp_low & 0x3F ); 45 46 return { first_byte, second_byte }; 47 } 48 49 vector<uint8_t> utf8encodeMultiBytes( uint32_t code_point ) 50 { 51 vector<uint8_t> bytes = {}; 52 53 if ( code_point <= 0xFFFF ) { 54 uint8_t lead = static_cast<uint8_t>( 0xE0 | ( ( code_point & 0xF000 ) >> 12 ) ); 55 bytes.push_back( lead ); 56 bytes.push_back( 0x80 | ( code_point & 0x0FC0 ) >> 6 ); 57 bytes.push_back( 0x80 | ( code_point & 0x3F ) ); 58 } 59 else if ( code_point <= 0x1FFFFF ) { 60 uint8_t lead = static_cast<uint8_t>( 0xF0 | ( ( code_point & 0x1C0000 ) >> 18 ) ); 61 bytes.push_back( lead ); 62 bytes.push_back( 0x80 | ( code_point & 0x3F000 ) >> 12 ); 63 bytes.push_back( 0x80 | ( code_point & 0x00FC0 ) >> 6 ); 64 bytes.push_back( 0x80 | ( code_point & 0x0003F ) ); 65 } 66 67 return bytes; 68 } 69 70 71 TEST_F( EncodingSpeculatorBehaviour, RecogniseTwoBytesUTF8 ) { 72 // All the code points encodable as 2 bytes. 73 for ( uint16_t i = 0x80; i < ( 1 << 11 ); ++i ) { 74 auto utf8_bytes = utf8encode2bytes( i ); 75 76 // cout << bitset<8>(first_byte) << " " << bitset<8>(second_byte) << endl; 77 78 speculator.inject_byte( utf8_bytes.first ); 79 speculator.inject_byte( utf8_bytes.second ); 80 } 81 82 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) ); 83 } 84 85 TEST_F( EncodingSpeculatorBehaviour, RecogniseTwoBytesUTF8With7bitsInterleaved ) { 86 // All the code points encodable as 2 bytes. 87 for ( uint16_t i = 0x80; i < ( 1 << 11 ); ++i ) { 88 auto utf8_bytes = utf8encode2bytes( i ); 89 90 speculator.inject_byte( ' ' ); 91 speculator.inject_byte( utf8_bytes.first ); 92 speculator.inject_byte( utf8_bytes.second ); 93 } 94 95 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) ); 96 } 97 98 TEST_F( EncodingSpeculatorBehaviour, RecogniseIncompleteTwoBytesUTF8 ) { 99 // All the code points encodable as 2 bytes. 100 for ( uint16_t i = 0x80; i < ( 1 << 11 ); ++i ) { 101 auto utf8_bytes = utf8encode2bytes( i ); 102 103 speculator.inject_byte( ' ' ); 104 speculator.inject_byte( utf8_bytes.first ); 105 speculator.inject_byte( utf8_bytes.second ); 106 } 107 108 // Lead byte only 109 speculator.inject_byte( 0xCF ); 110 111 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) ); 112 } 113 114 TEST_F( EncodingSpeculatorBehaviour, RecogniseIncorrectTwoBytesUTF8 ) { 115 // All the code points encodable as 2 bytes. 116 for ( uint16_t i = 0x80; i < ( 1 << 11 ); ++i ) { 117 auto utf8_bytes = utf8encode2bytes( i ); 118 119 speculator.inject_byte( ' ' ); 120 speculator.inject_byte( utf8_bytes.first ); 121 speculator.inject_byte( utf8_bytes.second ); 122 } 123 124 // Lead byte 125 speculator.inject_byte( 0xCF ); 126 // Incorrect continuation byte (should start with 1) 127 speculator.inject_byte( 0x00 ); 128 129 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) ); 130 } 131 132 TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong2BytesUTF8 ) { 133 speculator.inject_byte( ' ' ); 134 speculator.inject_byte( 0xC1 ); 135 speculator.inject_byte( 0xBF ); 136 137 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) ); 138 } 139 140 TEST_F( EncodingSpeculatorBehaviour, RecogniseThreeBytesUTF8 ) { 141 for ( uint32_t i = 0x800; i <= 0xFFFF; ++i ) { 142 auto utf8_bytes = utf8encodeMultiBytes( i ); 143 144 speculator.inject_byte( ' ' ); 145 for ( uint8_t byte: utf8_bytes ) { 146 // cout << hex << i << " " << static_cast<uint32_t>( byte ) << endl; 147 speculator.inject_byte( byte ); 148 } 149 } 150 151 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) ); 152 } 153 154 TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong3BytesUTF8 ) { 155 speculator.inject_byte( ' ' ); 156 speculator.inject_byte( 0xA0 ); 157 speculator.inject_byte( 0x80 ); 158 speculator.inject_byte( 0x80 ); 159 160 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) ); 161 } 162 163 TEST_F( EncodingSpeculatorBehaviour, RecogniseFourBytesUTF8 ) { 164 for ( uint32_t i = 0x10000; i <= 0x1FFFFF; ++i ) { 165 auto utf8_bytes = utf8encodeMultiBytes( i ); 166 167 speculator.inject_byte( ' ' ); 168 for ( uint8_t byte: utf8_bytes ) { 169 // cout << hex << i << " " << static_cast<uint32_t>( byte ) << endl; 170 speculator.inject_byte( byte ); 171 } 172 } 173 174 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) ); 175 } 176 177 TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong4BytesUTF8 ) { 178 speculator.inject_byte( ' ' ); 179 speculator.inject_byte( 0xF0 ); 180 speculator.inject_byte( 0x80 ); 181 speculator.inject_byte( 0x80 ); 182 speculator.inject_byte( 0x80 ); 183 184 ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) ); 185 } 186 187