1 #include "gmock/gmock.h"
2
3 #include "config.h"
4
5 #include "log.h"
6
7 #include "encodingspeculator.h"
8
9 using namespace std;
10 using namespace testing;
11
12 class EncodingSpeculatorBehaviour: public testing::Test {
13 public:
14 EncodingSpeculator speculator;
15
EncodingSpeculatorBehaviour()16 EncodingSpeculatorBehaviour() {
17 }
18 };
19
// With no byte injected at all, the guess is expected to be 7-bit ASCII.
TEST_F( EncodingSpeculatorBehaviour, DefaultAsPureAscii ) {
    const auto expected = EncodingSpeculator::Encoding::ASCII7;

    ASSERT_THAT( speculator.guess(), Eq( expected ) );
}
23
// Feeding every 7-bit value must leave the guess at 7-bit ASCII.
TEST_F( EncodingSpeculatorBehaviour, RecognisePureAscii ) {
    // The bound is inclusive so that 0x7F (DEL), the last 7-bit value, is
    // covered as well. A uint8_t counter cannot overflow here: the loop
    // exits when it reaches 0x80.
    for ( uint8_t byte = 0; byte <= 0x7F; ++byte )
        speculator.inject_byte( byte );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII7 ) );
}
30
// A single byte with the high bit set (0xFF), arriving after plain 7-bit
// bytes, is expected to switch the guess to a generic 8-bit encoding.
TEST_F( EncodingSpeculatorBehaviour, RecogniseRandom8bitEncoding ) {
    // Same 7-bit preamble as the pure ASCII case: values 0 to 126.
    uint8_t ascii_byte = 0;
    while ( ascii_byte < 127 ) {
        speculator.inject_byte( ascii_byte );
        ++ascii_byte;
    }

    speculator.inject_byte( 0xFF );

    const auto expected = EncodingSpeculator::Encoding::ASCII8;
    ASSERT_THAT( speculator.guess(), Eq( expected ) );
}
38
// Encodes code_point as a two-byte UTF-8 sequence and returns it as
// { lead byte, continuation byte }.
//
// The result is a well-formed two-byte sequence only for code points in
// 0x80..0x7FF. No range check is performed: for larger values the extra
// high bits spill into the lead byte (bits 6..11 of the code point are
// OR-ed into 0xC0), and for smaller values the result is overlong.
std::pair<uint8_t,uint8_t> utf8encode2bytes( uint16_t code_point )
{
    // 110xxxxx: the bits of the code point above the lowest six.
    const uint8_t lead = static_cast<uint8_t>( 0xC0 | ( ( code_point >> 6 ) & 0x3F ) );
    // 10xxxxxx: the lowest six bits.
    const uint8_t continuation = static_cast<uint8_t>( 0x80 | ( code_point & 0x3F ) );

    return { lead, continuation };
}
48
// Encodes code_point as a three- or four-byte UTF-8 sequence.
//
// - code_point <= 0xFFFF              -> three bytes (1110xxxx 10xxxxxx 10xxxxxx)
// - 0xFFFF < code_point <= 0x1FFFFF   -> four bytes  (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
// - anything larger                   -> empty vector
//
// There is no lower bound check: values below 0x800 still come out as a
// three-byte sequence, which is an overlong encoding.
std::vector<uint8_t> utf8encodeMultiBytes( uint32_t code_point )
{
    // Builds the continuation byte (10xxxxxx) carrying the six bits of
    // code_point that start at bit position `shift`.
    const auto continuation = [code_point]( unsigned shift ) {
        return static_cast<uint8_t>( 0x80 | ( ( code_point >> shift ) & 0x3F ) );
    };

    if ( code_point > 0x1FFFFF )
        return {};

    if ( code_point > 0xFFFF ) {
        const uint8_t lead = static_cast<uint8_t>( 0xF0 | ( ( code_point >> 18 ) & 0x07 ) );
        return { lead, continuation( 12 ), continuation( 6 ), continuation( 0 ) };
    }

    const uint8_t lead = static_cast<uint8_t>( 0xE0 | ( ( code_point >> 12 ) & 0x0F ) );
    return { lead, continuation( 6 ), continuation( 0 ) };
}
69
70
// A stream made only of two-byte UTF-8 sequences is expected to be
// recognised as UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseTwoBytesUTF8 ) {
    // Walk through every code point that needs exactly two bytes.
    for ( uint16_t code_point = 0x80; code_point <= 0x7FF; ++code_point ) {
        const auto encoded = utf8encode2bytes( code_point );

        speculator.inject_byte( encoded.first );
        speculator.inject_byte( encoded.second );
    }

    const auto expected = EncodingSpeculator::Encoding::UTF8;
    ASSERT_THAT( speculator.guess(), Eq( expected ) );
}
84
// Two-byte UTF-8 sequences separated by plain 7-bit characters are still
// expected to be recognised as UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseTwoBytesUTF8With7bitsInterleaved ) {
    // Every code point that needs exactly two bytes, each preceded by a space.
    for ( uint16_t code_point = 0x80; code_point <= 0x7FF; ++code_point ) {
        const auto encoded = utf8encode2bytes( code_point );

        speculator.inject_byte( ' ' );
        speculator.inject_byte( encoded.first );
        speculator.inject_byte( encoded.second );
    }

    const auto expected = EncodingSpeculator::Encoding::UTF8;
    ASSERT_THAT( speculator.guess(), Eq( expected ) );
}
97
// Valid two-byte UTF-8 followed by a lead byte with no continuation: the
// stream ends mid-sequence, so the expected guess is a generic 8-bit
// encoding rather than UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseIncompleteTwoBytesUTF8 ) {
    // Start with well-formed content: every two-byte code point, each
    // preceded by a space.
    for ( uint16_t code_point = 0x80; code_point <= 0x7FF; ++code_point ) {
        const auto encoded = utf8encode2bytes( code_point );

        speculator.inject_byte( ' ' );
        speculator.inject_byte( encoded.first );
        speculator.inject_byte( encoded.second );
    }

    // A two-byte lead byte, deliberately left without its continuation.
    const uint8_t dangling_lead = 0xCF;
    speculator.inject_byte( dangling_lead );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}
113
// Valid two-byte UTF-8 followed by a lead byte and a byte that is not a
// continuation byte: the expected guess is a generic 8-bit encoding.
TEST_F( EncodingSpeculatorBehaviour, RecogniseIncorrectTwoBytesUTF8 ) {
    // Start with well-formed content: every two-byte code point, each
    // preceded by a space.
    for ( uint16_t code_point = 0x80; code_point <= 0x7FF; ++code_point ) {
        const auto encoded = utf8encode2bytes( code_point );

        speculator.inject_byte( ' ' );
        speculator.inject_byte( encoded.first );
        speculator.inject_byte( encoded.second );
    }

    // A two-byte lead byte...
    const uint8_t lead = 0xCF;
    // ...followed by a byte whose top bit is clear, whereas a continuation
    // byte has the form 10xxxxxx.
    const uint8_t not_a_continuation = 0x00;

    speculator.inject_byte( lead );
    speculator.inject_byte( not_a_continuation );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}
131
// 0xC1 0xBF is a two-byte form of a value that fits in 7 bits (an overlong
// encoding); the expected guess is a generic 8-bit encoding, not UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong2BytesUTF8 ) {
    const uint8_t sequence[] = { ' ', 0xC1, 0xBF };

    for ( uint8_t byte : sequence )
        speculator.inject_byte( byte );

    const auto expected = EncodingSpeculator::Encoding::ASCII8;
    ASSERT_THAT( speculator.guess(), Eq( expected ) );
}
139
// A stream of three-byte UTF-8 sequences, each preceded by a space, is
// expected to be recognised as UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseThreeBytesUTF8 ) {
    for ( uint32_t code_point = 0x800; code_point <= 0xFFFF; ++code_point ) {
        // U+D800..U+DFFF are reserved for UTF-16 surrogates and are not
        // well-formed when encoded in UTF-8 (RFC 3629), so they are left
        // out: this test should only assert on valid input.
        if ( code_point >= 0xD800 && code_point <= 0xDFFF )
            continue;

        const auto utf8_bytes = utf8encodeMultiBytes( code_point );

        speculator.inject_byte( ' ' );
        for ( uint8_t byte : utf8_bytes )
            speculator.inject_byte( byte );
    }

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) );
}
153
// 0xE0 0x80 0x80 is a three-byte form of code point 0, which fits in a
// single byte (an overlong encoding); the expected guess is a generic
// 8-bit encoding, not UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong3BytesUTF8 ) {
    speculator.inject_byte( ' ' );
    // The lead byte must be 1110xxxx (0xE0) for this to be a three-byte
    // sequence at all. The 0xA0 previously used here is a continuation
    // pattern (10xxxxxx), so the overlong case was never exercised.
    speculator.inject_byte( 0xE0 );
    speculator.inject_byte( 0x80 );
    speculator.inject_byte( 0x80 );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}

// Keeps the byte sequence the overlong test used to inject: continuation
// bytes (10xxxxxx) with no lead byte before them. The expected guess is a
// generic 8-bit encoding.
TEST_F( EncodingSpeculatorBehaviour, RecogniseStrayContinuationBytes ) {
    speculator.inject_byte( ' ' );
    speculator.inject_byte( 0xA0 );
    speculator.inject_byte( 0x80 );
    speculator.inject_byte( 0x80 );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}
162
// A stream of four-byte UTF-8 sequences, each preceded by a space, is
// expected to be recognised as UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseFourBytesUTF8 ) {
    // U+10FFFF is the last code point UTF-8 is allowed to encode (RFC 3629).
    // Going up to 0x1FFFFF, the limit of the 21-bit four-byte pattern, would
    // feed ill-formed sequences while still expecting a UTF-8 verdict.
    const uint32_t last_code_point = 0x10FFFF;

    for ( uint32_t code_point = 0x10000; code_point <= last_code_point; ++code_point ) {
        const auto utf8_bytes = utf8encodeMultiBytes( code_point );

        speculator.inject_byte( ' ' );
        for ( uint8_t byte : utf8_bytes )
            speculator.inject_byte( byte );
    }

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) );
}
176
// 0xF0 0x80 0x80 0x80 is a four-byte form of code point 0 (an overlong
// encoding); the expected guess is a generic 8-bit encoding, not UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong4BytesUTF8 ) {
    const uint8_t sequence[] = { ' ', 0xF0, 0x80, 0x80, 0x80 };

    for ( uint8_t byte : sequence )
        speculator.inject_byte( byte );

    const auto expected = EncodingSpeculator::Encoding::ASCII8;
    ASSERT_THAT( speculator.guess(), Eq( expected ) );
}
186
// A stream that starts with FF FE (the little-endian byte order mark) is
// expected to be guessed as UTF-16LE.
TEST_F( EncodingSpeculatorBehaviour, RecogniseUTF16LEBOM ) {
    // Byte order mark first, then two arbitrary payload bytes.
    const uint8_t stream[] = { 0xFF, 0xFE, 0x10, 0x10 };

    for ( uint8_t byte : stream )
        speculator.inject_byte( byte );

    const auto expected = EncodingSpeculator::Encoding::UTF16LE;
    ASSERT_THAT( speculator.guess(), Eq( expected ) );
}
195
// A stream that starts with FE FF (the big-endian byte order mark) is
// expected to be guessed as UTF-16BE.
TEST_F( EncodingSpeculatorBehaviour, RecogniseUTF16BEBOM ) {
    // Byte order mark first, then two arbitrary payload bytes.
    const uint8_t stream[] = { 0xFE, 0xFF, 0x10, 0x10 };

    for ( uint8_t byte : stream )
        speculator.inject_byte( byte );

    const auto expected = EncodingSpeculator::Encoding::UTF16BE;
    ASSERT_THAT( speculator.guess(), Eq( expected ) );
}
204
// The bytes FE FF only count as a byte order mark at the very start of the
// stream: when they come after other bytes, the expected guess is a generic
// 8-bit encoding.
TEST_F( EncodingSpeculatorBehaviour, IgnoreNonInitialBOM ) {
    // Two ordinary bytes first, then the would-be byte order mark.
    const uint8_t stream[] = { 0x10, 0x10, 0xFE, 0xFF };

    for ( uint8_t byte : stream )
        speculator.inject_byte( byte );

    const auto expected = EncodingSpeculator::Encoding::ASCII8;
    ASSERT_THAT( speculator.guess(), Eq( expected ) );
}
213