xref: /glogg/tests/encodingspeculatorTest.cpp (revision 2f6fa4628fc30ec8c0de04db338125ee1e7f7ae8)
1 #include "gmock/gmock.h"
2 
3 #include "config.h"
4 
5 #include "log.h"
6 
7 #include "encodingspeculator.h"
8 
9 using namespace std;
10 using namespace testing;
11 
12 class EncodingSpeculatorBehaviour: public testing::Test {
13   public:
14     EncodingSpeculator speculator;
15 
16     EncodingSpeculatorBehaviour() {
17     }
18 };
19 
20 TEST_F( EncodingSpeculatorBehaviour, DefaultAsPureAscii ) {
21     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII7 ) );
22 }
23 
24 TEST_F( EncodingSpeculatorBehaviour, RecognisePureAscii ) {
25     for ( uint8_t i = 0; i < 127; ++i )
26         speculator.inject_byte( i );
27 
28     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII7 ) );
29 }
30 
31 TEST_F( EncodingSpeculatorBehaviour, RecogniseRandom8bitEncoding ) {
32     for ( uint8_t i = 0; i < 127; ++i )
33         speculator.inject_byte( i );
34     speculator.inject_byte( 0xFF );
35 
36     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
37 }
38 
// Encode `code_point` as a two-byte UTF-8 sequence, returned as
// { lead byte, continuation byte }.
//
// Layout is 110xxxxx 10yyyyyy: yyyyyy are the six low bits of the code point
// and xxxxx the bits just above them. The result is only a proper two-byte
// sequence for code points in 0x80..0x7FF (the range used by the tests in
// this file); for larger values, higher bits spill into the lead byte.
std::pair<uint8_t,uint8_t> utf8encode2bytes( uint16_t code_point )
{
    // Everything above the six low bits goes into the lead byte; the cast
    // deliberately keeps only the low eight bits of the combined value.
    const auto lead_byte =
        static_cast<uint8_t>( 0xC0 | ( code_point >> 6 ) );
    const auto continuation_byte =
        static_cast<uint8_t>( 0x80 | ( code_point & 0x3F ) );

    return std::pair<uint8_t,uint8_t>( lead_byte, continuation_byte );
}
48 
// Encode `code_point` as a three- or four-byte UTF-8 sequence.
//
//  - code_point <= 0xFFFF   : 1110xxxx 10xxxxxx 10xxxxxx
//  - code_point <= 0x1FFFFF : 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
//  - anything larger        : an empty vector
//
// No lower bound is enforced: a value below 0x800 still yields a three-byte
// sequence.
std::vector<uint8_t> utf8encodeMultiBytes( uint32_t code_point )
{
    // Continuation byte (10xxxxxx) carrying the six payload bits that start
    // at bit position `shift` of the code point.
    const auto continuation = [code_point]( unsigned shift ) {
        return static_cast<uint8_t>( 0x80 | ( ( code_point >> shift ) & 0x3F ) );
    };

    if ( code_point <= 0xFFFF ) {
        // At most four bits remain above bit 12, so no mask is needed.
        return { static_cast<uint8_t>( 0xE0 | ( code_point >> 12 ) ),
                 continuation( 6 ),
                 continuation( 0 ) };
    }

    if ( code_point <= 0x1FFFFF ) {
        // At most three bits remain above bit 18, so no mask is needed.
        return { static_cast<uint8_t>( 0xF0 | ( code_point >> 18 ) ),
                 continuation( 12 ),
                 continuation( 6 ),
                 continuation( 0 ) };
    }

    return {};
}
69 
70 
71 TEST_F( EncodingSpeculatorBehaviour, RecogniseTwoBytesUTF8 ) {
72     // All the code points encodable as 2 bytes.
73     for ( uint16_t i = 0x80; i < ( 1 << 11 ); ++i ) {
74         auto utf8_bytes = utf8encode2bytes( i );
75 
76         // cout << bitset<8>(first_byte) << " " << bitset<8>(second_byte) << endl;
77 
78         speculator.inject_byte( utf8_bytes.first );
79         speculator.inject_byte( utf8_bytes.second );
80     }
81 
82     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) );
83 }
84 
85 TEST_F( EncodingSpeculatorBehaviour, RecogniseTwoBytesUTF8With7bitsInterleaved ) {
86     // All the code points encodable as 2 bytes.
87     for ( uint16_t i = 0x80; i < ( 1 << 11 ); ++i ) {
88         auto utf8_bytes = utf8encode2bytes( i );
89 
90         speculator.inject_byte( ' ' );
91         speculator.inject_byte( utf8_bytes.first );
92         speculator.inject_byte( utf8_bytes.second );
93     }
94 
95     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) );
96 }
97 
98 TEST_F( EncodingSpeculatorBehaviour, RecogniseIncompleteTwoBytesUTF8 ) {
99     // All the code points encodable as 2 bytes.
100     for ( uint16_t i = 0x80; i < ( 1 << 11 ); ++i ) {
101         auto utf8_bytes = utf8encode2bytes( i );
102 
103         speculator.inject_byte( ' ' );
104         speculator.inject_byte( utf8_bytes.first );
105         speculator.inject_byte( utf8_bytes.second );
106     }
107 
108     // Lead byte only
109     speculator.inject_byte( 0xCF );
110 
111     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
112 }
113 
114 TEST_F( EncodingSpeculatorBehaviour, RecogniseIncorrectTwoBytesUTF8 ) {
115     // All the code points encodable as 2 bytes.
116     for ( uint16_t i = 0x80; i < ( 1 << 11 ); ++i ) {
117         auto utf8_bytes = utf8encode2bytes( i );
118 
119         speculator.inject_byte( ' ' );
120         speculator.inject_byte( utf8_bytes.first );
121         speculator.inject_byte( utf8_bytes.second );
122     }
123 
124     // Lead byte
125     speculator.inject_byte( 0xCF );
126     // Incorrect continuation byte (should start with 1)
127     speculator.inject_byte( 0x00 );
128 
129     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
130 }
131 
132 TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong2BytesUTF8 ) {
133     speculator.inject_byte( ' ' );
134     speculator.inject_byte( 0xC1 );
135     speculator.inject_byte( 0xBF );
136 
137     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
138 }
139 
140 TEST_F( EncodingSpeculatorBehaviour, RecogniseThreeBytesUTF8 ) {
141     for ( uint32_t i = 0x800; i <= 0xFFFF; ++i ) {
142         auto utf8_bytes = utf8encodeMultiBytes( i );
143 
144         speculator.inject_byte( ' ' );
145         for ( uint8_t byte: utf8_bytes ) {
146             // cout << hex << i << " " << static_cast<uint32_t>( byte ) << endl;
147             speculator.inject_byte( byte );
148         }
149     }
150 
151     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) );
152 }
153 
154 TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong3BytesUTF8 ) {
155     speculator.inject_byte( ' ' );
156     speculator.inject_byte( 0xA0 );
157     speculator.inject_byte( 0x80 );
158     speculator.inject_byte( 0x80 );
159 
160     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
161 }
162 
163 TEST_F( EncodingSpeculatorBehaviour, RecogniseFourBytesUTF8 ) {
164     for ( uint32_t i = 0x10000; i <= 0x1FFFFF; ++i ) {
165         auto utf8_bytes = utf8encodeMultiBytes( i );
166 
167         speculator.inject_byte( ' ' );
168         for ( uint8_t byte: utf8_bytes ) {
169             // cout << hex << i << " " << static_cast<uint32_t>( byte ) << endl;
170             speculator.inject_byte( byte );
171         }
172     }
173 
174     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) );
175 }
176 
177 TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong4BytesUTF8 ) {
178     speculator.inject_byte( ' ' );
179     speculator.inject_byte( 0xF0 );
180     speculator.inject_byte( 0x80 );
181     speculator.inject_byte( 0x80 );
182     speculator.inject_byte( 0x80 );
183 
184     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
185 }
186 
187 TEST_F( EncodingSpeculatorBehaviour, RecogniseUTF16LEBOM ) {
188     speculator.inject_byte( 0xFF );
189     speculator.inject_byte( 0xFE );
190     speculator.inject_byte( 0x10 );
191     speculator.inject_byte( 0x10 );
192 
193     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF16LE ) );
194 }
195 
196 TEST_F( EncodingSpeculatorBehaviour, RecogniseUTF16BEBOM ) {
197     speculator.inject_byte( 0xFE );
198     speculator.inject_byte( 0xFF );
199     speculator.inject_byte( 0x10 );
200     speculator.inject_byte( 0x10 );
201 
202     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF16BE ) );
203 }
204 
205 TEST_F( EncodingSpeculatorBehaviour, IgnoreNonInitialBOM ) {
206     speculator.inject_byte( 0x10 );
207     speculator.inject_byte( 0x10 );
208     speculator.inject_byte( 0xFE );
209     speculator.inject_byte( 0xFF );
210 
211     ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
212 }
213