// Origin: glogg/tests/encodingspeculatorTest.cpp (revision 0faa4758001e16aefc549004c6565cc7bab02cb2)
#include <cstdint>
#include <utility>
#include <vector>

#include "gmock/gmock.h"

#include "config.h"

#include "log.h"

#include "encodingspeculator.h"

using namespace std;
using namespace testing;
11702af59eSNicolas Bonnefon 
12702af59eSNicolas Bonnefon class EncodingSpeculatorBehaviour: public testing::Test {
13702af59eSNicolas Bonnefon   public:
14702af59eSNicolas Bonnefon     EncodingSpeculator speculator;
15702af59eSNicolas Bonnefon 
EncodingSpeculatorBehaviour()16702af59eSNicolas Bonnefon     EncodingSpeculatorBehaviour() {
17702af59eSNicolas Bonnefon     }
18702af59eSNicolas Bonnefon };
19702af59eSNicolas Bonnefon 
// Before any byte has been injected, the expected guess is 7-bit ASCII.
TEST_F( EncodingSpeculatorBehaviour, DefaultAsPureAscii ) {
    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII7 ) );
}
23702af59eSNicolas Bonnefon 
// Injecting only bytes with the high bit clear must leave the guess at
// 7-bit ASCII.
// NOTE(review): the loop covers 0x00..0x7E; 0x7F (DEL) is not exercised.
TEST_F( EncodingSpeculatorBehaviour, RecognisePureAscii ) {
    for ( uint8_t byte = 0; byte < 127; ++byte ) {
        speculator.inject_byte( byte );
    }

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII7 ) );
}
30702af59eSNicolas Bonnefon 
// A run of 7-bit bytes followed by a lone 0xFF (which is not a valid UTF-8
// lead byte) is expected to be reported as an 8-bit encoding (ASCII8).
TEST_F( EncodingSpeculatorBehaviour, RecogniseRandom8bitEncoding ) {
    for ( uint8_t byte = 0; byte < 127; ++byte ) {
        speculator.inject_byte( byte );
    }
    speculator.inject_byte( 0xFF );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}
38702af59eSNicolas Bonnefon 
// Encodes a code point as a 2-byte UTF-8 sequence.
//
// code_point: intended range is 0x80..0x7FF (11 significant bits). Only the
//             low 11 bits are used; higher bits are discarded so that the
//             lead byte always keeps its 110xxxxx marker.
// Returns { lead byte (110xxxxx), continuation byte (10xxxxxx) }.
std::pair<uint8_t, uint8_t> utf8encode2bytes( uint16_t code_point )
{
    // Lead byte carries bits 10..6 of the code point.
    const uint8_t first_byte =
        static_cast<uint8_t>( 0xC0 | ( ( code_point >> 6 ) & 0x1F ) );
    // Continuation byte carries bits 5..0.
    const uint8_t second_byte =
        static_cast<uint8_t>( 0x80 | ( code_point & 0x3F ) );

    return { first_byte, second_byte };
}
48702af59eSNicolas Bonnefon 
// Encodes a code point as a 3- or 4-byte UTF-8 sequence.
//
// code_point: values up to 0xFFFF produce 3 bytes (1110xxxx + 2 continuation
//             bytes); values up to 0x1FFFFF produce 4 bytes (11110xxx + 3
//             continuation bytes). No minimum is enforced, so small values
//             yield overlong sequences.
// Returns the encoded bytes, or an empty vector when code_point > 0x1FFFFF.
std::vector<uint8_t> utf8encodeMultiBytes( uint32_t code_point )
{
    // Builds a 10xxxxxx continuation byte from the six bits of code_point
    // that start at bit position `shift`.
    const auto continuation = [code_point]( unsigned shift ) {
        return static_cast<uint8_t>( 0x80 | ( ( code_point >> shift ) & 0x3F ) );
    };

    std::vector<uint8_t> bytes;

    if ( code_point <= 0xFFFF ) {
        bytes.reserve( 3 );
        bytes.push_back( static_cast<uint8_t>( 0xE0 | ( ( code_point >> 12 ) & 0x0F ) ) );
        bytes.push_back( continuation( 6 ) );
        bytes.push_back( continuation( 0 ) );
    }
    else if ( code_point <= 0x1FFFFF ) {
        bytes.reserve( 4 );
        bytes.push_back( static_cast<uint8_t>( 0xF0 | ( ( code_point >> 18 ) & 0x07 ) ) );
        bytes.push_back( continuation( 12 ) );
        bytes.push_back( continuation( 6 ) );
        bytes.push_back( continuation( 0 ) );
    }

    return bytes;
}
69702af59eSNicolas Bonnefon 
70702af59eSNicolas Bonnefon 
// Injects the 2-byte UTF-8 encoding of every code point in 0x80..0x7FF
// back to back and expects the stream to be recognised as UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseTwoBytesUTF8 ) {
    // All the code points encodable as 2 bytes.
    for ( uint16_t code_point = 0x80; code_point < ( 1 << 11 ); ++code_point ) {
        const auto utf8_bytes = utf8encode2bytes( code_point );

        speculator.inject_byte( utf8_bytes.first );
        speculator.inject_byte( utf8_bytes.second );
    }

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) );
}
84702af59eSNicolas Bonnefon 
// Same as the plain 2-byte case, but with a 7-bit space character injected
// before every 2-byte sequence; the guess is still expected to be UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseTwoBytesUTF8With7bitsInterleaved ) {
    // All the code points encodable as 2 bytes.
    for ( uint16_t code_point = 0x80; code_point < ( 1 << 11 ); ++code_point ) {
        const auto utf8_bytes = utf8encode2bytes( code_point );

        speculator.inject_byte( ' ' );
        speculator.inject_byte( utf8_bytes.first );
        speculator.inject_byte( utf8_bytes.second );
    }

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) );
}
97702af59eSNicolas Bonnefon 
// A stream of valid 2-byte sequences that ends with a lead byte missing its
// continuation byte is expected to be reported as ASCII8, not UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseIncompleteTwoBytesUTF8 ) {
    // All the code points encodable as 2 bytes.
    for ( uint16_t code_point = 0x80; code_point < ( 1 << 11 ); ++code_point ) {
        const auto utf8_bytes = utf8encode2bytes( code_point );

        speculator.inject_byte( ' ' );
        speculator.inject_byte( utf8_bytes.first );
        speculator.inject_byte( utf8_bytes.second );
    }

    // Lead byte only
    speculator.inject_byte( 0xCF );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}
113702af59eSNicolas Bonnefon 
// A stream of valid 2-byte sequences followed by a lead byte and a byte that
// is not a continuation byte is expected to be reported as ASCII8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseIncorrectTwoBytesUTF8 ) {
    // All the code points encodable as 2 bytes.
    for ( uint16_t code_point = 0x80; code_point < ( 1 << 11 ); ++code_point ) {
        const auto utf8_bytes = utf8encode2bytes( code_point );

        speculator.inject_byte( ' ' );
        speculator.inject_byte( utf8_bytes.first );
        speculator.inject_byte( utf8_bytes.second );
    }

    // Lead byte
    speculator.inject_byte( 0xCF );
    // Incorrect continuation byte (a continuation byte must have its top bit set)
    speculator.inject_byte( 0x00 );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}
131702af59eSNicolas Bonnefon 
// C1 BF is a 2-byte sequence whose payload is 0x7F, a value that fits in a
// single byte (overlong form); the expected guess is ASCII8, not UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong2BytesUTF8 ) {
    speculator.inject_byte( ' ' );
    speculator.inject_byte( 0xC1 );
    speculator.inject_byte( 0xBF );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}
139702af59eSNicolas Bonnefon 
// Injects the 3-byte UTF-8 encoding of every value in 0x800..0xFFFF, each
// preceded by a 7-bit space, and expects the stream to be recognised as UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseThreeBytesUTF8 ) {
    for ( uint32_t code_point = 0x800; code_point <= 0xFFFF; ++code_point ) {
        const auto utf8_bytes = utf8encodeMultiBytes( code_point );

        speculator.inject_byte( ' ' );
        for ( uint8_t byte : utf8_bytes ) {
            speculator.inject_byte( byte );
        }
    }

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) );
}
153702af59eSNicolas Bonnefon 
// E0 80 80 is a 3-byte sequence (lead byte 1110xxxx) whose payload is 0, a
// value far below the 0x800 minimum for 3 bytes (overlong form); the expected
// guess is ASCII8, not UTF-8.
// The lead byte must be 0xE0 here: a byte such as 0xA0 is a bare continuation
// byte and would not exercise the 3-byte overlong path at all.
TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong3BytesUTF8 ) {
    speculator.inject_byte( ' ' );
    speculator.inject_byte( 0xE0 );
    speculator.inject_byte( 0x80 );
    speculator.inject_byte( 0x80 );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}
162702af59eSNicolas Bonnefon 
// Injects the 4-byte UTF-8 encoding of every value in 0x10000..0x1FFFFF (the
// full 21-bit payload range of a 4-byte sequence), each preceded by a 7-bit
// space, and expects the stream to be recognised as UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseFourBytesUTF8 ) {
    for ( uint32_t code_point = 0x10000; code_point <= 0x1FFFFF; ++code_point ) {
        const auto utf8_bytes = utf8encodeMultiBytes( code_point );

        speculator.inject_byte( ' ' );
        for ( uint8_t byte : utf8_bytes ) {
            speculator.inject_byte( byte );
        }
    }

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF8 ) );
}
176702af59eSNicolas Bonnefon 
// F0 80 80 80 is a 4-byte sequence whose payload is 0, a value far below the
// 0x10000 minimum for 4 bytes (overlong form); the expected guess is ASCII8,
// not UTF-8.
TEST_F( EncodingSpeculatorBehaviour, RecogniseOverlong4BytesUTF8 ) {
    speculator.inject_byte( ' ' );
    speculator.inject_byte( 0xF0 );
    speculator.inject_byte( 0x80 );
    speculator.inject_byte( 0x80 );
    speculator.inject_byte( 0x80 );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}
186702af59eSNicolas Bonnefon 
// A stream that starts with the bytes FF FE (the UTF-16 little-endian
// byte-order mark) is expected to be reported as UTF16LE.
TEST_F( EncodingSpeculatorBehaviour, RecogniseUTF16LEBOM ) {
    speculator.inject_byte( 0xFF );
    speculator.inject_byte( 0xFE );
    speculator.inject_byte( 0x10 );
    speculator.inject_byte( 0x10 );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF16LE ) );
}
195*0faa4758SNicolas Bonnefon 
// A stream that starts with the bytes FE FF (the UTF-16 big-endian
// byte-order mark) is expected to be reported as UTF16BE.
TEST_F( EncodingSpeculatorBehaviour, RecogniseUTF16BEBOM ) {
    speculator.inject_byte( 0xFE );
    speculator.inject_byte( 0xFF );
    speculator.inject_byte( 0x10 );
    speculator.inject_byte( 0x10 );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::UTF16BE ) );
}
204*0faa4758SNicolas Bonnefon 
// The bytes FE FF appearing after other data (i.e. not at the very start of
// the stream) must not be taken as a byte-order mark: the expected guess is
// ASCII8, not UTF-16.
TEST_F( EncodingSpeculatorBehaviour, IgnoreNonInitialBOM ) {
    speculator.inject_byte( 0x10 );
    speculator.inject_byte( 0x10 );
    speculator.inject_byte( 0xFE );
    speculator.inject_byte( 0xFF );

    ASSERT_THAT( speculator.guess(), Eq( EncodingSpeculator::Encoding::ASCII8 ) );
}
213