xref: /glogg/src/encodingspeculator.cpp (revision 048334c92fb4b86ebbabc7471f7313a1cc515c10)
1702af59eSNicolas Bonnefon /*
2702af59eSNicolas Bonnefon  * Copyright (C) 2016 Nicolas Bonnefon and other contributors
3702af59eSNicolas Bonnefon  *
4702af59eSNicolas Bonnefon  * This file is part of glogg.
5702af59eSNicolas Bonnefon  *
6702af59eSNicolas Bonnefon  * glogg is free software: you can redistribute it and/or modify
7702af59eSNicolas Bonnefon  * it under the terms of the GNU General Public License as published by
8702af59eSNicolas Bonnefon  * the Free Software Foundation, either version 3 of the License, or
9702af59eSNicolas Bonnefon  * (at your option) any later version.
10702af59eSNicolas Bonnefon  *
11702af59eSNicolas Bonnefon  * glogg is distributed in the hope that it will be useful,
12702af59eSNicolas Bonnefon  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13702af59eSNicolas Bonnefon  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14702af59eSNicolas Bonnefon  * GNU General Public License for more details.
15702af59eSNicolas Bonnefon  *
16702af59eSNicolas Bonnefon  * You should have received a copy of the GNU General Public License
17702af59eSNicolas Bonnefon  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18702af59eSNicolas Bonnefon  */
19702af59eSNicolas Bonnefon 
20702af59eSNicolas Bonnefon #include "encodingspeculator.h"
21702af59eSNicolas Bonnefon 
22702af59eSNicolas Bonnefon #include <iostream>
23702af59eSNicolas Bonnefon 
24702af59eSNicolas Bonnefon void EncodingSpeculator::inject_byte( uint8_t byte )
25702af59eSNicolas Bonnefon {
26702af59eSNicolas Bonnefon     if ( ! ( byte & 0x80 ) ) {
27702af59eSNicolas Bonnefon         // 7-bit character, all fine
280faa4758SNicolas Bonnefon         if ( state_ == State::Start )
290faa4758SNicolas Bonnefon             state_ = State::ASCIIOnly;
30702af59eSNicolas Bonnefon     }
31702af59eSNicolas Bonnefon     else {
32702af59eSNicolas Bonnefon         switch ( state_ ) {
330faa4758SNicolas Bonnefon             case State::Start:
340faa4758SNicolas Bonnefon                 if ( byte == 0xFE ) {
350faa4758SNicolas Bonnefon                     state_ = State::UTF16BELeadingBOMByteSeen;
360faa4758SNicolas Bonnefon                     break;
370faa4758SNicolas Bonnefon                 }
380faa4758SNicolas Bonnefon                 else if ( byte == 0xFF ) {
390faa4758SNicolas Bonnefon                     state_ = State::UTF16LELeadingBOMByteSeen;
400faa4758SNicolas Bonnefon                     break;
410faa4758SNicolas Bonnefon                 }
420faa4758SNicolas Bonnefon                 else {
430faa4758SNicolas Bonnefon                     state_ = State::ASCIIOnly;
440faa4758SNicolas Bonnefon                     // And carry on...
450faa4758SNicolas Bonnefon                 }
46702af59eSNicolas Bonnefon             case State::ASCIIOnly:
47702af59eSNicolas Bonnefon             case State::ValidUTF8:
48702af59eSNicolas Bonnefon                 if ( ( byte & 0xE0 ) == 0xC0 ) {
49702af59eSNicolas Bonnefon                     state_ = State::UTF8LeadingByteSeen;
50702af59eSNicolas Bonnefon                     code_point_ = ( byte & 0x1F ) << 6;
51702af59eSNicolas Bonnefon                     continuation_left_ = 1;
52702af59eSNicolas Bonnefon                     min_value_ = 0x80;
53702af59eSNicolas Bonnefon                     // std::cout << "Lead: cp= " << std::hex << code_point_ << std::endl;
54702af59eSNicolas Bonnefon                 }
55702af59eSNicolas Bonnefon                 else if ( ( byte & 0xF0 ) == 0xE0 ) {
56702af59eSNicolas Bonnefon                     state_ = State::UTF8LeadingByteSeen;
57702af59eSNicolas Bonnefon                     code_point_ = ( byte & 0x0F ) << 12;
58702af59eSNicolas Bonnefon                     continuation_left_ = 2;
59702af59eSNicolas Bonnefon                     min_value_ = 0x800;
60702af59eSNicolas Bonnefon                     // std::cout << "Lead 3: cp= " << std::hex << code_point_ << std::endl;
61702af59eSNicolas Bonnefon                 }
62702af59eSNicolas Bonnefon                 else if ( ( byte & 0xF8 ) == 0xF0 ) {
63702af59eSNicolas Bonnefon                     state_ = State::UTF8LeadingByteSeen;
64702af59eSNicolas Bonnefon                     code_point_ = ( byte & 0x07 ) << 18;
65702af59eSNicolas Bonnefon                     continuation_left_ = 3;
66702af59eSNicolas Bonnefon                     min_value_ = 0x800;
67702af59eSNicolas Bonnefon                     // std::cout << "Lead 4: cp= " << std::hex << code_point_ << std::endl;
68702af59eSNicolas Bonnefon                 }
69702af59eSNicolas Bonnefon                 else {
70*048334c9SSeerauber                     state_ = State::OtherOrUnknown8Bit;
71702af59eSNicolas Bonnefon                 }
72702af59eSNicolas Bonnefon                 break;
73702af59eSNicolas Bonnefon             case State::UTF8LeadingByteSeen:
74702af59eSNicolas Bonnefon                 if ( ( byte & 0xC0 ) == 0x80 ) {
75702af59eSNicolas Bonnefon                     --continuation_left_;
76702af59eSNicolas Bonnefon                     code_point_ |= ( byte & 0x3F ) << (continuation_left_ * 6);
77702af59eSNicolas Bonnefon                     // std::cout << "Cont: cp= " << std::hex << code_point_ << std::endl;
78702af59eSNicolas Bonnefon                     if ( continuation_left_ == 0 ) {
79702af59eSNicolas Bonnefon                         if ( code_point_ >= min_value_ )
80702af59eSNicolas Bonnefon                             state_ = State::ValidUTF8;
81702af59eSNicolas Bonnefon                         else
82*048334c9SSeerauber                             state_ = State::OtherOrUnknown8Bit;
83702af59eSNicolas Bonnefon                     }
84702af59eSNicolas Bonnefon                 }
85702af59eSNicolas Bonnefon                 else {
86*048334c9SSeerauber                     state_ = State::OtherOrUnknown8Bit;
87702af59eSNicolas Bonnefon                 }
88702af59eSNicolas Bonnefon                 break;
890faa4758SNicolas Bonnefon             case State::UTF16BELeadingBOMByteSeen:
900faa4758SNicolas Bonnefon                 if ( byte == 0xFF ) {
910faa4758SNicolas Bonnefon                     state_ = State::ValidUTF16BE;
92702af59eSNicolas Bonnefon                 }
930faa4758SNicolas Bonnefon                 else {
94*048334c9SSeerauber                     state_ = State::OtherOrUnknown8Bit;
950faa4758SNicolas Bonnefon                 }
960faa4758SNicolas Bonnefon                 break;
970faa4758SNicolas Bonnefon             case State::UTF16LELeadingBOMByteSeen:
980faa4758SNicolas Bonnefon                 if ( byte == 0xFE ) {
990faa4758SNicolas Bonnefon                     state_ = State::ValidUTF16LE;
1000faa4758SNicolas Bonnefon                 }
1010faa4758SNicolas Bonnefon                 else {
102*048334c9SSeerauber                     state_ = State::OtherOrUnknown8Bit;
1030faa4758SNicolas Bonnefon                 }
1040faa4758SNicolas Bonnefon                 break;
1050faa4758SNicolas Bonnefon             case State::ValidUTF16LE:
1060faa4758SNicolas Bonnefon             case State::ValidUTF16BE:
1070faa4758SNicolas Bonnefon                 // We don't verify UTF16 and assume it's all fine for now.
1080faa4758SNicolas Bonnefon                 break;
109*048334c9SSeerauber             case State::OtherOrUnknown8Bit:
110*048334c9SSeerauber                 state_ = State::OtherOrUnknown8Bit;
1110faa4758SNicolas Bonnefon          }
112702af59eSNicolas Bonnefon     }
113702af59eSNicolas Bonnefon }
114702af59eSNicolas Bonnefon 
115702af59eSNicolas Bonnefon EncodingSpeculator::Encoding EncodingSpeculator::guess() const
116702af59eSNicolas Bonnefon {
117702af59eSNicolas Bonnefon     Encoding guess;
118702af59eSNicolas Bonnefon 
119702af59eSNicolas Bonnefon     switch ( state_ ) {
1200faa4758SNicolas Bonnefon         case State::Start:
121702af59eSNicolas Bonnefon         case State::ASCIIOnly:
122702af59eSNicolas Bonnefon             guess = Encoding::ASCII7;
123702af59eSNicolas Bonnefon             break;
124*048334c9SSeerauber         case State::OtherOrUnknown8Bit:
125702af59eSNicolas Bonnefon         case State::UTF8LeadingByteSeen:
126702af59eSNicolas Bonnefon             guess = Encoding::ASCII8;
127702af59eSNicolas Bonnefon             break;
128702af59eSNicolas Bonnefon         case State::ValidUTF8:
129702af59eSNicolas Bonnefon             guess = Encoding::UTF8;
130702af59eSNicolas Bonnefon             break;
1310faa4758SNicolas Bonnefon         case State::ValidUTF16LE:
1320faa4758SNicolas Bonnefon             guess = Encoding::UTF16LE;
1330faa4758SNicolas Bonnefon             break;
1340faa4758SNicolas Bonnefon         case State::ValidUTF16BE:
1350faa4758SNicolas Bonnefon             guess = Encoding::UTF16BE;
1360faa4758SNicolas Bonnefon             break;
137702af59eSNicolas Bonnefon         default:
138702af59eSNicolas Bonnefon             guess = Encoding::ASCII8;
139702af59eSNicolas Bonnefon     }
140702af59eSNicolas Bonnefon 
141702af59eSNicolas Bonnefon     return guess;
142702af59eSNicolas Bonnefon }
143