xref: /glogg/src/encodingspeculator.cpp (revision 702af59ea138e3124b906092de415e3601c74d3e)
/*
 * Copyright (C) 2016 Nicolas Bonnefon and other contributors
 *
 * This file is part of glogg.
 *
 * glogg is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * glogg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
 */
19*702af59eSNicolas Bonnefon 
20*702af59eSNicolas Bonnefon #include "encodingspeculator.h"
21*702af59eSNicolas Bonnefon 
22*702af59eSNicolas Bonnefon #include <iostream>
23*702af59eSNicolas Bonnefon 
24*702af59eSNicolas Bonnefon void EncodingSpeculator::inject_byte( uint8_t byte )
25*702af59eSNicolas Bonnefon {
26*702af59eSNicolas Bonnefon     if ( ! ( byte & 0x80 ) ) {
27*702af59eSNicolas Bonnefon         // 7-bit character, all fine
28*702af59eSNicolas Bonnefon     }
29*702af59eSNicolas Bonnefon     else {
30*702af59eSNicolas Bonnefon         switch ( state_ ) {
31*702af59eSNicolas Bonnefon             case State::ASCIIOnly:
32*702af59eSNicolas Bonnefon             case State::ValidUTF8:
33*702af59eSNicolas Bonnefon                 if ( ( byte & 0xE0 ) == 0xC0 ) {
34*702af59eSNicolas Bonnefon                     state_ = State::UTF8LeadingByteSeen;
35*702af59eSNicolas Bonnefon                     code_point_ = ( byte & 0x1F ) << 6;
36*702af59eSNicolas Bonnefon                     continuation_left_ = 1;
37*702af59eSNicolas Bonnefon                     min_value_ = 0x80;
38*702af59eSNicolas Bonnefon                     // std::cout << "Lead: cp= " << std::hex << code_point_ << std::endl;
39*702af59eSNicolas Bonnefon                 }
40*702af59eSNicolas Bonnefon                 else if ( ( byte & 0xF0 ) == 0xE0 ) {
41*702af59eSNicolas Bonnefon                     state_ = State::UTF8LeadingByteSeen;
42*702af59eSNicolas Bonnefon                     code_point_ = ( byte & 0x0F ) << 12;
43*702af59eSNicolas Bonnefon                     continuation_left_ = 2;
44*702af59eSNicolas Bonnefon                     min_value_ = 0x800;
45*702af59eSNicolas Bonnefon                     // std::cout << "Lead 3: cp= " << std::hex << code_point_ << std::endl;
46*702af59eSNicolas Bonnefon                 }
47*702af59eSNicolas Bonnefon                 else if ( ( byte & 0xF8 ) == 0xF0 ) {
48*702af59eSNicolas Bonnefon                     state_ = State::UTF8LeadingByteSeen;
49*702af59eSNicolas Bonnefon                     code_point_ = ( byte & 0x07 ) << 18;
50*702af59eSNicolas Bonnefon                     continuation_left_ = 3;
51*702af59eSNicolas Bonnefon                     min_value_ = 0x800;
52*702af59eSNicolas Bonnefon                     // std::cout << "Lead 4: cp= " << std::hex << code_point_ << std::endl;
53*702af59eSNicolas Bonnefon                 }
54*702af59eSNicolas Bonnefon                 else {
55*702af59eSNicolas Bonnefon                     state_ = State::Unknown8Bit;
56*702af59eSNicolas Bonnefon                 }
57*702af59eSNicolas Bonnefon                 break;
58*702af59eSNicolas Bonnefon             case State::UTF8LeadingByteSeen:
59*702af59eSNicolas Bonnefon                 if ( ( byte & 0xC0 ) == 0x80 ) {
60*702af59eSNicolas Bonnefon                     --continuation_left_;
61*702af59eSNicolas Bonnefon                     code_point_ |= ( byte & 0x3F ) << (continuation_left_ * 6);
62*702af59eSNicolas Bonnefon                     // std::cout << "Cont: cp= " << std::hex << code_point_ << std::endl;
63*702af59eSNicolas Bonnefon                     if ( continuation_left_ == 0 ) {
64*702af59eSNicolas Bonnefon                         if ( code_point_ >= min_value_ )
65*702af59eSNicolas Bonnefon                             state_ = State::ValidUTF8;
66*702af59eSNicolas Bonnefon                         else
67*702af59eSNicolas Bonnefon                             state_ = State::Unknown8Bit;
68*702af59eSNicolas Bonnefon                     }
69*702af59eSNicolas Bonnefon                 }
70*702af59eSNicolas Bonnefon                 else {
71*702af59eSNicolas Bonnefon                     state_ = State::Unknown8Bit;
72*702af59eSNicolas Bonnefon                 }
73*702af59eSNicolas Bonnefon                 break;
74*702af59eSNicolas Bonnefon         }
75*702af59eSNicolas Bonnefon         // state_ = State::Unknown8Bit;
76*702af59eSNicolas Bonnefon     }
77*702af59eSNicolas Bonnefon }
78*702af59eSNicolas Bonnefon 
79*702af59eSNicolas Bonnefon EncodingSpeculator::Encoding EncodingSpeculator::guess() const
80*702af59eSNicolas Bonnefon {
81*702af59eSNicolas Bonnefon     Encoding guess;
82*702af59eSNicolas Bonnefon 
83*702af59eSNicolas Bonnefon     switch ( state_ ) {
84*702af59eSNicolas Bonnefon         case State::ASCIIOnly:
85*702af59eSNicolas Bonnefon             guess = Encoding::ASCII7;
86*702af59eSNicolas Bonnefon             break;
87*702af59eSNicolas Bonnefon         case State::Unknown8Bit:
88*702af59eSNicolas Bonnefon         case State::UTF8LeadingByteSeen:
89*702af59eSNicolas Bonnefon             guess = Encoding::ASCII8;
90*702af59eSNicolas Bonnefon             break;
91*702af59eSNicolas Bonnefon         case State::ValidUTF8:
92*702af59eSNicolas Bonnefon             guess = Encoding::UTF8;
93*702af59eSNicolas Bonnefon             break;
94*702af59eSNicolas Bonnefon         default:
95*702af59eSNicolas Bonnefon             guess = Encoding::ASCII8;
96*702af59eSNicolas Bonnefon     }
97*702af59eSNicolas Bonnefon 
98*702af59eSNicolas Bonnefon     return guess;
99*702af59eSNicolas Bonnefon }
100