xref: /glogg/src/encodingspeculator.cpp (revision c9a9366412fcda118aaadfe3742db8727a163a0f)
1 /*
2  * Copyright (C) 2016 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "encodingspeculator.h"
21 
22 #include <iostream>
23 
24 void EncodingSpeculator::inject_byte( uint8_t byte )
25 {
26     if ( ! ( byte & 0x80 ) ) {
27         // 7-bit character, all fine
28     }
29     else {
30         switch ( state_ ) {
31             case State::ASCIIOnly:
32             case State::ValidUTF8:
33                 if ( ( byte & 0xE0 ) == 0xC0 ) {
34                     state_ = State::UTF8LeadingByteSeen;
35                     code_point_ = ( byte & 0x1F ) << 6;
36                     continuation_left_ = 1;
37                     min_value_ = 0x80;
38                     // std::cout << "Lead: cp= " << std::hex << code_point_ << std::endl;
39                 }
40                 else if ( ( byte & 0xF0 ) == 0xE0 ) {
41                     state_ = State::UTF8LeadingByteSeen;
42                     code_point_ = ( byte & 0x0F ) << 12;
43                     continuation_left_ = 2;
44                     min_value_ = 0x800;
45                     // std::cout << "Lead 3: cp= " << std::hex << code_point_ << std::endl;
46                 }
47                 else if ( ( byte & 0xF8 ) == 0xF0 ) {
48                     state_ = State::UTF8LeadingByteSeen;
49                     code_point_ = ( byte & 0x07 ) << 18;
50                     continuation_left_ = 3;
51                     min_value_ = 0x800;
52                     // std::cout << "Lead 4: cp= " << std::hex << code_point_ << std::endl;
53                 }
54                 else {
55                     state_ = State::Unknown8Bit;
56                 }
57                 break;
58             case State::UTF8LeadingByteSeen:
59                 if ( ( byte & 0xC0 ) == 0x80 ) {
60                     --continuation_left_;
61                     code_point_ |= ( byte & 0x3F ) << (continuation_left_ * 6);
62                     // std::cout << "Cont: cp= " << std::hex << code_point_ << std::endl;
63                     if ( continuation_left_ == 0 ) {
64                         if ( code_point_ >= min_value_ )
65                             state_ = State::ValidUTF8;
66                         else
67                             state_ = State::Unknown8Bit;
68                     }
69                 }
70                 else {
71                     state_ = State::Unknown8Bit;
72                 }
73                 break;
74         }
75         // state_ = State::Unknown8Bit;
76     }
77 }
78 
79 EncodingSpeculator::Encoding EncodingSpeculator::guess() const
80 {
81     Encoding guess;
82 
83     switch ( state_ ) {
84         case State::ASCIIOnly:
85             guess = Encoding::ASCII7;
86             break;
87         case State::Unknown8Bit:
88         case State::UTF8LeadingByteSeen:
89             guess = Encoding::ASCII8;
90             break;
91         case State::ValidUTF8:
92             guess = Encoding::UTF8;
93             break;
94         default:
95             guess = Encoding::ASCII8;
96     }
97 
98     return guess;
99 }
100