xref: /glogg/src/encodingspeculator.cpp (revision 8b941e123ddc1953f679547d44fb51d36b42e416)
1 /*
2  * Copyright (C) 2016 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "encodingspeculator.h"
21 
22 #include <iostream>
23 
24 void EncodingSpeculator::inject_byte( uint8_t byte )
25 {
26     if ( ! ( byte & 0x80 ) ) {
27         // 7-bit character, all fine
28         if ( state_ == State::Start )
29             state_ = State::ASCIIOnly;
30     }
31     else {
32         switch ( state_ ) {
33             case State::Start:
34                 if ( byte == 0xFE ) {
35                     state_ = State::UTF16BELeadingBOMByteSeen;
36                     break;
37                 }
38                 else if ( byte == 0xFF ) {
39                     state_ = State::UTF16LELeadingBOMByteSeen;
40                     break;
41                 }
42                 else {
43                     state_ = State::ASCIIOnly;
44                     // And carry on...
45                 }
46             case State::ASCIIOnly:
47             case State::ValidUTF8:
48                 if ( ( byte & 0xE0 ) == 0xC0 ) {
49                     state_ = State::UTF8LeadingByteSeen;
50                     code_point_ = ( byte & 0x1F ) << 6;
51                     continuation_left_ = 1;
52                     min_value_ = 0x80;
53                     // std::cout << "Lead: cp= " << std::hex << code_point_ << std::endl;
54                 }
55                 else if ( ( byte & 0xF0 ) == 0xE0 ) {
56                     state_ = State::UTF8LeadingByteSeen;
57                     code_point_ = ( byte & 0x0F ) << 12;
58                     continuation_left_ = 2;
59                     min_value_ = 0x800;
60                     // std::cout << "Lead 3: cp= " << std::hex << code_point_ << std::endl;
61                 }
62                 else if ( ( byte & 0xF8 ) == 0xF0 ) {
63                     state_ = State::UTF8LeadingByteSeen;
64                     code_point_ = ( byte & 0x07 ) << 18;
65                     continuation_left_ = 3;
66                     min_value_ = 0x800;
67                     // std::cout << "Lead 4: cp= " << std::hex << code_point_ << std::endl;
68                 }
69                 else {
70                     state_ = State::Unknown8Bit;
71                 }
72                 break;
73             case State::UTF8LeadingByteSeen:
74                 if ( ( byte & 0xC0 ) == 0x80 ) {
75                     --continuation_left_;
76                     code_point_ |= ( byte & 0x3F ) << (continuation_left_ * 6);
77                     // std::cout << "Cont: cp= " << std::hex << code_point_ << std::endl;
78                     if ( continuation_left_ == 0 ) {
79                         if ( code_point_ >= min_value_ )
80                             state_ = State::ValidUTF8;
81                         else
82                             state_ = State::Unknown8Bit;
83                     }
84                 }
85                 else {
86                     state_ = State::Unknown8Bit;
87                 }
88                 break;
89             case State::UTF16BELeadingBOMByteSeen:
90                 if ( byte == 0xFF ) {
91                     state_ = State::ValidUTF16BE;
92                 }
93                 else {
94                     state_ = State::Unknown8Bit;
95                 }
96                 break;
97             case State::UTF16LELeadingBOMByteSeen:
98                 if ( byte == 0xFE ) {
99                     state_ = State::ValidUTF16LE;
100                 }
101                 else {
102                     state_ = State::Unknown8Bit;
103                 }
104                 break;
105             case State::ValidUTF16LE:
106             case State::ValidUTF16BE:
107                 // We don't verify UTF16 and assume it's all fine for now.
108                 break;
109             case State::Unknown8Bit:
110                 state_ = State::Unknown8Bit;
111          }
112     }
113 }
114 
115 EncodingSpeculator::Encoding EncodingSpeculator::guess() const
116 {
117     Encoding guess;
118 
119     switch ( state_ ) {
120         case State::Start:
121         case State::ASCIIOnly:
122             guess = Encoding::ASCII7;
123             break;
124         case State::Unknown8Bit:
125         case State::UTF8LeadingByteSeen:
126             guess = Encoding::ASCII8;
127             break;
128         case State::ValidUTF8:
129             guess = Encoding::UTF8;
130             break;
131         case State::ValidUTF16LE:
132             guess = Encoding::UTF16LE;
133             break;
134         case State::ValidUTF16BE:
135             guess = Encoding::UTF16BE;
136             break;
137         default:
138             guess = Encoding::ASCII8;
139     }
140 
141     return guess;
142 }
143