1 /* 2 * Copyright (C) 2016 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "encodingspeculator.h" 21 22 #include <iostream> 23 24 void EncodingSpeculator::inject_byte( uint8_t byte ) 25 { 26 if ( ! ( byte & 0x80 ) ) { 27 // 7-bit character, all fine 28 } 29 else { 30 switch ( state_ ) { 31 case State::ASCIIOnly: 32 case State::ValidUTF8: 33 if ( ( byte & 0xE0 ) == 0xC0 ) { 34 state_ = State::UTF8LeadingByteSeen; 35 code_point_ = ( byte & 0x1F ) << 6; 36 continuation_left_ = 1; 37 min_value_ = 0x80; 38 // std::cout << "Lead: cp= " << std::hex << code_point_ << std::endl; 39 } 40 else if ( ( byte & 0xF0 ) == 0xE0 ) { 41 state_ = State::UTF8LeadingByteSeen; 42 code_point_ = ( byte & 0x0F ) << 12; 43 continuation_left_ = 2; 44 min_value_ = 0x800; 45 // std::cout << "Lead 3: cp= " << std::hex << code_point_ << std::endl; 46 } 47 else if ( ( byte & 0xF8 ) == 0xF0 ) { 48 state_ = State::UTF8LeadingByteSeen; 49 code_point_ = ( byte & 0x07 ) << 18; 50 continuation_left_ = 3; 51 min_value_ = 0x800; 52 // std::cout << "Lead 4: cp= " << std::hex << code_point_ << std::endl; 53 } 54 else { 55 state_ = State::Unknown8Bit; 56 } 57 break; 58 case State::UTF8LeadingByteSeen: 59 if ( ( byte & 0xC0 ) == 0x80 ) { 60 --continuation_left_; 61 code_point_ |= ( byte & 0x3F ) << (continuation_left_ * 6); 62 // std::cout << "Cont: cp= " << std::hex << code_point_ << std::endl; 63 if ( continuation_left_ == 0 ) { 64 if ( code_point_ >= min_value_ ) 65 state_ = State::ValidUTF8; 66 else 67 state_ = State::Unknown8Bit; 68 } 69 } 70 else { 71 state_ = State::Unknown8Bit; 72 } 73 break; 74 } 75 // state_ = State::Unknown8Bit; 76 } 77 } 78 79 EncodingSpeculator::Encoding EncodingSpeculator::guess() const 80 { 81 Encoding guess; 82 83 switch ( state_ ) { 84 case State::ASCIIOnly: 85 guess = Encoding::ASCII7; 86 break; 87 case State::Unknown8Bit: 88 case State::UTF8LeadingByteSeen: 89 guess = Encoding::ASCII8; 90 break; 91 case State::ValidUTF8: 92 guess = Encoding::UTF8; 93 break; 94 default: 95 guess = Encoding::ASCII8; 96 } 97 98 return guess; 99 } 100