1*702af59eSNicolas Bonnefon /* 2*702af59eSNicolas Bonnefon * Copyright (C) 2016 Nicolas Bonnefon and other contributors 3*702af59eSNicolas Bonnefon * 4*702af59eSNicolas Bonnefon * This file is part of glogg. 5*702af59eSNicolas Bonnefon * 6*702af59eSNicolas Bonnefon * glogg is free software: you can redistribute it and/or modify 7*702af59eSNicolas Bonnefon * it under the terms of the GNU General Public License as published by 8*702af59eSNicolas Bonnefon * the Free Software Foundation, either version 3 of the License, or 9*702af59eSNicolas Bonnefon * (at your option) any later version. 10*702af59eSNicolas Bonnefon * 11*702af59eSNicolas Bonnefon * glogg is distributed in the hope that it will be useful, 12*702af59eSNicolas Bonnefon * but WITHOUT ANY WARRANTY; without even the implied warranty of 13*702af59eSNicolas Bonnefon * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14*702af59eSNicolas Bonnefon * GNU General Public License for more details. 15*702af59eSNicolas Bonnefon * 16*702af59eSNicolas Bonnefon * You should have received a copy of the GNU General Public License 17*702af59eSNicolas Bonnefon * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18*702af59eSNicolas Bonnefon */ 19*702af59eSNicolas Bonnefon 20*702af59eSNicolas Bonnefon #include "encodingspeculator.h" 21*702af59eSNicolas Bonnefon 22*702af59eSNicolas Bonnefon #include <iostream> 23*702af59eSNicolas Bonnefon 24*702af59eSNicolas Bonnefon void EncodingSpeculator::inject_byte( uint8_t byte ) 25*702af59eSNicolas Bonnefon { 26*702af59eSNicolas Bonnefon if ( ! ( byte & 0x80 ) ) { 27*702af59eSNicolas Bonnefon // 7-bit character, all fine 28*702af59eSNicolas Bonnefon } 29*702af59eSNicolas Bonnefon else { 30*702af59eSNicolas Bonnefon switch ( state_ ) { 31*702af59eSNicolas Bonnefon case State::ASCIIOnly: 32*702af59eSNicolas Bonnefon case State::ValidUTF8: 33*702af59eSNicolas Bonnefon if ( ( byte & 0xE0 ) == 0xC0 ) { 34*702af59eSNicolas Bonnefon state_ = State::UTF8LeadingByteSeen; 35*702af59eSNicolas Bonnefon code_point_ = ( byte & 0x1F ) << 6; 36*702af59eSNicolas Bonnefon continuation_left_ = 1; 37*702af59eSNicolas Bonnefon min_value_ = 0x80; 38*702af59eSNicolas Bonnefon // std::cout << "Lead: cp= " << std::hex << code_point_ << std::endl; 39*702af59eSNicolas Bonnefon } 40*702af59eSNicolas Bonnefon else if ( ( byte & 0xF0 ) == 0xE0 ) { 41*702af59eSNicolas Bonnefon state_ = State::UTF8LeadingByteSeen; 42*702af59eSNicolas Bonnefon code_point_ = ( byte & 0x0F ) << 12; 43*702af59eSNicolas Bonnefon continuation_left_ = 2; 44*702af59eSNicolas Bonnefon min_value_ = 0x800; 45*702af59eSNicolas Bonnefon // std::cout << "Lead 3: cp= " << std::hex << code_point_ << std::endl; 46*702af59eSNicolas Bonnefon } 47*702af59eSNicolas Bonnefon else if ( ( byte & 0xF8 ) == 0xF0 ) { 48*702af59eSNicolas Bonnefon state_ = State::UTF8LeadingByteSeen; 49*702af59eSNicolas Bonnefon code_point_ = ( byte & 0x07 ) << 18; 50*702af59eSNicolas Bonnefon continuation_left_ = 3; 51*702af59eSNicolas Bonnefon min_value_ = 0x800; 52*702af59eSNicolas Bonnefon // std::cout << "Lead 4: cp= " << std::hex << code_point_ << std::endl; 53*702af59eSNicolas Bonnefon } 54*702af59eSNicolas Bonnefon else { 55*702af59eSNicolas Bonnefon state_ = State::Unknown8Bit; 56*702af59eSNicolas Bonnefon } 57*702af59eSNicolas Bonnefon break; 58*702af59eSNicolas Bonnefon case State::UTF8LeadingByteSeen: 59*702af59eSNicolas Bonnefon if ( ( byte & 0xC0 ) == 0x80 ) { 60*702af59eSNicolas Bonnefon --continuation_left_; 61*702af59eSNicolas Bonnefon code_point_ |= ( byte & 0x3F ) << (continuation_left_ * 6); 62*702af59eSNicolas Bonnefon // std::cout << "Cont: cp= " << std::hex << code_point_ << std::endl; 63*702af59eSNicolas Bonnefon if ( continuation_left_ == 0 ) { 64*702af59eSNicolas Bonnefon if ( code_point_ >= min_value_ ) 65*702af59eSNicolas Bonnefon state_ = State::ValidUTF8; 66*702af59eSNicolas Bonnefon else 67*702af59eSNicolas Bonnefon state_ = State::Unknown8Bit; 68*702af59eSNicolas Bonnefon } 69*702af59eSNicolas Bonnefon } 70*702af59eSNicolas Bonnefon else { 71*702af59eSNicolas Bonnefon state_ = State::Unknown8Bit; 72*702af59eSNicolas Bonnefon } 73*702af59eSNicolas Bonnefon break; 74*702af59eSNicolas Bonnefon } 75*702af59eSNicolas Bonnefon // state_ = State::Unknown8Bit; 76*702af59eSNicolas Bonnefon } 77*702af59eSNicolas Bonnefon } 78*702af59eSNicolas Bonnefon 79*702af59eSNicolas Bonnefon EncodingSpeculator::Encoding EncodingSpeculator::guess() const 80*702af59eSNicolas Bonnefon { 81*702af59eSNicolas Bonnefon Encoding guess; 82*702af59eSNicolas Bonnefon 83*702af59eSNicolas Bonnefon switch ( state_ ) { 84*702af59eSNicolas Bonnefon case State::ASCIIOnly: 85*702af59eSNicolas Bonnefon guess = Encoding::ASCII7; 86*702af59eSNicolas Bonnefon break; 87*702af59eSNicolas Bonnefon case State::Unknown8Bit: 88*702af59eSNicolas Bonnefon case State::UTF8LeadingByteSeen: 89*702af59eSNicolas Bonnefon guess = Encoding::ASCII8; 90*702af59eSNicolas Bonnefon break; 91*702af59eSNicolas Bonnefon case State::ValidUTF8: 92*702af59eSNicolas Bonnefon guess = Encoding::UTF8; 93*702af59eSNicolas Bonnefon break; 94*702af59eSNicolas Bonnefon default: 95*702af59eSNicolas Bonnefon guess = Encoding::ASCII8; 96*702af59eSNicolas Bonnefon } 97*702af59eSNicolas Bonnefon 98*702af59eSNicolas Bonnefon return guess; 99*702af59eSNicolas Bonnefon } 100