1702af59eSNicolas Bonnefon /* 2702af59eSNicolas Bonnefon * Copyright (C) 2016 Nicolas Bonnefon and other contributors 3702af59eSNicolas Bonnefon * 4702af59eSNicolas Bonnefon * This file is part of glogg. 5702af59eSNicolas Bonnefon * 6702af59eSNicolas Bonnefon * glogg is free software: you can redistribute it and/or modify 7702af59eSNicolas Bonnefon * it under the terms of the GNU General Public License as published by 8702af59eSNicolas Bonnefon * the Free Software Foundation, either version 3 of the License, or 9702af59eSNicolas Bonnefon * (at your option) any later version. 10702af59eSNicolas Bonnefon * 11702af59eSNicolas Bonnefon * glogg is distributed in the hope that it will be useful, 12702af59eSNicolas Bonnefon * but WITHOUT ANY WARRANTY; without even the implied warranty of 13702af59eSNicolas Bonnefon * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14702af59eSNicolas Bonnefon * GNU General Public License for more details. 15702af59eSNicolas Bonnefon * 16702af59eSNicolas Bonnefon * You should have received a copy of the GNU General Public License 17702af59eSNicolas Bonnefon * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18702af59eSNicolas Bonnefon */ 19702af59eSNicolas Bonnefon 20702af59eSNicolas Bonnefon #include "encodingspeculator.h" 21702af59eSNicolas Bonnefon 22702af59eSNicolas Bonnefon #include <iostream> 23702af59eSNicolas Bonnefon 24702af59eSNicolas Bonnefon void EncodingSpeculator::inject_byte( uint8_t byte ) 25702af59eSNicolas Bonnefon { 26702af59eSNicolas Bonnefon if ( ! ( byte & 0x80 ) ) { 27702af59eSNicolas Bonnefon // 7-bit character, all fine 280faa4758SNicolas Bonnefon if ( state_ == State::Start ) 290faa4758SNicolas Bonnefon state_ = State::ASCIIOnly; 30702af59eSNicolas Bonnefon } 31702af59eSNicolas Bonnefon else { 32702af59eSNicolas Bonnefon switch ( state_ ) { 330faa4758SNicolas Bonnefon case State::Start: 340faa4758SNicolas Bonnefon if ( byte == 0xFE ) { 350faa4758SNicolas Bonnefon state_ = State::UTF16BELeadingBOMByteSeen; 360faa4758SNicolas Bonnefon break; 370faa4758SNicolas Bonnefon } 380faa4758SNicolas Bonnefon else if ( byte == 0xFF ) { 390faa4758SNicolas Bonnefon state_ = State::UTF16LELeadingBOMByteSeen; 400faa4758SNicolas Bonnefon break; 410faa4758SNicolas Bonnefon } 420faa4758SNicolas Bonnefon else { 430faa4758SNicolas Bonnefon state_ = State::ASCIIOnly; 440faa4758SNicolas Bonnefon // And carry on... 450faa4758SNicolas Bonnefon } 46702af59eSNicolas Bonnefon case State::ASCIIOnly: 47702af59eSNicolas Bonnefon case State::ValidUTF8: 48702af59eSNicolas Bonnefon if ( ( byte & 0xE0 ) == 0xC0 ) { 49702af59eSNicolas Bonnefon state_ = State::UTF8LeadingByteSeen; 50702af59eSNicolas Bonnefon code_point_ = ( byte & 0x1F ) << 6; 51702af59eSNicolas Bonnefon continuation_left_ = 1; 52702af59eSNicolas Bonnefon min_value_ = 0x80; 53702af59eSNicolas Bonnefon // std::cout << "Lead: cp= " << std::hex << code_point_ << std::endl; 54702af59eSNicolas Bonnefon } 55702af59eSNicolas Bonnefon else if ( ( byte & 0xF0 ) == 0xE0 ) { 56702af59eSNicolas Bonnefon state_ = State::UTF8LeadingByteSeen; 57702af59eSNicolas Bonnefon code_point_ = ( byte & 0x0F ) << 12; 58702af59eSNicolas Bonnefon continuation_left_ = 2; 59702af59eSNicolas Bonnefon min_value_ = 0x800; 60702af59eSNicolas Bonnefon // std::cout << "Lead 3: cp= " << std::hex << code_point_ << std::endl; 61702af59eSNicolas Bonnefon } 62702af59eSNicolas Bonnefon else if ( ( byte & 0xF8 ) == 0xF0 ) { 63702af59eSNicolas Bonnefon state_ = State::UTF8LeadingByteSeen; 64702af59eSNicolas Bonnefon code_point_ = ( byte & 0x07 ) << 18; 65702af59eSNicolas Bonnefon continuation_left_ = 3; 66702af59eSNicolas Bonnefon min_value_ = 0x800; 67702af59eSNicolas Bonnefon // std::cout << "Lead 4: cp= " << std::hex << code_point_ << std::endl; 68702af59eSNicolas Bonnefon } 69702af59eSNicolas Bonnefon else { 70*048334c9SSeerauber state_ = State::OtherOrUnknown8Bit; 71702af59eSNicolas Bonnefon } 72702af59eSNicolas Bonnefon break; 73702af59eSNicolas Bonnefon case State::UTF8LeadingByteSeen: 74702af59eSNicolas Bonnefon if ( ( byte & 0xC0 ) == 0x80 ) { 75702af59eSNicolas Bonnefon --continuation_left_; 76702af59eSNicolas Bonnefon code_point_ |= ( byte & 0x3F ) << (continuation_left_ * 6); 77702af59eSNicolas Bonnefon // std::cout << "Cont: cp= " << std::hex << code_point_ << std::endl; 78702af59eSNicolas Bonnefon if ( continuation_left_ == 0 ) { 79702af59eSNicolas Bonnefon if ( code_point_ >= min_value_ ) 80702af59eSNicolas Bonnefon state_ = State::ValidUTF8; 81702af59eSNicolas Bonnefon else 82*048334c9SSeerauber state_ = State::OtherOrUnknown8Bit; 83702af59eSNicolas Bonnefon } 84702af59eSNicolas Bonnefon } 85702af59eSNicolas Bonnefon else { 86*048334c9SSeerauber state_ = State::OtherOrUnknown8Bit; 87702af59eSNicolas Bonnefon } 88702af59eSNicolas Bonnefon break; 890faa4758SNicolas Bonnefon case State::UTF16BELeadingBOMByteSeen: 900faa4758SNicolas Bonnefon if ( byte == 0xFF ) { 910faa4758SNicolas Bonnefon state_ = State::ValidUTF16BE; 92702af59eSNicolas Bonnefon } 930faa4758SNicolas Bonnefon else { 94*048334c9SSeerauber state_ = State::OtherOrUnknown8Bit; 950faa4758SNicolas Bonnefon } 960faa4758SNicolas Bonnefon break; 970faa4758SNicolas Bonnefon case State::UTF16LELeadingBOMByteSeen: 980faa4758SNicolas Bonnefon if ( byte == 0xFE ) { 990faa4758SNicolas Bonnefon state_ = State::ValidUTF16LE; 1000faa4758SNicolas Bonnefon } 1010faa4758SNicolas Bonnefon else { 102*048334c9SSeerauber state_ = State::OtherOrUnknown8Bit; 1030faa4758SNicolas Bonnefon } 1040faa4758SNicolas Bonnefon break; 1050faa4758SNicolas Bonnefon case State::ValidUTF16LE: 1060faa4758SNicolas Bonnefon case State::ValidUTF16BE: 1070faa4758SNicolas Bonnefon // We don't verify UTF16 and assume it's all fine for now. 1080faa4758SNicolas Bonnefon break; 109*048334c9SSeerauber case State::OtherOrUnknown8Bit: 110*048334c9SSeerauber state_ = State::OtherOrUnknown8Bit; 1110faa4758SNicolas Bonnefon } 112702af59eSNicolas Bonnefon } 113702af59eSNicolas Bonnefon } 114702af59eSNicolas Bonnefon 115702af59eSNicolas Bonnefon EncodingSpeculator::Encoding EncodingSpeculator::guess() const 116702af59eSNicolas Bonnefon { 117702af59eSNicolas Bonnefon Encoding guess; 118702af59eSNicolas Bonnefon 119702af59eSNicolas Bonnefon switch ( state_ ) { 1200faa4758SNicolas Bonnefon case State::Start: 121702af59eSNicolas Bonnefon case State::ASCIIOnly: 122702af59eSNicolas Bonnefon guess = Encoding::ASCII7; 123702af59eSNicolas Bonnefon break; 124*048334c9SSeerauber case State::OtherOrUnknown8Bit: 125702af59eSNicolas Bonnefon case State::UTF8LeadingByteSeen: 126702af59eSNicolas Bonnefon guess = Encoding::ASCII8; 127702af59eSNicolas Bonnefon break; 128702af59eSNicolas Bonnefon case State::ValidUTF8: 129702af59eSNicolas Bonnefon guess = Encoding::UTF8; 130702af59eSNicolas Bonnefon break; 1310faa4758SNicolas Bonnefon case State::ValidUTF16LE: 1320faa4758SNicolas Bonnefon guess = Encoding::UTF16LE; 1330faa4758SNicolas Bonnefon break; 1340faa4758SNicolas Bonnefon case State::ValidUTF16BE: 1350faa4758SNicolas Bonnefon guess = Encoding::UTF16BE; 1360faa4758SNicolas Bonnefon break; 137702af59eSNicolas Bonnefon default: 138702af59eSNicolas Bonnefon guess = Encoding::ASCII8; 139702af59eSNicolas Bonnefon } 140702af59eSNicolas Bonnefon 141702af59eSNicolas Bonnefon return guess; 142702af59eSNicolas Bonnefon } 143