/*
 * Copyright (C) 2016 Nicolas Bonnefon and other contributors
 *
 * This file is part of glogg.
 *
 * glogg is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * glogg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef ENCODINGSPECULATOR_H
#define ENCODINGSPECULATOR_H

#include <cstdint>

// The encoding speculator tries to determine the likely encoding
// of the stream of bytes which is passed to it.
//
// Usage: feed the bytes one at a time through inject_byte(), then
// ask for the current best guess with guess().

class EncodingSpeculator {
  public:
    // Encodings the speculator can report.
    // The enumerator order is part of the interface (underlying values
    // follow declaration order), so new entries must be appended.
    enum class Encoding {
        ASCII7,
        ASCII8,
        UTF8,
        UTF16LE,
        UTF16BE,
        BIG5,
        GB18030,
        SHIFT_JIS,
        KOI8R
    };

    // Creates a speculator in the Start state.
    // Every member is initialised so that a freshly built object can be
    // copied or inspected without touching indeterminate values.
    EncodingSpeculator()
        : state_( State::Start ),
          code_point_( 0 ),
          continuation_left_( 0 ),
          min_value_( 0 ) {}

    // Inject one byte into the speculator
    void inject_byte( uint8_t byte );

    // Returns the current guess based on the previously injected bytes
    Encoding guess() const;

  private:
    // Internal states of the speculator. The transitions between them
    // are implemented by inject_byte(), which is defined outside this
    // header.
    enum class State {
        Start,
        ASCIIOnly,
        OtherOrUnknown8Bit,
        UTF8LeadingByteSeen,
        ValidUTF8,
        UTF16BELeadingBOMByteSeen,
        UTF16LELeadingBOMByteSeen,
        ValidUTF16LE,
        ValidUTF16BE,
    };

    // Current state of the speculator
    State state_;

    // NOTE(review): the three members below are only written and read by
    // the out-of-line implementation. Judging by their names they
    // presumably track the code point being assembled, the number of
    // continuation bytes still expected, and the smallest value allowed
    // for the current sequence -- confirm against inject_byte().
    uint32_t code_point_;
    int continuation_left_;
    uint32_t min_value_;
};

#endif // ENCODINGSPECULATOR_H