/*
 * Copyright (C) 2016 Nicolas Bonnefon and other contributors
 *
 * This file is part of glogg.
 *
 * glogg is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * glogg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with glogg. If not, see <http://www.gnu.org/licenses/>.
 */

#ifndef ENCODINGSPECULATOR_H
#define ENCODINGSPECULATOR_H

#include <cstdint>

// The encoding speculator tries to determine the likely encoding
// of the stream of bytes which is passed to it.
//
// Bytes are fed one at a time through inject_byte(); guess() can be
// called at any point to obtain the current best guess.

class EncodingSpeculator {
  public:
    // Encodings the speculator is able to report through guess().
    enum class Encoding {
        ASCII7,
        ASCII8,
        UTF8,
        UTF16LE,
        UTF16BE
    };

    // A freshly constructed speculator is in the Start state with all of
    // its bookkeeping members zeroed (see the default member initializers
    // in the private section).
    EncodingSpeculator() = default;

    // Inject one byte into the speculator
    void inject_byte( uint8_t byte );

    // Returns the current guess based on the previously injected bytes
    Encoding guess() const;

  private:
    // Internal states of the speculator.
    // NOTE(review): the transitions between states live in the
    // implementation file and are not visible from this header.
    enum class State {
        Start,
        ASCIIOnly,
        Unknown8Bit,
        UTF8LeadingByteSeen,
        ValidUTF8,
        UTF16BELeadingBOMByteSeen,
        UTF16LELeadingBOMByteSeen,
        ValidUTF16LE,
        ValidUTF16BE
    };

    // Every member carries a default initializer so that none of them can
    // ever be read while holding an indeterminate value, regardless of the
    // order in which the implementation touches them.
    State state_ = State::Start;

    // NOTE(review): the three members below are presumably scratch values
    // used while validating a multi-byte sequence -- confirm against the
    // implementation of inject_byte().
    uint32_t code_point_ = 0;
    int continuation_left_ = 0;
    uint32_t min_value_ = 0;
};

#endif