xref: /glogg/src/encodingspeculator.h (revision 0faa4758001e16aefc549004c6565cc7bab02cb2)
1702af59eSNicolas Bonnefon /*
2702af59eSNicolas Bonnefon  * Copyright (C) 2016 Nicolas Bonnefon and other contributors
3702af59eSNicolas Bonnefon  *
4702af59eSNicolas Bonnefon  * This file is part of glogg.
5702af59eSNicolas Bonnefon  *
6702af59eSNicolas Bonnefon  * glogg is free software: you can redistribute it and/or modify
7702af59eSNicolas Bonnefon  * it under the terms of the GNU General Public License as published by
8702af59eSNicolas Bonnefon  * the Free Software Foundation, either version 3 of the License, or
9702af59eSNicolas Bonnefon  * (at your option) any later version.
10702af59eSNicolas Bonnefon  *
11702af59eSNicolas Bonnefon  * glogg is distributed in the hope that it will be useful,
12702af59eSNicolas Bonnefon  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13702af59eSNicolas Bonnefon  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14702af59eSNicolas Bonnefon  * GNU General Public License for more details.
15702af59eSNicolas Bonnefon  *
16702af59eSNicolas Bonnefon  * You should have received a copy of the GNU General Public License
17702af59eSNicolas Bonnefon  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18702af59eSNicolas Bonnefon  */
19702af59eSNicolas Bonnefon 
20702af59eSNicolas Bonnefon #ifndef ENCODINGSPECULATOR_H
21702af59eSNicolas Bonnefon #define ENCODINGSPECULATOR_H
22702af59eSNicolas Bonnefon 
23702af59eSNicolas Bonnefon #include <cstdint>
24702af59eSNicolas Bonnefon 
// The encoding speculator tries to determine the likely encoding
// of the stream of bytes which is passed to it, one byte at a time.

class EncodingSpeculator {
  public:
    // Encodings that guess() can report.
    enum class Encoding {
        ASCII7,
        ASCII8,
        UTF8,
        UTF16LE,
        UTF16BE
    };

    // Builds a speculator in the Start state (no byte injected yet).
    // Every data member has a default member initialiser below, so a
    // freshly constructed object never holds an indeterminate value.
    constexpr EncodingSpeculator() noexcept = default;

    // Inject one byte into the speculator
    void inject_byte( uint8_t byte );

    // Returns the current guess based on the previously injected bytes
    Encoding guess() const;

  private:
    // Internal states of the detection state machine.
    // NOTE(review): the transitions live in the implementation file, which
    // is not visible from this header.
    enum class State {
        Start,
        ASCIIOnly,
        Unknown8Bit,
        UTF8LeadingByteSeen,
        ValidUTF8,
        UTF16BELeadingBOMByteSeen,
        UTF16LELeadingBOMByteSeen,
        ValidUTF16LE,
        ValidUTF16BE,
    };

    // Current state of the state machine.
    State state_ = State::Start;

    // Scratch values, presumably used while a multi-byte UTF-8 sequence is
    // being decoded -- TODO confirm against the implementation.
    // They are zero-initialised so that they are never read uninitialised.
    uint32_t code_point_ = 0;
    int continuation_left_ = 0;
    uint32_t min_value_ = 0;
};
64702af59eSNicolas Bonnefon 
65702af59eSNicolas Bonnefon #endif
66