xref: /glogg/src/encodingspeculator.h (revision 048334c92fb4b86ebbabc7471f7313a1cc515c10)
1702af59eSNicolas Bonnefon /*
2702af59eSNicolas Bonnefon  * Copyright (C) 2016 Nicolas Bonnefon and other contributors
3702af59eSNicolas Bonnefon  *
4702af59eSNicolas Bonnefon  * This file is part of glogg.
5702af59eSNicolas Bonnefon  *
6702af59eSNicolas Bonnefon  * glogg is free software: you can redistribute it and/or modify
7702af59eSNicolas Bonnefon  * it under the terms of the GNU General Public License as published by
8702af59eSNicolas Bonnefon  * the Free Software Foundation, either version 3 of the License, or
9702af59eSNicolas Bonnefon  * (at your option) any later version.
10702af59eSNicolas Bonnefon  *
11702af59eSNicolas Bonnefon  * glogg is distributed in the hope that it will be useful,
12702af59eSNicolas Bonnefon  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13702af59eSNicolas Bonnefon  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14702af59eSNicolas Bonnefon  * GNU General Public License for more details.
15702af59eSNicolas Bonnefon  *
16702af59eSNicolas Bonnefon  * You should have received a copy of the GNU General Public License
17702af59eSNicolas Bonnefon  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18702af59eSNicolas Bonnefon  */
19702af59eSNicolas Bonnefon 
20702af59eSNicolas Bonnefon #ifndef ENCODINGSPECULATOR_H
21702af59eSNicolas Bonnefon #define ENCODINGSPECULATOR_H
22702af59eSNicolas Bonnefon 
23702af59eSNicolas Bonnefon #include <cstdint>
24702af59eSNicolas Bonnefon 
// The encoding speculator tries to determine the likely encoding
// of the stream of bytes which is passed to it.
27702af59eSNicolas Bonnefon 
// Incremental guesser for the character encoding of a byte stream.
//
// Bytes are fed one at a time through inject_byte(); guess() reports the
// current guess based on the bytes injected so far.
class EncodingSpeculator {
  public:
    // Encoding identifiers, used as the result type of guess().
    enum class Encoding {
        ASCII7,
        ASCII8,
        UTF8,
        UTF16LE,
        UTF16BE,
        BIG5,
        GB18030,
        SHIFT_JIS,
        KOI8R
    };

    // Every data member has a default member initializer (see below), so
    // a freshly constructed speculator starts in State::Start with no
    // indeterminate fields.
    EncodingSpeculator() = default;

    // Inject one byte into the speculator
    void inject_byte( uint8_t byte );

    // Returns the current guess based on the previously injected bytes
    Encoding guess() const;

  private:
    // Internal states of the speculator; the transitions are implemented
    // by inject_byte(), which is defined outside this header.
    enum class State {
        Start,
        ASCIIOnly,
        OtherOrUnknown8Bit,
        UTF8LeadingByteSeen,
        ValidUTF8,
        UTF16BELeadingBOMByteSeen,
        UTF16LELeadingBOMByteSeen,
        ValidUTF16LE,
        ValidUTF16BE,
    };

    // Current state; a new speculator starts in State::Start.
    State state_ = State::Start;

    // NOTE(review): the three fields below are presumably scratch values
    // for the multi-byte sequence being examined by inject_byte() --
    // confirm against the implementation file. They are zero-initialized
    // so they can never be read while indeterminate.
    uint32_t code_point_ = 0;
    int continuation_left_ = 0;
    uint32_t min_value_ = 0;
};
68702af59eSNicolas Bonnefon 
69702af59eSNicolas Bonnefon #endif
70