xref: /glogg/src/encodingspeculator.h (revision 048334c92fb4b86ebbabc7471f7313a1cc515c10)
1 /*
2  * Copyright (C) 2016 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #ifndef ENCODINGSPECULATOR_H
21 #define ENCODINGSPECULATOR_H
22 
23 #include <cstdint>
24 
// The encoding speculator tries to determine the likely encoding
// of the stream of bytes that is passed to it.
27 
// Byte-at-a-time encoding detector: callers feed it bytes with
// inject_byte() and may ask for the current best guess at any point
// with guess(). Both functions are defined out of line.
class EncodingSpeculator {
  public:
    // Encodings that guess() is able to report.
    enum class Encoding {
        ASCII7,
        ASCII8,
        UTF8,
        UTF16LE,
        UTF16BE,
        BIG5,
        GB18030,
        SHIFT_JIS,
        KOI8R
    };

    // Builds a speculator that has not seen any byte yet.
    // Every member is given a defined value here so that no code path
    // can ever read an indeterminate decoder field, whatever the first
    // injected byte happens to be.
    EncodingSpeculator()
        : state_( State::Start ),
          code_point_( 0 ),
          continuation_left_( 0 ),
          min_value_( 0 ) {}

    // Inject one byte into the speculator
    void inject_byte( uint8_t byte );

    // Returns the current guess based on the previously injected bytes
    Encoding guess() const;

  private:
    // Internal progress of the detection; the transitions between
    // these states are implemented by inject_byte() (not visible in
    // this header).
    enum class State {
        Start,
        ASCIIOnly,
        OtherOrUnknown8Bit,
        UTF8LeadingByteSeen,
        ValidUTF8,
        UTF16BELeadingBOMByteSeen,
        UTF16LELeadingBOMByteSeen,
        ValidUTF16LE,
        ValidUTF16BE,
    };

    // Current detection state, State::Start on construction.
    State state_;
    // NOTE(review): presumably the code point being assembled from a
    // multi-byte sequence -- confirm against inject_byte().
    uint32_t code_point_;
    // NOTE(review): presumably the number of continuation bytes still
    // expected for the sequence in progress -- confirm.
    int continuation_left_;
    // NOTE(review): presumably the smallest code point acceptable for
    // the sequence in progress (overlong-form check) -- confirm.
    uint32_t min_value_;
};
68 
69 #endif
70