1 /* 2 * Copyright (C) 2016 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include "encodingspeculator.h" 21 22 #include <iostream> 23 24 void EncodingSpeculator::inject_byte( uint8_t byte ) 25 { 26 if ( ! ( byte & 0x80 ) ) { 27 // 7-bit character, all fine 28 if ( state_ == State::Start ) 29 state_ = State::ASCIIOnly; 30 } 31 else { 32 switch ( state_ ) { 33 case State::Start: 34 if ( byte == 0xFE ) { 35 state_ = State::UTF16BELeadingBOMByteSeen; 36 break; 37 } 38 else if ( byte == 0xFF ) { 39 state_ = State::UTF16LELeadingBOMByteSeen; 40 break; 41 } 42 else { 43 state_ = State::ASCIIOnly; 44 // And carry on... 45 } 46 case State::ASCIIOnly: 47 case State::ValidUTF8: 48 if ( ( byte & 0xE0 ) == 0xC0 ) { 49 state_ = State::UTF8LeadingByteSeen; 50 code_point_ = ( byte & 0x1F ) << 6; 51 continuation_left_ = 1; 52 min_value_ = 0x80; 53 // std::cout << "Lead: cp= " << std::hex << code_point_ << std::endl; 54 } 55 else if ( ( byte & 0xF0 ) == 0xE0 ) { 56 state_ = State::UTF8LeadingByteSeen; 57 code_point_ = ( byte & 0x0F ) << 12; 58 continuation_left_ = 2; 59 min_value_ = 0x800; 60 // std::cout << "Lead 3: cp= " << std::hex << code_point_ << std::endl; 61 } 62 else if ( ( byte & 0xF8 ) == 0xF0 ) { 63 state_ = State::UTF8LeadingByteSeen; 64 code_point_ = ( byte & 0x07 ) << 18; 65 continuation_left_ = 3; 66 min_value_ = 0x800; 67 // std::cout << "Lead 4: cp= " << std::hex << code_point_ << std::endl; 68 } 69 else { 70 state_ = State::OtherOrUnknown8Bit; 71 } 72 break; 73 case State::UTF8LeadingByteSeen: 74 if ( ( byte & 0xC0 ) == 0x80 ) { 75 --continuation_left_; 76 code_point_ |= ( byte & 0x3F ) << (continuation_left_ * 6); 77 // std::cout << "Cont: cp= " << std::hex << code_point_ << std::endl; 78 if ( continuation_left_ == 0 ) { 79 if ( code_point_ >= min_value_ ) 80 state_ = State::ValidUTF8; 81 else 82 state_ = State::OtherOrUnknown8Bit; 83 } 84 } 85 else { 86 state_ = State::OtherOrUnknown8Bit; 87 } 88 break; 89 case State::UTF16BELeadingBOMByteSeen: 90 if ( byte == 0xFF ) { 91 state_ = State::ValidUTF16BE; 92 } 93 else { 94 state_ = State::OtherOrUnknown8Bit; 95 } 96 break; 97 case State::UTF16LELeadingBOMByteSeen: 98 if ( byte == 0xFE ) { 99 state_ = State::ValidUTF16LE; 100 } 101 else { 102 state_ = State::OtherOrUnknown8Bit; 103 } 104 break; 105 case State::ValidUTF16LE: 106 case State::ValidUTF16BE: 107 // We don't verify UTF16 and assume it's all fine for now. 108 break; 109 case State::OtherOrUnknown8Bit: 110 state_ = State::OtherOrUnknown8Bit; 111 } 112 } 113 } 114 115 EncodingSpeculator::Encoding EncodingSpeculator::guess() const 116 { 117 Encoding guess; 118 119 switch ( state_ ) { 120 case State::Start: 121 case State::ASCIIOnly: 122 guess = Encoding::ASCII7; 123 break; 124 case State::OtherOrUnknown8Bit: 125 case State::UTF8LeadingByteSeen: 126 guess = Encoding::ASCII8; 127 break; 128 case State::ValidUTF8: 129 guess = Encoding::UTF8; 130 break; 131 case State::ValidUTF16LE: 132 guess = Encoding::UTF16LE; 133 break; 134 case State::ValidUTF16BE: 135 guess = Encoding::UTF16BE; 136 break; 137 default: 138 guess = Encoding::ASCII8; 139 } 140 141 return guess; 142 } 143