/*
* Copyright (C) 2016 Nicolas Bonnefon and other contributors
*
* This file is part of glogg.
*
* glogg is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* glogg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with glogg. If not, see <http://www.gnu.org/licenses/>.
*/
#include "encodingspeculator.h"
#include
/*
 * Feed the next byte of the stream into the speculator's state machine.
 *
 * The machine tracks, in state_, what the bytes seen so far are compatible
 * with: pure 7-bit text, well-formed UTF-8, a UTF-16 byte order mark
 * (FE FF for big endian, FF FE for little endian) or some unknown 8-bit
 * encoding. While a UTF-8 multi-byte sequence is being decoded,
 * code_point_ accumulates the value, continuation_left_ counts the
 * continuation bytes still expected and min_value_ holds the smallest
 * code point that legitimately needs a sequence of that length (anything
 * smaller is an overlong encoding and is rejected).
 *
 * byte: the next raw byte read from the stream.
 */
void EncodingSpeculator::inject_byte( uint8_t byte )
{
    if ( ! ( byte & 0x80 ) ) {
        // 7-bit character
        switch ( state_ ) {
            case State::Start:
                state_ = State::ASCIIOnly;
                break;
            case State::UTF8LeadingByteSeen:
            case State::UTF16BELeadingBOMByteSeen:
            case State::UTF16LELeadingBOMByteSeen:
                // A multi-byte sequence (or a BOM) was started but is
                // interrupted by a 7-bit byte: the sequence is truncated,
                // so this is neither valid UTF-8 nor a UTF-16 BOM.
                state_ = State::Unknown8Bit;
                break;
            default:
                // All other states are unaffected by 7-bit bytes.
                break;
        }
    }
    else {
        switch ( state_ ) {
            case State::Start:
                if ( byte == 0xFE ) {
                    state_ = State::UTF16BELeadingBOMByteSeen;
                    break;
                }
                else if ( byte == 0xFF ) {
                    state_ = State::UTF16LELeadingBOMByteSeen;
                    break;
                }
                // Not a BOM: handle the byte as a potential UTF-8 lead.
                state_ = State::ASCIIOnly;
                // fall through
            case State::ASCIIOnly:
            case State::ValidUTF8:
                if ( ( byte & 0xE0 ) == 0xC0 ) {
                    // 110xxxxx: lead of a 2-byte sequence
                    state_ = State::UTF8LeadingByteSeen;
                    code_point_ = ( byte & 0x1F ) << 6;
                    continuation_left_ = 1;
                    min_value_ = 0x80;
                }
                else if ( ( byte & 0xF0 ) == 0xE0 ) {
                    // 1110xxxx: lead of a 3-byte sequence
                    state_ = State::UTF8LeadingByteSeen;
                    code_point_ = ( byte & 0x0F ) << 12;
                    continuation_left_ = 2;
                    min_value_ = 0x800;
                }
                else if ( ( byte & 0xF8 ) == 0xF0 ) {
                    // 11110xxx: lead of a 4-byte sequence, which is only
                    // legitimate for code points from U+10000 upwards.
                    state_ = State::UTF8LeadingByteSeen;
                    code_point_ = ( byte & 0x07 ) << 18;
                    continuation_left_ = 3;
                    min_value_ = 0x10000;
                }
                else {
                    // Stray continuation byte or invalid lead byte
                    state_ = State::Unknown8Bit;
                }
                break;
            case State::UTF8LeadingByteSeen:
                if ( ( byte & 0xC0 ) == 0x80 ) {
                    // 10xxxxxx: continuation byte, carries 6 payload bits
                    --continuation_left_;
                    code_point_ |= ( byte & 0x3F ) << ( continuation_left_ * 6 );
                    if ( continuation_left_ == 0 ) {
                        // Sequence complete: reject overlong encodings,
                        // UTF-16 surrogates and values beyond U+10FFFF,
                        // none of which are well-formed UTF-8.
                        const bool overlong = code_point_ < min_value_;
                        const bool surrogate =
                            ( code_point_ >= 0xD800 ) && ( code_point_ <= 0xDFFF );
                        const bool out_of_range = code_point_ > 0x10FFFF;

                        if ( overlong || surrogate || out_of_range )
                            state_ = State::Unknown8Bit;
                        else
                            state_ = State::ValidUTF8;
                    }
                }
                else {
                    state_ = State::Unknown8Bit;
                }
                break;
            case State::UTF16BELeadingBOMByteSeen:
                // FE must be followed by FF to form the big endian BOM
                if ( byte == 0xFF ) {
                    state_ = State::ValidUTF16BE;
                }
                else {
                    state_ = State::Unknown8Bit;
                }
                break;
            case State::UTF16LELeadingBOMByteSeen:
                // FF must be followed by FE to form the little endian BOM
                if ( byte == 0xFE ) {
                    state_ = State::ValidUTF16LE;
                }
                else {
                    state_ = State::Unknown8Bit;
                }
                break;
            case State::ValidUTF16LE:
            case State::ValidUTF16BE:
                // We don't verify UTF16 and assume it's all fine for now.
                break;
            case State::Unknown8Bit:
                // Terminal state: nothing can change the verdict any more.
                break;
        }
    }
}
/*
 * Translate the current state of the speculator into an encoding guess.
 *
 * Only the states that positively identify an encoding map to something
 * specific; every other state (including a multi-byte sequence or a BOM
 * that has been started but not completed) yields ASCII8.
 */
EncodingSpeculator::Encoding EncodingSpeculator::guess() const
{
    switch ( state_ ) {
        // Nothing but 7-bit bytes (or nothing at all) seen so far
        case State::Start:
        case State::ASCIIOnly:
            return Encoding::ASCII7;

        case State::ValidUTF8:
            return Encoding::UTF8;

        case State::ValidUTF16LE:
            return Encoding::UTF16LE;

        case State::ValidUTF16BE:
            return Encoding::UTF16BE;

        // Explicitly 8-bit, or stopped in the middle of a UTF-8 sequence
        case State::Unknown8Bit:
        case State::UTF8LeadingByteSeen:
            return Encoding::ASCII8;

        // Any remaining state is treated as generic 8-bit as well
        default:
            return Encoding::ASCII8;
    }
}