1 /* 2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include <cassert> 21 #include <cstdlib> 22 #include <arpa/inet.h> 23 24 #include <iostream> 25 26 #include "data/compressedlinestorage.h" 27 28 namespace { 29 // Functions to manipulate blocks 30 31 // Create a new 32 bits block of the passed size, 32 // initialised at the passed position 33 char* block32_new( int block_size, uint32_t initial_position, 34 char** block_ptr ) 35 { 36 // malloc a block of the maximum possible size 37 // (where every line is >16384) 38 char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) ); 39 40 if ( ptr ) { 41 // Write the initial_position 42 *(reinterpret_cast<uint32_t*>(ptr)) = initial_position; 43 *block_ptr = ptr + 4; 44 } 45 46 return ptr; 47 } 48 49 // Add a one byte relative delta (0-127) and inc pointer 50 // First bit is always 0 51 void block_add_one_byte_relative( char** ptr, uint8_t value ) 52 { 53 **ptr = value; 54 *ptr += sizeof( value ); 55 } 56 57 // Add a two bytes relative delta (0-16383) and inc pointer 58 // First 2 bits are always 10 59 void block_add_two_bytes_relative( char** ptr, uint16_t value ) 60 { 61 // Stored in big endian format in order to recognise the initial pattern: 62 // 10xx xxxx xxxx xxxx 63 // HO byte | LO byte 64 *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) ); 65 *ptr += sizeof( value ); 66 } 67 68 // Add a signal and a 4 bytes absolute position and inc pointer 69 void block32_add_absolute( char** ptr, uint32_t value ) 70 { 71 // 2 bytes marker (actually only the first two bits are tested) 72 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 73 *ptr += sizeof( uint16_t ); 74 // Absolute value (machine endian) 75 *(reinterpret_cast<uint32_t*>(*ptr)) = value; 76 *ptr += sizeof( uint32_t ); 77 } 78 79 // Initialise the passed block for reading, returning 80 // the initial position and a pointer to the second entry. 81 uint64_t block32_initial_pos( char* block, char** ptr ) 82 { 83 *ptr = block + sizeof( uint32_t ); 84 return *(reinterpret_cast<uint32_t*>(block)); 85 } 86 87 // Give the next position in the block based on the previous 88 // position, then increase the pointer. 89 uint64_t block32_next_pos( char** ptr, uint64_t previous_pos ) 90 { 91 uint64_t pos = previous_pos; 92 93 uint8_t byte = **ptr; 94 ++(*ptr); 95 if ( ! ( byte & 0x80 ) ) { 96 // High order bit is 0 97 pos += byte; 98 } 99 else if ( ( byte & 0xC0 ) == 0x80 ) { 100 // We need to read the low order byte 101 uint8_t lo_byte = **ptr; 102 ++(*ptr); 103 // Remove the starting 10b 104 byte &= ~0xC0; 105 // And form the displacement (stored as big endian) 106 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 107 } 108 else { 109 // Skip the next byte (not used) 110 ++(*ptr); 111 // And read the new absolute pos (machine endian) 112 pos = *(reinterpret_cast<uint32_t*>(*ptr)); 113 *ptr += sizeof( uint32_t ); 114 } 115 116 return pos; 117 } 118 } 119 120 void CompressedLinePositionStorage::move_from( 121 CompressedLinePositionStorage&& orig ) 122 { 123 nb_lines_ = orig.nb_lines_; 124 first_long_line_ = orig.first_long_line_; 125 current_pos_ = orig.current_pos_; 126 block_pointer_ = orig.block_pointer_; 127 previous_block_pointer_ = orig.previous_block_pointer_; 128 129 orig.nb_lines_ = 0; 130 } 131 132 // Move constructor 133 CompressedLinePositionStorage::CompressedLinePositionStorage( 134 CompressedLinePositionStorage&& orig ) 135 : block32_index_( std::move( orig.block32_index_ ) ) 136 { 137 move_from( std::move( orig ) ); 138 } 139 140 // Move assignement 141 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=( 142 CompressedLinePositionStorage&& orig ) 143 { 144 block32_index_ = std::move( orig.block32_index_ ); 145 move_from( std::move( orig ) ); 146 147 return *this; 148 } 149 150 CompressedLinePositionStorage::~CompressedLinePositionStorage() 151 { 152 for ( char* block : block32_index_ ) { 153 void* p = static_cast<void*>( block ); 154 // std::cerr << "block = " << p << std::endl; 155 free( p ); 156 } 157 } 158 159 // template<int BLOCK_SIZE> 160 void CompressedLinePositionStorage::append( uint64_t pos ) 161 { 162 // Save the pointer in case we need to "pop_back" 163 previous_block_pointer_ = block_pointer_; 164 165 if ( ! block_pointer_ ) { 166 // We need to start a new block 167 block32_index_.push_back( 168 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 169 } 170 else { 171 uint64_t delta = pos - current_pos_; 172 if ( delta < 128 ) { 173 // Code relative on one byte 174 block_add_one_byte_relative( &block_pointer_, delta ); 175 } 176 else if ( delta < 16384 ) { 177 // Code relative on two bytes 178 block_add_two_bytes_relative( &block_pointer_, delta ); 179 } 180 else { 181 // Code absolute 182 block32_add_absolute( &block_pointer_, pos ); 183 } 184 } 185 186 current_pos_ = pos; 187 ++nb_lines_; 188 189 if ( nb_lines_ % BLOCK_SIZE == 0 ) { 190 // We have finished the block 191 192 // Let's reduce its size to what is actually used 193 int block_index = nb_lines_ / BLOCK_SIZE - 1; 194 char* block = block32_index_[block_index]; 195 // We allocate extra space for the last element in case it 196 // is replaced by an absolute value in the future (following a pop_back) 197 size_t new_size = ( previous_block_pointer_ 198 + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block; 199 void* new_location = realloc( block, new_size ); 200 if ( new_location ) 201 block32_index_[block_index] = static_cast<char*>( new_location ); 202 203 block_pointer_ = nullptr; 204 } 205 } 206 207 // template<int BLOCK_SIZE> 208 uint64_t CompressedLinePositionStorage::at( int index ) const 209 { 210 char* block = block32_index_[ index / BLOCK_SIZE ]; 211 char* ptr; 212 uint64_t position = block32_initial_pos( block, &ptr ); 213 214 for ( int i = 0; i < index % BLOCK_SIZE; i++ ) { 215 // Go through all the lines in the block till the one we want 216 position = block32_next_pos( &ptr, position ); 217 } 218 219 return position; 220 } 221 222 void CompressedLinePositionStorage::append_list( 223 const std::vector<uint64_t>& positions ) 224 { 225 // This is not very clever, but caching should make it 226 // reasonably fast. 227 for ( uint32_t i = 0; i < positions.size(); i++ ) 228 append( positions.at( i ) ); 229 } 230 231 // template<int BLOCK_SIZE> 232 void CompressedLinePositionStorage::pop_back() 233 { 234 // Removing the last entered data, there are two cases 235 if ( previous_block_pointer_ ) { 236 // The last append was a normal entry in an existing block, 237 // so we can just revert the pointer 238 block_pointer_ = previous_block_pointer_; 239 previous_block_pointer_ = nullptr; 240 } 241 else { 242 // A new block has been created for the last entry, we need 243 // to de-alloc it. 244 245 // If we try to pop_back() twice, we're dead! 246 assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 ); 247 248 char* block = block32_index_.back(); 249 block32_index_.pop_back(); 250 free( block ); 251 252 253 block_pointer_ = nullptr; 254 } 255 256 --nb_lines_; 257 current_pos_ = at( nb_lines_ - 1 ); 258 } 259