1 /* 2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include <cassert> 21 #include <cstdlib> 22 #include <QtEndian> 23 24 #include "utils.h" 25 26 #include "data/compressedlinestorage.h" 27 28 namespace { 29 // Functions to manipulate blocks 30 31 // Create a new 32 bits block of the passed size, 32 // initialised at the passed position 33 char* block32_new( int block_size, uint32_t initial_position, 34 char** block_ptr ) 35 { 36 // malloc a block of the maximum possible size 37 // (where every line is >16384) 38 char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) ); 39 40 if ( ptr ) { 41 // Write the initial_position 42 *(reinterpret_cast<uint32_t*>(ptr)) = initial_position; 43 *block_ptr = ptr + 4; 44 } 45 46 return ptr; 47 } 48 49 // Create a new 64 bits block of the passed size, 50 // initialised at the passed position 51 char* block64_new( int block_size, uint64_t initial_position, 52 char** block_ptr ) 53 { 54 // malloc a block of the maximum possible size 55 // (where every line is >16384) 56 char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) ); 57 58 if ( ptr ) { 59 // Write the initial_position 60 *(reinterpret_cast<uint64_t*>(ptr)) = initial_position; 61 *block_ptr = ptr + 8; 62 } 63 64 return ptr; 65 } 66 67 68 // Add a one byte relative delta (0-127) and inc pointer 69 // First bit is always 0 70 void block_add_one_byte_relative( char** ptr, uint8_t value ) 71 { 72 **ptr = value; 73 *ptr += sizeof( value ); 74 } 75 76 // Add a two bytes relative delta (0-16383) and inc pointer 77 // First 2 bits are always 10 78 void block_add_two_bytes_relative( char** ptr, uint16_t value ) 79 { 80 // Stored in big endian format in order to recognise the initial pattern: 81 // 10xx xxxx xxxx xxxx 82 // HO byte | LO byte 83 *(reinterpret_cast<uint16_t*>(*ptr)) = qToBigEndian( static_cast<uint16_t>( value | (1 << 15) ) ); 84 *ptr += sizeof( value ); 85 } 86 87 // Add a signal and a 4 bytes absolute position and inc pointer 88 void block32_add_absolute( char** ptr, uint32_t value ) 89 { 90 // 2 bytes marker (actually only the first two bits are tested) 91 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 92 *ptr += sizeof( uint16_t ); 93 // Absolute value (machine endian) 94 *(reinterpret_cast<uint32_t*>(*ptr)) = value; 95 *ptr += sizeof( uint32_t ); 96 } 97 98 // Initialise the passed block for reading, returning 99 // the initial position and a pointer to the second entry. 100 uint64_t block32_initial_pos( char* block, char** ptr ) 101 { 102 *ptr = block + sizeof( uint32_t ); 103 return *(reinterpret_cast<uint32_t*>(block)); 104 } 105 106 // Give the next position in the block based on the previous 107 // position, then increase the pointer. 108 uint64_t block32_next_pos( char** ptr, uint64_t previous_pos ) 109 { 110 uint64_t pos = previous_pos; 111 112 uint8_t byte = **ptr; 113 ++(*ptr); 114 if ( ! ( byte & 0x80 ) ) { 115 // High order bit is 0 116 pos += byte; 117 } 118 else if ( ( byte & 0xC0 ) == 0x80 ) { 119 // We need to read the low order byte 120 uint8_t lo_byte = **ptr; 121 ++(*ptr); 122 // Remove the starting 10b 123 byte &= ~0xC0; 124 // And form the displacement (stored as big endian) 125 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 126 } 127 else { 128 // Skip the next byte (not used) 129 ++(*ptr); 130 // And read the new absolute pos (machine endian) 131 pos = *(reinterpret_cast<uint32_t*>(*ptr)); 132 *ptr += sizeof( uint32_t ); 133 } 134 135 return pos; 136 } 137 138 // Add a signal and a 8 bytes absolute position and inc pointer 139 void block64_add_absolute( char** ptr, uint64_t value ) 140 { 141 // This is unaligned, can cause problem on some CPUs 142 143 // 2 bytes marker (actually only the first two bits are tested) 144 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 145 *ptr += sizeof( uint16_t ); 146 // Absolute value (machine endian) 147 *(reinterpret_cast<uint64_t*>(*ptr)) = value; 148 *ptr += sizeof( uint64_t ); 149 } 150 151 // Initialise the passed block for reading, returning 152 // the initial position and a pointer to the second entry. 153 uint64_t block64_initial_pos( char* block, char** ptr ) 154 { 155 *ptr = block + sizeof( uint64_t ); 156 return *(reinterpret_cast<uint64_t*>(block)); 157 } 158 159 // Give the next position in the block based on the previous 160 // position, then increase the pointer. 161 uint64_t block64_next_pos( char** ptr, uint64_t previous_pos ) 162 { 163 uint64_t pos = previous_pos; 164 165 uint8_t byte = **ptr; 166 ++(*ptr); 167 if ( ! ( byte & 0x80 ) ) { 168 // High order bit is 0 169 pos += byte; 170 } 171 else if ( ( byte & 0xC0 ) == 0x80 ) { 172 // We need to read the low order byte 173 uint8_t lo_byte = **ptr; 174 ++(*ptr); 175 // Remove the starting 10b 176 byte &= ~0xC0; 177 // And form the displacement (stored as big endian) 178 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 179 } 180 else { 181 // Skip the next byte (not used) 182 ++(*ptr); 183 // And read the new absolute pos (machine endian) 184 pos = *(reinterpret_cast<uint64_t*>(*ptr)); 185 *ptr += sizeof( uint64_t ); 186 } 187 188 return pos; 189 } 190 } 191 192 void CompressedLinePositionStorage::move_from( 193 CompressedLinePositionStorage&& orig ) 194 { 195 nb_lines_ = orig.nb_lines_; 196 first_long_line_ = orig.first_long_line_; 197 current_pos_ = orig.current_pos_; 198 block_pointer_ = orig.block_pointer_; 199 previous_block_pointer_ = orig.previous_block_pointer_; 200 201 orig.nb_lines_ = 0; 202 } 203 204 // Move constructor 205 CompressedLinePositionStorage::CompressedLinePositionStorage( 206 CompressedLinePositionStorage&& orig ) 207 : block32_index_( std::move( orig.block32_index_ ) ), 208 block64_index_( std::move( orig.block64_index_ ) ) 209 { 210 move_from( std::move( orig ) ); 211 } 212 213 void CompressedLinePositionStorage::free_blocks() 214 { 215 for ( char* block : block32_index_ ) { 216 void* p = static_cast<void*>( block ); 217 free( p ); 218 } 219 220 for ( char* block : block64_index_ ) { 221 void* p = static_cast<void*>( block ); 222 free( p ); 223 } 224 } 225 226 // Move assignement 227 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=( 228 CompressedLinePositionStorage&& orig ) 229 { 230 free_blocks(); 231 232 block32_index_ = std::move( orig.block32_index_ ); 233 block64_index_ = std::move( orig.block64_index_ ); 234 move_from( std::move( orig ) ); 235 236 return *this; 237 } 238 239 CompressedLinePositionStorage::~CompressedLinePositionStorage() 240 { 241 free_blocks(); 242 } 243 244 // template<int BLOCK_SIZE> 245 void CompressedLinePositionStorage::append( uint64_t pos ) 246 { 247 // Lines must be stored in order 248 assert( ( pos > current_pos_ ) || ( pos == 0 ) ); 249 250 // Save the pointer in case we need to "pop_back" 251 previous_block_pointer_ = block_pointer_; 252 253 bool store_in_big = false; 254 if ( pos > UINT32_MAX ) { 255 store_in_big = true; 256 if ( first_long_line_ == UINT32_MAX ) { 257 // First "big" end of line, we will start a new (64) block 258 first_long_line_ = nb_lines_; 259 block_pointer_ = nullptr; 260 } 261 } 262 263 if ( ! block_pointer_ ) { 264 // We need to start a new block 265 if ( ! store_in_big ) 266 block32_index_.push_back( 267 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 268 else 269 block64_index_.push_back( 270 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 271 } 272 else { 273 uint64_t delta = pos - current_pos_; 274 if ( delta < 128 ) { 275 // Code relative on one byte 276 block_add_one_byte_relative( &block_pointer_, delta ); 277 } 278 else if ( delta < 16384 ) { 279 // Code relative on two bytes 280 block_add_two_bytes_relative( &block_pointer_, delta ); 281 } 282 else { 283 // Code absolute 284 if ( ! store_in_big ) 285 block32_add_absolute( &block_pointer_, pos ); 286 else 287 block64_add_absolute( &block_pointer_, pos ); 288 } 289 } 290 291 current_pos_ = pos; 292 ++nb_lines_; 293 294 if ( ! store_in_big ) { 295 if ( nb_lines_ % BLOCK_SIZE == 0 ) { 296 // We have finished the block 297 298 // Let's reduce its size to what is actually used 299 int block_index = nb_lines_ / BLOCK_SIZE - 1; 300 char* block = block32_index_[block_index]; 301 // We allocate extra space for the last element in case it 302 // is replaced by an absolute value in the future (following a pop_back) 303 size_t new_size = ( previous_block_pointer_ 304 + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block; 305 void* new_location = realloc( block, new_size ); 306 if ( new_location ) 307 block32_index_[block_index] = static_cast<char*>( new_location ); 308 309 block_pointer_ = nullptr; 310 previous_block_pointer_ = static_cast<char*>( new_location ) + ( previous_block_pointer_ - block ); 311 } 312 } 313 else { 314 if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) { 315 // We have finished the block 316 317 // Let's reduce its size to what is actually used 318 int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1; 319 char* block = block64_index_[block_index]; 320 // We allocate extra space for the last element in case it 321 // is replaced by an absolute value in the future (following a pop_back) 322 size_t new_size = ( previous_block_pointer_ 323 + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block; 324 void* new_location = realloc( block, new_size ); 325 if ( new_location ) 326 block64_index_[block_index] = static_cast<char*>( new_location ); 327 328 block_pointer_ = nullptr; 329 previous_block_pointer_ = static_cast<char*>( new_location ) + ( previous_block_pointer_ - block ); 330 } 331 } 332 } 333 334 // template<int BLOCK_SIZE> 335 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const 336 { 337 char* ptr; 338 uint64_t position; 339 340 Cache* last_read = last_read_.getPtr(); 341 342 if ( index < first_long_line_ ) { 343 if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) { 344 position = last_read->position; 345 ptr = last_read->ptr; 346 347 position = block32_next_pos( &ptr, position ); 348 } 349 else { 350 char* block = block32_index_[ index / BLOCK_SIZE ]; 351 position = block32_initial_pos( block, &ptr ); 352 353 for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) { 354 // Go through all the lines in the block till the one we want 355 position = block32_next_pos( &ptr, position ); 356 } 357 } 358 } 359 else { 360 const uint32_t index_in_64 = index - first_long_line_; 361 if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) { 362 position = last_read->position; 363 ptr = last_read->ptr; 364 365 position = block64_next_pos( &ptr, position ); 366 } 367 else { 368 char* block = block64_index_[ index_in_64 / BLOCK_SIZE ]; 369 position = block64_initial_pos( block, &ptr ); 370 371 for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) { 372 // Go through all the lines in the block till the one we want 373 position = block64_next_pos( &ptr, position ); 374 } 375 } 376 } 377 378 // Populate our cache ready for next consecutive read 379 last_read->index = index; 380 last_read->position = position; 381 last_read->ptr = ptr; 382 383 return position; 384 } 385 386 void CompressedLinePositionStorage::append_list( 387 const std::vector<uint64_t>& positions ) 388 { 389 // This is not very clever, but caching should make it 390 // reasonably fast. 391 for ( uint32_t i = 0; i < positions.size(); i++ ) 392 append( positions.at( i ) ); 393 } 394 395 // template<int BLOCK_SIZE> 396 void CompressedLinePositionStorage::pop_back() 397 { 398 // Removing the last entered data, there are two cases 399 if ( previous_block_pointer_ ) { 400 // The last append was a normal entry in an existing block, 401 // so we can just revert the pointer 402 block_pointer_ = previous_block_pointer_; 403 previous_block_pointer_ = nullptr; 404 } 405 else { 406 // A new block has been created for the last entry, we need 407 // to de-alloc it. 408 409 if ( first_long_line_ == UINT32_MAX ) { 410 // If we try to pop_back() twice, we're dead! 411 assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 ); 412 413 char* block = block32_index_.back(); 414 block32_index_.pop_back(); 415 free( block ); 416 } 417 else { 418 // If we try to pop_back() twice, we're dead! 419 assert( ( nb_lines_ - first_long_line_ - 1 ) % BLOCK_SIZE == 0 ); 420 421 char* block = block64_index_.back(); 422 block64_index_.pop_back(); 423 free( block ); 424 } 425 426 block_pointer_ = nullptr; 427 } 428 429 --nb_lines_; 430 current_pos_ = at( nb_lines_ - 1 ); 431 } 432