1 /* 2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include <cassert> 21 #include <cstdlib> 22 23 #include "utils.h" 24 25 #include "data/compressedlinestorage.h" 26 27 #ifdef HAVE_HTONS 28 # include <arpa/inet.h> 29 #else 30 # define htons(a) glogg_htons(a) 31 #endif 32 33 namespace { 34 // Functions to manipulate blocks 35 36 // Create a new 32 bits block of the passed size, 37 // initialised at the passed position 38 char* block32_new( int block_size, uint32_t initial_position, 39 char** block_ptr ) 40 { 41 // malloc a block of the maximum possible size 42 // (where every line is >16384) 43 char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) ); 44 45 if ( ptr ) { 46 // Write the initial_position 47 *(reinterpret_cast<uint32_t*>(ptr)) = initial_position; 48 *block_ptr = ptr + 4; 49 } 50 51 return ptr; 52 } 53 54 // Create a new 64 bits block of the passed size, 55 // initialised at the passed position 56 char* block64_new( int block_size, uint64_t initial_position, 57 char** block_ptr ) 58 { 59 // malloc a block of the maximum possible size 60 // (where every line is >16384) 61 char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) ); 62 63 if ( ptr ) { 64 // Write the initial_position 65 *(reinterpret_cast<uint64_t*>(ptr)) = initial_position; 66 *block_ptr = ptr + 8; 67 } 68 69 return ptr; 70 } 71 72 73 // Add a one byte relative delta (0-127) and inc pointer 74 // First bit is always 0 75 void block_add_one_byte_relative( char** ptr, uint8_t value ) 76 { 77 **ptr = value; 78 *ptr += sizeof( value ); 79 } 80 81 // Add a two bytes relative delta (0-16383) and inc pointer 82 // First 2 bits are always 10 83 void block_add_two_bytes_relative( char** ptr, uint16_t value ) 84 { 85 // Stored in big endian format in order to recognise the initial pattern: 86 // 10xx xxxx xxxx xxxx 87 // HO byte | LO byte 88 *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) ); 89 *ptr += sizeof( value ); 90 } 91 92 // Add a signal and a 4 bytes absolute position and inc pointer 93 void block32_add_absolute( char** ptr, uint32_t value ) 94 { 95 // 2 bytes marker (actually only the first two bits are tested) 96 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 97 *ptr += sizeof( uint16_t ); 98 // Absolute value (machine endian) 99 *(reinterpret_cast<uint32_t*>(*ptr)) = value; 100 *ptr += sizeof( uint32_t ); 101 } 102 103 // Initialise the passed block for reading, returning 104 // the initial position and a pointer to the second entry. 105 uint64_t block32_initial_pos( char* block, char** ptr ) 106 { 107 *ptr = block + sizeof( uint32_t ); 108 return *(reinterpret_cast<uint32_t*>(block)); 109 } 110 111 // Give the next position in the block based on the previous 112 // position, then increase the pointer. 113 uint64_t block32_next_pos( char** ptr, uint64_t previous_pos ) 114 { 115 uint64_t pos = previous_pos; 116 117 uint8_t byte = **ptr; 118 ++(*ptr); 119 if ( ! ( byte & 0x80 ) ) { 120 // High order bit is 0 121 pos += byte; 122 } 123 else if ( ( byte & 0xC0 ) == 0x80 ) { 124 // We need to read the low order byte 125 uint8_t lo_byte = **ptr; 126 ++(*ptr); 127 // Remove the starting 10b 128 byte &= ~0xC0; 129 // And form the displacement (stored as big endian) 130 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 131 } 132 else { 133 // Skip the next byte (not used) 134 ++(*ptr); 135 // And read the new absolute pos (machine endian) 136 pos = *(reinterpret_cast<uint32_t*>(*ptr)); 137 *ptr += sizeof( uint32_t ); 138 } 139 140 return pos; 141 } 142 143 // Add a signal and a 8 bytes absolute position and inc pointer 144 void block64_add_absolute( char** ptr, uint64_t value ) 145 { 146 // This is unaligned, can cause problem on some CPUs 147 148 // 2 bytes marker (actually only the first two bits are tested) 149 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 150 *ptr += sizeof( uint16_t ); 151 // Absolute value (machine endian) 152 *(reinterpret_cast<uint64_t*>(*ptr)) = value; 153 *ptr += sizeof( uint64_t ); 154 } 155 156 // Initialise the passed block for reading, returning 157 // the initial position and a pointer to the second entry. 158 uint64_t block64_initial_pos( char* block, char** ptr ) 159 { 160 *ptr = block + sizeof( uint64_t ); 161 return *(reinterpret_cast<uint64_t*>(block)); 162 } 163 164 // Give the next position in the block based on the previous 165 // position, then increase the pointer. 166 uint64_t block64_next_pos( char** ptr, uint64_t previous_pos ) 167 { 168 uint64_t pos = previous_pos; 169 170 uint8_t byte = **ptr; 171 ++(*ptr); 172 if ( ! ( byte & 0x80 ) ) { 173 // High order bit is 0 174 pos += byte; 175 } 176 else if ( ( byte & 0xC0 ) == 0x80 ) { 177 // We need to read the low order byte 178 uint8_t lo_byte = **ptr; 179 ++(*ptr); 180 // Remove the starting 10b 181 byte &= ~0xC0; 182 // And form the displacement (stored as big endian) 183 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 184 } 185 else { 186 // Skip the next byte (not used) 187 ++(*ptr); 188 // And read the new absolute pos (machine endian) 189 pos = *(reinterpret_cast<uint64_t*>(*ptr)); 190 *ptr += sizeof( uint64_t ); 191 } 192 193 return pos; 194 } 195 } 196 197 void CompressedLinePositionStorage::move_from( 198 CompressedLinePositionStorage&& orig ) 199 { 200 nb_lines_ = orig.nb_lines_; 201 first_long_line_ = orig.first_long_line_; 202 current_pos_ = orig.current_pos_; 203 block_pointer_ = orig.block_pointer_; 204 previous_block_pointer_ = orig.previous_block_pointer_; 205 206 orig.nb_lines_ = 0; 207 } 208 209 // Move constructor 210 CompressedLinePositionStorage::CompressedLinePositionStorage( 211 CompressedLinePositionStorage&& orig ) 212 : block32_index_( std::move( orig.block32_index_ ) ), 213 block64_index_( std::move( orig.block64_index_ ) ) 214 { 215 move_from( std::move( orig ) ); 216 } 217 218 void CompressedLinePositionStorage::free_blocks() 219 { 220 for ( char* block : block32_index_ ) { 221 void* p = static_cast<void*>( block ); 222 free( p ); 223 } 224 225 for ( char* block : block64_index_ ) { 226 void* p = static_cast<void*>( block ); 227 free( p ); 228 } 229 } 230 231 // Move assignement 232 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=( 233 CompressedLinePositionStorage&& orig ) 234 { 235 free_blocks(); 236 237 block32_index_ = std::move( orig.block32_index_ ); 238 block64_index_ = std::move( orig.block64_index_ ); 239 move_from( std::move( orig ) ); 240 241 return *this; 242 } 243 244 CompressedLinePositionStorage::~CompressedLinePositionStorage() 245 { 246 free_blocks(); 247 } 248 249 // template<int BLOCK_SIZE> 250 void CompressedLinePositionStorage::append( uint64_t pos ) 251 { 252 // Lines must be stored in order 253 assert( ( pos > current_pos_ ) || ( pos == 0 ) ); 254 255 // Save the pointer in case we need to "pop_back" 256 previous_block_pointer_ = block_pointer_; 257 258 bool store_in_big = false; 259 if ( pos > UINT32_MAX ) { 260 store_in_big = true; 261 if ( first_long_line_ == UINT32_MAX ) { 262 // First "big" end of line, we will start a new (64) block 263 first_long_line_ = nb_lines_; 264 block_pointer_ = nullptr; 265 } 266 } 267 268 if ( ! block_pointer_ ) { 269 // We need to start a new block 270 if ( ! store_in_big ) 271 block32_index_.push_back( 272 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 273 else 274 block64_index_.push_back( 275 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 276 } 277 else { 278 uint64_t delta = pos - current_pos_; 279 if ( delta < 128 ) { 280 // Code relative on one byte 281 block_add_one_byte_relative( &block_pointer_, delta ); 282 } 283 else if ( delta < 16384 ) { 284 // Code relative on two bytes 285 block_add_two_bytes_relative( &block_pointer_, delta ); 286 } 287 else { 288 // Code absolute 289 if ( ! store_in_big ) 290 block32_add_absolute( &block_pointer_, pos ); 291 else 292 block64_add_absolute( &block_pointer_, pos ); 293 } 294 } 295 296 current_pos_ = pos; 297 ++nb_lines_; 298 299 if ( ! store_in_big ) { 300 if ( nb_lines_ % BLOCK_SIZE == 0 ) { 301 // We have finished the block 302 303 // Let's reduce its size to what is actually used 304 int block_index = nb_lines_ / BLOCK_SIZE - 1; 305 char* block = block32_index_[block_index]; 306 // We allocate extra space for the last element in case it 307 // is replaced by an absolute value in the future (following a pop_back) 308 size_t new_size = ( previous_block_pointer_ 309 + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block; 310 void* new_location = realloc( block, new_size ); 311 if ( new_location ) 312 block32_index_[block_index] = static_cast<char*>( new_location ); 313 314 block_pointer_ = nullptr; 315 previous_block_pointer_ = static_cast<char*>( new_location ) + ( previous_block_pointer_ - block ); 316 } 317 } 318 else { 319 if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) { 320 // We have finished the block 321 322 // Let's reduce its size to what is actually used 323 int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1; 324 char* block = block64_index_[block_index]; 325 // We allocate extra space for the last element in case it 326 // is replaced by an absolute value in the future (following a pop_back) 327 size_t new_size = ( previous_block_pointer_ 328 + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block; 329 void* new_location = realloc( block, new_size ); 330 if ( new_location ) 331 block64_index_[block_index] = static_cast<char*>( new_location ); 332 333 block_pointer_ = nullptr; 334 previous_block_pointer_ = static_cast<char*>( new_location ) + ( previous_block_pointer_ - block ); 335 } 336 } 337 } 338 339 // template<int BLOCK_SIZE> 340 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const 341 { 342 char* ptr; 343 uint64_t position; 344 345 Cache* last_read = last_read_.getPtr(); 346 347 if ( index < first_long_line_ ) { 348 if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) { 349 position = last_read->position; 350 ptr = last_read->ptr; 351 352 position = block32_next_pos( &ptr, position ); 353 } 354 else { 355 char* block = block32_index_[ index / BLOCK_SIZE ]; 356 position = block32_initial_pos( block, &ptr ); 357 358 for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) { 359 // Go through all the lines in the block till the one we want 360 position = block32_next_pos( &ptr, position ); 361 } 362 } 363 } 364 else { 365 const uint32_t index_in_64 = index - first_long_line_; 366 if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) { 367 position = last_read->position; 368 ptr = last_read->ptr; 369 370 position = block64_next_pos( &ptr, position ); 371 } 372 else { 373 char* block = block64_index_[ index_in_64 / BLOCK_SIZE ]; 374 position = block64_initial_pos( block, &ptr ); 375 376 for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) { 377 // Go through all the lines in the block till the one we want 378 position = block64_next_pos( &ptr, position ); 379 } 380 } 381 } 382 383 // Populate our cache ready for next consecutive read 384 last_read->index = index; 385 last_read->position = position; 386 last_read->ptr = ptr; 387 388 return position; 389 } 390 391 void CompressedLinePositionStorage::append_list( 392 const std::vector<uint64_t>& positions ) 393 { 394 // This is not very clever, but caching should make it 395 // reasonably fast. 396 for ( uint32_t i = 0; i < positions.size(); i++ ) 397 append( positions.at( i ) ); 398 } 399 400 // template<int BLOCK_SIZE> 401 void CompressedLinePositionStorage::pop_back() 402 { 403 // Removing the last entered data, there are two cases 404 if ( previous_block_pointer_ ) { 405 // The last append was a normal entry in an existing block, 406 // so we can just revert the pointer 407 block_pointer_ = previous_block_pointer_; 408 previous_block_pointer_ = nullptr; 409 } 410 else { 411 // A new block has been created for the last entry, we need 412 // to de-alloc it. 413 414 if ( first_long_line_ == UINT32_MAX ) { 415 // If we try to pop_back() twice, we're dead! 416 assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 ); 417 418 char* block = block32_index_.back(); 419 block32_index_.pop_back(); 420 free( block ); 421 } 422 else { 423 // If we try to pop_back() twice, we're dead! 424 assert( ( nb_lines_ - first_long_line_ - 1 ) % BLOCK_SIZE == 0 ); 425 426 char* block = block64_index_.back(); 427 block64_index_.pop_back(); 428 free( block ); 429 } 430 431 block_pointer_ = nullptr; 432 } 433 434 --nb_lines_; 435 current_pos_ = at( nb_lines_ - 1 ); 436 } 437