1 /* 2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include <cassert> 21 #include <cstdlib> 22 #include <iostream> 23 24 #include "utils.h" 25 26 #include "data/compressedlinestorage.h" 27 28 #ifdef HAVE_HTONS 29 # include <arpa/inet.h> 30 #else 31 # define htons(a) glogg_htons(a) 32 #endif 33 34 namespace { 35 // Functions to manipulate blocks 36 37 // Create a new 32 bits block of the passed size, 38 // initialised at the passed position 39 char* block32_new( int block_size, uint32_t initial_position, 40 char** block_ptr ) 41 { 42 // malloc a block of the maximum possible size 43 // (where every line is >16384) 44 char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) ); 45 46 if ( ptr ) { 47 // Write the initial_position 48 *(reinterpret_cast<uint32_t*>(ptr)) = initial_position; 49 *block_ptr = ptr + 4; 50 } 51 52 return ptr; 53 } 54 55 // Create a new 64 bits block of the passed size, 56 // initialised at the passed position 57 char* block64_new( int block_size, uint64_t initial_position, 58 char** block_ptr ) 59 { 60 // malloc a block of the maximum possible size 61 // (where every line is >16384) 62 char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) ); 63 64 if ( ptr ) { 65 // Write the initial_position 66 *(reinterpret_cast<uint64_t*>(ptr)) = initial_position; 67 *block_ptr = ptr + 8; 68 } 69 70 return ptr; 71 } 72 73 74 // Add a one byte relative delta (0-127) and inc pointer 75 // First bit is always 0 76 void block_add_one_byte_relative( char** ptr, uint8_t value ) 77 { 78 **ptr = value; 79 *ptr += sizeof( value ); 80 } 81 82 // Add a two bytes relative delta (0-16383) and inc pointer 83 // First 2 bits are always 10 84 void block_add_two_bytes_relative( char** ptr, uint16_t value ) 85 { 86 // Stored in big endian format in order to recognise the initial pattern: 87 // 10xx xxxx xxxx xxxx 88 // HO byte | LO byte 89 *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) ); 90 *ptr += sizeof( value ); 91 } 92 93 // Add a signal and a 4 bytes absolute position and inc pointer 94 void block32_add_absolute( char** ptr, uint32_t value ) 95 { 96 // 2 bytes marker (actually only the first two bits are tested) 97 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 98 *ptr += sizeof( uint16_t ); 99 // Absolute value (machine endian) 100 *(reinterpret_cast<uint32_t*>(*ptr)) = value; 101 *ptr += sizeof( uint32_t ); 102 } 103 104 // Initialise the passed block for reading, returning 105 // the initial position and a pointer to the second entry. 106 uint64_t block32_initial_pos( char* block, char** ptr ) 107 { 108 *ptr = block + sizeof( uint32_t ); 109 return *(reinterpret_cast<uint32_t*>(block)); 110 } 111 112 // Give the next position in the block based on the previous 113 // position, then increase the pointer. 114 uint64_t block32_next_pos( char** ptr, uint64_t previous_pos ) 115 { 116 uint64_t pos = previous_pos; 117 118 uint8_t byte = **ptr; 119 ++(*ptr); 120 if ( ! ( byte & 0x80 ) ) { 121 // High order bit is 0 122 pos += byte; 123 } 124 else if ( ( byte & 0xC0 ) == 0x80 ) { 125 // We need to read the low order byte 126 uint8_t lo_byte = **ptr; 127 ++(*ptr); 128 // Remove the starting 10b 129 byte &= ~0xC0; 130 // And form the displacement (stored as big endian) 131 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 132 } 133 else { 134 // Skip the next byte (not used) 135 ++(*ptr); 136 // And read the new absolute pos (machine endian) 137 pos = *(reinterpret_cast<uint32_t*>(*ptr)); 138 *ptr += sizeof( uint32_t ); 139 } 140 141 return pos; 142 } 143 144 // Add a signal and a 8 bytes absolute position and inc pointer 145 void block64_add_absolute( char** ptr, uint64_t value ) 146 { 147 // This is unaligned, can cause problem on some CPUs 148 149 // 2 bytes marker (actually only the first two bits are tested) 150 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 151 *ptr += sizeof( uint16_t ); 152 // Absolute value (machine endian) 153 *(reinterpret_cast<uint64_t*>(*ptr)) = value; 154 *ptr += sizeof( uint64_t ); 155 } 156 157 // Initialise the passed block for reading, returning 158 // the initial position and a pointer to the second entry. 159 uint64_t block64_initial_pos( char* block, char** ptr ) 160 { 161 *ptr = block + sizeof( uint64_t ); 162 return *(reinterpret_cast<uint64_t*>(block)); 163 } 164 165 // Give the next position in the block based on the previous 166 // position, then increase the pointer. 167 uint64_t block64_next_pos( char** ptr, uint64_t previous_pos ) 168 { 169 uint64_t pos = previous_pos; 170 171 uint8_t byte = **ptr; 172 ++(*ptr); 173 if ( ! ( byte & 0x80 ) ) { 174 // High order bit is 0 175 pos += byte; 176 } 177 else if ( ( byte & 0xC0 ) == 0x80 ) { 178 // We need to read the low order byte 179 uint8_t lo_byte = **ptr; 180 ++(*ptr); 181 // Remove the starting 10b 182 byte &= ~0xC0; 183 // And form the displacement (stored as big endian) 184 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 185 } 186 else { 187 // Skip the next byte (not used) 188 ++(*ptr); 189 // And read the new absolute pos (machine endian) 190 pos = *(reinterpret_cast<uint64_t*>(*ptr)); 191 *ptr += sizeof( uint64_t ); 192 } 193 194 return pos; 195 } 196 } 197 198 void CompressedLinePositionStorage::move_from( 199 CompressedLinePositionStorage&& orig ) 200 { 201 nb_lines_ = orig.nb_lines_; 202 first_long_line_ = orig.first_long_line_; 203 current_pos_ = orig.current_pos_; 204 block_pointer_ = orig.block_pointer_; 205 previous_block_pointer_ = orig.previous_block_pointer_; 206 207 orig.nb_lines_ = 0; 208 } 209 210 // Move constructor 211 CompressedLinePositionStorage::CompressedLinePositionStorage( 212 CompressedLinePositionStorage&& orig ) 213 : block32_index_( std::move( orig.block32_index_ ) ) 214 { 215 move_from( std::move( orig ) ); 216 } 217 218 void CompressedLinePositionStorage::free_blocks() 219 { 220 for ( char* block : block32_index_ ) { 221 void* p = static_cast<void*>( block ); 222 // std::cerr << "free block = " << p << std::endl; 223 free( p ); 224 } 225 226 for ( char* block : block64_index_ ) { 227 void* p = static_cast<void*>( block ); 228 // std::cerr << "block = " << p << std::endl; 229 free( p ); 230 } 231 } 232 233 // Move assignement 234 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=( 235 CompressedLinePositionStorage&& orig ) 236 { 237 free_blocks(); 238 239 block32_index_ = std::move( orig.block32_index_ ); 240 move_from( std::move( orig ) ); 241 242 return *this; 243 } 244 245 CompressedLinePositionStorage::~CompressedLinePositionStorage() 246 { 247 free_blocks(); 248 } 249 250 // template<int BLOCK_SIZE> 251 void CompressedLinePositionStorage::append( uint64_t pos ) 252 { 253 // Save the pointer in case we need to "pop_back" 254 previous_block_pointer_ = block_pointer_; 255 256 bool store_in_big = false; 257 if ( pos > UINT32_MAX ) { 258 store_in_big = true; 259 if ( first_long_line_ == UINT32_MAX ) { 260 // First "big" end of line, we will start a new (64) block 261 first_long_line_ = nb_lines_; 262 block_pointer_ = nullptr; 263 } 264 } 265 266 if ( ! block_pointer_ ) { 267 // We need to start a new block 268 if ( ! store_in_big ) 269 block32_index_.push_back( 270 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 271 else 272 block64_index_.push_back( 273 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 274 } 275 else { 276 uint64_t delta = pos - current_pos_; 277 if ( delta < 128 ) { 278 // Code relative on one byte 279 block_add_one_byte_relative( &block_pointer_, delta ); 280 } 281 else if ( delta < 16384 ) { 282 // Code relative on two bytes 283 block_add_two_bytes_relative( &block_pointer_, delta ); 284 } 285 else { 286 // Code absolute 287 if ( ! store_in_big ) 288 block32_add_absolute( &block_pointer_, pos ); 289 else 290 block64_add_absolute( &block_pointer_, pos ); 291 } 292 } 293 294 current_pos_ = pos; 295 ++nb_lines_; 296 297 if ( ! store_in_big ) { 298 if ( nb_lines_ % BLOCK_SIZE == 0 ) { 299 // We have finished the block 300 301 // Let's reduce its size to what is actually used 302 int block_index = nb_lines_ / BLOCK_SIZE - 1; 303 char* block = block32_index_[block_index]; 304 // We allocate extra space for the last element in case it 305 // is replaced by an absolute value in the future (following a pop_back) 306 size_t new_size = ( previous_block_pointer_ 307 + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block; 308 void* new_location = realloc( block, new_size ); 309 if ( new_location ) 310 block32_index_[block_index] = static_cast<char*>( new_location ); 311 312 block_pointer_ = nullptr; 313 } 314 } 315 else { 316 if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) { 317 // We have finished the block 318 319 // Let's reduce its size to what is actually used 320 int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1; 321 char* block = block64_index_[block_index]; 322 // We allocate extra space for the last element in case it 323 // is replaced by an absolute value in the future (following a pop_back) 324 size_t new_size = ( previous_block_pointer_ 325 + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block; 326 void* new_location = realloc( block, new_size ); 327 if ( new_location ) 328 block64_index_[block_index] = static_cast<char*>( new_location ); 329 330 block_pointer_ = nullptr; 331 } 332 } 333 } 334 335 // template<int BLOCK_SIZE> 336 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const 337 { 338 char* ptr; 339 uint64_t position; 340 341 Cache* last_read = last_read_.getPtr(); 342 343 if ( index < first_long_line_ ) { 344 if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) { 345 position = last_read->position; 346 ptr = last_read->ptr; 347 348 position = block32_next_pos( &ptr, position ); 349 } 350 else { 351 char* block = block32_index_[ index / BLOCK_SIZE ]; 352 position = block32_initial_pos( block, &ptr ); 353 354 for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) { 355 // Go through all the lines in the block till the one we want 356 position = block32_next_pos( &ptr, position ); 357 } 358 } 359 } 360 else { 361 const uint32_t index_in_64 = index - first_long_line_; 362 if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) { 363 position = last_read->position; 364 ptr = last_read->ptr; 365 366 position = block64_next_pos( &ptr, position ); 367 } 368 else { 369 char* block = block64_index_[ index_in_64 / BLOCK_SIZE ]; 370 position = block64_initial_pos( block, &ptr ); 371 372 for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) { 373 // Go through all the lines in the block till the one we want 374 position = block64_next_pos( &ptr, position ); 375 } 376 } 377 } 378 379 // Populate our cache ready for next consecutive read 380 last_read->index = index; 381 last_read->position = position; 382 last_read->ptr = ptr; 383 384 return position; 385 } 386 387 void CompressedLinePositionStorage::append_list( 388 const std::vector<uint64_t>& positions ) 389 { 390 // This is not very clever, but caching should make it 391 // reasonably fast. 392 for ( uint32_t i = 0; i < positions.size(); i++ ) 393 append( positions.at( i ) ); 394 } 395 396 // template<int BLOCK_SIZE> 397 void CompressedLinePositionStorage::pop_back() 398 { 399 // Removing the last entered data, there are two cases 400 if ( previous_block_pointer_ ) { 401 // The last append was a normal entry in an existing block, 402 // so we can just revert the pointer 403 block_pointer_ = previous_block_pointer_; 404 previous_block_pointer_ = nullptr; 405 } 406 else { 407 // A new block has been created for the last entry, we need 408 // to de-alloc it. 409 410 // If we try to pop_back() twice, we're dead! 411 assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 ); 412 413 char* block = block32_index_.back(); 414 block32_index_.pop_back(); 415 free( block ); 416 417 block_pointer_ = nullptr; 418 } 419 420 --nb_lines_; 421 current_pos_ = at( nb_lines_ - 1 ); 422 } 423