1 /* 2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include <cassert> 21 #include <cstdlib> 22 #include <iostream> 23 24 #include "utils.h" 25 26 #include "data/compressedlinestorage.h" 27 28 #ifdef HAVE_HTONS 29 # include <arpa/inet.h> 30 #else 31 # define htons(a) glogg_htons(a) 32 #endif 33 34 namespace { 35 // Functions to manipulate blocks 36 37 // Create a new 32 bits block of the passed size, 38 // initialised at the passed position 39 char* block32_new( int block_size, uint32_t initial_position, 40 char** block_ptr ) 41 { 42 // malloc a block of the maximum possible size 43 // (where every line is >16384) 44 char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) ); 45 46 if ( ptr ) { 47 // Write the initial_position 48 *(reinterpret_cast<uint32_t*>(ptr)) = initial_position; 49 *block_ptr = ptr + 4; 50 } 51 52 return ptr; 53 } 54 55 // Create a new 64 bits block of the passed size, 56 // initialised at the passed position 57 char* block64_new( int block_size, uint64_t initial_position, 58 char** block_ptr ) 59 { 60 // malloc a block of the maximum possible size 61 // (where every line is >16384) 62 char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) ); 63 64 if ( ptr ) { 65 // Write the initial_position 66 *(reinterpret_cast<uint64_t*>(ptr)) = initial_position; 67 *block_ptr = ptr + 8; 68 } 69 70 return ptr; 71 } 72 73 74 // Add a one byte relative delta (0-127) and inc pointer 75 // First bit is always 0 76 void block_add_one_byte_relative( char** ptr, uint8_t value ) 77 { 78 **ptr = value; 79 *ptr += sizeof( value ); 80 } 81 82 // Add a two bytes relative delta (0-16383) and inc pointer 83 // First 2 bits are always 10 84 void block_add_two_bytes_relative( char** ptr, uint16_t value ) 85 { 86 // Stored in big endian format in order to recognise the initial pattern: 87 // 10xx xxxx xxxx xxxx 88 // HO byte | LO byte 89 *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) ); 90 *ptr += sizeof( value ); 91 } 92 93 // Add a signal and a 4 bytes absolute position and inc pointer 94 void block32_add_absolute( char** ptr, uint32_t value ) 95 { 96 // 2 bytes marker (actually only the first two bits are tested) 97 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 98 *ptr += sizeof( uint16_t ); 99 // Absolute value (machine endian) 100 *(reinterpret_cast<uint32_t*>(*ptr)) = value; 101 *ptr += sizeof( uint32_t ); 102 } 103 104 // Initialise the passed block for reading, returning 105 // the initial position and a pointer to the second entry. 106 uint64_t block32_initial_pos( char* block, char** ptr ) 107 { 108 *ptr = block + sizeof( uint32_t ); 109 return *(reinterpret_cast<uint32_t*>(block)); 110 } 111 112 // Give the next position in the block based on the previous 113 // position, then increase the pointer. 114 uint64_t block32_next_pos( char** ptr, uint64_t previous_pos ) 115 { 116 uint64_t pos = previous_pos; 117 118 uint8_t byte = **ptr; 119 ++(*ptr); 120 if ( ! ( byte & 0x80 ) ) { 121 // High order bit is 0 122 pos += byte; 123 } 124 else if ( ( byte & 0xC0 ) == 0x80 ) { 125 // We need to read the low order byte 126 uint8_t lo_byte = **ptr; 127 ++(*ptr); 128 // Remove the starting 10b 129 byte &= ~0xC0; 130 // And form the displacement (stored as big endian) 131 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 132 } 133 else { 134 // Skip the next byte (not used) 135 ++(*ptr); 136 // And read the new absolute pos (machine endian) 137 pos = *(reinterpret_cast<uint32_t*>(*ptr)); 138 *ptr += sizeof( uint32_t ); 139 } 140 141 return pos; 142 } 143 144 // Add a signal and a 8 bytes absolute position and inc pointer 145 void block64_add_absolute( char** ptr, uint64_t value ) 146 { 147 // This is unaligned, can cause problem on some CPUs 148 149 // 2 bytes marker (actually only the first two bits are tested) 150 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 151 *ptr += sizeof( uint16_t ); 152 // Absolute value (machine endian) 153 *(reinterpret_cast<uint64_t*>(*ptr)) = value; 154 *ptr += sizeof( uint64_t ); 155 } 156 157 // Initialise the passed block for reading, returning 158 // the initial position and a pointer to the second entry. 159 uint64_t block64_initial_pos( char* block, char** ptr ) 160 { 161 *ptr = block + sizeof( uint64_t ); 162 return *(reinterpret_cast<uint64_t*>(block)); 163 } 164 165 // Give the next position in the block based on the previous 166 // position, then increase the pointer. 167 uint64_t block64_next_pos( char** ptr, uint64_t previous_pos ) 168 { 169 uint64_t pos = previous_pos; 170 171 uint8_t byte = **ptr; 172 ++(*ptr); 173 if ( ! ( byte & 0x80 ) ) { 174 // High order bit is 0 175 pos += byte; 176 } 177 else if ( ( byte & 0xC0 ) == 0x80 ) { 178 // We need to read the low order byte 179 uint8_t lo_byte = **ptr; 180 ++(*ptr); 181 // Remove the starting 10b 182 byte &= ~0xC0; 183 // And form the displacement (stored as big endian) 184 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 185 } 186 else { 187 // Skip the next byte (not used) 188 ++(*ptr); 189 // And read the new absolute pos (machine endian) 190 pos = *(reinterpret_cast<uint64_t*>(*ptr)); 191 *ptr += sizeof( uint64_t ); 192 } 193 194 return pos; 195 } 196 } 197 198 void CompressedLinePositionStorage::move_from( 199 CompressedLinePositionStorage&& orig ) 200 { 201 nb_lines_ = orig.nb_lines_; 202 first_long_line_ = orig.first_long_line_; 203 current_pos_ = orig.current_pos_; 204 block_pointer_ = orig.block_pointer_; 205 previous_block_pointer_ = orig.previous_block_pointer_; 206 207 orig.nb_lines_ = 0; 208 } 209 210 // Move constructor 211 CompressedLinePositionStorage::CompressedLinePositionStorage( 212 CompressedLinePositionStorage&& orig ) 213 : block32_index_( std::move( orig.block32_index_ ) ) 214 { 215 move_from( std::move( orig ) ); 216 } 217 218 void CompressedLinePositionStorage::free_blocks() 219 { 220 for ( char* block : block32_index_ ) { 221 void* p = static_cast<void*>( block ); 222 std::cerr << "free block = " << p << std::endl; 223 free( p ); 224 } 225 226 for ( char* block : block64_index_ ) { 227 void* p = static_cast<void*>( block ); 228 std::cerr << "block = " << p << std::endl; 229 free( p ); 230 } 231 } 232 233 // Move assignement 234 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=( 235 CompressedLinePositionStorage&& orig ) 236 { 237 free_blocks(); 238 239 block32_index_ = std::move( orig.block32_index_ ); 240 move_from( std::move( orig ) ); 241 242 return *this; 243 } 244 245 CompressedLinePositionStorage::~CompressedLinePositionStorage() 246 { 247 248 free_blocks(); 249 } 250 251 // template<int BLOCK_SIZE> 252 void CompressedLinePositionStorage::append( uint64_t pos ) 253 { 254 // Save the pointer in case we need to "pop_back" 255 previous_block_pointer_ = block_pointer_; 256 257 bool store_in_big = false; 258 if ( pos > UINT32_MAX ) { 259 store_in_big = true; 260 if ( first_long_line_ == UINT32_MAX ) { 261 // First "big" end of line, we will start a new (64) block 262 first_long_line_ = nb_lines_; 263 block_pointer_ = nullptr; 264 } 265 } 266 267 if ( ! block_pointer_ ) { 268 // We need to start a new block 269 if ( ! store_in_big ) 270 block32_index_.push_back( 271 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 272 else 273 block64_index_.push_back( 274 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 275 } 276 else { 277 uint64_t delta = pos - current_pos_; 278 if ( delta < 128 ) { 279 // Code relative on one byte 280 block_add_one_byte_relative( &block_pointer_, delta ); 281 } 282 else if ( delta < 16384 ) { 283 // Code relative on two bytes 284 block_add_two_bytes_relative( &block_pointer_, delta ); 285 } 286 else { 287 // Code absolute 288 if ( ! store_in_big ) 289 block32_add_absolute( &block_pointer_, pos ); 290 else 291 block64_add_absolute( &block_pointer_, pos ); 292 } 293 } 294 295 current_pos_ = pos; 296 ++nb_lines_; 297 298 if ( ! store_in_big ) { 299 if ( nb_lines_ % BLOCK_SIZE == 0 ) { 300 // We have finished the block 301 302 // Let's reduce its size to what is actually used 303 int block_index = nb_lines_ / BLOCK_SIZE - 1; 304 char* block = block32_index_[block_index]; 305 // We allocate extra space for the last element in case it 306 // is replaced by an absolute value in the future (following a pop_back) 307 size_t new_size = ( previous_block_pointer_ 308 + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block; 309 void* new_location = realloc( block, new_size ); 310 if ( new_location ) 311 block32_index_[block_index] = static_cast<char*>( new_location ); 312 313 block_pointer_ = nullptr; 314 } 315 } 316 else { 317 if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) { 318 // We have finished the block 319 320 // Let's reduce its size to what is actually used 321 int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1; 322 char* block = block64_index_[block_index]; 323 // We allocate extra space for the last element in case it 324 // is replaced by an absolute value in the future (following a pop_back) 325 size_t new_size = ( previous_block_pointer_ 326 + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block; 327 void* new_location = realloc( block, new_size ); 328 if ( new_location ) 329 block64_index_[block_index] = static_cast<char*>( new_location ); 330 331 block_pointer_ = nullptr; 332 } 333 } 334 } 335 336 // template<int BLOCK_SIZE> 337 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const 338 { 339 char* ptr; 340 uint64_t position; 341 342 Cache* last_read = last_read_.getPtr(); 343 344 if ( index < first_long_line_ ) { 345 if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) { 346 position = last_read->position; 347 ptr = last_read->ptr; 348 349 position = block32_next_pos( &ptr, position ); 350 } 351 else { 352 char* block = block32_index_[ index / BLOCK_SIZE ]; 353 position = block32_initial_pos( block, &ptr ); 354 355 for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) { 356 // Go through all the lines in the block till the one we want 357 position = block32_next_pos( &ptr, position ); 358 } 359 } 360 } 361 else { 362 const uint32_t index_in_64 = index - first_long_line_; 363 if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) { 364 position = last_read->position; 365 ptr = last_read->ptr; 366 367 position = block64_next_pos( &ptr, position ); 368 } 369 else { 370 char* block = block64_index_[ index_in_64 / BLOCK_SIZE ]; 371 position = block64_initial_pos( block, &ptr ); 372 373 for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) { 374 // Go through all the lines in the block till the one we want 375 position = block64_next_pos( &ptr, position ); 376 } 377 } 378 } 379 380 // Populate our cache ready for next consecutive read 381 last_read->index = index; 382 last_read->position = position; 383 last_read->ptr = ptr; 384 385 return position; 386 } 387 388 void CompressedLinePositionStorage::append_list( 389 const std::vector<uint64_t>& positions ) 390 { 391 // This is not very clever, but caching should make it 392 // reasonably fast. 393 for ( uint32_t i = 0; i < positions.size(); i++ ) 394 append( positions.at( i ) ); 395 } 396 397 // template<int BLOCK_SIZE> 398 void CompressedLinePositionStorage::pop_back() 399 { 400 // Removing the last entered data, there are two cases 401 if ( previous_block_pointer_ ) { 402 // The last append was a normal entry in an existing block, 403 // so we can just revert the pointer 404 block_pointer_ = previous_block_pointer_; 405 previous_block_pointer_ = nullptr; 406 } 407 else { 408 // A new block has been created for the last entry, we need 409 // to de-alloc it. 410 411 // If we try to pop_back() twice, we're dead! 412 assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 ); 413 414 char* block = block32_index_.back(); 415 block32_index_.pop_back(); 416 free( block ); 417 418 419 block_pointer_ = nullptr; 420 } 421 422 --nb_lines_; 423 current_pos_ = at( nb_lines_ - 1 ); 424 } 425