1 /* 2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include <cassert> 21 #include <cstdlib> 22 #include <iostream> 23 24 #include "utils.h" 25 26 #include "data/compressedlinestorage.h" 27 28 #ifdef HAVE_HTONS 29 # include <arpa/inet.h> 30 #else 31 # define htons(a) glogg_htons(a) 32 #endif 33 34 namespace { 35 // Functions to manipulate blocks 36 37 // Create a new 32 bits block of the passed size, 38 // initialised at the passed position 39 char* block32_new( int block_size, uint32_t initial_position, 40 char** block_ptr ) 41 { 42 // malloc a block of the maximum possible size 43 // (where every line is >16384) 44 char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) ); 45 46 if ( ptr ) { 47 // Write the initial_position 48 *(reinterpret_cast<uint32_t*>(ptr)) = initial_position; 49 *block_ptr = ptr + 4; 50 } 51 52 return ptr; 53 } 54 55 // Create a new 64 bits block of the passed size, 56 // initialised at the passed position 57 char* block64_new( int block_size, uint64_t initial_position, 58 char** block_ptr ) 59 { 60 // malloc a block of the maximum possible size 61 // (where every line is >16384) 62 char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) ); 63 64 if ( ptr ) { 65 // Write the initial_position 66 *(reinterpret_cast<uint64_t*>(ptr)) = initial_position; 67 *block_ptr = ptr + 8; 68 } 69 70 return ptr; 71 } 72 73 74 // Add a one byte relative delta (0-127) and inc pointer 75 // First bit is always 0 76 void block_add_one_byte_relative( char** ptr, uint8_t value ) 77 { 78 **ptr = value; 79 *ptr += sizeof( value ); 80 } 81 82 // Add a two bytes relative delta (0-16383) and inc pointer 83 // First 2 bits are always 10 84 void block_add_two_bytes_relative( char** ptr, uint16_t value ) 85 { 86 // Stored in big endian format in order to recognise the initial pattern: 87 // 10xx xxxx xxxx xxxx 88 // HO byte | LO byte 89 *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) ); 90 *ptr += sizeof( value ); 91 } 92 93 // Add a signal and a 4 bytes absolute position and inc pointer 94 void block32_add_absolute( char** ptr, uint32_t value ) 95 { 96 // 2 bytes marker (actually only the first two bits are tested) 97 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 98 *ptr += sizeof( uint16_t ); 99 // Absolute value (machine endian) 100 *(reinterpret_cast<uint32_t*>(*ptr)) = value; 101 *ptr += sizeof( uint32_t ); 102 } 103 104 // Initialise the passed block for reading, returning 105 // the initial position and a pointer to the second entry. 106 uint64_t block32_initial_pos( char* block, char** ptr ) 107 { 108 *ptr = block + sizeof( uint32_t ); 109 return *(reinterpret_cast<uint32_t*>(block)); 110 } 111 112 // Give the next position in the block based on the previous 113 // position, then increase the pointer. 114 uint64_t block32_next_pos( char** ptr, uint64_t previous_pos ) 115 { 116 uint64_t pos = previous_pos; 117 118 uint8_t byte = **ptr; 119 ++(*ptr); 120 if ( ! ( byte & 0x80 ) ) { 121 // High order bit is 0 122 pos += byte; 123 } 124 else if ( ( byte & 0xC0 ) == 0x80 ) { 125 // We need to read the low order byte 126 uint8_t lo_byte = **ptr; 127 ++(*ptr); 128 // Remove the starting 10b 129 byte &= ~0xC0; 130 // And form the displacement (stored as big endian) 131 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 132 } 133 else { 134 // Skip the next byte (not used) 135 ++(*ptr); 136 // And read the new absolute pos (machine endian) 137 pos = *(reinterpret_cast<uint32_t*>(*ptr)); 138 *ptr += sizeof( uint32_t ); 139 } 140 141 return pos; 142 } 143 144 // Add a signal and a 8 bytes absolute position and inc pointer 145 void block64_add_absolute( char** ptr, uint64_t value ) 146 { 147 // This is unaligned, can cause problem on some CPUs 148 149 // 2 bytes marker (actually only the first two bits are tested) 150 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 151 *ptr += sizeof( uint16_t ); 152 // Absolute value (machine endian) 153 *(reinterpret_cast<uint64_t*>(*ptr)) = value; 154 *ptr += sizeof( uint64_t ); 155 } 156 157 // Initialise the passed block for reading, returning 158 // the initial position and a pointer to the second entry. 159 uint64_t block64_initial_pos( char* block, char** ptr ) 160 { 161 *ptr = block + sizeof( uint64_t ); 162 return *(reinterpret_cast<uint64_t*>(block)); 163 } 164 165 // Give the next position in the block based on the previous 166 // position, then increase the pointer. 167 uint64_t block64_next_pos( char** ptr, uint64_t previous_pos ) 168 { 169 uint64_t pos = previous_pos; 170 171 uint8_t byte = **ptr; 172 ++(*ptr); 173 if ( ! ( byte & 0x80 ) ) { 174 // High order bit is 0 175 pos += byte; 176 } 177 else if ( ( byte & 0xC0 ) == 0x80 ) { 178 // We need to read the low order byte 179 uint8_t lo_byte = **ptr; 180 ++(*ptr); 181 // Remove the starting 10b 182 byte &= ~0xC0; 183 // And form the displacement (stored as big endian) 184 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 185 } 186 else { 187 // Skip the next byte (not used) 188 ++(*ptr); 189 // And read the new absolute pos (machine endian) 190 pos = *(reinterpret_cast<uint64_t*>(*ptr)); 191 *ptr += sizeof( uint64_t ); 192 } 193 194 return pos; 195 } 196 } 197 198 void CompressedLinePositionStorage::move_from( 199 CompressedLinePositionStorage&& orig ) 200 { 201 nb_lines_ = orig.nb_lines_; 202 first_long_line_ = orig.first_long_line_; 203 current_pos_ = orig.current_pos_; 204 block_pointer_ = orig.block_pointer_; 205 previous_block_pointer_ = orig.previous_block_pointer_; 206 207 orig.nb_lines_ = 0; 208 } 209 210 // Move constructor 211 CompressedLinePositionStorage::CompressedLinePositionStorage( 212 CompressedLinePositionStorage&& orig ) 213 : block32_index_( std::move( orig.block32_index_ ) ) 214 { 215 move_from( std::move( orig ) ); 216 } 217 218 // Move assignement 219 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=( 220 CompressedLinePositionStorage&& orig ) 221 { 222 block32_index_ = std::move( orig.block32_index_ ); 223 move_from( std::move( orig ) ); 224 225 return *this; 226 } 227 228 CompressedLinePositionStorage::~CompressedLinePositionStorage() 229 { 230 for ( char* block : block32_index_ ) { 231 void* p = static_cast<void*>( block ); 232 // std::cerr << "block = " << p << std::endl; 233 free( p ); 234 } 235 236 for ( char* block : block64_index_ ) { 237 void* p = static_cast<void*>( block ); 238 // std::cerr << "block = " << p << std::endl; 239 free( p ); 240 } 241 } 242 243 // template<int BLOCK_SIZE> 244 void CompressedLinePositionStorage::append( uint64_t pos ) 245 { 246 // Save the pointer in case we need to "pop_back" 247 previous_block_pointer_ = block_pointer_; 248 249 bool store_in_big = false; 250 if ( pos > UINT32_MAX ) { 251 store_in_big = true; 252 if ( first_long_line_ == UINT32_MAX ) { 253 // First "big" end of line, we will start a new (64) block 254 first_long_line_ = nb_lines_; 255 block_pointer_ = nullptr; 256 } 257 } 258 259 if ( ! block_pointer_ ) { 260 // We need to start a new block 261 if ( ! store_in_big ) 262 block32_index_.push_back( 263 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 264 else 265 block64_index_.push_back( 266 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 267 } 268 else { 269 uint64_t delta = pos - current_pos_; 270 if ( delta < 128 ) { 271 // Code relative on one byte 272 block_add_one_byte_relative( &block_pointer_, delta ); 273 } 274 else if ( delta < 16384 ) { 275 // Code relative on two bytes 276 block_add_two_bytes_relative( &block_pointer_, delta ); 277 } 278 else { 279 // Code absolute 280 if ( ! store_in_big ) 281 block32_add_absolute( &block_pointer_, pos ); 282 else 283 block64_add_absolute( &block_pointer_, pos ); 284 } 285 } 286 287 current_pos_ = pos; 288 ++nb_lines_; 289 290 if ( ! store_in_big ) { 291 if ( nb_lines_ % BLOCK_SIZE == 0 ) { 292 // We have finished the block 293 294 // Let's reduce its size to what is actually used 295 int block_index = nb_lines_ / BLOCK_SIZE - 1; 296 char* block = block32_index_[block_index]; 297 // We allocate extra space for the last element in case it 298 // is replaced by an absolute value in the future (following a pop_back) 299 size_t new_size = ( previous_block_pointer_ 300 + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block; 301 void* new_location = realloc( block, new_size ); 302 if ( new_location ) 303 block32_index_[block_index] = static_cast<char*>( new_location ); 304 305 block_pointer_ = nullptr; 306 } 307 } 308 else { 309 if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) { 310 // We have finished the block 311 312 // Let's reduce its size to what is actually used 313 int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1; 314 char* block = block64_index_[block_index]; 315 // We allocate extra space for the last element in case it 316 // is replaced by an absolute value in the future (following a pop_back) 317 size_t new_size = ( previous_block_pointer_ 318 + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block; 319 void* new_location = realloc( block, new_size ); 320 if ( new_location ) 321 block64_index_[block_index] = static_cast<char*>( new_location ); 322 323 block_pointer_ = nullptr; 324 } 325 } 326 } 327 328 // template<int BLOCK_SIZE> 329 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const 330 { 331 char* ptr; 332 uint64_t position; 333 334 Cache* last_read = last_read_.getPtr(); 335 336 if ( index < first_long_line_ ) { 337 if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) { 338 position = last_read->position; 339 ptr = last_read->ptr; 340 341 position = block32_next_pos( &ptr, position ); 342 } 343 else { 344 char* block = block32_index_[ index / BLOCK_SIZE ]; 345 position = block32_initial_pos( block, &ptr ); 346 347 for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) { 348 // Go through all the lines in the block till the one we want 349 position = block32_next_pos( &ptr, position ); 350 } 351 } 352 } 353 else { 354 const uint32_t index_in_64 = index - first_long_line_; 355 if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) { 356 position = last_read->position; 357 ptr = last_read->ptr; 358 359 position = block64_next_pos( &ptr, position ); 360 } 361 else { 362 char* block = block64_index_[ index_in_64 / BLOCK_SIZE ]; 363 position = block64_initial_pos( block, &ptr ); 364 365 for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) { 366 // Go through all the lines in the block till the one we want 367 position = block64_next_pos( &ptr, position ); 368 } 369 } 370 } 371 372 // Populate our cache ready for next consecutive read 373 last_read->index = index; 374 last_read->position = position; 375 last_read->ptr = ptr; 376 377 return position; 378 } 379 380 void CompressedLinePositionStorage::append_list( 381 const std::vector<uint64_t>& positions ) 382 { 383 // This is not very clever, but caching should make it 384 // reasonably fast. 385 for ( uint32_t i = 0; i < positions.size(); i++ ) 386 append( positions.at( i ) ); 387 } 388 389 // template<int BLOCK_SIZE> 390 void CompressedLinePositionStorage::pop_back() 391 { 392 // Removing the last entered data, there are two cases 393 if ( previous_block_pointer_ ) { 394 // The last append was a normal entry in an existing block, 395 // so we can just revert the pointer 396 block_pointer_ = previous_block_pointer_; 397 previous_block_pointer_ = nullptr; 398 } 399 else { 400 // A new block has been created for the last entry, we need 401 // to de-alloc it. 402 403 // If we try to pop_back() twice, we're dead! 404 assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 ); 405 406 char* block = block32_index_.back(); 407 block32_index_.pop_back(); 408 free( block ); 409 410 411 block_pointer_ = nullptr; 412 } 413 414 --nb_lines_; 415 current_pos_ = at( nb_lines_ - 1 ); 416 } 417