1 /* 2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include <cassert> 21 #include <cstdlib> 22 #include <arpa/inet.h> 23 24 #include <iostream> 25 26 #include "data/compressedlinestorage.h" 27 28 namespace { 29 // Functions to manipulate blocks 30 31 // Create a new 32 bits block of the passed size, 32 // initialised at the passed position 33 char* block32_new( int block_size, uint32_t initial_position, 34 char** block_ptr ) 35 { 36 // malloc a block of the maximum possible size 37 // (where every line is >16384) 38 char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) ); 39 40 if ( ptr ) { 41 // Write the initial_position 42 *(reinterpret_cast<uint32_t*>(ptr)) = initial_position; 43 *block_ptr = ptr + 4; 44 } 45 46 return ptr; 47 } 48 49 // Create a new 64 bits block of the passed size, 50 // initialised at the passed position 51 char* block64_new( int block_size, uint64_t initial_position, 52 char** block_ptr ) 53 { 54 // malloc a block of the maximum possible size 55 // (where every line is >16384) 56 char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) ); 57 58 if ( ptr ) { 59 // Write the initial_position 60 *(reinterpret_cast<uint64_t*>(ptr)) = initial_position; 61 *block_ptr = ptr + 8; 62 } 63 64 return ptr; 65 } 66 67 68 // Add a one byte relative delta (0-127) and inc pointer 69 // First bit is always 0 70 void block_add_one_byte_relative( char** ptr, uint8_t value ) 71 { 72 **ptr = value; 73 *ptr += sizeof( value ); 74 } 75 76 // Add a two bytes relative delta (0-16383) and inc pointer 77 // First 2 bits are always 10 78 void block_add_two_bytes_relative( char** ptr, uint16_t value ) 79 { 80 // Stored in big endian format in order to recognise the initial pattern: 81 // 10xx xxxx xxxx xxxx 82 // HO byte | LO byte 83 *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) ); 84 *ptr += sizeof( value ); 85 } 86 87 // Add a signal and a 4 bytes absolute position and inc pointer 88 void block32_add_absolute( char** ptr, uint32_t value ) 89 { 90 // 2 bytes marker (actually only the first two bits are tested) 91 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 92 *ptr += sizeof( uint16_t ); 93 // Absolute value (machine endian) 94 *(reinterpret_cast<uint32_t*>(*ptr)) = value; 95 *ptr += sizeof( uint32_t ); 96 } 97 98 // Initialise the passed block for reading, returning 99 // the initial position and a pointer to the second entry. 100 uint64_t block32_initial_pos( char* block, char** ptr ) 101 { 102 *ptr = block + sizeof( uint32_t ); 103 return *(reinterpret_cast<uint32_t*>(block)); 104 } 105 106 // Give the next position in the block based on the previous 107 // position, then increase the pointer. 108 uint64_t block32_next_pos( char** ptr, uint64_t previous_pos ) 109 { 110 uint64_t pos = previous_pos; 111 112 uint8_t byte = **ptr; 113 ++(*ptr); 114 if ( ! ( byte & 0x80 ) ) { 115 // High order bit is 0 116 pos += byte; 117 } 118 else if ( ( byte & 0xC0 ) == 0x80 ) { 119 // We need to read the low order byte 120 uint8_t lo_byte = **ptr; 121 ++(*ptr); 122 // Remove the starting 10b 123 byte &= ~0xC0; 124 // And form the displacement (stored as big endian) 125 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 126 } 127 else { 128 // Skip the next byte (not used) 129 ++(*ptr); 130 // And read the new absolute pos (machine endian) 131 pos = *(reinterpret_cast<uint32_t*>(*ptr)); 132 *ptr += sizeof( uint32_t ); 133 } 134 135 return pos; 136 } 137 138 // Add a signal and a 8 bytes absolute position and inc pointer 139 void block64_add_absolute( char** ptr, uint64_t value ) 140 { 141 // This is unaligned, can cause problem on some CPUs 142 143 // 2 bytes marker (actually only the first two bits are tested) 144 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF; 145 *ptr += sizeof( uint16_t ); 146 // Absolute value (machine endian) 147 *(reinterpret_cast<uint64_t*>(*ptr)) = value; 148 *ptr += sizeof( uint64_t ); 149 } 150 151 // Initialise the passed block for reading, returning 152 // the initial position and a pointer to the second entry. 153 uint64_t block64_initial_pos( char* block, char** ptr ) 154 { 155 *ptr = block + sizeof( uint64_t ); 156 return *(reinterpret_cast<uint64_t*>(block)); 157 } 158 159 // Give the next position in the block based on the previous 160 // position, then increase the pointer. 161 uint64_t block64_next_pos( char** ptr, uint64_t previous_pos ) 162 { 163 uint64_t pos = previous_pos; 164 165 uint8_t byte = **ptr; 166 ++(*ptr); 167 if ( ! ( byte & 0x80 ) ) { 168 // High order bit is 0 169 pos += byte; 170 } 171 else if ( ( byte & 0xC0 ) == 0x80 ) { 172 // We need to read the low order byte 173 uint8_t lo_byte = **ptr; 174 ++(*ptr); 175 // Remove the starting 10b 176 byte &= ~0xC0; 177 // And form the displacement (stored as big endian) 178 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte; 179 } 180 else { 181 // Skip the next byte (not used) 182 ++(*ptr); 183 // And read the new absolute pos (machine endian) 184 pos = *(reinterpret_cast<uint64_t*>(*ptr)); 185 *ptr += sizeof( uint64_t ); 186 } 187 188 return pos; 189 } 190 } 191 192 void CompressedLinePositionStorage::move_from( 193 CompressedLinePositionStorage&& orig ) 194 { 195 nb_lines_ = orig.nb_lines_; 196 first_long_line_ = orig.first_long_line_; 197 current_pos_ = orig.current_pos_; 198 block_pointer_ = orig.block_pointer_; 199 previous_block_pointer_ = orig.previous_block_pointer_; 200 201 orig.nb_lines_ = 0; 202 } 203 204 // Move constructor 205 CompressedLinePositionStorage::CompressedLinePositionStorage( 206 CompressedLinePositionStorage&& orig ) 207 : block32_index_( std::move( orig.block32_index_ ) ) 208 { 209 move_from( std::move( orig ) ); 210 } 211 212 // Move assignement 213 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=( 214 CompressedLinePositionStorage&& orig ) 215 { 216 block32_index_ = std::move( orig.block32_index_ ); 217 move_from( std::move( orig ) ); 218 219 return *this; 220 } 221 222 CompressedLinePositionStorage::~CompressedLinePositionStorage() 223 { 224 for ( char* block : block32_index_ ) { 225 void* p = static_cast<void*>( block ); 226 // std::cerr << "block = " << p << std::endl; 227 free( p ); 228 } 229 230 for ( char* block : block64_index_ ) { 231 void* p = static_cast<void*>( block ); 232 // std::cerr << "block = " << p << std::endl; 233 free( p ); 234 } 235 } 236 237 // template<int BLOCK_SIZE> 238 void CompressedLinePositionStorage::append( uint64_t pos ) 239 { 240 // Save the pointer in case we need to "pop_back" 241 previous_block_pointer_ = block_pointer_; 242 243 bool store_in_big = false; 244 if ( pos > UINT32_MAX ) { 245 store_in_big = true; 246 if ( first_long_line_ == UINT32_MAX ) { 247 // First "big" end of line, we will start a new (64) block 248 first_long_line_ = nb_lines_; 249 block_pointer_ = nullptr; 250 } 251 } 252 253 if ( ! block_pointer_ ) { 254 // We need to start a new block 255 if ( ! store_in_big ) 256 block32_index_.push_back( 257 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 258 else 259 block64_index_.push_back( 260 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) ); 261 } 262 else { 263 uint64_t delta = pos - current_pos_; 264 if ( delta < 128 ) { 265 // Code relative on one byte 266 block_add_one_byte_relative( &block_pointer_, delta ); 267 } 268 else if ( delta < 16384 ) { 269 // Code relative on two bytes 270 block_add_two_bytes_relative( &block_pointer_, delta ); 271 } 272 else { 273 // Code absolute 274 if ( ! store_in_big ) 275 block32_add_absolute( &block_pointer_, pos ); 276 else 277 block64_add_absolute( &block_pointer_, pos ); 278 } 279 } 280 281 current_pos_ = pos; 282 ++nb_lines_; 283 284 if ( ! store_in_big ) { 285 if ( nb_lines_ % BLOCK_SIZE == 0 ) { 286 // We have finished the block 287 288 // Let's reduce its size to what is actually used 289 int block_index = nb_lines_ / BLOCK_SIZE - 1; 290 char* block = block32_index_[block_index]; 291 // We allocate extra space for the last element in case it 292 // is replaced by an absolute value in the future (following a pop_back) 293 size_t new_size = ( previous_block_pointer_ 294 + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block; 295 void* new_location = realloc( block, new_size ); 296 if ( new_location ) 297 block32_index_[block_index] = static_cast<char*>( new_location ); 298 299 block_pointer_ = nullptr; 300 } 301 } 302 else { 303 if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) { 304 // We have finished the block 305 306 // Let's reduce its size to what is actually used 307 int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1; 308 char* block = block64_index_[block_index]; 309 // We allocate extra space for the last element in case it 310 // is replaced by an absolute value in the future (following a pop_back) 311 size_t new_size = ( previous_block_pointer_ 312 + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block; 313 void* new_location = realloc( block, new_size ); 314 if ( new_location ) 315 block64_index_[block_index] = static_cast<char*>( new_location ); 316 317 block_pointer_ = nullptr; 318 } 319 } 320 } 321 322 // template<int BLOCK_SIZE> 323 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const 324 { 325 char* ptr; 326 uint64_t position; 327 328 Cache* last_read = last_read_.getPtr(); 329 330 if ( index < first_long_line_ ) { 331 if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) { 332 position = last_read->position; 333 ptr = last_read->ptr; 334 335 position = block32_next_pos( &ptr, position ); 336 } 337 else { 338 char* block = block32_index_[ index / BLOCK_SIZE ]; 339 position = block32_initial_pos( block, &ptr ); 340 341 for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) { 342 // Go through all the lines in the block till the one we want 343 position = block32_next_pos( &ptr, position ); 344 } 345 } 346 } 347 else { 348 const uint32_t index_in_64 = index - first_long_line_; 349 if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) { 350 position = last_read->position; 351 ptr = last_read->ptr; 352 353 position = block64_next_pos( &ptr, position ); 354 } 355 else { 356 char* block = block64_index_[ index_in_64 / BLOCK_SIZE ]; 357 position = block64_initial_pos( block, &ptr ); 358 359 for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) { 360 // Go through all the lines in the block till the one we want 361 position = block64_next_pos( &ptr, position ); 362 } 363 } 364 } 365 366 // Populate our cache ready for next consecutive read 367 last_read->index = index; 368 last_read->position = position; 369 last_read->ptr = ptr; 370 371 return position; 372 } 373 374 void CompressedLinePositionStorage::append_list( 375 const std::vector<uint64_t>& positions ) 376 { 377 // This is not very clever, but caching should make it 378 // reasonably fast. 379 for ( uint32_t i = 0; i < positions.size(); i++ ) 380 append( positions.at( i ) ); 381 } 382 383 // template<int BLOCK_SIZE> 384 void CompressedLinePositionStorage::pop_back() 385 { 386 // Removing the last entered data, there are two cases 387 if ( previous_block_pointer_ ) { 388 // The last append was a normal entry in an existing block, 389 // so we can just revert the pointer 390 block_pointer_ = previous_block_pointer_; 391 previous_block_pointer_ = nullptr; 392 } 393 else { 394 // A new block has been created for the last entry, we need 395 // to de-alloc it. 396 397 // If we try to pop_back() twice, we're dead! 398 assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 ); 399 400 char* block = block32_index_.back(); 401 block32_index_.pop_back(); 402 free( block ); 403 404 405 block_pointer_ = nullptr; 406 } 407 408 --nb_lines_; 409 current_pos_ = at( nb_lines_ - 1 ); 410 } 411