1 /*
2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3 *
4 * This file is part of glogg.
5 *
6 * glogg is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * glogg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with glogg. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include <cassert>
21 #include <cstdlib>
22 #include <QtEndian>
23
24 #include "utils.h"
25
26 #include "data/compressedlinestorage.h"
27
28 namespace {
29 // Functions to manipulate blocks
30
31 // Create a new 32 bits block of the passed size,
32 // initialised at the passed position
block32_new(int block_size,uint32_t initial_position,char ** block_ptr)33 char* block32_new( int block_size, uint32_t initial_position,
34 char** block_ptr )
35 {
36 // malloc a block of the maximum possible size
37 // (where every line is >16384)
38 char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) );
39
40 if ( ptr ) {
41 // Write the initial_position
42 *(reinterpret_cast<uint32_t*>(ptr)) = initial_position;
43 *block_ptr = ptr + 4;
44 }
45
46 return ptr;
47 }
48
49 // Create a new 64 bits block of the passed size,
50 // initialised at the passed position
block64_new(int block_size,uint64_t initial_position,char ** block_ptr)51 char* block64_new( int block_size, uint64_t initial_position,
52 char** block_ptr )
53 {
54 // malloc a block of the maximum possible size
55 // (where every line is >16384)
56 char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) );
57
58 if ( ptr ) {
59 // Write the initial_position
60 *(reinterpret_cast<uint64_t*>(ptr)) = initial_position;
61 *block_ptr = ptr + 8;
62 }
63
64 return ptr;
65 }
66
67
68 // Add a one byte relative delta (0-127) and inc pointer
69 // First bit is always 0
block_add_one_byte_relative(char ** ptr,uint8_t value)70 void block_add_one_byte_relative( char** ptr, uint8_t value )
71 {
72 **ptr = value;
73 *ptr += sizeof( value );
74 }
75
76 // Add a two bytes relative delta (0-16383) and inc pointer
77 // First 2 bits are always 10
block_add_two_bytes_relative(char ** ptr,uint16_t value)78 void block_add_two_bytes_relative( char** ptr, uint16_t value )
79 {
80 // Stored in big endian format in order to recognise the initial pattern:
81 // 10xx xxxx xxxx xxxx
82 // HO byte | LO byte
83 *(reinterpret_cast<uint16_t*>(*ptr)) = qToBigEndian( static_cast<uint16_t>( value | (1 << 15) ) );
84 *ptr += sizeof( value );
85 }
86
87 // Add a signal and a 4 bytes absolute position and inc pointer
block32_add_absolute(char ** ptr,uint32_t value)88 void block32_add_absolute( char** ptr, uint32_t value )
89 {
90 // 2 bytes marker (actually only the first two bits are tested)
91 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
92 *ptr += sizeof( uint16_t );
93 // Absolute value (machine endian)
94 *(reinterpret_cast<uint32_t*>(*ptr)) = value;
95 *ptr += sizeof( uint32_t );
96 }
97
98 // Initialise the passed block for reading, returning
99 // the initial position and a pointer to the second entry.
block32_initial_pos(char * block,char ** ptr)100 uint64_t block32_initial_pos( char* block, char** ptr )
101 {
102 *ptr = block + sizeof( uint32_t );
103 return *(reinterpret_cast<uint32_t*>(block));
104 }
105
106 // Give the next position in the block based on the previous
107 // position, then increase the pointer.
block32_next_pos(char ** ptr,uint64_t previous_pos)108 uint64_t block32_next_pos( char** ptr, uint64_t previous_pos )
109 {
110 uint64_t pos = previous_pos;
111
112 uint8_t byte = **ptr;
113 ++(*ptr);
114 if ( ! ( byte & 0x80 ) ) {
115 // High order bit is 0
116 pos += byte;
117 }
118 else if ( ( byte & 0xC0 ) == 0x80 ) {
119 // We need to read the low order byte
120 uint8_t lo_byte = **ptr;
121 ++(*ptr);
122 // Remove the starting 10b
123 byte &= ~0xC0;
124 // And form the displacement (stored as big endian)
125 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
126 }
127 else {
128 // Skip the next byte (not used)
129 ++(*ptr);
130 // And read the new absolute pos (machine endian)
131 pos = *(reinterpret_cast<uint32_t*>(*ptr));
132 *ptr += sizeof( uint32_t );
133 }
134
135 return pos;
136 }
137
138 // Add a signal and a 8 bytes absolute position and inc pointer
block64_add_absolute(char ** ptr,uint64_t value)139 void block64_add_absolute( char** ptr, uint64_t value )
140 {
141 // This is unaligned, can cause problem on some CPUs
142
143 // 2 bytes marker (actually only the first two bits are tested)
144 *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
145 *ptr += sizeof( uint16_t );
146 // Absolute value (machine endian)
147 *(reinterpret_cast<uint64_t*>(*ptr)) = value;
148 *ptr += sizeof( uint64_t );
149 }
150
151 // Initialise the passed block for reading, returning
152 // the initial position and a pointer to the second entry.
block64_initial_pos(char * block,char ** ptr)153 uint64_t block64_initial_pos( char* block, char** ptr )
154 {
155 *ptr = block + sizeof( uint64_t );
156 return *(reinterpret_cast<uint64_t*>(block));
157 }
158
159 // Give the next position in the block based on the previous
160 // position, then increase the pointer.
block64_next_pos(char ** ptr,uint64_t previous_pos)161 uint64_t block64_next_pos( char** ptr, uint64_t previous_pos )
162 {
163 uint64_t pos = previous_pos;
164
165 uint8_t byte = **ptr;
166 ++(*ptr);
167 if ( ! ( byte & 0x80 ) ) {
168 // High order bit is 0
169 pos += byte;
170 }
171 else if ( ( byte & 0xC0 ) == 0x80 ) {
172 // We need to read the low order byte
173 uint8_t lo_byte = **ptr;
174 ++(*ptr);
175 // Remove the starting 10b
176 byte &= ~0xC0;
177 // And form the displacement (stored as big endian)
178 pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
179 }
180 else {
181 // Skip the next byte (not used)
182 ++(*ptr);
183 // And read the new absolute pos (machine endian)
184 pos = *(reinterpret_cast<uint64_t*>(*ptr));
185 *ptr += sizeof( uint64_t );
186 }
187
188 return pos;
189 }
190 }
191
move_from(CompressedLinePositionStorage && orig)192 void CompressedLinePositionStorage::move_from(
193 CompressedLinePositionStorage&& orig )
194 {
195 nb_lines_ = orig.nb_lines_;
196 first_long_line_ = orig.first_long_line_;
197 current_pos_ = orig.current_pos_;
198 block_pointer_ = orig.block_pointer_;
199 previous_block_pointer_ = orig.previous_block_pointer_;
200
201 orig.nb_lines_ = 0;
202 }
203
204 // Move constructor
CompressedLinePositionStorage(CompressedLinePositionStorage && orig)205 CompressedLinePositionStorage::CompressedLinePositionStorage(
206 CompressedLinePositionStorage&& orig )
207 : block32_index_( std::move( orig.block32_index_ ) ),
208 block64_index_( std::move( orig.block64_index_ ) )
209 {
210 move_from( std::move( orig ) );
211 }
212
free_blocks()213 void CompressedLinePositionStorage::free_blocks()
214 {
215 for ( char* block : block32_index_ ) {
216 void* p = static_cast<void*>( block );
217 free( p );
218 }
219
220 for ( char* block : block64_index_ ) {
221 void* p = static_cast<void*>( block );
222 free( p );
223 }
224 }
225
226 // Move assignement
operator =(CompressedLinePositionStorage && orig)227 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=(
228 CompressedLinePositionStorage&& orig )
229 {
230 free_blocks();
231
232 block32_index_ = std::move( orig.block32_index_ );
233 block64_index_ = std::move( orig.block64_index_ );
234 move_from( std::move( orig ) );
235
236 return *this;
237 }
238
~CompressedLinePositionStorage()239 CompressedLinePositionStorage::~CompressedLinePositionStorage()
240 {
241 free_blocks();
242 }
243
244 // template<int BLOCK_SIZE>
append(uint64_t pos)245 void CompressedLinePositionStorage::append( uint64_t pos )
246 {
247 // Lines must be stored in order
248 assert( ( pos > current_pos_ ) || ( pos == 0 ) );
249
250 // Save the pointer in case we need to "pop_back"
251 previous_block_pointer_ = block_pointer_;
252
253 bool store_in_big = false;
254 if ( pos > UINT32_MAX ) {
255 store_in_big = true;
256 if ( first_long_line_ == UINT32_MAX ) {
257 // First "big" end of line, we will start a new (64) block
258 first_long_line_ = nb_lines_;
259 block_pointer_ = nullptr;
260 }
261 }
262
263 if ( ! block_pointer_ ) {
264 // We need to start a new block
265 if ( ! store_in_big )
266 block32_index_.push_back(
267 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) );
268 else
269 block64_index_.push_back(
270 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) );
271 }
272 else {
273 uint64_t delta = pos - current_pos_;
274 if ( delta < 128 ) {
275 // Code relative on one byte
276 block_add_one_byte_relative( &block_pointer_, delta );
277 }
278 else if ( delta < 16384 ) {
279 // Code relative on two bytes
280 block_add_two_bytes_relative( &block_pointer_, delta );
281 }
282 else {
283 // Code absolute
284 if ( ! store_in_big )
285 block32_add_absolute( &block_pointer_, pos );
286 else
287 block64_add_absolute( &block_pointer_, pos );
288 }
289 }
290
291 current_pos_ = pos;
292 ++nb_lines_;
293
294 if ( ! store_in_big ) {
295 if ( nb_lines_ % BLOCK_SIZE == 0 ) {
296 // We have finished the block
297
298 // Let's reduce its size to what is actually used
299 int block_index = nb_lines_ / BLOCK_SIZE - 1;
300 char* block = block32_index_[block_index];
301 // We allocate extra space for the last element in case it
302 // is replaced by an absolute value in the future (following a pop_back)
303 size_t new_size = ( previous_block_pointer_
304 + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block;
305 void* new_location = realloc( block, new_size );
306 if ( new_location )
307 block32_index_[block_index] = static_cast<char*>( new_location );
308
309 block_pointer_ = nullptr;
310 previous_block_pointer_ = static_cast<char*>( new_location ) + ( previous_block_pointer_ - block );
311 }
312 }
313 else {
314 if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) {
315 // We have finished the block
316
317 // Let's reduce its size to what is actually used
318 int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1;
319 char* block = block64_index_[block_index];
320 // We allocate extra space for the last element in case it
321 // is replaced by an absolute value in the future (following a pop_back)
322 size_t new_size = ( previous_block_pointer_
323 + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block;
324 void* new_location = realloc( block, new_size );
325 if ( new_location )
326 block64_index_[block_index] = static_cast<char*>( new_location );
327
328 block_pointer_ = nullptr;
329 previous_block_pointer_ = static_cast<char*>( new_location ) + ( previous_block_pointer_ - block );
330 }
331 }
332 }
333
334 // template<int BLOCK_SIZE>
at(uint32_t index) const335 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const
336 {
337 char* ptr;
338 uint64_t position;
339
340 Cache* last_read = last_read_.getPtr();
341
342 if ( index < first_long_line_ ) {
343 if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) {
344 position = last_read->position;
345 ptr = last_read->ptr;
346
347 position = block32_next_pos( &ptr, position );
348 }
349 else {
350 char* block = block32_index_[ index / BLOCK_SIZE ];
351 position = block32_initial_pos( block, &ptr );
352
353 for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) {
354 // Go through all the lines in the block till the one we want
355 position = block32_next_pos( &ptr, position );
356 }
357 }
358 }
359 else {
360 const uint32_t index_in_64 = index - first_long_line_;
361 if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) {
362 position = last_read->position;
363 ptr = last_read->ptr;
364
365 position = block64_next_pos( &ptr, position );
366 }
367 else {
368 char* block = block64_index_[ index_in_64 / BLOCK_SIZE ];
369 position = block64_initial_pos( block, &ptr );
370
371 for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) {
372 // Go through all the lines in the block till the one we want
373 position = block64_next_pos( &ptr, position );
374 }
375 }
376 }
377
378 // Populate our cache ready for next consecutive read
379 last_read->index = index;
380 last_read->position = position;
381 last_read->ptr = ptr;
382
383 return position;
384 }
385
append_list(const std::vector<uint64_t> & positions)386 void CompressedLinePositionStorage::append_list(
387 const std::vector<uint64_t>& positions )
388 {
389 // This is not very clever, but caching should make it
390 // reasonably fast.
391 for ( uint32_t i = 0; i < positions.size(); i++ )
392 append( positions.at( i ) );
393 }
394
395 // template<int BLOCK_SIZE>
pop_back()396 void CompressedLinePositionStorage::pop_back()
397 {
398 // Removing the last entered data, there are two cases
399 if ( previous_block_pointer_ ) {
400 // The last append was a normal entry in an existing block,
401 // so we can just revert the pointer
402 block_pointer_ = previous_block_pointer_;
403 previous_block_pointer_ = nullptr;
404 }
405 else {
406 // A new block has been created for the last entry, we need
407 // to de-alloc it.
408
409 if ( first_long_line_ == UINT32_MAX ) {
410 // If we try to pop_back() twice, we're dead!
411 assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 );
412
413 char* block = block32_index_.back();
414 block32_index_.pop_back();
415 free( block );
416 }
417 else {
418 // If we try to pop_back() twice, we're dead!
419 assert( ( nb_lines_ - first_long_line_ - 1 ) % BLOCK_SIZE == 0 );
420
421 char* block = block64_index_.back();
422 block64_index_.pop_back();
423 free( block );
424 }
425
426 block_pointer_ = nullptr;
427 }
428
429 --nb_lines_;
430 current_pos_ = at( nb_lines_ - 1 );
431 }
432