xref: /glogg/src/data/compressedlinestorage.cpp (revision 732308b3a6635dc95cbd6b6f5e067b954ea0ffed)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <cassert>
21 #include <cstdlib>
22 #include <QtEndian>
23 
24 #include "utils.h"
25 
26 #include "data/compressedlinestorage.h"
27 
28 namespace {
29     // Functions to manipulate blocks
30 
31     // Create a new 32 bits block of the passed size,
32     // initialised at the passed position
block32_new(int block_size,uint32_t initial_position,char ** block_ptr)33     char* block32_new( int block_size, uint32_t initial_position,
34            char** block_ptr )
35     {
36         // malloc a block of the maximum possible size
37         // (where every line is >16384)
38         char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) );
39 
40         if ( ptr ) {
41             // Write the initial_position
42             *(reinterpret_cast<uint32_t*>(ptr)) = initial_position;
43             *block_ptr = ptr + 4;
44         }
45 
46         return ptr;
47     }
48 
49     // Create a new 64 bits block of the passed size,
50     // initialised at the passed position
block64_new(int block_size,uint64_t initial_position,char ** block_ptr)51     char* block64_new( int block_size, uint64_t initial_position,
52            char** block_ptr )
53     {
54         // malloc a block of the maximum possible size
55         // (where every line is >16384)
56         char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) );
57 
58         if ( ptr ) {
59             // Write the initial_position
60             *(reinterpret_cast<uint64_t*>(ptr)) = initial_position;
61             *block_ptr = ptr + 8;
62         }
63 
64         return ptr;
65     }
66 
67 
68     // Add a one byte relative delta (0-127) and inc pointer
69     // First bit is always 0
block_add_one_byte_relative(char ** ptr,uint8_t value)70     void block_add_one_byte_relative( char** ptr, uint8_t value )
71     {
72         **ptr = value;
73         *ptr += sizeof( value );
74     }
75 
76     // Add a two bytes relative delta (0-16383) and inc pointer
77     // First 2 bits are always 10
block_add_two_bytes_relative(char ** ptr,uint16_t value)78     void block_add_two_bytes_relative( char** ptr, uint16_t value )
79     {
80         // Stored in big endian format in order to recognise the initial pattern:
81         // 10xx xxxx xxxx xxxx
82         //  HO byte | LO byte
83         *(reinterpret_cast<uint16_t*>(*ptr)) = qToBigEndian( static_cast<uint16_t>( value | (1 << 15) ) );
84         *ptr += sizeof( value );
85     }
86 
87     // Add a signal and a 4 bytes absolute position and inc pointer
block32_add_absolute(char ** ptr,uint32_t value)88     void block32_add_absolute( char** ptr, uint32_t value )
89     {
90         // 2 bytes marker (actually only the first two bits are tested)
91         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
92         *ptr += sizeof( uint16_t );
93         // Absolute value (machine endian)
94         *(reinterpret_cast<uint32_t*>(*ptr)) = value;
95         *ptr += sizeof( uint32_t );
96     }
97 
98     // Initialise the passed block for reading, returning
99     // the initial position and a pointer to the second entry.
block32_initial_pos(char * block,char ** ptr)100     uint64_t block32_initial_pos( char* block, char** ptr )
101     {
102         *ptr = block + sizeof( uint32_t );
103         return *(reinterpret_cast<uint32_t*>(block));
104     }
105 
106     // Give the next position in the block based on the previous
107     // position, then increase the pointer.
block32_next_pos(char ** ptr,uint64_t previous_pos)108     uint64_t block32_next_pos( char** ptr, uint64_t previous_pos )
109     {
110         uint64_t pos = previous_pos;
111 
112         uint8_t byte = **ptr;
113         ++(*ptr);
114         if ( ! ( byte & 0x80 ) ) {
115             // High order bit is 0
116             pos += byte;
117         }
118         else if ( ( byte & 0xC0 ) == 0x80 ) {
119             // We need to read the low order byte
120             uint8_t lo_byte = **ptr;
121             ++(*ptr);
122             // Remove the starting 10b
123             byte &= ~0xC0;
124             // And form the displacement (stored as big endian)
125             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
126         }
127         else {
128             // Skip the next byte (not used)
129             ++(*ptr);
130             // And read the new absolute pos (machine endian)
131             pos = *(reinterpret_cast<uint32_t*>(*ptr));
132             *ptr += sizeof( uint32_t );
133         }
134 
135         return pos;
136     }
137 
138     // Add a signal and a 8 bytes absolute position and inc pointer
block64_add_absolute(char ** ptr,uint64_t value)139     void block64_add_absolute( char** ptr, uint64_t value )
140     {
141         // This is unaligned, can cause problem on some CPUs
142 
143         // 2 bytes marker (actually only the first two bits are tested)
144         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
145         *ptr += sizeof( uint16_t );
146         // Absolute value (machine endian)
147         *(reinterpret_cast<uint64_t*>(*ptr)) = value;
148         *ptr += sizeof( uint64_t );
149     }
150 
151     // Initialise the passed block for reading, returning
152     // the initial position and a pointer to the second entry.
block64_initial_pos(char * block,char ** ptr)153     uint64_t block64_initial_pos( char* block, char** ptr )
154     {
155         *ptr = block + sizeof( uint64_t );
156         return *(reinterpret_cast<uint64_t*>(block));
157     }
158 
159     // Give the next position in the block based on the previous
160     // position, then increase the pointer.
block64_next_pos(char ** ptr,uint64_t previous_pos)161     uint64_t block64_next_pos( char** ptr, uint64_t previous_pos )
162     {
163         uint64_t pos = previous_pos;
164 
165         uint8_t byte = **ptr;
166         ++(*ptr);
167         if ( ! ( byte & 0x80 ) ) {
168             // High order bit is 0
169             pos += byte;
170         }
171         else if ( ( byte & 0xC0 ) == 0x80 ) {
172             // We need to read the low order byte
173             uint8_t lo_byte = **ptr;
174             ++(*ptr);
175             // Remove the starting 10b
176             byte &= ~0xC0;
177             // And form the displacement (stored as big endian)
178             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
179         }
180         else {
181             // Skip the next byte (not used)
182             ++(*ptr);
183             // And read the new absolute pos (machine endian)
184             pos = *(reinterpret_cast<uint64_t*>(*ptr));
185             *ptr += sizeof( uint64_t );
186         }
187 
188         return pos;
189     }
190 }
191 
move_from(CompressedLinePositionStorage && orig)192 void CompressedLinePositionStorage::move_from(
193         CompressedLinePositionStorage&& orig )
194 {
195     nb_lines_        = orig.nb_lines_;
196     first_long_line_ = orig.first_long_line_;
197     current_pos_     = orig.current_pos_;
198     block_pointer_   = orig.block_pointer_;
199     previous_block_pointer_ = orig.previous_block_pointer_;
200 
201     orig.nb_lines_   = 0;
202 }
203 
204 // Move constructor
CompressedLinePositionStorage(CompressedLinePositionStorage && orig)205 CompressedLinePositionStorage::CompressedLinePositionStorage(
206         CompressedLinePositionStorage&& orig )
207     : block32_index_( std::move( orig.block32_index_ ) ),
208       block64_index_( std::move( orig.block64_index_ ) )
209 {
210     move_from( std::move( orig ) );
211 }
212 
free_blocks()213 void CompressedLinePositionStorage::free_blocks()
214 {
215     for ( char* block : block32_index_ ) {
216         void* p = static_cast<void*>( block );
217         free( p );
218     }
219 
220     for ( char* block : block64_index_ ) {
221         void* p = static_cast<void*>( block );
222         free( p );
223     }
224 }
225 
226 // Move assignement
operator =(CompressedLinePositionStorage && orig)227 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=(
228         CompressedLinePositionStorage&& orig )
229 {
230     free_blocks();
231 
232     block32_index_ = std::move( orig.block32_index_ );
233     block64_index_ = std::move( orig.block64_index_ );
234     move_from( std::move( orig ) );
235 
236     return *this;
237 }
238 
~CompressedLinePositionStorage()239 CompressedLinePositionStorage::~CompressedLinePositionStorage()
240 {
241     free_blocks();
242 }
243 
244 // template<int BLOCK_SIZE>
append(uint64_t pos)245 void CompressedLinePositionStorage::append( uint64_t pos )
246 {
247     // Lines must be stored in order
248     assert( ( pos > current_pos_ ) || ( pos == 0 ) );
249 
250     // Save the pointer in case we need to "pop_back"
251     previous_block_pointer_ = block_pointer_;
252 
253     bool store_in_big = false;
254     if ( pos > UINT32_MAX ) {
255         store_in_big = true;
256         if ( first_long_line_ == UINT32_MAX ) {
257             // First "big" end of line, we will start a new (64) block
258             first_long_line_ = nb_lines_;
259             block_pointer_ = nullptr;
260         }
261     }
262 
263     if ( ! block_pointer_ ) {
264         // We need to start a new block
265         if ( ! store_in_big )
266             block32_index_.push_back(
267                 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) );
268         else
269             block64_index_.push_back(
270                 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) );
271     }
272     else {
273         uint64_t delta = pos - current_pos_;
274         if ( delta < 128 ) {
275             // Code relative on one byte
276             block_add_one_byte_relative( &block_pointer_, delta );
277         }
278         else if ( delta < 16384 ) {
279             // Code relative on two bytes
280             block_add_two_bytes_relative( &block_pointer_, delta );
281         }
282         else {
283             // Code absolute
284             if ( ! store_in_big )
285                 block32_add_absolute( &block_pointer_, pos );
286             else
287                 block64_add_absolute( &block_pointer_, pos );
288         }
289     }
290 
291     current_pos_ = pos;
292     ++nb_lines_;
293 
294     if ( ! store_in_big ) {
295         if ( nb_lines_ % BLOCK_SIZE == 0 ) {
296             // We have finished the block
297 
298             // Let's reduce its size to what is actually used
299             int block_index = nb_lines_ / BLOCK_SIZE - 1;
300             char* block = block32_index_[block_index];
301             // We allocate extra space for the last element in case it
302             // is replaced by an absolute value in the future (following a pop_back)
303             size_t new_size = ( previous_block_pointer_
304                     + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block;
305             void* new_location = realloc( block, new_size );
306             if ( new_location )
307                 block32_index_[block_index] = static_cast<char*>( new_location );
308 
309             block_pointer_ = nullptr;
310             previous_block_pointer_ = static_cast<char*>( new_location ) + ( previous_block_pointer_ - block );
311         }
312     }
313     else {
314         if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) {
315             // We have finished the block
316 
317             // Let's reduce its size to what is actually used
318             int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1;
319             char* block = block64_index_[block_index];
320             // We allocate extra space for the last element in case it
321             // is replaced by an absolute value in the future (following a pop_back)
322             size_t new_size = ( previous_block_pointer_
323                     + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block;
324             void* new_location = realloc( block, new_size );
325             if ( new_location )
326                 block64_index_[block_index] = static_cast<char*>( new_location );
327 
328             block_pointer_ = nullptr;
329             previous_block_pointer_ = static_cast<char*>( new_location ) + ( previous_block_pointer_ - block );
330         }
331     }
332 }
333 
334 // template<int BLOCK_SIZE>
at(uint32_t index) const335 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const
336 {
337     char* ptr;
338     uint64_t position;
339 
340     Cache* last_read = last_read_.getPtr();
341 
342     if ( index < first_long_line_ ) {
343         if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) {
344             position = last_read->position;
345             ptr      = last_read->ptr;
346 
347             position = block32_next_pos( &ptr, position );
348         }
349         else {
350             char* block = block32_index_[ index / BLOCK_SIZE ];
351             position = block32_initial_pos( block, &ptr );
352 
353             for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) {
354                 // Go through all the lines in the block till the one we want
355                 position = block32_next_pos( &ptr, position );
356             }
357         }
358     }
359     else {
360         const uint32_t index_in_64 = index - first_long_line_;
361         if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) {
362             position = last_read->position;
363             ptr      = last_read->ptr;
364 
365             position = block64_next_pos( &ptr, position );
366         }
367         else {
368             char* block = block64_index_[ index_in_64 / BLOCK_SIZE ];
369             position = block64_initial_pos( block, &ptr );
370 
371             for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) {
372                 // Go through all the lines in the block till the one we want
373                 position = block64_next_pos( &ptr, position );
374             }
375         }
376     }
377 
378     // Populate our cache ready for next consecutive read
379     last_read->index    = index;
380     last_read->position = position;
381     last_read->ptr      = ptr;
382 
383     return position;
384 }
385 
append_list(const std::vector<uint64_t> & positions)386 void CompressedLinePositionStorage::append_list(
387         const std::vector<uint64_t>& positions )
388 {
389     // This is not very clever, but caching should make it
390     // reasonably fast.
391     for ( uint32_t i = 0; i < positions.size(); i++ )
392         append( positions.at( i ) );
393 }
394 
395 // template<int BLOCK_SIZE>
pop_back()396 void CompressedLinePositionStorage::pop_back()
397 {
398     // Removing the last entered data, there are two cases
399     if ( previous_block_pointer_ ) {
400         // The last append was a normal entry in an existing block,
401         // so we can just revert the pointer
402         block_pointer_ = previous_block_pointer_;
403         previous_block_pointer_ = nullptr;
404     }
405     else {
406         // A new block has been created for the last entry, we need
407         // to de-alloc it.
408 
409         if ( first_long_line_ == UINT32_MAX ) {
410             // If we try to pop_back() twice, we're dead!
411             assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 );
412 
413             char* block = block32_index_.back();
414             block32_index_.pop_back();
415             free( block );
416         }
417         else {
418             // If we try to pop_back() twice, we're dead!
419             assert( ( nb_lines_ - first_long_line_ - 1 ) % BLOCK_SIZE == 0 );
420 
421             char* block = block64_index_.back();
422             block64_index_.pop_back();
423             free( block );
424         }
425 
426         block_pointer_ = nullptr;
427     }
428 
429     --nb_lines_;
430     current_pos_ = at( nb_lines_ - 1 );
431 }
432