xref: /glogg/src/data/compressedlinestorage.cpp (revision 05467f52d72e282658c219cc889ed6bc08e5dc01)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <cassert>
21 #include <cstdlib>
22 
23 #include "utils.h"
24 
25 #include "data/compressedlinestorage.h"
26 
27 #ifdef HAVE_HTONS
28 #  include <arpa/inet.h>
29 #else
30 #  define htons(a) glogg_htons(a)
31 #endif
32 
33 namespace {
34     // Functions to manipulate blocks
35 
36     // Create a new 32 bits block of the passed size,
37     // initialised at the passed position
38     char* block32_new( int block_size, uint32_t initial_position,
39            char** block_ptr )
40     {
41         // malloc a block of the maximum possible size
42         // (where every line is >16384)
43         char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) );
44 
45         if ( ptr ) {
46             // Write the initial_position
47             *(reinterpret_cast<uint32_t*>(ptr)) = initial_position;
48             *block_ptr = ptr + 4;
49         }
50 
51         return ptr;
52     }
53 
54     // Create a new 64 bits block of the passed size,
55     // initialised at the passed position
56     char* block64_new( int block_size, uint64_t initial_position,
57            char** block_ptr )
58     {
59         // malloc a block of the maximum possible size
60         // (where every line is >16384)
61         char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) );
62 
63         if ( ptr ) {
64             // Write the initial_position
65             *(reinterpret_cast<uint64_t*>(ptr)) = initial_position;
66             *block_ptr = ptr + 8;
67         }
68 
69         return ptr;
70     }
71 
72 
73     // Add a one byte relative delta (0-127) and inc pointer
74     // First bit is always 0
75     void block_add_one_byte_relative( char** ptr, uint8_t value )
76     {
77         **ptr = value;
78         *ptr += sizeof( value );
79     }
80 
81     // Add a two bytes relative delta (0-16383) and inc pointer
82     // First 2 bits are always 10
83     void block_add_two_bytes_relative( char** ptr, uint16_t value )
84     {
85         // Stored in big endian format in order to recognise the initial pattern:
86         // 10xx xxxx xxxx xxxx
87         //  HO byte | LO byte
88         *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) );
89         *ptr += sizeof( value );
90     }
91 
92     // Add a signal and a 4 bytes absolute position and inc pointer
93     void block32_add_absolute( char** ptr, uint32_t value )
94     {
95         // 2 bytes marker (actually only the first two bits are tested)
96         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
97         *ptr += sizeof( uint16_t );
98         // Absolute value (machine endian)
99         *(reinterpret_cast<uint32_t*>(*ptr)) = value;
100         *ptr += sizeof( uint32_t );
101     }
102 
103     // Initialise the passed block for reading, returning
104     // the initial position and a pointer to the second entry.
105     uint64_t block32_initial_pos( char* block, char** ptr )
106     {
107         *ptr = block + sizeof( uint32_t );
108         return *(reinterpret_cast<uint32_t*>(block));
109     }
110 
111     // Give the next position in the block based on the previous
112     // position, then increase the pointer.
113     uint64_t block32_next_pos( char** ptr, uint64_t previous_pos )
114     {
115         uint64_t pos = previous_pos;
116 
117         uint8_t byte = **ptr;
118         ++(*ptr);
119         if ( ! ( byte & 0x80 ) ) {
120             // High order bit is 0
121             pos += byte;
122         }
123         else if ( ( byte & 0xC0 ) == 0x80 ) {
124             // We need to read the low order byte
125             uint8_t lo_byte = **ptr;
126             ++(*ptr);
127             // Remove the starting 10b
128             byte &= ~0xC0;
129             // And form the displacement (stored as big endian)
130             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
131         }
132         else {
133             // Skip the next byte (not used)
134             ++(*ptr);
135             // And read the new absolute pos (machine endian)
136             pos = *(reinterpret_cast<uint32_t*>(*ptr));
137             *ptr += sizeof( uint32_t );
138         }
139 
140         return pos;
141     }
142 
143     // Add a signal and a 8 bytes absolute position and inc pointer
144     void block64_add_absolute( char** ptr, uint64_t value )
145     {
146         // This is unaligned, can cause problem on some CPUs
147 
148         // 2 bytes marker (actually only the first two bits are tested)
149         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
150         *ptr += sizeof( uint16_t );
151         // Absolute value (machine endian)
152         *(reinterpret_cast<uint64_t*>(*ptr)) = value;
153         *ptr += sizeof( uint64_t );
154     }
155 
156     // Initialise the passed block for reading, returning
157     // the initial position and a pointer to the second entry.
158     uint64_t block64_initial_pos( char* block, char** ptr )
159     {
160         *ptr = block + sizeof( uint64_t );
161         return *(reinterpret_cast<uint64_t*>(block));
162     }
163 
164     // Give the next position in the block based on the previous
165     // position, then increase the pointer.
166     uint64_t block64_next_pos( char** ptr, uint64_t previous_pos )
167     {
168         uint64_t pos = previous_pos;
169 
170         uint8_t byte = **ptr;
171         ++(*ptr);
172         if ( ! ( byte & 0x80 ) ) {
173             // High order bit is 0
174             pos += byte;
175         }
176         else if ( ( byte & 0xC0 ) == 0x80 ) {
177             // We need to read the low order byte
178             uint8_t lo_byte = **ptr;
179             ++(*ptr);
180             // Remove the starting 10b
181             byte &= ~0xC0;
182             // And form the displacement (stored as big endian)
183             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
184         }
185         else {
186             // Skip the next byte (not used)
187             ++(*ptr);
188             // And read the new absolute pos (machine endian)
189             pos = *(reinterpret_cast<uint64_t*>(*ptr));
190             *ptr += sizeof( uint64_t );
191         }
192 
193         return pos;
194     }
195 }
196 
197 void CompressedLinePositionStorage::move_from(
198         CompressedLinePositionStorage&& orig )
199 {
200     nb_lines_        = orig.nb_lines_;
201     first_long_line_ = orig.first_long_line_;
202     current_pos_     = orig.current_pos_;
203     block_pointer_   = orig.block_pointer_;
204     previous_block_pointer_ = orig.previous_block_pointer_;
205 
206     orig.nb_lines_   = 0;
207 }
208 
209 // Move constructor
210 CompressedLinePositionStorage::CompressedLinePositionStorage(
211         CompressedLinePositionStorage&& orig )
212     : block32_index_( std::move( orig.block32_index_ ) ),
213       block64_index_( std::move( orig.block64_index_ ) )
214 {
215     move_from( std::move( orig ) );
216 }
217 
218 void CompressedLinePositionStorage::free_blocks()
219 {
220     for ( char* block : block32_index_ ) {
221         void* p = static_cast<void*>( block );
222         free( p );
223     }
224 
225     for ( char* block : block64_index_ ) {
226         void* p = static_cast<void*>( block );
227         free( p );
228     }
229 }
230 
231 // Move assignement
232 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=(
233         CompressedLinePositionStorage&& orig )
234 {
235     free_blocks();
236 
237     block32_index_ = std::move( orig.block32_index_ );
238     block64_index_ = std::move( orig.block64_index_ );
239     move_from( std::move( orig ) );
240 
241     return *this;
242 }
243 
244 CompressedLinePositionStorage::~CompressedLinePositionStorage()
245 {
246     free_blocks();
247 }
248 
249 // template<int BLOCK_SIZE>
250 void CompressedLinePositionStorage::append( uint64_t pos )
251 {
252     // Lines must be stored in order
253     assert( ( pos > current_pos_ ) || ( pos == 0 ) );
254 
255     // Save the pointer in case we need to "pop_back"
256     previous_block_pointer_ = block_pointer_;
257 
258     bool store_in_big = false;
259     if ( pos > UINT32_MAX ) {
260         store_in_big = true;
261         if ( first_long_line_ == UINT32_MAX ) {
262             // First "big" end of line, we will start a new (64) block
263             first_long_line_ = nb_lines_;
264             block_pointer_ = nullptr;
265         }
266     }
267 
268     if ( ! block_pointer_ ) {
269         // We need to start a new block
270         if ( ! store_in_big )
271             block32_index_.push_back(
272                 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) );
273         else
274             block64_index_.push_back(
275                 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) );
276     }
277     else {
278         uint64_t delta = pos - current_pos_;
279         if ( delta < 128 ) {
280             // Code relative on one byte
281             block_add_one_byte_relative( &block_pointer_, delta );
282         }
283         else if ( delta < 16384 ) {
284             // Code relative on two bytes
285             block_add_two_bytes_relative( &block_pointer_, delta );
286         }
287         else {
288             // Code absolute
289             if ( ! store_in_big )
290                 block32_add_absolute( &block_pointer_, pos );
291             else
292                 block64_add_absolute( &block_pointer_, pos );
293         }
294     }
295 
296     current_pos_ = pos;
297     ++nb_lines_;
298 
299     if ( ! store_in_big ) {
300         if ( nb_lines_ % BLOCK_SIZE == 0 ) {
301             // We have finished the block
302 
303             // Let's reduce its size to what is actually used
304             int block_index = nb_lines_ / BLOCK_SIZE - 1;
305             char* block = block32_index_[block_index];
306             // We allocate extra space for the last element in case it
307             // is replaced by an absolute value in the future (following a pop_back)
308             size_t new_size = ( previous_block_pointer_
309                     + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block;
310             void* new_location = realloc( block, new_size );
311             if ( new_location )
312                 block32_index_[block_index] = static_cast<char*>( new_location );
313 
314             block_pointer_ = nullptr;
315             previous_block_pointer_ = static_cast<char*>( new_location ) + ( previous_block_pointer_ - block );
316         }
317     }
318     else {
319         if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) {
320             // We have finished the block
321 
322             // Let's reduce its size to what is actually used
323             int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1;
324             char* block = block64_index_[block_index];
325             // We allocate extra space for the last element in case it
326             // is replaced by an absolute value in the future (following a pop_back)
327             size_t new_size = ( previous_block_pointer_
328                     + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block;
329             void* new_location = realloc( block, new_size );
330             if ( new_location )
331                 block64_index_[block_index] = static_cast<char*>( new_location );
332 
333             block_pointer_ = nullptr;
334             previous_block_pointer_ = static_cast<char*>( new_location ) + ( previous_block_pointer_ - block );
335         }
336     }
337 }
338 
339 // template<int BLOCK_SIZE>
340 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const
341 {
342     char* ptr;
343     uint64_t position;
344 
345     Cache* last_read = last_read_.getPtr();
346 
347     if ( index < first_long_line_ ) {
348         if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) {
349             position = last_read->position;
350             ptr      = last_read->ptr;
351 
352             position = block32_next_pos( &ptr, position );
353         }
354         else {
355             char* block = block32_index_[ index / BLOCK_SIZE ];
356             position = block32_initial_pos( block, &ptr );
357 
358             for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) {
359                 // Go through all the lines in the block till the one we want
360                 position = block32_next_pos( &ptr, position );
361             }
362         }
363     }
364     else {
365         const uint32_t index_in_64 = index - first_long_line_;
366         if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) {
367             position = last_read->position;
368             ptr      = last_read->ptr;
369 
370             position = block64_next_pos( &ptr, position );
371         }
372         else {
373             char* block = block64_index_[ index_in_64 / BLOCK_SIZE ];
374             position = block64_initial_pos( block, &ptr );
375 
376             for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) {
377                 // Go through all the lines in the block till the one we want
378                 position = block64_next_pos( &ptr, position );
379             }
380         }
381     }
382 
383     // Populate our cache ready for next consecutive read
384     last_read->index    = index;
385     last_read->position = position;
386     last_read->ptr      = ptr;
387 
388     return position;
389 }
390 
391 void CompressedLinePositionStorage::append_list(
392         const std::vector<uint64_t>& positions )
393 {
394     // This is not very clever, but caching should make it
395     // reasonably fast.
396     for ( uint32_t i = 0; i < positions.size(); i++ )
397         append( positions.at( i ) );
398 }
399 
400 // template<int BLOCK_SIZE>
401 void CompressedLinePositionStorage::pop_back()
402 {
403     // Removing the last entered data, there are two cases
404     if ( previous_block_pointer_ ) {
405         // The last append was a normal entry in an existing block,
406         // so we can just revert the pointer
407         block_pointer_ = previous_block_pointer_;
408         previous_block_pointer_ = nullptr;
409     }
410     else {
411         // A new block has been created for the last entry, we need
412         // to de-alloc it.
413 
414         if ( first_long_line_ == UINT32_MAX ) {
415             // If we try to pop_back() twice, we're dead!
416             assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 );
417 
418             char* block = block32_index_.back();
419             block32_index_.pop_back();
420             free( block );
421         }
422         else {
423             // If we try to pop_back() twice, we're dead!
424             assert( ( nb_lines_ - first_long_line_ - 1 ) % BLOCK_SIZE == 0 );
425 
426             char* block = block64_index_.back();
427             block64_index_.pop_back();
428             free( block );
429         }
430 
431         block_pointer_ = nullptr;
432     }
433 
434     --nb_lines_;
435     current_pos_ = at( nb_lines_ - 1 );
436 }
437