xref: /glogg/src/data/compressedlinestorage.cpp (revision 25d0072c9ecfabfb2a7869f6ca1a146e708fc672)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <cassert>
21 #include <cstdlib>
22 #include <iostream>
23 
24 #include "utils.h"
25 
26 #include "data/compressedlinestorage.h"
27 
28 #ifdef HAVE_HTONS
29 #  include <arpa/inet.h>
30 #else
31 #  define htons(a) glogg_htons(a)
32 #endif
33 
34 namespace {
35     // Functions to manipulate blocks
36 
37     // Create a new 32 bits block of the passed size,
38     // initialised at the passed position
39     char* block32_new( int block_size, uint32_t initial_position,
40            char** block_ptr )
41     {
42         // malloc a block of the maximum possible size
43         // (where every line is >16384)
44         char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) );
45 
46         if ( ptr ) {
47             // Write the initial_position
48             *(reinterpret_cast<uint32_t*>(ptr)) = initial_position;
49             *block_ptr = ptr + 4;
50         }
51 
52         return ptr;
53     }
54 
55     // Create a new 64 bits block of the passed size,
56     // initialised at the passed position
57     char* block64_new( int block_size, uint64_t initial_position,
58            char** block_ptr )
59     {
60         // malloc a block of the maximum possible size
61         // (where every line is >16384)
62         char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) );
63 
64         if ( ptr ) {
65             // Write the initial_position
66             *(reinterpret_cast<uint64_t*>(ptr)) = initial_position;
67             *block_ptr = ptr + 8;
68         }
69 
70         return ptr;
71     }
72 
73 
74     // Add a one byte relative delta (0-127) and inc pointer
75     // First bit is always 0
76     void block_add_one_byte_relative( char** ptr, uint8_t value )
77     {
78         **ptr = value;
79         *ptr += sizeof( value );
80     }
81 
82     // Add a two bytes relative delta (0-16383) and inc pointer
83     // First 2 bits are always 10
84     void block_add_two_bytes_relative( char** ptr, uint16_t value )
85     {
86         // Stored in big endian format in order to recognise the initial pattern:
87         // 10xx xxxx xxxx xxxx
88         //  HO byte | LO byte
89         *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) );
90         *ptr += sizeof( value );
91     }
92 
93     // Add a signal and a 4 bytes absolute position and inc pointer
94     void block32_add_absolute( char** ptr, uint32_t value )
95     {
96         // 2 bytes marker (actually only the first two bits are tested)
97         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
98         *ptr += sizeof( uint16_t );
99         // Absolute value (machine endian)
100         *(reinterpret_cast<uint32_t*>(*ptr)) = value;
101         *ptr += sizeof( uint32_t );
102     }
103 
104     // Initialise the passed block for reading, returning
105     // the initial position and a pointer to the second entry.
106     uint64_t block32_initial_pos( char* block, char** ptr )
107     {
108         *ptr = block + sizeof( uint32_t );
109         return *(reinterpret_cast<uint32_t*>(block));
110     }
111 
112     // Give the next position in the block based on the previous
113     // position, then increase the pointer.
114     uint64_t block32_next_pos( char** ptr, uint64_t previous_pos )
115     {
116         uint64_t pos = previous_pos;
117 
118         uint8_t byte = **ptr;
119         ++(*ptr);
120         if ( ! ( byte & 0x80 ) ) {
121             // High order bit is 0
122             pos += byte;
123         }
124         else if ( ( byte & 0xC0 ) == 0x80 ) {
125             // We need to read the low order byte
126             uint8_t lo_byte = **ptr;
127             ++(*ptr);
128             // Remove the starting 10b
129             byte &= ~0xC0;
130             // And form the displacement (stored as big endian)
131             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
132         }
133         else {
134             // Skip the next byte (not used)
135             ++(*ptr);
136             // And read the new absolute pos (machine endian)
137             pos = *(reinterpret_cast<uint32_t*>(*ptr));
138             *ptr += sizeof( uint32_t );
139         }
140 
141         return pos;
142     }
143 
144     // Add a signal and a 8 bytes absolute position and inc pointer
145     void block64_add_absolute( char** ptr, uint64_t value )
146     {
147         // This is unaligned, can cause problem on some CPUs
148 
149         // 2 bytes marker (actually only the first two bits are tested)
150         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
151         *ptr += sizeof( uint16_t );
152         // Absolute value (machine endian)
153         *(reinterpret_cast<uint64_t*>(*ptr)) = value;
154         *ptr += sizeof( uint64_t );
155     }
156 
157     // Initialise the passed block for reading, returning
158     // the initial position and a pointer to the second entry.
159     uint64_t block64_initial_pos( char* block, char** ptr )
160     {
161         *ptr = block + sizeof( uint64_t );
162         return *(reinterpret_cast<uint64_t*>(block));
163     }
164 
165     // Give the next position in the block based on the previous
166     // position, then increase the pointer.
167     uint64_t block64_next_pos( char** ptr, uint64_t previous_pos )
168     {
169         uint64_t pos = previous_pos;
170 
171         uint8_t byte = **ptr;
172         ++(*ptr);
173         if ( ! ( byte & 0x80 ) ) {
174             // High order bit is 0
175             pos += byte;
176         }
177         else if ( ( byte & 0xC0 ) == 0x80 ) {
178             // We need to read the low order byte
179             uint8_t lo_byte = **ptr;
180             ++(*ptr);
181             // Remove the starting 10b
182             byte &= ~0xC0;
183             // And form the displacement (stored as big endian)
184             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
185         }
186         else {
187             // Skip the next byte (not used)
188             ++(*ptr);
189             // And read the new absolute pos (machine endian)
190             pos = *(reinterpret_cast<uint64_t*>(*ptr));
191             *ptr += sizeof( uint64_t );
192         }
193 
194         return pos;
195     }
196 }
197 
198 void CompressedLinePositionStorage::move_from(
199         CompressedLinePositionStorage&& orig )
200 {
201     nb_lines_        = orig.nb_lines_;
202     first_long_line_ = orig.first_long_line_;
203     current_pos_     = orig.current_pos_;
204     block_pointer_   = orig.block_pointer_;
205     previous_block_pointer_ = orig.previous_block_pointer_;
206 
207     orig.nb_lines_   = 0;
208 }
209 
210 // Move constructor
211 CompressedLinePositionStorage::CompressedLinePositionStorage(
212         CompressedLinePositionStorage&& orig )
213     : block32_index_( std::move( orig.block32_index_ ) )
214 {
215     move_from( std::move( orig ) );
216 }
217 
218 void CompressedLinePositionStorage::free_blocks()
219 {
220     for ( char* block : block32_index_ ) {
221         void* p = static_cast<void*>( block );
222         std::cerr << "free block = " << p << std::endl;
223         free( p );
224     }
225 
226     for ( char* block : block64_index_ ) {
227         void* p = static_cast<void*>( block );
228         std::cerr << "block = " << p << std::endl;
229         free( p );
230     }
231 }
232 
233 // Move assignement
234 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=(
235         CompressedLinePositionStorage&& orig )
236 {
237     free_blocks();
238 
239     block32_index_ = std::move( orig.block32_index_ );
240     move_from( std::move( orig ) );
241 
242     return *this;
243 }
244 
245 CompressedLinePositionStorage::~CompressedLinePositionStorage()
246 {
247 
248     free_blocks();
249 }
250 
251 // template<int BLOCK_SIZE>
252 void CompressedLinePositionStorage::append( uint64_t pos )
253 {
254     // Save the pointer in case we need to "pop_back"
255     previous_block_pointer_ = block_pointer_;
256 
257     bool store_in_big = false;
258     if ( pos > UINT32_MAX ) {
259         store_in_big = true;
260         if ( first_long_line_ == UINT32_MAX ) {
261             // First "big" end of line, we will start a new (64) block
262             first_long_line_ = nb_lines_;
263             block_pointer_ = nullptr;
264         }
265     }
266 
267     if ( ! block_pointer_ ) {
268         // We need to start a new block
269         if ( ! store_in_big )
270             block32_index_.push_back(
271                 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) );
272         else
273             block64_index_.push_back(
274                 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) );
275     }
276     else {
277         uint64_t delta = pos - current_pos_;
278         if ( delta < 128 ) {
279             // Code relative on one byte
280             block_add_one_byte_relative( &block_pointer_, delta );
281         }
282         else if ( delta < 16384 ) {
283             // Code relative on two bytes
284             block_add_two_bytes_relative( &block_pointer_, delta );
285         }
286         else {
287             // Code absolute
288             if ( ! store_in_big )
289                 block32_add_absolute( &block_pointer_, pos );
290             else
291                 block64_add_absolute( &block_pointer_, pos );
292         }
293     }
294 
295     current_pos_ = pos;
296     ++nb_lines_;
297 
298     if ( ! store_in_big ) {
299         if ( nb_lines_ % BLOCK_SIZE == 0 ) {
300             // We have finished the block
301 
302             // Let's reduce its size to what is actually used
303             int block_index = nb_lines_ / BLOCK_SIZE - 1;
304             char* block = block32_index_[block_index];
305             // We allocate extra space for the last element in case it
306             // is replaced by an absolute value in the future (following a pop_back)
307             size_t new_size = ( previous_block_pointer_
308                     + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block;
309             void* new_location = realloc( block, new_size );
310             if ( new_location )
311                 block32_index_[block_index] = static_cast<char*>( new_location );
312 
313             block_pointer_ = nullptr;
314         }
315     }
316     else {
317         if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) {
318             // We have finished the block
319 
320             // Let's reduce its size to what is actually used
321             int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1;
322             char* block = block64_index_[block_index];
323             // We allocate extra space for the last element in case it
324             // is replaced by an absolute value in the future (following a pop_back)
325             size_t new_size = ( previous_block_pointer_
326                     + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block;
327             void* new_location = realloc( block, new_size );
328             if ( new_location )
329                 block64_index_[block_index] = static_cast<char*>( new_location );
330 
331             block_pointer_ = nullptr;
332         }
333     }
334 }
335 
336 // template<int BLOCK_SIZE>
337 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const
338 {
339     char* ptr;
340     uint64_t position;
341 
342     Cache* last_read = last_read_.getPtr();
343 
344     if ( index < first_long_line_ ) {
345         if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) {
346             position = last_read->position;
347             ptr      = last_read->ptr;
348 
349             position = block32_next_pos( &ptr, position );
350         }
351         else {
352             char* block = block32_index_[ index / BLOCK_SIZE ];
353             position = block32_initial_pos( block, &ptr );
354 
355             for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) {
356                 // Go through all the lines in the block till the one we want
357                 position = block32_next_pos( &ptr, position );
358             }
359         }
360     }
361     else {
362         const uint32_t index_in_64 = index - first_long_line_;
363         if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) {
364             position = last_read->position;
365             ptr      = last_read->ptr;
366 
367             position = block64_next_pos( &ptr, position );
368         }
369         else {
370             char* block = block64_index_[ index_in_64 / BLOCK_SIZE ];
371             position = block64_initial_pos( block, &ptr );
372 
373             for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) {
374                 // Go through all the lines in the block till the one we want
375                 position = block64_next_pos( &ptr, position );
376             }
377         }
378     }
379 
380     // Populate our cache ready for next consecutive read
381     last_read->index    = index;
382     last_read->position = position;
383     last_read->ptr      = ptr;
384 
385     return position;
386 }
387 
388 void CompressedLinePositionStorage::append_list(
389         const std::vector<uint64_t>& positions )
390 {
391     // This is not very clever, but caching should make it
392     // reasonably fast.
393     for ( uint32_t i = 0; i < positions.size(); i++ )
394         append( positions.at( i ) );
395 }
396 
397 // template<int BLOCK_SIZE>
398 void CompressedLinePositionStorage::pop_back()
399 {
400     // Removing the last entered data, there are two cases
401     if ( previous_block_pointer_ ) {
402         // The last append was a normal entry in an existing block,
403         // so we can just revert the pointer
404         block_pointer_ = previous_block_pointer_;
405         previous_block_pointer_ = nullptr;
406     }
407     else {
408         // A new block has been created for the last entry, we need
409         // to de-alloc it.
410 
411         // If we try to pop_back() twice, we're dead!
412         assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 );
413 
414         char* block = block32_index_.back();
415         block32_index_.pop_back();
416         free( block );
417 
418 
419         block_pointer_ = nullptr;
420     }
421 
422     --nb_lines_;
423     current_pos_ = at( nb_lines_ - 1 );
424 }
425