xref: /glogg/src/data/compressedlinestorage.cpp (revision 6e2e573c451ec24d720c967fab68538eaa3f3ab5)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <cassert>
21 #include <cstdlib>
22 #include <arpa/inet.h>
23 
24 #include <iostream>
25 
26 #include "data/compressedlinestorage.h"
27 
28 namespace {
29     // Functions to manipulate blocks
30 
31     // Create a new 32 bits block of the passed size,
32     // initialised at the passed position
33     char* block32_new( int block_size, uint32_t initial_position,
34            char** block_ptr )
35     {
36         // malloc a block of the maximum possible size
37         // (where every line is >16384)
38         char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) );
39 
40         if ( ptr ) {
41             // Write the initial_position
42             *(reinterpret_cast<uint32_t*>(ptr)) = initial_position;
43             *block_ptr = ptr + 4;
44         }
45 
46         return ptr;
47     }
48 
49     // Create a new 64 bits block of the passed size,
50     // initialised at the passed position
51     char* block64_new( int block_size, uint64_t initial_position,
52            char** block_ptr )
53     {
54         // malloc a block of the maximum possible size
55         // (where every line is >16384)
56         char* ptr = static_cast<char*>( malloc( 8 + block_size * 10 ) );
57 
58         if ( ptr ) {
59             // Write the initial_position
60             *(reinterpret_cast<uint64_t*>(ptr)) = initial_position;
61             *block_ptr = ptr + 8;
62         }
63 
64         return ptr;
65     }
66 
67 
68     // Add a one byte relative delta (0-127) and inc pointer
69     // First bit is always 0
70     void block_add_one_byte_relative( char** ptr, uint8_t value )
71     {
72         **ptr = value;
73         *ptr += sizeof( value );
74     }
75 
76     // Add a two bytes relative delta (0-16383) and inc pointer
77     // First 2 bits are always 10
78     void block_add_two_bytes_relative( char** ptr, uint16_t value )
79     {
80         // Stored in big endian format in order to recognise the initial pattern:
81         // 10xx xxxx xxxx xxxx
82         //  HO byte | LO byte
83         *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) );
84         *ptr += sizeof( value );
85     }
86 
87     // Add a signal and a 4 bytes absolute position and inc pointer
88     void block32_add_absolute( char** ptr, uint32_t value )
89     {
90         // 2 bytes marker (actually only the first two bits are tested)
91         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
92         *ptr += sizeof( uint16_t );
93         // Absolute value (machine endian)
94         *(reinterpret_cast<uint32_t*>(*ptr)) = value;
95         *ptr += sizeof( uint32_t );
96     }
97 
98     // Initialise the passed block for reading, returning
99     // the initial position and a pointer to the second entry.
100     uint64_t block32_initial_pos( char* block, char** ptr )
101     {
102         *ptr = block + sizeof( uint32_t );
103         return *(reinterpret_cast<uint32_t*>(block));
104     }
105 
106     // Give the next position in the block based on the previous
107     // position, then increase the pointer.
108     uint64_t block32_next_pos( char** ptr, uint64_t previous_pos )
109     {
110         uint64_t pos = previous_pos;
111 
112         uint8_t byte = **ptr;
113         ++(*ptr);
114         if ( ! ( byte & 0x80 ) ) {
115             // High order bit is 0
116             pos += byte;
117         }
118         else if ( ( byte & 0xC0 ) == 0x80 ) {
119             // We need to read the low order byte
120             uint8_t lo_byte = **ptr;
121             ++(*ptr);
122             // Remove the starting 10b
123             byte &= ~0xC0;
124             // And form the displacement (stored as big endian)
125             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
126         }
127         else {
128             // Skip the next byte (not used)
129             ++(*ptr);
130             // And read the new absolute pos (machine endian)
131             pos = *(reinterpret_cast<uint32_t*>(*ptr));
132             *ptr += sizeof( uint32_t );
133         }
134 
135         return pos;
136     }
137 
138     // Add a signal and a 8 bytes absolute position and inc pointer
139     void block64_add_absolute( char** ptr, uint64_t value )
140     {
141         // This is unaligned, can cause problem on some CPUs
142 
143         // 2 bytes marker (actually only the first two bits are tested)
144         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
145         *ptr += sizeof( uint16_t );
146         // Absolute value (machine endian)
147         *(reinterpret_cast<uint64_t*>(*ptr)) = value;
148         *ptr += sizeof( uint64_t );
149     }
150 
151     // Initialise the passed block for reading, returning
152     // the initial position and a pointer to the second entry.
153     uint64_t block64_initial_pos( char* block, char** ptr )
154     {
155         *ptr = block + sizeof( uint64_t );
156         return *(reinterpret_cast<uint64_t*>(block));
157     }
158 
159     // Give the next position in the block based on the previous
160     // position, then increase the pointer.
161     uint64_t block64_next_pos( char** ptr, uint64_t previous_pos )
162     {
163         uint64_t pos = previous_pos;
164 
165         uint8_t byte = **ptr;
166         ++(*ptr);
167         if ( ! ( byte & 0x80 ) ) {
168             // High order bit is 0
169             pos += byte;
170         }
171         else if ( ( byte & 0xC0 ) == 0x80 ) {
172             // We need to read the low order byte
173             uint8_t lo_byte = **ptr;
174             ++(*ptr);
175             // Remove the starting 10b
176             byte &= ~0xC0;
177             // And form the displacement (stored as big endian)
178             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
179         }
180         else {
181             // Skip the next byte (not used)
182             ++(*ptr);
183             // And read the new absolute pos (machine endian)
184             pos = *(reinterpret_cast<uint64_t*>(*ptr));
185             *ptr += sizeof( uint64_t );
186         }
187 
188         return pos;
189     }
190 }
191 
192 void CompressedLinePositionStorage::move_from(
193         CompressedLinePositionStorage&& orig )
194 {
195     nb_lines_        = orig.nb_lines_;
196     first_long_line_ = orig.first_long_line_;
197     current_pos_     = orig.current_pos_;
198     block_pointer_   = orig.block_pointer_;
199     previous_block_pointer_ = orig.previous_block_pointer_;
200 
201     orig.nb_lines_   = 0;
202 }
203 
204 // Move constructor
205 CompressedLinePositionStorage::CompressedLinePositionStorage(
206         CompressedLinePositionStorage&& orig )
207     : block32_index_( std::move( orig.block32_index_ ) )
208 {
209     move_from( std::move( orig ) );
210 }
211 
212 // Move assignement
213 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=(
214         CompressedLinePositionStorage&& orig )
215 {
216     block32_index_ = std::move( orig.block32_index_ );
217     move_from( std::move( orig ) );
218 
219     return *this;
220 }
221 
222 CompressedLinePositionStorage::~CompressedLinePositionStorage()
223 {
224     for ( char* block : block32_index_ ) {
225         void* p = static_cast<void*>( block );
226         // std::cerr << "block = " << p << std::endl;
227         free( p );
228     }
229 
230     for ( char* block : block64_index_ ) {
231         void* p = static_cast<void*>( block );
232         // std::cerr << "block = " << p << std::endl;
233         free( p );
234     }
235 }
236 
237 // template<int BLOCK_SIZE>
238 void CompressedLinePositionStorage::append( uint64_t pos )
239 {
240     // Save the pointer in case we need to "pop_back"
241     previous_block_pointer_ = block_pointer_;
242 
243     bool store_in_big = false;
244     if ( pos > UINT32_MAX ) {
245         store_in_big = true;
246         if ( first_long_line_ == UINT32_MAX ) {
247             // First "big" end of line, we will start a new (64) block
248             first_long_line_ = nb_lines_;
249             block_pointer_ = nullptr;
250         }
251     }
252 
253     if ( ! block_pointer_ ) {
254         // We need to start a new block
255         if ( ! store_in_big )
256             block32_index_.push_back(
257                 block32_new( BLOCK_SIZE, pos, &block_pointer_ ) );
258         else
259             block64_index_.push_back(
260                 block64_new( BLOCK_SIZE, pos, &block_pointer_ ) );
261     }
262     else {
263         uint64_t delta = pos - current_pos_;
264         if ( delta < 128 ) {
265             // Code relative on one byte
266             block_add_one_byte_relative( &block_pointer_, delta );
267         }
268         else if ( delta < 16384 ) {
269             // Code relative on two bytes
270             block_add_two_bytes_relative( &block_pointer_, delta );
271         }
272         else {
273             // Code absolute
274             if ( ! store_in_big )
275                 block32_add_absolute( &block_pointer_, pos );
276             else
277                 block64_add_absolute( &block_pointer_, pos );
278         }
279     }
280 
281     current_pos_ = pos;
282     ++nb_lines_;
283 
284     if ( ! store_in_big ) {
285         if ( nb_lines_ % BLOCK_SIZE == 0 ) {
286             // We have finished the block
287 
288             // Let's reduce its size to what is actually used
289             int block_index = nb_lines_ / BLOCK_SIZE - 1;
290             char* block = block32_index_[block_index];
291             // We allocate extra space for the last element in case it
292             // is replaced by an absolute value in the future (following a pop_back)
293             size_t new_size = ( previous_block_pointer_
294                     + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block;
295             void* new_location = realloc( block, new_size );
296             if ( new_location )
297                 block32_index_[block_index] = static_cast<char*>( new_location );
298 
299             block_pointer_ = nullptr;
300         }
301     }
302     else {
303         if ( ( nb_lines_ - first_long_line_ ) % BLOCK_SIZE == 0 ) {
304             // We have finished the block
305 
306             // Let's reduce its size to what is actually used
307             int block_index = ( nb_lines_ - first_long_line_ ) / BLOCK_SIZE - 1;
308             char* block = block64_index_[block_index];
309             // We allocate extra space for the last element in case it
310             // is replaced by an absolute value in the future (following a pop_back)
311             size_t new_size = ( previous_block_pointer_
312                     + sizeof( uint16_t ) + sizeof( uint64_t ) ) - block;
313             void* new_location = realloc( block, new_size );
314             if ( new_location )
315                 block64_index_[block_index] = static_cast<char*>( new_location );
316 
317             block_pointer_ = nullptr;
318         }
319     }
320 }
321 
322 // template<int BLOCK_SIZE>
323 uint64_t CompressedLinePositionStorage::at( uint32_t index ) const
324 {
325     char* ptr;
326     uint64_t position;
327 
328     Cache* last_read = last_read_.getPtr();
329 
330     if ( index < first_long_line_ ) {
331         if ( ( index == last_read->index + 1 ) && ( index % BLOCK_SIZE != 0 ) ) {
332             position = last_read->position;
333             ptr      = last_read->ptr;
334 
335             position = block32_next_pos( &ptr, position );
336         }
337         else {
338             char* block = block32_index_[ index / BLOCK_SIZE ];
339             position = block32_initial_pos( block, &ptr );
340 
341             for ( uint32_t i = 0; i < index % BLOCK_SIZE; i++ ) {
342                 // Go through all the lines in the block till the one we want
343                 position = block32_next_pos( &ptr, position );
344             }
345         }
346     }
347     else {
348         const uint32_t index_in_64 = index - first_long_line_;
349         if ( ( index == last_read->index + 1 ) && ( index_in_64 % BLOCK_SIZE != 0 ) ) {
350             position = last_read->position;
351             ptr      = last_read->ptr;
352 
353             position = block64_next_pos( &ptr, position );
354         }
355         else {
356             char* block = block64_index_[ index_in_64 / BLOCK_SIZE ];
357             position = block64_initial_pos( block, &ptr );
358 
359             for ( uint32_t i = 0; i < index_in_64 % BLOCK_SIZE; i++ ) {
360                 // Go through all the lines in the block till the one we want
361                 position = block64_next_pos( &ptr, position );
362             }
363         }
364     }
365 
366     // Populate our cache ready for next consecutive read
367     last_read->index    = index;
368     last_read->position = position;
369     last_read->ptr      = ptr;
370 
371     return position;
372 }
373 
374 void CompressedLinePositionStorage::append_list(
375         const std::vector<uint64_t>& positions )
376 {
377     // This is not very clever, but caching should make it
378     // reasonably fast.
379     for ( uint32_t i = 0; i < positions.size(); i++ )
380         append( positions.at( i ) );
381 }
382 
383 // template<int BLOCK_SIZE>
384 void CompressedLinePositionStorage::pop_back()
385 {
386     // Removing the last entered data, there are two cases
387     if ( previous_block_pointer_ ) {
388         // The last append was a normal entry in an existing block,
389         // so we can just revert the pointer
390         block_pointer_ = previous_block_pointer_;
391         previous_block_pointer_ = nullptr;
392     }
393     else {
394         // A new block has been created for the last entry, we need
395         // to de-alloc it.
396 
397         // If we try to pop_back() twice, we're dead!
398         assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 );
399 
400         char* block = block32_index_.back();
401         block32_index_.pop_back();
402         free( block );
403 
404 
405         block_pointer_ = nullptr;
406     }
407 
408     --nb_lines_;
409     current_pos_ = at( nb_lines_ - 1 );
410 }
411