xref: /glogg/src/data/compressedlinestorage.cpp (revision af5438c7ece65e2f88fe509fd5c31d9c7260457e)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <cassert>
21 #include <cstdlib>
22 #include <arpa/inet.h>
23 
24 #include <iostream>
25 
26 #include "data/compressedlinestorage.h"
27 
28 namespace {
29     // Functions to manipulate blocks
30 
31     // Create a new 32 bits block of the passed size,
32     // initialised at the passed position
33     char* block32_new( int block_size, uint32_t initial_position,
34            char** block_ptr )
35     {
36         // malloc a block of the maximum possible size
37         // (where every line is >16384)
38         char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) );
39 
40         if ( ptr ) {
41             // Write the initial_position
42             *(reinterpret_cast<uint32_t*>(ptr)) = initial_position;
43             *block_ptr = ptr + 4;
44         }
45 
46         return ptr;
47     }
48 
49     // Add a one byte relative delta (0-127) and inc pointer
50     // First bit is always 0
51     void block_add_one_byte_relative( char** ptr, uint8_t value )
52     {
53         **ptr = value;
54         *ptr += sizeof( value );
55     }
56 
57     // Add a two bytes relative delta (0-16383) and inc pointer
58     // First 2 bits are always 10
59     void block_add_two_bytes_relative( char** ptr, uint16_t value )
60     {
61         // Stored in big endian format in order to recognise the initial pattern:
62         // 10xx xxxx xxxx xxxx
63         //  HO byte | LO byte
64         *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) );
65         *ptr += sizeof( value );
66     }
67 
68     // Add a signal and a 4 bytes absolute position and inc pointer
69     void block32_add_absolute( char** ptr, uint32_t value )
70     {
71         // 2 bytes marker (actually only the first two bits are tested)
72         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
73         *ptr += sizeof( uint16_t );
74         // Absolute value (machine endian)
75         *(reinterpret_cast<uint32_t*>(*ptr)) = value;
76         *ptr += sizeof( uint32_t );
77     }
78 
79     // Initialise the passed block for reading, returning
80     // the initial position and a pointer to the second entry.
81     uint64_t block32_initial_pos( char* block, char** ptr )
82     {
83         *ptr = block + sizeof( uint32_t );
84         return *(reinterpret_cast<uint32_t*>(block));
85     }
86 
87     // Give the next position in the block based on the previous
88     // position, then increase the pointer.
89     uint64_t block32_next_pos( char** ptr, uint64_t previous_pos )
90     {
91         uint64_t pos = previous_pos;
92 
93         uint8_t byte = **ptr;
94         ++(*ptr);
95         if ( ! ( byte & 0x80 ) ) {
96             // High order bit is 0
97             pos += byte;
98         }
99         else if ( ( byte & 0xC0 ) == 0x80 ) {
100             // We need to read the low order byte
101             uint8_t lo_byte = **ptr;
102             ++(*ptr);
103             // Remove the starting 10b
104             byte &= ~0xC0;
105             // And form the displacement (stored as big endian)
106             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
107         }
108         else {
109             // Skip the next byte (not used)
110             ++(*ptr);
111             // And read the new absolute pos (machine endian)
112             pos = *(reinterpret_cast<uint32_t*>(*ptr));
113             *ptr += sizeof( uint32_t );
114         }
115 
116         return pos;
117     }
118 }
119 
120 // FIXME: painfully slow temporary copy constructor
121 CompressedLinePositionStorage::CompressedLinePositionStorage(
122         const CompressedLinePositionStorage& orig )
123 {
124     this->append_list( orig );
125 }
126 
127 // FIXME too
128 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=(
129         const CompressedLinePositionStorage& orig )
130 {
131     nb_lines_ = 0;
132     first_long_line_ = UINT32_MAX;
133     current_pos_ = 0;
134     block_pointer_ = nullptr;
135     previous_block_pointer_ = nullptr;
136 
137     for ( char* block : block32_index_ ) {
138         void* p = static_cast<void*>( block );
139         // std::cerr << "block = " << p << std::endl;
140         free( p );
141     }
142 
143     block32_index_.clear();
144 
145     this->append_list( orig );
146 
147     return *this;
148 }
149 
150 void CompressedLinePositionStorage::move_from(
151         CompressedLinePositionStorage&& orig )
152 {
153     nb_lines_        = orig.nb_lines_;
154     first_long_line_ = orig.first_long_line_;
155     current_pos_     = orig.current_pos_;
156     block_pointer_   = orig.block_pointer_;
157     previous_block_pointer_ = orig.previous_block_pointer_;
158 
159     orig.nb_lines_   = 0;
160 }
161 
162 // Move constructor
163 CompressedLinePositionStorage::CompressedLinePositionStorage(
164         CompressedLinePositionStorage&& orig )
165     : block32_index_( std::move( orig.block32_index_ ) )
166 {
167     move_from( std::move( orig ) );
168 }
169 
170 // Move assignement
171 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=(
172         CompressedLinePositionStorage&& orig )
173 {
174     block32_index_ = std::move( orig.block32_index_ );
175     move_from( std::move( orig ) );
176 
177     return *this;
178 }
179 
180 CompressedLinePositionStorage::~CompressedLinePositionStorage()
181 {
182     for ( char* block : block32_index_ ) {
183         void* p = static_cast<void*>( block );
184         // std::cerr << "block = " << p << std::endl;
185         free( p );
186     }
187 }
188 
189 // template<int BLOCK_SIZE>
190 void CompressedLinePositionStorage::append( uint64_t pos )
191 {
192     // Save the pointer in case we need to "pop_back"
193     previous_block_pointer_ = block_pointer_;
194 
195     if ( ! block_pointer_ ) {
196         // We need to start a new block
197         block32_index_.push_back(
198             block32_new( BLOCK_SIZE, pos, &block_pointer_ ) );
199     }
200     else {
201         uint64_t delta = pos - current_pos_;
202         if ( delta < 128 ) {
203             // Code relative on one byte
204             block_add_one_byte_relative( &block_pointer_, delta );
205         }
206         else if ( delta < 16384 ) {
207             // Code relative on two bytes
208             block_add_two_bytes_relative( &block_pointer_, delta );
209         }
210         else {
211             // Code absolute
212             block32_add_absolute( &block_pointer_, pos );
213         }
214     }
215 
216     current_pos_ = pos;
217     ++nb_lines_;
218 
219     if ( nb_lines_ % BLOCK_SIZE == 0 ) {
220         // We have finished the block
221 
222         // Let's reduce its size to what is actually used
223         int block_index = nb_lines_ / BLOCK_SIZE - 1;
224         char* block = block32_index_[block_index];
225         // We allocate extra space for the last element in case it
226         // is replaced by an absolute value in the future (following a pop_back)
227         size_t new_size = ( previous_block_pointer_
228                 + sizeof( uint16_t ) + sizeof( uint32_t ) ) - block;
229         void* new_location = realloc( block, new_size );
230         if ( new_location )
231             block32_index_[block_index] = static_cast<char*>( new_location );
232 
233         block_pointer_ = nullptr;
234     }
235 }
236 
237 // template<int BLOCK_SIZE>
238 uint64_t CompressedLinePositionStorage::at( int index ) const
239 {
240     char* block = block32_index_[ index / BLOCK_SIZE ];
241     char* ptr;
242     uint64_t position = block32_initial_pos( block, &ptr );
243 
244     for ( int i = 0; i < index % BLOCK_SIZE; i++ ) {
245         // Go through all the lines in the block till the one we want
246         position = block32_next_pos( &ptr, position );
247     }
248 
249     return position;
250 }
251 
252 void CompressedLinePositionStorage::append_list(
253         const CompressedLinePositionStorage& other )
254 {
255     // This is not very clever, but caching should make it
256     // reasonably fast.
257     for ( uint32_t i = 0; i < other.size(); i++ )
258         append( other.at( i ) );
259 }
260 
261 // template<int BLOCK_SIZE>
262 void CompressedLinePositionStorage::pop_back()
263 {
264     // Removing the last entered data, there are two cases
265     if ( previous_block_pointer_ ) {
266         // The last append was a normal entry in an existing block,
267         // so we can just revert the pointer
268         block_pointer_ = previous_block_pointer_;
269         previous_block_pointer_ = nullptr;
270     }
271     else {
272         // A new block has been created for the last entry, we need
273         // to de-alloc it.
274 
275         // If we try to pop_back() twice, we're dead!
276         assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 );
277 
278         char* block = block32_index_.back();
279         block32_index_.pop_back();
280         free( block );
281 
282 
283         block_pointer_ = nullptr;
284     }
285 
286     --nb_lines_;
287     current_pos_ = at( nb_lines_ - 1 );
288 }
289