xref: /glogg/src/data/compressedlinestorage.cpp (revision 3c46a4695c48e83de2b8042a54c32333a43a44eb)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <cassert>
21 #include <cstdlib>
22 #include <arpa/inet.h>
23 
24 #include "data/compressedlinestorage.h"
25 
26 namespace {
27     // Functions to manipulate blocks
28 
29     // Create a new 32 bits block of the passed size,
30     // initialised at the passed position
31     char* block32_new( int block_size, uint32_t initial_position,
32            char** block_ptr )
33     {
34         // malloc a block of the maximum possible size
35         // (where every line is >16384)
36         char* ptr = static_cast<char*>( malloc( 4 + block_size * 6 ) );
37 
38         if ( ptr ) {
39             // Write the initial_position
40             *(reinterpret_cast<uint32_t*>(ptr)) = initial_position;
41             *block_ptr = ptr + 4;
42         }
43 
44         return ptr;
45     }
46 
47     // Add a one byte relative delta (0-127) and inc pointer
48     // First bit is always 0
49     void block_add_one_byte_relative( char** ptr, uint8_t value )
50     {
51         **ptr = value;
52         *ptr += sizeof( value );
53     }
54 
55     // Add a two bytes relative delta (0-16383) and inc pointer
56     // First 2 bits are always 10
57     void block_add_two_bytes_relative( char** ptr, uint16_t value )
58     {
59         // Stored in big endian format in order to recognise the initial pattern:
60         // 10xx xxxx xxxx xxxx
61         //  HO byte | LO byte
62         *(reinterpret_cast<uint16_t*>(*ptr)) = htons( value | (1 << 15) );
63         *ptr += sizeof( value );
64     }
65 
66     // Add a signal and a 4 bytes absolute position and inc pointer
67     void block32_add_absolute( char** ptr, uint32_t value )
68     {
69         // 2 bytes marker (actually only the first two bits are tested)
70         *(reinterpret_cast<uint16_t*>(*ptr)) = 0xFF;
71         *ptr += sizeof( uint16_t );
72         // Absolute value (machine endian)
73         *(reinterpret_cast<uint32_t*>(*ptr)) = value;
74         *ptr += sizeof( uint32_t );
75     }
76 
77     // Initialise the passed block for reading, returning
78     // the initial position and a pointer to the second entry.
79     uint64_t block32_initial_pos( char* block, char** ptr )
80     {
81         *ptr = block + sizeof( uint32_t );
82         return *(reinterpret_cast<uint32_t*>(block));
83     }
84 
85     // Give the next position in the block based on the previous
86     // position, then increase the pointer.
87     uint64_t block32_next_pos( char** ptr, uint64_t previous_pos )
88     {
89         uint64_t pos = previous_pos;
90 
91         uint8_t byte = **ptr;
92         ++(*ptr);
93         if ( ! ( byte & 0x80 ) ) {
94             // High order bit is 0
95             pos += byte;
96         }
97         else if ( ( byte & 0xC0 ) == 0x80 ) {
98             // We need to read the low order byte
99             uint8_t lo_byte = **ptr;
100             ++(*ptr);
101             // Remove the starting 10b
102             byte &= ~0xC0;
103             // And form the displacement (stored as big endian)
104             pos += ( (uint16_t) byte << 8 ) | (uint16_t) lo_byte;
105         }
106         else {
107             // Skip the next byte (not used)
108             ++(*ptr);
109             // And read the new absolute pos (machine endian)
110             pos = *(reinterpret_cast<uint32_t*>(*ptr));
111             *ptr += sizeof( uint32_t );
112         }
113 
114         return pos;
115     }
116 }
117 
118 void CompressedLinePositionStorage::move_from(
119         CompressedLinePositionStorage&& orig )
120 {
121     nb_lines_        = orig.nb_lines_;
122     first_long_line_ = orig.first_long_line_;
123     current_pos_     = orig.current_pos_;
124     block_pointer_   = orig.block_pointer_;
125     previous_block_pointer_ = orig.previous_block_pointer_;
126 
127     orig.nb_lines_   = 0;
128 }
129 
130 // Move constructor
131 CompressedLinePositionStorage::CompressedLinePositionStorage(
132         CompressedLinePositionStorage&& orig )
133     : block32_index_( std::move( orig.block32_index_ ) )
134 {
135     move_from( std::move( orig ) );
136 }
137 
138 // Move assignement
139 CompressedLinePositionStorage& CompressedLinePositionStorage::operator=(
140         CompressedLinePositionStorage&& orig )
141 {
142     block32_index_ = std::move( orig.block32_index_ );
143     move_from( std::move( orig ) );
144 
145     return *this;
146 }
147 
148 CompressedLinePositionStorage::~CompressedLinePositionStorage()
149 {
150     for ( char* block : block32_index_ ) {
151         void* p = static_cast<void*>( block );
152         free( p );
153     }
154 }
155 
156 // template<int BLOCK_SIZE>
157 void CompressedLinePositionStorage::append( uint64_t pos )
158 {
159     // Save the pointer in case we need to "pop_back"
160     previous_block_pointer_ = block_pointer_;
161 
162     if ( ! block_pointer_ ) {
163         // We need to start a new block
164         block32_index_.push_back(
165             block32_new( BLOCK_SIZE, pos, &block_pointer_ ) );
166     }
167     else {
168         uint64_t delta = pos - current_pos_;
169         if ( delta < 128 ) {
170             // Code relative on one byte
171             block_add_one_byte_relative( &block_pointer_, delta );
172         }
173         else if ( delta < 16384 ) {
174             // Code relative on two bytes
175             block_add_two_bytes_relative( &block_pointer_, delta );
176         }
177         else {
178             // Code absolute
179             block32_add_absolute( &block_pointer_, pos );
180         }
181     }
182 
183     current_pos_ = pos;
184     ++nb_lines_;
185 
186     if ( nb_lines_ % BLOCK_SIZE == 0 ) {
187         // We have finished the block
188         block_pointer_ = nullptr;
189     }
190 }
191 
192 // template<int BLOCK_SIZE>
193 uint64_t CompressedLinePositionStorage::at( int index ) const
194 {
195     char* block = block32_index_[ index / BLOCK_SIZE ];
196     char* ptr;
197     uint64_t position = block32_initial_pos( block, &ptr );
198 
199     for ( int i = 0; i < index % BLOCK_SIZE; i++ ) {
200         // Go through all the lines in the block till the one we want
201         position = block32_next_pos( &ptr, position );
202     }
203 
204     return position;
205 }
206 
207 void CompressedLinePositionStorage::append_list(
208         const CompressedLinePositionStorage& other )
209 {
210     // This is not very clever, but caching should make it
211     // reasonably fast.
212     for ( uint32_t i = 0; i < other.size(); i++ )
213         append( other.at( i ) );
214 }
215 
216 // template<int BLOCK_SIZE>
217 void CompressedLinePositionStorage::pop_back()
218 {
219     // Removing the last entered data, there are two cases
220     if ( previous_block_pointer_ ) {
221         // The last append was a normal entry in an existing block,
222         // so we can just revert the pointer
223         block_pointer_ = previous_block_pointer_;
224         previous_block_pointer_ = nullptr;
225     }
226     else {
227         // A new block has been created for the last entry, we need
228         // to de-alloc it.
229 
230         // If we try to pop_back() twice, we're dead!
231         assert( ( nb_lines_ - 1 ) % BLOCK_SIZE == 0 );
232 
233         char* block = block32_index_.back();
234         block32_index_.pop_back();
235         free( block );
236 
237 
238         block_pointer_ = nullptr;
239     }
240 
241     --nb_lines_;
242     current_pos_ = at( nb_lines_ - 1 );
243 }
244