xref: /glogg/src/data/compressedlinestorage.h (revision 7a2f3a57e884000357471fe3462e6bb0262e96af)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <vector>
21 #include <cstdint>
22 
23 // This class is a compressed storage backend for LinePositionArray
24 // It emulates the interface of a vector, but take advantage of the nature
25 // of the stored data (increasing end of line addresses) to apply some
26 // compression in memory, while still providing fast, constant-time look-up.
27 
28 /* The current algorithm takes advantage of the fact most lines are reasonably
29  * short, it codes each line on:
30  * - Line < 127 bytes    : 1 byte
31  * - 127 < line < 16383  : 2 bytes
32  * - line > 16383        : 6 bytes (or 10 bytes)
33  * Uncompressed backend stores line on 4 bytes or 8 bytes.
34  *
35  * The algorithm is quite simple, the file is first divided in two parts:
36  * - The lines whose end are located before UINT32_MAX
37  * - The lines whose end are located after UINT32_MAX
38  * Those end of lines are stored separately in the table32 and the table64
39  * respectively.
40  *
41  * The EOL list is then divided in blocks of BLOCK_SIZE (128) lines.
42  * A block index vector (per table) contains pointers to each block.
43  *
44  * Each block is then defined as such:
45  * Block32 (sizes in byte)
46  * 00 - Absolute EOF address (4 bytes)
47  * 04 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
48  *    - ( 10xx xxxx
49  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
50  *    - ( 1111 1111
51  *        xxxx xxxx
52  *        xxxx xxxx  if second line is > 16383 ) (6 bytes, absolute)
53  * ...
54  * (126 more lines)
55  *
56  * Block64 (sizes in byte)
57  * 00 - Absolute EOF address (8 bytes)
58  * 08 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
59  *    - ( 10xx xxxx
60  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
61  *    - ( 1111 1111
62  *        xxxx xxxx
63  *        xxxx xxxx
64  *        xxxx xxxx
65  *        xxxx xxxx  if second line is > 16383 ) (10 bytes, absolute)
66  * ...
67  * (126 more lines)
68  *
69  * Absolute addressing has been adopted for line > 16383 to bound memory usage in case
70  * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for
71  * long-ish (30 KB) lines.
72  */
73 
74 #ifndef COMPRESSEDLINESTORAGE_H
75 #define COMPRESSEDLINESTORAGE_H
76 
77 #define BLOCK_SIZE 128
78 
79 //template<int BLOCK_SIZE = 128>
80 class CompressedLinePositionStorage
81 {
82   public:
83     // Default constructor
84     CompressedLinePositionStorage()
85     { nb_lines_ = 0; first_long_line_ = UINT32_MAX;
86       current_pos_ = 0; block_pointer_ = nullptr; }
87     // Copy constructor
88     CompressedLinePositionStorage( const CompressedLinePositionStorage& orig );
89     // TODO: do we need move constructor?
90 
91     // Append the passed end-of-line to the storage
92     void append( uint64_t pos );
93     // Size of the array
94     uint32_t size() const
95     { return nb_lines_; }
96     // Element at index
97     uint64_t at( int i ) const;
98 
99     // Add one list to the other
100     // TODO: do we need a move version?
101     void append_list( const CompressedLinePositionStorage& other );
102 
103     // Pop the last element of the storage
104     void pop_back();
105 
106   private:
107     // The two indexes
108     std::vector<char*> block32_index_;
109     std::vector<char*> block64_index_;
110 
111     // Total number of lines in storage
112     uint32_t nb_lines_;
113 
114     // Current position (position of the end of the last line added)
115     uint64_t current_pos_;
116     // Address of the next position (not yet written) within the current
117     // block. nullptr means there is no current block (previous block
118     // finished or no data)
119     char* block_pointer_;
120 
121     // The index of the first line whose end is stored in a block64
122     // Initialised at UINT32_MAX, meaning "unset"
123     uint32_t first_long_line_;
124 };
125 
126 #endif
127