xref: /glogg/src/data/compressedlinestorage.h (revision 653377b63177ff63943e126bf71462b35ceb8b9d)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <vector>
21 #include <cstdint>
22 
23 // This class is a compressed storage backend for LinePositionArray
24 // It emulates the interface of a vector, but take advantage of the nature
25 // of the stored data (increasing end of line addresses) to apply some
26 // compression in memory, while still providing fast, constant-time look-up.
27 
28 /* The current algorithm takes advantage of the fact most lines are reasonably
29  * short, it codes each line on:
30  * - Line < 127 bytes    : 1 byte
31  * - 127 < line < 16383  : 2 bytes
32  * - line > 16383        : 6 bytes (or 10 bytes)
33  * Uncompressed backend stores line on 4 bytes or 8 bytes.
34  *
35  * The algorithm is quite simple, the file is first divided in two parts:
36  * - The lines whose end are located before UINT32_MAX
37  * - The lines whose end are located after UINT32_MAX
38  * Those end of lines are stored separately in the table32 and the table64
39  * respectively.
40  *
41  * The EOL list is then divided in blocks of BLOCK_SIZE (128) lines.
42  * A block index vector (per table) contains pointers to each block.
43  *
44  * Each block is then defined as such:
45  * Block32 (sizes in byte)
46  * 00 - Absolute EOF address (4 bytes)
47  * 04 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
48  *    - ( 10xx xxxx
49  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
50  *    - ( 1111 1111
51  *        xxxx xxxx
52  *        xxxx xxxx  if second line is > 16383 ) (6 bytes, absolute)
53  * ...
54  * (126 more lines)
55  *
56  * Block64 (sizes in byte)
57  * 00 - Absolute EOF address (8 bytes)
58  * 08 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
59  *    - ( 10xx xxxx
60  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
61  *    - ( 1111 1111
62  *        xxxx xxxx
63  *        xxxx xxxx
64  *        xxxx xxxx
65  *        xxxx xxxx  if second line is > 16383 ) (10 bytes, absolute)
66  * ...
67  * (126 more lines)
68  *
69  * Absolute addressing has been adopted for line > 16383 to bound memory usage in case
70  * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for
71  * long-ish (30 KB) lines.
72  */
73 
74 #ifndef COMPRESSEDLINESTORAGE_H
75 #define COMPRESSEDLINESTORAGE_H
76 
77 #define BLOCK_SIZE 128
78 
79 //template<int BLOCK_SIZE = 128>
80 class CompressedLinePositionStorage
81 {
82   public:
83     // Default constructor
84     CompressedLinePositionStorage()
85     { nb_lines_ = 0; first_long_line_ = UINT32_MAX;
86       current_pos_ = 0; block_pointer_ = nullptr;
87       previous_block_pointer_ = nullptr; }
88     // Copy constructor would be slow, delete!
89     //CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ) = delete;
90     // FIXME, let's implement it for now
91     CompressedLinePositionStorage( const CompressedLinePositionStorage& orig );
92     CompressedLinePositionStorage& operator=(
93             const CompressedLinePositionStorage& orig );
94 
95     // Move constructor
96     CompressedLinePositionStorage( CompressedLinePositionStorage&& orig );
97     // Move assignement
98     CompressedLinePositionStorage& operator=(
99             CompressedLinePositionStorage&& orig );
100     // Destructor
101     ~CompressedLinePositionStorage();
102 
103     // Append the passed end-of-line to the storage
104     void append( uint64_t pos );
105     // Size of the array
106     uint32_t size() const
107     { return nb_lines_; }
108     // Element at index
109     uint64_t at( int i ) const;
110 
111     // Add one list to the other
112     // TODO: do we need a move version?
113     void append_list( const CompressedLinePositionStorage& other );
114 
115     // Pop the last element of the storage
116     void pop_back();
117 
118   private:
119     // Utility for move ctor/assign
120     void move_from( CompressedLinePositionStorage&& orig );
121 
122     // The two indexes
123     std::vector<char*> block32_index_;
124     std::vector<char*> block64_index_;
125 
126     // Total number of lines in storage
127     uint32_t nb_lines_;
128 
129     // Current position (position of the end of the last line added)
130     uint64_t current_pos_;
131     // Address of the next position (not yet written) within the current
132     // block. nullptr means there is no current block (previous block
133     // finished or no data)
134     char* block_pointer_;
135 
136     // The index of the first line whose end is stored in a block64
137     // Initialised at UINT32_MAX, meaning "unset"
138     uint32_t first_long_line_;
139 
140     // For pop_back:
141 
142     // Previous pointer to block element, it is restored when we
143     // "pop_back" the last element.
144     // A null pointer here means pop_back need to free the block
145     // that has just been created.
146     char* previous_block_pointer_;
147 };
148 
149 #endif
150