xref: /glogg/src/data/compressedlinestorage.h (revision 3c46a4695c48e83de2b8042a54c32333a43a44eb)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <vector>
21 #include <cstdint>
22 
23 // This class is a compressed storage backend for LinePositionArray
24 // It emulates the interface of a vector, but take advantage of the nature
25 // of the stored data (increasing end of line addresses) to apply some
26 // compression in memory, while still providing fast, constant-time look-up.
27 
28 /* The current algorithm takes advantage of the fact most lines are reasonably
29  * short, it codes each line on:
30  * - Line < 127 bytes    : 1 byte
31  * - 127 < line < 16383  : 2 bytes
32  * - line > 16383        : 6 bytes (or 10 bytes)
33  * Uncompressed backend stores line on 4 bytes or 8 bytes.
34  *
35  * The algorithm is quite simple, the file is first divided in two parts:
36  * - The lines whose end are located before UINT32_MAX
37  * - The lines whose end are located after UINT32_MAX
38  * Those end of lines are stored separately in the table32 and the table64
39  * respectively.
40  *
41  * The EOL list is then divided in blocks of BLOCK_SIZE (128) lines.
42  * A block index vector (per table) contains pointers to each block.
43  *
44  * Each block is then defined as such:
45  * Block32 (sizes in byte)
46  * 00 - Absolute EOF address (4 bytes)
47  * 04 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
48  *    - ( 10xx xxxx
49  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
50  *    - ( 1111 1111
51  *        xxxx xxxx
52  *        xxxx xxxx  if second line is > 16383 ) (6 bytes, absolute)
53  * ...
54  * (126 more lines)
55  *
56  * Block64 (sizes in byte)
57  * 00 - Absolute EOF address (8 bytes)
58  * 08 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
59  *    - ( 10xx xxxx
60  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
61  *    - ( 1111 1111
62  *        xxxx xxxx
63  *        xxxx xxxx
64  *        xxxx xxxx
65  *        xxxx xxxx  if second line is > 16383 ) (10 bytes, absolute)
66  * ...
67  * (126 more lines)
68  *
69  * Absolute addressing has been adopted for line > 16383 to bound memory usage in case
70  * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for
71  * long-ish (30 KB) lines.
72  */
73 
74 #ifndef COMPRESSEDLINESTORAGE_H
75 #define COMPRESSEDLINESTORAGE_H
76 
77 #define BLOCK_SIZE 128
78 
79 //template<int BLOCK_SIZE = 128>
80 class CompressedLinePositionStorage
81 {
82   public:
83     // Default constructor
84     CompressedLinePositionStorage()
85     { nb_lines_ = 0; first_long_line_ = UINT32_MAX;
86       current_pos_ = 0; block_pointer_ = nullptr;
87       previous_block_pointer_ = nullptr; }
88     // Copy constructor would be slow, delete!
89     CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ) = delete;
90     // Move constructor
91     CompressedLinePositionStorage( CompressedLinePositionStorage&& orig );
92     // Move assignement
93     CompressedLinePositionStorage& operator=(
94             CompressedLinePositionStorage&& orig );
95     // Destructor
96     ~CompressedLinePositionStorage();
97 
98     // Append the passed end-of-line to the storage
99     void append( uint64_t pos );
100     // Size of the array
101     uint32_t size() const
102     { return nb_lines_; }
103     // Element at index
104     uint64_t at( int i ) const;
105 
106     // Add one list to the other
107     // TODO: do we need a move version?
108     void append_list( const CompressedLinePositionStorage& other );
109 
110     // Pop the last element of the storage
111     void pop_back();
112 
113   private:
114     // Utility for move ctor/assign
115     void move_from( CompressedLinePositionStorage&& orig );
116 
117     // The two indexes
118     std::vector<char*> block32_index_;
119     std::vector<char*> block64_index_;
120 
121     // Total number of lines in storage
122     uint32_t nb_lines_;
123 
124     // Current position (position of the end of the last line added)
125     uint64_t current_pos_;
126     // Address of the next position (not yet written) within the current
127     // block. nullptr means there is no current block (previous block
128     // finished or no data)
129     char* block_pointer_;
130 
131     // The index of the first line whose end is stored in a block64
132     // Initialised at UINT32_MAX, meaning "unset"
133     uint32_t first_long_line_;
134 
135     // For pop_back:
136 
137     // Previous pointer to block element, it is restored when we
138     // "pop_back" the last element.
139     // A null pointer here means pop_back need to free the block
140     // that has just been created.
141     char* previous_block_pointer_;
142 };
143 
144 #endif
145