xref: /glogg/src/data/compressedlinestorage.h (revision 6e2e573c451ec24d720c967fab68538eaa3f3ab5)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <vector>
21 #include <cstdint>
22 
23 #include "threadprivatestore.h"
24 
25 // This class is a compressed storage backend for LinePositionArray
26 // It emulates the interface of a vector, but take advantage of the nature
27 // of the stored data (increasing end of line addresses) to apply some
28 // compression in memory, while still providing fast, constant-time look-up.
29 
30 /* The current algorithm takes advantage of the fact most lines are reasonably
31  * short, it codes each line on:
32  * - Line < 127 bytes    : 1 byte
33  * - 127 < line < 16383  : 2 bytes
34  * - line > 16383        : 6 bytes (or 10 bytes)
35  * Uncompressed backend stores line on 4 bytes or 8 bytes.
36  *
37  * The algorithm is quite simple, the file is first divided in two parts:
38  * - The lines whose end are located before UINT32_MAX
39  * - The lines whose end are located after UINT32_MAX
40  * Those end of lines are stored separately in the table32 and the table64
41  * respectively.
42  *
43  * The EOL list is then divided in blocks of BLOCK_SIZE (128) lines.
44  * A block index vector (per table) contains pointers to each block.
45  *
46  * Each block is then defined as such:
47  * Block32 (sizes in byte)
48  * 00 - Absolute EOF address (4 bytes)
49  * 04 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
50  *    - ( 10xx xxxx
51  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
52  *    - ( 1111 1111
53  *        xxxx xxxx
54  *        xxxx xxxx  if second line is > 16383 ) (6 bytes, absolute)
55  * ...
56  * (126 more lines)
57  *
58  * Block64 (sizes in byte)
59  * 00 - Absolute EOF address (8 bytes)
60  * 08 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
61  *    - ( 10xx xxxx
62  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
63  *    - ( 1111 1111
64  *        xxxx xxxx
65  *        xxxx xxxx
66  *        xxxx xxxx
67  *        xxxx xxxx  if second line is > 16383 ) (10 bytes, absolute)
68  * ...
69  * (126 more lines)
70  *
71  * Absolute addressing has been adopted for line > 16383 to bound memory usage in case
72  * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for
73  * long-ish (30 KB) lines.
74  *
75  * The table32 always starts at 0, the table64 starts at first_long_line_
76  */
77 
78 #ifndef COMPRESSEDLINESTORAGE_H
79 #define COMPRESSEDLINESTORAGE_H
80 
81 #define BLOCK_SIZE 256
82 
83 //template<int BLOCK_SIZE = 128>
84 class CompressedLinePositionStorage
85 {
86   public:
87     // Default constructor
88     CompressedLinePositionStorage()
89     { nb_lines_ = 0; first_long_line_ = UINT32_MAX;
90       current_pos_ = 0; block_pointer_ = nullptr;
91       previous_block_pointer_ = nullptr; }
92     // Copy constructor would be slow, delete!
93     CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ) = delete;
94 
95     // Move constructor
96     CompressedLinePositionStorage( CompressedLinePositionStorage&& orig );
97     // Move assignement
98     CompressedLinePositionStorage& operator=(
99             CompressedLinePositionStorage&& orig );
100     // Destructor
101     ~CompressedLinePositionStorage();
102 
103     // Append the passed end-of-line to the storage
104     void append( uint64_t pos );
105     void push_back( uint64_t pos )
106     { append( pos ); }
107     // Size of the array
108     uint32_t size() const
109     { return nb_lines_; }
110     // Element at index
111     uint64_t at( uint32_t i ) const;
112 
113     // Add one list to the other
114     void append_list( const std::vector<uint64_t>& positions );
115 
116     // Pop the last element of the storage
117     void pop_back();
118 
119   private:
120     // Utility for move ctor/assign
121     void move_from( CompressedLinePositionStorage&& orig );
122 
123     // The two indexes
124     std::vector<char*> block32_index_;
125     std::vector<char*> block64_index_;
126 
127     // Total number of lines in storage
128     uint32_t nb_lines_;
129 
130     // Current position (position of the end of the last line added)
131     uint64_t current_pos_;
132     // Address of the next position (not yet written) within the current
133     // block. nullptr means there is no current block (previous block
134     // finished or no data)
135     char* block_pointer_;
136 
137     // The index of the first line whose end is stored in a block64
138     // Initialised at UINT32_MAX, meaning "unset"
139     // this is the origin point for all calculations in block64
140     uint32_t first_long_line_;
141 
142     // For pop_back:
143 
144     // Previous pointer to block element, it is restored when we
145     // "pop_back" the last element.
146     // A null pointer here means pop_back need to free the block
147     // that has just been created.
148     char* previous_block_pointer_;
149 
150     // Cache the last position read
151     // This is to speed up consecutive reads (whole page)
152     struct Cache {
153         uint32_t index;
154         uint64_t position;
155         char* ptr;
156     };
157     mutable ThreadPrivateStore<Cache,2> last_read_; // = { UINT32_MAX - 1U, 0, nullptr };
158     // mutable Cache last_read;
159 };
160 
161 #endif
162