xref: /glogg/src/data/compressedlinestorage.h (revision 7151e3ac37b1aab5aeda99b64e862aecf2a6be89)
1 /*
2  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
3  *
4  * This file is part of glogg.
5  *
6  * glogg is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * glogg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include <vector>
21 #include <cstdint>
22 
23 #include "threadprivatestore.h"
24 
25 // This class is a compressed storage backend for LinePositionArray
26 // It emulates the interface of a vector, but take advantage of the nature
27 // of the stored data (increasing end of line addresses) to apply some
28 // compression in memory, while still providing fast, constant-time look-up.
29 
30 /* The current algorithm takes advantage of the fact most lines are reasonably
31  * short, it codes each line on:
32  * - Line < 127 bytes    : 1 byte
33  * - 127 < line < 16383  : 2 bytes
34  * - line > 16383        : 6 bytes (or 10 bytes)
35  * Uncompressed backend stores line on 4 bytes or 8 bytes.
36  *
37  * The algorithm is quite simple, the file is first divided in two parts:
38  * - The lines whose end are located before UINT32_MAX
39  * - The lines whose end are located after UINT32_MAX
40  * Those end of lines are stored separately in the table32 and the table64
41  * respectively.
42  *
43  * The EOL list is then divided in blocks of BLOCK_SIZE (128) lines.
44  * A block index vector (per table) contains pointers to each block.
45  *
46  * Each block is then defined as such:
47  * Block32 (sizes in byte)
48  * 00 - Absolute EOF address (4 bytes)
49  * 04 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
50  *    - ( 10xx xxxx
51  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
52  *    - ( 1111 1111
53  *        xxxx xxxx
54  *        xxxx xxxx  if second line is > 16383 ) (6 bytes, absolute)
55  * ...
56  * (126 more lines)
57  *
58  * Block64 (sizes in byte)
59  * 00 - Absolute EOF address (8 bytes)
60  * 08 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
61  *    - ( 10xx xxxx
62  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
63  *    - ( 1111 1111
64  *        xxxx xxxx
65  *        xxxx xxxx
66  *        xxxx xxxx
67  *        xxxx xxxx  if second line is > 16383 ) (10 bytes, absolute)
68  * ...
69  * (126 more lines)
70  *
71  * Absolute addressing has been adopted for line > 16383 to bound memory usage in case
72  * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for
73  * long-ish (30 KB) lines.
74  */
75 
76 #ifndef COMPRESSEDLINESTORAGE_H
77 #define COMPRESSEDLINESTORAGE_H
78 
79 #define BLOCK_SIZE 256
80 
81 //template<int BLOCK_SIZE = 128>
82 class CompressedLinePositionStorage
83 {
84   public:
85     // Default constructor
86     CompressedLinePositionStorage()
87     { nb_lines_ = 0; first_long_line_ = UINT32_MAX;
88       current_pos_ = 0; block_pointer_ = nullptr;
89       previous_block_pointer_ = nullptr; }
90     // Copy constructor would be slow, delete!
91     CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ) = delete;
92 
93     // Move constructor
94     CompressedLinePositionStorage( CompressedLinePositionStorage&& orig );
95     // Move assignement
96     CompressedLinePositionStorage& operator=(
97             CompressedLinePositionStorage&& orig );
98     // Destructor
99     ~CompressedLinePositionStorage();
100 
101     // Append the passed end-of-line to the storage
102     void append( uint64_t pos );
103     void push_back( uint64_t pos )
104     { append( pos ); }
105     // Size of the array
106     uint32_t size() const
107     { return nb_lines_; }
108     // Element at index
109     uint64_t at( uint32_t i ) const;
110 
111     // Add one list to the other
112     void append_list( const std::vector<uint64_t>& positions );
113 
114     // Pop the last element of the storage
115     void pop_back();
116 
117   private:
118     // Utility for move ctor/assign
119     void move_from( CompressedLinePositionStorage&& orig );
120 
121     // The two indexes
122     std::vector<char*> block32_index_;
123     std::vector<char*> block64_index_;
124 
125     // Total number of lines in storage
126     uint32_t nb_lines_;
127 
128     // Current position (position of the end of the last line added)
129     uint64_t current_pos_;
130     // Address of the next position (not yet written) within the current
131     // block. nullptr means there is no current block (previous block
132     // finished or no data)
133     char* block_pointer_;
134 
135     // The index of the first line whose end is stored in a block64
136     // Initialised at UINT32_MAX, meaning "unset"
137     uint32_t first_long_line_;
138 
139     // For pop_back:
140 
141     // Previous pointer to block element, it is restored when we
142     // "pop_back" the last element.
143     // A null pointer here means pop_back need to free the block
144     // that has just been created.
145     char* previous_block_pointer_;
146 
147     // Cache the last position read
148     // This is to speed up consecutive reads (whole page)
149     struct Cache {
150         uint32_t index;
151         uint64_t position;
152         char* ptr;
153     };
154     mutable ThreadPrivateStore<Cache,2> last_read_; // = { UINT32_MAX - 1U, 0, nullptr };
155     // mutable Cache last_read;
156 };
157 
158 #endif
159