1 /* 2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include <vector> 21 #include <cstdint> 22 23 #include "threadprivatestore.h" 24 25 // This class is a compressed storage backend for LinePositionArray 26 // It emulates the interface of a vector, but take advantage of the nature 27 // of the stored data (increasing end of line addresses) to apply some 28 // compression in memory, while still providing fast, constant-time look-up. 29 30 /* The current algorithm takes advantage of the fact most lines are reasonably 31 * short, it codes each line on: 32 * - Line < 127 bytes : 1 byte 33 * - 127 < line < 16383 : 2 bytes 34 * - line > 16383 : 6 bytes (or 10 bytes) 35 * Uncompressed backend stores line on 4 bytes or 8 bytes. 36 * 37 * The algorithm is quite simple, the file is first divided in two parts: 38 * - The lines whose end are located before UINT32_MAX 39 * - The lines whose end are located after UINT32_MAX 40 * Those end of lines are stored separately in the table32 and the table64 41 * respectively. 42 * 43 * The EOL list is then divided in blocks of BLOCK_SIZE (128) lines. 44 * A block index vector (per table) contains pointers to each block. 45 * 46 * Each block is then defined as such: 47 * Block32 (sizes in byte) 48 * 00 - Absolute EOF address (4 bytes) 49 * 04 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative) 50 * - ( 10xx xxxx 51 * xxxx xxxx if second line is < 16383 ) (2 bytes, relative) 52 * - ( 1111 1111 53 * xxxx xxxx 54 * xxxx xxxx if second line is > 16383 ) (6 bytes, absolute) 55 * ... 56 * (126 more lines) 57 * 58 * Block64 (sizes in byte) 59 * 00 - Absolute EOF address (8 bytes) 60 * 08 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative) 61 * - ( 10xx xxxx 62 * xxxx xxxx if second line is < 16383 ) (2 bytes, relative) 63 * - ( 1111 1111 64 * xxxx xxxx 65 * xxxx xxxx 66 * xxxx xxxx 67 * xxxx xxxx if second line is > 16383 ) (10 bytes, absolute) 68 * ... 69 * (126 more lines) 70 * 71 * Absolute addressing has been adopted for line > 16383 to bound memory usage in case 72 * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for 73 * long-ish (30 KB) lines. 74 * 75 * The table32 always starts at 0, the table64 starts at first_long_line_ 76 */ 77 78 #ifndef COMPRESSEDLINESTORAGE_H 79 #define COMPRESSEDLINESTORAGE_H 80 81 #define BLOCK_SIZE 256 82 83 //template<int BLOCK_SIZE = 128> 84 class CompressedLinePositionStorage 85 { 86 public: 87 // Default constructor 88 CompressedLinePositionStorage() 89 { nb_lines_ = 0; first_long_line_ = UINT32_MAX; 90 current_pos_ = 0; block_pointer_ = nullptr; 91 previous_block_pointer_ = nullptr; } 92 // Copy constructor would be slow, delete! 93 CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ) = delete; 94 95 // Move constructor 96 CompressedLinePositionStorage( CompressedLinePositionStorage&& orig ); 97 // Move assignement 98 CompressedLinePositionStorage& operator=( 99 CompressedLinePositionStorage&& orig ); 100 // Destructor 101 ~CompressedLinePositionStorage(); 102 103 // Append the passed end-of-line to the storage 104 void append( uint64_t pos ); 105 void push_back( uint64_t pos ) 106 { append( pos ); } 107 // Size of the array 108 uint32_t size() const 109 { return nb_lines_; } 110 // Element at index 111 uint64_t at( uint32_t i ) const; 112 113 // Add one list to the other 114 void append_list( const std::vector<uint64_t>& positions ); 115 116 // Pop the last element of the storage 117 void pop_back(); 118 119 private: 120 // Utility for move ctor/assign 121 void move_from( CompressedLinePositionStorage&& orig ); 122 void free_blocks(); 123 124 // The two indexes 125 std::vector<char*> block32_index_; 126 std::vector<char*> block64_index_; 127 128 // Total number of lines in storage 129 uint32_t nb_lines_; 130 131 // Current position (position of the end of the last line added) 132 uint64_t current_pos_; 133 // Address of the next position (not yet written) within the current 134 // block. nullptr means there is no current block (previous block 135 // finished or no data) 136 char* block_pointer_; 137 138 // The index of the first line whose end is stored in a block64 139 // Initialised at UINT32_MAX, meaning "unset" 140 // this is the origin point for all calculations in block64 141 uint32_t first_long_line_; 142 143 // For pop_back: 144 145 // Previous pointer to block element, it is restored when we 146 // "pop_back" the last element. 147 // A null pointer here means pop_back need to free the block 148 // that has just been created. 149 char* previous_block_pointer_; 150 151 // Cache the last position read 152 // This is to speed up consecutive reads (whole page) 153 struct Cache { 154 Cache() { 155 index = UINT32_MAX - 1U; 156 position = 0; 157 ptr = nullptr; 158 } 159 160 uint32_t index; 161 uint64_t position; 162 char* ptr; 163 }; 164 mutable ThreadPrivateStore<Cache,2> last_read_; // = { UINT32_MAX - 1U, 0, nullptr }; 165 // mutable Cache last_read; 166 }; 167 168 #endif 169