1 /* 2 * Copyright (C) 2015 Nicolas Bonnefon and other contributors 3 * 4 * This file is part of glogg. 5 * 6 * glogg is free software: you can redistribute it and/or modify 7 * it under the terms of the GNU General Public License as published by 8 * the Free Software Foundation, either version 3 of the License, or 9 * (at your option) any later version. 10 * 11 * glogg is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with glogg. If not, see <http://www.gnu.org/licenses/>. 18 */ 19 20 #include <vector> 21 #include <cstdint> 22 23 // This class is a compressed storage backend for LinePositionArray 24 // It emulates the interface of a vector, but take advantage of the nature 25 // of the stored data (increasing end of line addresses) to apply some 26 // compression in memory, while still providing fast, constant-time look-up. 27 28 /* The current algorithm takes advantage of the fact most lines are reasonably 29 * short, it codes each line on: 30 * - Line < 127 bytes : 1 byte 31 * - 127 < line < 16383 : 2 bytes 32 * - line > 16383 : 6 bytes (or 10 bytes) 33 * Uncompressed backend stores line on 4 bytes or 8 bytes. 34 * 35 * The algorithm is quite simple, the file is first divided in two parts: 36 * - The lines whose end are located before UINT32_MAX 37 * - The lines whose end are located after UINT32_MAX 38 * Those end of lines are stored separately in the table32 and the table64 39 * respectively. 40 * 41 * The EOL list is then divided in blocks of BLOCK_SIZE (128) lines. 42 * A block index vector (per table) contains pointers to each block. 43 * 44 * Each block is then defined as such: 45 * Block32 (sizes in byte) 46 * 00 - Absolute EOF address (4 bytes) 47 * 04 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative) 48 * - ( 10xx xxxx 49 * xxxx xxxx if second line is < 16383 ) (2 bytes, relative) 50 * - ( 1111 1111 51 * xxxx xxxx 52 * xxxx xxxx if second line is > 16383 ) (6 bytes, absolute) 53 * ... 54 * (126 more lines) 55 * 56 * Block64 (sizes in byte) 57 * 00 - Absolute EOF address (8 bytes) 58 * 08 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative) 59 * - ( 10xx xxxx 60 * xxxx xxxx if second line is < 16383 ) (2 bytes, relative) 61 * - ( 1111 1111 62 * xxxx xxxx 63 * xxxx xxxx 64 * xxxx xxxx 65 * xxxx xxxx if second line is > 16383 ) (10 bytes, absolute) 66 * ... 67 * (126 more lines) 68 * 69 * Absolute addressing has been adopted for line > 16383 to bound memory usage in case 70 * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for 71 * long-ish (30 KB) lines. 72 */ 73 74 #ifndef COMPRESSEDLINESTORAGE_H 75 #define COMPRESSEDLINESTORAGE_H 76 77 #define BLOCK_SIZE 128 78 79 //template<int BLOCK_SIZE = 128> 80 class CompressedLinePositionStorage 81 { 82 public: 83 // Default constructor 84 CompressedLinePositionStorage() 85 { nb_lines_ = 0; first_long_line_ = UINT32_MAX; 86 current_pos_ = 0; block_pointer_ = nullptr; } 87 // Copy constructor 88 CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ); 89 // TODO: do we need move constructor? 90 91 // Append the passed end-of-line to the storage 92 void append( uint64_t pos ); 93 // Size of the array 94 uint32_t size() const 95 { return nb_lines_; } 96 // Element at index 97 uint64_t at( int i ) const; 98 99 // Add one list to the other 100 // TODO: do we need a move version? 101 void append_list( const CompressedLinePositionStorage& other ); 102 103 // Pop the last element of the storage 104 void pop_back(); 105 106 private: 107 // The two indexes 108 std::vector<char*> block32_index_; 109 std::vector<char*> block64_index_; 110 111 // Total number of lines in storage 112 uint32_t nb_lines_; 113 114 // Current position (position of the end of the last line added) 115 uint64_t current_pos_; 116 // Address of the next position (not yet written) within the current 117 // block. nullptr means there is no current block (previous block 118 // finished or no data) 119 char* block_pointer_; 120 121 // The index of the first line whose end is stored in a block64 122 // Initialised at UINT32_MAX, meaning "unset" 123 uint32_t first_long_line_; 124 }; 125 126 #endif 127