17a2f3a57SNicolas Bonnefon /* 27a2f3a57SNicolas Bonnefon * Copyright (C) 2015 Nicolas Bonnefon and other contributors 37a2f3a57SNicolas Bonnefon * 47a2f3a57SNicolas Bonnefon * This file is part of glogg. 57a2f3a57SNicolas Bonnefon * 67a2f3a57SNicolas Bonnefon * glogg is free software: you can redistribute it and/or modify 77a2f3a57SNicolas Bonnefon * it under the terms of the GNU General Public License as published by 87a2f3a57SNicolas Bonnefon * the Free Software Foundation, either version 3 of the License, or 97a2f3a57SNicolas Bonnefon * (at your option) any later version. 107a2f3a57SNicolas Bonnefon * 117a2f3a57SNicolas Bonnefon * glogg is distributed in the hope that it will be useful, 127a2f3a57SNicolas Bonnefon * but WITHOUT ANY WARRANTY; without even the implied warranty of 137a2f3a57SNicolas Bonnefon * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 147a2f3a57SNicolas Bonnefon * GNU General Public License for more details. 157a2f3a57SNicolas Bonnefon * 167a2f3a57SNicolas Bonnefon * You should have received a copy of the GNU General Public License 177a2f3a57SNicolas Bonnefon * along with glogg. If not, see <http://www.gnu.org/licenses/>. 187a2f3a57SNicolas Bonnefon */ 197a2f3a57SNicolas Bonnefon 207a2f3a57SNicolas Bonnefon #include <vector> 217a2f3a57SNicolas Bonnefon #include <cstdint> 227a2f3a57SNicolas Bonnefon 237151e3acSNicolas Bonnefon #include "threadprivatestore.h" 247151e3acSNicolas Bonnefon 257a2f3a57SNicolas Bonnefon // This class is a compressed storage backend for LinePositionArray 267a2f3a57SNicolas Bonnefon // It emulates the interface of a vector, but take advantage of the nature 277a2f3a57SNicolas Bonnefon // of the stored data (increasing end of line addresses) to apply some 287a2f3a57SNicolas Bonnefon // compression in memory, while still providing fast, constant-time look-up. 297a2f3a57SNicolas Bonnefon 307a2f3a57SNicolas Bonnefon /* The current algorithm takes advantage of the fact most lines are reasonably 317a2f3a57SNicolas Bonnefon * short, it codes each line on: 327a2f3a57SNicolas Bonnefon * - Line < 127 bytes : 1 byte 337a2f3a57SNicolas Bonnefon * - 127 < line < 16383 : 2 bytes 347a2f3a57SNicolas Bonnefon * - line > 16383 : 6 bytes (or 10 bytes) 357a2f3a57SNicolas Bonnefon * Uncompressed backend stores line on 4 bytes or 8 bytes. 367a2f3a57SNicolas Bonnefon * 377a2f3a57SNicolas Bonnefon * The algorithm is quite simple, the file is first divided in two parts: 387a2f3a57SNicolas Bonnefon * - The lines whose end are located before UINT32_MAX 397a2f3a57SNicolas Bonnefon * - The lines whose end are located after UINT32_MAX 407a2f3a57SNicolas Bonnefon * Those end of lines are stored separately in the table32 and the table64 417a2f3a57SNicolas Bonnefon * respectively. 427a2f3a57SNicolas Bonnefon * 437a2f3a57SNicolas Bonnefon * The EOL list is then divided in blocks of BLOCK_SIZE (128) lines. 447a2f3a57SNicolas Bonnefon * A block index vector (per table) contains pointers to each block. 457a2f3a57SNicolas Bonnefon * 467a2f3a57SNicolas Bonnefon * Each block is then defined as such: 477a2f3a57SNicolas Bonnefon * Block32 (sizes in byte) 487a2f3a57SNicolas Bonnefon * 00 - Absolute EOF address (4 bytes) 497a2f3a57SNicolas Bonnefon * 04 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative) 507a2f3a57SNicolas Bonnefon * - ( 10xx xxxx 517a2f3a57SNicolas Bonnefon * xxxx xxxx if second line is < 16383 ) (2 bytes, relative) 527a2f3a57SNicolas Bonnefon * - ( 1111 1111 537a2f3a57SNicolas Bonnefon * xxxx xxxx 547a2f3a57SNicolas Bonnefon * xxxx xxxx if second line is > 16383 ) (6 bytes, absolute) 557a2f3a57SNicolas Bonnefon * ... 567a2f3a57SNicolas Bonnefon * (126 more lines) 577a2f3a57SNicolas Bonnefon * 587a2f3a57SNicolas Bonnefon * Block64 (sizes in byte) 597a2f3a57SNicolas Bonnefon * 00 - Absolute EOF address (8 bytes) 607a2f3a57SNicolas Bonnefon * 08 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative) 617a2f3a57SNicolas Bonnefon * - ( 10xx xxxx 627a2f3a57SNicolas Bonnefon * xxxx xxxx if second line is < 16383 ) (2 bytes, relative) 637a2f3a57SNicolas Bonnefon * - ( 1111 1111 647a2f3a57SNicolas Bonnefon * xxxx xxxx 657a2f3a57SNicolas Bonnefon * xxxx xxxx 667a2f3a57SNicolas Bonnefon * xxxx xxxx 677a2f3a57SNicolas Bonnefon * xxxx xxxx if second line is > 16383 ) (10 bytes, absolute) 687a2f3a57SNicolas Bonnefon * ... 697a2f3a57SNicolas Bonnefon * (126 more lines) 707a2f3a57SNicolas Bonnefon * 717a2f3a57SNicolas Bonnefon * Absolute addressing has been adopted for line > 16383 to bound memory usage in case 727a2f3a57SNicolas Bonnefon * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for 737a2f3a57SNicolas Bonnefon * long-ish (30 KB) lines. 746e2e573cSNicolas Bonnefon * 756e2e573cSNicolas Bonnefon * The table32 always starts at 0, the table64 starts at first_long_line_ 767a2f3a57SNicolas Bonnefon */ 777a2f3a57SNicolas Bonnefon 787a2f3a57SNicolas Bonnefon #ifndef COMPRESSEDLINESTORAGE_H 797a2f3a57SNicolas Bonnefon #define COMPRESSEDLINESTORAGE_H 807a2f3a57SNicolas Bonnefon 817151e3acSNicolas Bonnefon #define BLOCK_SIZE 256 827a2f3a57SNicolas Bonnefon 837a2f3a57SNicolas Bonnefon //template<int BLOCK_SIZE = 128> 847a2f3a57SNicolas Bonnefon class CompressedLinePositionStorage 857a2f3a57SNicolas Bonnefon { 867a2f3a57SNicolas Bonnefon public: 877a2f3a57SNicolas Bonnefon // Default constructor CompressedLinePositionStorage()887a2f3a57SNicolas Bonnefon CompressedLinePositionStorage() 897a2f3a57SNicolas Bonnefon { nb_lines_ = 0; first_long_line_ = UINT32_MAX; 903c46a469SNicolas Bonnefon current_pos_ = 0; block_pointer_ = nullptr; 913c46a469SNicolas Bonnefon previous_block_pointer_ = nullptr; } 923c46a469SNicolas Bonnefon // Copy constructor would be slow, delete! 93e49f4922SNicolas Bonnefon CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ) = delete; 94af5438c7SNicolas Bonnefon 953c46a469SNicolas Bonnefon // Move constructor 963c46a469SNicolas Bonnefon CompressedLinePositionStorage( CompressedLinePositionStorage&& orig ); 973c46a469SNicolas Bonnefon // Move assignement 983c46a469SNicolas Bonnefon CompressedLinePositionStorage& operator=( 993c46a469SNicolas Bonnefon CompressedLinePositionStorage&& orig ); 1003c46a469SNicolas Bonnefon // Destructor 1013c46a469SNicolas Bonnefon ~CompressedLinePositionStorage(); 1027a2f3a57SNicolas Bonnefon 1037a2f3a57SNicolas Bonnefon // Append the passed end-of-line to the storage 1047a2f3a57SNicolas Bonnefon void append( uint64_t pos ); push_back(uint64_t pos)1057151e3acSNicolas Bonnefon void push_back( uint64_t pos ) 1067151e3acSNicolas Bonnefon { append( pos ); } 1077a2f3a57SNicolas Bonnefon // Size of the array size()1087a2f3a57SNicolas Bonnefon uint32_t size() const 1097a2f3a57SNicolas Bonnefon { return nb_lines_; } 1107a2f3a57SNicolas Bonnefon // Element at index 1117151e3acSNicolas Bonnefon uint64_t at( uint32_t i ) const; 1127a2f3a57SNicolas Bonnefon 1137a2f3a57SNicolas Bonnefon // Add one list to the other 114e49f4922SNicolas Bonnefon void append_list( const std::vector<uint64_t>& positions ); 1157a2f3a57SNicolas Bonnefon 1167a2f3a57SNicolas Bonnefon // Pop the last element of the storage 1177a2f3a57SNicolas Bonnefon void pop_back(); 1187a2f3a57SNicolas Bonnefon 1197a2f3a57SNicolas Bonnefon private: 1203c46a469SNicolas Bonnefon // Utility for move ctor/assign 1213c46a469SNicolas Bonnefon void move_from( CompressedLinePositionStorage&& orig ); 12225d0072cSNicolas Bonnefon void free_blocks(); 1233c46a469SNicolas Bonnefon 1247a2f3a57SNicolas Bonnefon // The two indexes 1257a2f3a57SNicolas Bonnefon std::vector<char*> block32_index_; 1267a2f3a57SNicolas Bonnefon std::vector<char*> block64_index_; 1277a2f3a57SNicolas Bonnefon 1287a2f3a57SNicolas Bonnefon // Total number of lines in storage 1297a2f3a57SNicolas Bonnefon uint32_t nb_lines_; 1307a2f3a57SNicolas Bonnefon 1317a2f3a57SNicolas Bonnefon // Current position (position of the end of the last line added) 1327a2f3a57SNicolas Bonnefon uint64_t current_pos_; 1337a2f3a57SNicolas Bonnefon // Address of the next position (not yet written) within the current 1347a2f3a57SNicolas Bonnefon // block. nullptr means there is no current block (previous block 1357a2f3a57SNicolas Bonnefon // finished or no data) 1367a2f3a57SNicolas Bonnefon char* block_pointer_; 1377a2f3a57SNicolas Bonnefon 1387a2f3a57SNicolas Bonnefon // The index of the first line whose end is stored in a block64 1397a2f3a57SNicolas Bonnefon // Initialised at UINT32_MAX, meaning "unset" 1406e2e573cSNicolas Bonnefon // this is the origin point for all calculations in block64 1417a2f3a57SNicolas Bonnefon uint32_t first_long_line_; 1423c46a469SNicolas Bonnefon 1433c46a469SNicolas Bonnefon // For pop_back: 1443c46a469SNicolas Bonnefon 1453c46a469SNicolas Bonnefon // Previous pointer to block element, it is restored when we 1463c46a469SNicolas Bonnefon // "pop_back" the last element. 1473c46a469SNicolas Bonnefon // A null pointer here means pop_back need to free the block 1483c46a469SNicolas Bonnefon // that has just been created. 1493c46a469SNicolas Bonnefon char* previous_block_pointer_; 1507151e3acSNicolas Bonnefon 1517151e3acSNicolas Bonnefon // Cache the last position read 1527151e3acSNicolas Bonnefon // This is to speed up consecutive reads (whole page) 1537151e3acSNicolas Bonnefon struct Cache { CacheCache154*02531d6aSNicolas Bonnefon Cache() { 155*02531d6aSNicolas Bonnefon index = UINT32_MAX - 1U; 156*02531d6aSNicolas Bonnefon position = 0; 157*02531d6aSNicolas Bonnefon ptr = nullptr; 158*02531d6aSNicolas Bonnefon } 159*02531d6aSNicolas Bonnefon 1607151e3acSNicolas Bonnefon uint32_t index; 1617151e3acSNicolas Bonnefon uint64_t position; 1627151e3acSNicolas Bonnefon char* ptr; 1637151e3acSNicolas Bonnefon }; 1647151e3acSNicolas Bonnefon mutable ThreadPrivateStore<Cache,2> last_read_; // = { UINT32_MAX - 1U, 0, nullptr }; 1657151e3acSNicolas Bonnefon // mutable Cache last_read; 1667a2f3a57SNicolas Bonnefon }; 1677a2f3a57SNicolas Bonnefon 1687a2f3a57SNicolas Bonnefon #endif 169