/* * Copyright (C) 2015 Nicolas Bonnefon and other contributors * * This file is part of glogg. * * glogg is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * glogg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with glogg. If not, see . */ #include #include #include "threadprivatestore.h" // This class is a compressed storage backend for LinePositionArray // It emulates the interface of a vector, but take advantage of the nature // of the stored data (increasing end of line addresses) to apply some // compression in memory, while still providing fast, constant-time look-up. /* The current algorithm takes advantage of the fact most lines are reasonably * short, it codes each line on: * - Line < 127 bytes : 1 byte * - 127 < line < 16383 : 2 bytes * - line > 16383 : 6 bytes (or 10 bytes) * Uncompressed backend stores line on 4 bytes or 8 bytes. * * The algorithm is quite simple, the file is first divided in two parts: * - The lines whose end are located before UINT32_MAX * - The lines whose end are located after UINT32_MAX * Those end of lines are stored separately in the table32 and the table64 * respectively. * * The EOL list is then divided in blocks of BLOCK_SIZE (128) lines. * A block index vector (per table) contains pointers to each block. * * Each block is then defined as such: * Block32 (sizes in byte) * 00 - Absolute EOF address (4 bytes) * 04 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative) * - ( 10xx xxxx * xxxx xxxx if second line is < 16383 ) (2 bytes, relative) * - ( 1111 1111 * xxxx xxxx * xxxx xxxx if second line is > 16383 ) (6 bytes, absolute) * ... * (126 more lines) * * Block64 (sizes in byte) * 00 - Absolute EOF address (8 bytes) * 08 - ( 0xxx xxxx if second line is < 127 ) (1 byte, relative) * - ( 10xx xxxx * xxxx xxxx if second line is < 16383 ) (2 bytes, relative) * - ( 1111 1111 * xxxx xxxx * xxxx xxxx * xxxx xxxx * xxxx xxxx if second line is > 16383 ) (10 bytes, absolute) * ... * (126 more lines) * * Absolute addressing has been adopted for line > 16383 to bound memory usage in case * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for * long-ish (30 KB) lines. * * The table32 always starts at 0, the table64 starts at first_long_line_ */ #ifndef COMPRESSEDLINESTORAGE_H #define COMPRESSEDLINESTORAGE_H #define BLOCK_SIZE 256 //template class CompressedLinePositionStorage { public: // Default constructor CompressedLinePositionStorage() { nb_lines_ = 0; first_long_line_ = UINT32_MAX; current_pos_ = 0; block_pointer_ = nullptr; previous_block_pointer_ = nullptr; } // Copy constructor would be slow, delete! CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ) = delete; // Move constructor CompressedLinePositionStorage( CompressedLinePositionStorage&& orig ); // Move assignement CompressedLinePositionStorage& operator=( CompressedLinePositionStorage&& orig ); // Destructor ~CompressedLinePositionStorage(); // Append the passed end-of-line to the storage void append( uint64_t pos ); void push_back( uint64_t pos ) { append( pos ); } // Size of the array uint32_t size() const { return nb_lines_; } // Element at index uint64_t at( uint32_t i ) const; // Add one list to the other void append_list( const std::vector& positions ); // Pop the last element of the storage void pop_back(); private: // Utility for move ctor/assign void move_from( CompressedLinePositionStorage&& orig ); void free_blocks(); // The two indexes std::vector block32_index_; std::vector block64_index_; // Total number of lines in storage uint32_t nb_lines_; // Current position (position of the end of the last line added) uint64_t current_pos_; // Address of the next position (not yet written) within the current // block. nullptr means there is no current block (previous block // finished or no data) char* block_pointer_; // The index of the first line whose end is stored in a block64 // Initialised at UINT32_MAX, meaning "unset" // this is the origin point for all calculations in block64 uint32_t first_long_line_; // For pop_back: // Previous pointer to block element, it is restored when we // "pop_back" the last element. // A null pointer here means pop_back need to free the block // that has just been created. char* previous_block_pointer_; // Cache the last position read // This is to speed up consecutive reads (whole page) struct Cache { uint32_t index; uint64_t position; char* ptr; }; mutable ThreadPrivateStore last_read_; // = { UINT32_MAX - 1U, 0, nullptr }; // mutable Cache last_read; }; #endif