xref: /glogg/src/data/compressedlinestorage.h (revision 02531d6a363303ac93258a8d3dc43388e1d08ba8)
17a2f3a57SNicolas Bonnefon /*
27a2f3a57SNicolas Bonnefon  * Copyright (C) 2015 Nicolas Bonnefon and other contributors
37a2f3a57SNicolas Bonnefon  *
47a2f3a57SNicolas Bonnefon  * This file is part of glogg.
57a2f3a57SNicolas Bonnefon  *
67a2f3a57SNicolas Bonnefon  * glogg is free software: you can redistribute it and/or modify
77a2f3a57SNicolas Bonnefon  * it under the terms of the GNU General Public License as published by
87a2f3a57SNicolas Bonnefon  * the Free Software Foundation, either version 3 of the License, or
97a2f3a57SNicolas Bonnefon  * (at your option) any later version.
107a2f3a57SNicolas Bonnefon  *
117a2f3a57SNicolas Bonnefon  * glogg is distributed in the hope that it will be useful,
127a2f3a57SNicolas Bonnefon  * but WITHOUT ANY WARRANTY; without even the implied warranty of
137a2f3a57SNicolas Bonnefon  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
147a2f3a57SNicolas Bonnefon  * GNU General Public License for more details.
157a2f3a57SNicolas Bonnefon  *
167a2f3a57SNicolas Bonnefon  * You should have received a copy of the GNU General Public License
177a2f3a57SNicolas Bonnefon  * along with glogg.  If not, see <http://www.gnu.org/licenses/>.
187a2f3a57SNicolas Bonnefon  */
197a2f3a57SNicolas Bonnefon 
207a2f3a57SNicolas Bonnefon #include <vector>
217a2f3a57SNicolas Bonnefon #include <cstdint>
227a2f3a57SNicolas Bonnefon 
237151e3acSNicolas Bonnefon #include "threadprivatestore.h"
247151e3acSNicolas Bonnefon 
257a2f3a57SNicolas Bonnefon // This class is a compressed storage backend for LinePositionArray
267a2f3a57SNicolas Bonnefon // It emulates the interface of a vector, but take advantage of the nature
277a2f3a57SNicolas Bonnefon // of the stored data (increasing end of line addresses) to apply some
287a2f3a57SNicolas Bonnefon // compression in memory, while still providing fast, constant-time look-up.
297a2f3a57SNicolas Bonnefon 
307a2f3a57SNicolas Bonnefon /* The current algorithm takes advantage of the fact most lines are reasonably
317a2f3a57SNicolas Bonnefon  * short, it codes each line on:
327a2f3a57SNicolas Bonnefon  * - Line < 127 bytes    : 1 byte
337a2f3a57SNicolas Bonnefon  * - 127 < line < 16383  : 2 bytes
347a2f3a57SNicolas Bonnefon  * - line > 16383        : 6 bytes (or 10 bytes)
357a2f3a57SNicolas Bonnefon  * Uncompressed backend stores line on 4 bytes or 8 bytes.
367a2f3a57SNicolas Bonnefon  *
377a2f3a57SNicolas Bonnefon  * The algorithm is quite simple, the file is first divided in two parts:
387a2f3a57SNicolas Bonnefon  * - The lines whose end are located before UINT32_MAX
397a2f3a57SNicolas Bonnefon  * - The lines whose end are located after UINT32_MAX
407a2f3a57SNicolas Bonnefon  * Those end of lines are stored separately in the table32 and the table64
417a2f3a57SNicolas Bonnefon  * respectively.
427a2f3a57SNicolas Bonnefon  *
437a2f3a57SNicolas Bonnefon  * The EOL list is then divided in blocks of BLOCK_SIZE (128) lines.
447a2f3a57SNicolas Bonnefon  * A block index vector (per table) contains pointers to each block.
457a2f3a57SNicolas Bonnefon  *
467a2f3a57SNicolas Bonnefon  * Each block is then defined as such:
477a2f3a57SNicolas Bonnefon  * Block32 (sizes in byte)
487a2f3a57SNicolas Bonnefon  * 00 - Absolute EOF address (4 bytes)
497a2f3a57SNicolas Bonnefon  * 04 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
507a2f3a57SNicolas Bonnefon  *    - ( 10xx xxxx
517a2f3a57SNicolas Bonnefon  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
527a2f3a57SNicolas Bonnefon  *    - ( 1111 1111
537a2f3a57SNicolas Bonnefon  *        xxxx xxxx
547a2f3a57SNicolas Bonnefon  *        xxxx xxxx  if second line is > 16383 ) (6 bytes, absolute)
557a2f3a57SNicolas Bonnefon  * ...
567a2f3a57SNicolas Bonnefon  * (126 more lines)
577a2f3a57SNicolas Bonnefon  *
587a2f3a57SNicolas Bonnefon  * Block64 (sizes in byte)
597a2f3a57SNicolas Bonnefon  * 00 - Absolute EOF address (8 bytes)
607a2f3a57SNicolas Bonnefon  * 08 - ( 0xxx xxxx  if second line is < 127 ) (1 byte, relative)
617a2f3a57SNicolas Bonnefon  *    - ( 10xx xxxx
627a2f3a57SNicolas Bonnefon  *        xxxx xxxx  if second line is < 16383 ) (2 bytes, relative)
637a2f3a57SNicolas Bonnefon  *    - ( 1111 1111
647a2f3a57SNicolas Bonnefon  *        xxxx xxxx
657a2f3a57SNicolas Bonnefon  *        xxxx xxxx
667a2f3a57SNicolas Bonnefon  *        xxxx xxxx
677a2f3a57SNicolas Bonnefon  *        xxxx xxxx  if second line is > 16383 ) (10 bytes, absolute)
687a2f3a57SNicolas Bonnefon  * ...
697a2f3a57SNicolas Bonnefon  * (126 more lines)
707a2f3a57SNicolas Bonnefon  *
717a2f3a57SNicolas Bonnefon  * Absolute addressing has been adopted for line > 16383 to bound memory usage in case
727a2f3a57SNicolas Bonnefon  * of pathologically long (MBs or GBs) lines, even if it is a bit less efficient for
737a2f3a57SNicolas Bonnefon  * long-ish (30 KB) lines.
746e2e573cSNicolas Bonnefon  *
756e2e573cSNicolas Bonnefon  * The table32 always starts at 0, the table64 starts at first_long_line_
767a2f3a57SNicolas Bonnefon  */
777a2f3a57SNicolas Bonnefon 
787a2f3a57SNicolas Bonnefon #ifndef COMPRESSEDLINESTORAGE_H
797a2f3a57SNicolas Bonnefon #define COMPRESSEDLINESTORAGE_H
807a2f3a57SNicolas Bonnefon 
817151e3acSNicolas Bonnefon #define BLOCK_SIZE 256
827a2f3a57SNicolas Bonnefon 
837a2f3a57SNicolas Bonnefon //template<int BLOCK_SIZE = 128>
847a2f3a57SNicolas Bonnefon class CompressedLinePositionStorage
857a2f3a57SNicolas Bonnefon {
867a2f3a57SNicolas Bonnefon   public:
877a2f3a57SNicolas Bonnefon     // Default constructor
CompressedLinePositionStorage()887a2f3a57SNicolas Bonnefon     CompressedLinePositionStorage()
897a2f3a57SNicolas Bonnefon     { nb_lines_ = 0; first_long_line_ = UINT32_MAX;
903c46a469SNicolas Bonnefon       current_pos_ = 0; block_pointer_ = nullptr;
913c46a469SNicolas Bonnefon       previous_block_pointer_ = nullptr; }
923c46a469SNicolas Bonnefon     // Copy constructor would be slow, delete!
93e49f4922SNicolas Bonnefon     CompressedLinePositionStorage( const CompressedLinePositionStorage& orig ) = delete;
94af5438c7SNicolas Bonnefon 
953c46a469SNicolas Bonnefon     // Move constructor
963c46a469SNicolas Bonnefon     CompressedLinePositionStorage( CompressedLinePositionStorage&& orig );
973c46a469SNicolas Bonnefon     // Move assignement
983c46a469SNicolas Bonnefon     CompressedLinePositionStorage& operator=(
993c46a469SNicolas Bonnefon             CompressedLinePositionStorage&& orig );
1003c46a469SNicolas Bonnefon     // Destructor
1013c46a469SNicolas Bonnefon     ~CompressedLinePositionStorage();
1027a2f3a57SNicolas Bonnefon 
1037a2f3a57SNicolas Bonnefon     // Append the passed end-of-line to the storage
1047a2f3a57SNicolas Bonnefon     void append( uint64_t pos );
push_back(uint64_t pos)1057151e3acSNicolas Bonnefon     void push_back( uint64_t pos )
1067151e3acSNicolas Bonnefon     { append( pos ); }
1077a2f3a57SNicolas Bonnefon     // Size of the array
size()1087a2f3a57SNicolas Bonnefon     uint32_t size() const
1097a2f3a57SNicolas Bonnefon     { return nb_lines_; }
1107a2f3a57SNicolas Bonnefon     // Element at index
1117151e3acSNicolas Bonnefon     uint64_t at( uint32_t i ) const;
1127a2f3a57SNicolas Bonnefon 
1137a2f3a57SNicolas Bonnefon     // Add one list to the other
114e49f4922SNicolas Bonnefon     void append_list( const std::vector<uint64_t>& positions );
1157a2f3a57SNicolas Bonnefon 
1167a2f3a57SNicolas Bonnefon     // Pop the last element of the storage
1177a2f3a57SNicolas Bonnefon     void pop_back();
1187a2f3a57SNicolas Bonnefon 
1197a2f3a57SNicolas Bonnefon   private:
1203c46a469SNicolas Bonnefon     // Utility for move ctor/assign
1213c46a469SNicolas Bonnefon     void move_from( CompressedLinePositionStorage&& orig );
12225d0072cSNicolas Bonnefon     void free_blocks();
1233c46a469SNicolas Bonnefon 
1247a2f3a57SNicolas Bonnefon     // The two indexes
1257a2f3a57SNicolas Bonnefon     std::vector<char*> block32_index_;
1267a2f3a57SNicolas Bonnefon     std::vector<char*> block64_index_;
1277a2f3a57SNicolas Bonnefon 
1287a2f3a57SNicolas Bonnefon     // Total number of lines in storage
1297a2f3a57SNicolas Bonnefon     uint32_t nb_lines_;
1307a2f3a57SNicolas Bonnefon 
1317a2f3a57SNicolas Bonnefon     // Current position (position of the end of the last line added)
1327a2f3a57SNicolas Bonnefon     uint64_t current_pos_;
1337a2f3a57SNicolas Bonnefon     // Address of the next position (not yet written) within the current
1347a2f3a57SNicolas Bonnefon     // block. nullptr means there is no current block (previous block
1357a2f3a57SNicolas Bonnefon     // finished or no data)
1367a2f3a57SNicolas Bonnefon     char* block_pointer_;
1377a2f3a57SNicolas Bonnefon 
1387a2f3a57SNicolas Bonnefon     // The index of the first line whose end is stored in a block64
1397a2f3a57SNicolas Bonnefon     // Initialised at UINT32_MAX, meaning "unset"
1406e2e573cSNicolas Bonnefon     // this is the origin point for all calculations in block64
1417a2f3a57SNicolas Bonnefon     uint32_t first_long_line_;
1423c46a469SNicolas Bonnefon 
1433c46a469SNicolas Bonnefon     // For pop_back:
1443c46a469SNicolas Bonnefon 
1453c46a469SNicolas Bonnefon     // Previous pointer to block element, it is restored when we
1463c46a469SNicolas Bonnefon     // "pop_back" the last element.
1473c46a469SNicolas Bonnefon     // A null pointer here means pop_back need to free the block
1483c46a469SNicolas Bonnefon     // that has just been created.
1493c46a469SNicolas Bonnefon     char* previous_block_pointer_;
1507151e3acSNicolas Bonnefon 
1517151e3acSNicolas Bonnefon     // Cache the last position read
1527151e3acSNicolas Bonnefon     // This is to speed up consecutive reads (whole page)
1537151e3acSNicolas Bonnefon     struct Cache {
CacheCache154*02531d6aSNicolas Bonnefon         Cache() {
155*02531d6aSNicolas Bonnefon             index = UINT32_MAX - 1U;
156*02531d6aSNicolas Bonnefon             position = 0;
157*02531d6aSNicolas Bonnefon             ptr = nullptr;
158*02531d6aSNicolas Bonnefon         }
159*02531d6aSNicolas Bonnefon 
1607151e3acSNicolas Bonnefon         uint32_t index;
1617151e3acSNicolas Bonnefon         uint64_t position;
1627151e3acSNicolas Bonnefon         char* ptr;
1637151e3acSNicolas Bonnefon     };
1647151e3acSNicolas Bonnefon     mutable ThreadPrivateStore<Cache,2> last_read_; // = { UINT32_MAX - 1U, 0, nullptr };
1657151e3acSNicolas Bonnefon     // mutable Cache last_read;
1667a2f3a57SNicolas Bonnefon };
1677a2f3a57SNicolas Bonnefon 
1687a2f3a57SNicolas Bonnefon #endif
169