// Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once #include #include #include #include "rocksdb/env.h" #include "rocksdb/iterator.h" #include "rocksdb/table.h" #include "rocksdb/plain_table_factory.h" namespace rocksdb { class Block; class BlockHandle; class Footer; struct Options; class RandomAccessFile; struct ReadOptions; class TableCache; class TableReader; class DynamicBloom; using std::unique_ptr; using std::unordered_map; // Based on the output file format shown in plain_table_factory.h. // When opening the output file, PlainTableReader (formerly IndexedTableReader) // creates a hash table from key prefixes to offsets in the output file. For // each prefix the reader decides whether the hash table entry points directly // to the data offset of the first key with that prefix or to a secondary // index: if too many keys share the prefix, it creates a binary-searchable // index from the key suffix to the offset on disk. 
// // The implementation of IndexedTableReader requires output file is mmaped class PlainTableReader: public TableReader { public: static Status Open(const Options& options, const EnvOptions& soptions, unique_ptr && file, uint64_t file_size, unique_ptr* table, const int bloom_num_bits, double hash_table_ratio); bool PrefixMayMatch(const Slice& internal_prefix); Iterator* NewIterator(const ReadOptions&); Status Get( const ReadOptions&, const Slice& key, void* arg, bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), void (*mark_key_may_exist)(void*) = nullptr); uint64_t ApproximateOffsetOf(const Slice& key); bool TEST_KeyInCache(const ReadOptions& options, const Slice& key); void SetupForCompaction(); TableProperties& GetTableProperties() { return table_properties_; } PlainTableReader(const EnvOptions& storage_options, uint64_t file_size, int bloom_num_bits, double hash_table_ratio, const TableProperties& table_properties); ~PlainTableReader(); private: struct IndexRecord; class IndexRecordList; uint32_t* hash_table_ = nullptr; int hash_table_size_; char* sub_index_ = nullptr; Options options_; const EnvOptions& soptions_; Status status_; unique_ptr file_; Slice file_data_; uint32_t version_; uint32_t file_size_; const double hash_table_ratio_; const int bloom_bits_per_key_; DynamicBloom* bloom_; TableProperties table_properties_; const uint32_t data_start_offset_; const uint32_t data_end_offset_; const size_t user_key_len_; static const size_t kNumInternalBytes = 8; static const uint32_t kSubIndexMask = 0x80000000; static const size_t kOffsetLen = sizeof(uint32_t); bool IsFixedLength() { return user_key_len_ != PlainTableFactory::kVariableLength; } size_t GetFixedInternalKeyLength() { return user_key_len_ + kNumInternalBytes; } friend class TableCache; friend class PlainTableIterator; // Internal helper function to generate an IndexRecordList object from all // the rows, which contains index records as a list. 
int PopulateIndexRecordList(IndexRecordList& record_list); // Internal helper function to allocate memory for indexes and bloom filters void Allocate(int num_prefixes); // Internal helper function to bucket index record list to hash buckets. // hash2offsets is sized of of hash_table_size_, each contains a linked list // of offsets for the hash, in reversed order. // bucket_count is sized of hash_table_size_. The value is how many index // records are there in hash2offsets for the same bucket. size_t BucketizeIndexesAndFillBloom( IndexRecordList& record_list, int num_prefixes, std::vector& hash2offsets, std::vector& bucket_count); // Internal helper class to fill the indexes and bloom filters to internal // data structures. hash2offsets and bucket_count are bucketized indexes and // counts generated by BucketizeIndexesAndFillBloom(). void FillIndexes(size_t sub_index_size_needed, std::vector& hash2offsets, std::vector& bucket_count); // Populate the internal indexes. It must be called before // any query to the table. // This query will populate the hash table hash_table_, the second // level of indexes sub_index_ and bloom filter filter_slice_ if enabled. Status PopulateIndex(); // Check bloom filter to see whether it might contain this prefix. // The hash of the prefix is given, since it can be reused for index lookup // too. bool MayHavePrefix(uint32_t hash); Status ReadKey(const char* row_ptr, Slice* key, size_t& bytes_read); // Read the key and value at offset to key and value. // tmp_slice is a tmp slice. // return next_offset as the offset for the next key. Status Next(uint32_t offset, Slice* key, Slice* value, uint32_t& next_offset); // Get file offset for key target. // return value prefix_matched is set to true if the offset is confirmed // for a key with the same prefix as target. 
Status GetOffset(const Slice& target, const Slice& prefix, uint32_t prefix_hash, bool& prefix_matched, uint32_t& ret_offset); Slice GetPrefix(const Slice& target) { assert(target.size() >= 8); // target is internal key return options_.prefix_extractor->Transform( Slice(target.data(), target.size() - 8)); } // No copying allowed explicit PlainTableReader(const TableReader&) = delete; void operator=(const TableReader&) = delete; }; // Iterator to iterate IndexedTable class PlainTableIterator: public Iterator { public: explicit PlainTableIterator(PlainTableReader* table); ~PlainTableIterator(); bool Valid() const; void SeekToFirst(); void SeekToLast(); void Seek(const Slice& target); void Next(); void Prev(); Slice key() const; Slice value() const; Status status() const; private: PlainTableReader* table_; uint32_t offset_; uint32_t next_offset_; Slice key_; Slice value_; Status status_; // No copying allowed PlainTableIterator(const PlainTableIterator&) = delete; void operator=(const Iterator&) = delete; }; } // namespace rocksdb