// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. // This source code is licensed under both the GPLv2 (found in the // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). #pragma once #ifndef ROCKSDB_LITE #include #include "rocksdb/slice.h" #include "db/dbformat.h" #include "table/plain/plain_table_reader.h" // The file contains three helper classes of PlainTable format, // PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader. // These classes issue the lowest level of operations of PlainTable. // Actual data format of the key is documented in comments of class // PlainTableFactory. namespace rocksdb { class WritableFile; struct ParsedInternalKey; struct PlainTableReaderFileInfo; enum PlainTableEntryType : unsigned char; // Helper class for PlainTable format to write out a key to an output file // The class is used in PlainTableBuilder. class PlainTableKeyEncoder { public: explicit PlainTableKeyEncoder(EncodingType encoding_type, uint32_t user_key_len, const SliceTransform* prefix_extractor, size_t index_sparseness) : encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain), fixed_user_key_len_(user_key_len), prefix_extractor_(prefix_extractor), index_sparseness_((index_sparseness > 1) ? index_sparseness : 1), key_count_for_prefix_(0) {} // key: the key to write out, in the format of internal key. // file: the output file to write out // offset: offset in the file. Needs to be updated after appending bytes // for the key // meta_bytes_buf: buffer for extra meta bytes // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated // if meta_bytes_buf is updated. Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset, char* meta_bytes_buf, size_t* meta_bytes_buf_size); // Return actual encoding type to be picked EncodingType GetEncodingType() { return encoding_type_; } private: EncodingType encoding_type_; uint32_t fixed_user_key_len_; const SliceTransform* prefix_extractor_; const size_t index_sparseness_; size_t key_count_for_prefix_; IterKey pre_prefix_; }; // The class does raw file reads for PlainTableReader. // It hides whether it is a mmap-read, or a non-mmap read. // The class is implemented in a way to favor the performance of mmap case. // The class is used by PlainTableReader. class PlainTableFileReader { public: explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info) : file_info_(_file_info), num_buf_(0) {} // In mmaped mode, the results point to mmaped area of the file, which // means it is always valid before closing the file. // In non-mmap mode, the results point to an internal buffer. If the caller // makes another read call, the results may not be valid. So callers should // make a copy when needed. // In order to save read calls to files, we keep two internal buffers: // the first read and the most recent read. This is efficient because it // columns these two common use cases: // (1) hash index only identify one location, we read the key to verify // the location, and read key and value if it is the right location. // (2) after hash index checking, we identify two locations (because of // hash bucket conflicts), we binary search the two location to see // which one is what we need and start to read from the location. // These two most common use cases will be covered by the two buffers // so that we don't need to re-read the same location. // Currently we keep a fixed size buffer. If a read doesn't exactly fit // the buffer, we replace the second buffer with the location user reads. // // If return false, status code is stored in status_. bool Read(uint32_t file_offset, uint32_t len, Slice* out) { if (file_info_->is_mmap_mode) { assert(file_offset + len <= file_info_->data_end_offset); *out = Slice(file_info_->file_data.data() + file_offset, len); return true; } else { return ReadNonMmap(file_offset, len, out); } } // If return false, status code is stored in status_. bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output); // *bytes_read = 0 means eof. false means failure and status is saved // in status_. Not directly returning Status to save copying status // object to map previous performance of mmap mode. inline bool ReadVarint32(uint32_t offset, uint32_t* output, uint32_t* bytes_read); bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output, uint32_t* bytes_read); Status status() const { return status_; } const PlainTableReaderFileInfo* file_info() { return file_info_; } private: const PlainTableReaderFileInfo* file_info_; struct Buffer { Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {} std::unique_ptr buf; uint32_t buf_start_offset; uint32_t buf_len; uint32_t buf_capacity; }; // Keep buffers for two recent reads. std::array, 2> buffers_; uint32_t num_buf_; Status status_; Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len); }; // A helper class to decode keys from input buffer // The class is used by PlainTableBuilder. class PlainTableKeyDecoder { public: explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info, EncodingType encoding_type, uint32_t user_key_len, const SliceTransform* prefix_extractor) : file_reader_(file_info), encoding_type_(encoding_type), prefix_len_(0), fixed_user_key_len_(user_key_len), prefix_extractor_(prefix_extractor), in_prefix_(false) {} // Find the next key. // start: char array where the key starts. // limit: boundary of the char array // parsed_key: the output of the result key // internal_key: if not null, fill with the output of the result key in // un-parsed format // bytes_read: how many bytes read from start. Output // seekable: whether key can be read from this place. Used when building // indexes. Output. Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key, Slice* value, uint32_t* bytes_read, bool* seekable = nullptr); Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key, uint32_t* bytes_read, bool* seekable = nullptr); PlainTableFileReader file_reader_; EncodingType encoding_type_; uint32_t prefix_len_; uint32_t fixed_user_key_len_; Slice saved_user_key_; IterKey cur_key_; const SliceTransform* prefix_extractor_; bool in_prefix_; private: Status NextPlainEncodingKey(uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key, uint32_t* bytes_read, bool* seekable = nullptr); Status NextPrefixEncodingKey(uint32_t start_offset, ParsedInternalKey* parsed_key, Slice* internal_key, uint32_t* bytes_read, bool* seekable = nullptr); Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size, ParsedInternalKey* parsed_key, uint32_t* bytes_read, bool* internal_key_valid, Slice* internal_key); inline Status DecodeSize(uint32_t start_offset, PlainTableEntryType* entry_type, uint32_t* key_size, uint32_t* bytes_read); }; } // namespace rocksdb #endif // ROCKSDB_LITE