fork of https://github.com/oxigraph/rocksdb and https://github.com/facebook/rocksdb for nextgraph and oxigraph
You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
201 lines
8.0 KiB
201 lines
8.0 KiB
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
#pragma once
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
#include <array>
|
|
#include "db/dbformat.h"
|
|
#include "rocksdb/slice.h"
|
|
#include "table/plain/plain_table_reader.h"
|
|
|
|
// The file contains three helper classes of PlainTable format,
|
|
// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader.
|
|
// These classes issue the lowest level of operations of PlainTable.
|
|
// Actual data format of the key is documented in comments of class
|
|
// PlainTableFactory.
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// Forward declarations: the classes below only use these types through
// pointers or as parameter types, so their full definitions are not needed
// at this point in the header.
class WritableFile;
struct ParsedInternalKey;
struct PlainTableReaderFileInfo;
// Opaque declaration; the underlying type is fixed to unsigned char, which
// makes the enum a complete type here even though its enumerators are
// defined elsewhere.
enum PlainTableEntryType : unsigned char;
|
|
|
|
// Helper class for PlainTable format to write out a key to an output file
|
|
// The class is used in PlainTableBuilder.
|
|
class PlainTableKeyEncoder {
|
|
public:
|
|
explicit PlainTableKeyEncoder(EncodingType encoding_type,
|
|
uint32_t user_key_len,
|
|
const SliceTransform* prefix_extractor,
|
|
size_t index_sparseness)
|
|
: encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
|
|
fixed_user_key_len_(user_key_len),
|
|
prefix_extractor_(prefix_extractor),
|
|
index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
|
|
key_count_for_prefix_(0) {}
|
|
// key: the key to write out, in the format of internal key.
|
|
// file: the output file to write out
|
|
// offset: offset in the file. Needs to be updated after appending bytes
|
|
// for the key
|
|
// meta_bytes_buf: buffer for extra meta bytes
|
|
// meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
|
|
// if meta_bytes_buf is updated.
|
|
IOStatus AppendKey(const Slice& key, WritableFileWriter* file,
|
|
uint64_t* offset, char* meta_bytes_buf,
|
|
size_t* meta_bytes_buf_size);
|
|
|
|
// Return actual encoding type to be picked
|
|
EncodingType GetEncodingType() { return encoding_type_; }
|
|
|
|
private:
|
|
EncodingType encoding_type_;
|
|
uint32_t fixed_user_key_len_;
|
|
const SliceTransform* prefix_extractor_;
|
|
const size_t index_sparseness_;
|
|
size_t key_count_for_prefix_;
|
|
IterKey pre_prefix_;
|
|
};
|
|
|
|
// The class does raw file reads for PlainTableReader.
|
|
// It hides whether it is a mmap-read, or a non-mmap read.
|
|
// The class is implemented in a way to favor the performance of mmap case.
|
|
// The class is used by PlainTableReader.
|
|
class PlainTableFileReader {
|
|
public:
|
|
explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
|
|
: file_info_(_file_info), num_buf_(0) {}
|
|
|
|
~PlainTableFileReader() {
|
|
// Should fix.
|
|
status_.PermitUncheckedError();
|
|
}
|
|
|
|
// In mmaped mode, the results point to mmaped area of the file, which
|
|
// means it is always valid before closing the file.
|
|
// In non-mmap mode, the results point to an internal buffer. If the caller
|
|
// makes another read call, the results may not be valid. So callers should
|
|
// make a copy when needed.
|
|
// In order to save read calls to files, we keep two internal buffers:
|
|
// the first read and the most recent read. This is efficient because it
|
|
// columns these two common use cases:
|
|
// (1) hash index only identify one location, we read the key to verify
|
|
// the location, and read key and value if it is the right location.
|
|
// (2) after hash index checking, we identify two locations (because of
|
|
// hash bucket conflicts), we binary search the two location to see
|
|
// which one is what we need and start to read from the location.
|
|
// These two most common use cases will be covered by the two buffers
|
|
// so that we don't need to re-read the same location.
|
|
// Currently we keep a fixed size buffer. If a read doesn't exactly fit
|
|
// the buffer, we replace the second buffer with the location user reads.
|
|
//
|
|
// If return false, status code is stored in status_.
|
|
bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
|
|
if (file_info_->is_mmap_mode) {
|
|
assert(file_offset + len <= file_info_->data_end_offset);
|
|
*out = Slice(file_info_->file_data.data() + file_offset, len);
|
|
return true;
|
|
} else {
|
|
return ReadNonMmap(file_offset, len, out);
|
|
}
|
|
}
|
|
|
|
// If return false, status code is stored in status_.
|
|
bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
|
|
|
|
// *bytes_read = 0 means eof. false means failure and status is saved
|
|
// in status_. Not directly returning Status to save copying status
|
|
// object to map previous performance of mmap mode.
|
|
inline bool ReadVarint32(uint32_t offset, uint32_t* output,
|
|
uint32_t* bytes_read);
|
|
|
|
bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
|
|
uint32_t* bytes_read);
|
|
|
|
Status status() const { return status_; }
|
|
|
|
const PlainTableReaderFileInfo* file_info() { return file_info_; }
|
|
|
|
private:
|
|
const PlainTableReaderFileInfo* file_info_;
|
|
|
|
struct Buffer {
|
|
Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
|
|
std::unique_ptr<char[]> buf;
|
|
uint32_t buf_start_offset;
|
|
uint32_t buf_len;
|
|
uint32_t buf_capacity;
|
|
};
|
|
|
|
// Keep buffers for two recent reads.
|
|
std::array<std::unique_ptr<Buffer>, 2> buffers_;
|
|
uint32_t num_buf_;
|
|
Status status_;
|
|
|
|
Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
|
|
};
|
|
|
|
// A helper class to decode keys from input buffer
|
|
// The class is used by PlainTableReader.
|
|
class PlainTableKeyDecoder {
|
|
public:
|
|
explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
|
|
EncodingType encoding_type,
|
|
uint32_t user_key_len,
|
|
const SliceTransform* prefix_extractor)
|
|
: file_reader_(file_info),
|
|
encoding_type_(encoding_type),
|
|
prefix_len_(0),
|
|
fixed_user_key_len_(user_key_len),
|
|
prefix_extractor_(prefix_extractor),
|
|
in_prefix_(false) {}
|
|
|
|
// Find the next key.
|
|
// start: char array where the key starts.
|
|
// limit: boundary of the char array
|
|
// parsed_key: the output of the result key
|
|
// internal_key: if not null, fill with the output of the result key in
|
|
// un-parsed format
|
|
// bytes_read: how many bytes read from start. Output
|
|
// seekable: whether key can be read from this place. Used when building
|
|
// indexes. Output.
|
|
Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
|
|
Slice* internal_key, Slice* value, uint32_t* bytes_read,
|
|
bool* seekable = nullptr);
|
|
|
|
Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
|
|
Slice* internal_key, uint32_t* bytes_read,
|
|
bool* seekable = nullptr);
|
|
|
|
PlainTableFileReader file_reader_;
|
|
EncodingType encoding_type_;
|
|
uint32_t prefix_len_;
|
|
uint32_t fixed_user_key_len_;
|
|
Slice saved_user_key_;
|
|
IterKey cur_key_;
|
|
const SliceTransform* prefix_extractor_;
|
|
bool in_prefix_;
|
|
|
|
private:
|
|
Status NextPlainEncodingKey(uint32_t start_offset,
|
|
ParsedInternalKey* parsed_key,
|
|
Slice* internal_key, uint32_t* bytes_read,
|
|
bool* seekable = nullptr);
|
|
Status NextPrefixEncodingKey(uint32_t start_offset,
|
|
ParsedInternalKey* parsed_key,
|
|
Slice* internal_key, uint32_t* bytes_read,
|
|
bool* seekable = nullptr);
|
|
Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
|
|
ParsedInternalKey* parsed_key, uint32_t* bytes_read,
|
|
bool* internal_key_valid, Slice* internal_key);
|
|
inline Status DecodeSize(uint32_t start_offset,
|
|
PlainTableEntryType* entry_type, uint32_t* key_size,
|
|
uint32_t* bytes_read);
|
|
};
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|