|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
|
|
|
|
#include <array>
|
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "rocksdb/slice.h"
|
|
|
|
#include "table/plain/plain_table_reader.h"
|
|
|
|
|
|
|
|
// This file contains three helper classes for the PlainTable format:
// PlainTableKeyEncoder, PlainTableKeyDecoder and PlainTableFileReader.
// These classes implement the lowest-level operations of PlainTable.
// The actual data format of the key is documented in the comments of class
// PlainTableFactory.
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
class WritableFile;
|
|
|
|
struct ParsedInternalKey;
|
|
|
|
struct PlainTableReaderFileInfo;
|
|
|
|
enum PlainTableEntryType : unsigned char;
|
|
|
|
|
|
|
|
// Helper class for PlainTable format to write out a key to an output file
|
|
|
|
// The class is used in PlainTableBuilder.
|
|
|
|
class PlainTableKeyEncoder {
|
|
|
|
public:
|
|
|
|
explicit PlainTableKeyEncoder(EncodingType encoding_type,
|
|
|
|
uint32_t user_key_len,
|
|
|
|
const SliceTransform* prefix_extractor,
|
|
|
|
size_t index_sparseness)
|
|
|
|
: encoding_type_((prefix_extractor != nullptr) ? encoding_type : kPlain),
|
|
|
|
fixed_user_key_len_(user_key_len),
|
|
|
|
prefix_extractor_(prefix_extractor),
|
|
|
|
index_sparseness_((index_sparseness > 1) ? index_sparseness : 1),
|
|
|
|
key_count_for_prefix_(0) {}
|
|
|
|
// key: the key to write out, in the format of internal key.
|
|
|
|
// file: the output file to write out
|
|
|
|
// offset: offset in the file. Needs to be updated after appending bytes
|
|
|
|
// for the key
|
|
|
|
// meta_bytes_buf: buffer for extra meta bytes
|
|
|
|
// meta_bytes_buf_size: offset to append extra meta bytes. Will be updated
|
|
|
|
// if meta_bytes_buf is updated.
|
Pass IOStatus to write path and set retryable IO Error as hard error in BG jobs (#6487)
Summary:
In the current code base, we use Status to get and store the returned status from the call. Specifically, for IO related functions, the current Status cannot reflect the IO Error details such as error scope, error retryable attribute, and others. With the implementation of https://github.com/facebook/rocksdb/issues/5761, we have the new Wrapper for IO, which returns IOStatus instead of Status. However, the IOStatus is purged at the lower level of write path and transferred to Status.
The first job of this PR is to pass the IOStatus to the write path (flush, WAL write, and Compaction). The second job is to identify the Retryable IO Error as HardError, and set the bg_error_ as HardError. In this case, the DB Instance becomes read only. User is informed of the Status and need to take actions to deal with it (e.g., call db->Resume()).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6487
Test Plan: Added the testing case to error_handler_fs_test. Pass make asan_check
Reviewed By: anand1976
Differential Revision: D20685017
Pulled By: zhichao-cao
fbshipit-source-id: ff85f042896243abcd6ef37877834e26f36b6eb0
5 years ago
|
|
|
IOStatus AppendKey(const Slice& key, WritableFileWriter* file,
|
|
|
|
uint64_t* offset, char* meta_bytes_buf,
|
|
|
|
size_t* meta_bytes_buf_size);
|
|
|
|
|
|
|
|
// Return actual encoding type to be picked
|
|
|
|
EncodingType GetEncodingType() { return encoding_type_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
EncodingType encoding_type_;
|
|
|
|
uint32_t fixed_user_key_len_;
|
|
|
|
const SliceTransform* prefix_extractor_;
|
|
|
|
const size_t index_sparseness_;
|
|
|
|
size_t key_count_for_prefix_;
|
|
|
|
IterKey pre_prefix_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// The class does raw file reads for PlainTableReader.
|
|
|
|
// It hides whether it is a mmap-read, or a non-mmap read.
|
|
|
|
// The class is implemented in a way to favor the performance of mmap case.
|
|
|
|
// The class is used by PlainTableReader.
|
plain table reader: non-mmap mode to keep two recent buffers
Summary: In plain table reader's non-mmap mode, we only keep the most recent read buffer. However, for binary search, it is likely we come back to a location to read. To avoid one pread in such a case, we keep two read buffers. It should cover most of the cases.
Test Plan:
1. run tests
2. check the optimization works through strace when running
./table_reader_bench -mmap_read=false --num_keys2=1 -num_keys1=5000 -table_factory=plain_table --iterator --through_db
Reviewers: anthony, rven, kradhakrishnan, igor, yhchiang, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D51171
9 years ago
|
|
|
class PlainTableFileReader {
|
|
|
|
public:
|
|
|
|
explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
|
|
|
|
: file_info_(_file_info), num_buf_(0) {}
|
|
|
|
|
|
|
|
~PlainTableFileReader() {
|
|
|
|
// Should fix.
|
|
|
|
status_.PermitUncheckedError();
|
|
|
|
}
|
|
|
|
|
plain table reader: non-mmap mode to keep two recent buffers
Summary: In plain table reader's non-mmap mode, we only keep the most recent read buffer. However, for binary search, it is likely we come back to a location to read. To avoid one pread in such a case, we keep two read buffers. It should cover most of the cases.
Test Plan:
1. run tests
2. check the optimization works through strace when running
./table_reader_bench -mmap_read=false --num_keys2=1 -num_keys1=5000 -table_factory=plain_table --iterator --through_db
Reviewers: anthony, rven, kradhakrishnan, igor, yhchiang, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D51171
9 years ago
|
|
|
// In mmaped mode, the results point to mmaped area of the file, which
|
|
|
|
// means it is always valid before closing the file.
|
|
|
|
// In non-mmap mode, the results point to an internal buffer. If the caller
|
|
|
|
// makes another read call, the results may not be valid. So callers should
|
|
|
|
// make a copy when needed.
|
|
|
|
// In order to save read calls to files, we keep two internal buffers:
|
|
|
|
// the first read and the most recent read. This is efficient because it
|
|
|
|
// columns these two common use cases:
|
|
|
|
// (1) hash index only identify one location, we read the key to verify
|
|
|
|
// the location, and read key and value if it is the right location.
|
|
|
|
// (2) after hash index checking, we identify two locations (because of
|
|
|
|
// hash bucket conflicts), we binary search the two location to see
|
|
|
|
// which one is what we need and start to read from the location.
|
|
|
|
// These two most common use cases will be covered by the two buffers
|
|
|
|
// so that we don't need to re-read the same location.
|
|
|
|
// Currently we keep a fixed size buffer. If a read doesn't exactly fit
|
|
|
|
// the buffer, we replace the second buffer with the location user reads.
|
|
|
|
//
|
|
|
|
// If return false, status code is stored in status_.
|
|
|
|
bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
|
|
|
|
if (file_info_->is_mmap_mode) {
|
|
|
|
assert(file_offset + len <= file_info_->data_end_offset);
|
|
|
|
*out = Slice(file_info_->file_data.data() + file_offset, len);
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
return ReadNonMmap(file_offset, len, out);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If return false, status code is stored in status_.
|
|
|
|
bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
|
|
|
|
|
|
|
|
// *bytes_read = 0 means eof. false means failure and status is saved
|
|
|
|
// in status_. Not directly returning Status to save copying status
|
|
|
|
// object to map previous performance of mmap mode.
|
|
|
|
inline bool ReadVarint32(uint32_t offset, uint32_t* output,
|
|
|
|
uint32_t* bytes_read);
|
|
|
|
|
|
|
|
bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
|
|
|
|
uint32_t* bytes_read);
|
|
|
|
|
|
|
|
Status status() const { return status_; }
|
|
|
|
|
|
|
|
const PlainTableReaderFileInfo* file_info() { return file_info_; }
|
|
|
|
|
|
|
|
private:
|
|
|
|
const PlainTableReaderFileInfo* file_info_;
|
|
|
|
|
|
|
|
struct Buffer {
|
|
|
|
Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
|
|
|
|
std::unique_ptr<char[]> buf;
|
|
|
|
uint32_t buf_start_offset;
|
|
|
|
uint32_t buf_len;
|
|
|
|
uint32_t buf_capacity;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Keep buffers for two recent reads.
|
|
|
|
std::array<std::unique_ptr<Buffer>, 2> buffers_;
|
plain table reader: non-mmap mode to keep two recent buffers
Summary: In plain table reader's non-mmap mode, we only keep the most recent read buffer. However, for binary search, it is likely we come back to a location to read. To avoid one pread in such a case, we keep two read buffers. It should cover most of the cases.
Test Plan:
1. run tests
2. check the optimization works through strace when running
./table_reader_bench -mmap_read=false --num_keys2=1 -num_keys1=5000 -table_factory=plain_table --iterator --through_db
Reviewers: anthony, rven, kradhakrishnan, igor, yhchiang, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D51171
9 years ago
|
|
|
uint32_t num_buf_;
|
|
|
|
Status status_;
|
|
|
|
|
|
|
|
Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
|
|
|
|
};
|
|
|
|
|
|
|
|
// A helper class to decode keys from input buffer
|
|
|
|
// The class is used by PlainTableBuilder.
|
|
|
|
class PlainTableKeyDecoder {
|
|
|
|
public:
|
|
|
|
explicit PlainTableKeyDecoder(const PlainTableReaderFileInfo* file_info,
|
|
|
|
EncodingType encoding_type,
|
|
|
|
uint32_t user_key_len,
|
|
|
|
const SliceTransform* prefix_extractor)
|
|
|
|
: file_reader_(file_info),
|
|
|
|
encoding_type_(encoding_type),
|
|
|
|
prefix_len_(0),
|
|
|
|
fixed_user_key_len_(user_key_len),
|
|
|
|
prefix_extractor_(prefix_extractor),
|
|
|
|
in_prefix_(false) {}
|
|
|
|
|
|
|
|
// Find the next key.
|
|
|
|
// start: char array where the key starts.
|
|
|
|
// limit: boundary of the char array
|
|
|
|
// parsed_key: the output of the result key
|
|
|
|
// internal_key: if not null, fill with the output of the result key in
|
|
|
|
// un-parsed format
|
|
|
|
// bytes_read: how many bytes read from start. Output
|
|
|
|
// seekable: whether key can be read from this place. Used when building
|
|
|
|
// indexes. Output.
|
|
|
|
Status NextKey(uint32_t start_offset, ParsedInternalKey* parsed_key,
|
|
|
|
Slice* internal_key, Slice* value, uint32_t* bytes_read,
|
|
|
|
bool* seekable = nullptr);
|
|
|
|
|
|
|
|
Status NextKeyNoValue(uint32_t start_offset, ParsedInternalKey* parsed_key,
|
|
|
|
Slice* internal_key, uint32_t* bytes_read,
|
|
|
|
bool* seekable = nullptr);
|
|
|
|
|
plain table reader: non-mmap mode to keep two recent buffers
Summary: In plain table reader's non-mmap mode, we only keep the most recent read buffer. However, for binary search, it is likely we come back to a location to read. To avoid one pread in such a case, we keep two read buffers. It should cover most of the cases.
Test Plan:
1. run tests
2. check the optimization works through strace when running
./table_reader_bench -mmap_read=false --num_keys2=1 -num_keys1=5000 -table_factory=plain_table --iterator --through_db
Reviewers: anthony, rven, kradhakrishnan, igor, yhchiang, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D51171
9 years ago
|
|
|
PlainTableFileReader file_reader_;
|
|
|
|
EncodingType encoding_type_;
|
|
|
|
uint32_t prefix_len_;
|
|
|
|
uint32_t fixed_user_key_len_;
|
|
|
|
Slice saved_user_key_;
|
|
|
|
IterKey cur_key_;
|
|
|
|
const SliceTransform* prefix_extractor_;
|
|
|
|
bool in_prefix_;
|
|
|
|
|
|
|
|
private:
|
|
|
|
Status NextPlainEncodingKey(uint32_t start_offset,
|
|
|
|
ParsedInternalKey* parsed_key,
|
|
|
|
Slice* internal_key, uint32_t* bytes_read,
|
|
|
|
bool* seekable = nullptr);
|
|
|
|
Status NextPrefixEncodingKey(uint32_t start_offset,
|
|
|
|
ParsedInternalKey* parsed_key,
|
|
|
|
Slice* internal_key, uint32_t* bytes_read,
|
|
|
|
bool* seekable = nullptr);
|
|
|
|
Status ReadInternalKey(uint32_t file_offset, uint32_t user_key_size,
|
|
|
|
ParsedInternalKey* parsed_key, uint32_t* bytes_read,
|
|
|
|
bool* internal_key_valid, Slice* internal_key);
|
|
|
|
inline Status DecodeSize(uint32_t start_offset,
|
|
|
|
PlainTableEntryType* entry_type, uint32_t* key_size,
|
|
|
|
uint32_t* bytes_read);
|
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
#endif // ROCKSDB_LITE
|