plain table reader: non-mmap mode to keep two recent buffers

Summary: In the plain table reader's non-mmap mode, we keep only the most recent read buffer. During a binary search, however, we are likely to come back to a location we have already read. To avoid an extra pread in that case, we now keep two read buffers, which should cover most cases.

Test Plan:
1. run tests
2. check the optimization works through strace when running
./table_reader_bench -mmap_read=false --num_keys2=1 -num_keys1=5000 -table_factory=plain_table --iterator --through_db

Reviewers: anthony, rven, kradhakrishnan, igor, yhchiang, IslamAbdelRahman

Reviewed By: IslamAbdelRahman

Subscribers: leveldb, dhruba

Differential Revision: https://reviews.facebook.net/D51171
main
sdong 9 years ago
parent 7ece10ecb6
commit 9a8e3f73ed
  1. 54
      db/plain_table_db_test.cc
  2. 93
      table/plain_table_key_coding.cc
  3. 107
      table/plain_table_key_coding.h
  4. 3
      table/table_reader_bench.cc
  5. 9
      util/testutil.h

@ -29,6 +29,7 @@
#include "table/bloom_block.h" #include "table/bloom_block.h"
#include "table/table_builder.h" #include "table/table_builder.h"
#include "table/plain_table_factory.h" #include "table/plain_table_factory.h"
#include "table/plain_table_key_coding.h"
#include "table/plain_table_reader.h" #include "table/plain_table_reader.h"
#include "util/hash.h" #include "util/hash.h"
#include "util/logging.h" #include "util/logging.h"
@ -41,6 +42,59 @@
using std::unique_ptr; using std::unique_ptr;
namespace rocksdb { namespace rocksdb {
class PlainTableKeyDecoderTest : public testing::Test {};
TEST_F(PlainTableKeyDecoderTest, ReadNonMmap) {
std::string tmp;
Random rnd(301);
const uint32_t kLength = 2222;
Slice contents = test::RandomString(&rnd, kLength, &tmp);
test::StringSource* string_source =
new test::StringSource(contents, 0, false);
unique_ptr<RandomAccessFileReader> file_reader(
test::GetRandomAccessFileReader(string_source));
unique_ptr<PlainTableReaderFileInfo> file_info(new PlainTableReaderFileInfo(
std::move(file_reader), EnvOptions(), kLength));
{
PlainTableFileReader reader(file_info.get());
const uint32_t kReadSize = 77;
for (uint32_t pos = 0; pos < kLength; pos += kReadSize) {
uint32_t read_size = std::min(kLength - pos, kReadSize);
Slice out;
ASSERT_TRUE(reader.Read(pos, read_size, &out));
ASSERT_EQ(0, out.compare(tmp.substr(pos, read_size)));
}
ASSERT_LT(string_source->total_reads(), kLength / kReadSize / 2);
}
std::vector<std::vector<std::pair<uint32_t, uint32_t>>> reads = {
{{600, 30}, {590, 30}, {600, 20}, {600, 40}},
{{800, 20}, {100, 20}, {500, 20}, {1500, 20}, {100, 20}, {80, 20}},
{{1000, 20}, {500, 20}, {1000, 50}},
{{1000, 20}, {500, 20}, {500, 20}},
{{1000, 20}, {500, 20}, {200, 20}, {500, 20}},
{{1000, 20}, {500, 20}, {200, 20}, {1000, 50}},
{{600, 500}, {610, 20}, {100, 20}},
{{500, 100}, {490, 100}, {550, 50}},
};
std::vector<int> num_file_reads = {2, 6, 2, 2, 4, 3, 2, 2};
for (size_t i = 0; i < reads.size(); i++) {
string_source->set_total_reads(0);
PlainTableFileReader reader(file_info.get());
for (auto p : reads[i]) {
Slice out;
ASSERT_TRUE(reader.Read(p.first, p.second, &out));
ASSERT_EQ(0, out.compare(tmp.substr(p.first, p.second)));
}
ASSERT_EQ(num_file_reads[i], string_source->total_reads());
}
}
class PlainTableDBTest : public testing::Test, class PlainTableDBTest : public testing::Test,
public testing::WithParamInterface<bool> { public testing::WithParamInterface<bool> {

@ -164,47 +164,62 @@ Status PlainTableKeyEncoder::AppendKey(const Slice& key,
return Status::OK(); return Status::OK();
} }
inline bool PlainTableKeyDecoder::FileReader::Read(uint32_t file_offset, Slice PlainTableFileReader::GetFromBuffer(Buffer* buffer, uint32_t file_offset,
uint32_t len, Slice* out) { uint32_t len) {
if (file_info_->is_mmap_mode) { assert(file_offset + len <= file_info_->data_end_offset);
assert(file_offset + len <= file_info_->data_end_offset); return Slice(buffer->buf.get() + (file_offset - buffer->buf_start_offset),
*out = Slice(file_info_->file_data.data() + file_offset, len); len);
return true;
} else {
return ReadNonMmap(file_offset, len, out);
}
} }
bool PlainTableKeyDecoder::FileReader::ReadNonMmap(uint32_t file_offset, bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len,
uint32_t len, Slice* out) { Slice* out) {
const uint32_t kPrefetchSize = 256u; const uint32_t kPrefetchSize = 256u;
if (file_offset < buf_start_offset_ ||
file_offset + len > buf_start_offset_ + buf_len_) { // Try to read from buffers.
// Load buffer for (uint32_t i = 0; i < num_buf_; i++) {
assert(file_offset + len <= file_info_->data_end_offset); Buffer* buffer = buffers_[num_buf_ - 1 - i].get();
uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset, if (file_offset >= buffer->buf_start_offset &&
std::max(kPrefetchSize, len)); file_offset + len <= buffer->buf_start_offset + buffer->buf_len) {
if (size_to_read > buf_capacity_) { *out = GetFromBuffer(buffer, file_offset, len);
buf_.reset(new char[size_to_read]); return true;
buf_capacity_ = size_to_read;
buf_len_ = 0;
}
Slice read_result;
Status s = file_info_->file->Read(file_offset, size_to_read, &read_result,
buf_.get());
if (!s.ok()) {
status_ = s;
return false;
} }
buf_start_offset_ = file_offset;
buf_len_ = size_to_read;
} }
*out = Slice(buf_.get() + (file_offset - buf_start_offset_), len);
Buffer* new_buffer;
// Data needed is not in any of the buffer. Allocate a new buffer.
if (num_buf_ < buffers_.size()) {
// Add a new buffer
new_buffer = new Buffer();
buffers_[num_buf_++].reset(new_buffer);
} else {
// Now simply replace the last buffer. Can improve the placement policy
// if needed.
new_buffer = buffers_[num_buf_ - 1].get();
}
assert(file_offset + len <= file_info_->data_end_offset);
uint32_t size_to_read = std::min(file_info_->data_end_offset - file_offset,
std::max(kPrefetchSize, len));
if (size_to_read > new_buffer->buf_capacity) {
new_buffer->buf.reset(new char[size_to_read]);
new_buffer->buf_capacity = size_to_read;
new_buffer->buf_len = 0;
}
Slice read_result;
Status s = file_info_->file->Read(file_offset, size_to_read, &read_result,
new_buffer->buf.get());
if (!s.ok()) {
status_ = s;
return false;
}
new_buffer->buf_start_offset = file_offset;
new_buffer->buf_len = size_to_read;
*out = GetFromBuffer(new_buffer, file_offset, len);
return true; return true;
} }
inline bool PlainTableKeyDecoder::FileReader::ReadVarint32( inline bool PlainTableFileReader::ReadVarint32(uint32_t offset, uint32_t* out,
uint32_t offset, uint32_t* out, uint32_t* bytes_read) { uint32_t* bytes_read) {
if (file_info_->is_mmap_mode) { if (file_info_->is_mmap_mode) {
const char* start = file_info_->file_data.data() + offset; const char* start = file_info_->file_data.data() + offset;
const char* limit = const char* limit =
@ -218,8 +233,8 @@ inline bool PlainTableKeyDecoder::FileReader::ReadVarint32(
} }
} }
bool PlainTableKeyDecoder::FileReader::ReadVarint32NonMmap( bool PlainTableFileReader::ReadVarint32NonMmap(uint32_t offset, uint32_t* out,
uint32_t offset, uint32_t* out, uint32_t* bytes_read) { uint32_t* bytes_read) {
const char* start; const char* start;
const char* limit; const char* limit;
const uint32_t kMaxVarInt32Size = 6u; const uint32_t kMaxVarInt32Size = 6u;
@ -298,7 +313,7 @@ Status PlainTableKeyDecoder::NextPlainEncodingKey(uint32_t start_offset,
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
if (!file_reader_.file_info_->is_mmap_mode) { if (!file_reader_.file_info()->is_mmap_mode) {
cur_key_.SetInternalKey(*parsed_key); cur_key_.SetInternalKey(*parsed_key);
parsed_key->user_key = Slice(cur_key_.GetKey().data(), user_key_size); parsed_key->user_key = Slice(cur_key_.GetKey().data(), user_key_size);
if (internal_key != nullptr) { if (internal_key != nullptr) {
@ -348,14 +363,14 @@ Status PlainTableKeyDecoder::NextPrefixEncodingKey(
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
if (!file_reader_.file_info_->is_mmap_mode || if (!file_reader_.file_info()->is_mmap_mode ||
(internal_key != nullptr && !decoded_internal_key_valid)) { (internal_key != nullptr && !decoded_internal_key_valid)) {
// In non-mmap mode, always need to make a copy of keys returned to // In non-mmap mode, always need to make a copy of keys returned to
// users, because after reading value for the key, the key might // users, because after reading value for the key, the key might
// be invalid. // be invalid.
cur_key_.SetInternalKey(*parsed_key); cur_key_.SetInternalKey(*parsed_key);
saved_user_key_ = cur_key_.GetKey(); saved_user_key_ = cur_key_.GetKey();
if (!file_reader_.file_info_->is_mmap_mode) { if (!file_reader_.file_info()->is_mmap_mode) {
parsed_key->user_key = Slice(cur_key_.GetKey().data(), size); parsed_key->user_key = Slice(cur_key_.GetKey().data(), size);
} }
if (internal_key != nullptr) { if (internal_key != nullptr) {
@ -394,7 +409,7 @@ Status PlainTableKeyDecoder::NextPrefixEncodingKey(
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
if (!file_reader_.file_info_->is_mmap_mode) { if (!file_reader_.file_info()->is_mmap_mode) {
// In non-mmap mode, we need to make a copy of keys returned to // In non-mmap mode, we need to make a copy of keys returned to
// users, because after reading value for the key, the key might // users, because after reading value for the key, the key might
// be invalid. // be invalid.

@ -8,6 +8,7 @@
#include "rocksdb/slice.h" #include "rocksdb/slice.h"
#include "db/dbformat.h" #include "db/dbformat.h"
#include "table/plain_table_reader.h"
namespace rocksdb { namespace rocksdb {
@ -51,6 +52,74 @@ class PlainTableKeyEncoder {
IterKey pre_prefix_; IterKey pre_prefix_;
}; };
// Reads byte ranges and varint32 values from a plain table file described by
// a PlainTableReaderFileInfo, in either mmap or non-mmap mode.
class PlainTableFileReader {
 public:
  // `_file_info` is stored as a raw pointer and is not owned by this object.
  explicit PlainTableFileReader(const PlainTableReaderFileInfo* _file_info)
      : file_info_(_file_info), num_buf_(0) {}

  // In mmaped mode, the results point to mmaped area of the file, which
  // means it is always valid before closing the file.
  // In non-mmap mode, the results point to an internal buffer. If the caller
  // makes another read call, the results may not be valid. So callers should
  // make a copy when needed.
  // In order to save read calls to files, we keep two internal buffers:
  // the first read and the most recent read. This is efficient because it
  // covers these two common use cases:
  // (1) hash index only identifies one location, we read the key to verify
  //     the location, and read key and value if it is the right location.
  // (2) after hash index checking, we identify two locations (because of
  //     hash bucket conflicts), we binary search the two locations to see
  //     which one is what we need and start to read from the location.
  // These two most common use cases will be covered by the two buffers
  // so that we don't need to re-read the same location.
  // If a read is not fully contained in either buffer, the second buffer is
  // replaced with the location the user reads.
  //
  // If return false, status code is stored in status_.
  bool Read(uint32_t file_offset, uint32_t len, Slice* out) {
    if (file_info_->is_mmap_mode) {
      // Widen before adding so the bounds check cannot wrap around in
      // 32-bit arithmetic and pass spuriously.
      assert(static_cast<uint64_t>(file_offset) + len <=
             file_info_->data_end_offset);
      *out = Slice(file_info_->file_data.data() + file_offset, len);
      return true;
    } else {
      return ReadNonMmap(file_offset, len, out);
    }
  }

  // If return false, status code is stored in status_.
  bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);

  // *bytes_read = 0 means eof. false means failure and status is saved
  // in status_. Not directly returning Status to save copying status
  // object to match previous performance of mmap mode.
  inline bool ReadVarint32(uint32_t offset, uint32_t* output,
                           uint32_t* bytes_read);

  bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
                           uint32_t* bytes_read);

  // Returns a copy of the stored status.
  Status status() const { return status_; }

  // Returns the file info pointer passed at construction.
  const PlainTableReaderFileInfo* file_info() const { return file_info_; }

 private:
  const PlainTableReaderFileInfo* file_info_;

  // One cached read: `buf` holds `buf_len` valid bytes that start at file
  // offset `buf_start_offset`; `buf_capacity` is the allocated size of `buf`.
  struct Buffer {
    Buffer() : buf_start_offset(0), buf_len(0), buf_capacity(0) {}
    std::unique_ptr<char[]> buf;
    uint32_t buf_start_offset;
    uint32_t buf_len;
    uint32_t buf_capacity;
  };

  // Keep buffers for two recent reads.
  std::array<std::unique_ptr<Buffer>, 2> buffers_;
  // Number of entries of buffers_ currently in use.
  uint32_t num_buf_;
  Status status_;

  // Returns a Slice of `len` bytes at `file_offset` pointing into `buf`.
  Slice GetFromBuffer(Buffer* buf, uint32_t file_offset, uint32_t len);
};
// A helper class to decode keys from input buffer // A helper class to decode keys from input buffer
// Actual data format of the key is documented in plain_table_factory.h // Actual data format of the key is documented in plain_table_factory.h
class PlainTableKeyDecoder { class PlainTableKeyDecoder {
@ -82,43 +151,7 @@ class PlainTableKeyDecoder {
Slice* internal_key, uint32_t* bytes_read, Slice* internal_key, uint32_t* bytes_read,
bool* seekable = nullptr); bool* seekable = nullptr);
class FileReader { PlainTableFileReader file_reader_;
public:
explicit FileReader(const PlainTableReaderFileInfo* file_info)
: file_info_(file_info),
buf_start_offset_(0),
buf_len_(0),
buf_capacity_(0) {}
// In mmaped mode, the results point to mmaped area of the file, which
// means it is always valid before closing the file.
// In non-mmap mode, the results point to an internal buffer. If the caller
// makes another read call, the results will not be valid. So callers should
// make a copy when needed.
// If return false, status code is stored in status_.
inline bool Read(uint32_t file_offset, uint32_t len, Slice* output);
// If return false, status code is stored in status_.
bool ReadNonMmap(uint32_t file_offset, uint32_t len, Slice* output);
// *bytes_read = 0 means eof. false means failure and status is saved
// in status_. Not directly returning Status to save copying status
// object to map previous performance of mmap mode.
inline bool ReadVarint32(uint32_t offset, uint32_t* output,
uint32_t* bytes_read);
bool ReadVarint32NonMmap(uint32_t offset, uint32_t* output,
uint32_t* bytes_read);
Status status() const { return status_; }
const PlainTableReaderFileInfo* file_info_;
std::unique_ptr<char[]> buf_;
uint32_t buf_start_offset_;
uint32_t buf_len_;
uint32_t buf_capacity_;
Status status_;
};
FileReader file_reader_;
EncodingType encoding_type_; EncodingType encoding_type_;
uint32_t prefix_len_; uint32_t prefix_len_;
uint32_t fixed_user_key_len_; uint32_t fixed_user_key_len_;

@ -204,7 +204,8 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options,
} }
// verify key; // verify key;
total_time += Now(env, measured_by_nanosecond) - start_time; total_time += Now(env, measured_by_nanosecond) - start_time;
assert(Slice(MakeKey(r1, r2 + count, through_db)) == iter->key()); assert(Slice(MakeKey(r1, r2 + count, through_db)) ==
(through_db ? iter->key() : iiter->key()));
start_time = Now(env, measured_by_nanosecond); start_time = Now(env, measured_by_nanosecond);
if (++count >= r2_len) { if (++count >= r2_len) {
break; break;

@ -237,7 +237,8 @@ class StringSource: public RandomAccessFile {
bool mmap = false) bool mmap = false)
: contents_(contents.data(), contents.size()), : contents_(contents.data(), contents.size()),
uniq_id_(uniq_id), uniq_id_(uniq_id),
mmap_(mmap) {} mmap_(mmap),
total_reads_(0) {}
virtual ~StringSource() { } virtual ~StringSource() { }
@ -245,6 +246,7 @@ class StringSource: public RandomAccessFile {
virtual Status Read(uint64_t offset, size_t n, Slice* result, virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override { char* scratch) const override {
total_reads_++;
if (offset > contents_.size()) { if (offset > contents_.size()) {
return Status::InvalidArgument("invalid Read offset"); return Status::InvalidArgument("invalid Read offset");
} }
@ -271,10 +273,15 @@ class StringSource: public RandomAccessFile {
return static_cast<size_t>(rid-id); return static_cast<size_t>(rid-id);
} }
int total_reads() const { return total_reads_; }
void set_total_reads(int tr) { total_reads_ = tr; }
private: private:
std::string contents_; std::string contents_;
uint64_t uniq_id_; uint64_t uniq_id_;
bool mmap_; bool mmap_;
mutable int total_reads_;
}; };
class NullLogger : public Logger { class NullLogger : public Logger {

Loading…
Cancel
Save