Introduced a new flag non_blocking_io in ReadOptions.

Summary:
If ReadOptions.non_blocking_io is set to true, then KeyMayExists
and Iterators will return data that is cached in RAM.
If the Iterator needs to do IO from storage to serve the data,
then the Iterator.status() will return Status::IsRetry().

Test Plan:
Enhanced unit test DBTest.KeyMayExist to detect if there were are IOs
issues from storage. Added DBTest.NonBlockingIteration to verify
nonblocking Iterations.

Reviewers: emayanke, haobo

Reviewed By: haobo

CC: leveldb

Maniphest Tasks: T63

Differential Revision: https://reviews.facebook.net/D12531
main
Dhruba Borthakur 11 years ago
parent 43eef52001
commit fc0c399d2e
  1. 1
      .gitignore
  2. 7
      db/db_impl.cc
  3. 1
      db/db_impl.h
  4. 94
      db/db_test.cc
  5. 15
      db/table_cache.cc
  6. 3
      db/table_cache.h
  7. 6
      db/version_set.cc
  8. 2
      db/version_set.h
  9. 2
      include/rocksdb/iterator.h
  10. 24
      include/rocksdb/options.h
  11. 7
      include/rocksdb/status.h
  12. 18
      table/table.cc
  13. 6
      table/table.h

1
.gitignore vendored

@ -21,3 +21,4 @@ util/build_version.cc
build_tools/VALGRIND_LOGS/ build_tools/VALGRIND_LOGS/
coverage/COVERAGE_REPORT coverage/COVERAGE_REPORT
util/build_version.cc.tmp util/build_version.cc.tmp
.gdbhistory

@ -2203,7 +2203,6 @@ Status DBImpl::Get(const ReadOptions& options,
Status DBImpl::GetImpl(const ReadOptions& options, Status DBImpl::GetImpl(const ReadOptions& options,
const Slice& key, const Slice& key,
std::string* value, std::string* value,
const bool no_io,
bool* value_found) { bool* value_found) {
Status s; Status s;
@ -2242,7 +2241,7 @@ Status DBImpl::GetImpl(const ReadOptions& options,
// Done // Done
} else { } else {
current->Get(options, lkey, value, &s, &merge_operands, &stats, current->Get(options, lkey, value, &s, &merge_operands, &stats,
options_, no_io, value_found); options_, value_found);
have_stat_update = true; have_stat_update = true;
} }
mutex_.Lock(); mutex_.Lock();
@ -2348,7 +2347,9 @@ bool DBImpl::KeyMayExist(const ReadOptions& options,
if (value_found != nullptr) { if (value_found != nullptr) {
*value_found = true; // falsify later if key-may-exist but can't fetch value *value_found = true; // falsify later if key-may-exist but can't fetch value
} }
return GetImpl(options, key, value, true, value_found).ok(); ReadOptions roptions = options;
roptions.read_tier = kBlockCacheTier; // read from block cache only
return GetImpl(roptions, key, value, value_found).ok();
} }
Iterator* DBImpl::NewIterator(const ReadOptions& options) { Iterator* DBImpl::NewIterator(const ReadOptions& options) {

@ -424,7 +424,6 @@ class DBImpl : public DB {
Status GetImpl(const ReadOptions& options, Status GetImpl(const ReadOptions& options,
const Slice& key, const Slice& key,
std::string* value, std::string* value,
const bool no_io = false,
bool* value_found = nullptr); bool* value_found = nullptr);
}; };

@ -11,6 +11,7 @@
#include "db/filename.h" #include "db/filename.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "db/db_statistics.h"
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
@ -829,6 +830,7 @@ TEST(DBTest, KeyMayExist) {
std::string value; std::string value;
Options options = CurrentOptions(); Options options = CurrentOptions();
options.filter_policy = NewBloomFilterPolicy(20); options.filter_policy = NewBloomFilterPolicy(20);
options.statistics = leveldb::CreateDBStatistics();
Reopen(&options); Reopen(&options);
ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value));
@ -841,24 +843,114 @@ TEST(DBTest, KeyMayExist) {
dbfull()->Flush(FlushOptions()); dbfull()->Flush(FlushOptions());
value.clear(); value.clear();
value_found = false;
long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
long cache_miss =
options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS);
ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found));
ASSERT_TRUE(!value_found); ASSERT_TRUE(!value_found);
// assert that no new files were opened and no new blocks were
// read into block cache.
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
ASSERT_EQ(cache_miss,
options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS));
ASSERT_OK(db_->Delete(WriteOptions(), "a")); ASSERT_OK(db_->Delete(WriteOptions(), "a"));
numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
cache_miss = options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS);
ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value));
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
ASSERT_EQ(cache_miss,
options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS));
dbfull()->Flush(FlushOptions()); dbfull()->Flush(FlushOptions());
dbfull()->CompactRange(nullptr, nullptr); dbfull()->CompactRange(nullptr, nullptr);
numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
cache_miss = options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS);
ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value));
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
ASSERT_EQ(cache_miss,
options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS));
ASSERT_OK(db_->Delete(WriteOptions(), "c")); ASSERT_OK(db_->Delete(WriteOptions(), "c"));
numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
cache_miss = options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS);
ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value)); ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value));
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
ASSERT_EQ(cache_miss,
options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS));
delete options.filter_policy; delete options.filter_policy;
} while (ChangeOptions()); } while (ChangeOptions());
} }
TEST(DBTest, NonBlockingIteration) {
do {
ReadOptions non_blocking_opts, regular_opts;
Options options = CurrentOptions();
options.statistics = leveldb::CreateDBStatistics();
non_blocking_opts.read_tier = kBlockCacheTier;
Reopen(&options);
// write one kv to the database.
ASSERT_OK(db_->Put(WriteOptions(), "a", "b"));
// scan using non-blocking iterator. We should find it because
// it is in memtable.
Iterator* iter = db_->NewIterator(non_blocking_opts);
int count = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ASSERT_TRUE(iter->status().ok());
count++;
}
ASSERT_EQ(count, 1);
delete iter;
// flush memtable to storage. Now, the key should not be in the
// memtable neither in the block cache.
dbfull()->Flush(FlushOptions());
// verify that a non-blocking iterator does not find any
// kvs. Neither does it do any IOs to storage.
long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
long cache_miss =
options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS);
iter = db_->NewIterator(non_blocking_opts);
count = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
count++;
}
ASSERT_EQ(count, 0);
ASSERT_TRUE(iter->status().IsIncomplete());
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
ASSERT_EQ(cache_miss,
options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS));
delete iter;
// read in the specified block via a regular get
ASSERT_EQ(Get("a"), "b");
// verify that we can find it via a non-blocking scan
numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS);
cache_miss = options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS);
iter = db_->NewIterator(non_blocking_opts);
count = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
ASSERT_TRUE(iter->status().ok());
count++;
}
ASSERT_EQ(count, 1);
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS));
ASSERT_EQ(cache_miss,
options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS));
delete iter;
} while (ChangeOptions());
}
// A delete is skipped for key if KeyMayExist(key) returns False // A delete is skipped for key if KeyMayExist(key) returns False
// Tests Writebatch consistency and proper delete behaviour // Tests Writebatch consistency and proper delete behaviour
TEST(DBTest, FilterDeletes) { TEST(DBTest, FilterDeletes) {

@ -48,7 +48,7 @@ Status TableCache::FindTable(const EnvOptions& toptions,
*handle = cache_->Lookup(key); *handle = cache_->Lookup(key);
if (*handle == nullptr) { if (*handle == nullptr) {
if (no_io) { // Dont do IO and return a not-found status if (no_io) { // Dont do IO and return a not-found status
return Status::NotFound("Table not found in table_cache, no_io is set"); return Status::Incomplete("Table not found in table_cache, no_io is set");
} }
if (table_io != nullptr) { if (table_io != nullptr) {
*table_io = true; // we had to do IO from storage *table_io = true; // we had to do IO from storage
@ -90,7 +90,8 @@ Iterator* TableCache::NewIterator(const ReadOptions& options,
} }
Cache::Handle* handle = nullptr; Cache::Handle* handle = nullptr;
Status s = FindTable(toptions, file_number, file_size, &handle); Status s = FindTable(toptions, file_number, file_size, &handle,
nullptr, options.read_tier == kBlockCacheTier);
if (!s.ok()) { if (!s.ok()) {
return NewErrorIterator(s); return NewErrorIterator(s);
} }
@ -117,17 +118,17 @@ Status TableCache::Get(const ReadOptions& options,
void* arg, void* arg,
bool (*saver)(void*, const Slice&, const Slice&, bool), bool (*saver)(void*, const Slice&, const Slice&, bool),
bool* table_io, bool* table_io,
void (*mark_key_may_exist)(void*), void (*mark_key_may_exist)(void*)) {
const bool no_io) {
Cache::Handle* handle = nullptr; Cache::Handle* handle = nullptr;
Status s = FindTable(storage_options_, file_number, file_size, Status s = FindTable(storage_options_, file_number, file_size,
&handle, table_io, no_io); &handle, table_io,
options.read_tier == kBlockCacheTier);
if (s.ok()) { if (s.ok()) {
Table* t = Table* t =
reinterpret_cast<Table*>(cache_->Value(handle)); reinterpret_cast<Table*>(cache_->Value(handle));
s = t->InternalGet(options, k, arg, saver, mark_key_may_exist, no_io); s = t->InternalGet(options, k, arg, saver, mark_key_may_exist);
cache_->Release(handle); cache_->Release(handle);
} else if (no_io && s.IsNotFound()) { } else if (options.read_tier && s.IsIncomplete()) {
// Couldnt find Table in cache but treat as kFound if no_io set // Couldnt find Table in cache but treat as kFound if no_io set
(*mark_key_may_exist)(arg); (*mark_key_may_exist)(arg);
return Status::OK(); return Status::OK();

@ -49,8 +49,7 @@ class TableCache {
void* arg, void* arg,
bool (*handle_result)(void*, const Slice&, const Slice&, bool), bool (*handle_result)(void*, const Slice&, const Slice&, bool),
bool* table_io, bool* table_io,
void (*mark_key_may_exist)(void*) = nullptr, void (*mark_key_may_exist)(void*) = nullptr);
const bool no_io = false);
// Determine whether the table may contain the specified prefix. If // Determine whether the table may contain the specified prefix. If
// the table index of blooms are not in memory, this may cause an I/O // the table index of blooms are not in memory, this may cause an I/O

@ -415,7 +415,6 @@ void Version::Get(const ReadOptions& options,
std::deque<std::string>* operands, std::deque<std::string>* operands,
GetStats* stats, GetStats* stats,
const Options& db_options, const Options& db_options,
const bool no_io,
bool* value_found) { bool* value_found) {
Slice ikey = k.internal_key(); Slice ikey = k.internal_key();
Slice user_key = k.user_key(); Slice user_key = k.user_key();
@ -425,9 +424,6 @@ void Version::Get(const ReadOptions& options,
auto logger = db_options.info_log; auto logger = db_options.info_log;
assert(status->ok() || status->IsMergeInProgress()); assert(status->ok() || status->IsMergeInProgress());
if (no_io) {
assert(status->ok());
}
Saver saver; Saver saver;
saver.state = status->ok()? kNotFound : kMerge; saver.state = status->ok()? kNotFound : kMerge;
saver.ucmp = ucmp; saver.ucmp = ucmp;
@ -516,7 +512,7 @@ void Version::Get(const ReadOptions& options,
bool tableIO = false; bool tableIO = false;
*status = vset_->table_cache_->Get(options, f->number, f->file_size, *status = vset_->table_cache_->Get(options, f->number, f->file_size,
ikey, &saver, SaveValue, &tableIO, ikey, &saver, SaveValue, &tableIO,
MarkKeyMayExist, no_io); MarkKeyMayExist);
// TODO: examine the behavior for corrupted key // TODO: examine the behavior for corrupted key
if (!status->ok()) { if (!status->ok()) {
return; return;

@ -76,7 +76,7 @@ class Version {
}; };
void Get(const ReadOptions&, const LookupKey& key, std::string* val, void Get(const ReadOptions&, const LookupKey& key, std::string* val,
Status* status, std::deque<std::string>* operands, GetStats* stats, Status* status, std::deque<std::string>* operands, GetStats* stats,
const Options& db_option, const bool no_io = false, const Options& db_option,
bool* value_found = nullptr); bool* value_found = nullptr);
// Adds "stats" into the current state. Returns true if a new // Adds "stats" into the current state. Returns true if a new

@ -65,6 +65,8 @@ class Iterator {
virtual Slice value() const = 0; virtual Slice value() const = 0;
// If an error has occurred, return it. Else return an ok status. // If an error has occurred, return it. Else return an ok status.
// If non-blocking IO is requested and this operation cannot be
// satisfied without doing some IO, then this returns Status::Incomplete().
virtual Status status() const = 0; virtual Status status() const = 0;
// Clients are allowed to register function/arg1/arg2 triples that // Clients are allowed to register function/arg1/arg2 triples that

@ -543,6 +543,18 @@ struct Options {
std::shared_ptr<CompactionFilterFactory> compaction_filter_factory; std::shared_ptr<CompactionFilterFactory> compaction_filter_factory;
}; };
//
// An application can issue a read request (via Get/Iterators) and specify
// if that read should process data that ALREADY resides on a specified cache
// level. For example, if an application specifies kBlockCacheTier then the
// Get call will process data that is already processed in the memtable or
// the block cache. It will not page in data from the OS cache or data that
// resides in storage.
enum ReadTier {
kReadAllTier = 0x0, // data in memtable, block cache, OS cache or storage
kBlockCacheTier = 0x1 // data in memtable or block cache
};
// Options that control read operations // Options that control read operations
struct ReadOptions { struct ReadOptions {
// If true, all data read from underlying storage will be // If true, all data read from underlying storage will be
@ -575,15 +587,23 @@ struct ReadOptions {
// Default: nullptr // Default: nullptr
const Slice* prefix; const Slice* prefix;
// Specify if this read request should process data that ALREADY
// resides on a particular cache. If the required data is not
// found at the specified cache, then Status::WouldBlock is returned.
// Default: kReadAllTier
ReadTier read_tier;
ReadOptions() ReadOptions()
: verify_checksums(false), : verify_checksums(false),
fill_cache(true), fill_cache(true),
snapshot(nullptr), snapshot(nullptr),
prefix(nullptr) { prefix(nullptr),
read_tier(kReadAllTier) {
} }
ReadOptions(bool cksum, bool cache) : ReadOptions(bool cksum, bool cache) :
verify_checksums(cksum), fill_cache(cache), verify_checksums(cksum), fill_cache(cache),
snapshot(nullptr), prefix(nullptr) { snapshot(nullptr), prefix(nullptr),
read_tier(kReadAllTier) {
} }
}; };

@ -50,6 +50,9 @@ class Status {
static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) { static Status MergeInProgress(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kMergeInProgress, msg, msg2); return Status(kMergeInProgress, msg, msg2);
} }
static Status Incomplete(const Slice& msg, const Slice& msg2 = Slice()) {
return Status(kIncomplete, msg, msg2);
}
// Returns true iff the status indicates success. // Returns true iff the status indicates success.
bool ok() const { return (state_ == nullptr); } bool ok() const { return (state_ == nullptr); }
@ -72,6 +75,9 @@ class Status {
// Returns true iff the status indicates an MergeInProgress. // Returns true iff the status indicates an MergeInProgress.
bool IsMergeInProgress() const { return code() == kMergeInProgress; } bool IsMergeInProgress() const { return code() == kMergeInProgress; }
// Returns true iff the status indicates Incomplete
bool IsIncomplete() const { return code() == kIncomplete; }
// Return a string representation of this status suitable for printing. // Return a string representation of this status suitable for printing.
// Returns the string "OK" for success. // Returns the string "OK" for success.
std::string ToString() const; std::string ToString() const;
@ -92,6 +98,7 @@ class Status {
kInvalidArgument = 4, kInvalidArgument = 4,
kIOError = 5, kIOError = 5,
kMergeInProgress = 6, kMergeInProgress = 6,
kIncomplete = 7
}; };
Code code() const { Code code() const {

@ -237,8 +237,8 @@ Iterator* Table::BlockReader(void* arg,
const ReadOptions& options, const ReadOptions& options,
const Slice& index_value, const Slice& index_value,
bool* didIO, bool* didIO,
bool for_compaction, bool for_compaction) {
const bool no_io) { const bool no_io = (options.read_tier == kBlockCacheTier);
Table* table = reinterpret_cast<Table*>(arg); Table* table = reinterpret_cast<Table*>(arg);
Cache* block_cache = table->rep_->options.block_cache.get(); Cache* block_cache = table->rep_->options.block_cache.get();
std::shared_ptr<Statistics> statistics = table->rep_->options.statistics; std::shared_ptr<Statistics> statistics = table->rep_->options.statistics;
@ -268,7 +268,8 @@ Iterator* Table::BlockReader(void* arg,
RecordTick(statistics, BLOCK_CACHE_HIT); RecordTick(statistics, BLOCK_CACHE_HIT);
} else if (no_io) { } else if (no_io) {
return nullptr; // Did not find in block_cache and can't do IO // Did not find in block_cache and can't do IO
return NewErrorIterator(Status::Incomplete("no blocking io"));
} else { } else {
Histograms histogram = for_compaction ? Histograms histogram = for_compaction ?
READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS; READ_BLOCK_COMPACTION_MICROS : READ_BLOCK_GET_MICROS;
@ -292,7 +293,8 @@ Iterator* Table::BlockReader(void* arg,
RecordTick(statistics, BLOCK_CACHE_MISS); RecordTick(statistics, BLOCK_CACHE_MISS);
} }
} else if (no_io) { } else if (no_io) {
return nullptr; // Could not read from block_cache and can't do IO // Could not read from block_cache and can't do IO
return NewErrorIterator(Status::Incomplete("no blocking io"));
}else { }else {
s = ReadBlock(table->rep_->file.get(), options, handle, &block, didIO); s = ReadBlock(table->rep_->file.get(), options, handle, &block, didIO);
} }
@ -401,8 +403,7 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k,
void* arg, void* arg,
bool (*saver)(void*, const Slice&, const Slice&, bool (*saver)(void*, const Slice&, const Slice&,
bool), bool),
void (*mark_key_may_exist)(void*), void (*mark_key_may_exist)(void*)) {
const bool no_io) {
Status s; Status s;
Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator);
bool done = false; bool done = false;
@ -421,9 +422,10 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k,
} else { } else {
bool didIO = false; bool didIO = false;
Iterator* block_iter = BlockReader(this, options, iiter->value(), Iterator* block_iter = BlockReader(this, options, iiter->value(),
&didIO, false, no_io); &didIO);
if (no_io && !block_iter) { // couldn't get block from block_cache if (options.read_tier && block_iter->status().IsIncomplete()) {
// couldn't get block from block_cache
// Update Saver.state to Found because we are only looking for whether // Update Saver.state to Found because we are only looking for whether
// we can guarantee the key is not there when "no_io" is set // we can guarantee the key is not there when "no_io" is set
(*mark_key_may_exist)(arg); (*mark_key_may_exist)(arg);

@ -79,8 +79,7 @@ class Table {
const EnvOptions& soptions, const Slice&, const EnvOptions& soptions, const Slice&,
bool for_compaction); bool for_compaction);
static Iterator* BlockReader(void*, const ReadOptions&, const Slice&, static Iterator* BlockReader(void*, const ReadOptions&, const Slice&,
bool* didIO, bool for_compaction = false, bool* didIO, bool for_compaction = false);
const bool no_io = false);
// Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found // Calls (*handle_result)(arg, ...) repeatedly, starting with the entry found
// after a call to Seek(key), until handle_result returns false. // after a call to Seek(key), until handle_result returns false.
@ -90,8 +89,7 @@ class Table {
const ReadOptions&, const Slice& key, const ReadOptions&, const Slice& key,
void* arg, void* arg,
bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool),
void (*mark_key_may_exist)(void*) = nullptr, void (*mark_key_may_exist)(void*) = nullptr);
const bool no_io = false);
void ReadMeta(const Footer& footer); void ReadMeta(const Footer& footer);

Loading…
Cancel
Save