diff --git a/db/db_bench.cc b/db/db_bench.cc index 253a64bcf..a5c95732e 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -328,7 +328,7 @@ static auto FLAGS_bytes_per_sync = leveldb::Options().bytes_per_sync; // On true, deletes use bloom-filter and drop the delete if key not present -static bool FLAGS_deletes_check_filter_first = false; +static bool FLAGS_filter_deletes = false; namespace leveldb { @@ -1114,7 +1114,7 @@ unique_ptr GenerateKeyFromInt(int v, const char* suffix = "") options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; options.max_bytes_for_level_multiplier = FLAGS_max_bytes_for_level_multiplier; - options.deletes_check_filter_first = FLAGS_deletes_check_filter_first; + options.filter_deletes = FLAGS_filter_deletes; if (FLAGS_max_bytes_for_level_multiplier_additional.size() > 0) { if (FLAGS_max_bytes_for_level_multiplier_additional.size() != (unsigned int)FLAGS_num_levels) { @@ -2220,9 +2220,9 @@ int main(int argc, char** argv) { FLAGS_keys_per_multiget = n; } else if (sscanf(argv[i], "--bytes_per_sync=%ld%c", &l, &junk) == 1) { FLAGS_bytes_per_sync = l; - } else if (sscanf(argv[i], "--deletes_check_filter_first=%d%c", &n, &junk) + } else if (sscanf(argv[i], "--filter_deletes=%d%c", &n, &junk) == 1 && (n == 0 || n ==1 )) { - FLAGS_deletes_check_filter_first = n; + FLAGS_filter_deletes = n; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/db/db_impl.cc b/db/db_impl.cc index 326b61e83..3b4e77d65 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -691,7 +691,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, mem = new MemTable(internal_comparator_, NumberLevels()); mem->Ref(); } - status = WriteBatchInternal::InsertInto(&batch, mem); + status = WriteBatchInternal::InsertInto(&batch, mem, &options_); MaybeIgnoreError(&status); if (!status.ok()) { break; @@ -2078,13 +2078,13 @@ Status DBImpl::Get(const ReadOptions& options, return GetImpl(options, key, value); } -// If no_IO is true, then returns Status::NotFound if key is not in memtable, +// If no_io is true, then returns Status::NotFound if key is not in memtable, // immutable-memtable and bloom-filters can guarantee that key is not in db, -// "value" is garbage string if no_IO is true +// "value" is garbage string if no_io is true Status DBImpl::GetImpl(const ReadOptions& options, const Slice& key, std::string* value, - const bool no_IO) { + const bool no_io) { Status s; StopWatch sw(env_, options_.statistics, DB_GET); @@ -2113,12 +2113,12 @@ Status DBImpl::GetImpl(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // value will contain the current merge operand in the latter case. LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s, options_, no_IO)) { + if (mem->Get(lkey, value, &s, options_, no_io)) { // Done - } else if (imm.Get(lkey, value, &s, options_, no_IO)) { + } else if (imm.Get(lkey, value, &s, options_, no_io)) { // Done } else { - current->Get(options, lkey, value, &s, &stats, options_, no_IO); + current->Get(options, lkey, value, &s, &stats, options_, no_io); have_stat_update = true; } mutex_.Lock(); @@ -2209,8 +2209,17 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, } bool DBImpl::KeyMayExist(const Slice& key) { + return KeyMayExistImpl(key, versions_->LastSequence()); +} + +bool DBImpl::KeyMayExistImpl(const Slice& key, + const SequenceNumber read_from_seq) { std::string value; - const Status s = GetImpl(ReadOptions(), key, &value, true); + SnapshotImpl read_from_snapshot; + read_from_snapshot.number_ = read_from_seq; + ReadOptions ropts; + ropts.snapshot = &read_from_snapshot; + const Status s = GetImpl(ropts, key, &value, true); return !s.IsNotFound(); } @@ -2249,10 +2258,6 @@ Status DBImpl::Merge(const WriteOptions& o, const Slice& key, } Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { - if (options_.deletes_check_filter_first && !KeyMayExist(key)) { - RecordTick(options_.statistics, NUMBER_FILTERED_DELETES); - return Status::OK(); - } return DB::Delete(options, key); } @@ -2311,7 +2316,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } } if (status.ok()) { - status = WriteBatchInternal::InsertInto(updates, mem_); + status = WriteBatchInternal::InsertInto(updates, mem_, &options_, this, + options_.filter_deletes); if (!status.ok()) { // Panic for in-memory corruptions // Note that existing logic was not sound. Any partial failure writing diff --git a/db/db_impl.h b/db/db_impl.h index 157acd9b8..5229cf3e2 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -103,6 +103,15 @@ class DBImpl : public DB { // Trigger's a background call for testing. void TEST_PurgeObsoleteteWAL(); + // KeyMayExist's internal function, but can be called internally from rocksdb + // to check memtable from sequence_number=read_from_seq. This is useful to + // check presence of key in db when key's existence is to be also checked in + // an incompletely written WriteBatch in memtable. eg. Database doesn't have + // key A and WriteBatch=[PutA,B; DelA]. A KeyMayExist called from DelA also + // needs to check itself for any PutA to be sure to not drop the delete. + bool KeyMayExistImpl(const Slice& key, + const SequenceNumber read_from_seq); + protected: Env* const env_; const std::string dbname_; @@ -399,11 +408,11 @@ class DBImpl : public DB { std::vector& snapshots, SequenceNumber* prev_snapshot); - // Function that Get and KeyMayExist call with no_IO true or false + // Function that Get and KeyMayExistImpl call with no_io true or false Status GetImpl(const ReadOptions& options, const Slice& key, std::string* value, - const bool no_IO = false); + const bool no_io = false); }; // Sanitize db options. The caller should delete result.info_log if diff --git a/db/db_test.cc b/db/db_test.cc index acdcd41e2..d25fe8b3e 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -291,7 +291,7 @@ class DBTest { // TODO -- test more options break; case kDeletesFilterFirst: - options.deletes_check_filter_first = true; + options.filter_deletes = true; break; default: break; @@ -805,6 +805,44 @@ TEST(DBTest, KeyMayExist) { } while (ChangeOptions()); } +// A delete is skipped for key if KeyMayExist(key) returns False +// Tests Writebatch consistency and proper delete behaviour +TEST(DBTest, FilterDeletes) { + Options options = CurrentOptions(); + options.filter_policy = NewBloomFilterPolicy(20); + options.filter_deletes = true; + Reopen(&options); + WriteBatch batch; + + batch.Delete("a"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(AllEntriesFor("a"), "[ ]"); // Delete skipped + batch.Clear(); + + batch.Put("a", "b"); + batch.Delete("a"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(Get("a"), "NOT_FOUND"); + ASSERT_EQ(AllEntriesFor("a"), "[ DEL, b ]"); // Delete issued + batch.Clear(); + + batch.Delete("c"); + batch.Put("c", "d"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(Get("c"), "d"); + ASSERT_EQ(AllEntriesFor("c"), "[ d ]"); // Delete skipped + batch.Clear(); + + dbfull()->Flush(FlushOptions()); // A stray Flush + + batch.Delete("c"); + dbfull()->Write(WriteOptions(), &batch); + ASSERT_EQ(AllEntriesFor("c"), "[ DEL, d ]"); // Delete issued + batch.Clear(); + + delete options.filter_policy; +} + TEST(DBTest, IterEmpty) { Iterator* iter = db_->NewIterator(ReadOptions()); @@ -3192,6 +3230,9 @@ static bool CompareIterators(int step, TEST(DBTest, Randomized) { Random rnd(test::RandomSeed()); do { + if (CurrentOptions().filter_deletes) { + ChangeOptions(); // DBTest.Randomized not suited for filter_deletes + } ModelDB model(CurrentOptions()); const int N = 10000; const Snapshot* model_snap = nullptr; diff --git a/db/table_cache.cc b/db/table_cache.cc index 4cc105afe..02408d95c 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -39,15 +39,19 @@ TableCache::~TableCache() { Status TableCache::FindTable(const EnvOptions& toptions, uint64_t file_number, uint64_t file_size, - Cache::Handle** handle, bool* tableIO) { + Cache::Handle** handle, bool* table_io, + const bool no_io) { Status s; char buf[sizeof(file_number)]; EncodeFixed64(buf, file_number); Slice key(buf, sizeof(buf)); *handle = cache_->Lookup(key); if (*handle == nullptr) { - if (tableIO != nullptr) { - *tableIO = true; // we had to do IO from storage + if (no_io) { // Dont do IO and return a not-found status + return Status::NotFound("Table not found in table_cache, no_io is set"); + } + if (table_io != nullptr) { + *table_io = true; // we had to do IO from storage } std::string fname = TableFileName(dbname_, file_number); unique_ptr file; @@ -112,17 +116,21 @@ Status TableCache::Get(const ReadOptions& options, const Slice& k, void* arg, bool (*saver)(void*, const Slice&, const Slice&, bool), - bool* tableIO, + bool* table_io, void (*mark_key_may_exist)(void*), - const bool no_IO) { + const bool no_io) { Cache::Handle* handle = nullptr; Status s = FindTable(storage_options_, file_number, file_size, - &handle, tableIO); + &handle, table_io, no_io); if (s.ok()) { Table* t = reinterpret_cast(cache_->Value(handle)); - s = t->InternalGet(options, k, arg, saver, mark_key_may_exist, no_IO); + s = t->InternalGet(options, k, arg, saver, mark_key_may_exist, no_io); cache_->Release(handle); + } else if (no_io && s.IsNotFound()) { + // Couldnt find Table in cache but treat as kFound if no_io set + (*mark_key_may_exist)(arg); + return Status::OK(); } return s; } diff --git a/db/table_cache.h b/db/table_cache.h index 737e53c9e..2f3787609 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -48,9 +48,9 @@ class TableCache { const Slice& k, void* arg, bool (*handle_result)(void*, const Slice&, const Slice&, bool), - bool* tableIO, + bool* table_io, void (*mark_key_may_exist)(void*) = nullptr, - const bool no_IO = false); + const bool no_io = false); // Evict any entry for the specified file number void Evict(uint64_t file_number); @@ -62,9 +62,9 @@ class TableCache { const EnvOptions& storage_options_; std::shared_ptr cache_; - Status FindTable(const EnvOptions& toptions, - uint64_t file_number, uint64_t file_size, Cache::Handle**, - bool* tableIO = nullptr); + Status FindTable(const EnvOptions& toptions, uint64_t file_number, + uint64_t file_size, Cache::Handle**, bool* table_io=nullptr, + const bool no_io = false); }; } // namespace leveldb diff --git a/db/version_set.cc b/db/version_set.cc index 19dc022f4..15ff1330f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -339,7 +339,7 @@ void Version::Get(const ReadOptions& options, Status *status, GetStats* stats, const Options& db_options, - const bool no_IO) { + const bool no_io) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); const Comparator* ucmp = vset_->icmp_.user_comparator(); @@ -348,7 +348,7 @@ void Version::Get(const ReadOptions& options, auto logger = db_options.info_log; assert(status->ok() || status->IsMergeInProgress()); - if (no_IO) { + if (no_io) { assert(status->ok()); } Saver saver; @@ -419,7 +419,7 @@ void Version::Get(const ReadOptions& options, bool tableIO = false; *status = vset_->table_cache_->Get(options, f->number, f->file_size, ikey, &saver, SaveValue, &tableIO, - MarkKeyMayExist, no_IO); + MarkKeyMayExist, no_io); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; diff --git a/db/version_set.h b/db/version_set.h index ec7a2750e..11bfb9961 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -75,7 +75,7 @@ class Version { }; void Get(const ReadOptions&, const LookupKey& key, std::string* val, Status* status, GetStats* stats, const Options& db_option, - const bool no_IO = false); + const bool no_io = false); // Adds "stats" into the current state. Returns true if a new // compaction may need to be triggered, false otherwise. diff --git a/db/write_batch.cc b/db/write_batch.cc index 4ca4819fd..9845f49ad 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -16,8 +16,9 @@ #include "leveldb/write_batch.h" -#include "leveldb/db.h" +#include "leveldb/statistics.h" #include "db/dbformat.h" +#include "db/db_impl.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "util/coding.h" @@ -139,6 +140,23 @@ class MemTableInserter : public WriteBatch::Handler { public: SequenceNumber sequence_; MemTable* mem_; + const Options* options_; + DBImpl* db_; + const bool filter_deletes_; + + MemTableInserter(SequenceNumber sequence, MemTable* mem, const Options* opts, + DB* db, const bool filter_deletes) + : sequence_(sequence), + mem_(mem), + options_(opts), + db_(reinterpret_cast(db)), + filter_deletes_(filter_deletes) { + assert(mem_); + if (filter_deletes_) { + assert(options_); + assert(db_); + } + } virtual void Put(const Slice& key, const Slice& value) { mem_->Add(sequence_, kTypeValue, key, value); @@ -149,17 +167,21 @@ class MemTableInserter : public WriteBatch::Handler { sequence_++; } virtual void Delete(const Slice& key) { + if (filter_deletes_ && !db_->KeyMayExistImpl(key, sequence_)) { + RecordTick(options_->statistics, NUMBER_FILTERED_DELETES); + return; + } mem_->Add(sequence_, kTypeDeletion, key, Slice()); sequence_++; } }; } // namespace -Status WriteBatchInternal::InsertInto(const WriteBatch* b, - MemTable* memtable) { - MemTableInserter inserter; - inserter.sequence_ = WriteBatchInternal::Sequence(b); - inserter.mem_ = memtable; +Status WriteBatchInternal::InsertInto(const WriteBatch* b, MemTable* mem, + const Options* opts, DB* db, + const bool filter_deletes) { + MemTableInserter inserter(WriteBatchInternal::Sequence(b), mem, opts, db, + filter_deletes); return b->Iterate(&inserter); } diff --git a/db/write_batch_internal.h b/db/write_batch_internal.h index eb37733c2..649752ce6 100644 --- a/db/write_batch_internal.h +++ b/db/write_batch_internal.h @@ -7,6 +7,8 @@ #include "leveldb/types.h" #include "leveldb/write_batch.h" +#include "leveldb/db.h" +#include "leveldb/options.h" namespace leveldb { @@ -39,7 +41,12 @@ class WriteBatchInternal { static void SetContents(WriteBatch* batch, const Slice& contents); - static Status InsertInto(const WriteBatch* batch, MemTable* memtable); + // Inserts batch entries into memtable + // Drops deletes in batch if filter_del is set to true and + // db->KeyMayExist returns false + static Status InsertInto(const WriteBatch* batch, MemTable* memtable, + const Options* opts = nullptr, DB* db = nullptr, + const bool filter_del = false); static void Append(WriteBatch* dst, const WriteBatch* src); }; diff --git a/include/leveldb/db.h b/include/leveldb/db.h index b7efeb54b..e331e7f67 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -122,8 +122,12 @@ class DB { // If the key definitely does not exist in the database, then this method // returns false. Otherwise return true. This check is potentially - // lighter-weight than invoking DB::Get(). No IO is performed - virtual bool KeyMayExist(const Slice& key) = 0; + // lighter-weight than invoking DB::Get(). One way to make this lighter weight + // is to avoid doing any IOs + // Default implementation here returns true + virtual bool KeyMayExist(const Slice& key) { + return true; + } // Return a heap-allocated iterator over the contents of the database. // The result of NewIterator() is initially invalid (caller must diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 3341b72a2..c178ddeee 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -472,7 +472,7 @@ struct Options { // only incurs in-memory look up. This optimization avoids writing the delete // to storage when appropriate. // Default: false - bool deletes_check_filter_first; + bool filter_deletes; }; diff --git a/table/table.cc b/table/table.cc index f7b664a4f..c51a0aa44 100644 --- a/table/table.cc +++ b/table/table.cc @@ -324,7 +324,7 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k, bool (*saver)(void*, const Slice&, const Slice&, bool), void (*mark_key_may_exist)(void*), - const bool no_IO) { + const bool no_io) { Status s; Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); bool done = false; @@ -340,9 +340,9 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k, // cross one data block, we should be fine. RecordTick(rep_->options.statistics, BLOOM_FILTER_USEFUL); break; - } else if (no_IO) { + } else if (no_io) { // Update Saver.state to Found because we are only looking for whether - // bloom-filter can guarantee the key is not there when "no_IO" + // bloom-filter can guarantee the key is not there when "no_io" (*mark_key_may_exist)(arg); done = true; } else { diff --git a/table/table.h b/table/table.h index 4674e262b..0be95f368 100644 --- a/table/table.h +++ b/table/table.h @@ -88,7 +88,7 @@ class Table { void* arg, bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), void (*mark_key_may_exist)(void*) = nullptr, - const bool no_IO = false); + const bool no_io = false); void ReadMeta(const Footer& footer); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 8d27c1a68..fcd5dc269 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -181,7 +181,7 @@ static uint32_t FLAGS_log2_keys_per_lock = 2; // implies 2^2 keys per lock static uint32_t FLAGS_purge_redundant_percent = 50; // On true, deletes use bloom-filter and drop the delete if key not present -static bool FLAGS_deletes_check_filter_first = false; +static bool FLAGS_filter_deletes = false; // Level0 compaction start trigger static int FLAGS_level0_file_num_compaction_trigger = 0; @@ -904,7 +904,7 @@ class StressTest { fprintf(stdout, "Purge redundant %% : %d\n", FLAGS_purge_redundant_percent); fprintf(stdout, "Deletes use filter : %d\n", - FLAGS_deletes_check_filter_first); + FLAGS_filter_deletes); fprintf(stdout, "Num keys per lock : %d\n", 1 << FLAGS_log2_keys_per_lock); @@ -960,7 +960,7 @@ class StressTest { options.delete_obsolete_files_period_micros = FLAGS_delete_obsolete_files_period_micros; options.max_manifest_file_size = 1024; - options.deletes_check_filter_first = FLAGS_deletes_check_filter_first; + options.filter_deletes = FLAGS_filter_deletes; static Random purge_percent(1000); // no benefit from non-determinism here if (purge_percent.Uniform(100) < FLAGS_purge_redundant_percent - 1) { options.purge_redundant_kvs_while_flush = false; @@ -1160,9 +1160,9 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--purge_redundant_percent=%d%c", &n, &junk) == 1 && (n >= 0 && n <= 100)) { FLAGS_purge_redundant_percent = n; - } else if (sscanf(argv[i], "--deletes_check_filter_first=%d%c", &n, &junk) + } else if (sscanf(argv[i], "--filter_deletes=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { - FLAGS_deletes_check_filter_first = n; + FLAGS_filter_deletes = n; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/util/options.cc b/util/options.cc index 1ac25845a..e884d87d7 100644 --- a/util/options.cc +++ b/util/options.cc @@ -75,7 +75,7 @@ Options::Options() access_hint_on_compaction_start(NORMAL), use_adaptive_mutex(false), bytes_per_sync(0), - deletes_check_filter_first(false) { + filter_deletes(false) { } static const char* const access_hints[] = { @@ -209,8 +209,8 @@ Options::Dump(Logger* log) const use_adaptive_mutex); Log(log," Options.bytes_per_sync: %ld", bytes_per_sync); - Log(log," Options.deletes_check_filter_first: %d", - deletes_check_filter_first); + Log(log," Options.filter_deletes: %d", + filter_deletes); } // Options::Dump //