diff --git a/Makefile b/Makefile index 2665c3a21..9b1d05fcf 100644 --- a/Makefile +++ b/Makefile @@ -90,8 +90,8 @@ SHARED3 = $(SHARED1) SHARED = $(SHARED1) else # Update db.h if you change these. -SHARED_MAJOR = 1 -SHARED_MINOR = 5 +SHARED_MAJOR = 2 +SHARED_MINOR = 0 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) diff --git a/db/db_bench.cc b/db/db_bench.cc index ecd979ec8..1f9dbd1b4 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -302,18 +302,12 @@ static uint64_t FLAGS_WAL_ttl_seconds = 0; // Allow buffered io using OS buffers static bool FLAGS_use_os_buffer; -// Allow filesystem to do read-aheads -static bool FLAGS_use_fsreadahead; - // Allow reads to occur via mmap-ing files static bool FLAGS_use_mmap_reads; // Allow writes to occur via mmap-ing files static bool FLAGS_use_mmap_writes; -// Allow readaheads to occur for compactions -static bool FLAGS_use_readahead_compactions; - // Advise random access on table file open static bool FLAGS_advise_random_on_open = leveldb::Options().advise_random_on_open; @@ -343,6 +337,9 @@ static auto FLAGS_use_adaptive_mutex = static auto FLAGS_bytes_per_sync = leveldb::Options().bytes_per_sync; +// On true, deletes use bloom-filter and drop the delete if key not present +static bool FLAGS_deletes_check_filter_first = false; + namespace leveldb { // Helper for quickly generating random data. @@ -1131,6 +1128,7 @@ unique_ptr GenerateKeyFromInt(int v, const char* suffix = "") options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; options.max_bytes_for_level_multiplier = FLAGS_max_bytes_for_level_multiplier; + options.deletes_check_filter_first = FLAGS_deletes_check_filter_first; if (FLAGS_max_bytes_for_level_multiplier_additional.size() > 0) { if (FLAGS_max_bytes_for_level_multiplier_additional.size() != (unsigned int)FLAGS_num_levels) { @@ -1172,7 +1170,6 @@ unique_ptr GenerateKeyFromInt(int v, const char* suffix = "") // fill storage options options.allow_os_buffer = FLAGS_use_os_buffer; - options.allow_readahead = FLAGS_use_fsreadahead; options.allow_mmap_reads = FLAGS_use_mmap_reads; options.allow_mmap_writes = FLAGS_use_mmap_writes; options.advise_random_on_open = FLAGS_advise_random_on_open; @@ -2106,12 +2103,6 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--mmap_write=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_use_mmap_writes = n; - } else if (sscanf(argv[i], "--readahead=%d%c", &n, &junk) == 1 && - (n == 0 || n == 1)) { - FLAGS_use_fsreadahead = n; - } else if (sscanf(argv[i], "--readahead_compactions=%d%c", &n, &junk) == 1&& - (n == 0 || n == 1)) { - FLAGS_use_readahead_compactions = n; } else if (sscanf(argv[i], "--statistics=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { if (n == 1) { @@ -2255,6 +2246,9 @@ int main(int argc, char** argv) { FLAGS_keys_per_multiget = n; } else if (sscanf(argv[i], "--bytes_per_sync=%ld%c", &l, &junk) == 1) { FLAGS_bytes_per_sync = l; + } else if (sscanf(argv[i], "--deletes_check_filter_first=%d%c", &n, &junk) + == 1 && (n == 0 || n ==1 )) { + FLAGS_deletes_check_filter_first = n; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/db/db_impl.cc b/db/db_impl.cc index 37fe889a5..5cf3a28b9 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -2015,6 +2016,16 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { Status DBImpl::Get(const ReadOptions& options, const Slice& key, 
std::string* value) { + return GetImpl(options, key, value); +} + +// If no_IO is true, then returns Status::NotFound if key is not in memtable, +// immutable-memtable and bloom-filters can guarantee that key is not in db, +// "value" is garbage string if no_IO is true +Status DBImpl::GetImpl(const ReadOptions& options, + const Slice& key, + std::string* value, + const bool no_IO) { Status s; StopWatch sw(env_, options_.statistics, DB_GET); @@ -2043,12 +2054,12 @@ Status DBImpl::Get(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // value will contain the current merge operand in the latter case. LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s, options_)) { + if (mem->Get(lkey, value, &s, options_, no_IO)) { // Done - } else if (imm.Get(lkey, value, &s, options_)) { + } else if (imm.Get(lkey, value, &s, options_, no_IO)) { // Done } else { - current->Get(options, lkey, value, &s, &stats, options_); + current->Get(options, lkey, value, &s, &stats, options_, no_IO); have_stat_update = true; } mutex_.Lock(); @@ -2138,6 +2149,12 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, return statList; } +bool DBImpl::KeyMayExist(const Slice& key) { + std::string value; + const Status s = GetImpl(ReadOptions(), key, &value, true); + return !s.IsNotFound(); +} + Iterator* DBImpl::NewIterator(const ReadOptions& options) { SequenceNumber latest_snapshot; Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); @@ -2173,6 +2190,10 @@ Status DBImpl::Merge(const WriteOptions& o, const Slice& key, } Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { + if (options_.deletes_check_filter_first && !KeyMayExist(key)) { + RecordTick(options_.statistics, NUMBER_FILTERED_DELETES); + return Status::OK(); + } return DB::Delete(options, key); } @@ -2232,13 +2253,19 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } if (status.ok()) { status = WriteBatchInternal::InsertInto(updates, mem_); + if (!status.ok()) { + // Panic for in-memory corruptions + // Note that existing logic was not sound. Any partial failure writing + // into the memtable would result in a state that some write ops might + // have succeeded in memtable but Status reports error for all writes. + throw std::runtime_error("In memory WriteBatch corruption!"); + } + versions_->SetLastSequence(last_sequence); + last_flushed_sequence_ = current_sequence; } mutex_.Lock(); } - last_flushed_sequence_ = current_sequence; if (updates == &tmp_batch_) tmp_batch_.Clear(); - - versions_->SetLastSequence(last_sequence); } while (true) { diff --git a/db/db_impl.h b/db/db_impl.h index fb6879020..5f09035f2 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -48,6 +48,10 @@ class DBImpl : public DB { virtual std::vector MultiGet(const ReadOptions& options, const std::vector& keys, std::vector* values); + + // Returns false if key can't exist- based on memtable, immutable-memtable and + // bloom-filters; true otherwise. 
No IO is performed + virtual bool KeyMayExist(const Slice& key); virtual Iterator* NewIterator(const ReadOptions&); virtual const Snapshot* GetSnapshot(); virtual void ReleaseSnapshot(const Snapshot* snapshot); @@ -379,6 +383,12 @@ class DBImpl : public DB { SequenceNumber in, std::vector& snapshots, SequenceNumber* prev_snapshot); + + // Function that Get and KeyMayExist call with no_IO true or false + Status GetImpl(const ReadOptions& options, + const Slice& key, + std::string* value, + const bool no_IO = false); }; // Sanitize db options. The caller should delete result.info_log if diff --git a/db/db_test.cc b/db/db_test.cc index 8275029b4..02b77f168 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -218,6 +218,7 @@ class DBTest { kManifestFileSize, kCompactOnFlush, kPerfOptions, + kDeletesFilterFirst, kEnd }; int option_config_; @@ -289,6 +290,9 @@ class DBTest { options.rate_limit_delay_milliseconds = 2; // TODO -- test more options break; + case kDeletesFilterFirst: + options.deletes_check_filter_first = true; + break; default: break; } @@ -768,6 +772,39 @@ TEST(DBTest, GetEncountersEmptyLevel) { } while (ChangeOptions()); } +// KeyMayExist-API returns false if memtable(s) and in-memory bloom-filters can +// guarantee that the key doesn't exist in the db, else true. This can lead to +// a few false positives, but not false negatives. To make test deterministic, +// use a much larger number of bits per key-20 than bits in the key, so +// that false positives are eliminated +TEST(DBTest, KeyMayExist) { + do { + Options options = CurrentOptions(); + options.filter_policy = NewBloomFilterPolicy(20); + Reopen(&options); + + ASSERT_TRUE(!db_->KeyMayExist("a")); + + ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); + ASSERT_TRUE(db_->KeyMayExist("a")); + + dbfull()->Flush(FlushOptions()); + ASSERT_TRUE(db_->KeyMayExist("a")); + + ASSERT_OK(db_->Delete(WriteOptions(), "a")); + ASSERT_TRUE(!db_->KeyMayExist("a")); + + dbfull()->Flush(FlushOptions()); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_TRUE(!db_->KeyMayExist("a")); + + ASSERT_OK(db_->Delete(WriteOptions(), "c")); + ASSERT_TRUE(!db_->KeyMayExist("c")); + + delete options.filter_policy; + } while (ChangeOptions()); +} + TEST(DBTest, IterEmpty) { Iterator* iter = db_->NewIterator(ReadOptions()); @@ -1403,7 +1440,7 @@ class DeleteFilter : public CompactionFilter { class ChangeFilter : public CompactionFilter { public: - ChangeFilter(int argv) : argv_(argv) {} + explicit ChangeFilter(int argv) : argv_(argv) {} virtual bool Filter(int level, const Slice& key, const Slice& value, std::string* new_value, @@ -2970,6 +3007,9 @@ class ModelDB: public DB { Status::NotSupported("Not implemented.")); return s; } + virtual bool KeyMayExist(const Slice& key) { + return true; // Not Supported directly + } virtual Iterator* NewIterator(const ReadOptions& options) { if (options.snapshot == nullptr) { KVMap* saved = new KVMap; diff --git a/db/memtable.cc b/db/memtable.cc index c6f8f26c2..cfd2bed04 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -119,7 +119,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, } bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, - const Options& options) { + const Options& options, const bool check_presence_only) { Slice memkey = key.memtable_key(); Table::Iterator iter(&table_); iter.Seek(memkey.data()); @@ -164,6 +164,10 @@ bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, return true; } case kTypeMerge: { + if (check_presence_only) { + *s = Status::OK(); + return 
true; + } Slice v = GetLengthPrefixedSlice(key_ptr + key_length); if (merge_in_progress) { merge_operator->Merge(key.user_key(), &v, operand, diff --git a/db/memtable.h b/db/memtable.h index 6b3c68bf6..def3a5d3d 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -63,12 +63,13 @@ class MemTable { // If memtable contains a deletion for key, store a NotFound() error // in *status and return true. // If memtable contains Merge operation as the most recent entry for a key, - // and the merge process does not stop (not reaching a value or delete), + // and if check_presence_only is set, return true with Status::OK, + // else if the merge process does not stop (not reaching a value or delete), // store the current merged result in value and MergeInProgress in s. // return false // Else, return false. bool Get(const LookupKey& key, std::string* value, Status* s, - const Options& options); + const Options& options, const bool check_presence_only = false); // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } diff --git a/db/memtablelist.cc b/db/memtablelist.cc index eb427eb16..ac89d1043 100644 --- a/db/memtablelist.cc +++ b/db/memtablelist.cc @@ -194,10 +194,10 @@ size_t MemTableList::ApproximateMemoryUsage() { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s, - const Options& options ) { + const Options& options, const bool check_presence_only) { for (list::iterator it = memlist_.begin(); it != memlist_.end(); ++it) { - if ((*it)->Get(key, value, s, options)) { + if ((*it)->Get(key, value, s, options, check_presence_only)) { return true; } } diff --git a/db/memtablelist.h b/db/memtablelist.h index 31831deac..40419e56f 100644 --- a/db/memtablelist.h +++ b/db/memtablelist.h @@ -71,7 +71,7 @@ class MemTableList { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. bool Get(const LookupKey& key, std::string* value, Status* s, - const Options& options); + const Options& options, const bool check_presence_only = false); // Returns the list of underlying memtables. 
void GetMemTables(std::vector* list); diff --git a/db/table_cache.cc b/db/table_cache.cc index 700db74db..4cc105afe 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -112,14 +112,16 @@ Status TableCache::Get(const ReadOptions& options, const Slice& k, void* arg, bool (*saver)(void*, const Slice&, const Slice&, bool), - bool* tableIO) { + bool* tableIO, + void (*mark_key_may_exist)(void*), + const bool no_IO) { Cache::Handle* handle = nullptr; Status s = FindTable(storage_options_, file_number, file_size, &handle, tableIO); if (s.ok()) { Table* t = reinterpret_cast(cache_->Value(handle)); - s = t->InternalGet(options, k, arg, saver); + s = t->InternalGet(options, k, arg, saver, mark_key_may_exist, no_IO); cache_->Release(handle); } return s; diff --git a/db/table_cache.h b/db/table_cache.h index c3996a3cc..737e53c9e 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -48,7 +48,9 @@ class TableCache { const Slice& k, void* arg, bool (*handle_result)(void*, const Slice&, const Slice&, bool), - bool* tableIO); + bool* tableIO, + void (*mark_key_may_exist)(void*) = nullptr, + const bool no_IO = false); // Evict any entry for the specified file number void Evict(uint64_t file_number); diff --git a/db/version_set.cc b/db/version_set.cc index 4b2774ecd..c8877fb82 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -245,6 +245,16 @@ struct Saver { bool didIO; // did we do any disk io? }; } + +// Called from TableCache::Get when bloom-filters can't guarantee that key does +// not exist and Get is not permitted to do IO to read the data-block and be +// certain. +// Set the key as Found and let the caller know that key-may-exist +static void MarkKeyMayExist(void* arg) { + Saver* s = reinterpret_cast(arg); + s->state = kFound; +} + static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ Saver* s = reinterpret_cast(arg); ParsedInternalKey parsed_key; @@ -337,7 +347,8 @@ void Version::Get(const ReadOptions& options, std::string* value, Status *status, GetStats* stats, - const Options& db_options) { + const Options& db_options, + const bool no_IO) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); const Comparator* ucmp = vset_->icmp_.user_comparator(); @@ -346,6 +357,9 @@ void Version::Get(const ReadOptions& options, auto logger = db_options.info_log; assert(status->ok() || status->IsMergeInProgress()); + if (no_IO) { + assert(status->ok()); + } Saver saver; saver.state = status->ok()? 
kNotFound : kMerge; saver.ucmp = ucmp; @@ -417,7 +431,8 @@ void Version::Get(const ReadOptions& options, FileMetaData* f = files[i]; bool tableIO = false; *status = vset_->table_cache_->Get(options, f->number, f->file_size, - ikey, &saver, SaveValue, &tableIO); + ikey, &saver, SaveValue, &tableIO, + MarkKeyMayExist, no_IO); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; @@ -2199,7 +2214,7 @@ Compaction* VersionSet::PickCompactionBySize(int level, double score) { assert(level >= 0); assert(level+1 < NumberLevels()); - c = new Compaction(level, level+1, MaxFileSizeForLevel(level), + c = new Compaction(level, level+1, MaxFileSizeForLevel(level+1), MaxGrandParentOverlapBytes(level), NumberLevels()); c->score_ = score; @@ -2304,7 +2319,7 @@ Compaction* VersionSet::PickCompaction() { if (level != 0 || compactions_in_progress_[0].empty()) { if(!ParentRangeInCompaction(&f->smallest, &f->largest, level, &parent_index)) { - c = new Compaction(level, level, MaxFileSizeForLevel(level), + c = new Compaction(level, level, MaxFileSizeForLevel(level+1), MaxGrandParentOverlapBytes(level), NumberLevels(), true); c->inputs_[0].push_back(f); c->parent_index_ = parent_index; @@ -2481,7 +2496,7 @@ Compaction* VersionSet::CompactRange( int out_level = (options_->compaction_style == kCompactionStyleUniversal) ? level : level+1; - Compaction* c = new Compaction(level, out_level, MaxFileSizeForLevel(level), + Compaction* c = new Compaction(level, out_level, MaxFileSizeForLevel(out_level), MaxGrandParentOverlapBytes(level), NumberLevels()); c->input_version_ = current_; c->input_version_->Ref(); diff --git a/db/version_set.h b/db/version_set.h index 56eec7ecd..342bda711 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -74,7 +74,8 @@ class Version { int seek_file_level; }; void Get(const ReadOptions&, const LookupKey& key, std::string* val, - Status* status, GetStats* stats, const Options& db_option); + Status* status, GetStats* stats, const Options& db_option, + const bool no_IO = false); // Adds "stats" into the current state. Returns true if a new // compaction may need to be triggered, false otherwise. diff --git a/include/leveldb/db.h b/include/leveldb/db.h index 8fd780e51..056920d9e 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -19,8 +19,8 @@ namespace leveldb { using std::unique_ptr; // Update Makefile if you change these -static const int kMajorVersion = 1; -static const int kMinorVersion = 5; +static const int kMajorVersion = 2; +static const int kMinorVersion = 0; struct Options; struct ReadOptions; @@ -120,6 +120,11 @@ class DB { const std::vector& keys, std::vector* values) = 0; + // If the key definitely does not exist in the database, then this method + // returns false. Otherwise return true. This check is potentially + // lighter-weight than invoking DB::Get(). No IO is performed + virtual bool KeyMayExist(const Slice& key) = 0; + // Return a heap-allocated iterator over the contents of the database. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). diff --git a/include/leveldb/options.h b/include/leveldb/options.h index 1e88c8c8f..a94a17a22 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -422,17 +422,6 @@ struct Options { // Default: true bool allow_os_buffer; - // Reading a single block from a file can cause the OS/FS to start - // readaheads of other blocks from the file. 
Default: true - // Note: Deprecated - bool allow_readahead; - - // The reads triggered by compaction allows data to be readahead - // by the OS/FS. This overrides the setting of 'allow_readahead' - // for compaction-reads. Default: true - // Note: Deprecated - bool allow_readahead_compactions; - // Allow the OS to mmap file for reading sst tables. Default: false bool allow_mmap_reads; @@ -487,6 +476,15 @@ struct Options { // The options needed to support Universal Style compactions CompactionOptionsUniversal compaction_options_universal; + + // Use bloom-filter for deletes when this is true. + // db->Delete first calls KeyMayExist which checks memtable,immutable-memtable + // and bloom-filters to determine if the key does not exist in the database. + // If the key definitely does not exist, then the delete is a noop.KeyMayExist + // only incurs in-memory look up. This optimization avoids writing the delete + // to storage when appropriate. + // Default: false + bool deletes_check_filter_first; }; // Options that control read operations diff --git a/include/leveldb/statistics.h b/include/leveldb/statistics.h index 883d30c41..928ae5a14 100644 --- a/include/leveldb/statistics.h +++ b/include/leveldb/statistics.h @@ -58,7 +58,9 @@ enum Tickers { NUMBER_MULTIGET_KEYS_READ = 19, NUMBER_MULTIGET_BYTES_READ = 20, - TICKER_ENUM_MAX = 21 + NUMBER_FILTERED_DELETES = 21, + + TICKER_ENUM_MAX = 22 }; const std::vector> TickersNameMap = { @@ -82,7 +84,8 @@ const std::vector> TickersNameMap = { { NO_ITERATORS, "rocksdb.num.iterators" }, { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" }, { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" }, - { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" } + { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" }, + { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" } }; /** diff --git a/table/table.cc b/table/table.cc index 5aceebe0f..f7b664a4f 100644 --- a/table/table.cc +++ b/table/table.cc @@ -322,7 +322,9 @@ Iterator* Table::NewIterator(const ReadOptions& options) const { Status Table::InternalGet(const ReadOptions& options, const Slice& k, void* arg, bool (*saver)(void*, const Slice&, const Slice&, - bool)) { + bool), + void (*mark_key_may_exist)(void*), + const bool no_IO) { Status s; Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); bool done = false; @@ -338,6 +340,11 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k, // cross one data block, we should be fine. 
RecordTick(rep_->options.statistics, BLOOM_FILTER_USEFUL); break; + } else if (no_IO) { + // Update Saver.state to Found because we are only looking for whether + // bloom-filter can guarantee the key is not there when "no_IO" + (*mark_key_may_exist)(arg); + done = true; } else { bool didIO = false; Iterator* block_iter = BlockReader(this, options, iiter->value(), diff --git a/table/table.h b/table/table.h index a657290b5..4674e262b 100644 --- a/table/table.h +++ b/table/table.h @@ -86,7 +86,9 @@ class Table { Status InternalGet( const ReadOptions&, const Slice& key, void* arg, - bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool)); + bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr, + const bool no_IO = false); void ReadMeta(const Footer& footer); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index cf4951b5c..fee424359 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -183,6 +183,9 @@ static uint32_t FLAGS_log2_keys_per_lock = 2; // implies 2^2 keys per lock // Percentage of times we want to purge redundant keys in memory before flushing static uint32_t FLAGS_purge_redundant_percent = 50; +// On true, deletes use bloom-filter and drop the delete if key not present +static bool FLAGS_deletes_check_filter_first = false; + // Level0 compaction start trigger static int FLAGS_level0_file_num_compaction_trigger = 0; @@ -901,8 +904,10 @@ class StressTest { fprintf(stdout, "Num times DB reopens: %d\n", FLAGS_reopen); fprintf(stdout, "Batches/snapshots : %d\n", FLAGS_test_batches_snapshots); - fprintf(stdout, "Purge redundant %% : %d\n", + fprintf(stdout, "Purge redundant %% : %d\n", FLAGS_purge_redundant_percent); + fprintf(stdout, "Deletes use filter : %d\n", + FLAGS_deletes_check_filter_first); fprintf(stdout, "Num keys per lock : %d\n", 1 << FLAGS_log2_keys_per_lock); @@ -959,6 +964,7 @@ class StressTest { options.delete_obsolete_files_period_micros = FLAGS_delete_obsolete_files_period_micros; options.max_manifest_file_size = 1024; + options.deletes_check_filter_first = FLAGS_deletes_check_filter_first; static Random purge_percent(1000); // no benefit from non-determinism here if (purge_percent.Uniform(100) < FLAGS_purge_redundant_percent - 1) { options.purge_redundant_kvs_while_flush = false; @@ -998,10 +1004,7 @@ class StressTest { void PrintStatistics() { if (dbstats) { - fprintf(stdout, "File opened:%ld closed:%ld errors:%ld\n", - dbstats->getTickerCount(NO_FILE_OPENS), - dbstats->getTickerCount(NO_FILE_CLOSES), - dbstats->getTickerCount(NO_FILE_ERRORS)); + fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); } } @@ -1165,6 +1168,9 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--purge_redundant_percent=%d%c", &n, &junk) == 1 && (n >= 0 && n <= 100)) { FLAGS_purge_redundant_percent = n; + } else if (sscanf(argv[i], "--deletes_check_filter_first=%d%c", &n, &junk) + == 1 && (n == 0 || n == 1)) { + FLAGS_deletes_check_filter_first = n; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index d4fdf8c48..70f5cb743 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -5,6 +5,7 @@ #include "util/ldb_cmd.h" #include "db/dbformat.h" +#include "db/db_impl.h" #include "db/log_reader.h" #include "db/filename.h" #include "db/write_batch_internal.h" @@ -45,7 +46,7 @@ const char* LDBCommand::DELIM = " ==> "; LDBCommand* LDBCommand::InitFromCmdLineArgs( int argc, char** argv, - Options options + const 
Options& options ) { vector args; for (int i = 1; i < argc; i++) { @@ -66,7 +67,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( */ LDBCommand* LDBCommand::InitFromCmdLineArgs( const vector& args, - Options options + const Options& options ) { // --x=y command line arguments are added as x->y map entries. map option_map; @@ -80,9 +81,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( const string OPTION_PREFIX = "--"; - for (vector::const_iterator itr = args.begin(); - itr != args.end(); itr++) { - string arg = *itr; + for (const auto& arg : args) { if (arg[0] == '-' && arg[1] == '-'){ vector splits = stringSplit(arg, '='); if (splits.size() == 2) { @@ -93,7 +92,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( flags.push_back(optionKey); } } else { - cmdTokens.push_back(string(arg)); + cmdTokens.push_back(arg); } } @@ -119,9 +118,9 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( LDBCommand* LDBCommand::SelectCommand( const std::string& cmd, - vector& cmdParams, - map& option_map, - vector& flags + const vector& cmdParams, + const map& option_map, + const vector& flags ) { if (cmd == GetCommand::Name()) { @@ -150,6 +149,8 @@ LDBCommand* LDBCommand::SelectCommand( return new DBLoaderCommand(cmdParams, option_map, flags); } else if (cmd == ManifestDumpCommand::Name()) { return new ManifestDumpCommand(cmdParams, option_map, flags); + } else if (cmd == InternalDumpCommand::Name()) { + return new InternalDumpCommand(cmdParams, option_map, flags); } return nullptr; } @@ -163,7 +164,8 @@ LDBCommand* LDBCommand::SelectCommand( * updated. */ bool LDBCommand::ParseIntOption(const map& options, - string option, int& value, LDBCommandExecuteResult& exec_state) { + const string& option, int& value, + LDBCommandExecuteResult& exec_state) { map::const_iterator itr = option_map_.find(option); if (itr != option_map_.end()) { @@ -181,6 +183,21 @@ bool LDBCommand::ParseIntOption(const map& options, return false; } +/** + * Parses the specified option and fills in the value. + * Returns true if the option is found. + * Returns false otherwise. 
+ */ +bool LDBCommand::ParseStringOption(const map& options, + const string& option, string* value) { + auto itr = option_map_.find(option); + if (itr != option_map_.end()) { + *value = itr->second; + return true; + } + return false; +} + Options LDBCommand::PrepareOptionsForOpenDB() { Options opt = options_; @@ -453,7 +470,7 @@ void ManifestDumpCommand::Help(string& ret) { ManifestDumpCommand::ManifestDumpCommand(const vector& params, const map& options, const vector& flags) : LDBCommand(options, flags, false, - BuildCmdLineOptions({ARG_VERBOSE,ARG_PATH})), + BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX})), verbose_(false), path_("") { @@ -559,6 +576,115 @@ void PrintBucketCounts(const vector& bucket_counts, int ttl_start, ReadableTime(ttl_end).c_str(), bucket_counts[num_buckets - 1]); } +const string InternalDumpCommand::ARG_COUNT_ONLY = "count_only"; +const string InternalDumpCommand::ARG_STATS = "stats"; + +InternalDumpCommand::InternalDumpCommand(const vector& params, + const map& options, + const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO, ARG_MAX_KEYS, + ARG_COUNT_ONLY, ARG_STATS})), + has_from_(false), + has_to_(false), + max_keys_(-1), + count_only_(false), + print_stats_(false) { + + has_from_ = ParseStringOption(options, ARG_FROM, &from_); + has_to_ = ParseStringOption(options, ARG_TO, &to_); + + ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_); + + print_stats_ = IsFlagPresent(flags, ARG_STATS); + count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY); + + if (is_key_hex_) { + if (has_from_) { + from_ = HexToString(from_); + } + if (has_to_) { + to_ = HexToString(to_); + } + } +} + +void InternalDumpCommand::Help(string& ret) { + ret.append(" "); + ret.append(InternalDumpCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_STATS + "]"); + ret.append("\n"); +} + +void InternalDumpCommand::DoCommand() { + if (!db_) { + return; + } + + if (print_stats_) { + string stats; + if (db_->GetProperty("leveldb.stats", &stats)) { + fprintf(stdout, "%s\n", stats.c_str()); + } + } + + // Cast as DBImpl to get internal iterator + DBImpl* idb = dynamic_cast(db_); + if (!idb) { + exec_state_ = LDBCommandExecuteResult::FAILED("DB is not DBImpl"); + return; + } + + // Setup internal key iterator + auto iter = unique_ptr(idb->TEST_NewInternalIterator()); + Status st = iter->status(); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:" + + st.ToString()); + } + + if (has_from_) { + InternalKey ikey(from_, kMaxSequenceNumber, kValueTypeForSeek); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + + long long count = 0; + for (; iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + if (!ParseInternalKey(iter->key(), &ikey)) { + fprintf(stderr, "Internal Key [%s] parse error!\n", + iter->key().ToString(true /* in hex*/).data()); + // TODO: add error counter + continue; + } + + // If end marker was specified, we stop before it + if (has_to_ && options_.comparator->Compare(ikey.user_key, to_) >= 0) { + break; + } + + ++count; + + if (!count_only_) { + string key = ikey.DebugString(is_key_hex_); + string value = iter->value().ToString(is_value_hex_); + fprintf(stdout, "%s => %s\n", key.data(), value.data()); + } + + // Terminate if maximum number of keys have been dumped + if (max_keys_ > 0 && count >= max_keys_) break; + } + + 
fprintf(stdout, "Internal keys in range: %lld\n", (long long) count); +} + + const string DBDumperCommand::ARG_COUNT_ONLY = "count_only"; const string DBDumperCommand::ARG_STATS = "stats"; const string DBDumperCommand::ARG_TTL_BUCKET = "bucket"; diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h index d8e4c4b11..240ebca75 100644 --- a/util/ldb_cmd.h +++ b/util/ldb_cmd.h @@ -55,13 +55,13 @@ public: static LDBCommand* InitFromCmdLineArgs( const vector& args, - Options options = Options() + const Options& options = Options() ); static LDBCommand* InitFromCmdLineArgs( int argc, char** argv, - Options options = Options() + const Options& options = Options() ); bool ValidateCmdLineOptions(); @@ -230,6 +230,8 @@ protected: string msg = st.ToString(); exec_state_ = LDBCommandExecuteResult::FAILED(msg); } + + options_ = opt; } void CloseDB () { @@ -281,13 +283,16 @@ protected: return ret; } - bool ParseIntOption(const map& options, string option, - int& value, LDBCommandExecuteResult& exec_state); + bool ParseIntOption(const map& options, const string& option, + int& value, LDBCommandExecuteResult& exec_state); -private: + bool ParseStringOption(const map& options, + const string& option, string* value); Options options_; +private: + /** * Interpret command line options and flags to determine if the key * should be input/output in hex. @@ -347,9 +352,9 @@ private: static LDBCommand* SelectCommand( const string& cmd, - vector& cmdParams, - map& option_map, - vector& flags + const vector& cmdParams, + const map& option_map, + const vector& flags ); }; @@ -397,6 +402,31 @@ private: static const string ARG_TTL_BUCKET; }; +class InternalDumpCommand: public LDBCommand { +public: + static string Name() { return "idump"; } + + InternalDumpCommand(const vector& params, + const map& options, + const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool has_from_; + string from_; + bool has_to_; + string to_; + int max_keys_; + bool count_only_; + bool print_stats_; + + static const string ARG_COUNT_ONLY; + static const string ARG_STATS; +}; + class DBLoaderCommand: public LDBCommand { public: static string Name() { return "load"; } diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc index e46aee39d..eec1f4e0e 100644 --- a/util/ldb_tool.cc +++ b/util/ldb_tool.cc @@ -59,6 +59,7 @@ public: DBDumperCommand::Help(ret); DBLoaderCommand::Help(ret); ManifestDumpCommand::Help(ret); + InternalDumpCommand::Help(ret); fprintf(stderr, "%s\n", ret.c_str()); } diff --git a/util/options.cc b/util/options.cc index c58d9614d..0ceba55a7 100644 --- a/util/options.cc +++ b/util/options.cc @@ -65,8 +65,6 @@ Options::Options() manifest_preallocation_size(4 * 1024 * 1024), purge_redundant_kvs_while_flush(true), allow_os_buffer(true), - allow_readahead(true), - allow_readahead_compactions(true), allow_mmap_reads(false), allow_mmap_writes(true), is_fd_close_on_exec(true), @@ -78,6 +76,7 @@ Options::Options() use_adaptive_mutex(false), bytes_per_sync(0), compaction_style(kCompactionStyleLevel) { + deletes_check_filter_first(false) { } static const char* const access_hints[] = { @@ -127,13 +126,10 @@ Options::Dump(Logger* log) const Log(log," Options.db_stats_log_interval: %d", db_stats_log_interval); Log(log," Options.allow_os_buffer: %d", allow_os_buffer); - Log(log," Options.allow_readahead: %d", allow_readahead); Log(log," Options.allow_mmap_reads: %d", allow_mmap_reads); Log(log," Options.allow_mmap_writes: %d", allow_mmap_writes); Log(log," Options.min_write_buffer_number_to_merge: %d", 
        min_write_buffer_number_to_merge);
-    Log(log,"  Options.allow_readahead_compactions: %d",
-        allow_readahead_compactions);
     Log(log,"  Options.purge_redundant_kvs_while_flush: %d",
        purge_redundant_kvs_while_flush);
     Log(log,"  Options.compression_opts.window_bits: %d",
@@ -194,10 +190,6 @@ Options::Dump(Logger* log) const
        purge_redundant_kvs_while_flush);
     Log(log,"  Options.allow_os_buffer: %d",
        allow_os_buffer);
-    Log(log,"  Options.allow_readahead: %d",
-        allow_readahead);
-    Log(log,"  Options.allow_readahead_compactions: %d",
-        allow_readahead_compactions);
     Log(log,"  Options.allow_mmap_reads: %d",
        allow_mmap_reads);
     Log(log,"  Options.allow_mmap_writes: %d",
@@ -220,6 +212,8 @@ Options::Dump(Logger* log) const
        bytes_per_sync);
     Log(log,"  Options.compaction_style: %d",
        compaction_style);
+    Log(log,"  Options.deletes_check_filter_first: %d",
+        deletes_check_filter_first);
     Log(log,"  Options.compaction_options_universal.size_ratio: %d",
        compaction_options_universal.size_ratio);
     Log(log,"  Options.compaction_options_universal.min_merge_width: %d",
diff --git a/utilities/ttl/db_ttl.cc b/utilities/ttl/db_ttl.cc
index d201208d3..eff675340 100644
--- a/utilities/ttl/db_ttl.cc
+++ b/utilities/ttl/db_ttl.cc
@@ -158,6 +158,10 @@ std::vector<Status> DBWithTTL::MultiGet(const ReadOptions& options,
                                          supported with TTL"));
 }
 
+bool DBWithTTL::KeyMayExist(const Slice& key) {
+  return db_->KeyMayExist(key);
+}
+
 Status DBWithTTL::Delete(const WriteOptions& wopts, const Slice& key) {
   return db_->Delete(wopts, key);
 }
diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h
index 2e01c1e3d..d24efbe48 100644
--- a/utilities/ttl/db_ttl.h
+++ b/utilities/ttl/db_ttl.h
@@ -33,6 +33,8 @@ class DBWithTTL : public DB, CompactionFilter {
                                const std::vector<Slice>& keys,
                                std::vector<std::string>* values);
 
+  virtual bool KeyMayExist(const Slice& key);
+
   virtual Status Delete(const WriteOptions& wopts, const Slice& key);
 
   virtual Status Merge(const WriteOptions& options,
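
Usage sketch (not part of the patch): the snippet below shows how the new DB::KeyMayExist() API and the deletes_check_filter_first option introduced by this change are meant to be used together. It is an illustrative example only; the database path, the bloom-filter bits-per-key value, and the main() wrapper are arbitrary choices, not code from this diff.

// Illustrative only: exercises DB::KeyMayExist and
// Options::deletes_check_filter_first as added by this patch.
#include <cassert>
#include <string>

#include "leveldb/db.h"
#include "leveldb/filter_policy.h"
#include "leveldb/options.h"

int main() {
  leveldb::Options options;
  options.create_if_missing = true;
  // A bloom filter lets KeyMayExist() rule keys out without doing IO.
  options.filter_policy = leveldb::NewBloomFilterPolicy(10);
  // New option: Delete() first calls KeyMayExist() and becomes a no-op
  // (counted under rocksdb.number.deletes.filtered) when the key is
  // definitely not in the database.
  options.deletes_check_filter_first = true;

  leveldb::DB* db;
  leveldb::Status s = leveldb::DB::Open(options, "/tmp/keymayexist_demo", &db);
  assert(s.ok());

  s = db->Put(leveldb::WriteOptions(), "k1", "v1");
  assert(s.ok());

  // New API: purely in-memory check against memtables and filters;
  // false positives are possible, false negatives are not.
  assert(db->KeyMayExist("k1"));
  bool maybe = db->KeyMayExist("never_written");  // usually false here
  (void)maybe;

  // With deletes_check_filter_first set, this delete is dropped instead of
  // being written to the WAL/memtable, since the key cannot be present.
  s = db->Delete(leveldb::WriteOptions(), "never_written");
  assert(s.ok());

  delete db;
  delete options.filter_policy;  // caller owns the filter policy in this API
  return 0;
}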