From d56523c49c812c7fca92441f6e66e7e8b22ecb26 Mon Sep 17 00:00:00 2001 From: Mayank Agarwal Date: Sat, 29 Jun 2013 12:51:24 -0700 Subject: [PATCH 1/8] Update rocksdb version Summary: rocksdb-2.0 released to third party Test Plan: visual inspection Reviewers: dhruba, haobo, sheki Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D11559 --- Makefile | 4 ++-- include/leveldb/db.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 2665c3a21..9b1d05fcf 100644 --- a/Makefile +++ b/Makefile @@ -90,8 +90,8 @@ SHARED3 = $(SHARED1) SHARED = $(SHARED1) else # Update db.h if you change these. -SHARED_MAJOR = 1 -SHARED_MINOR = 5 +SHARED_MAJOR = 2 +SHARED_MINOR = 0 SHARED1 = libleveldb.$(PLATFORM_SHARED_EXT) SHARED2 = $(SHARED1).$(SHARED_MAJOR) SHARED3 = $(SHARED1).$(SHARED_MAJOR).$(SHARED_MINOR) diff --git a/include/leveldb/db.h b/include/leveldb/db.h index 8fd780e51..6d816edce 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -19,8 +19,8 @@ namespace leveldb { using std::unique_ptr; // Update Makefile if you change these -static const int kMajorVersion = 1; -static const int kMinorVersion = 5; +static const int kMajorVersion = 2; +static const int kMinorVersion = 0; struct Options; struct ReadOptions; From 92ca816a60556846a954289d183a290a1a6b2dc2 Mon Sep 17 00:00:00 2001 From: Haobo Xu Date: Thu, 20 Jun 2013 16:02:36 -0700 Subject: [PATCH 2/8] [RocksDB] Support internal key/value dump for ldb Summary: This diff added a command 'idump' to ldb tool, which dumps the internal key/value pairs. It could be useful for diagnosis and estimating the per user key 'overhead'. Also cleaned up the ldb code a bit where I touched. 
Test Plan: make check; ldb idump Reviewers: emayanke, sheki, dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D11517 --- util/ldb_cmd.cc | 148 +++++++++++++++++++++++++++++++++++++++++++---- util/ldb_cmd.h | 46 ++++++++++++--- util/ldb_tool.cc | 1 + 3 files changed, 176 insertions(+), 19 deletions(-) diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 780d3d2a4..0134e93b4 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -5,6 +5,7 @@ #include "util/ldb_cmd.h" #include "db/dbformat.h" +#include "db/db_impl.h" #include "db/log_reader.h" #include "db/filename.h" #include "db/write_batch_internal.h" @@ -45,7 +46,7 @@ const char* LDBCommand::DELIM = " ==> "; LDBCommand* LDBCommand::InitFromCmdLineArgs( int argc, char** argv, - Options options + const Options& options ) { vector args; for (int i = 1; i < argc; i++) { @@ -66,7 +67,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( */ LDBCommand* LDBCommand::InitFromCmdLineArgs( const vector& args, - Options options + const Options& options ) { // --x=y command line arguments are added as x->y map entries. 
map option_map; @@ -80,9 +81,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( const string OPTION_PREFIX = "--"; - for (vector::const_iterator itr = args.begin(); - itr != args.end(); itr++) { - string arg = *itr; + for (const auto& arg : args) { if (arg[0] == '-' && arg[1] == '-'){ vector splits = stringSplit(arg, '='); if (splits.size() == 2) { @@ -93,7 +92,7 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( flags.push_back(optionKey); } } else { - cmdTokens.push_back(string(arg)); + cmdTokens.push_back(arg); } } @@ -119,9 +118,9 @@ LDBCommand* LDBCommand::InitFromCmdLineArgs( LDBCommand* LDBCommand::SelectCommand( const std::string& cmd, - vector& cmdParams, - map& option_map, - vector& flags + const vector& cmdParams, + const map& option_map, + const vector& flags ) { if (cmd == GetCommand::Name()) { @@ -150,6 +149,8 @@ LDBCommand* LDBCommand::SelectCommand( return new DBLoaderCommand(cmdParams, option_map, flags); } else if (cmd == ManifestDumpCommand::Name()) { return new ManifestDumpCommand(cmdParams, option_map, flags); + } else if (cmd == InternalDumpCommand::Name()) { + return new InternalDumpCommand(cmdParams, option_map, flags); } return nullptr; } @@ -163,7 +164,8 @@ LDBCommand* LDBCommand::SelectCommand( * updated. */ bool LDBCommand::ParseIntOption(const map& options, - string option, int& value, LDBCommandExecuteResult& exec_state) { + const string& option, int& value, + LDBCommandExecuteResult& exec_state) { map::const_iterator itr = option_map_.find(option); if (itr != option_map_.end()) { @@ -181,6 +183,21 @@ bool LDBCommand::ParseIntOption(const map& options, return false; } +/** + * Parses the specified option and fills in the value. + * Returns true if the option is found. + * Returns false otherwise. 
+ */ +bool LDBCommand::ParseStringOption(const map& options, + const string& option, string* value) { + auto itr = option_map_.find(option); + if (itr != option_map_.end()) { + *value = itr->second; + return true; + } + return false; +} + Options LDBCommand::PrepareOptionsForOpenDB() { Options opt = options_; @@ -453,7 +470,7 @@ void ManifestDumpCommand::Help(string& ret) { ManifestDumpCommand::ManifestDumpCommand(const vector& params, const map& options, const vector& flags) : LDBCommand(options, flags, false, - BuildCmdLineOptions({ARG_VERBOSE,ARG_PATH})), + BuildCmdLineOptions({ARG_VERBOSE, ARG_PATH, ARG_HEX})), verbose_(false), path_("") { @@ -559,6 +576,115 @@ void PrintBucketCounts(const vector& bucket_counts, int ttl_start, ReadableTime(ttl_end).c_str(), bucket_counts[num_buckets - 1]); } +const string InternalDumpCommand::ARG_COUNT_ONLY = "count_only"; +const string InternalDumpCommand::ARG_STATS = "stats"; + +InternalDumpCommand::InternalDumpCommand(const vector& params, + const map& options, + const vector& flags) : + LDBCommand(options, flags, true, + BuildCmdLineOptions({ ARG_HEX, ARG_KEY_HEX, ARG_VALUE_HEX, + ARG_FROM, ARG_TO, ARG_MAX_KEYS, + ARG_COUNT_ONLY, ARG_STATS})), + has_from_(false), + has_to_(false), + max_keys_(-1), + count_only_(false), + print_stats_(false) { + + has_from_ = ParseStringOption(options, ARG_FROM, &from_); + has_to_ = ParseStringOption(options, ARG_TO, &to_); + + ParseIntOption(options, ARG_MAX_KEYS, max_keys_, exec_state_); + + print_stats_ = IsFlagPresent(flags, ARG_STATS); + count_only_ = IsFlagPresent(flags, ARG_COUNT_ONLY); + + if (is_key_hex_) { + if (has_from_) { + from_ = HexToString(from_); + } + if (has_to_) { + to_ = HexToString(to_); + } + } +} + +void InternalDumpCommand::Help(string& ret) { + ret.append(" "); + ret.append(InternalDumpCommand::Name()); + ret.append(HelpRangeCmdArgs()); + ret.append(" [--" + ARG_MAX_KEYS + "=]"); + ret.append(" [--" + ARG_COUNT_ONLY + "]"); + ret.append(" [--" + ARG_STATS + "]"); + 
ret.append("\n"); +} + +void InternalDumpCommand::DoCommand() { + if (!db_) { + return; + } + + if (print_stats_) { + string stats; + if (db_->GetProperty("leveldb.stats", &stats)) { + fprintf(stdout, "%s\n", stats.c_str()); + } + } + + // Cast as DBImpl to get internal iterator + DBImpl* idb = dynamic_cast(db_); + if (!idb) { + exec_state_ = LDBCommandExecuteResult::FAILED("DB is not DBImpl"); + return; + } + + // Setup internal key iterator + auto iter = unique_ptr(idb->TEST_NewInternalIterator()); + Status st = iter->status(); + if (!st.ok()) { + exec_state_ = LDBCommandExecuteResult::FAILED("Iterator error:" + + st.ToString()); + } + + if (has_from_) { + InternalKey ikey(from_, kMaxSequenceNumber, kValueTypeForSeek); + iter->Seek(ikey.Encode()); + } else { + iter->SeekToFirst(); + } + + long long count = 0; + for (; iter->Valid(); iter->Next()) { + ParsedInternalKey ikey; + if (!ParseInternalKey(iter->key(), &ikey)) { + fprintf(stderr, "Internal Key [%s] parse error!\n", + iter->key().ToString(true /* in hex*/).data()); + // TODO: add error counter + continue; + } + + // If end marker was specified, we stop before it + if (has_to_ && options_.comparator->Compare(ikey.user_key, to_) >= 0) { + break; + } + + ++count; + + if (!count_only_) { + string key = ikey.DebugString(is_key_hex_); + string value = iter->value().ToString(is_value_hex_); + fprintf(stdout, "%s => %s\n", key.data(), value.data()); + } + + // Terminate if maximum number of keys have been dumped + if (max_keys_ > 0 && count >= max_keys_) break; + } + + fprintf(stdout, "Internal keys in range: %lld\n", (long long) count); +} + + const string DBDumperCommand::ARG_COUNT_ONLY = "count_only"; const string DBDumperCommand::ARG_STATS = "stats"; const string DBDumperCommand::ARG_TTL_BUCKET = "bucket"; diff --git a/util/ldb_cmd.h b/util/ldb_cmd.h index d8e4c4b11..240ebca75 100644 --- a/util/ldb_cmd.h +++ b/util/ldb_cmd.h @@ -55,13 +55,13 @@ public: static LDBCommand* InitFromCmdLineArgs( const vector& 
args, - Options options = Options() + const Options& options = Options() ); static LDBCommand* InitFromCmdLineArgs( int argc, char** argv, - Options options = Options() + const Options& options = Options() ); bool ValidateCmdLineOptions(); @@ -230,6 +230,8 @@ protected: string msg = st.ToString(); exec_state_ = LDBCommandExecuteResult::FAILED(msg); } + + options_ = opt; } void CloseDB () { @@ -281,13 +283,16 @@ protected: return ret; } - bool ParseIntOption(const map& options, string option, - int& value, LDBCommandExecuteResult& exec_state); + bool ParseIntOption(const map& options, const string& option, + int& value, LDBCommandExecuteResult& exec_state); -private: + bool ParseStringOption(const map& options, + const string& option, string* value); Options options_; +private: + /** * Interpret command line options and flags to determine if the key * should be input/output in hex. @@ -347,9 +352,9 @@ private: static LDBCommand* SelectCommand( const string& cmd, - vector& cmdParams, - map& option_map, - vector& flags + const vector& cmdParams, + const map& option_map, + const vector& flags ); }; @@ -397,6 +402,31 @@ private: static const string ARG_TTL_BUCKET; }; +class InternalDumpCommand: public LDBCommand { +public: + static string Name() { return "idump"; } + + InternalDumpCommand(const vector& params, + const map& options, + const vector& flags); + + static void Help(string& ret); + + virtual void DoCommand(); + +private: + bool has_from_; + string from_; + bool has_to_; + string to_; + int max_keys_; + bool count_only_; + bool print_stats_; + + static const string ARG_COUNT_ONLY; + static const string ARG_STATS; +}; + class DBLoaderCommand: public LDBCommand { public: static string Name() { return "load"; } diff --git a/util/ldb_tool.cc b/util/ldb_tool.cc index e46aee39d..eec1f4e0e 100644 --- a/util/ldb_tool.cc +++ b/util/ldb_tool.cc @@ -59,6 +59,7 @@ public: DBDumperCommand::Help(ret); DBLoaderCommand::Help(ret); ManifestDumpCommand::Help(ret); + 
InternalDumpCommand::Help(ret); fprintf(stderr, "%s\n", ret.c_str()); } From 9ba82786ce56d515cc07498bf46d06e2c62a6887 Mon Sep 17 00:00:00 2001 From: Haobo Xu Date: Thu, 20 Jun 2013 16:58:59 -0700 Subject: [PATCH 3/8] [RocksDB] Provide contiguous sequence number even in case of write failure Summary: Replication logic would be simplified if we can guarantee that write sequence number is always contiguous, even if write failure occurs. Dhruba and I looked at the sequence number generation part of the code. It seems fixable. Note that if WAL was successful and insert into memtable was not, we would be in an unfortunate state. The approach in this diff is : IO error is expected and error status will be returned to client, sequence number will not be advanced; In-mem error is not expected and we panic. Test Plan: make check; db_stress Reviewers: dhruba, sheki CC: leveldb Differential Revision: https://reviews.facebook.net/D11439 --- db/db_impl.cc | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index c5c156feb..7fda3ef7e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -2214,13 +2215,19 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { } if (status.ok()) { status = WriteBatchInternal::InsertInto(updates, mem_); + if (!status.ok()) { + // Panic for in-memory corruptions + // Note that existing logic was not sound. Any partial failure writing + // into the memtable would result in a state that some write ops might + // have succeeded in memtable but Status reports error for all writes.
+ throw std::runtime_error("In memory WriteBatch corruption!"); + } + versions_->SetLastSequence(last_sequence); + last_flushed_sequence_ = current_sequence; } mutex_.Lock(); } - last_flushed_sequence_ = current_sequence; if (updates == &tmp_batch_) tmp_batch_.Clear(); - - versions_->SetLastSequence(last_sequence); } while (true) { From a8d5f8dde2fbeb19600624144275511b76a57a90 Mon Sep 17 00:00:00 2001 From: Haobo Xu Date: Tue, 9 Jul 2013 11:16:39 -0700 Subject: [PATCH 4/8] [RocksDB] Remove old readahead options Summary: As title. Test Plan: make check; db_bench Reviewers: dhruba, MarkCallaghan CC: leveldb Differential Revision: https://reviews.facebook.net/D11643 --- db/db_bench.cc | 13 ------------- include/leveldb/options.h | 13 +------------ util/options.cc | 9 --------- 3 files changed, 1 insertion(+), 34 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index 179896a77..bfba23b4c 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -292,18 +292,12 @@ static uint64_t FLAGS_WAL_ttl_seconds = 0; // Allow buffered io using OS buffers static bool FLAGS_use_os_buffer; -// Allow filesystem to do read-aheads -static bool FLAGS_use_fsreadahead; - // Allow reads to occur via mmap-ing files static bool FLAGS_use_mmap_reads; // Allow writes to occur via mmap-ing files static bool FLAGS_use_mmap_writes; -// Allow readaheads to occur for compactions -static bool FLAGS_use_readahead_compactions; - // Advise random access on table file open static bool FLAGS_advise_random_on_open = leveldb::Options().advise_random_on_open; @@ -1158,7 +1152,6 @@ unique_ptr GenerateKeyFromInt(int v, const char* suffix = "") // fill storage options options.allow_os_buffer = FLAGS_use_os_buffer; - options.allow_readahead = FLAGS_use_fsreadahead; options.allow_mmap_reads = FLAGS_use_mmap_reads; options.allow_mmap_writes = FLAGS_use_mmap_writes; options.advise_random_on_open = FLAGS_advise_random_on_open; @@ -2080,12 +2073,6 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], 
"--mmap_write=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { FLAGS_use_mmap_writes = n; - } else if (sscanf(argv[i], "--readahead=%d%c", &n, &junk) == 1 && - (n == 0 || n == 1)) { - FLAGS_use_fsreadahead = n; - } else if (sscanf(argv[i], "--readahead_compactions=%d%c", &n, &junk) == 1&& - (n == 0 || n == 1)) { - FLAGS_use_readahead_compactions = n; } else if (sscanf(argv[i], "--statistics=%d%c", &n, &junk) == 1 && (n == 0 || n == 1)) { if (n == 1) { diff --git a/include/leveldb/options.h b/include/leveldb/options.h index fa69a7eff..e0c21cb7e 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -132,7 +132,7 @@ struct Options { int max_write_buffer_number; // The minimum number of write buffers that will be merged together - // before writing to storage. If set to 1, then + // before writing to storage. If set to 1, then // all write buffers are fushed to L0 as individual files and this increases // read amplification because a get request has to check in all of these // files. Also, an in-memory merge may result in writing lesser @@ -416,17 +416,6 @@ struct Options { // Default: true bool allow_os_buffer; - // Reading a single block from a file can cause the OS/FS to start - // readaheads of other blocks from the file. Default: true - // Note: Deprecated - bool allow_readahead; - - // The reads triggered by compaction allows data to be readahead - // by the OS/FS. This overrides the setting of 'allow_readahead' - // for compaction-reads. Default: true - // Note: Deprecated - bool allow_readahead_compactions; - // Allow the OS to mmap file for reading sst tables. 
Default: false bool allow_mmap_reads; diff --git a/util/options.cc b/util/options.cc index a8222ad5c..09121a0e5 100644 --- a/util/options.cc +++ b/util/options.cc @@ -65,8 +65,6 @@ Options::Options() manifest_preallocation_size(4 * 1024 * 1024), purge_redundant_kvs_while_flush(true), allow_os_buffer(true), - allow_readahead(true), - allow_readahead_compactions(true), allow_mmap_reads(false), allow_mmap_writes(true), is_fd_close_on_exec(true), @@ -126,13 +124,10 @@ Options::Dump(Logger* log) const Log(log," Options.db_stats_log_interval: %d", db_stats_log_interval); Log(log," Options.allow_os_buffer: %d", allow_os_buffer); - Log(log," Options.allow_readahead: %d", allow_readahead); Log(log," Options.allow_mmap_reads: %d", allow_mmap_reads); Log(log," Options.allow_mmap_writes: %d", allow_mmap_writes); Log(log," Options.min_write_buffer_number_to_merge: %d", min_write_buffer_number_to_merge); - Log(log," Options.allow_readahead_compactions: %d", - allow_readahead_compactions); Log(log," Options.purge_redundant_kvs_while_flush: %d", purge_redundant_kvs_while_flush); Log(log," Options.compression_opts.window_bits: %d", @@ -193,10 +188,6 @@ Options::Dump(Logger* log) const purge_redundant_kvs_while_flush); Log(log," Options.allow_os_buffer: %d", allow_os_buffer); - Log(log," Options.allow_readahead: %d", - allow_readahead); - Log(log," Options.allow_readahead_compactions: %d", - allow_readahead_compactions); Log(log," Options.allow_mmap_reads: %d", allow_mmap_reads); Log(log," Options.allow_mmap_writes: %d", From 821889e20764e88276e83df37e485a64191f1d75 Mon Sep 17 00:00:00 2001 From: Mayank Agarwal Date: Wed, 10 Jul 2013 13:17:51 -0700 Subject: [PATCH 5/8] Print complete statistics in db_stress Summary: db_stress should also print complete statistics like db_bench. 
Needed this when I wanted to measure number of delete-IOs dropped due to CheckKeyMayExist to be introduced to rocksdb codebase later- to make deletes in rocksdb faster Test Plan: make db_stress;./db_stress --max_key=100 --ops_per_thread=1000 --statistics=1 Reviewers: sheki, dhruba, vamsi, haobo Reviewed By: dhruba Differential Revision: https://reviews.facebook.net/D11655 --- tools/db_stress.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 02afe306c..26a0e6775 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -994,10 +994,7 @@ class StressTest { void PrintStatistics() { if (dbstats) { - fprintf(stdout, "File opened:%ld closed:%ld errors:%ld\n", - dbstats->getTickerCount(NO_FILE_OPENS), - dbstats->getTickerCount(NO_FILE_CLOSES), - dbstats->getTickerCount(NO_FILE_ERRORS)); + fprintf(stdout, "STATISTICS:\n%s\n", dbstats->ToString().c_str()); } } From 8a5341ec7dd9249cadd196b50e82fa3d23f414ad Mon Sep 17 00:00:00 2001 From: Xing Jin Date: Thu, 11 Jul 2013 09:03:40 -0700 Subject: [PATCH 6/8] Newbie code question Summary: This diff is more about my question when reading compaction codes, instead of a normal diff. I don't quite understand the logic here. Test Plan: I didn't do any test. If this is a bug, I will continue doing some test. 
Reviewers: haobo, dhruba, emayanke Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D11661 --- db/version_set.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/db/version_set.cc b/db/version_set.cc index 9b01d935e..686e82cba 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1974,7 +1974,7 @@ Compaction* VersionSet::PickCompactionBySize(int level, double score) { assert(level >= 0); assert(level+1 < NumberLevels()); - c = new Compaction(level, MaxFileSizeForLevel(level), + c = new Compaction(level, MaxFileSizeForLevel(level+1), MaxGrandParentOverlapBytes(level), NumberLevels()); c->score_ = score; @@ -2072,7 +2072,7 @@ Compaction* VersionSet::PickCompaction() { if (level != 0 || compactions_in_progress_[0].empty()) { if(!ParentRangeInCompaction(&f->smallest, &f->largest, level, &parent_index)) { - c = new Compaction(level, MaxFileSizeForLevel(level), + c = new Compaction(level, MaxFileSizeForLevel(level+1), MaxGrandParentOverlapBytes(level), NumberLevels(), true); c->inputs_[0].push_back(f); c->parent_index_ = parent_index; @@ -2247,7 +2247,7 @@ Compaction* VersionSet::CompactRange( } } - Compaction* c = new Compaction(level, MaxFileSizeForLevel(level), + Compaction* c = new Compaction(level, MaxFileSizeForLevel(level+1), MaxGrandParentOverlapBytes(level), NumberLevels()); c->input_version_ = current_; c->input_version_->Ref(); From 2a986919d600eba8aa0d827d5c78443f33c28310 Mon Sep 17 00:00:00 2001 From: Mayank Agarwal Date: Fri, 5 Jul 2013 18:49:18 -0700 Subject: [PATCH 7/8] Make rocksdb-deletes faster using bloom filter Summary: Wrote a new function in db_impl.c-CheckKeyMayExist that calls Get but with a new parameter turned on which makes Get return false only if bloom filters can guarantee that key is not in database. Delete calls this function and if the option- deletes_use_filter is turned on and CheckKeyMayExist returns false, the delete will be dropped saving: 1. Put of delete type 2. 
Space in the db, and 3. Compaction time Test Plan: make all check; will run db_stress and db_bench and enhance unit-test once the basic design gets approved Reviewers: dhruba, haobo, vamsi Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D11607 --- db/db_bench.cc | 7 +++++++ db/db_impl.cc | 26 ++++++++++++++++++++--- db/db_impl.h | 10 +++++++++ db/db_test.cc | 40 +++++++++++++++++++++++++++++++++++- db/memtable.cc | 6 +++++- db/memtable.h | 5 +++-- db/memtablelist.cc | 4 ++-- db/memtablelist.h | 2 +- db/table_cache.cc | 6 ++++-- db/table_cache.h | 4 +++- db/version_set.cc | 19 +++++++++++++++-- db/version_set.h | 3 ++- include/leveldb/db.h | 5 +++++ include/leveldb/options.h | 9 ++++++++ include/leveldb/statistics.h | 7 +++++-- table/table.cc | 9 +++++++- table/table.h | 4 +++- tools/db_stress.cc | 11 +++++++++- util/options.cc | 5 ++++- utilities/ttl/db_ttl.cc | 4 ++++ utilities/ttl/db_ttl.h | 2 ++ 21 files changed, 166 insertions(+), 22 deletions(-) diff --git a/db/db_bench.cc b/db/db_bench.cc index bfba23b4c..253a64bcf 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -327,6 +327,9 @@ static auto FLAGS_use_adaptive_mutex = static auto FLAGS_bytes_per_sync = leveldb::Options().bytes_per_sync; +// On true, deletes use bloom-filter and drop the delete if key not present +static bool FLAGS_deletes_check_filter_first = false; + namespace leveldb { // Helper for quickly generating random data. 
@@ -1111,6 +1114,7 @@ unique_ptr GenerateKeyFromInt(int v, const char* suffix = "") options.max_bytes_for_level_base = FLAGS_max_bytes_for_level_base; options.max_bytes_for_level_multiplier = FLAGS_max_bytes_for_level_multiplier; + options.deletes_check_filter_first = FLAGS_deletes_check_filter_first; if (FLAGS_max_bytes_for_level_multiplier_additional.size() > 0) { if (FLAGS_max_bytes_for_level_multiplier_additional.size() != (unsigned int)FLAGS_num_levels) { @@ -2216,6 +2220,9 @@ int main(int argc, char** argv) { FLAGS_keys_per_multiget = n; } else if (sscanf(argv[i], "--bytes_per_sync=%ld%c", &l, &junk) == 1) { FLAGS_bytes_per_sync = l; + } else if (sscanf(argv[i], "--deletes_check_filter_first=%d%c", &n, &junk) + == 1 && (n == 0 || n ==1 )) { + FLAGS_deletes_check_filter_first = n; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/db/db_impl.cc b/db/db_impl.cc index 7fda3ef7e..1bdd61d39 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1998,6 +1998,16 @@ int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { Status DBImpl::Get(const ReadOptions& options, const Slice& key, std::string* value) { + return GetImpl(options, key, value); +} + +// If no_IO is true, then returns Status::NotFound if key is not in memtable, +// immutable-memtable and bloom-filters can guarantee that key is not in db, +// "value" is garbage string if no_IO is true +Status DBImpl::GetImpl(const ReadOptions& options, + const Slice& key, + std::string* value, + const bool no_IO) { Status s; StopWatch sw(env_, options_.statistics, DB_GET); @@ -2026,12 +2036,12 @@ Status DBImpl::Get(const ReadOptions& options, // s is both in/out. When in, s could either be OK or MergeInProgress. // value will contain the current merge operand in the latter case. 
LookupKey lkey(key, snapshot); - if (mem->Get(lkey, value, &s, options_)) { + if (mem->Get(lkey, value, &s, options_, no_IO)) { // Done - } else if (imm.Get(lkey, value, &s, options_)) { + } else if (imm.Get(lkey, value, &s, options_, no_IO)) { // Done } else { - current->Get(options, lkey, value, &s, &stats, options_); + current->Get(options, lkey, value, &s, &stats, options_, no_IO); have_stat_update = true; } mutex_.Lock(); @@ -2121,6 +2131,12 @@ std::vector DBImpl::MultiGet(const ReadOptions& options, return statList; } +bool DBImpl::KeyMayExist(const Slice& key) { + std::string value; + const Status s = GetImpl(ReadOptions(), key, &value, true); + return !s.IsNotFound(); +} + Iterator* DBImpl::NewIterator(const ReadOptions& options) { SequenceNumber latest_snapshot; Iterator* internal_iter = NewInternalIterator(options, &latest_snapshot); @@ -2156,6 +2172,10 @@ Status DBImpl::Merge(const WriteOptions& o, const Slice& key, } Status DBImpl::Delete(const WriteOptions& options, const Slice& key) { + if (options_.deletes_check_filter_first && !KeyMayExist(key)) { + RecordTick(options_.statistics, NUMBER_FILTERED_DELETES); + return Status::OK(); + } return DB::Delete(options, key); } diff --git a/db/db_impl.h b/db/db_impl.h index fb6879020..5f09035f2 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -48,6 +48,10 @@ class DBImpl : public DB { virtual std::vector MultiGet(const ReadOptions& options, const std::vector& keys, std::vector* values); + + // Returns false if key can't exist- based on memtable, immutable-memtable and + // bloom-filters; true otherwise. 
No IO is performed + virtual bool KeyMayExist(const Slice& key); virtual Iterator* NewIterator(const ReadOptions&); virtual const Snapshot* GetSnapshot(); virtual void ReleaseSnapshot(const Snapshot* snapshot); @@ -379,6 +383,12 @@ class DBImpl : public DB { SequenceNumber in, std::vector& snapshots, SequenceNumber* prev_snapshot); + + // Function that Get and KeyMayExist call with no_IO true or false + Status GetImpl(const ReadOptions& options, + const Slice& key, + std::string* value, + const bool no_IO = false); }; // Sanitize db options. The caller should delete result.info_log if diff --git a/db/db_test.cc b/db/db_test.cc index 7ce4ac419..a88b78e3c 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -218,6 +218,7 @@ class DBTest { kManifestFileSize, kCompactOnFlush, kPerfOptions, + kDeletesFilterFirst, kEnd }; int option_config_; @@ -289,6 +290,9 @@ class DBTest { options.rate_limit_delay_milliseconds = 2; // TODO -- test more options break; + case kDeletesFilterFirst: + options.deletes_check_filter_first = true; + break; default: break; } @@ -768,6 +772,37 @@ TEST(DBTest, GetEncountersEmptyLevel) { } while (ChangeOptions()); } +// KeyMayExist-API returns false if memtable(s) and in-memory bloom-filters can +// guarantee that the key doesn't exist in the db, else true. This can lead to +// a few false positives, but not false negatives. 
To make test deterministic, +// use a much larger number of bits per key-20 than bits in the key, so +// that false positives are eliminated +TEST(DBTest, KeyMayExist) { + do { + Options options = CurrentOptions(); + options.filter_policy = NewBloomFilterPolicy(20); + Reopen(&options); + + ASSERT_TRUE(!db_->KeyMayExist("a")); + + ASSERT_OK(db_->Put(WriteOptions(), "a", "b")); + ASSERT_TRUE(db_->KeyMayExist("a")); + + dbfull()->Flush(FlushOptions()); + ASSERT_TRUE(db_->KeyMayExist("a")); + + ASSERT_OK(db_->Delete(WriteOptions(), "a")); + ASSERT_TRUE(!db_->KeyMayExist("a")); + + dbfull()->Flush(FlushOptions()); + dbfull()->CompactRange(nullptr, nullptr); + ASSERT_TRUE(!db_->KeyMayExist("a")); + + ASSERT_OK(db_->Delete(WriteOptions(), "c")); + ASSERT_TRUE(!db_->KeyMayExist("c")); + } while (ChangeOptions()); +} + TEST(DBTest, IterEmpty) { Iterator* iter = db_->NewIterator(ReadOptions()); @@ -1403,7 +1438,7 @@ class DeleteFilter : public CompactionFilter { class ChangeFilter : public CompactionFilter { public: - ChangeFilter(int argv) : argv_(argv) {} + explicit ChangeFilter(int argv) : argv_(argv) {} virtual bool Filter(int level, const Slice& key, const Slice& value, std::string* new_value, @@ -2970,6 +3005,9 @@ class ModelDB: public DB { Status::NotSupported("Not implemented.")); return s; } + virtual bool KeyMayExist(const Slice& key) { + return true; // Not Supported directly + } virtual Iterator* NewIterator(const ReadOptions& options) { if (options.snapshot == nullptr) { KVMap* saved = new KVMap; diff --git a/db/memtable.cc b/db/memtable.cc index c6f8f26c2..cfd2bed04 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -119,7 +119,7 @@ void MemTable::Add(SequenceNumber s, ValueType type, } bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, - const Options& options) { + const Options& options, const bool check_presence_only) { Slice memkey = key.memtable_key(); Table::Iterator iter(&table_); iter.Seek(memkey.data()); @@ -164,6 +164,10 @@ bool 
MemTable::Get(const LookupKey& key, std::string* value, Status* s, return true; } case kTypeMerge: { + if (check_presence_only) { + *s = Status::OK(); + return true; + } Slice v = GetLengthPrefixedSlice(key_ptr + key_length); if (merge_in_progress) { merge_operator->Merge(key.user_key(), &v, operand, diff --git a/db/memtable.h b/db/memtable.h index 6b3c68bf6..def3a5d3d 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -63,12 +63,13 @@ class MemTable { // If memtable contains a deletion for key, store a NotFound() error // in *status and return true. // If memtable contains Merge operation as the most recent entry for a key, - // and the merge process does not stop (not reaching a value or delete), + // and if check_presence_only is set, return true with Status::OK, + // else if the merge process does not stop (not reaching a value or delete), // store the current merged result in value and MergeInProgress in s. // return false // Else, return false. bool Get(const LookupKey& key, std::string* value, Status* s, - const Options& options); + const Options& options, const bool check_presence_only = false); // Returns the edits area that is needed for flushing the memtable VersionEdit* GetEdits() { return &edit_; } diff --git a/db/memtablelist.cc b/db/memtablelist.cc index eb427eb16..ac89d1043 100644 --- a/db/memtablelist.cc +++ b/db/memtablelist.cc @@ -194,10 +194,10 @@ size_t MemTableList::ApproximateMemoryUsage() { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. 
bool MemTableList::Get(const LookupKey& key, std::string* value, Status* s, - const Options& options ) { + const Options& options, const bool check_presence_only) { for (list::iterator it = memlist_.begin(); it != memlist_.end(); ++it) { - if ((*it)->Get(key, value, s, options)) { + if ((*it)->Get(key, value, s, options, check_presence_only)) { return true; } } diff --git a/db/memtablelist.h b/db/memtablelist.h index 31831deac..40419e56f 100644 --- a/db/memtablelist.h +++ b/db/memtablelist.h @@ -71,7 +71,7 @@ class MemTableList { // Search all the memtables starting from the most recent one. // Return the most recent value found, if any. bool Get(const LookupKey& key, std::string* value, Status* s, - const Options& options); + const Options& options, const bool check_presence_only = false); // Returns the list of underlying memtables. void GetMemTables(std::vector* list); diff --git a/db/table_cache.cc b/db/table_cache.cc index 700db74db..4cc105afe 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -112,14 +112,16 @@ Status TableCache::Get(const ReadOptions& options, const Slice& k, void* arg, bool (*saver)(void*, const Slice&, const Slice&, bool), - bool* tableIO) { + bool* tableIO, + void (*mark_key_may_exist)(void*), + const bool no_IO) { Cache::Handle* handle = nullptr; Status s = FindTable(storage_options_, file_number, file_size, &handle, tableIO); if (s.ok()) { Table* t = reinterpret_cast(cache_->Value(handle)); - s = t->InternalGet(options, k, arg, saver); + s = t->InternalGet(options, k, arg, saver, mark_key_may_exist, no_IO); cache_->Release(handle); } return s; diff --git a/db/table_cache.h b/db/table_cache.h index c3996a3cc..737e53c9e 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -48,7 +48,9 @@ class TableCache { const Slice& k, void* arg, bool (*handle_result)(void*, const Slice&, const Slice&, bool), - bool* tableIO); + bool* tableIO, + void (*mark_key_may_exist)(void*) = nullptr, + const bool no_IO = false); // Evict any entry for 
the specified file number void Evict(uint64_t file_number); diff --git a/db/version_set.cc b/db/version_set.cc index 686e82cba..19dc022f4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -244,6 +244,16 @@ struct Saver { bool didIO; // did we do any disk io? }; } + +// Called from TableCache::Get when bloom-filters can't guarantee that key does +// not exist and Get is not permitted to do IO to read the data-block and be +// certain. +// Set the key as Found and let the caller know that key-may-exist +static void MarkKeyMayExist(void* arg) { + Saver* s = reinterpret_cast(arg); + s->state = kFound; +} + static bool SaveValue(void* arg, const Slice& ikey, const Slice& v, bool didIO){ Saver* s = reinterpret_cast(arg); ParsedInternalKey parsed_key; @@ -328,7 +338,8 @@ void Version::Get(const ReadOptions& options, std::string* value, Status *status, GetStats* stats, - const Options& db_options) { + const Options& db_options, + const bool no_IO) { Slice ikey = k.internal_key(); Slice user_key = k.user_key(); const Comparator* ucmp = vset_->icmp_.user_comparator(); @@ -337,6 +348,9 @@ void Version::Get(const ReadOptions& options, auto logger = db_options.info_log; assert(status->ok() || status->IsMergeInProgress()); + if (no_IO) { + assert(status->ok()); + } Saver saver; saver.state = status->ok()? 
kNotFound : kMerge; saver.ucmp = ucmp; @@ -404,7 +418,8 @@ void Version::Get(const ReadOptions& options, FileMetaData* f = files[i]; bool tableIO = false; *status = vset_->table_cache_->Get(options, f->number, f->file_size, - ikey, &saver, SaveValue, &tableIO); + ikey, &saver, SaveValue, &tableIO, + MarkKeyMayExist, no_IO); // TODO: examine the behavior for corrupted key if (!status->ok()) { return; diff --git a/db/version_set.h b/db/version_set.h index ba924126f..2369bcc1e 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -74,7 +74,8 @@ class Version { int seek_file_level; }; void Get(const ReadOptions&, const LookupKey& key, std::string* val, - Status* status, GetStats* stats, const Options& db_option); + Status* status, GetStats* stats, const Options& db_option, + const bool no_IO = false); // Adds "stats" into the current state. Returns true if a new // compaction may need to be triggered, false otherwise. diff --git a/include/leveldb/db.h b/include/leveldb/db.h index 6d816edce..056920d9e 100644 --- a/include/leveldb/db.h +++ b/include/leveldb/db.h @@ -120,6 +120,11 @@ class DB { const std::vector& keys, std::vector* values) = 0; + // If the key definitely does not exist in the database, then this method + // returns false. Otherwise return true. This check is potentially + // lighter-weight than invoking DB::Get(). No IO is performed + virtual bool KeyMayExist(const Slice& key) = 0; + // Return a heap-allocated iterator over the contents of the database. // The result of NewIterator() is initially invalid (caller must // call one of the Seek methods on the iterator before using it). diff --git a/include/leveldb/options.h b/include/leveldb/options.h index e0c21cb7e..3341b72a2 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -465,6 +465,15 @@ struct Options { // Default: 0 uint64_t bytes_per_sync; + // Use bloom-filter for deletes when this is true. 
+ // db->Delete first calls KeyMayExist which checks memtable, immutable-memtable + // and bloom-filters to determine if the key does not exist in the database. + // If the key definitely does not exist, then the delete is a noop. KeyMayExist + // only incurs in-memory look up. This optimization avoids writing the delete + // to storage when appropriate. + // Default: false + bool deletes_check_filter_first; + }; // Options that control read operations diff --git a/include/leveldb/statistics.h b/include/leveldb/statistics.h index 155974659..4ce4f6d1b 100644 --- a/include/leveldb/statistics.h +++ b/include/leveldb/statistics.h @@ -58,7 +58,9 @@ enum Tickers { NUMBER_MULTIGET_KEYS_READ = 19, NUMBER_MULTIGET_BYTES_READ = 20, - TICKER_ENUM_MAX = 21 + NUMBER_FILTERED_DELETES = 21, + + TICKER_ENUM_MAX = 22 }; const std::vector> TickersNameMap = { @@ -82,7 +84,8 @@ const std::vector> TickersNameMap = { { NO_ITERATORS, "rocksdb.num.iterators" }, { NUMBER_MULTIGET_CALLS, "rocksdb.number.multiget.get" }, { NUMBER_MULTIGET_KEYS_READ, "rocksdb.number.multiget.keys.read" }, - { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" } + { NUMBER_MULTIGET_BYTES_READ, "rocksdb.number.multiget.bytes.read" }, + { NUMBER_FILTERED_DELETES, "rocksdb.number.deletes.filtered" } }; /** diff --git a/table/table.cc b/table/table.cc index 5aceebe0f..f7b664a4f 100644 --- a/table/table.cc +++ b/table/table.cc @@ -322,7 +322,9 @@ Iterator* Table::NewIterator(const ReadOptions& options) const { Status Table::InternalGet(const ReadOptions& options, const Slice& k, void* arg, bool (*saver)(void*, const Slice&, const Slice&, - bool)) { + bool), + void (*mark_key_may_exist)(void*), + const bool no_IO) { Status s; Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); bool done = false; @@ -338,6 +340,11 @@ Status Table::InternalGet(const ReadOptions& options, const Slice& k, // cross one data block, we should be fine.
RecordTick(rep_->options.statistics, BLOOM_FILTER_USEFUL); break; + } else if (no_IO) { + // Update Saver.state to Found because we are only looking for whether + // bloom-filter can guarantee the key is not there when "no_IO" is set + (*mark_key_may_exist)(arg); + done = true; } else { bool didIO = false; Iterator* block_iter = BlockReader(this, options, iiter->value(), diff --git a/table/table.h b/table/table.h index a657290b5..4674e262b 100644 --- a/table/table.h +++ b/table/table.h @@ -86,7 +86,9 @@ class Table { Status InternalGet( const ReadOptions&, const Slice& key, void* arg, - bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool)); + bool (*handle_result)(void* arg, const Slice& k, const Slice& v, bool), + void (*mark_key_may_exist)(void*) = nullptr, + const bool no_IO = false); void ReadMeta(const Footer& footer); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 26a0e6775..8d27c1a68 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -180,6 +180,9 @@ static uint32_t FLAGS_log2_keys_per_lock = 2; // implies 2^2 keys per lock // Percentage of times we want to purge redundant keys in memory before flushing static uint32_t FLAGS_purge_redundant_percent = 50; +// On true, deletes use bloom-filter and drop the delete if key not present +static bool FLAGS_deletes_check_filter_first = false; + // Level0 compaction start trigger static int FLAGS_level0_file_num_compaction_trigger = 0; @@ -898,8 +901,10 @@ class StressTest { fprintf(stdout, "Num times DB reopens: %d\n", FLAGS_reopen); fprintf(stdout, "Batches/snapshots : %d\n", FLAGS_test_batches_snapshots); - fprintf(stdout, "Purge redundant %% : %d\n", + fprintf(stdout, "Purge redundant %% : %d\n", FLAGS_purge_redundant_percent); + fprintf(stdout, "Deletes use filter : %d\n", + FLAGS_deletes_check_filter_first); fprintf(stdout, "Num keys per lock : %d\n", 1 << FLAGS_log2_keys_per_lock); @@ -955,6 +960,7 @@ class StressTest { options.delete_obsolete_files_period_micros =
FLAGS_delete_obsolete_files_period_micros; options.max_manifest_file_size = 1024; + options.deletes_check_filter_first = FLAGS_deletes_check_filter_first; static Random purge_percent(1000); // no benefit from non-determinism here if (purge_percent.Uniform(100) < FLAGS_purge_redundant_percent - 1) { options.purge_redundant_kvs_while_flush = false; @@ -1154,6 +1160,9 @@ int main(int argc, char** argv) { } else if (sscanf(argv[i], "--purge_redundant_percent=%d%c", &n, &junk) == 1 && (n >= 0 && n <= 100)) { FLAGS_purge_redundant_percent = n; + } else if (sscanf(argv[i], "--deletes_check_filter_first=%d%c", &n, &junk) + == 1 && (n == 0 || n == 1)) { + FLAGS_deletes_check_filter_first = n; } else { fprintf(stderr, "Invalid flag '%s'\n", argv[i]); exit(1); diff --git a/util/options.cc b/util/options.cc index 09121a0e5..1ac25845a 100644 --- a/util/options.cc +++ b/util/options.cc @@ -74,7 +74,8 @@ Options::Options() advise_random_on_open(true), access_hint_on_compaction_start(NORMAL), use_adaptive_mutex(false), - bytes_per_sync(0) { + bytes_per_sync(0), + deletes_check_filter_first(false) { } static const char* const access_hints[] = { @@ -208,6 +209,8 @@ Options::Dump(Logger* log) const use_adaptive_mutex); Log(log," Options.bytes_per_sync: %ld", bytes_per_sync); + Log(log," Options.deletes_check_filter_first: %d", + deletes_check_filter_first); } // Options::Dump // diff --git a/utilities/ttl/db_ttl.cc b/utilities/ttl/db_ttl.cc index d201208d3..eff675340 100644 --- a/utilities/ttl/db_ttl.cc +++ b/utilities/ttl/db_ttl.cc @@ -158,6 +158,10 @@ std::vector DBWithTTL::MultiGet(const ReadOptions& options, supported with TTL")); } +bool DBWithTTL::KeyMayExist(const Slice& key) { + return db_->KeyMayExist(key); +} + Status DBWithTTL::Delete(const WriteOptions& wopts, const Slice& key) { return db_->Delete(wopts, key); } diff --git a/utilities/ttl/db_ttl.h b/utilities/ttl/db_ttl.h index 2e01c1e3d..d24efbe48 100644 --- a/utilities/ttl/db_ttl.h +++ b/utilities/ttl/db_ttl.h @@ -33,6 
+33,8 @@ class DBWithTTL : public DB, CompactionFilter { const std::vector& keys, std::vector* values); + virtual bool KeyMayExist(const Slice& key); + virtual Status Delete(const WriteOptions& wopts, const Slice& key); virtual Status Merge(const WriteOptions& options, From e9b675bd94370ac15cf0ab65ba9beef7a02d904b Mon Sep 17 00:00:00 2001 From: Mayank Agarwal Date: Thu, 11 Jul 2013 14:05:31 -0700 Subject: [PATCH 8/8] Fix memory leak in KeyMayExist test part of db_test Summary: NewBloomFilterPolicy call requires Delete to be called later on Test Plan: make; valgrind ./db_test Reviewers: haobo, dhruba, vamsi Differential Revision: https://reviews.facebook.net/D11667 --- db/db_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/db/db_test.cc b/db/db_test.cc index a88b78e3c..52c6ad794 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -800,6 +800,8 @@ TEST(DBTest, KeyMayExist) { ASSERT_OK(db_->Delete(WriteOptions(), "c")); ASSERT_TRUE(!db_->KeyMayExist("c")); + + delete options.filter_policy; } while (ChangeOptions()); }