From 1fb3593f254f032cddb15d156e9366db927940f3 Mon Sep 17 00:00:00 2001 From: anand76 Date: Mon, 8 Jun 2020 16:08:31 -0700 Subject: [PATCH] Fix a bug in looking up duplicate keys with MultiGet (#6953) Summary: When MultiGet is called with duplicate keys, and the key matches the largest key in an SST file and the value type is merge, only the first instance of the duplicate key is returned with correct results. This is due to the incorrect assumption that if a key in a batch is equal to the largest key in the file, the next key cannot be present in that file. Tests: Add a new unit test Pull Request resolved: https://github.com/facebook/rocksdb/pull/6953 Reviewed By: cheng-chang Differential Revision: D21935898 Pulled By: anand1976 fbshipit-source-id: a2cc327a15150e23fd997546ca64d1c33021cb4c --- HISTORY.md | 1 + db/db_basic_test.cc | 54 +++++++++++++++++++++++++++++++++++++++++++++ db/version_set.cc | 42 ++++++++++++++++++++++++++--------- 3 files changed, 86 insertions(+), 11 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index d77b9bc7c..1ed89a76a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -14,6 +14,7 @@ * Fix potential file descriptor leakage in PosixEnv's IsDirectory() and NewRandomAccessFile(). * Fix false negative from the VerifyChecksum() API when there is a checksum mismatch in an index partition block in a BlockBasedTable format table file (index_type is kTwoLevelIndexSearch). * Fix sst_dump to return non-zero exit code if the specified file is not a recognized SST file or fails requested checks. +* Fix incorrect results from batched MultiGet for duplicate keys, when the duplicate key matches the largest key of an SST file and the value type for the key in the file is a merge value. ### Public API Change * Flush(..., column_family) may return Status::ColumnFamilyDropped() instead of Status::InvalidArgument() if column_family is dropped while processing the flush request. 
diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 50c3ba3d7..3544400ba 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -9,6 +9,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/merge_operator.h" #include "rocksdb/perf_context.h" #include "rocksdb/utilities/debug.h" #include "table/block_based/block_based_table_reader.h" @@ -17,6 +18,8 @@ #if !defined(ROCKSDB_LITE) #include "test_util/sync_point.h" #endif +#include "utilities/merge_operators.h" +#include "utilities/merge_operators/string_append/stringappend.h" namespace ROCKSDB_NAMESPACE { @@ -1340,6 +1343,57 @@ TEST_F(DBBasicTest, MultiGetBatchedSortedMultiFile) { } while (ChangeOptions()); } +TEST_F(DBBasicTest, MultiGetBatchedDuplicateKeys) { + Options opts = CurrentOptions(); + opts.merge_operator = MergeOperators::CreateStringAppendOperator(); + CreateAndReopenWithCF({"pikachu"}, opts); + SetPerfLevel(kEnableCount); + // To expand the power of this test, generate > 1 table file and + // mix with memtable + ASSERT_OK(Merge(1, "k1", "v1")); + ASSERT_OK(Merge(1, "k2", "v2")); + Flush(1); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k3", "v3")); + ASSERT_OK(Merge(1, "k4", "v4")); + Flush(1); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k4", "v4_2")); + ASSERT_OK(Merge(1, "k6", "v6")); + Flush(1); + MoveFilesToLevel(2, 1); + ASSERT_OK(Merge(1, "k7", "v7")); + ASSERT_OK(Merge(1, "k8", "v8")); + Flush(1); + MoveFilesToLevel(2, 1); + + get_perf_context()->Reset(); + + std::vector<Slice> keys({"k8", "k8", "k8", "k4", "k4", "k1", "k3"}); + std::vector<PinnableSlice> values(keys.size()); + std::vector<ColumnFamilyHandle*> cfs(keys.size(), handles_[1]); + std::vector<Status> s(keys.size()); + + db_->MultiGet(ReadOptions(), handles_[1], keys.size(), keys.data(), + values.data(), s.data(), false); + + ASSERT_EQ(values.size(), keys.size()); + ASSERT_EQ(std::string(values[0].data(), values[0].size()), "v8"); + ASSERT_EQ(std::string(values[1].data(), values[1].size()), "v8"); + 
ASSERT_EQ(std::string(values[2].data(), values[2].size()), "v8"); + ASSERT_EQ(std::string(values[3].data(), values[3].size()), "v4,v4_2"); + ASSERT_EQ(std::string(values[4].data(), values[4].size()), "v4,v4_2"); + ASSERT_EQ(std::string(values[5].data(), values[5].size()), "v1"); + ASSERT_EQ(std::string(values[6].data(), values[6].size()), "v3"); + ASSERT_EQ(24, (int)get_perf_context()->multiget_read_bytes); + + for (Status& status : s) { + ASSERT_OK(status); + } + + SetPerfLevel(kDisable); +} + TEST_F(DBBasicTest, MultiGetBatchedMultiLevel) { Options options = CurrentOptions(); options.disable_auto_compactions = true; diff --git a/db/version_set.cc b/db/version_set.cc index 7fb4f5665..ce69afe34 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -365,6 +365,7 @@ class FilePickerMultiGet { range_(range), batch_iter_(range->begin()), batch_iter_prev_(range->begin()), + upper_key_(range->begin()), maybe_repeat_key_(false), current_level_range_(*range, range->begin(), range->end()), current_file_range_(*range, range->begin(), range->end()), @@ -481,9 +482,19 @@ class FilePickerMultiGet { } if (cmp_largest == 0) { // cmp_largest is 0, which means the next key will not be in this - // file, so stop looking further. Also don't increment megt_iter_ - // as we may have to look for this key in the next file if we don't - // find it in this one + // file, so stop looking further. However, it's possible there are + // duplicates in the batch, so find the upper bound for the batch + // in this file (upper_key_) by skipping past the duplicates. 
We + // leave batch_iter_ as is since we may have to pick up from there + // for the next file, if this file has a merge value rather than + // final value + upper_key_ = batch_iter_; + ++upper_key_; + while (upper_key_ != current_level_range_.end() && + user_comparator_->Compare(batch_iter_->ukey, upper_key_->ukey) == + 0) { + ++upper_key_; + } break; } else { if (curr_level_ == 0) { @@ -503,6 +514,12 @@ class FilePickerMultiGet { *fd = f; *file_index = curr_file_index; *is_last_key_in_file = cmp_largest == 0; + if (!*is_last_key_in_file) { + // If the largest key in the batch overlapping the file is not the + // largest key in the file, upper_ley_ would not have been updated so + // update it here + upper_key_ = batch_iter_; + } return file_hit; } @@ -524,7 +541,7 @@ class FilePickerMultiGet { // file regardless for all keys not found yet if (current_level_range_.CheckKeyDone(batch_iter_) || curr_level_ == 0) { - ++batch_iter_; + batch_iter_ = upper_key_; } } // batch_iter_prev_ will become the start key for the next file @@ -544,18 +561,20 @@ class FilePickerMultiGet { &is_last_key_in_file)) { search_ended_ = !PrepareNextLevel(); } else { - MultiGetRange::Iterator upper_key = batch_iter_; if (is_last_key_in_file) { // Since cmp_largest is 0, batch_iter_ still points to the last key // that falls in this file, instead of the next one. 
Increment - // upper_key so we can set the range properly for SST MultiGet - ++upper_key; - ++(fp_ctx_array_[batch_iter_.index()].curr_index_in_curr_level); + // the file index for all keys between batch_iter_ and upper_key_ + auto tmp_iter = batch_iter_; + while (tmp_iter != upper_key_) { + ++(fp_ctx_array_[tmp_iter.index()].curr_index_in_curr_level); + ++tmp_iter; + } maybe_repeat_key_ = true; } // Set the range for this file current_file_range_ = - MultiGetRange(next_file_range, batch_iter_prev_, upper_key); + MultiGetRange(next_file_range, batch_iter_prev_, upper_key_); returned_file_level_ = curr_level_; hit_file_level_ = curr_level_; is_hit_file_last_in_level_ = @@ -607,6 +626,7 @@ class FilePickerMultiGet { // key found in the previous SST file, in order to serve as the start of // the batch key range for the next SST file MultiGetRange::Iterator batch_iter_prev_; + MultiGetRange::Iterator upper_key_; bool maybe_repeat_key_; MultiGetRange current_level_range_; MultiGetRange current_file_range_; @@ -626,7 +646,7 @@ class FilePickerMultiGet { if (fp_ctx_array_[mget_iter.index()].curr_index_in_curr_level < curr_file_level_->num_files) { batch_iter_prev_ = current_level_range_.begin(); - batch_iter_ = current_level_range_.begin(); + upper_key_ = batch_iter_ = current_level_range_.begin(); return true; } } @@ -721,7 +741,7 @@ class FilePickerMultiGet { } if (level_contains_keys) { batch_iter_prev_ = current_level_range_.begin(); - batch_iter_ = current_level_range_.begin(); + upper_key_ = batch_iter_ = current_level_range_.begin(); return true; } curr_level_++;