From 0980dc6c9abe263531cb313af56c5018b8f4b331 Mon Sep 17 00:00:00 2001 From: Huachao Huang Date: Tue, 29 Aug 2017 18:27:21 -0700 Subject: [PATCH] Fix wrong smallest key of delete range tombstones Summary: Since tombstones are not stored in order, we may get a wrong smallest key if we only consider the first added tombstone. Check https://github.com/facebook/rocksdb/issues/2752 for more details. Closes https://github.com/facebook/rocksdb/pull/2799 Differential Revision: D5728217 Pulled By: ajkr fbshipit-source-id: 4a53edb0ca80d2a9fcf10749e52d47d57d6417d3 --- db/db_range_del_test.cc | 34 ++++++++++++++++++++++++++++++++++ db/range_del_aggregator.cc | 4 ++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/db/db_range_del_test.cc b/db/db_range_del_test.cc index dbc27e870..982cbb85a 100644 --- a/db/db_range_del_test.cc +++ b/db/db_range_del_test.cc @@ -962,6 +962,40 @@ TEST_F(DBRangeDelTest, CompactionTreatsSplitInputLevelDeletionAtomically) { } } +TEST_F(DBRangeDelTest, UnorderedTombstones) { + // Regression test for #2752. Range delete tombstones between + // different snapshot stripes are not stored in order, so the first + // tombstone of each snapshot stripe should be checked as a smallest + // candidate. + Options options = CurrentOptions(); + DestroyAndReopen(options); + + auto cf = db_->DefaultColumnFamily(); + + ASSERT_OK(db_->Put(WriteOptions(), cf, "a", "a")); + ASSERT_OK(db_->Flush(FlushOptions(), cf)); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_EQ(1, NumTableFilesAtLevel(1)); + + ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "b", "c")); + // Hold a snapshot to separate these two delete ranges. + auto snapshot = db_->GetSnapshot(); + ASSERT_OK(db_->DeleteRange(WriteOptions(), cf, "a", "b")); + ASSERT_OK(db_->Flush(FlushOptions(), cf)); + db_->ReleaseSnapshot(snapshot); + + std::vector> files; + dbfull()->TEST_GetFilesMetaData(cf, &files); + ASSERT_EQ(1, files[0].size()); + ASSERT_EQ("a", files[0][0].smallest.user_key()); + ASSERT_EQ("c", files[0][0].largest.user_key()); + + std::string v; + auto s = db_->Get(ReadOptions(), "a", &v); + ASSERT_TRUE(s.IsNotFound()); +} + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/db/range_del_aggregator.cc b/db/range_del_aggregator.cc index cb51ea7f8..c83f5a88c 100644 --- a/db/range_del_aggregator.cc +++ b/db/range_del_aggregator.cc @@ -413,8 +413,8 @@ void RangeDelAggregator::AddToBuilder( // Note the order in which tombstones are stored is insignificant since we // insert them into a std::map on the read path. - bool first_added = false; while (stripe_map_iter != rep_->stripe_map_.end()) { + bool first_added = false; for (auto tombstone_map_iter = stripe_map_iter->second.raw_map.begin(); tombstone_map_iter != stripe_map_iter->second.raw_map.end(); ++tombstone_map_iter) { @@ -453,7 +453,7 @@ void RangeDelAggregator::AddToBuilder( builder->Add(ikey_and_end_key.first.Encode(), ikey_and_end_key.second); if (!first_added) { first_added = true; - InternalKey smallest_candidate = std::move(ikey_and_end_key.first);; + InternalKey smallest_candidate = std::move(ikey_and_end_key.first); if (lower_bound != nullptr && icmp_.user_comparator()->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) {