From f89423a57a78e28de68c9a618a6e5f2f00ec5270 Mon Sep 17 00:00:00 2001 From: Jay Zhuang Date: Tue, 22 Jun 2021 11:09:11 -0700 Subject: [PATCH] Revert "Revert "Snapshot release triggered compaction without multiple tombstones (#8357)" (#8410)" (#8438) Summary: This reverts commit 25be1ed66a354ee1d665d7a28eb20d36afc75e90. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8438 Test Plan: Run the impacted mysql test 40 times Reviewed By: ajkr Differential Revision: D29286247 Pulled By: jay-zhuang fbshipit-source-id: d3bd056971a19a8b012d5d0295fa045c012b3c04 --- HISTORY.md | 5 ++++- db/version_set.cc | 3 +-- utilities/blob_db/blob_db_test.cc | 12 ++++++++++-- .../write_prepared_transaction_test.cc | 18 ++++++++++++++++++ 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 6463f5e04..ba7ef92de 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,9 +1,12 @@ # Rocksdb Change Log +## Unreleased +### Behavior Changes +* Obsolete keys in the bottommost level that were preserved for a snapshot will now be cleaned upon snapshot release in all cases. This form of compaction (snapshot release triggered compaction) previously had an artificial limitation that multiple tombstones needed to be present. + ## 6.22.0 (2021-06-18) ### Behavior Changes * Added two additional tickers, MEMTABLE_PAYLOAD_BYTES_AT_FLUSH and MEMTABLE_GARBAGE_BYTES_AT_FLUSH. These stats can be used to estimate the ratio of "garbage" (outdated) bytes in the memtable that are discarded at flush time. * Added API comments clarifying safe usage of Disable/EnableManualCompaction and EventListener callbacks for compaction. - ### Bug Fixes * fs_posix.cc GetFreeSpace() always report disk space available to root even when running as non-root. Linux defaults often have disk mounts with 5 to 10 percent of total space reserved only for root. Out of space could result for non-root users. * Subcompactions are now disabled when user-defined timestamps are used, since the subcompaction boundary picking logic is currently not timestamp-aware, which could lead to incorrect results when different subcompactions process keys that only differ by timestamp. diff --git a/db/version_set.cc b/db/version_set.cc index 44e7462c8..1360a0834 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3049,8 +3049,7 @@ void VersionStorageInfo::ComputeBottommostFilesMarkedForCompaction() { bottommost_files_mark_threshold_ = kMaxSequenceNumber; for (auto& level_and_file : bottommost_files_) { if (!level_and_file.second->being_compacted && - level_and_file.second->fd.largest_seqno != 0 && - level_and_file.second->num_deletions > 1) { + level_and_file.second->fd.largest_seqno != 0) { // largest_seqno might be nonzero due to containing the final key in an // earlier compaction, whose seqnum we didn't zero out. Multiple deletions // ensures the file really contains deleted or overwritten keys. diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc index 652b082c8..84401d6bb 100644 --- a/utilities/blob_db/blob_db_test.cc +++ b/utilities/blob_db/blob_db_test.cc @@ -571,7 +571,6 @@ TEST_F(BlobDBTest, EnableDisableCompressionGC) { Random rnd(301); BlobDBOptions bdb_options; bdb_options.min_blob_size = 0; - bdb_options.enable_garbage_collection = true; bdb_options.garbage_collection_cutoff = 1.0; bdb_options.disable_background_tasks = true; bdb_options.compression = kSnappyCompression; @@ -600,6 +599,11 @@ TEST_F(BlobDBTest, EnableDisableCompressionGC) { ASSERT_EQ(2, blob_files.size()); ASSERT_EQ(kNoCompression, blob_files[1]->GetCompressionType()); + // Enable GC. If we do it earlier the snapshot release triggered compaction + // may compact files and trigger GC before we can verify there are two files. + bdb_options.enable_garbage_collection = true; + Reopen(bdb_options); + // Trigger compaction ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); blob_db_impl()->TEST_DeleteObsoleteFiles(); @@ -638,7 +642,6 @@ TEST_F(BlobDBTest, ChangeCompressionGC) { Random rnd(301); BlobDBOptions bdb_options; bdb_options.min_blob_size = 0; - bdb_options.enable_garbage_collection = true; bdb_options.garbage_collection_cutoff = 1.0; bdb_options.disable_background_tasks = true; bdb_options.compression = kLZ4Compression; @@ -668,6 +671,11 @@ TEST_F(BlobDBTest, ChangeCompressionGC) { ASSERT_EQ(2, blob_files.size()); ASSERT_EQ(kSnappyCompression, blob_files[1]->GetCompressionType()); + // Enable GC. If we do it earlier the snapshot release triggered compaction + // may compact files and trigger GC before we can verify there are two files. + bdb_options.enable_garbage_collection = true; + Reopen(bdb_options); + ASSERT_OK(blob_db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); VerifyDB(data); diff --git a/utilities/transactions/write_prepared_transaction_test.cc b/utilities/transactions/write_prepared_transaction_test.cc index a63374f61..06abab792 100644 --- a/utilities/transactions/write_prepared_transaction_test.cc +++ b/utilities/transactions/write_prepared_transaction_test.cc @@ -2587,6 +2587,7 @@ TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction) { const size_t snapshot_cache_bits = 7; // same as default const size_t commit_cache_bits = 0; // minimum commit cache UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + options.disable_auto_compactions = true; ASSERT_OK(ReOpen()); ASSERT_OK(db->Put(WriteOptions(), "key1", "value1_1")); @@ -2606,7 +2607,13 @@ TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction) { VerifyKeys({{"key1", "value1_1"}}, snapshot2); // Add a flush to avoid compaction to fallback to trivial move. + // The callback might be called twice, record the calling state to + // prevent double calling. + bool callback_finished = false; auto callback = [&](void*) { + if (callback_finished) { + return; + } // Release snapshot1 after CompactionIterator init. // CompactionIterator need to figure out the earliest snapshot // that can see key1:value1_2 is kMaxSequenceNumber, not @@ -2615,6 +2622,7 @@ TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction) { // Add some keys to advance max_evicted_seq. ASSERT_OK(db->Put(WriteOptions(), "key3", "value3")); ASSERT_OK(db->Put(WriteOptions(), "key4", "value4")); + callback_finished = true; }; SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit", callback); @@ -2636,6 +2644,7 @@ TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction2) { const size_t snapshot_cache_bits = 7; // same as default const size_t commit_cache_bits = 0; // minimum commit cache UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + options.disable_auto_compactions = true; ASSERT_OK(ReOpen()); ASSERT_OK(db->Put(WriteOptions(), "key1", "value1")); @@ -2686,6 +2695,7 @@ TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction3) { const size_t snapshot_cache_bits = 7; // same as default const size_t commit_cache_bits = 1; // commit cache size = 2 UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + options.disable_auto_compactions = true; ASSERT_OK(ReOpen()); // Add a dummy key to evict v2 commit cache, but keep v1 commit cache. @@ -2715,11 +2725,18 @@ TEST_P(WritePreparedTransactionTest, ReleaseSnapshotDuringCompaction3) { add_dummy(); auto* s2 = db->GetSnapshot(); + // The callback might be called twice, record the calling state to + // prevent double calling. + bool callback_finished = false; auto callback = [&](void*) { + if (callback_finished) { + return; + } db->ReleaseSnapshot(s1); // Add some dummy entries to trigger s1 being cleanup from old_commit_map. add_dummy(); add_dummy(); + callback_finished = true; }; SyncPoint::GetInstance()->SetCallBack("CompactionIterator:AfterInit", callback); @@ -2737,6 +2754,7 @@ TEST_P(WritePreparedTransactionTest, ReleaseEarliestSnapshotDuringCompaction) { const size_t snapshot_cache_bits = 7; // same as default const size_t commit_cache_bits = 0; // minimum commit cache UpdateTransactionDBOptions(snapshot_cache_bits, commit_cache_bits); + options.disable_auto_compactions = true; ASSERT_OK(ReOpen()); ASSERT_OK(db->Put(WriteOptions(), "key1", "value1"));