From 205e5776945398c623089e269cfef94d2e455b20 Mon Sep 17 00:00:00 2001
From: Yanqin Jin
Date: Fri, 11 Sep 2020 17:44:06 -0700
Subject: [PATCH] Cancel tombstone skipping during bottommost compaction (#7356)

Summary:
During bottommost compaction, RocksDB cannot simply drop a tombstone if this
tombstone is not in the earliest snapshot. In that case, RocksDB skips the
other internal keys (of the same user key) that fall in the same snapshot
range. While skipping these keys, RocksDB should also check the
`shutting_down` flag; otherwise, a bottommost compaction that has already
started running can take a long time to finish even after the application has
tried to cancel all background jobs.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7356

Test Plan: make check

Reviewed By: ltamasi

Differential Revision: D23663241

Pulled By: riversand963

fbshipit-source-id: 25f8e9b51bc3bfa3353cdf87557800f9d90ee0b5
---
 HISTORY.md                           |  1 +
 db/compaction/compaction_iterator.cc | 14 +++++++-------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index 1f0a3e6c4..1eb16b4a8 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -11,6 +11,7 @@
 * Avoid converting MERGES to PUTS when allow_ingest_behind is true.
 * Fix compression dictionary sampling together with `SstFileWriter`. Previously, the dictionary would be trained/finalized immediately with zero samples. Now, the whole `SstFileWriter` file is buffered in memory and then sampled.
 * Fix a bug with `avoid_unnecessary_blocking_io=1` and creating backups (BackupEngine::CreateNewBackup) or checkpoints (Checkpoint::Create). With this setting and WAL enabled, these operations could randomly fail with non-OK status.
+* Fix a bug in which bottommost compaction continues to advance the underlying InternalIterator to skip tombstones even after shutdown.
 
 ### New Features
 * A new option `std::shared_ptr file_checksum_gen_factory` is added to `BackupableDBOptions`. The default value for this option is `nullptr`. If this option is null, the default backup engine checksum function (crc32c) will be used for creating, verifying, or restoring backups. If it is not null and is set to the DB custom checksum factory, the custom checksum function used in DB will also be used for creating, verifying, or restoring backups, in addition to the default checksum function (crc32c). If it is not null and is set to a custom checksum factory different than the DB custom checksum factory (which may be null), BackupEngine will return `Status::InvalidArgument()`.
diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc
index cc1cd6e96..9076a256a 100644
--- a/db/compaction/compaction_iterator.cc
+++ b/db/compaction/compaction_iterator.cc
@@ -144,8 +144,7 @@ void CompactionIterator::Next() {
     if (merge_out_iter_.Valid()) {
       key_ = merge_out_iter_.key();
       value_ = merge_out_iter_.value();
-      bool valid_key __attribute__((__unused__));
-      valid_key = ParseInternalKey(key_, &ikey_);
+      bool valid_key = ParseInternalKey(key_, &ikey_);
       // MergeUntil stops when it encounters a corrupt key and does not
       // include them in the result, so we expect the keys here to be valid.
       assert(valid_key);
@@ -352,8 +351,7 @@ void CompactionIterator::NextFromInput() {
       // If there are no snapshots, then this kv affect visibility at tip.
       // Otherwise, search though all existing snapshots to find the earliest
       // snapshot that is affected by this kv.
-      SequenceNumber last_sequence __attribute__((__unused__));
-      last_sequence = current_user_key_sequence_;
+      SequenceNumber last_sequence = current_user_key_sequence_;
       current_user_key_sequence_ = ikey_.sequence;
       SequenceNumber last_snapshot = current_user_key_snapshot_;
       SequenceNumber prev_snapshot = 0;  // 0 means no previous snapshot
@@ -565,11 +563,14 @@ void CompactionIterator::NextFromInput() {
       // Handle the case where we have a delete key at the bottom most level
       // We can skip outputting the key iff there are no subsequent puts for this
       // key
+      assert(!compaction_ || compaction_->KeyNotExistsBeyondOutputLevel(
+                                 ikey_.user_key, &level_ptrs_));
       ParsedInternalKey next_ikey;
       input_->Next();
       // Skip over all versions of this key that happen to occur in the same snapshot
       // range as the delete
-      while (input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
+      while (!IsPausingManualCompaction() && !IsShuttingDown() &&
+             input_->Valid() && ParseInternalKey(input_->key(), &next_ikey) &&
              cmp_->Equal(ikey_.user_key, next_ikey.user_key) &&
              (prev_snapshot == 0 ||
               DEFINITELY_NOT_IN_SNAPSHOT(next_ikey.sequence, prev_snapshot))) {
@@ -606,8 +607,7 @@ void CompactionIterator::NextFromInput() {
       // These will be correctly set below.
       key_ = merge_out_iter_.key();
       value_ = merge_out_iter_.value();
-      bool valid_key __attribute__((__unused__));
-      valid_key = ParseInternalKey(key_, &ikey_);
+      bool valid_key = ParseInternalKey(key_, &ikey_);
       // MergeUntil stops when it encounters a corrupt key and does not
       // include them in the result, so we expect the keys here to valid.
       assert(valid_key);
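
To illustrate the fix, here is a minimal, self-contained sketch of the pattern the new loop condition applies: the tombstone-skipping loop re-reads a shared shutdown flag on every iteration, so a compaction that has been cancelled stops advancing the iterator instead of walking the entire run of shadowed keys. The `InternalEntry`, `SimpleIter`, and `SkipShadowedVersions` names below are hypothetical stand-ins for illustration only; they are not RocksDB's InternalIterator or CompactionIterator internals.

// Illustrative sketch only: the iterator and flag types are hypothetical
// stand-ins, not RocksDB classes.
#include <atomic>
#include <cstdint>
#include <string>
#include <vector>

struct InternalEntry {
  std::string user_key;
  uint64_t sequence;
};

// Hypothetical forward iterator over sorted internal entries.
struct SimpleIter {
  const std::vector<InternalEntry>* entries = nullptr;
  size_t pos = 0;
  bool Valid() const { return entries != nullptr && pos < entries->size(); }
  const InternalEntry& entry() const { return (*entries)[pos]; }
  void Next() { ++pos; }
};

// Skip the remaining versions of `user_key` (the entries a tombstone
// shadows). Returns false once shutdown has been requested.
bool SkipShadowedVersions(SimpleIter& it, const std::string& user_key,
                          const std::atomic<bool>& shutting_down) {
  // Mirrors the patched while-condition: the shutdown flag is re-read on
  // every iteration, so a cancelled job bails out of the skip promptly.
  while (!shutting_down.load(std::memory_order_acquire) && it.Valid() &&
         it.entry().user_key == user_key) {
    it.Next();
  }
  return !shutting_down.load(std::memory_order_acquire);
}

The patched condition in compaction_iterator.cc additionally tests IsPausingManualCompaction(), which serves the same early-out purpose when a manual compaction has been paused.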