Ignore kBottommostFiles compaction logic when allow_ingest_behind (#10767)

Summary:
Fix for https://github.com/facebook/rocksdb/issues/10752, where RocksDB could enter an infinite compaction loop (with compaction reason kBottommostFiles) if allow_ingest_behind is enabled and the bottommost level is unfilled.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10767

Test Plan: Added a unit test to reproduce the compaction loop.

Reviewed By: ajkr

Differential Revision: D40031861

Pulled By: ajkr

fbshipit-source-id: 71c4b02931fbe507a847632905404c9b8fa8c96b
main
Changyu Bi 2 years ago committed by Facebook GitHub Bot
parent 00d697bdc5
commit eca47fb696
  1. 1
      HISTORY.md
  2. 36
      db/db_compaction_test.cc
  3. 5
      db/db_impl/db_impl.cc
  4. 2
      db/db_impl/db_impl_compaction_flush.cc
  5. 4
      db/version_set.cc

@ -10,6 +10,7 @@
* Fixed a bug in iterator refresh which could segfault for DeleteRange users (#10739). * Fixed a bug in iterator refresh which could segfault for DeleteRange users (#10739).
* Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001). * Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001).
* Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinning (#10770). * Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinning (#10770).
* Fixed a bug where RocksDB could be doing compaction endlessly when allow_ingest_behind is true and the bottommost level is not filled (#10767).
### Performance Improvements ### Performance Improvements
* Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files. * Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files.

@ -8194,6 +8194,42 @@ TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
compact_range_thread.join(); compact_range_thread.join();
} }
TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) {
  // With allow_ingest_behind set, sequence numbers are never zeroed during
  // compaction, which used to trigger an endless compaction loop with
  // reason kBottommostFiles (issue #10752). This test checks that no
  // compaction is scheduled in that configuration.
  Options opts = CurrentOptions();
  opts.env = env_;
  opts.compaction_style = kCompactionStyleLevel;
  opts.allow_ingest_behind = true;
  opts.comparator = BytewiseComparator();
  DestroyAndReopen(opts);

  WriteOptions wo;
  ASSERT_OK(db_->Put(wo, "infinite", "compaction loop"));
  ASSERT_OK(db_->Put(wo, "infinite", "loop"));
  ASSERT_OK(Flush());
  MoveFilesToLevel(1);
  ASSERT_OK(db_->Put(wo, "bumpseqnum", ""));
  ASSERT_OK(Flush());

  // Taking and releasing a snapshot bumps oldest_snapshot_seqnum_ in
  // VersionStorageInfo, which is what previously kicked off the loop.
  const Snapshot* snap = db_->GetSnapshot();
  db_->ReleaseSnapshot(snap);

  bool saw_compaction = false;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "LevelCompactionPicker::PickCompaction:Return", [&](void* /* arg */) {
        // No compaction should ever be picked for this DB.
        saw_compaction = true;
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  // Give the background scheduler time to (incorrectly) pick a compaction.
  env_->SleepForMicroseconds(2000000);
  ASSERT_FALSE(saw_compaction);
  // To observe the old compaction loop directly, one can instead wait:
  // it used to block forever before the fix.
  // ASSERT_OK(dbfull()->TEST_WaitForCompact(true /* wait_unscheduled */));
}
#endif // !defined(ROCKSDB_LITE) #endif // !defined(ROCKSDB_LITE)
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

@ -3674,6 +3674,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
if (oldest_snapshot > bottommost_files_mark_threshold_) { if (oldest_snapshot > bottommost_files_mark_threshold_) {
CfdList cf_scheduled; CfdList cf_scheduled;
for (auto* cfd : *versions_->GetColumnFamilySet()) { for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (!cfd->ioptions()->allow_ingest_behind) {
cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot); cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
if (!cfd->current() if (!cfd->current()
->storage_info() ->storage_info()
@ -3684,13 +3685,15 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
cf_scheduled.push_back(cfd); cf_scheduled.push_back(cfd);
} }
} }
}
// Calculate a new threshold, skipping those CFs where compactions are // Calculate a new threshold, skipping those CFs where compactions are
// scheduled. We do not do the same pass as the previous loop because // scheduled. We do not do the same pass as the previous loop because
// mutex might be unlocked during the loop, making the result inaccurate. // mutex might be unlocked during the loop, making the result inaccurate.
SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber; SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
for (auto* cfd : *versions_->GetColumnFamilySet()) { for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (CfdListContains(cf_scheduled, cfd)) { if (CfdListContains(cf_scheduled, cfd) ||
cfd->ioptions()->allow_ingest_behind) {
continue; continue;
} }
new_bottommost_files_mark_threshold = std::min( new_bottommost_files_mark_threshold = std::min(

@ -3772,10 +3772,12 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
// triggered soon anyway. // triggered soon anyway.
bottommost_files_mark_threshold_ = kMaxSequenceNumber; bottommost_files_mark_threshold_ = kMaxSequenceNumber;
for (auto* my_cfd : *versions_->GetColumnFamilySet()) { for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
if (!my_cfd->ioptions()->allow_ingest_behind) {
bottommost_files_mark_threshold_ = std::min( bottommost_files_mark_threshold_ = std::min(
bottommost_files_mark_threshold_, bottommost_files_mark_threshold_,
my_cfd->current()->storage_info()->bottommost_files_mark_threshold()); my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
} }
}
// Whenever we install new SuperVersion, we might need to issue new flushes or // Whenever we install new SuperVersion, we might need to issue new flushes or
// compactions. // compactions.

@ -2915,7 +2915,9 @@ void VersionStorageInfo::PrepareForVersionAppend(
GenerateFileIndexer(); GenerateFileIndexer();
GenerateLevelFilesBrief(); GenerateLevelFilesBrief();
GenerateLevel0NonOverlapping(); GenerateLevel0NonOverlapping();
if (!immutable_options.allow_ingest_behind) {
GenerateBottommostFiles(); GenerateBottommostFiles();
}
GenerateFileLocationIndex(); GenerateFileLocationIndex();
} }
@ -3355,7 +3357,9 @@ void VersionStorageInfo::ComputeCompactionScore(
} }
} }
ComputeFilesMarkedForCompaction(); ComputeFilesMarkedForCompaction();
if (!immutable_options.allow_ingest_behind) {
ComputeBottommostFilesMarkedForCompaction(); ComputeBottommostFilesMarkedForCompaction();
}
if (mutable_cf_options.ttl > 0) { if (mutable_cf_options.ttl > 0) {
ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
} }

Loading…
Cancel
Save