Ignore kBottommostFiles compaction logic when allow_ingest_behind (#10767)

Summary:
Fix for https://github.com/facebook/rocksdb/issues/10752, where RocksDB could enter an infinite compaction loop (with compaction reason kBottommostFiles) if allow_ingest_behind is enabled and the bottommost level is unfilled.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10767

Test Plan: Added a unit test to reproduce the compaction loop.

Reviewed By: ajkr

Differential Revision: D40031861

Pulled By: ajkr

fbshipit-source-id: 71c4b02931fbe507a847632905404c9b8fa8c96b
main
Changyu Bi 2 years ago committed by Facebook GitHub Bot
parent 00d697bdc5
commit eca47fb696
  1. 1
      HISTORY.md
  2. 36
      db/db_compaction_test.cc
  3. 5
      db/db_impl/db_impl.cc
  4. 2
      db/db_impl/db_impl_compaction_flush.cc
  5. 4
      db/version_set.cc

@ -10,6 +10,7 @@
* Fixed a bug in iterator refresh which could segfault for DeleteRange users (#10739). * Fixed a bug in iterator refresh which could segfault for DeleteRange users (#10739).
* Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001). * Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001).
* Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinning (#10770). * Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinning (#10770).
* Fixed a bug where RocksDB could be doing compaction endlessly when allow_ingest_behind is true and the bottommost level is not filled (#10767).
### Performance Improvements ### Performance Improvements
* Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files. * Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files.

@ -8194,6 +8194,42 @@ TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
compact_range_thread.join(); compact_range_thread.join();
} }
TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) {
  // With allow_ingest_behind set, sequence numbers are never zeroed during
  // compaction, which used to trigger an endless compaction loop with
  // reason kBottommostFiles (issue #10752). This test checks that no
  // compaction is scheduled in that configuration.
  Options opts = CurrentOptions();
  opts.env = env_;
  opts.compaction_style = kCompactionStyleLevel;
  opts.allow_ingest_behind = true;
  opts.comparator = BytewiseComparator();
  DestroyAndReopen(opts);

  WriteOptions wo;
  ASSERT_OK(db_->Put(wo, "infinite", "compaction loop"));
  ASSERT_OK(db_->Put(wo, "infinite", "loop"));
  ASSERT_OK(Flush());
  MoveFilesToLevel(1);
  ASSERT_OK(db_->Put(wo, "bumpseqnum", ""));
  ASSERT_OK(Flush());

  // Taking and releasing a snapshot bumps oldest_snapshot_seqnum_ in
  // VersionStorageInfo, which is what previously kicked off the loop.
  const Snapshot* snap = db_->GetSnapshot();
  db_->ReleaseSnapshot(snap);

  bool saw_compaction = false;
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
      "LevelCompactionPicker::PickCompaction:Return", [&](void* /* arg */) {
        // No compaction should ever be picked for this DB.
        saw_compaction = true;
      });
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
  // Give the background scheduler time to (incorrectly) pick a compaction.
  env_->SleepForMicroseconds(2000000);
  ASSERT_FALSE(saw_compaction);
  // To observe the old compaction loop directly, one can instead wait:
  // it used to block forever before the fix.
  // ASSERT_OK(dbfull()->TEST_WaitForCompact(true /* wait_unscheduled */));
}
#endif // !defined(ROCKSDB_LITE) #endif // !defined(ROCKSDB_LITE)
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

@ -3674,6 +3674,7 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
if (oldest_snapshot > bottommost_files_mark_threshold_) { if (oldest_snapshot > bottommost_files_mark_threshold_) {
CfdList cf_scheduled; CfdList cf_scheduled;
for (auto* cfd : *versions_->GetColumnFamilySet()) { for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (!cfd->ioptions()->allow_ingest_behind) {
cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot); cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
if (!cfd->current() if (!cfd->current()
->storage_info() ->storage_info()
@ -3684,13 +3685,15 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
cf_scheduled.push_back(cfd); cf_scheduled.push_back(cfd);
} }
} }
}
// Calculate a new threshold, skipping those CFs where compactions are // Calculate a new threshold, skipping those CFs where compactions are
// scheduled. We do not do the same pass as the previous loop because // scheduled. We do not do the same pass as the previous loop because
// mutex might be unlocked during the loop, making the result inaccurate. // mutex might be unlocked during the loop, making the result inaccurate.
SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber; SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
for (auto* cfd : *versions_->GetColumnFamilySet()) { for (auto* cfd : *versions_->GetColumnFamilySet()) {
if (CfdListContains(cf_scheduled, cfd)) { if (CfdListContains(cf_scheduled, cfd) ||
cfd->ioptions()->allow_ingest_behind) {
continue; continue;
} }
new_bottommost_files_mark_threshold = std::min( new_bottommost_files_mark_threshold = std::min(

@ -3772,10 +3772,12 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
// triggered soon anyway. // triggered soon anyway.
bottommost_files_mark_threshold_ = kMaxSequenceNumber; bottommost_files_mark_threshold_ = kMaxSequenceNumber;
for (auto* my_cfd : *versions_->GetColumnFamilySet()) { for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
if (!my_cfd->ioptions()->allow_ingest_behind) {
bottommost_files_mark_threshold_ = std::min( bottommost_files_mark_threshold_ = std::min(
bottommost_files_mark_threshold_, bottommost_files_mark_threshold_,
my_cfd->current()->storage_info()->bottommost_files_mark_threshold()); my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
} }
}
// Whenever we install new SuperVersion, we might need to issue new flushes or // Whenever we install new SuperVersion, we might need to issue new flushes or
// compactions. // compactions.

@ -2915,7 +2915,9 @@ void VersionStorageInfo::PrepareForVersionAppend(
GenerateFileIndexer(); GenerateFileIndexer();
GenerateLevelFilesBrief(); GenerateLevelFilesBrief();
GenerateLevel0NonOverlapping(); GenerateLevel0NonOverlapping();
if (!immutable_options.allow_ingest_behind) {
GenerateBottommostFiles(); GenerateBottommostFiles();
}
GenerateFileLocationIndex(); GenerateFileLocationIndex();
} }
@ -3355,7 +3357,9 @@ void VersionStorageInfo::ComputeCompactionScore(
} }
} }
ComputeFilesMarkedForCompaction(); ComputeFilesMarkedForCompaction();
if (!immutable_options.allow_ingest_behind) {
ComputeBottommostFilesMarkedForCompaction(); ComputeBottommostFilesMarkedForCompaction();
}
if (mutable_cf_options.ttl > 0) { if (mutable_cf_options.ttl > 0) {
ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl); ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
} }

Loading…
Cancel
Save