Ignore kBottommostFiles compaction logic when allow_ingest_behind (#10767)

Summary:
Fix for https://github.com/facebook/rocksdb/issues/10752, where RocksDB could enter an infinite compaction loop (with compaction reason kBottommostFiles) if allow_ingest_behind is enabled and the bottommost level is unfilled.
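The root cause, as the new unit test's comment notes, is that allow_ingest_behind prevents seqnum zeroing, so files compacted with reason kBottommostFiles come out still "unclean", get re-marked, and are compacted again. A minimal sketch of the configuration that exposes the bug (hypothetical path and keys; it mirrors the unit test added below):

#include <cassert>
#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // The option at the center of #10752: it reserves the bottommost level for
  // ingestion and disables seqnum zeroing there.
  options.allow_ingest_behind = true;
  rocksdb::DB* db = nullptr;
  // Hypothetical path, for illustration only.
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/ingest_behind_demo", &db);
  assert(s.ok());
  // Overwrites create obsolete key versions that bottommost-file compaction
  // would normally clean up by zeroing seqnums; with allow_ingest_behind it
  // cannot, which is what led to the endless re-marking.
  s = db->Put(rocksdb::WriteOptions(), "key", "v1");
  assert(s.ok());
  s = db->Put(rocksdb::WriteOptions(), "key", "v2");
  assert(s.ok());
  delete db;
  return 0;
}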

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10767

Test Plan: Added a unit test to reproduce the compaction loop.

Reviewed By: ajkr

Differential Revision: D40031861

Pulled By: ajkr

fbshipit-source-id: 71c4b02931fbe507a847632905404c9b8fa8c96b
Branch: main
Author: Changyu Bi (committed by Facebook GitHub Bot)
Parent: 00d697bdc5
Commit: eca47fb696
Files changed:
  1. HISTORY.md (1 line changed)
  2. db/db_compaction_test.cc (36 lines changed)
  3. db/db_impl/db_impl.cc (21 lines changed)
  4. db/db_impl/db_impl_compaction_flush.cc (8 lines changed)
  5. db/version_set.cc (8 lines changed)

HISTORY.md

@@ -10,6 +10,7 @@
 * Fixed a bug in iterator refresh which could segfault for DeleteRange users (#10739).
 * Fixed a bug causing manual flush with `flush_opts.wait=false` to stall when database has stopped all writes (#10001).
 * Fixed a bug in iterator refresh that was not freeing up SuperVersion, which could cause excessive resource pinning (#10770).
+* Fixed a bug where RocksDB could be doing compaction endlessly when allow_ingest_behind is true and the bottommost level is not filled (#10767).
 ### Performance Improvements
 * Try to align the compaction output file boundaries to the next level ones, which can reduce more than 10% compaction load for the default level compaction. The feature is enabled by default, to disable, set `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size` to false. As a side effect, it can create SSTs larger than the target_file_size (capped at 2x target_file_size) or smaller files.

db/db_compaction_test.cc

@@ -8194,6 +8194,42 @@ TEST_F(DBCompactionTest, BottomPriCompactionCountsTowardConcurrencyLimit) {
   compact_range_thread.join();
 }
+
+TEST_F(DBCompactionTest, BottommostFileCompactionAllowIngestBehind) {
+  // allow_ingest_behind prevents seqnum zeroing, and could cause
+  // compaction loop with reason kBottommostFiles.
+  Options options = CurrentOptions();
+  options.env = env_;
+  options.compaction_style = kCompactionStyleLevel;
+  options.allow_ingest_behind = true;
+  options.comparator = BytewiseComparator();
+  DestroyAndReopen(options);
+
+  WriteOptions write_opts;
+  ASSERT_OK(db_->Put(write_opts, "infinite", "compaction loop"));
+  ASSERT_OK(db_->Put(write_opts, "infinite", "loop"));
+  ASSERT_OK(Flush());
+  MoveFilesToLevel(1);
+  ASSERT_OK(db_->Put(write_opts, "bumpseqnum", ""));
+  ASSERT_OK(Flush());
+  auto snapshot = db_->GetSnapshot();
+  // Bump up oldest_snapshot_seqnum_ in VersionStorageInfo.
+  db_->ReleaseSnapshot(snapshot);
+
+  bool compacted = false;
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
+      "LevelCompactionPicker::PickCompaction:Return", [&](void* /* arg */) {
+        // There should not be a compaction.
+        compacted = true;
+      });
+  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
+  // Wait for compaction to be scheduled.
+  env_->SleepForMicroseconds(2000000);
+  ASSERT_FALSE(compacted);
+  // The following assert can be used to check for compaction loop:
+  // it used to wait forever before the fix.
+  // ASSERT_OK(dbfull()->TEST_WaitForCompact(true /* wait_unscheduled */));
+}
 #endif  // !defined(ROCKSDB_LITE)
 }  // namespace ROCKSDB_NAMESPACE
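If helpful, the new test can be run on its own via the standard gtest filter flag (binary name per the usual RocksDB build targets):

./db_compaction_test --gtest_filter=DBCompactionTest.BottommostFileCompactionAllowIngestBehind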

db/db_impl/db_impl.cc

@@ -3674,14 +3674,16 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
     if (oldest_snapshot > bottommost_files_mark_threshold_) {
       CfdList cf_scheduled;
       for (auto* cfd : *versions_->GetColumnFamilySet()) {
-        cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
-        if (!cfd->current()
-                 ->storage_info()
-                 ->BottommostFilesMarkedForCompaction()
-                 .empty()) {
-          SchedulePendingCompaction(cfd);
-          MaybeScheduleFlushOrCompaction();
-          cf_scheduled.push_back(cfd);
+        if (!cfd->ioptions()->allow_ingest_behind) {
+          cfd->current()->storage_info()->UpdateOldestSnapshot(oldest_snapshot);
+          if (!cfd->current()
+                   ->storage_info()
+                   ->BottommostFilesMarkedForCompaction()
+                   .empty()) {
+            SchedulePendingCompaction(cfd);
+            MaybeScheduleFlushOrCompaction();
+            cf_scheduled.push_back(cfd);
+          }
         }
       }

@@ -3690,7 +3692,8 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) {
       // mutex might be unlocked during the loop, making the result inaccurate.
       SequenceNumber new_bottommost_files_mark_threshold = kMaxSequenceNumber;
       for (auto* cfd : *versions_->GetColumnFamilySet()) {
-        if (CfdListContains(cf_scheduled, cfd)) {
+        if (CfdListContains(cf_scheduled, cfd) ||
+            cfd->ioptions()->allow_ingest_behind) {
           continue;
         }
         new_bottommost_files_mark_threshold = std::min(
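Taken together, these two hunks make ReleaseSnapshot() ignore allow_ingest_behind column families on both sides: they are neither scheduled for bottommost-file compactions nor allowed to lower the mark threshold. A simplified restatement of the post-fix threshold recomputation (a sketch only; the cf_scheduled bookkeeping is omitted):

// Recompute the global mark threshold, skipping CFs that can never have
// bottommost files marked for compaction.
SequenceNumber new_threshold = kMaxSequenceNumber;
for (auto* cfd : *versions_->GetColumnFamilySet()) {
  if (cfd->ioptions()->allow_ingest_behind) {
    continue;  // never marks bottommost files, so it must not lower the bound
  }
  new_threshold = std::min(
      new_threshold,
      cfd->current()->storage_info()->bottommost_files_mark_threshold());
}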

db/db_impl/db_impl_compaction_flush.cc

@@ -3772,9 +3772,11 @@ void DBImpl::InstallSuperVersionAndScheduleWork(
   // triggered soon anyway.
   bottommost_files_mark_threshold_ = kMaxSequenceNumber;
   for (auto* my_cfd : *versions_->GetColumnFamilySet()) {
-    bottommost_files_mark_threshold_ = std::min(
-        bottommost_files_mark_threshold_,
-        my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+    if (!my_cfd->ioptions()->allow_ingest_behind) {
+      bottommost_files_mark_threshold_ = std::min(
+          bottommost_files_mark_threshold_,
+          my_cfd->current()->storage_info()->bottommost_files_mark_threshold());
+    }
   }
   // Whenever we install new SuperVersion, we might need to issue new flushes or

db/version_set.cc

@@ -2915,7 +2915,9 @@ void VersionStorageInfo::PrepareForVersionAppend(
   GenerateFileIndexer();
   GenerateLevelFilesBrief();
   GenerateLevel0NonOverlapping();
-  GenerateBottommostFiles();
+  if (!immutable_options.allow_ingest_behind) {
+    GenerateBottommostFiles();
+  }
   GenerateFileLocationIndex();
 }

@@ -3355,7 +3357,9 @@ void VersionStorageInfo::ComputeCompactionScore(
     }
   }
   ComputeFilesMarkedForCompaction();
-  ComputeBottommostFilesMarkedForCompaction();
+  if (!immutable_options.allow_ingest_behind) {
+    ComputeBottommostFilesMarkedForCompaction();
+  }
   if (mutable_cf_options.ttl > 0) {
     ComputeExpiredTtlFiles(immutable_options, mutable_cf_options.ttl);
   }
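For background on why marking is skipped at the source: allow_ingest_behind keeps the bottommost level reserved so external files can later be placed underneath all live data, roughly like this (a hedged sketch; the path is illustrative and db is assumed to be a handle opened with allow_ingest_behind=true):

// Ingest an SST file beneath all existing data; this is the feature that
// requires reserving the bottommost level and disabling seqnum zeroing there.
rocksdb::IngestExternalFileOptions ifo;
ifo.ingest_behind = true;
rocksdb::Status s = db->IngestExternalFile({"/path/to/external.sst"}, ifo);

Since seqnum zeroing never happens for such column families, a kBottommostFiles compaction can never make the marked files "clean", so the fix stops marking them in the first place.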
