diff --git a/HISTORY.md b/HISTORY.md index 2819c69a5..593a74324 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,8 @@ * Introduce two more stats levels, kExceptHistogramOrTimers and kExceptTimers. * Added a feature to perform data-block sampling for compressibility, and report stats to user. * Add support for trace filtering. +* Add DBOptions.avoid_unnecessary_blocking_io. If true, we avoid file deletion when destorying ColumnFamilyHandle and Iterator. Instead, a job is scheduled to delete the files in background. + ### Public API Change * Remove bundled fbson library. * statistics.stats_level_ becomes atomic. It is preferred to use statistics.set_stats_level() and statistics.get_stats_level() to access it. diff --git a/db/column_family.cc b/db/column_family.cc index c9172f662..f9a4ae66d 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -68,7 +68,14 @@ ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() { db_->FindObsoleteFiles(&job_context, false, true); mutex_->Unlock(); if (job_context.HaveSomethingToDelete()) { - db_->PurgeObsoleteFiles(job_context); + bool defer_purge = + db_->immutable_db_options().avoid_unnecessary_blocking_io; + db_->PurgeObsoleteFiles(job_context, defer_purge); + if (defer_purge) { + mutex_->Lock(); + db_->SchedulePurge(); + mutex_->Unlock(); + } } job_context.Clean(); } diff --git a/db/db_impl.cc b/db/db_impl.cc index becc8e8e2..4a55ad7e3 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1313,7 +1313,8 @@ InternalIterator* DBImpl::NewInternalIterator(const ReadOptions& read_options, internal_iter = merge_iter_builder.Finish(); IterState* cleanup = new IterState(this, &mutex_, super_version, - read_options.background_purge_on_iterator_cleanup); + read_options.background_purge_on_iterator_cleanup || + immutable_db_options_.avoid_unnecessary_blocking_io); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); return internal_iter; diff --git a/db/deletefile_test.cc b/db/deletefile_test.cc index 8583535fa..0905c90cc 100644 --- a/db/deletefile_test.cc +++ b/db/deletefile_test.cc @@ -241,7 +241,7 @@ TEST_F(DeleteFileTest, PurgeObsoleteFilesTest) { CloseDB(); } -TEST_F(DeleteFileTest, BackgroundPurgeTest) { +TEST_F(DeleteFileTest, BackgroundPurgeIteratorTest) { std::string first("0"), last("999999"); CompactRangeOptions compact_options; compact_options.change_level = true; @@ -279,6 +279,53 @@ TEST_F(DeleteFileTest, BackgroundPurgeTest) { CloseDB(); } +TEST_F(DeleteFileTest, BackgroundPurgeCFDropTest) { + auto do_test = [&](bool bg_purge) { + ColumnFamilyOptions co; + WriteOptions wo; + FlushOptions fo; + ColumnFamilyHandle* cfh = nullptr; + + ASSERT_OK(db_->CreateColumnFamily(co, "dropme", &cfh)); + + ASSERT_OK(db_->Put(wo, cfh, "pika", "chu")); + ASSERT_OK(db_->Flush(fo, cfh)); + // Expect 1 sst file. + CheckFileTypeCounts(dbname_, 0, 1, 1); + + ASSERT_OK(db_->DropColumnFamily(cfh)); + // Still 1 file, it won't be deleted while ColumnFamilyHandle is alive. + CheckFileTypeCounts(dbname_, 0, 1, 1); + + delete cfh; + test::SleepingBackgroundTask sleeping_task_after; + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + &sleeping_task_after, Env::Priority::HIGH); + // If background purge is enabled, the file should still be there. + CheckFileTypeCounts(dbname_, 0, bg_purge ? 1 : 0, 1); + + // Execute background purges. + sleeping_task_after.WakeUp(); + sleeping_task_after.WaitUntilDone(); + // The file should have been deleted. + CheckFileTypeCounts(dbname_, 0, 0, 1); + }; + + { + SCOPED_TRACE("avoid_unnecessary_blocking_io = false"); + do_test(false); + } + + options_.avoid_unnecessary_blocking_io = true; + ASSERT_OK(ReopenDB(false)); + { + SCOPED_TRACE("avoid_unnecessary_blocking_io = true"); + do_test(true); + } + + CloseDB(); +} + // This test is to reproduce a bug that read invalid ReadOption in iterator // cleanup function TEST_F(DeleteFileTest, BackgroundPurgeCopyOptions) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 165982241..d1c073468 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -265,16 +265,18 @@ void ForwardIterator::SVCleanup() { if (sv_ == nullptr) { return; } + bool background_purge = + read_options_.background_purge_on_iterator_cleanup || + db_->immutable_db_options().avoid_unnecessary_blocking_io; if (pinned_iters_mgr_ && pinned_iters_mgr_->PinningEnabled()) { // pinned_iters_mgr_ tells us to make sure that all visited key-value slices // are alive until pinned_iters_mgr_->ReleasePinnedData() is called. // The slices may point into some memtables owned by sv_, so we need to keep // sv_ referenced until pinned_iters_mgr_ unpins everything. - auto p = new SVCleanupParams{ - db_, sv_, read_options_.background_purge_on_iterator_cleanup}; + auto p = new SVCleanupParams{db_, sv_, background_purge}; pinned_iters_mgr_->PinPtr(p, &ForwardIterator::DeferredSVCleanup); } else { - SVCleanup(db_, sv_, read_options_.background_purge_on_iterator_cleanup); + SVCleanup(db_, sv_, background_purge); } } diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index b42a1d5e6..f7d6dfaf5 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1003,6 +1003,14 @@ struct DBOptions { // Currently, any WAL-enabled writes after atomic flush may be replayed // independently if the process crashes later and tries to recover. bool atomic_flush = false; + + // If true, ColumnFamilyHandle's and Iterator's destructors won't delete + // obsolete files directly and will instead schedule a background job + // to do it. Use it if you're destroying iterators or ColumnFamilyHandle-s + // from latency-sensitive threads. + // If set to true, takes precedence over + // ReadOptions::background_purge_on_iterator_cleanup. + bool avoid_unnecessary_blocking_io = false; }; // Options to control the behavior of a database (passed to DB::Open) diff --git a/options/db_options.cc b/options/db_options.cc index b90c27606..b9950e09e 100644 --- a/options/db_options.cc +++ b/options/db_options.cc @@ -86,7 +86,8 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options) preserve_deletes(options.preserve_deletes), two_write_queues(options.two_write_queues), manual_wal_flush(options.manual_wal_flush), - atomic_flush(options.atomic_flush) { + atomic_flush(options.atomic_flush), + avoid_unnecessary_blocking_io(options.avoid_unnecessary_blocking_io) { } void ImmutableDBOptions::Dump(Logger* log) const { @@ -217,6 +218,10 @@ void ImmutableDBOptions::Dump(Logger* log) const { two_write_queues); ROCKS_LOG_HEADER(log, " Options.manual_wal_flush: %d", manual_wal_flush); + ROCKS_LOG_HEADER(log, " Options.atomic_flush: %d", atomic_flush); + ROCKS_LOG_HEADER(log, + " Options.avoid_unnecessary_blocking_io: %d", + avoid_unnecessary_blocking_io); } MutableDBOptions::MutableDBOptions() diff --git a/options/db_options.h b/options/db_options.h index 360848851..283cf7d35 100644 --- a/options/db_options.h +++ b/options/db_options.h @@ -79,6 +79,7 @@ struct ImmutableDBOptions { bool two_write_queues; bool manual_wal_flush; bool atomic_flush; + bool avoid_unnecessary_blocking_io; }; struct MutableDBOptions { diff --git a/options/options_helper.cc b/options/options_helper.cc index be8ec2e18..9facf6e94 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -132,6 +132,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options, options.two_write_queues = immutable_db_options.two_write_queues; options.manual_wal_flush = immutable_db_options.manual_wal_flush; options.atomic_flush = immutable_db_options.atomic_flush; + options.avoid_unnecessary_blocking_io = + immutable_db_options.avoid_unnecessary_blocking_io; return options; } @@ -1615,7 +1617,12 @@ std::unordered_map {"atomic_flush", {offsetof(struct DBOptions, atomic_flush), OptionType::kBoolean, OptionVerificationType::kNormal, false, - offsetof(struct ImmutableDBOptions, atomic_flush)}}}; + offsetof(struct ImmutableDBOptions, atomic_flush)}}, + {"avoid_unnecessary_blocking_io", + {offsetof(struct DBOptions, avoid_unnecessary_blocking_io), + OptionType::kBoolean, OptionVerificationType::kNormal, false, + offsetof(struct ImmutableDBOptions, avoid_unnecessary_blocking_io)}} + }; std::unordered_map OptionsHelper::block_base_table_index_type_string_map = { diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index cf1ce4e21..3a6bd6a88 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -294,7 +294,8 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { "two_write_queues=false;" "manual_wal_flush=false;" "seq_per_batch=false;" - "atomic_flush=false", + "atomic_flush=false;" + "avoid_unnecessary_blocking_io=false", new_options)); ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(DBOptions),