diff --git a/HISTORY.md b/HISTORY.md index f5d77802e..5affc7960 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -13,6 +13,7 @@ * Fix a bug in WriteBatchWithIndex::MultiGetFromBatchAndDB, which is called by Transaction::MultiGet, that causes due to stale pointer access when the number of keys is > 32 * BlobDB no longer updates the SST to blob file mapping upon failed compactions. * Fixed a bug where BlobDB was comparing the `ColumnFamilyHandle` pointers themselves instead of only the column family IDs when checking whether an API call uses the default column family or not. +* Fix a race condition for cfd->log_number_ between manifest switch and memtable switch (PR 6249) when number of column families is greater than 1. ### New Features * It is now possible to enable periodic compactions for the base DB when using BlobDB. diff --git a/db/db_test2.cc b/db/db_test2.cc index 464aa23a9..0b7fdb44b 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -4248,6 +4248,18 @@ TEST_F(DBTest2, BackgroundPurgeTest) { value = options.write_buffer_manager->memory_usage(); ASSERT_EQ(base_value, value); } + +TEST_F(DBTest2, SwitchMemtableRaceWithNewManifest) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + options.max_manifest_file_size = 10; + options.create_if_missing = true; + CreateAndReopenWithCF({"pikachu"}, options); + ASSERT_OK(Put("foo", "value")); + port::Thread thread([&]() { ASSERT_OK(Flush()); }); + ASSERT_OK(dbfull()->SetOptions({{"prefix_extractor", "fixed:5"}})); + thread.join(); +} } // namespace rocksdb #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/version_set.cc b/db/version_set.cc index 06d4abd7f..a473c7129 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -3753,6 +3753,10 @@ Status VersionSet::ProcessManifestWrites( pending_manifest_file_number_ = manifest_file_number_; } + // Local cached copy of state variable(s). WriteCurrentStateToManifest() + // reads its content after releasing db mutex to avoid race with + // SwitchMemtable(). + std::unordered_map curr_state; if (new_descriptor_log) { // if we are writing out new snapshot make sure to persist max column // family. @@ -3760,6 +3764,10 @@ Status VersionSet::ProcessManifestWrites( first_writer.edit_list.front()->SetMaxColumnFamily( column_family_set_->GetMaxColumnFamily()); } + for (const auto* cfd : *column_family_set_) { + assert(curr_state.find(cfd->GetID()) == curr_state.end()); + curr_state[cfd->GetID()] = {cfd->GetLogNumber()}; + } } { @@ -3802,7 +3810,7 @@ Status VersionSet::ProcessManifestWrites( nullptr, db_options_->listeners)); descriptor_log_.reset( new log::Writer(std::move(file_writer), 0, false)); - s = WriteCurrentStateToManifest(descriptor_log_.get()); + s = WriteCurrentStateToManifest(curr_state, descriptor_log_.get()); } } @@ -4918,7 +4926,9 @@ void VersionSet::MarkMinLogNumberToKeep2PC(uint64_t number) { } } -Status VersionSet::WriteCurrentStateToManifest(log::Writer* log) { +Status VersionSet::WriteCurrentStateToManifest( + const std::unordered_map& curr_state, + log::Writer* log) { // TODO: Break up into multiple records to reduce memory usage on recovery? // WARNING: This method doesn't hold a mutex!! @@ -4984,7 +4994,10 @@ Status VersionSet::WriteCurrentStateToManifest(log::Writer* log) { f->oldest_ancester_time, f->file_creation_time); } } - edit.SetLogNumber(cfd->GetLogNumber()); + const auto iter = curr_state.find(cfd->GetID()); + assert(iter != curr_state.end()); + uint64_t log_number = iter->second.log_number; + edit.SetLogNumber(log_number); std::string record; if (!edit.EncodeTo(&record)) { return Status::Corruption( diff --git a/db/version_set.h b/db/version_set.h index aa18cabb5..c44e9f536 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1075,8 +1075,14 @@ class VersionSet { const Slice& start, const Slice& end, TableReaderCaller caller); + struct MutableCFState { + uint64_t log_number; + }; + // Save current contents to *log - Status WriteCurrentStateToManifest(log::Writer* log); + Status WriteCurrentStateToManifest( + const std::unordered_map& curr_state, + log::Writer* log); void AppendVersion(ColumnFamilyData* column_family_data, Version* v);