From f287f8dc930f0e5455cc236b65960abce6e7bbf0 Mon Sep 17 00:00:00 2001 From: Yanqin Jin Date: Tue, 18 Jun 2019 11:16:57 -0700 Subject: [PATCH] Fix a bug caused by secondary not skipping the beginning of new MANIFEST (#5472) Summary: While the secondary is replaying after the primary, the primary may switch to a new MANIFEST. The secondary is already able to detect and follow the primary to the new MANIFEST. However, the current implementation has a bug, described as follows. The new MANIFEST's first records have been generated by VersionSet::WriteSnapshot to describe the current state of the column families and the db as of the MANIFEST creation. Since the secondary instance has already finished recovering upon start, there is no need for the secondary to process these records. Actually, if the secondary were to replay these records, the secondary may end up adding the same SST files **again** to each column family, causing consistency checks done by VersionBuilder to fail. Therefore, we record the number of records to skip at the beginning of the new MANIFEST and ignore them. Test plan (on dev server) ``` $make clean && make -j32 all $./db_secondary_test ``` All existing unit tests must pass as well. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5472 Differential Revision: D15866771 Pulled By: riversand963 fbshipit-source-id: a1eec4837fb2ad13059398efb0f437e74fd53bed --- HISTORY.md | 1 + db/db_impl/db_secondary_test.cc | 28 +++++++++++++++++++ db/version_set.cc | 48 +++++++++++++++++++++++++++++---- db/version_set.h | 3 +++ 4 files changed, 75 insertions(+), 5 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 228d02b61..0b6409dbe 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -28,6 +28,7 @@ * Fix a bug in WAL replay of secondary instance by skipping write batches with older sequence numbers than the current last sequence number. * Fix flush's/compaction's merge processing logic which allowed `Put`s covered by range tombstones to reappear. Note `Put`s may exist even if the user only ever called `Merge()` due to an internal conversion during compaction to the bottommost level. * Fix/improve memtable earliest sequence assignment and WAL replay so that WAL entries of unflushed column families will not be skipped after replaying the MANIFEST and increasing db sequence due to another flushed/compacted column family. +* Fix a bug caused by secondary not skipping the beginning of new MANIFEST. ## 6.2.0 (4/30/2019) ### New Features diff --git a/db/db_impl/db_secondary_test.cc b/db/db_impl/db_secondary_test.cc index c79589d50..26f43c107 100644 --- a/db/db_impl/db_secondary_test.cc +++ b/db/db_impl/db_secondary_test.cc @@ -525,6 +525,34 @@ TEST_F(DBSecondaryTest, SwitchManifest) { range_scan_db(); } +// Here, "Snapshot" refers to the version edits written by +// VersionSet::WriteSnapshot() at the beginning of the new MANIFEST after +// switching from the old one. +TEST_F(DBSecondaryTest, SkipSnapshotAfterManifestSwitch) { + Options options; + options.env = env_; + options.disable_auto_compactions = true; + Reopen(options); + + Options options1; + options1.env = env_; + options1.max_open_files = -1; + OpenSecondary(options1); + + ASSERT_OK(Put("0", "value0")); + ASSERT_OK(Flush()); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); + std::string value; + ReadOptions ropts; + ropts.verify_checksums = true; + ASSERT_OK(db_secondary_->Get(ropts, "0", &value)); + ASSERT_EQ("value0", value); + + Reopen(options); + ASSERT_OK(dbfull()->SetOptions({{"disable_auto_compactions", "false"}})); + ASSERT_OK(db_secondary_->TryCatchUpWithPrimary()); +} + TEST_F(DBSecondaryTest, SwitchWAL) { const int kNumKeysPerMemtable = 1; Options options; diff --git a/db/version_set.cc b/db/version_set.cc index ccedca794..9978c8cd4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -5217,7 +5217,8 @@ ReactiveVersionSet::ReactiveVersionSet(const std::string& dbname, WriteController* write_controller) : VersionSet(dbname, _db_options, _env_options, table_cache, write_buffer_manager, write_controller, - /*block_cache_tracer=*/nullptr) {} + /*block_cache_tracer=*/nullptr), + number_of_edits_to_skip_(0) {} ReactiveVersionSet::~ReactiveVersionSet() {} @@ -5415,6 +5416,17 @@ Status ReactiveVersionSet::ReadAndApply( break; } + // Skip the first VersionEdits of each MANIFEST generated by + // VersionSet::WriteSnapshot. + if (number_of_edits_to_skip_ > 0) { + ColumnFamilyData* cfd = + column_family_set_->GetColumnFamily(edit.column_family_); + if (cfd != nullptr && !cfd->IsDropped()) { + --number_of_edits_to_skip_; + } + continue; + } + s = read_buffer_.AddEdit(&edit); if (!s.ok()) { break; @@ -5463,8 +5475,33 @@ Status ReactiveVersionSet::ReadAndApply( // find the next MANIFEST, we should exit the loop. s = MaybeSwitchManifest(reader->GetReporter(), manifest_reader); reader = manifest_reader->get(); - if (s.ok() && reader->file()->file_name() == old_manifest_path) { - break; + if (s.ok()) { + if (reader->file()->file_name() == old_manifest_path) { + // Still processing the same MANIFEST, thus no need to continue this + // loop since no record is available if we have reached here. + break; + } else { + // We have switched to a new MANIFEST whose first records have been + // generated by VersionSet::WriteSnapshot. Since the secondary instance + // has already finished recovering upon start, there is no need for the + // secondary to process these records. Actually, if the secondary were + // to replay these records, the secondary may end up adding the same + // SST files AGAIN to each column family, causing consistency checks + // done by VersionBuilder to fail. Therefore, we record the number of + // records to skip at the beginning of the new MANIFEST and ignore + // them. + number_of_edits_to_skip_ = 0; + for (auto* cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + // Increase number_of_edits_to_skip by 2 because WriteSnapshot() + // writes 2 version edits for each column family at the beginning of + // the newly-generated MANIFEST. + // TODO(yanqin) remove hard-coded value. + number_of_edits_to_skip_ += 2; + } + } } } @@ -5504,7 +5541,7 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( return Status::OK(); } if (active_version_builders_.find(edit.column_family_) == - active_version_builders_.end()) { + active_version_builders_.end() && !cfd->IsDropped()) { std::unique_ptr builder_guard( new BaseReferencedVersionBuilder(cfd)); active_version_builders_.insert( @@ -5532,6 +5569,7 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( delete cfd; cfd = nullptr; } + active_version_builders_.erase(builder_iter); } else { builder->Apply(&edit); } @@ -5543,7 +5581,7 @@ Status ReactiveVersionSet::ApplyOneVersionEditToBuilder( return s; } - if (cfd != nullptr) { + if (cfd != nullptr && !cfd->IsDropped()) { s = builder->LoadTableHandlers( cfd->internal_stats(), db_options_->max_file_opening_threads, false /* prefetch_index_and_filter_in_cache */, diff --git a/db/version_set.h b/db/version_set.h index 90be94a78..ba1b4d3e3 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -1195,6 +1195,9 @@ class ReactiveVersionSet : public VersionSet { std::unordered_map> active_version_builders_; AtomicGroupReadBuffer read_buffer_; + // Number of version edits to skip by ReadAndApply at the beginning of a new + // MANIFEST created by primary. + int number_of_edits_to_skip_; using VersionSet::LogAndApply; using VersionSet::Recover;