From aa0a11e1b98e9a852110bde3d473df045b7f254a Mon Sep 17 00:00:00 2001
From: Andrew Kryczka
Date: Fri, 4 Nov 2022 15:55:54 -0700
Subject: [PATCH] Fix flush picking non-consecutive memtables (#10921)

Summary:
Prevents `MemTableList::PickMemtablesToFlush()` from picking non-consecutive
memtables. Picking non-consecutive memtables leads to wrong ordering in L0 if
the files are committed, or to an error like the one below if
`force_consistency_checks=true` catches it:

```
Corruption: force_consistency_checks: VersionBuilder: L0 file https://github.com/facebook/rocksdb/issues/25 with seqno 320416 368066 vs. file https://github.com/facebook/rocksdb/issues/24 with seqno 336037 352068
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10921

Test Plan: fix the expectation in the existing test of this behavior

Reviewed By: riversand963

Differential Revision: D41046935

Pulled By: ajkr

fbshipit-source-id: 783696bff56115063d5dc5856dfaed6a9881d1ab
---
 HISTORY.md               |  1 +
 db/memtable_list.cc      |  7 ++++++
 db/memtable_list_test.cc | 48 +++++++++++++++++++++++++++++-----------
 3 files changed, 43 insertions(+), 13 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index b57b84374..33f87d36d 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -6,6 +6,7 @@
 ### Bug Fixes
 * Fix FIFO compaction causing corruption of overlapping seqnos in L0 files due to ingesting files of overlapping seqnos with memtable's under `CompactionOptionsFIFO::allow_compaction=true` or `CompactionOptionsFIFO::age_for_warm>0` or `CompactRange()/CompactFiles()` is used. Before the fix, `force_consistency_checks=true` may catch the corruption before it's exposed to readers, in which case writes returning `Status::Corruption` would be expected.
 * Fix memory corruption error in scans if async_io is enabled. Memory corruption happened if there is IOError while reading the data leading to empty buffer and other buffer already in progress of async read goes again for reading.
+* Fix failed memtable flush retry bug that could cause wrongly ordered updates, which would surface to writers as `Status::Corruption` in case of `force_consistency_checks=true` (default). It affects use cases that enable both parallel flush (`max_background_flushes > 1` or `max_background_jobs >= 8`) and non-default memtable count (`max_write_buffer_number > 2`).
 
 ### New Features
 * Add basic support for user-defined timestamp to Merge (#10819).
diff --git a/db/memtable_list.cc b/db/memtable_list.cc
index 7cc669cdd..1545003ad 100644
--- a/db/memtable_list.cc
+++ b/db/memtable_list.cc
@@ -419,6 +419,13 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id,
             std::max(m->GetNextLogNumber(), *max_next_log_number);
       }
       ret->push_back(m);
+    } else if (!ret->empty()) {
+      // This `break` is necessary to prevent picking non-consecutive memtables
+      // in case `memlist` has one or more entries with
+      // `flush_in_progress_ == true` sandwiched between entries with
+      // `flush_in_progress_ == false`. This could happen after parallel flushes
+      // are picked and the one flushing older memtables is rolled back.
+      break;
     }
   }
   if (!atomic_flush || num_flush_not_started_ == 0) {
diff --git a/db/memtable_list_test.cc b/db/memtable_list_test.cc
index 258f02f96..8242061af 100644
--- a/db/memtable_list_test.cc
+++ b/db/memtable_list_test.cc
@@ -744,28 +744,40 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   // Pick tables to flush
   list.PickMemtablesToFlush(
       std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
-  // Should pick 4 of 5 since 1 table has been picked in to_flush2
-  ASSERT_EQ(4, to_flush.size());
+  // Picks three oldest memtables. The fourth oldest is picked in `to_flush2` so
+  // must be excluded. The newest (fifth oldest) is non-consecutive with the
+  // three oldest due to omitting the fourth oldest so must not be picked.
+  ASSERT_EQ(3, to_flush.size());
   ASSERT_EQ(5, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
-  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+  ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
 
   // Pick tables to flush again
   autovector<MemTable*> to_flush3;
   list.PickMemtablesToFlush(
       std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3);
-  ASSERT_EQ(0, to_flush3.size());  // nothing not in progress of being flushed
+  // Picks newest (fifth oldest)
+  ASSERT_EQ(1, to_flush3.size());
+  ASSERT_EQ(5, list.NumNotFlushed());
+  ASSERT_FALSE(list.IsFlushPending());
+  ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
+
+  // Nothing left to flush
+  autovector<MemTable*> to_flush4;
+  list.PickMemtablesToFlush(
+      std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush4);
+  ASSERT_EQ(0, to_flush4.size());
   ASSERT_EQ(5, list.NumNotFlushed());
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
-  // Flush the 4 memtables that were picked in to_flush
+  // Flush the 3 memtables that were picked in to_flush
   s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
                                        &to_delete);
   ASSERT_OK(s);
 
-  // Note: now to_flush contains tables[0,1,2,4]. to_flush2 contains
-  // tables[3].
+  // Note: now to_flush contains tables[0,1,2]. to_flush2 contains
+  // tables[3]. to_flush3 contains tables[4].
   // Current implementation will only commit memtables in the order they were
   // created. So TryInstallMemtableFlushResults will install the first 3 tables
   // in to_flush and stop when it encounters a table not yet flushed.
@@ -781,7 +793,17 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   ASSERT_FALSE(list.IsFlushPending());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
 
-  // Flush the 1 memtable that was picked in to_flush2
+  // Flush the 1 memtable (tables[4]) that was picked in to_flush3
+  s = MemTableListTest::Mock_InstallMemtableFlushResults(
+      &list, mutable_cf_options, to_flush3, &to_delete);
+  ASSERT_OK(s);
+
+  // This will install 0 tables since tables[4] flushed while tables[3] has not
+  // yet flushed.
+  ASSERT_EQ(2, list.NumNotFlushed());
+  ASSERT_EQ(0, to_delete.size());
+
+  // Flush the 1 memtable (tables[3]) that was picked in to_flush2
   s = MemTableListTest::Mock_InstallMemtableFlushResults(
       &list, mutable_cf_options, to_flush2, &to_delete);
   ASSERT_OK(s);
@@ -811,11 +833,11 @@ TEST_F(MemTableListTest, FlushPendingTest) {
   memtable_id = 4;
   // Pick tables to flush. The tables to pick must have ID smaller than or
   // equal to 4. Therefore, no table will be selected in this case.
-  autovector<MemTable*> to_flush4;
+  autovector<MemTable*> to_flush5;
   list.FlushRequested();
   ASSERT_TRUE(list.HasFlushRequested());
-  list.PickMemtablesToFlush(memtable_id, &to_flush4);
-  ASSERT_TRUE(to_flush4.empty());
+  list.PickMemtablesToFlush(memtable_id, &to_flush5);
+  ASSERT_TRUE(to_flush5.empty());
   ASSERT_EQ(1, list.NumNotFlushed());
   ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
   ASSERT_FALSE(list.IsFlushPending());
@@ -825,8 +847,8 @@
   // Pick tables to flush. The tables to pick must have ID smaller than or
   // equal to 5. Therefore, only tables[5] will be selected.
   memtable_id = 5;
   list.FlushRequested();
-  list.PickMemtablesToFlush(memtable_id, &to_flush4);
-  ASSERT_EQ(1, static_cast<int>(to_flush4.size()));
+  list.PickMemtablesToFlush(memtable_id, &to_flush5);
+  ASSERT_EQ(1, static_cast<int>(to_flush5.size()));
   ASSERT_EQ(1, list.NumNotFlushed());
   ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
   ASSERT_FALSE(list.IsFlushPending());
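
For context, the rule the added `break` enforces can be illustrated outside of RocksDB. Below is a minimal standalone C++ sketch, not the actual `MemTableList` implementation: `SimpleMemTable` and `PickConsecutive` are hypothetical names invented for illustration, with memtables ordered oldest to newest and a boolean standing in for `flush_in_progress_`. Once something has been picked, hitting an already-in-progress memtable stops the scan, so the picked set can never skip over it.

```
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for a memtable: only the fields relevant to picking.
struct SimpleMemTable {
  uint64_t id;
  bool flush_in_progress;
};

// Sketch of consecutive-only picking: walk memtables from oldest to newest,
// skip nothing in the middle. Once at least one memtable has been picked,
// encountering an in-progress memtable ends the scan so the result never
// contains a gap.
std::vector<uint64_t> PickConsecutive(
    const std::vector<SimpleMemTable>& oldest_first, uint64_t max_memtable_id) {
  std::vector<uint64_t> picked;
  for (const SimpleMemTable& m : oldest_first) {
    if (m.id > max_memtable_id) {
      break;
    }
    if (!m.flush_in_progress) {
      picked.push_back(m.id);
    } else if (!picked.empty()) {
      // Without this break, a later not-in-progress memtable would still be
      // picked even though it is separated from `picked` by an in-progress
      // one, producing a non-consecutive set (the situation this patch fixes).
      break;
    }
  }
  return picked;
}

int main() {
  // Memtable 3 is already being flushed (e.g. picked by a parallel flush),
  // while 0-2 and 4 are not; only 0-2 should be picked.
  std::vector<SimpleMemTable> memtables = {
      {0, false}, {1, false}, {2, false}, {3, true}, {4, false}};
  for (uint64_t id :
       PickConsecutive(memtables, /*max_memtable_id=*/UINT64_MAX)) {
    std::cout << id << " ";  // prints: 0 1 2
  }
  std::cout << "\n";
  return 0;
}
```

Picking only a consecutive prefix keeps the picker in line with the installation side, which, as the test comments note, only commits memtables in the order they were created.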