Fix flush picking non-consecutive memtables (#10921)

Summary:
Prevents `MemTableList::PickMemtablesToFlush()` from picking non-consecutive memtables. It leads to wrong ordering in L0 if the files are committed, or an error like below if force_consistency_checks=true catches it:

```
Corruption: force_consistency_checks: VersionBuilder: L0 file https://github.com/facebook/rocksdb/issues/25 with seqno 320416 368066 vs. file https://github.com/facebook/rocksdb/issues/24 with seqno 336037 352068
```

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10921

Test Plan: fix the expectation in the existing test of this behavior

Reviewed By: riversand963

Differential Revision: D41046935

Pulled By: ajkr

fbshipit-source-id: 783696bff56115063d5dc5856dfaed6a9881d1ab
main
Andrew Kryczka 2 years ago committed by Facebook GitHub Bot
parent aafe7bd376
commit aa0a11e1b9
  1. 1
      HISTORY.md
  2. 7
      db/memtable_list.cc
  3. 48
      db/memtable_list_test.cc

@ -6,6 +6,7 @@
### Bug Fixes ### Bug Fixes
* Fix FIFO compaction causing corruption of overlapping seqnos in L0 files due to ingesting files of overlapping seqnos with memtable's under `CompactionOptionsFIFO::allow_compaction=true` or `CompactionOptionsFIFO::age_for_warm>0` or `CompactRange()/CompactFiles()` is used. Before the fix, `force_consistency_checks=true` may catch the corruption before it's exposed to readers, in which case writes returning `Status::Corruption` would be expected. * Fix FIFO compaction causing corruption of overlapping seqnos in L0 files due to ingesting files of overlapping seqnos with memtable's under `CompactionOptionsFIFO::allow_compaction=true` or `CompactionOptionsFIFO::age_for_warm>0` or `CompactRange()/CompactFiles()` is used. Before the fix, `force_consistency_checks=true` may catch the corruption before it's exposed to readers, in which case writes returning `Status::Corruption` would be expected.
* Fix memory corruption error in scans if async_io is enabled. Memory corruption happened if there is IOError while reading the data leading to empty buffer and other buffer already in progress of async read goes again for reading. * Fix memory corruption error in scans if async_io is enabled. Memory corruption happened if there is IOError while reading the data leading to empty buffer and other buffer already in progress of async read goes again for reading.
* Fix failed memtable flush retry bug that could cause wrongly ordered updates, which would surface to writers as `Status::Corruption` in case of `force_consistency_checks=true` (default). It affects use cases that enable both parallel flush (`max_background_flushes > 1` or `max_background_jobs >= 8`) and non-default memtable count (`max_write_buffer_number > 2`).
### New Features ### New Features
* Add basic support for user-defined timestamp to Merge (#10819). * Add basic support for user-defined timestamp to Merge (#10819).

@ -419,6 +419,13 @@ void MemTableList::PickMemtablesToFlush(uint64_t max_memtable_id,
std::max(m->GetNextLogNumber(), *max_next_log_number); std::max(m->GetNextLogNumber(), *max_next_log_number);
} }
ret->push_back(m); ret->push_back(m);
} else if (!ret->empty()) {
// This `break` is necessary to prevent picking non-consecutive memtables
// in case `memlist` has one or more entries with
// `flush_in_progress_ == true` sandwiched between entries with
// `flush_in_progress_ == false`. This could happen after parallel flushes
// are picked and the one flushing older memtables is rolled back.
break;
} }
} }
if (!atomic_flush || num_flush_not_started_ == 0) { if (!atomic_flush || num_flush_not_started_ == 0) {

@ -744,28 +744,40 @@ TEST_F(MemTableListTest, FlushPendingTest) {
// Pick tables to flush // Pick tables to flush
list.PickMemtablesToFlush( list.PickMemtablesToFlush(
std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush); std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush);
// Should pick 4 of 5 since 1 table has been picked in to_flush2 // Picks three oldest memtables. The fourth oldest is picked in `to_flush2` so
ASSERT_EQ(4, to_flush.size()); // must be excluded. The newest (fifth oldest) is non-consecutive with the
// three oldest due to omitting the fourth oldest so must not be picked.
ASSERT_EQ(3, to_flush.size());
ASSERT_EQ(5, list.NumNotFlushed()); ASSERT_EQ(5, list.NumNotFlushed());
ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.IsFlushPending());
ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
// Pick tables to flush again // Pick tables to flush again
autovector<MemTable*> to_flush3; autovector<MemTable*> to_flush3;
list.PickMemtablesToFlush( list.PickMemtablesToFlush(
std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3); std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush3);
ASSERT_EQ(0, to_flush3.size()); // nothing not in progress of being flushed // Picks newest (fifth oldest)
ASSERT_EQ(1, to_flush3.size());
ASSERT_EQ(5, list.NumNotFlushed());
ASSERT_FALSE(list.IsFlushPending());
ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
// Nothing left to flush
autovector<MemTable*> to_flush4;
list.PickMemtablesToFlush(
std::numeric_limits<uint64_t>::max() /* memtable_id */, &to_flush4);
ASSERT_EQ(0, to_flush4.size());
ASSERT_EQ(5, list.NumNotFlushed()); ASSERT_EQ(5, list.NumNotFlushed());
ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.IsFlushPending());
ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
// Flush the 4 memtables that were picked in to_flush // Flush the 3 memtables that were picked in to_flush
s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush, s = Mock_InstallMemtableFlushResults(&list, mutable_cf_options, to_flush,
&to_delete); &to_delete);
ASSERT_OK(s); ASSERT_OK(s);
// Note: now to_flush contains tables[0,1,2,4]. to_flush2 contains // Note: now to_flush contains tables[0,1,2]. to_flush2 contains
// tables[3]. // tables[3]. to_flush3 contains tables[4].
// Current implementation will only commit memtables in the order they were // Current implementation will only commit memtables in the order they were
// created. So TryInstallMemtableFlushResults will install the first 3 tables // created. So TryInstallMemtableFlushResults will install the first 3 tables
// in to_flush and stop when it encounters a table not yet flushed. // in to_flush and stop when it encounters a table not yet flushed.
@ -781,7 +793,17 @@ TEST_F(MemTableListTest, FlushPendingTest) {
ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.IsFlushPending());
ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
// Flush the 1 memtable that was picked in to_flush2 // Flush the 1 memtable (tables[4]) that was picked in to_flush3
s = MemTableListTest::Mock_InstallMemtableFlushResults(
&list, mutable_cf_options, to_flush3, &to_delete);
ASSERT_OK(s);
// This will install 0 tables since tables[4] flushed while tables[3] has not
// yet flushed.
ASSERT_EQ(2, list.NumNotFlushed());
ASSERT_EQ(0, to_delete.size());
// Flush the 1 memtable (tables[3]) that was picked in to_flush2
s = MemTableListTest::Mock_InstallMemtableFlushResults( s = MemTableListTest::Mock_InstallMemtableFlushResults(
&list, mutable_cf_options, to_flush2, &to_delete); &list, mutable_cf_options, to_flush2, &to_delete);
ASSERT_OK(s); ASSERT_OK(s);
@ -811,11 +833,11 @@ TEST_F(MemTableListTest, FlushPendingTest) {
memtable_id = 4; memtable_id = 4;
// Pick tables to flush. The tables to pick must have ID smaller than or // Pick tables to flush. The tables to pick must have ID smaller than or
// equal to 4. Therefore, no table will be selected in this case. // equal to 4. Therefore, no table will be selected in this case.
autovector<MemTable*> to_flush4; autovector<MemTable*> to_flush5;
list.FlushRequested(); list.FlushRequested();
ASSERT_TRUE(list.HasFlushRequested()); ASSERT_TRUE(list.HasFlushRequested());
list.PickMemtablesToFlush(memtable_id, &to_flush4); list.PickMemtablesToFlush(memtable_id, &to_flush5);
ASSERT_TRUE(to_flush4.empty()); ASSERT_TRUE(to_flush5.empty());
ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_EQ(1, list.NumNotFlushed());
ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire)); ASSERT_TRUE(list.imm_flush_needed.load(std::memory_order_acquire));
ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.IsFlushPending());
@ -825,8 +847,8 @@ TEST_F(MemTableListTest, FlushPendingTest) {
// equal to 5. Therefore, only tables[5] will be selected. // equal to 5. Therefore, only tables[5] will be selected.
memtable_id = 5; memtable_id = 5;
list.FlushRequested(); list.FlushRequested();
list.PickMemtablesToFlush(memtable_id, &to_flush4); list.PickMemtablesToFlush(memtable_id, &to_flush5);
ASSERT_EQ(1, static_cast<int>(to_flush4.size())); ASSERT_EQ(1, static_cast<int>(to_flush5.size()));
ASSERT_EQ(1, list.NumNotFlushed()); ASSERT_EQ(1, list.NumNotFlushed());
ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire)); ASSERT_FALSE(list.imm_flush_needed.load(std::memory_order_acquire));
ASSERT_FALSE(list.IsFlushPending()); ASSERT_FALSE(list.IsFlushPending());

Loading…
Cancel
Save