diff --git a/HISTORY.md b/HISTORY.md index 01d3f2e3a..bfd4e4551 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,7 @@ * Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space. * Add DBOptions::skip_checking_sst_file_sizes_on_db_open. It disables potentially expensive checking of all sst file sizes in DB::Open(). * BlobDB now ignores trivially moved files when updating the mapping between blob files and SSTs. This should mitigate issue #6338 where out of order flush/compaction notifications could trigger an assertion with the earlier code. +* Batched MultiGet() ignores IO errors while reading data blocks, causing it to potentially continue looking for a key and returning stale results. ### Performance Improvements * Perfom readahead when reading from option files. Inside DB, options.log_readahead_size will be used as the readahead size. In other cases, a default 512KB is used. diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 86f6f810d..842f5d874 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -2011,6 +2011,90 @@ TEST_P(DBBasicTestWithParallelIO, MultiGet) { } } +TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) { + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + int read_count = 0; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + SyncPoint::GetInstance()->SetCallBack( + "RetrieveMultipleBlocks:VerifyChecksum", [&](void *status) { + Status* s = static_cast(status); + read_count++; + if (read_count == 2) { + *s = Status::Corruption(); + } + }); + SyncPoint::GetInstance()->EnableProcessing(); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_TRUE(CheckValue(0, values[0].ToString())); + //ASSERT_TRUE(CheckValue(50, values[1].ToString())); + ASSERT_EQ(statuses[0], Status::OK()); + ASSERT_EQ(statuses[1], Status::Corruption()); + + SyncPoint::GetInstance()->DisableProcessing(); +} + +TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) { + std::vector key_data(10); + std::vector keys; + // We cannot resize a PinnableSlice vector, so just set initial size to + // largest we think we will need + std::vector values(10); + std::vector statuses; + ReadOptions ro; + ro.fill_cache = fill_cache(); + + SyncPoint::GetInstance()->SetCallBack( + "TableCache::MultiGet:FindTable", [&](void *status) { + Status* s = static_cast(status); + *s = Status::IOError(); + }); + // DB open will create table readers unless we reduce the table cache + // capacity. + // SanitizeOptions will set max_open_files to minimum of 20. Table cache + // is allocated with max_open_files - 10 as capacity. So override + // max_open_files to 11 so table cache capacity will become 1. This will + // prevent file open during DB open and force the file to be opened + // during MultiGet + SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void *arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + SyncPoint::GetInstance()->EnableProcessing(); + + Reopen(CurrentOptions()); + + // Warm up the cache first + key_data.emplace_back(Key(0)); + keys.emplace_back(Slice(key_data.back())); + key_data.emplace_back(Key(50)); + keys.emplace_back(Slice(key_data.back())); + statuses.resize(keys.size()); + + dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(), + keys.data(), values.data(), statuses.data(), true); + ASSERT_EQ(statuses[0], Status::IOError()); + ASSERT_EQ(statuses[1], Status::IOError()); + + SyncPoint::GetInstance()->DisableProcessing(); +} + INSTANTIATE_TEST_CASE_P( ParallelIO, DBBasicTestWithParallelIO, // Params are as follows - diff --git a/db/table_cache.cc b/db/table_cache.cc index 12bd90230..5dc895d84 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -490,6 +490,7 @@ Status TableCache::MultiGet(const ReadOptions& options, file_options_, internal_comparator, fd, &handle, prefix_extractor, options.read_tier == kBlockCacheTier /* no_io */, true /* record_read_stats */, file_read_hist, skip_filters, level); + TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s); if (s.ok()) { t = GetTableReaderFromHandle(handle); assert(t); diff --git a/db/version_set.cc b/db/version_set.cc index 6511a4d8d..40f08a674 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1965,6 +1965,11 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range, for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) { GetContext& get_context = *iter->get_context; Status* status = iter->s; + // The Status in the KeyContext takes precedence over GetContext state + if (!status->ok()) { + file_range.MarkKeyDone(iter); + continue; + } if (get_context.sample()) { sample_file_read_inc(f->file_metadata); diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 336669f1b..0a6424b73 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -2458,6 +2458,7 @@ void BlockBasedTable::RetrieveMultipleBlocks( s = rocksdb::VerifyChecksum(footer.checksum(), req.result.data() + req_offset, handle.size() + 1, expected); + TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s); } }