Check KeyContext status in MultiGet (#6387)

Summary:
Currently, any IO errors and checksum mismatches encountered while reading data
blocks are ignored by the batched MultiGet, because it only looks at the
GetContext state. Fix that by also checking the Status in each KeyContext.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6387

Test Plan: Add unit tests

Differential Revision: D19799819

Pulled By: anand1976

fbshipit-source-id: 46133dccbb04e64067b9fe6cda73e282203db969
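
For context, a minimal caller-side sketch of the batched MultiGet API this change affects (the helper function and key names are illustrative, not part of the patch). With the fix, an IO error or checksum mismatch while reading a data block surfaces in that key's entry of the statuses array instead of being silently dropped:

#include <vector>
#include "rocksdb/db.h"

// Hypothetical helper: look up two keys with the batched MultiGet and
// check the per-key Status before trusting the corresponding value.
void LookupKeys(rocksdb::DB* db) {
  std::vector<rocksdb::Slice> keys = {"key0", "key50"};
  std::vector<rocksdb::PinnableSlice> values(keys.size());
  std::vector<rocksdb::Status> statuses(keys.size());

  db->MultiGet(rocksdb::ReadOptions(), db->DefaultColumnFamily(), keys.size(),
               keys.data(), values.data(), statuses.data(),
               /*sorted_input=*/true);

  for (size_t i = 0; i < keys.size(); ++i) {
    if (!statuses[i].ok()) {
      // With this fix, a failed data block read (IOError/Corruption) is
      // reported here rather than being ignored and returning stale data.
      continue;
    }
    // values[i] is only meaningful when statuses[i] is OK.
  }
}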
Branch: main
Author: anand76 (committed by Facebook Github Bot)
parent 4e457278fa
commit d70011bccc
Changed files:
  HISTORY.md (1)
  db/db_basic_test.cc (84)
  db/table_cache.cc (1)
  db/version_set.cc (5)
  table/block_based/block_based_table_reader.cc (1)

@@ -9,6 +9,7 @@
* Fixed issue #6316 that can cause a corruption of the MANIFEST file in the middle when writing to it fails due to no disk space.
* Add DBOptions::skip_checking_sst_file_sizes_on_db_open. It disables potentially expensive checking of all sst file sizes in DB::Open().
* BlobDB now ignores trivially moved files when updating the mapping between blob files and SSTs. This should mitigate issue #6338 where out of order flush/compaction notifications could trigger an assertion with the earlier code.
* Fixed a bug where batched MultiGet() ignored IO errors while reading data blocks, which could cause it to continue looking for a key and return stale results.
### Performance Improvements
* Perform readahead when reading from option files. Inside DB, options.log_readahead_size will be used as the readahead size. In other cases, a default 512KB is used.

@@ -2011,6 +2011,90 @@ TEST_P(DBBasicTestWithParallelIO, MultiGet) {
}
}
TEST_P(DBBasicTestWithParallelIO, MultiGetWithChecksumMismatch) {
  std::vector<std::string> key_data(10);
  std::vector<Slice> keys;
  // We cannot resize a PinnableSlice vector, so just set initial size to
  // largest we think we will need
  std::vector<PinnableSlice> values(10);
  std::vector<Status> statuses;
  int read_count = 0;
  ReadOptions ro;
  ro.fill_cache = fill_cache();
  SyncPoint::GetInstance()->SetCallBack(
      "RetrieveMultipleBlocks:VerifyChecksum", [&](void* status) {
        Status* s = static_cast<Status*>(status);
        read_count++;
        if (read_count == 2) {
          *s = Status::Corruption();
        }
      });
  SyncPoint::GetInstance()->EnableProcessing();

  // Set up the keys to look up
  key_data.emplace_back(Key(0));
  keys.emplace_back(Slice(key_data.back()));
  key_data.emplace_back(Key(50));
  keys.emplace_back(Slice(key_data.back()));
  statuses.resize(keys.size());
  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
                     keys.data(), values.data(), statuses.data(), true);
  ASSERT_TRUE(CheckValue(0, values[0].ToString()));
  // The second block read gets an injected corruption, so values[1] is not
  // checked; only its status is meaningful.
  // ASSERT_TRUE(CheckValue(50, values[1].ToString()));
  ASSERT_EQ(statuses[0], Status::OK());
  ASSERT_EQ(statuses[1], Status::Corruption());
  SyncPoint::GetInstance()->DisableProcessing();
}
TEST_P(DBBasicTestWithParallelIO, MultiGetWithMissingFile) {
  std::vector<std::string> key_data(10);
  std::vector<Slice> keys;
  // We cannot resize a PinnableSlice vector, so just set initial size to
  // largest we think we will need
  std::vector<PinnableSlice> values(10);
  std::vector<Status> statuses;
  ReadOptions ro;
  ro.fill_cache = fill_cache();
  SyncPoint::GetInstance()->SetCallBack(
      "TableCache::MultiGet:FindTable", [&](void* status) {
        Status* s = static_cast<Status*>(status);
        *s = Status::IOError();
      });
  // DB open will create table readers unless we reduce the table cache
  // capacity.
  // SanitizeOptions will set max_open_files to a minimum of 20. The table
  // cache is allocated with max_open_files - 10 as capacity. So override
  // max_open_files to 11 so the table cache capacity becomes 1. This will
  // prevent file opens during DB open and force the file to be opened
  // during MultiGet.
  SyncPoint::GetInstance()->SetCallBack(
      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
        int* max_open_files = static_cast<int*>(arg);
        *max_open_files = 11;
      });
  SyncPoint::GetInstance()->EnableProcessing();
  Reopen(CurrentOptions());

  // Set up the keys to look up
  key_data.emplace_back(Key(0));
  keys.emplace_back(Slice(key_data.back()));
  key_data.emplace_back(Key(50));
  keys.emplace_back(Slice(key_data.back()));
  statuses.resize(keys.size());
  dbfull()->MultiGet(ro, dbfull()->DefaultColumnFamily(), keys.size(),
                     keys.data(), values.data(), statuses.data(), true);
  ASSERT_EQ(statuses[0], Status::IOError());
  ASSERT_EQ(statuses[1], Status::IOError());
  SyncPoint::GetInstance()->DisableProcessing();
}
INSTANTIATE_TEST_CASE_P(
    ParallelIO, DBBasicTestWithParallelIO,
    // Params are as follows -

@@ -490,6 +490,7 @@ Status TableCache::MultiGet(const ReadOptions& options,
        file_options_, internal_comparator, fd, &handle, prefix_extractor,
        options.read_tier == kBlockCacheTier /* no_io */,
        true /* record_read_stats */, file_read_hist, skip_filters, level);
    TEST_SYNC_POINT_CALLBACK("TableCache::MultiGet:FindTable", &s);
    if (s.ok()) {
      t = GetTableReaderFromHandle(handle);
      assert(t);

@@ -1965,6 +1965,11 @@ void Version::MultiGet(const ReadOptions& read_options, MultiGetRange* range,
    for (auto iter = file_range.begin(); iter != file_range.end(); ++iter) {
      GetContext& get_context = *iter->get_context;
      Status* status = iter->s;
      // The Status in the KeyContext takes precedence over GetContext state
      if (!status->ok()) {
        file_range.MarkKeyDone(iter);
        continue;
      }
      if (get_context.sample()) {
        sample_file_read_inc(f->file_metadata);

@@ -2458,6 +2458,7 @@ void BlockBasedTable::RetrieveMultipleBlocks(
          s = rocksdb::VerifyChecksum(footer.checksum(),
                                      req.result.data() + req_offset,
                                      handle.size() + 1, expected);
          TEST_SYNC_POINT_CALLBACK("RetrieveMultipleBlocks:VerifyChecksum", &s);
        }
      }
