Do readahead in VerifyChecksum() (#5713)

Summary:
Right now VerifyChecksum() doesn't do readahead, so in some use cases users can't achieve good performance. With this change, RocksDB issues a default readahead during checksum verification, and users can override the readahead size by passing in a ReadOptions.
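For example, a caller can override the readahead size by passing a ReadOptions. The snippet below is a minimal sketch; the 32KB value and the helper name are illustrative only, and db is assumed to be an already-open DB:

#include "rocksdb/db.h"
#include "rocksdb/options.h"

// Hypothetical helper: verify checksums using a smaller readahead than the
// 256KB default that VerifyChecksum() uses when no size is specified.
rocksdb::Status VerifyWithSmallReadahead(rocksdb::DB* db) {
  rocksdb::ReadOptions ro;
  ro.readahead_size = 32 * 1024;  // override the default readahead size
  return db->VerifyChecksum(ro);
}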
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5713

Test Plan: Add a new unit test.

Differential Revision: D16860874

fbshipit-source-id: 0cff0fe79ac855d3d068e6ccd770770854a68413
Branch: main
Author: sdong (committed by Facebook Github Bot)
Parent: e89b1c9c6e
Commit: e1c468d16f
17 changed files (lines changed):
  HISTORY.md (2)
  Makefile (2)
  db/convenience.cc (9)
  db/corruption_test.cc (55)
  db/db_impl/db_impl.cc (5)
  db/db_impl/db_impl.h (3)
  db/db_test.cc (3)
  db/external_sst_file_ingestion_job.cc (7)
  include/rocksdb/convenience.h (7)
  include/rocksdb/db.h (4)
  include/rocksdb/sst_file_reader.h (4)
  include/rocksdb/utilities/stackable_db.h (4)
  table/block_based/block_based_table_reader.cc (46)
  table/block_based/block_based_table_reader.h (21)
  table/sst_file_reader.cc (5)
  table/table_reader.h (3)
  tools/sst_dump_tool.cc (4)

@ -2,6 +2,8 @@
## Unreleased
### Bug Fixes
* Fixed a number of data races in BlobDB.
### New Features
* VerifyChecksum() by default will issue readahead. Allow ReadOptions to be passed in to those functions to override the readahead size.
## 6.4.0 (7/30/2019)
### Default Option Change

@ -1226,7 +1226,7 @@ histogram_test: monitoring/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS)
thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS)
corruption_test: db/corruption_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS)

@ -35,6 +35,12 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
Status VerifySstFileChecksum(const Options& options,
const EnvOptions& env_options,
const std::string& file_path) {
return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path);
}
Status VerifySstFileChecksum(const Options& options,
const EnvOptions& env_options,
const ReadOptions& read_options,
const std::string& file_path) {
std::unique_ptr<RandomAccessFile> file;
uint64_t file_size;
InternalKeyComparator internal_comparator(options.comparator);
@ -59,7 +65,8 @@ Status VerifySstFileChecksum(const Options& options,
if (!s.ok()) {
return s;
}
s = table_reader->VerifyChecksum(TableReaderCaller::kUserVerifyChecksum);
s = table_reader->VerifyChecksum(read_options,
TableReaderCaller::kUserVerifyChecksum);
return s;
}

@ -13,10 +13,11 @@
#include <errno.h>
#include <fcntl.h>
#include <cinttypes>
#include <sys/stat.h>
#include <sys/types.h>
#include <cinttypes>
#include "db/db_impl/db_impl.h"
#include "db/db_test_util.h"
#include "db/log_format.h"
#include "db/version_set.h"
#include "file/filename.h"
@ -76,7 +77,11 @@ class CorruptionTest : public testing::Test {
delete db_;
db_ = nullptr;
Options opt = (options ? *options : options_);
opt.env = &env_;
if (opt.env == Options().env) {
// If env is not overridden, replace it with ErrorEnv.
// Otherwise, the test already uses a non-default Env.
opt.env = &env_;
}
opt.arena_block_size = 4096;
BlockBasedTableOptions table_options;
table_options.block_cache = tiny_cache_;
@ -321,6 +326,52 @@ TEST_F(CorruptionTest, TableFile) {
ASSERT_NOK(dbi->VerifyChecksum());
}
TEST_F(CorruptionTest, VerifyChecksumReadahead) {
Options options;
SpecialEnv senv(Env::Default());
options.env = &senv;
// Disable block cache as we are going to check checksum for
// the same file twice and measure number of reads.
BlockBasedTableOptions table_options_no_bc;
table_options_no_bc.no_block_cache = true;
options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc));
Reopen(&options);
Build(10000);
DBImpl* dbi = reinterpret_cast<DBImpl*>(db_);
dbi->TEST_FlushMemTable();
dbi->TEST_CompactRange(0, nullptr, nullptr);
dbi->TEST_CompactRange(1, nullptr, nullptr);
senv.count_random_reads_ = true;
senv.random_read_counter_.Reset();
ASSERT_OK(dbi->VerifyChecksum());
// Make sure the counter is enabled.
ASSERT_GT(senv.random_read_counter_.Read(), 0);
// The SST file is about 10MB. Default readahead size is 256KB.
// Give a conservative 20 reads for metadata blocks. The number
// of random reads should be within 10 MB / 256KB + 20 = 60.
ASSERT_LT(senv.random_read_counter_.Read(), 60);
senv.random_read_bytes_counter_ = 0;
ReadOptions ro;
ro.readahead_size = size_t{32 * 1024};
ASSERT_OK(dbi->VerifyChecksum(ro));
// The SST file is about 10MB. We set readahead size to 32KB.
// Give 0 to 20 reads for metadata blocks, and allow real reads
// to range from 24KB to 48KB. The lower bound would be:
// 10MB / 48KB + 0 = 213
// The upper bound is
// 10MB / 24KB + 20 = 447.
ASSERT_GE(senv.random_read_counter_.Read(), 213);
ASSERT_LE(senv.random_read_counter_.Read(), 447);
CloseDb();
}
TEST_F(CorruptionTest, TableFileIndexData) {
Options options;
// very big, we'll trigger flushes manually

@ -4112,7 +4112,7 @@ Status DBImpl::CreateColumnFamilyWithImport(
return status;
}
Status DBImpl::VerifyChecksum() {
Status DBImpl::VerifyChecksum(const ReadOptions& read_options) {
Status s;
std::vector<ColumnFamilyData*> cfd_list;
{
@ -4143,7 +4143,8 @@ Status DBImpl::VerifyChecksum() {
const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd;
std::string fname = TableFileName(cfd->ioptions()->cf_paths,
fd.GetNumber(), fd.GetPathId());
s = rocksdb::VerifySstFileChecksum(opts, env_options_, fname);
s = rocksdb::VerifySstFileChecksum(opts, env_options_, read_options,
fname);
}
}
if (!s.ok()) {

@ -380,7 +380,8 @@ class DBImpl : public DB {
const ExportImportFilesMetaData& metadata,
ColumnFamilyHandle** handle) override;
virtual Status VerifyChecksum() override;
using DB::VerifyChecksum;
virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override;
using DB::StartTrace;
virtual Status StartTrace(

@ -2585,7 +2585,8 @@ class ModelDB : public DB {
return Status::NotSupported("Not implemented.");
}
Status VerifyChecksum() override {
using DB::VerifyChecksum;
Status VerifyChecksum(const ReadOptions&) override {
return Status::NotSupported("Not implemented.");
}

@ -347,8 +347,11 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
}
if (ingestion_options_.verify_checksums_before_ingest) {
status =
table_reader->VerifyChecksum(TableReaderCaller::kExternalSSTIngestion);
// If a customized readahead size is needed, we can pass a user option
// all the way down to here. For now we just rely on the default readahead
// to keep things simple.
status = table_reader->VerifyChecksum(
ReadOptions(), TableReaderCaller::kExternalSSTIngestion);
}
if (!status.ok()) {
return status;

@ -339,6 +339,13 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family,
Status VerifySstFileChecksum(const Options& options,
const EnvOptions& env_options,
const std::string& file_path);
// Verify the checksum of the given SST file; read_options controls the readahead.
Status VerifySstFileChecksum(const Options& options,
const EnvOptions& env_options,
const ReadOptions& read_options,
const std::string& file_path);
#endif // ROCKSDB_LITE
} // namespace rocksdb
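A usage sketch for this new VerifySstFileChecksum() overload (the helper name and the 1MB readahead value are illustrative, not part of the change):

#include <string>

#include "rocksdb/convenience.h"
#include "rocksdb/env.h"
#include "rocksdb/options.h"

// Hypothetical helper: verify a single SST file with a custom readahead size.
rocksdb::Status VerifyOneSstFile(const std::string& sst_path) {
  rocksdb::Options options;
  rocksdb::EnvOptions env_options;
  rocksdb::ReadOptions read_options;
  read_options.readahead_size = 1024 * 1024;  // 1MB instead of the 256KB default
  return rocksdb::VerifySstFileChecksum(options, env_options, read_options,
                                        sst_path);
}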

@ -1225,7 +1225,9 @@ class DB {
const ExportImportFilesMetaData& metadata,
ColumnFamilyHandle** handle) = 0;
virtual Status VerifyChecksum() = 0;
virtual Status VerifyChecksum(const ReadOptions& read_options) = 0;
virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
// AddFile() is deprecated, please use IngestExternalFile()
ROCKSDB_DEPRECATED_FUNC virtual Status AddFile(

@ -33,7 +33,9 @@ class SstFileReader {
std::shared_ptr<const TableProperties> GetTableProperties() const;
// Verifies whether there is corruption in this table.
Status VerifyChecksum();
Status VerifyChecksum(const ReadOptions& /*read_options*/);
Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); }
private:
struct Rep;

@ -143,6 +143,10 @@ class StackableDB : public DB {
virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); }
virtual Status VerifyChecksum(const ReadOptions& options) override {
return db_->VerifyChecksum(options);
}
using DB::KeyMayExist;
virtual bool KeyMayExist(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,

@ -62,6 +62,10 @@ extern const std::string kHashIndexPrefixesMetadataBlock;
typedef BlockBasedTable::IndexReader IndexReader;
// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;
BlockBasedTable::~BlockBasedTable() {
delete rep_;
}
@ -2854,13 +2858,6 @@ void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
FindKeyBackward();
}
// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
template <class TBlockIter, typename TValue>
const size_t
BlockBasedTableIterator<TBlockIter, TValue>::kMaxAutoReadaheadSize =
256 * 1024;
template <class TBlockIter, typename TValue>
void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
BlockHandle data_block_handle = index_iter_->value().handle;
@ -2883,7 +2880,8 @@ void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
if (read_options_.readahead_size == 0) {
// Implicit auto readahead
num_file_reads_++;
if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) {
if (num_file_reads_ >
BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) {
if (!rep->file->use_direct_io() &&
(data_block_handle.offset() +
static_cast<size_t>(data_block_handle.size()) +
@ -2897,14 +2895,14 @@ void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
readahead_size_);
// Keep exponentially increasing readahead size until
// kMaxAutoReadaheadSize.
readahead_size_ =
std::min(kMaxAutoReadaheadSize, readahead_size_ * 2);
readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize,
readahead_size_ * 2);
} else if (rep->file->use_direct_io() && !prefetch_buffer_) {
// Direct I/O
// Let FilePrefetchBuffer take care of the readahead.
prefetch_buffer_.reset(
new FilePrefetchBuffer(rep->file.get(), kInitAutoReadaheadSize,
kMaxAutoReadaheadSize));
prefetch_buffer_.reset(new FilePrefetchBuffer(
rep->file.get(), BlockBasedTable::kInitAutoReadaheadSize,
BlockBasedTable::kMaxAutoReadaheadSize));
}
}
} else if (!prefetch_buffer_) {
@ -3707,7 +3705,8 @@ Status BlockBasedTable::Prefetch(const Slice* const begin,
return Status::OK();
}
Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) {
Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options,
TableReaderCaller caller) {
Status s;
// Check Meta blocks
std::unique_ptr<Block> meta;
@ -3725,7 +3724,7 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) {
IndexBlockIter iiter_on_stack;
BlockCacheLookupContext context{caller};
InternalIteratorBase<IndexValue>* iiter = NewIndexIterator(
ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack,
read_options, /*disable_prefix_seek=*/false, &iiter_on_stack,
/*get_context=*/nullptr, &context);
std::unique_ptr<InternalIteratorBase<IndexValue>> iiter_unique_ptr;
if (iiter != &iiter_on_stack) {
@ -3735,13 +3734,22 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) {
// error opening index iterator
return iiter->status();
}
s = VerifyChecksumInBlocks(iiter);
s = VerifyChecksumInBlocks(read_options, iiter);
return s;
}
Status BlockBasedTable::VerifyChecksumInBlocks(
const ReadOptions& read_options,
InternalIteratorBase<IndexValue>* index_iter) {
Status s;
// We are scanning the whole file, so there is no need to exponentially
// grow the readahead buffer size.
size_t readahead_size = (read_options.readahead_size != 0)
? read_options.readahead_size
: kMaxAutoReadaheadSize;
FilePrefetchBuffer prefetch_buffer(rep_->file.get(), readahead_size,
readahead_size);
for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) {
s = index_iter->status();
if (!s.ok()) {
@ -3750,9 +3758,9 @@ Status BlockBasedTable::VerifyChecksumInBlocks(
BlockHandle handle = index_iter->value().handle;
BlockContents contents;
BlockFetcher block_fetcher(
rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer,
ReadOptions(), handle, &contents, rep_->ioptions,
false /* decompress */, false /*maybe_compressed*/, BlockType::kData,
rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle,
&contents, rep_->ioptions, false /* decompress */,
false /*maybe_compressed*/, BlockType::kData,
UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options);
s = block_fetcher.ReadBlockContents();
if (!s.ok()) {

@ -82,6 +82,13 @@ class BlockBasedTable : public TableReader {
// For Posix files the unique ID is three varints.
static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1;
// All the below fields control iterator readahead
static const size_t kInitAutoReadaheadSize = 8 * 1024;
// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
static const size_t kMaxAutoReadaheadSize;
static const int kMinNumFileReadsToStartAutoReadahead = 2;
// Attempt to open the table that is stored in bytes [0..file_size)
// of "file", and read the metadata entries necessary to allow
// retrieving data from the table.
@ -182,7 +189,8 @@ class BlockBasedTable : public TableReader {
// convert SST file to a human readable form
Status DumpTable(WritableFile* out_file) override;
Status VerifyChecksum(TableReaderCaller caller) override;
Status VerifyChecksum(const ReadOptions& readOptions,
TableReaderCaller caller) override;
~BlockBasedTable();
@ -430,7 +438,8 @@ class BlockBasedTable : public TableReader {
static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name);
Status VerifyChecksumInMetaBlocks(InternalIteratorBase<Slice>* index_iter);
Status VerifyChecksumInBlocks(InternalIteratorBase<IndexValue>* index_iter);
Status VerifyChecksumInBlocks(const ReadOptions& read_options,
InternalIteratorBase<IndexValue>* index_iter);
// Create the filter from the filter block.
std::unique_ptr<FilterBlockReader> CreateFilterBlockReader(
@ -772,13 +781,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
// lookup_context_.caller = kCompaction.
size_t compaction_readahead_size_;
// All the below fields control iterator readahead
static const size_t kInitAutoReadaheadSize = 8 * 1024;
// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
static const size_t kMaxAutoReadaheadSize;
static const int kMinNumFileReadsToStartAutoReadahead = 2;
size_t readahead_size_ = kInitAutoReadaheadSize;
size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize;
size_t readahead_limit_ = 0;
int64_t num_file_reads_ = 0;
std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;

@ -79,8 +79,9 @@ std::shared_ptr<const TableProperties> SstFileReader::GetTableProperties()
return rep_->table_reader->GetTableProperties();
}
Status SstFileReader::VerifyChecksum() {
return rep_->table_reader->VerifyChecksum(TableReaderCaller::kSSTFileReader);
Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) {
return rep_->table_reader->VerifyChecksum(read_options,
TableReaderCaller::kSSTFileReader);
}
} // namespace rocksdb

@ -127,7 +127,8 @@ class TableReader {
}
// check whether there is corruption in this db file
virtual Status VerifyChecksum(TableReaderCaller /*caller*/) {
virtual Status VerifyChecksum(const ReadOptions& /*read_options*/,
TableReaderCaller /*caller*/) {
return Status::NotSupported("VerifyChecksum() not supported");
}
};

@ -143,7 +143,9 @@ Status SstFileDumper::NewTableReader(
}
Status SstFileDumper::VerifyChecksum() {
return table_reader_->VerifyChecksum(TableReaderCaller::kSSTDumpTool);
// We could pass a specific readahead setting into the read options if needed.
return table_reader_->VerifyChecksum(ReadOptions(),
TableReaderCaller::kSSTDumpTool);
}
Status SstFileDumper::DumpTable(const std::string& out_filename) {
