From e1c468d16ff3400508570fbbed4d4b01f3505471 Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 16 Aug 2019 16:40:09 -0700 Subject: [PATCH] Do readahead in VerifyChecksum() (#5713) Summary: Right now VerifyChecksum() doesn't do read-ahead. In some use cases, users won't be able to achieve good performance. With this change, by default, RocksDB will do a default readahead, and users will be able to overwrite the readahead size by passing in a ReadOptions. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5713 Test Plan: Add a new unit test. Differential Revision: D16860874 fbshipit-source-id: 0cff0fe79ac855d3d068e6ccd770770854a68413 --- HISTORY.md | 2 + Makefile | 2 +- db/convenience.cc | 9 ++- db/corruption_test.cc | 55 ++++++++++++++++++- db/db_impl/db_impl.cc | 5 +- db/db_impl/db_impl.h | 3 +- db/db_test.cc | 3 +- db/external_sst_file_ingestion_job.cc | 7 ++- include/rocksdb/convenience.h | 7 +++ include/rocksdb/db.h | 4 +- include/rocksdb/sst_file_reader.h | 4 +- include/rocksdb/utilities/stackable_db.h | 4 ++ table/block_based/block_based_table_reader.cc | 46 +++++++++------- table/block_based/block_based_table_reader.h | 21 ++++--- table/sst_file_reader.cc | 5 +- table/table_reader.h | 3 +- tools/sst_dump_tool.cc | 4 +- 17 files changed, 140 insertions(+), 44 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index cbb3fec35..13b49f469 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,8 @@ ## Unreleased ### Bug Fixes * Fixed a number of data races in BlobDB. +### New Features +* VerifyChecksum() by default will issue readahead. Allow ReadOptions to be passed in to those functions to override the readhead size. ## 6.4.0 (7/30/2019) ### Default Option Change diff --git a/Makefile b/Makefile index 742d8ab79..be02db7f3 100644 --- a/Makefile +++ b/Makefile @@ -1226,7 +1226,7 @@ histogram_test: monitoring/histogram_test.o $(LIBOBJECTS) $(TESTHARNESS) thread_local_test: util/thread_local_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) -corruption_test: db/corruption_test.o $(LIBOBJECTS) $(TESTHARNESS) +corruption_test: db/corruption_test.o db/db_test_util.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) crc32c_test: util/crc32c_test.o $(LIBOBJECTS) $(TESTHARNESS) diff --git a/db/convenience.cc b/db/convenience.cc index 271217cd4..320d5c6e1 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -35,6 +35,12 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path) { + return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path); +} +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path) { std::unique_ptr file; uint64_t file_size; InternalKeyComparator internal_comparator(options.comparator); @@ -59,7 +65,8 @@ Status VerifySstFileChecksum(const Options& options, if (!s.ok()) { return s; } - s = table_reader->VerifyChecksum(TableReaderCaller::kUserVerifyChecksum); + s = table_reader->VerifyChecksum(read_options, + TableReaderCaller::kUserVerifyChecksum); return s; } diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 82752161f..3fd85953d 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -13,10 +13,11 @@ #include #include -#include #include #include +#include #include "db/db_impl/db_impl.h" +#include "db/db_test_util.h" #include "db/log_format.h" #include "db/version_set.h" #include "file/filename.h" @@ -76,7 +77,11 @@ class CorruptionTest : public testing::Test { delete db_; db_ = nullptr; Options opt = (options ? *options : options_); - opt.env = &env_; + if (opt.env == Options().env) { + // If env is not overridden, replace it with ErrorEnv. + // Otherwise, the test already uses a non-default Env. + opt.env = &env_; + } opt.arena_block_size = 4096; BlockBasedTableOptions table_options; table_options.block_cache = tiny_cache_; @@ -321,6 +326,52 @@ TEST_F(CorruptionTest, TableFile) { ASSERT_NOK(dbi->VerifyChecksum()); } +TEST_F(CorruptionTest, VerifyChecksumReadahead) { + Options options; + SpecialEnv senv(Env::Default()); + options.env = &senv; + // Disable block cache as we are going to check checksum for + // the same file twice and measure number of reads. + BlockBasedTableOptions table_options_no_bc; + table_options_no_bc.no_block_cache = true; + options.table_factory.reset(NewBlockBasedTableFactory(table_options_no_bc)); + + Reopen(&options); + + Build(10000); + DBImpl* dbi = reinterpret_cast(db_); + dbi->TEST_FlushMemTable(); + dbi->TEST_CompactRange(0, nullptr, nullptr); + dbi->TEST_CompactRange(1, nullptr, nullptr); + + senv.count_random_reads_ = true; + senv.random_read_counter_.Reset(); + ASSERT_OK(dbi->VerifyChecksum()); + + // Make sure the counter is enabled. + ASSERT_GT(senv.random_read_counter_.Read(), 0); + + // The SST file is about 10MB. Default readahead size is 256KB. + // Give a conservative 20 reads for metadata blocks, The number + // of random reads should be within 10 MB / 256KB + 20 = 60. + ASSERT_LT(senv.random_read_counter_.Read(), 60); + + senv.random_read_bytes_counter_ = 0; + ReadOptions ro; + ro.readahead_size = size_t{32 * 1024}; + ASSERT_OK(dbi->VerifyChecksum(ro)); + // The SST file is about 10MB. We set readahead size to 32KB. + // Give 0 to 20 reads for metadata blocks, and allow real read + // to range from 24KB to 48KB. The lower bound would be: + // 10MB / 48KB + 0 = 213 + // The higher bound is + // 10MB / 24KB + 20 = 447. + ASSERT_GE(senv.random_read_counter_.Read(), 213); + ASSERT_LE(senv.random_read_counter_.Read(), 447); + + CloseDb(); +} + TEST_F(CorruptionTest, TableFileIndexData) { Options options; // very big, we'll trigger flushes manually diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 5d885b8b2..dae879346 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -4112,7 +4112,7 @@ Status DBImpl::CreateColumnFamilyWithImport( return status; } -Status DBImpl::VerifyChecksum() { +Status DBImpl::VerifyChecksum(const ReadOptions& read_options) { Status s; std::vector cfd_list; { @@ -4143,7 +4143,8 @@ Status DBImpl::VerifyChecksum() { const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd; std::string fname = TableFileName(cfd->ioptions()->cf_paths, fd.GetNumber(), fd.GetPathId()); - s = rocksdb::VerifySstFileChecksum(opts, env_options_, fname); + s = rocksdb::VerifySstFileChecksum(opts, env_options_, read_options, + fname); } } if (!s.ok()) { diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index f2b3df5e6..42377ce41 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -380,7 +380,8 @@ class DBImpl : public DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) override; - virtual Status VerifyChecksum() override; + using DB::VerifyChecksum; + virtual Status VerifyChecksum(const ReadOptions& /*read_options*/) override; using DB::StartTrace; virtual Status StartTrace( diff --git a/db/db_test.cc b/db/db_test.cc index ae0481da8..17ac22729 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2585,7 +2585,8 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented."); } - Status VerifyChecksum() override { + using DB::VerifyChecksum; + Status VerifyChecksum(const ReadOptions&) override { return Status::NotSupported("Not implemented."); } diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 2fce8e01b..46c1a6b68 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -347,8 +347,11 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( } if (ingestion_options_.verify_checksums_before_ingest) { - status = - table_reader->VerifyChecksum(TableReaderCaller::kExternalSSTIngestion); + // If customized readahead size is needed, we can pass a user option + // all the way to here. Right now we just rely on the default readahead + // to keep things simple. + status = table_reader->VerifyChecksum( + ReadOptions(), TableReaderCaller::kExternalSSTIngestion); } if (!status.ok()) { return status; diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index d3cbe6016..db26948a4 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -339,6 +339,13 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path); + +// Verify the checksum of file +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const ReadOptions& read_options, + const std::string& file_path); + #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 36d6fea92..023659524 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1225,7 +1225,9 @@ class DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) = 0; - virtual Status VerifyChecksum() = 0; + virtual Status VerifyChecksum(const ReadOptions& read_options) = 0; + + virtual Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } // AddFile() is deprecated, please use IngestExternalFile() ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( diff --git a/include/rocksdb/sst_file_reader.h b/include/rocksdb/sst_file_reader.h index 517907dd5..522a8d9a1 100644 --- a/include/rocksdb/sst_file_reader.h +++ b/include/rocksdb/sst_file_reader.h @@ -33,7 +33,9 @@ class SstFileReader { std::shared_ptr GetTableProperties() const; // Verifies whether there is corruption in this table. - Status VerifyChecksum(); + Status VerifyChecksum(const ReadOptions& /*read_options*/); + + Status VerifyChecksum() { return VerifyChecksum(ReadOptions()); } private: struct Rep; diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 35fddc804..3941c1821 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -143,6 +143,10 @@ class StackableDB : public DB { virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + virtual Status VerifyChecksum(const ReadOptions& options) override { + return db_->VerifyChecksum(options); + } + using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 65e960f8c..ef3758740 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -62,6 +62,10 @@ extern const std::string kHashIndexPrefixesMetadataBlock; typedef BlockBasedTable::IndexReader IndexReader; +// Found that 256 KB readahead size provides the best performance, based on +// experiments, for auto readahead. Experiment data is in PR #3282. +const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024; + BlockBasedTable::~BlockBasedTable() { delete rep_; } @@ -2854,13 +2858,6 @@ void BlockBasedTableIterator::Prev() { FindKeyBackward(); } -// Found that 256 KB readahead size provides the best performance, based on -// experiments, for auto readahead. Experiment data is in PR #3282. -template -const size_t - BlockBasedTableIterator::kMaxAutoReadaheadSize = - 256 * 1024; - template void BlockBasedTableIterator::InitDataBlock() { BlockHandle data_block_handle = index_iter_->value().handle; @@ -2883,7 +2880,8 @@ void BlockBasedTableIterator::InitDataBlock() { if (read_options_.readahead_size == 0) { // Implicit auto readahead num_file_reads_++; - if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) { + if (num_file_reads_ > + BlockBasedTable::kMinNumFileReadsToStartAutoReadahead) { if (!rep->file->use_direct_io() && (data_block_handle.offset() + static_cast(data_block_handle.size()) + @@ -2897,14 +2895,14 @@ void BlockBasedTableIterator::InitDataBlock() { readahead_size_); // Keep exponentially increasing readahead size until // kMaxAutoReadaheadSize. - readahead_size_ = - std::min(kMaxAutoReadaheadSize, readahead_size_ * 2); + readahead_size_ = std::min(BlockBasedTable::kMaxAutoReadaheadSize, + readahead_size_ * 2); } else if (rep->file->use_direct_io() && !prefetch_buffer_) { // Direct I/O // Let FilePrefetchBuffer take care of the readahead. - prefetch_buffer_.reset( - new FilePrefetchBuffer(rep->file.get(), kInitAutoReadaheadSize, - kMaxAutoReadaheadSize)); + prefetch_buffer_.reset(new FilePrefetchBuffer( + rep->file.get(), BlockBasedTable::kInitAutoReadaheadSize, + BlockBasedTable::kMaxAutoReadaheadSize)); } } } else if (!prefetch_buffer_) { @@ -3707,7 +3705,8 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, return Status::OK(); } -Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { +Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, + TableReaderCaller caller) { Status s; // Check Meta blocks std::unique_ptr meta; @@ -3725,7 +3724,7 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { IndexBlockIter iiter_on_stack; BlockCacheLookupContext context{caller}; InternalIteratorBase* iiter = NewIndexIterator( - ReadOptions(), /*need_upper_bound_check=*/false, &iiter_on_stack, + read_options, /*disable_prefix_seek=*/false, &iiter_on_stack, /*get_context=*/nullptr, &context); std::unique_ptr> iiter_unique_ptr; if (iiter != &iiter_on_stack) { @@ -3735,13 +3734,22 @@ Status BlockBasedTable::VerifyChecksum(TableReaderCaller caller) { // error opening index iterator return iiter->status(); } - s = VerifyChecksumInBlocks(iiter); + s = VerifyChecksumInBlocks(read_options, iiter); return s; } Status BlockBasedTable::VerifyChecksumInBlocks( + const ReadOptions& read_options, InternalIteratorBase* index_iter) { Status s; + // We are scanning the whole file, so no need to do exponential + // increasing of the buffer size. + size_t readahead_size = (read_options.readahead_size != 0) + ? read_options.readahead_size + : kMaxAutoReadaheadSize; + FilePrefetchBuffer prefetch_buffer(rep_->file.get(), readahead_size, + readahead_size); + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); if (!s.ok()) { @@ -3750,9 +3758,9 @@ Status BlockBasedTable::VerifyChecksumInBlocks( BlockHandle handle = index_iter->value().handle; BlockContents contents; BlockFetcher block_fetcher( - rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - ReadOptions(), handle, &contents, rep_->ioptions, - false /* decompress */, false /*maybe_compressed*/, BlockType::kData, + rep_->file.get(), &prefetch_buffer, rep_->footer, ReadOptions(), handle, + &contents, rep_->ioptions, false /* decompress */, + false /*maybe_compressed*/, BlockType::kData, UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options); s = block_fetcher.ReadBlockContents(); if (!s.ok()) { diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 2b0c4754a..7597c00d6 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -82,6 +82,13 @@ class BlockBasedTable : public TableReader { // For Posix files the unique ID is three varints. static const size_t kMaxCacheKeyPrefixSize = kMaxVarint64Length * 3 + 1; + // All the below fields control iterator readahead + static const size_t kInitAutoReadaheadSize = 8 * 1024; + // Found that 256 KB readahead size provides the best performance, based on + // experiments, for auto readahead. Experiment data is in PR #3282. + static const size_t kMaxAutoReadaheadSize; + static const int kMinNumFileReadsToStartAutoReadahead = 2; + // Attempt to open the table that is stored in bytes [0..file_size) // of "file", and read the metadata entries necessary to allow // retrieving data from the table. @@ -182,7 +189,8 @@ class BlockBasedTable : public TableReader { // convert SST file to a human readable form Status DumpTable(WritableFile* out_file) override; - Status VerifyChecksum(TableReaderCaller caller) override; + Status VerifyChecksum(const ReadOptions& readOptions, + TableReaderCaller caller) override; ~BlockBasedTable(); @@ -430,7 +438,8 @@ class BlockBasedTable : public TableReader { static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); - Status VerifyChecksumInBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInBlocks(const ReadOptions& read_options, + InternalIteratorBase* index_iter); // Create the filter from the filter block. std::unique_ptr CreateFilterBlockReader( @@ -772,13 +781,7 @@ class BlockBasedTableIterator : public InternalIteratorBase { // lookup_context_.caller = kCompaction. size_t compaction_readahead_size_; - // All the below fields control iterator readahead - static const size_t kInitAutoReadaheadSize = 8 * 1024; - // Found that 256 KB readahead size provides the best performance, based on - // experiments, for auto readahead. Experiment data is in PR #3282. - static const size_t kMaxAutoReadaheadSize; - static const int kMinNumFileReadsToStartAutoReadahead = 2; - size_t readahead_size_ = kInitAutoReadaheadSize; + size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; size_t readahead_limit_ = 0; int64_t num_file_reads_ = 0; std::unique_ptr prefetch_buffer_; diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc index 7c3b91cc3..cc892c25b 100644 --- a/table/sst_file_reader.cc +++ b/table/sst_file_reader.cc @@ -79,8 +79,9 @@ std::shared_ptr SstFileReader::GetTableProperties() return rep_->table_reader->GetTableProperties(); } -Status SstFileReader::VerifyChecksum() { - return rep_->table_reader->VerifyChecksum(TableReaderCaller::kSSTFileReader); +Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) { + return rep_->table_reader->VerifyChecksum(read_options, + TableReaderCaller::kSSTFileReader); } } // namespace rocksdb diff --git a/table/table_reader.h b/table/table_reader.h index e968014b1..bf4dad766 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -127,7 +127,8 @@ class TableReader { } // check whether there is corruption in this db file - virtual Status VerifyChecksum(TableReaderCaller /*caller*/) { + virtual Status VerifyChecksum(const ReadOptions& /*read_options*/, + TableReaderCaller /*caller*/) { return Status::NotSupported("VerifyChecksum() not supported"); } }; diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 44a733b57..cbd9c0c87 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -143,7 +143,9 @@ Status SstFileDumper::NewTableReader( } Status SstFileDumper::VerifyChecksum() { - return table_reader_->VerifyChecksum(TableReaderCaller::kSSTDumpTool); + // We could pass specific readahead setting into read options if needed. + return table_reader_->VerifyChecksum(ReadOptions(), + TableReaderCaller::kSSTDumpTool); } Status SstFileDumper::DumpTable(const std::string& out_filename) {