From 7848f0b24c91637a771b73a78ddbc2d638c65fa3 Mon Sep 17 00:00:00 2001 From: Aaron G Date: Wed, 9 Aug 2017 15:49:40 -0700 Subject: [PATCH] add VerifyChecksum() to db.h Summary: We need a tool to check any sst file corruption in the db. It will check all the sst files in current version and read all the blocks (data, meta, index) with checksum verification. If any verification fails, the function will return non-OK status. Closes https://github.com/facebook/rocksdb/pull/2498 Differential Revision: D5324269 Pulled By: lightmark fbshipit-source-id: 6f8a272008b722402a772acfc804524c9d1a483b --- db/convenience.cc | 29 +++++++++++++ db/corruption_test.cc | 10 +++++ db/db_impl.cc | 50 +++++++++++++++++++++ db/db_impl.h | 2 + db/db_test.cc | 4 ++ include/rocksdb/convenience.h | 5 +++ include/rocksdb/db.h | 2 + include/rocksdb/utilities/stackable_db.h | 2 + table/block_based_table_reader.cc | 55 +++++++++++++++++++++++- table/block_based_table_reader.h | 4 ++ table/table_reader.h | 5 +++ tools/sst_dump_tool.cc | 18 +++++++- tools/sst_dump_tool_imp.h | 1 + 13 files changed, 185 insertions(+), 2 deletions(-) diff --git a/db/convenience.cc b/db/convenience.cc index e3e7165b4..8ee31caca 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -24,6 +24,35 @@ Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, ->DeleteFilesInRange(column_family, begin, end); } +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const std::string& file_path) { + unique_ptr file; + uint64_t file_size; + InternalKeyComparator internal_comparator(options.comparator); + ImmutableCFOptions ioptions(options); + + Status s = ioptions.env->NewRandomAccessFile(file_path, &file, env_options); + if (s.ok()) { + s = ioptions.env->GetFileSize(file_path, &file_size); + } else { + return s; + } + unique_ptr table_reader; + std::unique_ptr file_reader( + new RandomAccessFileReader(std::move(file), file_path)); + s = ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, env_options, internal_comparator, + false /* skip_filters */, -1 /* level */), + std::move(file_reader), file_size, &table_reader, + false /* prefetch_index_and_filter_in_cache */); + if (!s.ok()) { + return s; + } + s = table_reader->VerifyChecksum(); + return s; +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 9f4237579..608c88d59 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -20,6 +20,7 @@ #include "db/log_format.h" #include "db/version_set.h" #include "rocksdb/cache.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" @@ -179,6 +180,9 @@ class CorruptionTest : public testing::Test { } s = WriteStringToFile(Env::Default(), contents, fname); ASSERT_TRUE(s.ok()) << s.ToString(); + Options options; + EnvOptions env_options; + ASSERT_NOK(VerifySstFileChecksum(options, env_options, fname)); } void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { @@ -312,6 +316,7 @@ TEST_F(CorruptionTest, TableFile) { Corrupt(kTableFile, 100, 1); Check(99, 99); + ASSERT_NOK(dbi->VerifyChecksum()); } TEST_F(CorruptionTest, TableFileIndexData) { @@ -330,6 +335,7 @@ TEST_F(CorruptionTest, TableFileIndexData) { // one full file should be readable, since only one was corrupted // the other file should be fully non-readable, since index was corrupted Check(5000, 5000); + ASSERT_NOK(dbi->VerifyChecksum()); } TEST_F(CorruptionTest, MissingDescriptor) { @@ -389,10 +395,12 @@ TEST_F(CorruptionTest, CompactionInputError) { Corrupt(kTableFile, 100, 1); Check(9, 9); + ASSERT_NOK(dbi->VerifyChecksum()); // Force compactions by writing lots of values Build(10000); Check(10000, 10000); + ASSERT_NOK(dbi->VerifyChecksum()); } TEST_F(CorruptionTest, CompactionInputErrorParanoid) { @@ -424,6 +432,7 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) { CorruptTableFileAtLevel(0, 100, 1); Check(9, 9); + ASSERT_NOK(dbi->VerifyChecksum()); // Write must eventually fail because of corrupted table Status s; @@ -445,6 +454,7 @@ TEST_F(CorruptionTest, UnrelatedKeys) { DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_FlushMemTable(); Corrupt(kTableFile, 100, 1); + ASSERT_NOK(dbi->VerifyChecksum()); std::string tmp1, tmp2; ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); diff --git a/db/db_impl.cc b/db/db_impl.cc index 5abd632c0..cdba03915 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -67,6 +67,7 @@ #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" @@ -80,6 +81,7 @@ #include "table/merging_iterator.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "tools/sst_dump_tool_imp.h" #include "util/auto_roll_logger.h" #include "util/autovector.h" #include "util/build_version.h" @@ -2740,6 +2742,54 @@ Status DBImpl::IngestExternalFile( return status; } +Status DBImpl::VerifyChecksum() { + Status s; + Options options; + EnvOptions env_options; + std::vector cfd_list; + { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized()) { + cfd->Ref(); + cfd_list.push_back(cfd); + } + } + } + std::vector sv_list; + for (auto cfd : cfd_list) { + sv_list.push_back(cfd->GetReferencedSuperVersion(&mutex_)); + } + for (auto& sv : sv_list) { + VersionStorageInfo* vstorage = sv->current->storage_info(); + for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) { + for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok(); + j++) { + const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd; + std::string fname = TableFileName(immutable_db_options_.db_paths, + fd.GetNumber(), fd.GetPathId()); + s = rocksdb::VerifySstFileChecksum(options, env_options, fname); + } + } + if (!s.ok()) { + break; + } + } + { + InstrumentedMutexLock l(&mutex_); + for (auto sv : sv_list) { + if (sv && sv->Unref()) { + sv->Cleanup(); + delete sv; + } + } + for (auto cfd : cfd_list) { + cfd->Unref(); + } + } + return s; +} + void DBImpl::NotifyOnExternalFileIngested( ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) { #ifndef ROCKSDB_LITE diff --git a/db/db_impl.h b/db/db_impl.h index d057f9345..31d69a970 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -293,6 +293,8 @@ class DBImpl : public DB { const std::vector& external_files, const IngestExternalFileOptions& ingestion_options) override; + virtual Status VerifyChecksum() override; + #endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot diff --git a/db/db_test.cc b/db/db_test.cc index 8d637e579..675c403e5 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2235,6 +2235,10 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented."); } + virtual Status VerifyChecksum() override { + return Status::NotSupported("Not implemented."); + } + using DB::GetPropertiesOfAllTables; virtual Status GetPropertiesOfAllTables( ColumnFamilyHandle* column_family, diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index cb0c6f56b..4a60afb11 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -329,6 +329,11 @@ void CancelAllBackgroundWork(DB* db, bool wait = false); // Snapshots before the delete might not see the data in the given range. Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end); + +// Verify the checksum of file +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const std::string& file_path); #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 692932c35..078c24b4f 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -976,6 +976,8 @@ class DB { return IngestExternalFile(DefaultColumnFamily(), external_files, options); } + virtual Status VerifyChecksum() = 0; + // AddFile() is deprecated, please use IngestExternalFile() ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( ColumnFamilyHandle* column_family, diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index d2c0dbd7b..991de90aa 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -95,6 +95,8 @@ class StackableDB : public DB { return db_->IngestExternalFile(column_family, external_files, options); } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index a0b58c6b2..5931692f0 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -820,7 +820,6 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, std::unique_ptr* iter) { // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. - // TODO: we never really verify check sum for meta index block std::unique_ptr meta; Status s = ReadBlockFromFile( rep->file.get(), rep->footer, ReadOptions(), @@ -1746,6 +1745,60 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, return Status::OK(); } +Status BlockBasedTable::VerifyChecksum() { + Status s; + // Check Meta blocks + std::unique_ptr meta; + std::unique_ptr meta_iter; + s = ReadMetaBlock(rep_, &meta, &meta_iter); + if (s.ok()) { + s = VerifyChecksumInBlocks(meta_iter.get()); + if (!s.ok()) { + return s; + } + } else { + return s; + } + // Check Data blocks + BlockIter iiter_on_stack; + InternalIterator* iiter = NewIndexIterator(ReadOptions(), &iiter_on_stack); + std::unique_ptr iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr(iiter); + } + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + s = VerifyChecksumInBlocks(iiter); + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); + if (!s.ok()) { + break; + } + BlockContents contents; + s = ReadBlockContents(rep_->file.get(), rep_->footer, ReadOptions(), + handle, &contents, rep_->ioptions, + false /* decompress */, Slice() /*compression dict*/, + rep_->persistent_cache_options); + if (!s.ok()) { + break; + } + } + return s; +} + bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { std::unique_ptr iiter(NewIndexIterator(options)); diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 3acc3a8fb..3451614c8 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -139,6 +139,8 @@ class BlockBasedTable : public TableReader { // convert SST file to a human readable form Status DumpTable(WritableFile* out_file) override; + Status VerifyChecksum() override; + void Close() override; ~BlockBasedTable(); @@ -310,6 +312,8 @@ class BlockBasedTable : public TableReader { static Status ReadMetaBlock(Rep* rep, std::unique_ptr* meta_block, std::unique_ptr* iter); + Status VerifyChecksumInBlocks(InternalIterator* index_iter); + // Create the filter from the filter block. FilterBlockReader* ReadFilter(const BlockHandle& filter_handle, const bool is_a_filter_partition) const; diff --git a/table/table_reader.h b/table/table_reader.h index 9681d5467..18fcda273 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -98,6 +98,11 @@ class TableReader { return Status::NotSupported("DumpTable() not supported"); } + // check whether there is corruption in this db file + virtual Status VerifyChecksum() { + return Status::NotSupported("VerifyChecksum() not supported"); + } + virtual void Close() {} }; diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 07f348612..e6322f8b4 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -127,6 +127,10 @@ Status SstFileReader::NewTableReader( std::move(file_), file_size, &table_reader_); } +Status SstFileReader::VerifyChecksum() { + return table_reader_->VerifyChecksum(); +} + Status SstFileReader::DumpTable(const std::string& out_filename) { unique_ptr out_file; Env* env = Env::Default(); @@ -349,10 +353,11 @@ void print_help() { --file= Path to SST file or directory containing SST files - --command=check|scan|raw + --command=check|scan|raw|verify check: Iterate over entries in files but dont print anything except if an error is encounterd (default command) scan: Iterate over entries in files and print them to screen raw: Dump all the table contents to _dump.txt + verify: Iterate all the blocks in files verifying checksum to detect possible coruption but dont print anything except if a corruption is encountered --output_hex Can be combined with scan command to print the keys and values in Hex @@ -580,6 +585,17 @@ int SSTDumpTool::Run(int argc, char** argv) { } } + if (command == "verify") { + st = reader.VerifyChecksum(); + if (!st.ok()) { + fprintf(stderr, "%s is corrupted: %s\n", filename.c_str(), + st.ToString().c_str()); + } else { + fprintf(stdout, "The file is ok\n"); + } + continue; + } + if (show_properties || show_summary) { const rocksdb::TableProperties* table_properties; diff --git a/tools/sst_dump_tool_imp.h b/tools/sst_dump_tool_imp.h index 0129d98eb..e2b639607 100644 --- a/tools/sst_dump_tool_imp.h +++ b/tools/sst_dump_tool_imp.h @@ -30,6 +30,7 @@ class SstFileReader { uint64_t GetReadNumber() { return read_num_; } TableProperties* GetInitTableProperties() { return table_properties_.get(); } + Status VerifyChecksum(); Status DumpTable(const std::string& out_filename); Status getStatus() { return init_result_; }