diff --git a/db/convenience.cc b/db/convenience.cc index e3e7165b4..8ee31caca 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -24,6 +24,35 @@ Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, ->DeleteFilesInRange(column_family, begin, end); } +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const std::string& file_path) { + unique_ptr file; + uint64_t file_size; + InternalKeyComparator internal_comparator(options.comparator); + ImmutableCFOptions ioptions(options); + + Status s = ioptions.env->NewRandomAccessFile(file_path, &file, env_options); + if (s.ok()) { + s = ioptions.env->GetFileSize(file_path, &file_size); + } else { + return s; + } + unique_ptr table_reader; + std::unique_ptr file_reader( + new RandomAccessFileReader(std::move(file), file_path)); + s = ioptions.table_factory->NewTableReader( + TableReaderOptions(ioptions, env_options, internal_comparator, + false /* skip_filters */, -1 /* level */), + std::move(file_reader), file_size, &table_reader, + false /* prefetch_index_and_filter_in_cache */); + if (!s.ok()) { + return s; + } + s = table_reader->VerifyChecksum(); + return s; +} + } // namespace rocksdb #endif // ROCKSDB_LITE diff --git a/db/corruption_test.cc b/db/corruption_test.cc index 9f4237579..608c88d59 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -20,6 +20,7 @@ #include "db/log_format.h" #include "db/version_set.h" #include "rocksdb/cache.h" +#include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/table.h" #include "rocksdb/write_batch.h" @@ -179,6 +180,9 @@ class CorruptionTest : public testing::Test { } s = WriteStringToFile(Env::Default(), contents, fname); ASSERT_TRUE(s.ok()) << s.ToString(); + Options options; + EnvOptions env_options; + ASSERT_NOK(VerifySstFileChecksum(options, env_options, fname)); } void Corrupt(FileType filetype, int offset, int bytes_to_corrupt) { @@ -312,6 +316,7 @@ TEST_F(CorruptionTest, TableFile) { Corrupt(kTableFile, 100, 1); Check(99, 99); + ASSERT_NOK(dbi->VerifyChecksum()); } TEST_F(CorruptionTest, TableFileIndexData) { @@ -330,6 +335,7 @@ TEST_F(CorruptionTest, TableFileIndexData) { // one full file should be readable, since only one was corrupted // the other file should be fully non-readable, since index was corrupted Check(5000, 5000); + ASSERT_NOK(dbi->VerifyChecksum()); } TEST_F(CorruptionTest, MissingDescriptor) { @@ -389,10 +395,12 @@ TEST_F(CorruptionTest, CompactionInputError) { Corrupt(kTableFile, 100, 1); Check(9, 9); + ASSERT_NOK(dbi->VerifyChecksum()); // Force compactions by writing lots of values Build(10000); Check(10000, 10000); + ASSERT_NOK(dbi->VerifyChecksum()); } TEST_F(CorruptionTest, CompactionInputErrorParanoid) { @@ -424,6 +432,7 @@ TEST_F(CorruptionTest, CompactionInputErrorParanoid) { CorruptTableFileAtLevel(0, 100, 1); Check(9, 9); + ASSERT_NOK(dbi->VerifyChecksum()); // Write must eventually fail because of corrupted table Status s; @@ -445,6 +454,7 @@ TEST_F(CorruptionTest, UnrelatedKeys) { DBImpl* dbi = reinterpret_cast(db_); dbi->TEST_FlushMemTable(); Corrupt(kTableFile, 100, 1); + ASSERT_NOK(dbi->VerifyChecksum()); std::string tmp1, tmp2; ASSERT_OK(db_->Put(WriteOptions(), Key(1000, &tmp1), Value(1000, &tmp2))); diff --git a/db/db_impl.cc b/db/db_impl.cc index 5abd632c0..cdba03915 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -67,6 +67,7 @@ #include "port/port.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" +#include "rocksdb/convenience.h" #include "rocksdb/db.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" @@ -80,6 +81,7 @@ #include "table/merging_iterator.h" #include "table/table_builder.h" #include "table/two_level_iterator.h" +#include "tools/sst_dump_tool_imp.h" #include "util/auto_roll_logger.h" #include "util/autovector.h" #include "util/build_version.h" @@ -2740,6 +2742,54 @@ Status DBImpl::IngestExternalFile( return status; } +Status DBImpl::VerifyChecksum() { + Status s; + Options options; + EnvOptions env_options; + std::vector cfd_list; + { + InstrumentedMutexLock l(&mutex_); + for (auto cfd : *versions_->GetColumnFamilySet()) { + if (!cfd->IsDropped() && cfd->initialized()) { + cfd->Ref(); + cfd_list.push_back(cfd); + } + } + } + std::vector sv_list; + for (auto cfd : cfd_list) { + sv_list.push_back(cfd->GetReferencedSuperVersion(&mutex_)); + } + for (auto& sv : sv_list) { + VersionStorageInfo* vstorage = sv->current->storage_info(); + for (int i = 0; i < vstorage->num_non_empty_levels() && s.ok(); i++) { + for (size_t j = 0; j < vstorage->LevelFilesBrief(i).num_files && s.ok(); + j++) { + const auto& fd = vstorage->LevelFilesBrief(i).files[j].fd; + std::string fname = TableFileName(immutable_db_options_.db_paths, + fd.GetNumber(), fd.GetPathId()); + s = rocksdb::VerifySstFileChecksum(options, env_options, fname); + } + } + if (!s.ok()) { + break; + } + } + { + InstrumentedMutexLock l(&mutex_); + for (auto sv : sv_list) { + if (sv && sv->Unref()) { + sv->Cleanup(); + delete sv; + } + } + for (auto cfd : cfd_list) { + cfd->Unref(); + } + } + return s; +} + void DBImpl::NotifyOnExternalFileIngested( ColumnFamilyData* cfd, const ExternalSstFileIngestionJob& ingestion_job) { #ifndef ROCKSDB_LITE diff --git a/db/db_impl.h b/db/db_impl.h index d057f9345..31d69a970 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -293,6 +293,8 @@ class DBImpl : public DB { const std::vector& external_files, const IngestExternalFileOptions& ingestion_options) override; + virtual Status VerifyChecksum() override; + #endif // ROCKSDB_LITE // Similar to GetSnapshot(), but also lets the db know that this snapshot diff --git a/db/db_test.cc b/db/db_test.cc index 8d637e579..675c403e5 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -2235,6 +2235,10 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented."); } + virtual Status VerifyChecksum() override { + return Status::NotSupported("Not implemented."); + } + using DB::GetPropertiesOfAllTables; virtual Status GetPropertiesOfAllTables( ColumnFamilyHandle* column_family, diff --git a/include/rocksdb/convenience.h b/include/rocksdb/convenience.h index cb0c6f56b..4a60afb11 100644 --- a/include/rocksdb/convenience.h +++ b/include/rocksdb/convenience.h @@ -329,6 +329,11 @@ void CancelAllBackgroundWork(DB* db, bool wait = false); // Snapshots before the delete might not see the data in the given range. Status DeleteFilesInRange(DB* db, ColumnFamilyHandle* column_family, const Slice* begin, const Slice* end); + +// Verify the checksum of file +Status VerifySstFileChecksum(const Options& options, + const EnvOptions& env_options, + const std::string& file_path); #endif // ROCKSDB_LITE } // namespace rocksdb diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 692932c35..078c24b4f 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -976,6 +976,8 @@ class DB { return IngestExternalFile(DefaultColumnFamily(), external_files, options); } + virtual Status VerifyChecksum() = 0; + // AddFile() is deprecated, please use IngestExternalFile() ROCKSDB_DEPRECATED_FUNC virtual Status AddFile( ColumnFamilyHandle* column_family, diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index d2c0dbd7b..991de90aa 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -95,6 +95,8 @@ class StackableDB : public DB { return db_->IngestExternalFile(column_family, external_files, options); } + virtual Status VerifyChecksum() override { return db_->VerifyChecksum(); } + using DB::KeyMayExist; virtual bool KeyMayExist(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index a0b58c6b2..5931692f0 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -820,7 +820,6 @@ Status BlockBasedTable::ReadMetaBlock(Rep* rep, std::unique_ptr* iter) { // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates // it is an empty block. - // TODO: we never really verify check sum for meta index block std::unique_ptr meta; Status s = ReadBlockFromFile( rep->file.get(), rep->footer, ReadOptions(), @@ -1746,6 +1745,60 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, return Status::OK(); } +Status BlockBasedTable::VerifyChecksum() { + Status s; + // Check Meta blocks + std::unique_ptr meta; + std::unique_ptr meta_iter; + s = ReadMetaBlock(rep_, &meta, &meta_iter); + if (s.ok()) { + s = VerifyChecksumInBlocks(meta_iter.get()); + if (!s.ok()) { + return s; + } + } else { + return s; + } + // Check Data blocks + BlockIter iiter_on_stack; + InternalIterator* iiter = NewIndexIterator(ReadOptions(), &iiter_on_stack); + std::unique_ptr iiter_unique_ptr; + if (iiter != &iiter_on_stack) { + iiter_unique_ptr = std::unique_ptr(iiter); + } + if (!iiter->status().ok()) { + // error opening index iterator + return iiter->status(); + } + s = VerifyChecksumInBlocks(iiter); + return s; +} + +Status BlockBasedTable::VerifyChecksumInBlocks(InternalIterator* index_iter) { + Status s; + for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { + s = index_iter->status(); + if (!s.ok()) { + break; + } + BlockHandle handle; + Slice input = index_iter->value(); + s = handle.DecodeFrom(&input); + if (!s.ok()) { + break; + } + BlockContents contents; + s = ReadBlockContents(rep_->file.get(), rep_->footer, ReadOptions(), + handle, &contents, rep_->ioptions, + false /* decompress */, Slice() /*compression dict*/, + rep_->persistent_cache_options); + if (!s.ok()) { + break; + } + } + return s; +} + bool BlockBasedTable::TEST_KeyInCache(const ReadOptions& options, const Slice& key) { std::unique_ptr iiter(NewIndexIterator(options)); diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 3acc3a8fb..3451614c8 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -139,6 +139,8 @@ class BlockBasedTable : public TableReader { // convert SST file to a human readable form Status DumpTable(WritableFile* out_file) override; + Status VerifyChecksum() override; + void Close() override; ~BlockBasedTable(); @@ -310,6 +312,8 @@ class BlockBasedTable : public TableReader { static Status ReadMetaBlock(Rep* rep, std::unique_ptr* meta_block, std::unique_ptr* iter); + Status VerifyChecksumInBlocks(InternalIterator* index_iter); + // Create the filter from the filter block. FilterBlockReader* ReadFilter(const BlockHandle& filter_handle, const bool is_a_filter_partition) const; diff --git a/table/table_reader.h b/table/table_reader.h index 9681d5467..18fcda273 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -98,6 +98,11 @@ class TableReader { return Status::NotSupported("DumpTable() not supported"); } + // check whether there is corruption in this db file + virtual Status VerifyChecksum() { + return Status::NotSupported("VerifyChecksum() not supported"); + } + virtual void Close() {} }; diff --git a/tools/sst_dump_tool.cc b/tools/sst_dump_tool.cc index 07f348612..e6322f8b4 100644 --- a/tools/sst_dump_tool.cc +++ b/tools/sst_dump_tool.cc @@ -127,6 +127,10 @@ Status SstFileReader::NewTableReader( std::move(file_), file_size, &table_reader_); } +Status SstFileReader::VerifyChecksum() { + return table_reader_->VerifyChecksum(); +} + Status SstFileReader::DumpTable(const std::string& out_filename) { unique_ptr out_file; Env* env = Env::Default(); @@ -349,10 +353,11 @@ void print_help() { --file= Path to SST file or directory containing SST files - --command=check|scan|raw + --command=check|scan|raw|verify check: Iterate over entries in files but dont print anything except if an error is encounterd (default command) scan: Iterate over entries in files and print them to screen raw: Dump all the table contents to _dump.txt + verify: Iterate all the blocks in files verifying checksum to detect possible coruption but dont print anything except if a corruption is encountered --output_hex Can be combined with scan command to print the keys and values in Hex @@ -580,6 +585,17 @@ int SSTDumpTool::Run(int argc, char** argv) { } } + if (command == "verify") { + st = reader.VerifyChecksum(); + if (!st.ok()) { + fprintf(stderr, "%s is corrupted: %s\n", filename.c_str(), + st.ToString().c_str()); + } else { + fprintf(stdout, "The file is ok\n"); + } + continue; + } + if (show_properties || show_summary) { const rocksdb::TableProperties* table_properties; diff --git a/tools/sst_dump_tool_imp.h b/tools/sst_dump_tool_imp.h index 0129d98eb..e2b639607 100644 --- a/tools/sst_dump_tool_imp.h +++ b/tools/sst_dump_tool_imp.h @@ -30,6 +30,7 @@ class SstFileReader { uint64_t GetReadNumber() { return read_num_; } TableProperties* GetInitTableProperties() { return table_properties_.get(); } + Status VerifyChecksum(); Status DumpTable(const std::string& out_filename); Status getStatus() { return init_result_; }