From d44cbc53148b74fd094c9f9394f30a1a5b8fabc8 Mon Sep 17 00:00:00 2001 From: mrambacher Date: Wed, 22 Jul 2020 11:03:29 -0700 Subject: [PATCH] Add hash of key/value checks when paranoid_file_checks=true (#7134) Summary: When paraoid_files_checks=true, a rolling key-value hash is generated and compared to what is written to the file. If the values do not match, the SST file is rejected. Code put in place for the check for both flush and compaction jobs. Corresponding test added to corruption_test. Pull Request resolved: https://github.com/facebook/rocksdb/pull/7134 Reviewed By: cheng-chang Differential Revision: D22646149 fbshipit-source-id: 8fde1984a1a11edd3bd82a413acffc5ea7aa683f --- HISTORY.md | 2 +- db/builder.cc | 15 ++- db/compaction/compaction_job.cc | 54 +++++++--- db/corruption_test.cc | 54 ++++++++++ include/rocksdb/advanced_options.h | 2 + table/mock_table.cc | 153 ++++++++++++++++++++++++++++- table/mock_table.h | 144 ++------------------------- 7 files changed, 271 insertions(+), 153 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 609073a1f..314d9fa1a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -10,7 +10,7 @@ * In best-efforts recovery, an error that is not Corruption or IOError::kNotFound or IOError::kPathNotFound will be overwritten silently. Fix this by checking all non-ok cases and return early. * When `file_checksum_gen_factory` is set to `GetFileChecksumGenCrc32cFactory()`, BackupEngine will compare the crc32c checksums of table files computed when creating a backup to the expected checksums stored in the DB manifest, and will fail `CreateNewBackup()` on mismatch (corruption). If the `file_checksum_gen_factory` is not set or set to any other customized factory, there is no checksum verification to detect if SST files in a DB are corrupt when read, copied, and independently checksummed by BackupEngine. * When a DB sets `stats_dump_period_sec > 0`, either as the initial value for DB open or as a dynamic option change, the first stats dump is staggered in the following X seconds, where X is an integer in `[0, stats_dump_period_sec)`. Subsequent stats dumps are still spaced `stats_dump_period_sec` seconds apart. - +* When the paranoid_file_checks option is true, a hash is generated of all keys and values are generated when the SST file is written, and then the values are read back in to validate the file. A corruption is signaled if the two hashes do not match. ### Bug fixes * Compressed block cache was automatically disabled with read-only DBs by mistake. Now it is fixed: compressed block cache will be in effective with read-only DB too. * Fix a bug of wrong iterator result if another thread finishes an update and a DB flush between two statement. diff --git a/db/builder.cc b/db/builder.cc index eb3ab5422..510eb36fb 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -93,6 +93,7 @@ Status BuildTable( column_family_name.empty()); // Reports the IOStats for flush for every following bytes. const size_t kReportFlushIOStatsEvery = 1048576; + uint64_t paranoid_hash = 0; Status s; IOStatus io_s; meta->fd.file_size = 0; @@ -110,7 +111,6 @@ Status BuildTable( ioptions.listeners, dbname, column_family_name, fname, job_id, reason); #endif // !ROCKSDB_LITE TableProperties tp; - if (iter->Valid() || !range_del_agg->IsEmpty()) { TableBuilder* builder; std::unique_ptr file_writer; @@ -168,6 +168,11 @@ Status BuildTable( const Slice& key = c_iter.key(); const Slice& value = c_iter.value(); const ParsedInternalKey& ikey = c_iter.ikey(); + if (paranoid_file_checks) { + // Generate a rolling 64-bit hash of the key and values + paranoid_hash = Hash64(key.data(), key.size(), paranoid_hash); + paranoid_hash = Hash64(value.data(), value.size(), paranoid_hash); + } builder->Add(key, value); meta->UpdateBoundaries(key, value, ikey.sequence, ikey.type); @@ -256,9 +261,17 @@ Status BuildTable( /*allow_unprepared_value*/ false)); s = it->status(); if (s.ok() && paranoid_file_checks) { + uint64_t check_hash = 0; for (it->SeekToFirst(); it->Valid(); it->Next()) { + // Generate a rolling 64-bit hash of the key and values + check_hash = Hash64(it->key().data(), it->key().size(), check_hash); + check_hash = + Hash64(it->value().data(), it->value().size(), check_hash); } s = it->status(); + if (s.ok() && check_hash != paranoid_hash) { + s = Status::Corruption("Paraniod checksums do not match"); + } } } } diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 9eaa76f08..13654ef36 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -7,6 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#include "db/compaction/compaction_job.h" + #include #include #include @@ -19,7 +21,6 @@ #include #include "db/builder.h" -#include "db/compaction/compaction_job.h" #include "db/db_impl/db_impl.h" #include "db/db_iter.h" #include "db/dbformat.h" @@ -54,6 +55,7 @@ #include "table/table_builder.h" #include "test_util/sync_point.h" #include "util/coding.h" +#include "util/hash.h" #include "util/mutexlock.h" #include "util/random.h" #include "util/stop_watch.h" @@ -123,6 +125,7 @@ struct CompactionJob::SubcompactionState { struct Output { FileMetaData meta; bool finished; + uint64_t paranoid_hash; std::shared_ptr table_properties; }; @@ -132,7 +135,7 @@ struct CompactionJob::SubcompactionState { std::unique_ptr builder; Output* current_output() { if (outputs.empty()) { - // This subcompaction's outptut could be empty if compaction was aborted + // This subcompaction's output could be empty if compaction was aborted // before this subcompaction had a chance to generate any output files. // When subcompactions are executed sequentially this is more likely and // will be particulalry likely for the later subcompactions to be empty. @@ -202,6 +205,21 @@ struct CompactionJob::SubcompactionState { SubcompactionState& operator=(const SubcompactionState&) = delete; + // Adds the key and value to the builder + // If paranoid is true, adds the key-value to the paranoid hash + void AddToBuilder(const Slice& key, const Slice& value, bool paranoid) { + auto curr = current_output(); + assert(builder != nullptr); + assert(curr != nullptr); + if (paranoid) { + // Generate a rolling 64-bit hash of the key and values + curr->paranoid_hash = Hash64(key.data(), key.size(), curr->paranoid_hash); + curr->paranoid_hash = + Hash64(value.data(), value.size(), curr->paranoid_hash); + } + builder->Add(key, value); + } + // Returns true iff we should stop building the current output // before processing "internal_key". bool ShouldStopBefore(const Slice& internal_key, uint64_t curr_file_size) { @@ -635,20 +653,20 @@ Status CompactionJob::Run() { } if (status.ok()) { thread_pool.clear(); - std::vector files_meta; + std::vector files_output; for (const auto& state : compact_->sub_compact_states) { for (const auto& output : state.outputs) { - files_meta.emplace_back(&output.meta); + files_output.emplace_back(&output); } } ColumnFamilyData* cfd = compact_->compaction->column_family_data(); auto prefix_extractor = compact_->compaction->mutable_cf_options()->prefix_extractor.get(); - std::atomic next_file_meta_idx(0); + std::atomic next_file_idx(0); auto verify_table = [&](Status& output_status) { while (true) { - size_t file_idx = next_file_meta_idx.fetch_add(1); - if (file_idx >= files_meta.size()) { + size_t file_idx = next_file_idx.fetch_add(1); + if (file_idx >= files_output.size()) { break; } // Verify that the table is usable @@ -659,7 +677,8 @@ Status CompactionJob::Run() { // to cache it here for further user reads InternalIterator* iter = cfd->table_cache()->NewIterator( ReadOptions(), file_options_, cfd->internal_comparator(), - *files_meta[file_idx], /*range_del_agg=*/nullptr, prefix_extractor, + files_output[file_idx]->meta, /*range_del_agg=*/nullptr, + prefix_extractor, /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), @@ -673,8 +692,16 @@ Status CompactionJob::Run() { auto s = iter->status(); if (s.ok() && paranoid_file_checks_) { - for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {} + uint64_t hash = 0; + for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { + // Generate a rolling 64-bit hash of the key and values, using the + hash = Hash64(iter->key().data(), iter->key().size(), hash); + hash = Hash64(iter->value().data(), iter->value().size(), hash); + } s = iter->status(); + if (s.ok() && hash != files_output[file_idx]->paranoid_hash) { + s = Status::Corruption("Paraniod checksums do not match"); + } } delete iter; @@ -948,9 +975,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { break; } } - assert(sub_compact->builder != nullptr); - assert(sub_compact->current_output() != nullptr); - sub_compact->builder->Add(key, value); + sub_compact->AddToBuilder(key, value, paranoid_file_checks_); + sub_compact->current_output_file_size = sub_compact->builder->EstimatedFileSize(); const ParsedInternalKey& ikey = c_iter->ikey(); @@ -1246,7 +1272,8 @@ Status CompactionJob::FinishCompactionOutputFile( auto kv = tombstone.Serialize(); assert(lower_bound == nullptr || ucmp->Compare(*lower_bound, kv.second) < 0); - sub_compact->builder->Add(kv.first.Encode(), kv.second); + sub_compact->AddToBuilder(kv.first.Encode(), kv.second, + paranoid_file_checks_); InternalKey smallest_candidate = std::move(kv.first); if (lower_bound != nullptr && ucmp->Compare(smallest_candidate.user_key(), *lower_bound) <= 0) { @@ -1543,6 +1570,7 @@ Status CompactionJob::OpenCompactionOutputFile( out.meta.oldest_ancester_time = oldest_ancester_time; out.meta.file_creation_time = current_time; out.finished = false; + out.paranoid_hash = 0; sub_compact->outputs.push_back(out); } diff --git a/db/corruption_test.cc b/db/corruption_test.cc index cae3ad728..f1f2e471d 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -30,6 +30,7 @@ #include "rocksdb/write_batch.h" #include "table/block_based/block_based_table_builder.h" #include "table/meta_blocks.h" +#include "table/mock_table.h" #include "test_util/testharness.h" #include "test_util/testutil.h" #include "util/random.h" @@ -560,6 +561,59 @@ TEST_F(CorruptionTest, FileSystemStateCorrupted) { } } +static const auto& corruption_modes = {mock::MockTableFactory::kCorruptNone, + mock::MockTableFactory::kCorruptKey, + mock::MockTableFactory::kCorruptValue}; + +TEST_F(CorruptionTest, ParaniodFileChecksOnFlush) { + Options options; + options.paranoid_file_checks = true; + options.create_if_missing = true; + Status s; + for (const auto& mode : corruption_modes) { + delete db_; + s = DestroyDB(dbname_, options); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + mock->SetCorruptionMode(mode); + ASSERT_OK(DB::Open(options, dbname_, &db_)); + Build(10); + s = db_->Flush(FlushOptions()); + if (mode == mock::MockTableFactory::kCorruptNone) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } + } +} + +TEST_F(CorruptionTest, ParaniodFileChecksOnCompact) { + Options options; + options.paranoid_file_checks = true; + options.create_if_missing = true; + Status s; + for (const auto& mode : corruption_modes) { + delete db_; + s = DestroyDB(dbname_, options); + std::shared_ptr mock = + std::make_shared(); + options.table_factory = mock; + ASSERT_OK(DB::Open(options, dbname_, &db_)); + Build(100, 2); + // ASSERT_OK(db_->Flush(FlushOptions())); + DBImpl* dbi = static_cast_with_check(db_); + ASSERT_OK(dbi->TEST_FlushMemTable()); + mock->SetCorruptionMode(mode); + s = dbi->TEST_CompactRange(0, nullptr, nullptr, nullptr, true); + if (mode == mock::MockTableFactory::kCorruptNone) { + ASSERT_OK(s); + } else { + ASSERT_NOK(s); + } + } +} + } // namespace ROCKSDB_NAMESPACE int main(int argc, char** argv) { diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index 574e9390a..5c3a19a34 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -643,6 +643,8 @@ struct AdvancedColumnFamilyOptions { bool optimize_filters_for_hits = false; // After writing every SST file, reopen it and read all the keys. + // Checks the hash of all of the keys and values written versus the + // keys in the file and signals a corruption if they do not match // // Default: false // diff --git a/table/mock_table.cc b/table/mock_table.cc index 30ec9a671..eb198a44b 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -27,6 +27,154 @@ stl_wrappers::KVMap MakeMockFile( return stl_wrappers::KVMap(l, stl_wrappers::LessOfComparator(&icmp_)); } +class MockTableReader : public TableReader { + public: + explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {} + + InternalIterator* NewIterator(const ReadOptions&, + const SliceTransform* prefix_extractor, + Arena* arena, bool skip_filters, + TableReaderCaller caller, + size_t compaction_readahead_size = 0, + bool allow_unprepared_value = false) override; + + Status Get(const ReadOptions& readOptions, const Slice& key, + GetContext* get_context, const SliceTransform* prefix_extractor, + bool skip_filters = false) override; + + uint64_t ApproximateOffsetOf(const Slice& /*key*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + TableReaderCaller /*caller*/) override { + return 0; + } + + size_t ApproximateMemoryUsage() const override { return 0; } + + void SetupForCompaction() override {} + + std::shared_ptr GetTableProperties() const override; + + ~MockTableReader() {} + + private: + const stl_wrappers::KVMap& table_; +}; + +class MockTableIterator : public InternalIterator { + public: + explicit MockTableIterator(const stl_wrappers::KVMap& table) : table_(table) { + itr_ = table_.end(); + } + + bool Valid() const override { return itr_ != table_.end(); } + + void SeekToFirst() override { itr_ = table_.begin(); } + + void SeekToLast() override { + itr_ = table_.end(); + --itr_; + } + + void Seek(const Slice& target) override { + std::string str_target(target.data(), target.size()); + itr_ = table_.lower_bound(str_target); + } + + void SeekForPrev(const Slice& target) override { + std::string str_target(target.data(), target.size()); + itr_ = table_.upper_bound(str_target); + Prev(); + } + + void Next() override { ++itr_; } + + void Prev() override { + if (itr_ == table_.begin()) { + itr_ = table_.end(); + } else { + --itr_; + } + } + + Slice key() const override { return Slice(itr_->first); } + + Slice value() const override { return Slice(itr_->second); } + + Status status() const override { return Status::OK(); } + + private: + const stl_wrappers::KVMap& table_; + stl_wrappers::KVMap::const_iterator itr_; +}; + +class MockTableBuilder : public TableBuilder { + public: + MockTableBuilder(uint32_t id, MockTableFileSystem* file_system, + MockTableFactory::MockCorruptionMode corrupt_mode = + MockTableFactory::kCorruptNone) + : id_(id), file_system_(file_system), corrupt_mode_(corrupt_mode) { + table_ = MakeMockFile({}); + } + + // REQUIRES: Either Finish() or Abandon() has been called. + ~MockTableBuilder() {} + + // Add key,value to the table being constructed. + // REQUIRES: key is after any previously added key according to comparator. + // REQUIRES: Finish(), Abandon() have not been called + void Add(const Slice& key, const Slice& value) override { + if (corrupt_mode_ == MockTableFactory::kCorruptValue) { + // Corrupt the value + table_.insert({key.ToString(), value.ToString() + " "}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } else if (corrupt_mode_ == MockTableFactory::kCorruptKey) { + table_.insert({key.ToString() + " ", value.ToString()}); + corrupt_mode_ = MockTableFactory::kCorruptNone; + } else { + table_.insert({key.ToString(), value.ToString()}); + } + } + + // Return non-ok iff some error has been detected. + Status status() const override { return Status::OK(); } + + // Return non-ok iff some error happens during IO. + IOStatus io_status() const override { return IOStatus::OK(); } + + Status Finish() override { + MutexLock lock_guard(&file_system_->mutex); + file_system_->files.insert({id_, table_}); + return Status::OK(); + } + + void Abandon() override {} + + uint64_t NumEntries() const override { return table_.size(); } + + uint64_t FileSize() const override { return table_.size(); } + + TableProperties GetTableProperties() const override { + return TableProperties(); + } + + // Get file checksum + std::string GetFileChecksum() const override { return kUnknownFileChecksum; } + // Get file checksum function name + const char* GetFileChecksumFuncName() const override { + return kUnknownFileChecksumFuncName; + } + + private: + uint32_t id_; + MockTableFileSystem* file_system_; + int corrupt_mode_; + stl_wrappers::KVMap table_; +}; + InternalIterator* MockTableReader::NewIterator( const ReadOptions&, const SliceTransform* /* prefix_extractor */, Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/, @@ -58,7 +206,8 @@ std::shared_ptr MockTableReader::GetTableProperties() return std::shared_ptr(new TableProperties()); } -MockTableFactory::MockTableFactory() : next_id_(1) {} +MockTableFactory::MockTableFactory() + : next_id_(1), corrupt_mode_(MockTableFactory::kCorruptNone) {} Status MockTableFactory::NewTableReader( const ReadOptions& /*ro*/, @@ -85,7 +234,7 @@ TableBuilder* MockTableFactory::NewTableBuilder( uint32_t /*column_family_id*/, WritableFileWriter* file) const { uint32_t id = GetAndWriteNextID(file); - return new MockTableBuilder(id, &file_system_); + return new MockTableBuilder(id, &file_system_, corrupt_mode_); } Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname, diff --git a/table/mock_table.h b/table/mock_table.h index 7d89fd382..3484dbd2d 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -36,144 +36,14 @@ struct MockTableFileSystem { std::map files; }; -class MockTableReader : public TableReader { - public: - explicit MockTableReader(const stl_wrappers::KVMap& table) : table_(table) {} - - InternalIterator* NewIterator(const ReadOptions&, - const SliceTransform* prefix_extractor, - Arena* arena, bool skip_filters, - TableReaderCaller caller, - size_t compaction_readahead_size = 0, - bool allow_unprepared_value = false) override; - - Status Get(const ReadOptions& readOptions, const Slice& key, - GetContext* get_context, const SliceTransform* prefix_extractor, - bool skip_filters = false) override; - - uint64_t ApproximateOffsetOf(const Slice& /*key*/, - TableReaderCaller /*caller*/) override { - return 0; - } - - uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, - TableReaderCaller /*caller*/) override { - return 0; - } - - size_t ApproximateMemoryUsage() const override { return 0; } - - void SetupForCompaction() override {} - - std::shared_ptr GetTableProperties() const override; - - ~MockTableReader() {} - - private: - const stl_wrappers::KVMap& table_; -}; - -class MockTableIterator : public InternalIterator { - public: - explicit MockTableIterator(const stl_wrappers::KVMap& table) : table_(table) { - itr_ = table_.end(); - } - - bool Valid() const override { return itr_ != table_.end(); } - - void SeekToFirst() override { itr_ = table_.begin(); } - - void SeekToLast() override { - itr_ = table_.end(); - --itr_; - } - - void Seek(const Slice& target) override { - std::string str_target(target.data(), target.size()); - itr_ = table_.lower_bound(str_target); - } - - void SeekForPrev(const Slice& target) override { - std::string str_target(target.data(), target.size()); - itr_ = table_.upper_bound(str_target); - Prev(); - } - - void Next() override { ++itr_; } - - void Prev() override { - if (itr_ == table_.begin()) { - itr_ = table_.end(); - } else { - --itr_; - } - } - - Slice key() const override { return Slice(itr_->first); } - - Slice value() const override { return Slice(itr_->second); } - - Status status() const override { return Status::OK(); } - - private: - const stl_wrappers::KVMap& table_; - stl_wrappers::KVMap::const_iterator itr_; -}; - -class MockTableBuilder : public TableBuilder { - public: - MockTableBuilder(uint32_t id, MockTableFileSystem* file_system) - : id_(id), file_system_(file_system) { - table_ = MakeMockFile({}); - } - - // REQUIRES: Either Finish() or Abandon() has been called. - ~MockTableBuilder() {} - - // Add key,value to the table being constructed. - // REQUIRES: key is after any previously added key according to comparator. - // REQUIRES: Finish(), Abandon() have not been called - void Add(const Slice& key, const Slice& value) override { - table_.insert({key.ToString(), value.ToString()}); - } - - // Return non-ok iff some error has been detected. - Status status() const override { return Status::OK(); } - - // Return non-ok iff some error happens during IO. - IOStatus io_status() const override { return IOStatus::OK(); } - - Status Finish() override { - MutexLock lock_guard(&file_system_->mutex); - file_system_->files.insert({id_, table_}); - return Status::OK(); - } - - void Abandon() override {} - - uint64_t NumEntries() const override { return table_.size(); } - - uint64_t FileSize() const override { return table_.size(); } - - TableProperties GetTableProperties() const override { - return TableProperties(); - } - - // Get file checksum - std::string GetFileChecksum() const override { return kUnknownFileChecksum; } - // Get file checksum function name - const char* GetFileChecksumFuncName() const override { - return kUnknownFileChecksumFuncName; - } - - private: - uint32_t id_; - MockTableFileSystem* file_system_; - stl_wrappers::KVMap table_; -}; - class MockTableFactory : public TableFactory { public: + enum MockCorruptionMode { + kCorruptNone, + kCorruptKey, + kCorruptValue, + }; + MockTableFactory(); const char* Name() const override { return "MockTable"; } using TableFactory::NewTableReader; @@ -202,6 +72,7 @@ class MockTableFactory : public TableFactory { return std::string(); } + void SetCorruptionMode(MockCorruptionMode mode) { corrupt_mode_ = mode; } // This function will assert that only a single file exists and that the // contents are equal to file_contents void AssertSingleFile(const stl_wrappers::KVMap& file_contents); @@ -213,6 +84,7 @@ class MockTableFactory : public TableFactory { mutable MockTableFileSystem file_system_; mutable std::atomic next_id_; + MockCorruptionMode corrupt_mode_; }; } // namespace mock