Make best-efforts recovery verify SST unique ID before Version construction (#10962)

Summary:
The check for SST unique IDs added to best-efforts recovery (`Options::best_efforts_recovery` is true).

With best_efforts_recovery being true, RocksDB will recover to the latest point in
MANIFEST such that all valid SST files included up to this point pass unique ID checks as well.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/10962

Test Plan: make check

Reviewed By: pdillinger

Differential Revision: D41378241

Pulled By: riversand963

fbshipit-source-id: a036064e2c17dec13d080a24ef2a9f85d607b16c
main
Yanqin Jin 2 years ago committed by Facebook GitHub Bot
parent d8e792e4cf
commit 3d0d6b8140
  1. 2
      HISTORY.md
  2. 74
      db/db_test2.cc
  3. 33
      db/version_edit_handler.cc
  4. 16
      db/version_edit_handler.h
  5. 37
      db/version_set.cc
  6. 4
      db/version_set.h
  7. 41
      include/rocksdb/options.h

@ -1,5 +1,7 @@
# Rocksdb Change Log # Rocksdb Change Log
## Unreleased ## Unreleased
### Behavior changes
* Make best-efforts recovery verify SST unique ID before Version construction (#10962)
## 7.9.0 (11/21/2022) ## 7.9.0 (11/21/2022)
### Performance Improvements ### Performance Improvements

@ -7509,6 +7509,80 @@ TEST_F(DBTest2, SstUniqueIdVerifyMultiCFs) {
ASSERT_TRUE(s.IsCorruption()); ASSERT_TRUE(s.IsCorruption());
} }
TEST_F(DBTest2, BestEffortsRecoveryWithSstUniqueIdVerification) {
const auto tamper_with_uniq_id = [&](void* arg) {
auto props = static_cast<TableProperties*>(arg);
assert(props);
// update table property session_id to a different one
props->db_session_id = DBImpl::GenerateDbSessionId(nullptr);
};
const auto assert_db = [&](size_t expected_count,
const std::string& expected_v) {
std::unique_ptr<Iterator> it(db_->NewIterator(ReadOptions()));
size_t cnt = 0;
for (it->SeekToFirst(); it->Valid(); it->Next(), ++cnt) {
ASSERT_EQ(std::to_string(cnt), it->key());
ASSERT_EQ(expected_v, it->value());
}
ASSERT_EQ(expected_count, cnt);
};
const int num_l0_compaction_trigger = 8;
const int num_l0 = num_l0_compaction_trigger - 1;
Options options = CurrentOptions();
options.level0_file_num_compaction_trigger = num_l0_compaction_trigger;
for (int k = 0; k < num_l0; ++k) {
// Allow mismatch for now
options.verify_sst_unique_id_in_manifest = false;
DestroyAndReopen(options);
constexpr size_t num_keys_per_file = 10;
for (int i = 0; i < num_l0; ++i) {
for (size_t j = 0; j < num_keys_per_file; ++j) {
ASSERT_OK(Put(std::to_string(j), "v" + std::to_string(i)));
}
if (i == k) {
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->SetCallBack(
"PropertyBlockBuilder::AddTableProperty:Start",
tamper_with_uniq_id);
SyncPoint::GetInstance()->EnableProcessing();
}
ASSERT_OK(Flush());
}
options.verify_sst_unique_id_in_manifest = true;
Status s = TryReopen(options);
ASSERT_TRUE(s.IsCorruption());
options.best_efforts_recovery = true;
Reopen(options);
assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
// Reopen with regular recovery
options.best_efforts_recovery = false;
Reopen(options);
assert_db(k == 0 ? 0 : num_keys_per_file, "v" + std::to_string(k - 1));
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
for (size_t i = 0; i < num_keys_per_file; ++i) {
ASSERT_OK(Put(std::to_string(i), "v"));
}
ASSERT_OK(Flush());
Reopen(options);
{
for (size_t i = 0; i < num_keys_per_file; ++i) {
ASSERT_EQ("v", Get(std::to_string(i)));
}
}
}
}
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
TEST_F(DBTest2, GetLatestSeqAndTsForKey) { TEST_F(DBTest2, GetLatestSeqAndTsForKey) {
Destroy(last_options_); Destroy(last_options_);

@ -734,12 +734,13 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
assert(!cfd->ioptions()->cf_paths.empty()); assert(!cfd->ioptions()->cf_paths.empty());
Status s; Status s;
for (const auto& elem : edit.GetNewFiles()) { for (const auto& elem : edit.GetNewFiles()) {
int level = elem.first;
const FileMetaData& meta = elem.second; const FileMetaData& meta = elem.second;
const FileDescriptor& fd = meta.fd; const FileDescriptor& fd = meta.fd;
uint64_t file_num = fd.GetNumber(); uint64_t file_num = fd.GetNumber();
const std::string fpath = const std::string fpath =
MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num); MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num);
s = VerifyFile(fpath, meta); s = VerifyFile(cfd, fpath, level, meta);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) { if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
missing_files.insert(file_num); missing_files.insert(file_num);
s = Status::OK(); s = Status::OK();
@ -804,6 +805,18 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
auto* version = new Version(cfd, version_set_, version_set_->file_options_, auto* version = new Version(cfd, version_set_, version_set_->file_options_,
*cfd->GetLatestMutableCFOptions(), io_tracer_, *cfd->GetLatestMutableCFOptions(), io_tracer_,
version_set_->current_version_number_++); version_set_->current_version_number_++);
s = builder->LoadTableHandlers(
cfd->internal_stats(),
version_set_->db_options_->max_file_opening_threads, false, true,
cfd->GetLatestMutableCFOptions()->prefix_extractor,
MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()));
if (!s.ok()) {
delete version;
if (s.IsCorruption()) {
s = Status::OK();
}
return s;
}
s = builder->SaveTo(version->storage_info()); s = builder->SaveTo(version->storage_info());
if (s.ok()) { if (s.ok()) {
version->PrepareAppend( version->PrepareAppend(
@ -823,9 +836,11 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
return s; return s;
} }
Status VersionEditHandlerPointInTime::VerifyFile(const std::string& fpath, Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd,
const std::string& fpath,
int level,
const FileMetaData& fmeta) { const FileMetaData& fmeta) {
return version_set_->VerifyFileMetadata(fpath, fmeta); return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta);
} }
Status VersionEditHandlerPointInTime::VerifyBlobFile( Status VersionEditHandlerPointInTime::VerifyBlobFile(
@ -843,6 +858,12 @@ Status VersionEditHandlerPointInTime::VerifyBlobFile(
return s; return s;
} }
Status VersionEditHandlerPointInTime::LoadTables(
ColumnFamilyData* /*cfd*/, bool /*prefetch_index_and_filter_in_cache*/,
bool /*is_initial_load*/) {
return Status::OK();
}
Status ManifestTailer::Initialize() { Status ManifestTailer::Initialize() {
if (Mode::kRecovery == mode_) { if (Mode::kRecovery == mode_) {
return VersionEditHandler::Initialize(); return VersionEditHandler::Initialize();
@ -930,9 +951,11 @@ void ManifestTailer::CheckIterationResult(const log::Reader& reader,
} }
} }
Status ManifestTailer::VerifyFile(const std::string& fpath, Status ManifestTailer::VerifyFile(ColumnFamilyData* cfd,
const std::string& fpath, int level,
const FileMetaData& fmeta) { const FileMetaData& fmeta) {
Status s = VersionEditHandlerPointInTime::VerifyFile(fpath, fmeta); Status s =
VersionEditHandlerPointInTime::VerifyFile(cfd, fpath, level, fmeta);
// TODO: Open file or create hard link to prevent the file from being // TODO: Open file or create hard link to prevent the file from being
// deleted. // deleted.
return s; return s;

@ -164,9 +164,9 @@ class VersionEditHandler : public VersionEditHandlerBase {
ColumnFamilyData* cfd, ColumnFamilyData* cfd,
bool force_create_version); bool force_create_version);
Status LoadTables(ColumnFamilyData* cfd, virtual Status LoadTables(ColumnFamilyData* cfd,
bool prefetch_index_and_filter_in_cache, bool prefetch_index_and_filter_in_cache,
bool is_initial_load); bool is_initial_load);
virtual bool MustOpenAllColumnFamilies() const { return !read_only_; } virtual bool MustOpenAllColumnFamilies() const { return !read_only_; }
@ -213,11 +213,15 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {
ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override; ColumnFamilyData* DestroyCfAndCleanup(const VersionEdit& edit) override;
Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd, Status MaybeCreateVersion(const VersionEdit& edit, ColumnFamilyData* cfd,
bool force_create_version) override; bool force_create_version) override;
virtual Status VerifyFile(const std::string& fpath, virtual Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath,
const FileMetaData& fmeta); int level, const FileMetaData& fmeta);
virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num, virtual Status VerifyBlobFile(ColumnFamilyData* cfd, uint64_t blob_file_num,
const BlobFileAddition& blob_addition); const BlobFileAddition& blob_addition);
Status LoadTables(ColumnFamilyData* cfd,
bool prefetch_index_and_filter_in_cache,
bool is_initial_load) override;
std::unordered_map<uint32_t, Version*> versions_; std::unordered_map<uint32_t, Version*> versions_;
}; };
@ -250,7 +254,7 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
void CheckIterationResult(const log::Reader& reader, Status* s) override; void CheckIterationResult(const log::Reader& reader, Status* s) override;
Status VerifyFile(const std::string& fpath, Status VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level,
const FileMetaData& fmeta) override; const FileMetaData& fmeta) override;
enum Mode : uint8_t { enum Mode : uint8_t {

@ -6704,8 +6704,9 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) {
return all_versions_blob_file_size; return all_versions_blob_file_size;
} }
Status VersionSet::VerifyFileMetadata(const std::string& fpath, Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd,
const FileMetaData& meta) const { const std::string& fpath, int level,
const FileMetaData& meta) {
uint64_t fsize = 0; uint64_t fsize = 0;
Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr); Status status = fs_->GetFileSize(fpath, IOOptions(), &fsize, nullptr);
if (status.ok()) { if (status.ok()) {
@ -6713,6 +6714,38 @@ Status VersionSet::VerifyFileMetadata(const std::string& fpath,
status = Status::Corruption("File size mismatch: " + fpath); status = Status::Corruption("File size mismatch: " + fpath);
} }
} }
if (status.ok() && db_options_->verify_sst_unique_id_in_manifest) {
assert(cfd);
TableCache* table_cache = cfd->table_cache();
assert(table_cache);
const MutableCFOptions* const cf_opts = cfd->GetLatestMutableCFOptions();
assert(cf_opts);
std::shared_ptr<const SliceTransform> pe = cf_opts->prefix_extractor;
size_t max_sz_for_l0_meta_pin = MaxFileSizeForL0MetaPin(*cf_opts);
const FileOptions& file_opts = file_options();
Version* version = cfd->current();
assert(version);
VersionStorageInfo& storage_info = version->storage_info_;
const InternalKeyComparator* icmp = storage_info.InternalComparator();
assert(icmp);
InternalStats* internal_stats = cfd->internal_stats();
FileMetaData meta_copy = meta;
status = table_cache->FindTable(
ReadOptions(), file_opts, *icmp, meta_copy,
&(meta_copy.table_reader_handle), pe,
/*no_io=*/false, /*record_read_stats=*/true,
internal_stats->GetFileReadHist(level), false, level,
/*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin,
meta_copy.temperature);
if (meta_copy.table_reader_handle) {
table_cache->ReleaseHandle(meta_copy.table_reader_handle);
}
}
return status; return status;
} }

@ -1501,8 +1501,8 @@ class VersionSet {
ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
const VersionEdit* edit); const VersionEdit* edit);
Status VerifyFileMetadata(const std::string& fpath, Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath,
const FileMetaData& meta) const; int level, const FileMetaData& meta);
// Protected by DB mutex. // Protected by DB mutex.
WalSet wals_; WalSet wals_;

@ -1285,30 +1285,37 @@ struct DBOptions {
// Default: nullptr // Default: nullptr
std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr; std::shared_ptr<FileChecksumGenFactory> file_checksum_gen_factory = nullptr;
// By default, RocksDB recovery fails if any table/blob file referenced in // By default, RocksDB recovery fails if any table/blob file referenced in the
// final version reconstructed from the
// MANIFEST are missing after scanning the MANIFEST pointed to by the // MANIFEST are missing after scanning the MANIFEST pointed to by the
// CURRENT file. // CURRENT file. It can also fail if verification of unique SST id fails.
// Best-efforts recovery is another recovery mode that tolerates missing or // Best-efforts recovery is another recovery mode that does not necessarily
// corrupted table or blob files. // fail when certain table/blob files are missing/corrupted or have mismatched
// unique id table property. Instead, best-efforts recovery recovers each
// column family to a point in the MANIFEST that corresponds to a version. In
// such a version, all valid table/blob files referenced have the expected
// file size. For table files, their unique id table property match the
// MANIFEST.
//
// Best-efforts recovery does not need a valid CURRENT file, and tries to // Best-efforts recovery does not need a valid CURRENT file, and tries to
// recover the database using one of the available MANIFEST files in the db // recover the database using one of the available MANIFEST files in the db
// directory. // directory.
// Best-efforts recovery recovers database to a state in which the database
// includes only table and blob files whose actual sizes match the
// information in the chosen MANIFEST without holes in the history.
// Best-efforts recovery tries the available MANIFEST files from high file // Best-efforts recovery tries the available MANIFEST files from high file
// numbers (newer) to low file numbers (older), and stops after finding the // numbers (newer) to low file numbers (older), and stops after finding the
// first MANIFEST file from which the db can be recovered to a state without // first MANIFEST file from which the db can be recovered to a state without
// invalid (missing/file-mismatch) table and blob files. // invalid (missing/filesize-mismatch/unique-id-mismatch) table and blob
// It is possible that the database can be restored to an empty state with no // files. It is possible that the database can be restored to an empty state
// table or blob files. // with no table or blob files.
// Regardless of this option, the IDENTITY file is updated if needed during //
// recovery to match the DB ID in the MANIFEST (if previously using // Regardless of this option, the IDENTITY file
// write_dbid_to_manifest) or to be in some valid state (non-empty DB ID). // is updated if needed during recovery to match the DB ID in the MANIFEST (if
// Currently, not compatible with atomic flush. Furthermore, WAL files will // previously using write_dbid_to_manifest) or to be in some valid state
// not be used for recovery if best_efforts_recovery is true. // (non-empty DB ID). Currently, not compatible with atomic flush.
// Also requires either 1) LOCK file exists or 2) underlying env's LockFile() // Furthermore, WAL files will not be used for recovery if
// call returns ok even for non-existing LOCK file. // best_efforts_recovery is true. Also requires either 1) LOCK file exists or
// 2) underlying env's LockFile() call returns ok even for non-existing LOCK
// file.
//
// Default: false // Default: false
bool best_efforts_recovery = false; bool best_efforts_recovery = false;

Loading…
Cancel
Save