Fix a recovery corner case (#7621)

Summary:
Consider the following sequence of events:

1. Db flushed an SST with file number N, appended to MANIFEST, and tried to sync the MANIFEST.
2. Syncing MANIFEST failed and db crashed.
3. Db tried to recover with this MANIFEST. In the meantime, no entry about the newly-flushed SST was found in the MANIFEST. Therefore, RocksDB replayed WAL and tried to flush to an SST file reusing the same file number N. This failed because file system does not support overwrite. Then Db deleted this file.
4. Db crashed again.
5. Db tried to recover. When db read the MANIFEST, there was an entry referencing N.sst. This could happen probably because the append in step 1 finally reached the MANIFEST and became visible. Since N.sst had been deleted in step 3, recovery failed.

It is possible that N.sst created in step 1 is valid. Although step 3 would still fail since the MANIFEST was not synced properly in step 1 and 2, deleting N.sst would make it impossible for the db to recover even if the remaining part of MANIFEST was appended and visible after step 5.

After this PR, in step 3, immediately after recovering from MANIFEST, a new MANIFEST is created, then we find that N.sst is not referenced in the MANIFEST, so we delete it, and we'll not reuse N as file number. Then in step 5, since the new MANIFEST does not contain N.sst, the recovery failure situation in step 5 won't happen.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7621

Test Plan:
1. some tests are updated, because these tests assume that new MANIFEST is created after WAL recovery.
2. a new unit test is added in db_basic_test to simulate step 3.

Reviewed By: riversand963

Differential Revision: D24668144

Pulled By: cheng-chang

fbshipit-source-id: 90d7487fbad2bc3714f5ede46ea949895b15ae3b
main
Cheng Chang 4 years ago committed by Facebook GitHub Bot
parent 8b8a2e9f05
commit 5e794b0841
  1. 11
      db/cuckoo_table_db_test.cc
  2. 37
      db/db_basic_test.cc
  3. 14
      db/db_impl/db_impl.h
  4. 40
      db/db_impl/db_impl_files.cc
  5. 34
      db/db_impl/db_impl_open.cc
  6. 10
      db/db_range_del_test.cc
  7. 1
      db_stress_tool/db_stress_test_base.cc
  8. 2
      env/mock_env.cc
  9. 6
      java/src/test/java/org/rocksdb/RocksDBTest.java
  10. 8
      utilities/backupable/backupable_db_test.cc

@ -63,6 +63,15 @@ class CuckooTableDBTest : public testing::Test {
ASSERT_OK(DB::Open(opts, dbname_, &db_));
}
void DestroyAndReopen(Options* options) {
assert(options);
ASSERT_OK(db_->Close());
delete db_;
db_ = nullptr;
ASSERT_OK(DestroyDB(dbname_, *options));
Reopen(options);
}
Status Put(const Slice& k, const Slice& v) {
return db_->Put(WriteOptions(), k, v);
}
@ -205,7 +214,7 @@ static std::string Uint64Key(uint64_t i) {
TEST_F(CuckooTableDBTest, Uint64Comparator) {
Options options = CurrentOptions();
options.comparator = test::Uint64Comparator();
Reopen(&options);
DestroyAndReopen(&options);
ASSERT_OK(Put(Uint64Key(1), "v1"));
ASSERT_OK(Put(Uint64Key(2), "v2"));

@ -2279,6 +2279,43 @@ class TableFileListener : public EventListener {
};
} // namespace
TEST_F(DBBasicTest, LastSstFileNotInManifest) {
// If the last sst file is not tracked in MANIFEST,
// or the VersionEdit for the last sst file is not synced,
// on recovery, the last sst file should be deleted,
// and new sst files shouldn't reuse its file number.
Options options = CurrentOptions();
DestroyAndReopen(options);
Close();
// Manually add a sst file.
constexpr uint64_t kSstFileNumber = 100;
const std::string kSstFile = MakeTableFileName(dbname_, kSstFileNumber);
ASSERT_OK(WriteStringToFile(env_, /* data = */ "bad sst file content",
/* fname = */ kSstFile,
/* should_sync = */ true));
ASSERT_OK(env_->FileExists(kSstFile));
TableFileListener* listener = new TableFileListener();
options.listeners.emplace_back(listener);
Reopen(options);
// kSstFile should already be deleted.
ASSERT_TRUE(env_->FileExists(kSstFile).IsNotFound());
ASSERT_OK(Put("k", "v"));
ASSERT_OK(Flush());
// New sst file should have file number > kSstFileNumber.
std::vector<std::string>& files =
listener->GetFiles(kDefaultColumnFamilyName);
ASSERT_EQ(files.size(), 1);
const std::string fname = files[0].erase(0, (dbname_ + "/").size());
uint64_t number = 0;
FileType type = kTableFile;
ASSERT_TRUE(ParseFileName(fname, &number, &type));
ASSERT_EQ(type, kTableFile);
ASSERT_GT(number, kSstFileNumber);
}
TEST_F(DBBasicTest, RecoverWithMissingFiles) {
Options options = CurrentOptions();
DestroyAndReopen(options);

@ -1215,14 +1215,22 @@ class DBImpl : public DB {
virtual bool OwnTablesAndLogs() const { return true; }
// Set DB identity file, and write DB ID to manifest if necessary.
Status SetDBId();
// REQUIRES: db mutex held when calling this function, but the db mutex can
// be released and re-acquired. Db mutex will be held when the function
// returns.
// After best-efforts recovery, there may be SST files in db/cf paths that are
// not referenced in the MANIFEST. We delete these SST files. In the
// After recovery, there may be SST files in db/cf paths that are
// not referenced in the MANIFEST (e.g.
// 1. It's best effort recovery;
// 2. The VersionEdits referencing the SST files are appended to
// MANIFEST, DB crashes when syncing the MANIFEST, the VersionEdits are
// still not synced to MANIFEST during recovery.)
// We delete these SST files. In the
// meantime, we find out the largest file number present in the paths, and
// bump up the version set's next_file_number_ to be 1 + largest_file_number.
Status FinishBestEffortsRecovery();
Status DeleteUnreferencedSstFiles();
// SetDbSessionId() should be called in the constuctor DBImpl()
// to ensure that db_session_id_ gets updated every time the DB is opened

@ -751,7 +751,43 @@ uint64_t PrecomputeMinLogNumberToKeep2PC(
return min_log_number_to_keep;
}
Status DBImpl::FinishBestEffortsRecovery() {
Status DBImpl::SetDBId() {
Status s;
// Happens when immutable_db_options_.write_dbid_to_manifest is set to true
// the very first time.
if (db_id_.empty()) {
// Check for the IDENTITY file and create it if not there.
s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
// Typically Identity file is created in NewDB() and for some reason if
// it is no longer available then at this point DB ID is not in Identity
// file or Manifest.
if (s.IsNotFound()) {
s = SetIdentityFile(env_, dbname_);
if (!s.ok()) {
return s;
}
} else if (!s.ok()) {
assert(s.IsIOError());
return s;
}
s = GetDbIdentityFromIdentityFile(&db_id_);
if (immutable_db_options_.write_dbid_to_manifest && s.ok()) {
VersionEdit edit;
edit.SetDBId(db_id_);
Options options;
MutableCFOptions mutable_cf_options(options);
versions_->db_id_ = db_id_;
s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
mutable_cf_options, &edit, &mutex_, nullptr,
/* new_descriptor_log */ false);
}
} else {
s = SetIdentityFile(env_, dbname_, db_id_);
}
return s;
}
Status DBImpl::DeleteUnreferencedSstFiles() {
mutex_.AssertHeld();
std::vector<std::string> paths;
paths.push_back(NormalizePath(dbname_ + std::string(1, kFilePathSeparator)));
@ -807,8 +843,6 @@ Status DBImpl::FinishBestEffortsRecovery() {
assert(versions_->GetColumnFamilySet());
ColumnFamilyData* default_cfd = versions_->GetColumnFamilySet()->GetDefault();
assert(default_cfd);
// Even if new_descriptor_log is false, we will still switch to a new
// MANIFEST and update CURRENT file, since this is in recovery.
s = versions_->LogAndApply(
default_cfd, *default_cfd->GetLatestMutableCFOptions(), &edit, &mutex_,
directories_.GetDbDir(), /*new_descriptor_log*/ false);

@ -479,42 +479,14 @@ Status DBImpl::Recover(
// TryRecover may delete previous column_family_set_.
column_family_memtables_.reset(
new ColumnFamilyMemTablesImpl(versions_->GetColumnFamilySet()));
s = FinishBestEffortsRecovery();
}
}
if (!s.ok()) {
return s;
}
// Happens when immutable_db_options_.write_dbid_to_manifest is set to true
// the very first time.
if (db_id_.empty()) {
// Check for the IDENTITY file and create it if not there.
s = fs_->FileExists(IdentityFileName(dbname_), IOOptions(), nullptr);
// Typically Identity file is created in NewDB() and for some reason if
// it is no longer available then at this point DB ID is not in Identity
// file or Manifest.
if (s.IsNotFound()) {
s = SetIdentityFile(env_, dbname_);
if (!s.ok()) {
return s;
}
} else if (!s.ok()) {
assert(s.IsIOError());
return s;
}
s = GetDbIdentityFromIdentityFile(&db_id_);
if (immutable_db_options_.write_dbid_to_manifest && s.ok()) {
VersionEdit edit;
edit.SetDBId(db_id_);
Options options;
MutableCFOptions mutable_cf_options(options);
versions_->db_id_ = db_id_;
s = versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(),
mutable_cf_options, &edit, &mutex_, nullptr,
false);
}
} else {
s = SetIdentityFile(env_, dbname_, db_id_);
s = SetDBId();
if (s.ok() && !read_only) {
s = DeleteUnreferencedSstFiles();
}
if (immutable_db_options_.paranoid_checks && s.ok()) {

@ -269,7 +269,7 @@ TEST_F(DBRangeDelTest, FlushRemovesCoveredKeys) {
const int kNum = 300, kRangeBegin = 50, kRangeEnd = 250;
Options opts = CurrentOptions();
opts.comparator = test::Uint64Comparator();
Reopen(opts);
DestroyAndReopen(opts);
// Write a third before snapshot, a third between snapshot and tombstone, and
// a third after the tombstone. Keys older than snapshot or newer than the
@ -309,7 +309,7 @@ TEST_F(DBRangeDelTest, CompactionRemovesCoveredKeys) {
opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
opts.num_levels = 2;
opts.statistics = CreateDBStatistics();
Reopen(opts);
DestroyAndReopen(opts);
for (int i = 0; i < kNumFiles; ++i) {
if (i > 0) {
@ -603,7 +603,7 @@ TEST_F(DBRangeDelTest, TableEvictedDuringScan) {
bbto.cache_index_and_filter_blocks = true;
bbto.block_cache = NewLRUCache(8 << 20);
opts.table_factory.reset(NewBlockBasedTableFactory(bbto));
Reopen(opts);
DestroyAndReopen(opts);
// Hold a snapshot so range deletions can't become obsolete during compaction
// to bottommost level (i.e., L1).
@ -761,7 +761,7 @@ TEST_F(DBRangeDelTest, IteratorRemovesCoveredKeys) {
Options opts = CurrentOptions();
opts.comparator = test::Uint64Comparator();
opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
Reopen(opts);
DestroyAndReopen(opts);
// Write half of the keys before the tombstone and half after the tombstone.
// Only covered keys (i.e., within the range and older than the tombstone)
@ -794,7 +794,7 @@ TEST_F(DBRangeDelTest, IteratorOverUserSnapshot) {
Options opts = CurrentOptions();
opts.comparator = test::Uint64Comparator();
opts.memtable_factory.reset(new SpecialSkipListFactory(kNumPerFile));
Reopen(opts);
DestroyAndReopen(opts);
const Snapshot* snapshot = nullptr;
// Put a snapshot before the range tombstone, verify an iterator using that

@ -2066,7 +2066,6 @@ void StressTest::Open() {
FLAGS_level_compaction_dynamic_level_bytes;
options_.file_checksum_gen_factory =
GetFileChecksumImpl(FLAGS_file_checksum_impl);
options_.track_and_verify_wals_in_manifest = true;
} else {
#ifdef ROCKSDB_LITE
fprintf(stderr, "--options_file not supported in lite mode\n");

2
env/mock_env.cc vendored

@ -601,7 +601,7 @@ class MockFileSystem : public FileSystem {
std::string NormalizeMockPath(const std::string& path) {
std::string p = NormalizePath(path);
if (p.back() == '/' && p.size() > 1) {
if (p.back() == kFilePathSeparator && p.size() > 1) {
p.pop_back();
}
return p;

@ -1456,11 +1456,11 @@ public class RocksDBTest {
try (final RocksDB db = RocksDB.open(options, dbPath)) {
final RocksDB.LiveFiles livefiles = db.getLiveFiles(true);
assertThat(livefiles).isNotNull();
assertThat(livefiles.manifestFileSize).isEqualTo(13);
assertThat(livefiles.manifestFileSize).isEqualTo(57);
assertThat(livefiles.files.size()).isEqualTo(3);
assertThat(livefiles.files.get(0)).isEqualTo("/CURRENT");
assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000001");
assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000005");
assertThat(livefiles.files.get(1)).isEqualTo("/MANIFEST-000003");
assertThat(livefiles.files.get(2)).isEqualTo("/OPTIONS-000006");
}
}
}

@ -2500,19 +2500,19 @@ TEST_F(BackupableDBTest, GarbageCollectionBeforeBackup) {
OpenDBAndBackupEngine(true);
backup_chroot_env_->CreateDirIfMissing(backupdir_ + "/shared");
std::string file_five = backupdir_ + "/shared/000007.sst";
std::string file_five = backupdir_ + "/shared/000008.sst";
std::string file_five_contents = "I'm not really a sst file";
// this depends on the fact that 00007.sst is the first file created by the DB
// this depends on the fact that 00008.sst is the first file created by the DB
ASSERT_OK(file_manager_->WriteToFile(file_five, file_five_contents));
FillDB(db_.get(), 0, 100);
// backup overwrites file 000007.sst
// backup overwrites file 000008.sst
ASSERT_TRUE(backup_engine_->CreateNewBackup(db_.get(), true).ok());
std::string new_file_five_contents;
ASSERT_OK(ReadFileToString(backup_chroot_env_.get(), file_five,
&new_file_five_contents));
// file 000007.sst was overwritten
// file 000008.sst was overwritten
ASSERT_TRUE(new_file_five_contents != file_five_contents);
CloseDBAndBackupEngine();

Loading…
Cancel
Save