Add a unit test to reproduce a corruption bug (#5851)

Summary:
This is a bug occaionally shows up in crash test, and this unit test is to reproduce it. The bug is following:
1. Database has multiple CFs.
2. Between one DB restart, the last log file is corrupted in the middle (not the tail)
3. During restart, DB crashes between flushes between two CFs.
The DB will fail to be opened again with error "SST file is ahead of WALs"
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5851

Test Plan: Run the test itself.

Differential Revision: D17614721

fbshipit-source-id: 1b0abce49b203a76a039e38e76bc940429975f20
main
sdong 5 years ago committed by Facebook Github Bot
parent 6652c94f59
commit 76e951dbb1
  1. 2
      db/db_impl/db_impl_open.cc
  2. 60
      db/db_test2.cc

@ -1053,6 +1053,8 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
versions_->MarkFileNumberUsed(max_log_number + 1);
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
edit, &mutex_);
TEST_SYNC_POINT_CALLBACK("DBImpl::RecoverLogFiles:AfterLogAndApply",
nullptr);
if (!status.ok()) {
// Recovery failed
break;

@ -16,6 +16,7 @@
#include "port/stack_trace.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/wal_filter.h"
#include "test_util/fault_injection_test_env.h"
namespace rocksdb {
@ -4070,6 +4071,65 @@ TEST_F(DBTest2, RowCacheSnapshot) {
db_->ReleaseSnapshot(s3);
}
#endif // ROCKSDB_LITE
// Disabled but the test is failing.
// When DB is reopened with multiple column families, the manifest file
// is written after the first CF is flushed, and it is written again
// after each flush. If DB crashes between the flushes, the flushed CF
// flushed will pass the latest log file, and now we require it not
// to be corrupted, and triggering a corruption report.
// We need to fix the bug and enable the test.
TEST_F(DBTest2, DISABLED_CrashInRecoveryMultipleCF) {
Options options = CurrentOptions();
options.create_if_missing = true;
options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery;
CreateAndReopenWithCF({"pikachu"}, options);
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Flush());
ASSERT_OK(Put(1, "foo", "bar"));
ASSERT_OK(Flush(1));
ASSERT_OK(Put("foo", "bar"));
ASSERT_OK(Put(1, "foo", "bar"));
// The value is large enough to be divided to two blocks.
std::string large_value(400, ' ');
ASSERT_OK(Put("foo1", large_value));
ASSERT_OK(Put("foo2", large_value));
Close();
// Corrupt the log file in the middle, so that it is not corrupted
// in the tail.
std::vector<std::string> filenames;
ASSERT_OK(env_->GetChildren(dbname_, &filenames));
for (const auto& f : filenames) {
uint64_t number;
FileType type;
if (ParseFileName(f, &number, &type) && type == FileType::kLogFile) {
std::string fname = dbname_ + "/" + f;
std::string file_content;
ASSERT_OK(ReadFileToString(env_, fname, &file_content));
file_content[400] = 'h';
file_content[401] = 'a';
ASSERT_OK(WriteStringToFile(env_, file_content, fname));
break;
}
}
// Reopen and freeze the file system after the first manifest write.
FaultInjectionTestEnv fit_env(options.env);
options.env = &fit_env;
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"DBImpl::RecoverLogFiles:AfterLogAndApply",
[&](void* /*arg*/) { fit_env.SetFilesystemActive(false); });
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
ASSERT_NOK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
rocksdb::SyncPoint::GetInstance()->DisableProcessing();
fit_env.SetFilesystemActive(true);
// If we continue using failure ingestion Env, it will conplain something
// when renaming current file, which is not expected. Need to investigate why.
options.env = env_;
ASSERT_OK(TryReopenWithColumnFamilies({"default", "pikachu"}, options));
}
} // namespace rocksdb
#ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS

Loading…
Cancel
Save