Improve db recovery

Summary: Avoid creating unnecessary SST files while opening the DB

Test Plan: make all check

Reviewers: sdong, igor

Reviewed By: igor

Subscribers: zagfox, yhchiang, ljin, leveldb

Differential Revision: https://reviews.facebook.net/D20661
main
Stanislau Hlebik 10 years ago
parent 6bb7e3ef25
commit d343c3fe46
  1. 79
      db/db_impl.cc
  2. 5
      db/db_impl.h
  3. 130
      db/db_test.cc

@ -1219,14 +1219,16 @@ Status DBImpl::Recover(
"flag but a log file already exists"); "flag but a log file already exists");
} }
if (!logs.empty()) {
// Recover in the order in which the logs were generated // Recover in the order in which the logs were generated
std::sort(logs.begin(), logs.end()); std::sort(logs.begin(), logs.end());
for (const auto& log : logs) { s = RecoverLogFiles(logs, &max_sequence, read_only);
// The previous incarnation may not have written any MANIFEST if (!s.ok()) {
// records after allocating this log number. So we manually // Clear memtables if recovery failed
// update the file number allocation counter in VersionSet. for (auto cfd : *versions_->GetColumnFamilySet()) {
versions_->MarkFileNumberUsed(log); cfd->CreateNewMemtable();
s = RecoverLogFile(log, &max_sequence, read_only); }
}
} }
SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence()); SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence());
} }
@ -1239,8 +1241,9 @@ Status DBImpl::Recover(
return s; return s;
} }
Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, // REQUIRES: log_numbers are sorted in ascending order
bool read_only) { Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
SequenceNumber* max_sequence, bool read_only) {
struct LogReporter : public log::Reader::Reporter { struct LogReporter : public log::Reader::Reporter {
Env* env; Env* env;
Logger* info_log; Logger* info_log;
@ -1256,7 +1259,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
}; };
mutex_.AssertHeld(); mutex_.AssertHeld();
Status status;
std::unordered_map<int, VersionEdit> version_edits; std::unordered_map<int, VersionEdit> version_edits;
// no need to refcount because iteration is under mutex // no need to refcount because iteration is under mutex
for (auto cfd : *versions_->GetColumnFamilySet()) { for (auto cfd : *versions_->GetColumnFamilySet()) {
@ -1265,13 +1268,24 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
version_edits.insert({cfd->GetID(), edit}); version_edits.insert({cfd->GetID(), edit});
} }
for (auto log_number : log_numbers) {
// The previous incarnation may not have written any MANIFEST
// records after allocating this log number. So we manually
// update the file number allocation counter in VersionSet.
versions_->MarkFileNumberUsed(log_number);
// Open the log file // Open the log file
std::string fname = LogFileName(db_options_.wal_dir, log_number); std::string fname = LogFileName(db_options_.wal_dir, log_number);
unique_ptr<SequentialFile> file; unique_ptr<SequentialFile> file;
Status status = env_->NewSequentialFile(fname, &file, env_options_); status = env_->NewSequentialFile(fname, &file, env_options_);
if (!status.ok()) { if (!status.ok()) {
MaybeIgnoreError(&status); MaybeIgnoreError(&status);
if (!status.ok()) {
return status; return status;
} else {
// Fail with one log file, but that's ok.
// Try next one.
continue;
}
} }
// Create the log reader. // Create the log reader.
@ -1279,8 +1293,9 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
reporter.env = env_; reporter.env = env_;
reporter.info_log = db_options_.info_log.get(); reporter.info_log = db_options_.info_log.get();
reporter.fname = fname.c_str(); reporter.fname = fname.c_str();
reporter.status = (db_options_.paranoid_checks && reporter.status =
!db_options_.skip_log_error_on_recovery ? &status (db_options_.paranoid_checks && !db_options_.skip_log_error_on_recovery
? &status
: nullptr); : nullptr);
// We intentially make log::Reader do checksumming even if // We intentially make log::Reader do checksumming even if
// paranoid_checks==false so that corruptions cause entire commits // paranoid_checks==false so that corruptions cause entire commits
@ -1304,19 +1319,17 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
// If column family was not found, it might mean that the WAL write // If column family was not found, it might mean that the WAL write
// batch references to the column family that was dropped after the // batch references to the column family that was dropped after the
// insert. We don't want to fail the whole write batch in that case -- we // insert. We don't want to fail the whole write batch in that case --
// just ignore the update. That's why we set ignore missing column families // we just ignore the update.
// to true // That's why we set ignore missing column families to true
status = WriteBatchInternal::InsertInto( status = WriteBatchInternal::InsertInto(
&batch, column_family_memtables_.get(), &batch, column_family_memtables_.get(), true, log_number);
true /* ignore missing column families */, log_number);
MaybeIgnoreError(&status); MaybeIgnoreError(&status);
if (!status.ok()) { if (!status.ok()) {
return status; return status;
} }
const SequenceNumber last_seq = const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
WriteBatchInternal::Sequence(&batch) +
WriteBatchInternal::Count(&batch) - 1; WriteBatchInternal::Count(&batch) - 1;
if (last_seq > *max_sequence) { if (last_seq > *max_sequence) {
*max_sequence = last_seq; *max_sequence = last_seq;
@ -1334,13 +1347,12 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
assert(iter != version_edits.end()); assert(iter != version_edits.end());
VersionEdit* edit = &iter->second; VersionEdit* edit = &iter->second;
status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
// we still want to clear the memtable, even if the recovery failed
cfd->CreateNewMemtable();
if (!status.ok()) { if (!status.ok()) {
// Reflect errors immediately so that conditions like full // Reflect errors immediately so that conditions like full
// file-systems cause the DB::Open() to fail. // file-systems cause the DB::Open() to fail.
return status; return status;
} }
cfd->CreateNewMemtable();
} }
} }
} }
@ -1349,18 +1361,20 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
if (versions_->LastSequence() < *max_sequence) { if (versions_->LastSequence() < *max_sequence) {
versions_->SetLastSequence(*max_sequence); versions_->SetLastSequence(*max_sequence);
} }
}
if (!read_only) { if (!read_only) {
// no need to refcount since client still doesn't have access // no need to refcount since client still doesn't have access
// to the DB and can not drop column families while we iterate // to the DB and can not drop column families while we iterate
auto max_log_number = log_numbers.back();
for (auto cfd : *versions_->GetColumnFamilySet()) { for (auto cfd : *versions_->GetColumnFamilySet()) {
auto iter = version_edits.find(cfd->GetID()); auto iter = version_edits.find(cfd->GetID());
assert(iter != version_edits.end()); assert(iter != version_edits.end());
VersionEdit* edit = &iter->second; VersionEdit* edit = &iter->second;
if (cfd->GetLogNumber() > log_number) { if (cfd->GetLogNumber() > max_log_number) {
// Column family cfd has already flushed the data // Column family cfd has already flushed the data
// from log_number. Memtable has to be empty because // from all logs. Memtable has to be empty because
// we filter the updates based on log_number // we filter the updates based on log_number
// (in WriteBatch::InsertInto) // (in WriteBatch::InsertInto)
assert(cfd->mem()->GetFirstSequenceNumber() == 0); assert(cfd->mem()->GetFirstSequenceNumber() == 0);
@ -1371,28 +1385,29 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
// flush the final memtable (if non-empty) // flush the final memtable (if non-empty)
if (cfd->mem()->GetFirstSequenceNumber() != 0) { if (cfd->mem()->GetFirstSequenceNumber() != 0) {
status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit); status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
if (!status.ok()) {
// Recovery failed
break;
} }
// we still want to clear the memtable, even if the recovery failed
cfd->CreateNewMemtable(); cfd->CreateNewMemtable();
if (!status.ok()) {
return status;
} }
// write MANIFEST with update // write MANIFEST with update
// writing log number in the manifest means that any log file // writing log_number in the manifest means that any log file
// with number strongly less than (log_number + 1) is already // with number strongly less than (log_number + 1) is already
// recovered and should be ignored on next reincarnation. // recovered and should be ignored on next reincarnation.
// Since we already recovered log_number, we want all logs // Since we already recovered max_log_number, we want all logs
// with numbers `<= log_number` (includes this one) to be ignored // with numbers `<= max_log_number` (includes this one) to be ignored
edit->SetLogNumber(log_number + 1); edit->SetLogNumber(max_log_number + 1);
// we must mark the next log number as used, even though it's // we must mark the next log number as used, even though it's
// not actually used. that is because VersionSet assumes // not actually used. that is because VersionSet assumes
// VersionSet::next_file_number_ always to be strictly greater than any // VersionSet::next_file_number_ always to be strictly greater than any
// log number // log number
versions_->MarkFileNumberUsed(log_number + 1); versions_->MarkFileNumberUsed(max_log_number + 1);
status = versions_->LogAndApply(cfd, edit, &mutex_); status = versions_->LogAndApply(cfd, edit, &mutex_);
if (!status.ok()) { if (!status.ok()) {
return status; // Recovery failed
break;
} }
} }
} }

@ -344,8 +344,9 @@ class DBImpl : public DB {
DeletionState& deletion_state, DeletionState& deletion_state,
LogBuffer* log_buffer); LogBuffer* log_buffer);
Status RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence, // REQUIRES: log_numbers are sorted in ascending order
bool read_only); Status RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
SequenceNumber* max_sequence, bool read_only);
// The following two methods are used to flush a memtable to // The following two methods are used to flush a memtable to
// storage. The first one is used atdatabase RecoveryTime (when the // storage. The first one is used atdatabase RecoveryTime (when the

@ -6120,18 +6120,18 @@ namespace {
std::vector<std::uint64_t> ListSpecificFiles( std::vector<std::uint64_t> ListSpecificFiles(
Env* env, const std::string& path, const FileType expected_file_type) { Env* env, const std::string& path, const FileType expected_file_type) {
std::vector<std::string> files; std::vector<std::string> files;
std::vector<uint64_t> log_files; std::vector<uint64_t> file_numbers;
env->GetChildren(path, &files); env->GetChildren(path, &files);
uint64_t number; uint64_t number;
FileType type; FileType type;
for (size_t i = 0; i < files.size(); ++i) { for (size_t i = 0; i < files.size(); ++i) {
if (ParseFileName(files[i], &number, &type)) { if (ParseFileName(files[i], &number, &type)) {
if (type == expected_file_type) { if (type == expected_file_type) {
log_files.push_back(number); file_numbers.push_back(number);
} }
} }
} }
return std::move(log_files); return std::move(file_numbers);
} }
std::vector<std::uint64_t> ListLogFiles(Env* env, const std::string& path) { std::vector<std::uint64_t> ListLogFiles(Env* env, const std::string& path) {
@ -6141,6 +6141,17 @@ std::vector<std::uint64_t> ListLogFiles(Env* env, const std::string& path) {
std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path) { std::vector<std::uint64_t> ListTableFiles(Env* env, const std::string& path) {
return ListSpecificFiles(env, path, kTableFile); return ListSpecificFiles(env, path, kTableFile);
} }
std::uint64_t GetNumberOfSstFilesForColumnFamily(
DB* db, std::string column_family_name) {
std::vector<LiveFileMetaData> metadata;
db->GetLiveFilesMetaData(&metadata);
uint64_t result = 0;
for (auto& fileMetadata : metadata) {
result += (fileMetadata.column_family_name == column_family_name);
}
return result;
}
} // namespace } // namespace
TEST(DBTest, FlushOneColumnFamily) { TEST(DBTest, FlushOneColumnFamily) {
@ -6165,6 +6176,119 @@ TEST(DBTest, FlushOneColumnFamily) {
} }
} }
// https://reviews.facebook.net/D20661 changed recovery behavior.
// Previously, recovery flushed each column family's memtable once per
// log file. Now recovery tries to create the smallest number of table
// files by merging updates from multiple logs.
//
// This test writes with a large write buffer, reopens the DB with a tiny
// one, and checks how many SST files each column family ends up with.
TEST(DBTest, RecoverCheckFileAmountWithSmallWriteBuffer) {
Options options;
options.write_buffer_size = 5000000;
CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, &options);
// Column family indexes used by Put() below follow the name list passed to
// ReopenWithColumnFamilies later in this test:
// 0 = default, 1 = pikachu, 2 = dobrynia, 3 = nikitich.
// Since we will reopen the DB with a smaller write_buffer_size,
// each of these writes is expected to go to a new SST file
ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
ASSERT_OK(Put(1, Key(10), DummyString(1000000)));
ASSERT_OK(Put(3, Key(10), DummyString(1)));
// Make 'dobrynia' flush, which also causes a new WAL file to be created
ASSERT_OK(Put(2, Key(10), DummyString(7500000)));
ASSERT_OK(Put(2, Key(1), DummyString(1)));
dbfull()->TEST_WaitForFlushMemTable(handles_[2]);
{
auto tables = ListTableFiles(env_, dbname_);
ASSERT_EQ(tables.size(), 1);
// Make sure 'dobrynia' was flushed: check the number of sst files
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), 1);
}
// These writes go to the new WAL file
ASSERT_OK(Put(1, Key(1), DummyString(1)));
ASSERT_OK(Put(1, Key(1), DummyString(1)));
ASSERT_OK(Put(3, Key(10), DummyString(1)));
ASSERT_OK(Put(3, Key(10), DummyString(1)));
ASSERT_OK(Put(3, Key(10), DummyString(1)));
// Reopen with a tiny write buffer so that recovery runs with it
options.write_buffer_size = 10;
ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
&options);
{
// No inserts => default is empty
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), 0);
// The first 4 large writes go to separate SSTs + 1 more SST for the
// 2 smaller writes
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 5);
// 1 SST for the big key + 1 SST for the small one
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), 2);
// 1 SST for all keys
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 1);
}
}
// https://reviews.facebook.net/D20661 changed recovery behavior.
// Previously, recovery flushed each column family's memtable once per
// log file. Now recovery tries to create the smallest number of table
// files by merging updates from multiple logs.
//
// This test spreads small writes for three column families over several
// WAL files and checks that, after reopening, each of them has one SST.
TEST(DBTest, RecoverCheckFileAmount) {
Options options;
options.write_buffer_size = 100000;
CreateAndReopenWithCF({"pikachu", "dobrynia", "nikitich"}, &options);
// Column family indexes used by Put() below follow the name list passed to
// ReopenWithColumnFamilies later in this test:
// 0 = default, 1 = pikachu, 2 = dobrynia, 3 = nikitich.
ASSERT_OK(Put(0, Key(1), DummyString(1)));
ASSERT_OK(Put(1, Key(1), DummyString(1)));
ASSERT_OK(Put(2, Key(1), DummyString(1)));
// Make the 'nikitich' memtable flush
ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
ASSERT_OK(Put(3, Key(1), DummyString(1)));
dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
// 4 memtables are not flushed, 1 sst file
{
auto tables = ListTableFiles(env_, dbname_);
ASSERT_EQ(tables.size(), 1);
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 1);
}
// The memtable for 'nikitich' has been flushed and a new WAL file opened
// 4 memtables are still not flushed
// Write to the new WAL file
ASSERT_OK(Put(0, Key(1), DummyString(1)));
ASSERT_OK(Put(1, Key(1), DummyString(1)));
ASSERT_OK(Put(2, Key(1), DummyString(1)));
// Fill up 'nikitich' one more time
ASSERT_OK(Put(3, Key(10), DummyString(1002400)));
// make it flush
ASSERT_OK(Put(3, Key(1), DummyString(1)));
dbfull()->TEST_WaitForFlushMemTable(handles_[3]);
// There are still 4 memtables not flushed, and 2 sst tables
ASSERT_OK(Put(0, Key(1), DummyString(1)));
ASSERT_OK(Put(1, Key(1), DummyString(1)));
ASSERT_OK(Put(2, Key(1), DummyString(1)));
{
auto tables = ListTableFiles(env_, dbname_);
ASSERT_EQ(tables.size(), 2);
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 2);
}
// Reopen the DB so that the WAL files written above are recovered
ReopenWithColumnFamilies({"default", "pikachu", "dobrynia", "nikitich"},
&options);
{
// NOTE(review): table_files is not referenced by any assertion below; the
// checks use GetNumberOfSstFilesForColumnFamily instead. Confirm whether a
// check on the on-disk file count was intended.
std::vector<uint64_t> table_files = ListTableFiles(env_, dbname_);
// Check that records for 'default', 'dobrynia' and 'pikachu' from the
// first, second and third WALs went to the same SST.
// So, there are 6 SSTs: three for 'nikitich', one for 'default', one for
// 'dobrynia', one for 'pikachu'
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "default"), 1);
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "nikitich"), 3);
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "dobrynia"), 1);
ASSERT_EQ(GetNumberOfSstFilesForColumnFamily(db_, "pikachu"), 1);
}
}
TEST(DBTest, WALArchivalTtl) { TEST(DBTest, WALArchivalTtl) {
do { do {
Options options = CurrentOptions(); Options options = CurrentOptions();

Loading…
Cancel
Save