@ -1219,14 +1219,16 @@ Status DBImpl::Recover(
" flag but a log file already exists " ) ;
" flag but a log file already exists " ) ;
}
}
// Recover in the order in which the logs were generated
if ( ! logs . empty ( ) ) {
std : : sort ( logs . begin ( ) , logs . end ( ) ) ;
// Recover in the order in which the logs were generated
for ( const auto & log : logs ) {
std : : sort ( logs . begin ( ) , logs . end ( ) ) ;
// The previous incarnation may not have written any MANIFEST
s = RecoverLogFiles ( logs , & max_sequence , read_only ) ;
// records after allocating this log number. So we manually
if ( ! s . ok ( ) ) {
// update the file number allocation counter in VersionSet.
// Clear memtables if recovery failed
versions_ - > MarkFileNumberUsed ( log ) ;
for ( auto cfd : * versions_ - > GetColumnFamilySet ( ) ) {
s = RecoverLogFile ( log , & max_sequence , read_only ) ;
cfd - > CreateNewMemtable ( ) ;
}
}
}
}
SetTickerCount ( stats_ , SEQUENCE_NUMBER , versions_ - > LastSequence ( ) ) ;
SetTickerCount ( stats_ , SEQUENCE_NUMBER , versions_ - > LastSequence ( ) ) ;
}
}
@ -1239,8 +1241,9 @@ Status DBImpl::Recover(
return s ;
return s ;
}
}
Status DBImpl : : RecoverLogFile ( uint64_t log_number , SequenceNumber * max_sequence ,
// REQUIRES: log_numbers are sorted in ascending order
bool read_only ) {
Status DBImpl : : RecoverLogFiles ( const std : : vector < uint64_t > & log_numbers ,
SequenceNumber * max_sequence , bool read_only ) {
struct LogReporter : public log : : Reader : : Reporter {
struct LogReporter : public log : : Reader : : Reporter {
Env * env ;
Env * env ;
Logger * info_log ;
Logger * info_log ;
@ -1256,7 +1259,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
} ;
} ;
mutex_ . AssertHeld ( ) ;
mutex_ . AssertHeld ( ) ;
Status status ;
std : : unordered_map < int , VersionEdit > version_edits ;
std : : unordered_map < int , VersionEdit > version_edits ;
// no need to refcount because iteration is under mutex
// no need to refcount because iteration is under mutex
for ( auto cfd : * versions_ - > GetColumnFamilySet ( ) ) {
for ( auto cfd : * versions_ - > GetColumnFamilySet ( ) ) {
@ -1265,102 +1268,113 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
version_edits . insert ( { cfd - > GetID ( ) , edit } ) ;
version_edits . insert ( { cfd - > GetID ( ) , edit } ) ;
}
}
// Open the log file
for ( auto log_number : log_numbers ) {
std : : string fname = LogFileName ( db_options_ . wal_dir , log_number ) ;
// The previous incarnation may not have written any MANIFEST
unique_ptr < SequentialFile > file ;
// records after allocating this log number. So we manually
Status status = env_ - > NewSequentialFile ( fname , & file , env_options_ ) ;
// update the file number allocation counter in VersionSet.
if ( ! status . ok ( ) ) {
versions_ - > MarkFileNumberUsed ( log_number ) ;
MaybeIgnoreError ( & status ) ;
// Open the log file
return status ;
std : : string fname = LogFileName ( db_options_ . wal_dir , log_number ) ;
}
unique_ptr < SequentialFile > file ;
status = env_ - > NewSequentialFile ( fname , & file , env_options_ ) ;
// Create the log reader.
if ( ! status . ok ( ) ) {
LogReporter reporter ;
MaybeIgnoreError ( & status ) ;
reporter . env = env_ ;
if ( ! status . ok ( ) ) {
reporter . info_log = db_options_ . info_log . get ( ) ;
return status ;
reporter . fname = fname . c_str ( ) ;
} else {
reporter . status = ( db_options_ . paranoid_checks & &
// Fail with one log file, but that's ok.
! db_options_ . skip_log_error_on_recovery ? & status
// Try next one.
: nullptr ) ;
continue ;
// We intentionally make log::Reader do checksumming even if
}
// paranoid_checks==false so that corruptions cause entire commits
// to be skipped instead of propagating bad information (like overly
// large sequence numbers).
log : : Reader reader ( std : : move ( file ) , & reporter , true /*checksum*/ ,
0 /*initial_offset*/ ) ;
Log ( db_options_ . info_log , " Recovering log #% " PRIu64 " " , log_number ) ;
// Read all the records and add to a memtable
std : : string scratch ;
Slice record ;
WriteBatch batch ;
while ( reader . ReadRecord ( & record , & scratch ) ) {
if ( record . size ( ) < 12 ) {
reporter . Corruption ( record . size ( ) ,
Status : : Corruption ( " log record too small " ) ) ;
continue ;
}
}
WriteBatchInternal : : SetContents ( & batch , record ) ;
// If column family was not found, it might mean that the WAL write
// Create the log reader.
// batch references to the column family that was dropped after the
LogReporter reporter ;
// insert. We don't want to fail the whole write batch in that case -- we
reporter . env = env_ ;
// just ignore the update. That's why we set ignore missing column families
reporter . info_log = db_options_ . info_log . get ( ) ;
// to true
reporter . fname = fname . c_str ( ) ;
status = WriteBatchInternal : : InsertInto (
reporter . status =
& batch , column_family_memtables_ . get ( ) ,
( db_options_ . paranoid_checks & & ! db_options_ . skip_log_error_on_recovery
true /* ignore missing column families */ , log_number ) ;
? & status
: nullptr ) ;
// We intentionally make log::Reader do checksumming even if
// paranoid_checks==false so that corruptions cause entire commits
// to be skipped instead of propagating bad information (like overly
// large sequence numbers).
log : : Reader reader ( std : : move ( file ) , & reporter , true /*checksum*/ ,
0 /*initial_offset*/ ) ;
Log ( db_options_ . info_log , " Recovering log #% " PRIu64 " " , log_number ) ;
// Read all the records and add to a memtable
std : : string scratch ;
Slice record ;
WriteBatch batch ;
while ( reader . ReadRecord ( & record , & scratch ) ) {
if ( record . size ( ) < 12 ) {
reporter . Corruption ( record . size ( ) ,
Status : : Corruption ( " log record too small " ) ) ;
continue ;
}
WriteBatchInternal : : SetContents ( & batch , record ) ;
MaybeIgnoreError ( & status ) ;
// If column family was not found, it might mean that the WAL write
if ( ! status . ok ( ) ) {
// batch references to the column family that was dropped after the
return status ;
// insert. We don't want to fail the whole write batch in that case --
}
// we just ignore the update.
const SequenceNumber last_seq =
// That's why we set ignore missing column families to true
WriteBatchInternal : : Sequence ( & batch ) +
status = WriteBatchInternal : : InsertInto (
WriteBatchInternal : : Count ( & batch ) - 1 ;
& batch , column_family_memtables_ . get ( ) , true , log_number ) ;
if ( last_seq > * max_sequence ) {
* max_sequence = last_seq ;
}
if ( ! read_only ) {
MaybeIgnoreError ( & status ) ;
// no need to refcount since client still doesn't have access
if ( ! status . ok ( ) ) {
// to the DB and can not drop column families while we iterate
return status ;
for ( auto cfd : * versions_ - > GetColumnFamilySet ( ) ) {
}
if ( cfd - > mem ( ) - > ShouldFlush ( ) ) {
const SequenceNumber last_seq = WriteBatchInternal : : Sequence ( & batch ) +
// If this asserts, it means that InsertInto failed in
WriteBatchInternal : : Count ( & batch ) - 1 ;
// filtering updates to already-flushed column families
if ( last_seq > * max_sequence ) {
assert ( cfd - > GetLogNumber ( ) < = log_number ) ;
* max_sequence = last_seq ;
auto iter = version_edits . find ( cfd - > GetID ( ) ) ;
}
assert ( iter ! = version_edits . end ( ) ) ;
VersionEdit * edit = & iter - > second ;
if ( ! read_only ) {
status = WriteLevel0TableForRecovery ( cfd , cfd - > mem ( ) , edit ) ;
// no need to refcount since client still doesn't have access
// we still want to clear the memtable, even if the recovery failed
// to the DB and can not drop column families while we iterate
cfd - > CreateNewMemtable ( ) ;
for ( auto cfd : * versions_ - > GetColumnFamilySet ( ) ) {
if ( ! status . ok ( ) ) {
if ( cfd - > mem ( ) - > ShouldFlush ( ) ) {
// Reflect errors immediately so that conditions like full
// If this asserts, it means that InsertInto failed in
// file-systems cause the DB::Open() to fail.
// filtering updates to already-flushed column families
return status ;
assert ( cfd - > GetLogNumber ( ) < = log_number ) ;
auto iter = version_edits . find ( cfd - > GetID ( ) ) ;
assert ( iter ! = version_edits . end ( ) ) ;
VersionEdit * edit = & iter - > second ;
status = WriteLevel0TableForRecovery ( cfd , cfd - > mem ( ) , edit ) ;
if ( ! status . ok ( ) ) {
// Reflect errors immediately so that conditions like full
// file-systems cause the DB::Open() to fail.
return status ;
}
cfd - > CreateNewMemtable ( ) ;
}
}
}
}
}
}
}
}
}
if ( versions_ - > LastSequence ( ) < * max_sequence ) {
if ( versions_ - > LastSequence ( ) < * max_sequence ) {
versions_ - > SetLastSequence ( * max_sequence ) ;
versions_ - > SetLastSequence ( * max_sequence ) ;
}
}
}
if ( ! read_only ) {
if ( ! read_only ) {
// no need to refcount since client still doesn't have access
// no need to refcount since client still doesn't have access
// to the DB and can not drop column families while we iterate
// to the DB and can not drop column families while we iterate
auto max_log_number = log_numbers . back ( ) ;
for ( auto cfd : * versions_ - > GetColumnFamilySet ( ) ) {
for ( auto cfd : * versions_ - > GetColumnFamilySet ( ) ) {
auto iter = version_edits . find ( cfd - > GetID ( ) ) ;
auto iter = version_edits . find ( cfd - > GetID ( ) ) ;
assert ( iter ! = version_edits . end ( ) ) ;
assert ( iter ! = version_edits . end ( ) ) ;
VersionEdit * edit = & iter - > second ;
VersionEdit * edit = & iter - > second ;
if ( cfd - > GetLogNumber ( ) > log_number ) {
if ( cfd - > GetLogNumber ( ) > max_ log_number) {
// Column family cfd has already flushed the data
// Column family cfd has already flushed the data
// from log_number. Memtable has to be empty because
// from all logs. Memtable has to be empty because
// we filter the updates based on log_number
// we filter the updates based on log_number
// (in WriteBatch::InsertInto)
// (in WriteBatch::InsertInto)
assert ( cfd - > mem ( ) - > GetFirstSequenceNumber ( ) = = 0 ) ;
assert ( cfd - > mem ( ) - > GetFirstSequenceNumber ( ) = = 0 ) ;
@ -1371,28 +1385,29 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, SequenceNumber* max_sequence,
// flush the final memtable (if non-empty)
// flush the final memtable (if non-empty)
if ( cfd - > mem ( ) - > GetFirstSequenceNumber ( ) ! = 0 ) {
if ( cfd - > mem ( ) - > GetFirstSequenceNumber ( ) ! = 0 ) {
status = WriteLevel0TableForRecovery ( cfd , cfd - > mem ( ) , edit ) ;
status = WriteLevel0TableForRecovery ( cfd , cfd - > mem ( ) , edit ) ;
}
if ( ! status . ok ( ) ) {
// we still want to clear the memtable, even if the recovery failed
// Recovery failed
cfd - > CreateNewMemtable ( ) ;
break ;
if ( ! status . ok ( ) ) {
}
return status ;
cfd - > CreateNewMemtable ( ) ;
}
}
// write MANIFEST with update
// write MANIFEST with update
// writing log number in the manifest means that any log file
// writing log_number in the manifest means that any log file
// with number strictly less than (log_number + 1) is already
// with number strictly less than (log_number + 1) is already
// recovered and should be ignored on next reincarnation.
// recovered and should be ignored on next reincarnation.
// Since we already recovered log_number, we want all logs
// Since we already recovered max_log_number, we want all logs
// with numbers `<= log_number` (includes this one) to be ignored
// with numbers `<= max_log_number` (includes this one) to be ignored
edit - > SetLogNumber ( log_number + 1 ) ;
edit - > SetLogNumber ( max_ log_number + 1 ) ;
// we must mark the next log number as used, even though it's
// we must mark the next log number as used, even though it's
// not actually used. that is because VersionSet assumes
// not actually used. that is because VersionSet assumes
// VersionSet::next_file_number_ always to be strictly greater than any
// VersionSet::next_file_number_ always to be strictly greater than any
// log number
// log number
versions_ - > MarkFileNumberUsed ( log_number + 1 ) ;
versions_ - > MarkFileNumberUsed ( max_ log_number + 1 ) ;
status = versions_ - > LogAndApply ( cfd , edit , & mutex_ ) ;
status = versions_ - > LogAndApply ( cfd , edit , & mutex_ ) ;
if ( ! status . ok ( ) ) {
if ( ! status . ok ( ) ) {
return status ;
// Recovery failed
break ;
}
}
}
}
}
}