@ -399,7 +399,7 @@ IOStatus Directories::SetDirectories(FileSystem* fs, const std::string& dbname,
Status DBImpl : : Recover (
Status DBImpl : : Recover (
const std : : vector < ColumnFamilyDescriptor > & column_families , bool read_only ,
const std : : vector < ColumnFamilyDescriptor > & column_families , bool read_only ,
bool error_if_wal_file_exists , bool error_if_data_exists_in_wals ,
bool error_if_wal_file_exists , bool error_if_data_exists_in_wals ,
uint64_t * recovered_seq ) {
uint64_t * recovered_seq , RecoveryContext * recovery_ctx ) {
mutex_ . AssertHeld ( ) ;
mutex_ . AssertHeld ( ) ;
bool is_new_db = false ;
bool is_new_db = false ;
@ -518,9 +518,10 @@ Status DBImpl::Recover(
if ( ! s . ok ( ) ) {
if ( ! s . ok ( ) ) {
return s ;
return s ;
}
}
s = SetDBId ( read_only ) ;
s = SetDBId ( read_only , recovery_ctx ) ;
if ( s . ok ( ) & & ! read_only ) {
if ( s . ok ( ) & & ! read_only ) {
s = DeleteUnreferencedSstFiles ( ) ;
s = DeleteUnreferencedSstFiles ( recovery_ctx ) ;
}
}
if ( immutable_db_options_ . paranoid_checks & & s . ok ( ) ) {
if ( immutable_db_options_ . paranoid_checks & & s . ok ( ) ) {
@ -535,10 +536,6 @@ Status DBImpl::Recover(
}
}
}
}
}
}
// DB mutex is already held
if ( s . ok ( ) & & immutable_db_options_ . persist_stats_to_disk ) {
s = InitPersistStatsColumnFamily ( ) ;
}
std : : vector < std : : string > files_in_wal_dir ;
std : : vector < std : : string > files_in_wal_dir ;
if ( s . ok ( ) ) {
if ( s . ok ( ) ) {
@ -608,7 +605,10 @@ Status DBImpl::Recover(
WalNumber max_wal_number =
WalNumber max_wal_number =
versions_ - > GetWalSet ( ) . GetWals ( ) . rbegin ( ) - > first ;
versions_ - > GetWalSet ( ) . GetWals ( ) . rbegin ( ) - > first ;
edit . DeleteWalsBefore ( max_wal_number + 1 ) ;
edit . DeleteWalsBefore ( max_wal_number + 1 ) ;
s = versions_ - > LogAndApplyToDefaultColumnFamily ( & edit , & mutex_ ) ;
assert ( recovery_ctx ! = nullptr ) ;
assert ( versions_ - > GetColumnFamilySet ( ) ! = nullptr ) ;
recovery_ctx - > UpdateVersionEdits (
versions_ - > GetColumnFamilySet ( ) - > GetDefault ( ) , edit ) ;
}
}
if ( ! s . ok ( ) ) {
if ( ! s . ok ( ) ) {
return s ;
return s ;
@ -644,8 +644,8 @@ Status DBImpl::Recover(
std : : sort ( wals . begin ( ) , wals . end ( ) ) ;
std : : sort ( wals . begin ( ) , wals . end ( ) ) ;
bool corrupted_wal_found = false ;
bool corrupted_wal_found = false ;
s = RecoverLogFiles ( wals , & next_sequence , read_only ,
s = RecoverLogFiles ( wals , & next_sequence , read_only , & corrupted_wal_found ,
& corrupted_wal_found ) ;
recovery_ctx ) ;
if ( corrupted_wal_found & & recovered_seq ! = nullptr ) {
if ( corrupted_wal_found & & recovered_seq ! = nullptr ) {
* recovered_seq = next_sequence ;
* recovered_seq = next_sequence ;
}
}
@ -805,10 +805,30 @@ Status DBImpl::InitPersistStatsColumnFamily() {
return s ;
return s ;
}
}
Status DBImpl : : LogAndApplyForRecovery ( const RecoveryContext & recovery_ctx ) {
mutex_ . AssertHeld ( ) ;
assert ( versions_ - > descriptor_log_ = = nullptr ) ;
Status s = versions_ - > LogAndApply (
recovery_ctx . cfds_ , recovery_ctx . mutable_cf_opts_ ,
recovery_ctx . edit_lists_ , & mutex_ , directories_ . GetDbDir ( ) ) ;
if ( s . ok ( ) & & ! ( recovery_ctx . files_to_delete_ . empty ( ) ) ) {
mutex_ . Unlock ( ) ;
for ( const auto & fname : recovery_ctx . files_to_delete_ ) {
s = env_ - > DeleteFile ( fname ) ;
if ( ! s . ok ( ) ) {
break ;
}
}
mutex_ . Lock ( ) ;
}
return s ;
}
// REQUIRES: wal_numbers are sorted in ascending order
// REQUIRES: wal_numbers are sorted in ascending order
Status DBImpl : : RecoverLogFiles ( const std : : vector < uint64_t > & wal_numbers ,
Status DBImpl : : RecoverLogFiles ( std : : vector < uint64_t > & wal_numbers ,
SequenceNumber * next_sequence , bool read_only ,
SequenceNumber * next_sequence , bool read_only ,
bool * corrupted_wal_found ) {
bool * corrupted_wal_found ,
RecoveryContext * recovery_ctx ) {
struct LogReporter : public log : : Reader : : Reporter {
struct LogReporter : public log : : Reader : : Reporter {
Env * env ;
Env * env ;
Logger * info_log ;
Logger * info_log ;
@ -833,6 +853,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
edit . SetColumnFamily ( cfd - > GetID ( ) ) ;
edit . SetColumnFamily ( cfd - > GetID ( ) ) ;
version_edits . insert ( { cfd - > GetID ( ) , edit } ) ;
version_edits . insert ( { cfd - > GetID ( ) , edit } ) ;
}
}
int job_id = next_job_id_ . fetch_add ( 1 ) ;
int job_id = next_job_id_ . fetch_add ( 1 ) ;
{
{
auto stream = event_logger_ . Log ( ) ;
auto stream = event_logger_ . Log ( ) ;
@ -1256,6 +1277,7 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
edit - > SetLogNumber ( max_wal_number + 1 ) ;
edit - > SetLogNumber ( max_wal_number + 1 ) ;
}
}
}
}
if ( status . ok ( ) ) {
if ( status . ok ( ) ) {
// we must mark the next log number as used, even though it's
// we must mark the next log number as used, even though it's
// not actually used. that is because VersionSet assumes
// not actually used. that is because VersionSet assumes
@ -1263,42 +1285,40 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
// log number
// log number
versions_ - > MarkFileNumberUsed ( max_wal_number + 1 ) ;
versions_ - > MarkFileNumberUsed ( max_wal_number + 1 ) ;
autovector < ColumnFamilyData * > cfds ;
if ( corrupted_wal_found ! = nullptr & & * corrupted_wal_found = = true & &
autovector < const MutableCFOptions * > cf_opts ;
immutable_db_options_ . wal_recovery_mode = =
autovector < autovector < VersionEdit * > > edit_lists ;
WALRecoveryMode : : kPointInTimeRecovery ) {
MoveCorruptedWalFiles ( wal_numbers , corrupted_wal_number ) ;
}
assert ( recovery_ctx ! = nullptr ) ;
for ( auto * cfd : * versions_ - > GetColumnFamilySet ( ) ) {
for ( auto * cfd : * versions_ - > GetColumnFamilySet ( ) ) {
cfds . push_back ( cfd ) ;
cf_opts . push_back ( cfd - > GetLatestMutableCFOptions ( ) ) ;
auto iter = version_edits . find ( cfd - > GetID ( ) ) ;
auto iter = version_edits . find ( cfd - > GetID ( ) ) ;
assert ( iter ! = version_edits . end ( ) ) ;
assert ( iter ! = version_edits . end ( ) ) ;
edit_lists . push_back ( { & iter - > second } ) ;
recovery_ctx - > UpdateVersionEdits ( cfd , iter - > second ) ;
}
}
std : : unique_ptr < VersionEdit > wal_deletion ;
if ( flushed ) {
if ( flushed ) {
wal_deletion = std : : make_unique < VersionEdit > ( ) ;
VersionEdit wal_deletion ;
if ( immutable_db_options_ . track_and_verify_wals_in_manifest ) {
if ( immutable_db_options_ . track_and_verify_wals_in_manifest ) {
wal_deletion - > DeleteWalsBefore ( max_wal_number + 1 ) ;
wal_deletion . DeleteWalsBefore ( max_wal_number + 1 ) ;
}
}
if ( ! allow_2pc ( ) ) {
if ( ! allow_2pc ( ) ) {
// In non-2pc mode, flushing the memtables of the column families
// In non-2pc mode, flushing the memtables of the column families
// means we can advance min_log_number_to_keep.
// means we can advance min_log_number_to_keep.
wal_deletion - > SetMinLogNumberToKeep ( max_wal_number + 1 ) ;
wal_deletion . SetMinLogNumberToKeep ( max_wal_number + 1 ) ;
}
}
edit_lists . back ( ) . push_back ( wal_deletion . get ( ) ) ;
assert ( versions_ - > GetColumnFamilySet ( ) ! = nullptr ) ;
recovery_ctx - > UpdateVersionEdits (
versions_ - > GetColumnFamilySet ( ) - > GetDefault ( ) , wal_deletion ) ;
}
}
// write MANIFEST with update
status = versions_ - > LogAndApply ( cfds , cf_opts , edit_lists , & mutex_ ,
directories_ . GetDbDir ( ) ,
/*new_descriptor_log=*/ true ) ;
}
}
}
}
if ( status . ok ( ) ) {
if ( status . ok ( ) ) {
if ( data_seen & & ! flushed ) {
if ( data_seen & & ! flushed ) {
status = RestoreAliveLogFiles ( wal_numbers ) ;
status = RestoreAliveLogFiles ( wal_numbers ) ;
} else {
} else if ( ! wal_numbers . empty ( ) ) {
// If there's no data in the WAL, or we flushed all the data, still
// If there's no data in the WAL, or we flushed all the data, still
// truncate the log file. If the process goes into a crash loop before
// truncate the log file. If the process goes into a crash loop before
// the file is deleted, the preallocated space will never get freed.
// the file is deleted, the preallocated space will never get freed.
@ -1314,6 +1334,48 @@ Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
return status ;
return status ;
}
}
void DBImpl : : MoveCorruptedWalFiles ( std : : vector < uint64_t > & wal_numbers ,
uint64_t corrupted_wal_number ) {
size_t num_wals = wal_numbers . size ( ) ;
// Find the first corrupted wal.
auto iter = std : : lower_bound ( wal_numbers . begin ( ) , wal_numbers . end ( ) ,
corrupted_wal_number ) ;
auto corrupt_start_iter = iter ;
// Increment iter to move WAL files from first corrupted_wal_number + 1.
iter + + ;
std : : string archival_path =
ArchivalDirectory ( immutable_db_options_ . GetWalDir ( ) ) ;
Status create_status = env_ - > CreateDirIfMissing ( archival_path ) ;
// create_status is only checked when it needs to move the corrupted WAL files
// to archive folder.
create_status . PermitUncheckedError ( ) ;
// Truncate the last WAL to reclaim the pre allocated space before
// moving it.
GetLogSizeAndMaybeTruncate ( wal_numbers . back ( ) , /*truncate=*/ true , nullptr )
. PermitUncheckedError ( ) ;
// Move all the WAL files from corrupted_wal_number + 1 to last WAL
// (max_wal_number) to avoid column family inconsistency error to archival
// directory. If its unable to create archive dir, it will delete the
// corrupted WAL files.
// We are moving all but first corrupted WAL file to a different folder.
while ( iter ! = wal_numbers . end ( ) ) {
LogFileNumberSize log ( * iter ) ;
std : : string fname = LogFileName ( immutable_db_options_ . GetWalDir ( ) , * iter ) ;
# ifndef ROCKSDB_LITE
if ( create_status . ok ( ) ) {
wal_manager_ . ArchiveWALFile ( fname , * iter ) ;
}
# endif
iter + + ;
}
wal_numbers . erase ( corrupt_start_iter + 1 , wal_numbers . begin ( ) + num_wals ) ;
}
Status DBImpl : : GetLogSizeAndMaybeTruncate ( uint64_t wal_number , bool truncate ,
Status DBImpl : : GetLogSizeAndMaybeTruncate ( uint64_t wal_number , bool truncate ,
LogFileNumberSize * log_ptr ) {
LogFileNumberSize * log_ptr ) {
LogFileNumberSize log ( wal_number ) ;
LogFileNumberSize log ( wal_number ) ;
@ -1376,7 +1438,8 @@ Status DBImpl::RestoreAliveLogFiles(const std::vector<uint64_t>& wal_numbers) {
// log has such preallocated space, so we only truncate for the last log.
// log has such preallocated space, so we only truncate for the last log.
LogFileNumberSize log ;
LogFileNumberSize log ;
s = GetLogSizeAndMaybeTruncate (
s = GetLogSizeAndMaybeTruncate (
wal_number , /*truncate=*/ ( wal_number = = wal_numbers . back ( ) ) , & log ) ;
wal_number ,
/*truncate=*/ ( wal_number = = wal_numbers . back ( ) ) , & log ) ;
if ( ! s . ok ( ) ) {
if ( ! s . ok ( ) ) {
break ;
break ;
}
}
@ -1737,9 +1800,13 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
impl - > wal_in_db_path_ = impl - > immutable_db_options_ . IsWalDirSameAsDBPath ( ) ;
impl - > wal_in_db_path_ = impl - > immutable_db_options_ . IsWalDirSameAsDBPath ( ) ;
impl - > mutex_ . Lock ( ) ;
impl - > mutex_ . Lock ( ) ;
RecoveryContext recovery_ctx ;
// Handles create_if_missing, error_if_exists
// Handles create_if_missing, error_if_exists
uint64_t recovered_seq ( kMaxSequenceNumber ) ;
uint64_t recovered_seq ( kMaxSequenceNumber ) ;
s = impl - > Recover ( column_families , false , false , false , & recovered_seq ) ;
s = impl - > Recover ( column_families , false , false , false , & recovered_seq ,
& recovery_ctx ) ;
if ( s . ok ( ) ) {
if ( s . ok ( ) ) {
uint64_t new_log_number = impl - > versions_ - > NewFileNumber ( ) ;
uint64_t new_log_number = impl - > versions_ - > NewFileNumber ( ) ;
log : : Writer * new_log = nullptr ;
log : : Writer * new_log = nullptr ;
@ -1755,6 +1822,55 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
impl - > logs_ . emplace_back ( new_log_number , new_log ) ;
impl - > logs_ . emplace_back ( new_log_number , new_log ) ;
}
}
if ( s . ok ( ) ) {
if ( impl - > two_write_queues_ ) {
impl - > log_write_mutex_ . Lock ( ) ;
}
impl - > alive_log_files_ . push_back (
DBImpl : : LogFileNumberSize ( impl - > logfile_number_ ) ) ;
impl - > alive_log_files_tail_ = impl - > alive_log_files_ . rbegin ( ) ;
if ( impl - > two_write_queues_ ) {
impl - > log_write_mutex_ . Unlock ( ) ;
}
}
if ( s . ok ( ) ) {
// In WritePrepared there could be gap in sequence numbers. This breaks
// the trick we use in kPointInTimeRecovery which assumes the first seq
// in the log right after the corrupted log is one larger than the last
// seq we read from the wals. To let this trick keep working, we add a
// dummy entry with the expected sequence to the first log right after
// recovery. In non-WritePrepared case also the new log after recovery
// could be empty, and thus missing the consecutive seq hint to
// distinguish middle-log corruption to
// corrupted-log-remained-after-recovery. This case also will be
// addressed by a dummy write.
if ( recovered_seq ! = kMaxSequenceNumber ) {
WriteBatch empty_batch ;
WriteBatchInternal : : SetSequence ( & empty_batch , recovered_seq ) ;
WriteOptions write_options ;
uint64_t log_used , log_size ;
log : : Writer * log_writer = impl - > logs_ . back ( ) . writer ;
s = impl - > WriteToWAL ( empty_batch , log_writer , & log_used , & log_size ,
Env : : IO_TOTAL , /*with_db_mutex==*/ true ) ;
if ( s . ok ( ) ) {
// Need to fsync, otherwise it might get lost after a power reset.
s = impl - > FlushWAL ( false ) ;
if ( s . ok ( ) ) {
s = log_writer - > file ( ) - > Sync ( impl - > immutable_db_options_ . use_fsync ) ;
}
}
}
}
}
if ( s . ok ( ) ) {
s = impl - > LogAndApplyForRecovery ( recovery_ctx ) ;
}
if ( s . ok ( ) & & impl - > immutable_db_options_ . persist_stats_to_disk ) {
impl - > mutex_ . AssertHeld ( ) ;
s = impl - > InitPersistStatsColumnFamily ( ) ;
}
if ( s . ok ( ) ) {
if ( s . ok ( ) ) {
// set column family handles
// set column family handles
for ( auto cf : column_families ) {
for ( auto cf : column_families ) {
@ -1783,6 +1899,7 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
}
}
}
}
}
}
if ( s . ok ( ) ) {
if ( s . ok ( ) ) {
SuperVersionContext sv_context ( /* create_superversion */ true ) ;
SuperVersionContext sv_context ( /* create_superversion */ true ) ;
for ( auto cfd : * impl - > versions_ - > GetColumnFamilySet ( ) ) {
for ( auto cfd : * impl - > versions_ - > GetColumnFamilySet ( ) ) {
@ -1790,43 +1907,6 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
cfd , & sv_context , * cfd - > GetLatestMutableCFOptions ( ) ) ;
cfd , & sv_context , * cfd - > GetLatestMutableCFOptions ( ) ) ;
}
}
sv_context . Clean ( ) ;
sv_context . Clean ( ) ;
if ( impl - > two_write_queues_ ) {
impl - > log_write_mutex_ . Lock ( ) ;
}
impl - > alive_log_files_ . push_back (
DBImpl : : LogFileNumberSize ( impl - > logfile_number_ ) ) ;
impl - > alive_log_files_tail_ = impl - > alive_log_files_ . rbegin ( ) ;
if ( impl - > two_write_queues_ ) {
impl - > log_write_mutex_ . Unlock ( ) ;
}
}
if ( s . ok ( ) ) {
// In WritePrepared there could be gap in sequence numbers. This breaks
// the trick we use in kPointInTimeRecovery which assumes the first seq in
// the log right after the corrupted log is one larger than the last seq
// we read from the wals. To let this trick keep working, we add a dummy
// entry with the expected sequence to the first log right after recovery.
// In non-WritePrepared case also the new log after recovery could be
// empty, and thus missing the consecutive seq hint to distinguish
// middle-log corruption to corrupted-log-remained-after-recovery. This
// case also will be addressed by a dummy write.
if ( recovered_seq ! = kMaxSequenceNumber ) {
WriteBatch empty_batch ;
WriteBatchInternal : : SetSequence ( & empty_batch , recovered_seq ) ;
WriteOptions write_options ;
uint64_t log_used , log_size ;
log : : Writer * log_writer = impl - > logs_ . back ( ) . writer ;
s = impl - > WriteToWAL ( empty_batch , log_writer , & log_used , & log_size ,
Env : : IO_TOTAL , /*with_db_mutex==*/ true ) ;
if ( s . ok ( ) ) {
// Need to fsync, otherwise it might get lost after a power reset.
s = impl - > FlushWAL ( false ) ;
if ( s . ok ( ) ) {
s = log_writer - > file ( ) - > Sync ( impl - > immutable_db_options_ . use_fsync ) ;
}
}
}
}
}
}
if ( s . ok ( ) & & impl - > immutable_db_options_ . persist_stats_to_disk ) {
if ( s . ok ( ) & & impl - > immutable_db_options_ . persist_stats_to_disk ) {
// try to read format version
// try to read format version
@ -1853,7 +1933,8 @@ Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
if ( cfd - > ioptions ( ) - > merge_operator ! = nullptr & &
if ( cfd - > ioptions ( ) - > merge_operator ! = nullptr & &
! cfd - > mem ( ) - > IsMergeOperatorSupported ( ) ) {
! cfd - > mem ( ) - > IsMergeOperatorSupported ( ) ) {
s = Status : : InvalidArgument (
s = Status : : InvalidArgument (
" The memtable of column family %s does not support merge operator "
" The memtable of column family %s does not support merge "
" operator "
" its options.merge_operator is non-null " ,
" its options.merge_operator is non-null " ,
cfd - > GetName ( ) . c_str ( ) ) ;
cfd - > GetName ( ) . c_str ( ) ) ;
}
}