@@ -101,6 +101,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
                         disable_memtable);
 
   Status status;
+  IOStatus io_s;
   if (write_options.low_pri) {
     status = ThrottleLowPriWritesIfNeeded(write_options, my_batch);
     if (!status.ok()) {
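[Note: from this hunk on, the patch threads a separate IOStatus io_s through the write path next to the existing Status status. The plain assignments that appear later (`status = io_s;`, `w.status = io_s;`) compile because RocksDB declares IOStatus as a subclass of Status, so it converts implicitly while carrying extra I/O detail up to that point. A minimal toy model of that relationship, with standalone stand-in types rather than the real RocksDB headers:]

    #include <string>
    #include <utility>

    // Toy stand-ins for rocksdb::Status and rocksdb::IOStatus.
    class Status {
     public:
      Status() = default;
      explicit Status(std::string msg) : msg_(std::move(msg)), ok_(false) {}
      bool ok() const { return ok_; }

     private:
      std::string msg_;
      bool ok_ = true;
    };

    // The I/O flavor adds detail (here just a retryable flag). Assigning it
    // to a Status compiles via the base class but drops that extra detail,
    // which is exactly why the patch inspects io_s before the conversion.
    class IOStatus : public Status {
     public:
      IOStatus() = default;
      IOStatus(std::string msg, bool retryable)
          : Status(std::move(msg)), retryable_(retryable) {}
      bool IsRetryable() const { return retryable_; }

     private:
      bool retryable_ = false;
    };

    int main() {
      IOStatus io_s("wal write failed", /*retryable=*/true);
      Status status = io_s;  // mirrors `status = io_s;` in the patch
      return status.ok() ? 0 : 1;
    }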
@ -322,7 +323,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
if ( ! two_write_queues_ ) {
if ( ! two_write_queues_ ) {
if ( status . ok ( ) & & ! write_options . disableWAL ) {
if ( status . ok ( ) & & ! write_options . disableWAL ) {
PERF_TIMER_GUARD ( write_wal_time ) ;
PERF_TIMER_GUARD ( write_wal_time ) ;
statu s = WriteToWAL ( write_group , log_writer , log_used , need_log_sync ,
io_ s = WriteToWAL ( write_group , log_writer , log_used , need_log_sync ,
need_log_dir_sync , last_sequence + 1 ) ;
need_log_dir_sync , last_sequence + 1 ) ;
}
}
} else {
} else {
@@ -330,13 +331,14 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
         PERF_TIMER_GUARD(write_wal_time);
         // LastAllocatedSequence is increased inside WriteToWAL under
         // wal_write_mutex_ to ensure ordered events in WAL
-        status = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
-                                      seq_inc);
+        io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence,
+                                    seq_inc);
       } else {
         // Otherwise we inc seq number for memtable writes
         last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
       }
     }
+    status = io_s;
     assert(last_sequence != kMaxSequenceNumber);
     const SequenceNumber current_sequence = last_sequence + 1;
     last_sequence += seq_inc;
@@ -411,8 +413,12 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options,
   PERF_TIMER_START(write_pre_and_post_process_time);
 
   if (!w.CallbackFailed()) {
-    WriteStatusCheck(status);
+    if (!io_s.ok()) {
+      IOStatusCheck(io_s);
+    } else {
+      WriteStatusCheck(status);
+    }
   }
 
   if (need_log_sync) {
     mutex_.Lock();
@@ -515,6 +521,7 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
     PERF_TIMER_STOP(write_pre_and_post_process_time);
 
+    IOStatus io_s;
     if (w.status.ok() && !write_options.disableWAL) {
       PERF_TIMER_GUARD(write_wal_time);
       stats->AddDBStats(InternalStats::kIntStatsWriteDoneBySelf, 1);
@@ -524,13 +531,18 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options,
                           wal_write_group.size - 1);
         RecordTick(stats_, WRITE_DONE_BY_OTHER, wal_write_group.size - 1);
       }
-      w.status = WriteToWAL(wal_write_group, log_writer, log_used,
-                            need_log_sync, need_log_dir_sync, current_sequence);
+      io_s = WriteToWAL(wal_write_group, log_writer, log_used, need_log_sync,
+                        need_log_dir_sync, current_sequence);
+      w.status = io_s;
     }
 
     if (!w.CallbackFailed()) {
-      WriteStatusCheck(w.status);
+      if (!io_s.ok()) {
+        IOStatusCheck(io_s);
+      } else {
+        WriteStatusCheck(w.status);
+      }
     }
 
     if (need_log_sync) {
       mutex_.Lock();
@@ -740,9 +752,10 @@ Status DBImpl::WriteImplWALOnly(
     }
     seq_inc = total_batch_cnt;
   }
+  IOStatus io_s;
   if (!write_options.disableWAL) {
-    status =
-        ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+    io_s = ConcurrentWriteToWAL(write_group, log_used, &last_sequence, seq_inc);
+    status = io_s;
   } else {
     // Otherwise we inc seq number to do solely the seq allocation
     last_sequence = versions_->FetchAddLastAllocatedSequence(seq_inc);
@@ -777,8 +790,12 @@ Status DBImpl::WriteImplWALOnly(
   PERF_TIMER_START(write_pre_and_post_process_time);
 
   if (!w.CallbackFailed()) {
-    WriteStatusCheck(status);
+    if (!io_s.ok()) {
+      IOStatusCheck(io_s);
+    } else {
+      WriteStatusCheck(status);
+    }
   }
   if (status.ok()) {
     size_t index = 0;
     for (auto* writer : write_group) {
@@ -823,6 +840,17 @@ void DBImpl::WriteStatusCheck(const Status& status) {
   }
 }
 
+void DBImpl::IOStatusCheck(const IOStatus& io_status) {
+  // Is setting bg_error_ enough here? This will at least stop
+  // compaction and fail any further writes.
+  if (immutable_db_options_.paranoid_checks && !io_status.ok() &&
+      !io_status.IsBusy() && !io_status.IsIncomplete()) {
+    mutex_.Lock();
+    error_handler_.SetBGError(io_status, BackgroundErrorReason::kWriteCallback);
+    mutex_.Unlock();
+  }
+}
+
 void DBImpl::MemTableInsertStatusCheck(const Status& status) {
   // A non-OK status here indicates that the state implied by the
   // WAL has diverged from the in-memory state. This could be
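[Note: the new IOStatusCheck follows the same pattern as the existing WriteStatusCheck (paranoid-checks gate, Busy/Incomplete exemptions) but hands error_handler_ the full IOStatus rather than a plain Status, so the handler can see I/O-specific detail such as retryability when classifying the background error. A toy sketch of a handler using that extra bit, reusing the toy IOStatus from the sketch above; the names and the severity policy here are purely illustrative, not RocksDB's ErrorHandler API:]

    // Toy handler (hypothetical names): branches on I/O detail that a
    // plain Status could not carry. Which branch maps to which severity
    // is a policy choice; this mapping is arbitrary, for illustration.
    enum class ToySeverity { kSoftError, kHardError };

    class ToyErrorHandler {
     public:
      void SetBGError(const IOStatus& io_s) {
        severity_ = io_s.IsRetryable() ? ToySeverity::kSoftError
                                       : ToySeverity::kHardError;
      }
      ToySeverity severity() const { return severity_; }

     private:
      ToySeverity severity_ = ToySeverity::kSoftError;
    };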
@@ -961,7 +989,7 @@ WriteBatch* DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
 
 // When two_write_queues_ is disabled, this function is called from the only
 // write thread. Otherwise this must be called holding log_write_mutex_.
-Status DBImpl::WriteToWAL(const WriteBatch& merged_batch,
-                          log::Writer* log_writer, uint64_t* log_used,
-                          uint64_t* log_size) {
+IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
+                            log::Writer* log_writer, uint64_t* log_used,
+                            uint64_t* log_size) {
   assert(log_size != nullptr);
@@ -978,7 +1006,8 @@ Status DBImpl::WriteToWAL(const WriteBatch& merged_batch,
   if (UNLIKELY(needs_locking)) {
     log_write_mutex_.Lock();
   }
-  Status status = log_writer->AddRecord(log_entry);
+  IOStatus io_s = log_writer->AddRecord(log_entry);
+
   if (UNLIKELY(needs_locking)) {
     log_write_mutex_.Unlock();
   }
@@ -990,15 +1019,14 @@ Status DBImpl::WriteToWAL(const WriteBatch& merged_batch,
   // since alive_log_files_ might be modified concurrently
   alive_log_files_.back().AddSize(log_entry.size());
   log_empty_ = false;
-  return status;
+  return io_s;
 }
 
-Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
-                          log::Writer* log_writer, uint64_t* log_used,
-                          bool need_log_sync, bool need_log_dir_sync,
-                          SequenceNumber sequence) {
-  Status status;
+IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
+                            log::Writer* log_writer, uint64_t* log_used,
+                            bool need_log_sync, bool need_log_dir_sync,
+                            SequenceNumber sequence) {
+  IOStatus io_s;
   assert(!write_group.leader->disable_wal);
   // Same holds for all in the batch group
   size_t write_with_wal = 0;
@@ -1016,13 +1044,13 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
   WriteBatchInternal::SetSequence(merged_batch, sequence);
 
   uint64_t log_size;
-  status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
+  io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
   if (to_be_cached_state) {
     cached_recoverable_state_ = *to_be_cached_state;
     cached_recoverable_state_empty_ = false;
   }
 
-  if (status.ok() && need_log_sync) {
+  if (io_s.ok() && need_log_sync) {
     StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
     // It's safe to access logs_ with unlocked mutex_ here because:
     //  - we've set getting_synced=true for all logs,
@@ -1032,23 +1060,24 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
     //  - as long as other threads don't modify it, it's safe to read
     //    from std::deque from multiple threads concurrently.
     for (auto& log : logs_) {
-      status = log.writer->file()->Sync(immutable_db_options_.use_fsync);
-      if (!status.ok()) {
+      io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
+      if (!io_s.ok()) {
         break;
       }
     }
-    if (status.ok() && need_log_dir_sync) {
+
+    if (io_s.ok() && need_log_dir_sync) {
       // We only sync WAL directory the first time WAL syncing is
       // requested, so that in case users never turn on WAL sync,
      // we can avoid the disk I/O in the write code path.
-      status = directories_.GetWalDir()->Fsync(IOOptions(), nullptr);
+      io_s = directories_.GetWalDir()->Fsync(IOOptions(), nullptr);
    }
  }
 
  if (merged_batch == &tmp_batch_) {
    tmp_batch_.Clear();
  }
-  if (status.ok()) {
+  if (io_s.ok()) {
    auto stats = default_cf_internal_stats_;
    if (need_log_sync) {
      stats->AddDBStats(InternalStats::kIntStatsWalFileSynced, 1);
@@ -1059,14 +1088,13 @@ Status DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
     stats->AddDBStats(InternalStats::kIntStatsWriteWithWal, write_with_wal);
     RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
   }
-  return status;
+  return io_s;
 }
 
-Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
-                                    uint64_t* log_used,
-                                    SequenceNumber* last_sequence,
-                                    size_t seq_inc) {
-  Status status;
+IOStatus DBImpl::ConcurrentWriteToWAL(
+    const WriteThread::WriteGroup& write_group, uint64_t* log_used,
+    SequenceNumber* last_sequence, size_t seq_inc) {
+  IOStatus io_s;
   assert(!write_group.leader->disable_wal);
   // Same holds for all in the batch group
   size_t write_with_wal = 0;
@@ -1092,14 +1120,14 @@ Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
 
   log::Writer* log_writer = logs_.back().writer;
   uint64_t log_size;
-  status = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
+  io_s = WriteToWAL(*merged_batch, log_writer, log_used, &log_size);
   if (to_be_cached_state) {
     cached_recoverable_state_ = *to_be_cached_state;
     cached_recoverable_state_empty_ = false;
   }
   log_write_mutex_.Unlock();
 
-  if (status.ok()) {
+  if (io_s.ok()) {
     const bool concurrent = true;
     auto stats = default_cf_internal_stats_;
     stats->AddDBStats(InternalStats::kIntStatsWalFileBytes, log_size,
@@ -1109,7 +1137,7 @@ Status DBImpl::ConcurrentWriteToWAL(const WriteThread::WriteGroup& write_group,
                       concurrent);
     RecordTick(stats_, WRITE_WITH_WAL, write_with_wal);
   }
-  return status;
+  return io_s;
 }
 
 Status DBImpl::WriteRecoverableState() {
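[Note: taken together, the hunks retype the whole WAL chain. log::Writer::AddRecord is now consumed as an IOStatus, both WriteToWAL overloads and ConcurrentWriteToWAL return IOStatus, and the write entry points (WriteImpl, PipelinedWriteImpl, WriteImplWALOnly) inspect io_s first, then fold it into the Status they return. A toy sketch of that layering, reusing the toy types from the first sketch; the function names are hypothetical, not RocksDB signatures:]

    // Lowest layer: the I/O result originates here.
    IOStatus ToyAddRecord() { return IOStatus(); }

    // Middle layer: keeps the IOStatus type so no I/O detail is lost.
    IOStatus ToyWriteToWAL() { return ToyAddRecord(); }

    // Entry point: the public API still returns Status; the I/O detail
    // is examined before the widening conversion discards it.
    Status ToyWriteImpl() {
      IOStatus io_s = ToyWriteToWAL();
      if (!io_s.ok()) {
        // the IOStatus-aware check (cf. IOStatusCheck) would run here
      }
      return io_s;  // implicit IOStatus -> Status, as in `status = io_s;`
    }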