@ -80,7 +80,9 @@ struct DBImpl::Writer {
WriteBatch * batch ;
WriteBatch * batch ;
bool sync ;
bool sync ;
bool disableWAL ;
bool disableWAL ;
bool in_batch_group ;
bool done ;
bool done ;
uint64_t timeout_hint_us ;
port : : CondVar cv ;
port : : CondVar cv ;
explicit Writer ( port : : Mutex * mu ) : cv ( mu ) { }
explicit Writer ( port : : Mutex * mu ) : cv ( mu ) { }
@ -3729,13 +3731,41 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
w . batch = my_batch ;
w . batch = my_batch ;
w . sync = options . sync ;
w . sync = options . sync ;
w . disableWAL = options . disableWAL ;
w . disableWAL = options . disableWAL ;
w . in_batch_group = false ;
w . done = false ;
w . timeout_hint_us = options . timeout_hint_us ;
uint64_t expiration_time = 0 ;
if ( w . timeout_hint_us = = 0 ) {
w . timeout_hint_us = kNoTimeOut ;
} else {
expiration_time = env_ - > NowMicros ( ) + w . timeout_hint_us ;
}
w . done = false ;
w . done = false ;
StopWatch sw ( env_ , options_ . statistics . get ( ) , DB_WRITE , false ) ;
mutex_ . Lock ( ) ;
mutex_ . Lock ( ) ;
// the following code block pushes the current writer "w" into the writer
// queue "writers_" and wait until one of the following conditions met:
// 1. the job of "w" has been done by some other writers.
// 2. "w" becomes the first writer in "writers_"
// 3. "w" timed-out.
writers_ . push_back ( & w ) ;
writers_ . push_back ( & w ) ;
bool timed_out = false ;
while ( ! w . done & & & w ! = writers_ . front ( ) ) {
while ( ! w . done & & & w ! = writers_ . front ( ) ) {
if ( expiration_time = = 0 ) {
w . cv . Wait ( ) ;
w . cv . Wait ( ) ;
} else if ( w . cv . TimedWait ( expiration_time ) ) {
if ( w . in_batch_group ) {
// then it means the front writer is currently doing the
// write on behalf of this "timed-out" writer. Then it
// should wait until the write completes.
expiration_time = 0 ;
} else {
timed_out = true ;
break ;
}
}
}
}
if ( ! options . disableWAL ) {
if ( ! options . disableWAL ) {
@ -3746,10 +3776,33 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
mutex_ . Unlock ( ) ;
mutex_ . Unlock ( ) ;
RecordTick ( options_ . statistics . get ( ) , WRITE_DONE_BY_OTHER , 1 ) ;
RecordTick ( options_ . statistics . get ( ) , WRITE_DONE_BY_OTHER , 1 ) ;
return w . status ;
return w . status ;
} else if ( timed_out ) {
bool found = false ;
for ( auto iter = writers_ . begin ( ) ; iter ! = writers_ . end ( ) ; iter + + ) {
if ( * iter = = & w ) {
writers_ . erase ( iter ) ;
found = true ;
break ;
}
}
assert ( found ) ;
// writers_.front() might still be in cond_wait without a time-out.
// As a result, we need to signal it to wake it up. Otherwise no
// one else will wake him up, and RocksDB will hang.
if ( ! writers_ . empty ( ) ) {
writers_ . front ( ) - > cv . Signal ( ) ;
}
mutex_ . Unlock ( ) ;
RecordTick ( options_ . statistics . get ( ) , WRITE_TIMEDOUT , 1 ) ;
return Status : : TimedOut ( ) ;
} else {
} else {
RecordTick ( options_ . statistics . get ( ) , WRITE_DONE_BY_SELF , 1 ) ;
RecordTick ( options_ . statistics . get ( ) , WRITE_DONE_BY_SELF , 1 ) ;
}
}
// Once reaches this point, the current writer "w" will try to do its write
// job. It may also pick up some of the remaining writers in the "writers_"
// when it finds suitable, and finish them in the same write batch.
// This is how a write job could be done by the other writer.
assert ( ! single_column_family_mode_ | |
assert ( ! single_column_family_mode_ | |
versions_ - > GetColumnFamilySet ( ) - > NumberOfColumnFamilies ( ) = = 1 ) ;
versions_ - > GetColumnFamilySet ( ) - > NumberOfColumnFamilies ( ) = = 1 ) ;
@ -3774,8 +3827,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
if ( LIKELY ( single_column_family_mode_ ) ) {
if ( LIKELY ( single_column_family_mode_ ) ) {
// fast path
// fast path
status = MakeRoomForWrite ( default_cf_handle_ - > cfd ( ) , my_batch = = nullptr ,
status = MakeRoomForWrite (
& superversions_to_free , & logs_to_free ) ;
default_cf_handle_ - > cfd ( ) , my_batch = = nullptr ,
& superversions_to_free , & logs_to_free ,
expiration_time ) ;
} else {
} else {
// refcounting cfd in iteration
// refcounting cfd in iteration
bool dead_cfd = false ;
bool dead_cfd = false ;
@ -3786,8 +3841,9 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
( flush_column_family_if_log_file ! = 0 & &
( flush_column_family_if_log_file ! = 0 & &
cfd - > GetLogNumber ( ) < = flush_column_family_if_log_file ) ;
cfd - > GetLogNumber ( ) < = flush_column_family_if_log_file ) ;
// May temporarily unlock and wait.
// May temporarily unlock and wait.
status = MakeRoomForWrite ( cfd , force_flush , & superversions_to_free ,
status = MakeRoomForWrite (
& logs_to_free ) ;
cfd , force_flush , & superversions_to_free , & logs_to_free ,
expiration_time ) ;
if ( cfd - > Unref ( ) ) {
if ( cfd - > Unref ( ) ) {
dead_cfd = true ;
dead_cfd = true ;
}
}
@ -3883,11 +3939,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
}
}
}
}
}
}
if ( options_ . paranoid_checks & & ! status . ok ( ) & & bg_error_ . ok ( ) ) {
if ( options_ . paranoid_checks & & ! status . ok ( ) & &
! status . IsTimedOut ( ) & & bg_error_ . ok ( ) ) {
bg_error_ = status ; // stop compaction & fail any further writes
bg_error_ = status ; // stop compaction & fail any further writes
}
}
while ( true ) {
// Pop out the current writer and all writers being pushed before the
// current writer from the writer queue.
while ( ! writers_ . empty ( ) ) {
Writer * ready = writers_ . front ( ) ;
Writer * ready = writers_ . front ( ) ;
writers_ . pop_front ( ) ;
writers_ . pop_front ( ) ;
if ( ready ! = & w ) {
if ( ready ! = & w ) {
@ -3904,6 +3963,10 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
}
}
mutex_ . Unlock ( ) ;
mutex_ . Unlock ( ) ;
if ( status . IsTimedOut ( ) ) {
RecordTick ( options_ . statistics . get ( ) , WRITE_TIMEDOUT , 1 ) ;
}
for ( auto & sv : superversions_to_free ) {
for ( auto & sv : superversions_to_free ) {
delete sv ;
delete sv ;
}
}
@ -3915,6 +3978,9 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
return status ;
return status ;
}
}
// This function will be called only when the first writer succeeds.
// All writers in the to-be-built batch group will be processed.
//
// REQUIRES: Writer list must be non-empty
// REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-nullptr batch
// REQUIRES: First writer must have a non-nullptr batch
void DBImpl : : BuildBatchGroup ( Writer * * last_writer ,
void DBImpl : : BuildBatchGroup ( Writer * * last_writer ,
@ -3950,6 +4016,12 @@ void DBImpl::BuildBatchGroup(Writer** last_writer,
break ;
break ;
}
}
if ( w - > timeout_hint_us < first - > timeout_hint_us ) {
// Do not include those writes with shorter timeout. Otherwise, we might
// execute a write that should instead be aborted because of timeout.
break ;
}
if ( w - > batch ! = nullptr ) {
if ( w - > batch ! = nullptr ) {
size + = WriteBatchInternal : : ByteSize ( w - > batch ) ;
size + = WriteBatchInternal : : ByteSize ( w - > batch ) ;
if ( size > max_size ) {
if ( size > max_size ) {
@ -3959,6 +4031,7 @@ void DBImpl::BuildBatchGroup(Writer** last_writer,
write_batch_group - > push_back ( w - > batch ) ;
write_batch_group - > push_back ( w - > batch ) ;
}
}
w - > in_batch_group = true ;
* last_writer = w ;
* last_writer = w ;
}
}
}
}
@ -4000,7 +4073,8 @@ uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) {
Status DBImpl : : MakeRoomForWrite (
Status DBImpl : : MakeRoomForWrite (
ColumnFamilyData * cfd , bool force ,
ColumnFamilyData * cfd , bool force ,
autovector < SuperVersion * > * superversions_to_free ,
autovector < SuperVersion * > * superversions_to_free ,
autovector < log : : Writer * > * logs_to_free ) {
autovector < log : : Writer * > * logs_to_free ,
uint64_t expiration_time ) {
mutex_ . AssertHeld ( ) ;
mutex_ . AssertHeld ( ) ;
assert ( ! writers_ . empty ( ) ) ;
assert ( ! writers_ . empty ( ) ) ;
bool allow_delay = ! force ;
bool allow_delay = ! force ;
@ -4013,12 +4087,16 @@ Status DBImpl::MakeRoomForWrite(
// might generate a tight feedback loop, constantly scheduling more background
// might generate a tight feedback loop, constantly scheduling more background
// work, even if additional background work is not needed
// work, even if additional background work is not needed
bool schedule_background_work = true ;
bool schedule_background_work = true ;
bool has_timeout = ( expiration_time > 0 ) ;
while ( true ) {
while ( true ) {
if ( ! bg_error_ . ok ( ) ) {
if ( ! bg_error_ . ok ( ) ) {
// Yield previous error
// Yield previous error
s = bg_error_ ;
s = bg_error_ ;
break ;
break ;
} else if ( has_timeout & & env_ - > NowMicros ( ) > expiration_time ) {
s = Status : : TimedOut ( ) ;
break ;
} else if ( allow_delay & & cfd - > NeedSlowdownForNumLevel0Files ( ) ) {
} else if ( allow_delay & & cfd - > NeedSlowdownForNumLevel0Files ( ) ) {
// We are getting close to hitting a hard limit on the number of
// We are getting close to hitting a hard limit on the number of
// L0 files. Rather than delaying a single write by several
// L0 files. Rather than delaying a single write by several
@ -4063,7 +4141,11 @@ Status DBImpl::MakeRoomForWrite(
{
{
StopWatch sw ( env_ , options_ . statistics . get ( ) ,
StopWatch sw ( env_ , options_ . statistics . get ( ) ,
STALL_MEMTABLE_COMPACTION_COUNT ) ;
STALL_MEMTABLE_COMPACTION_COUNT ) ;
if ( ! has_timeout ) {
bg_cv_ . Wait ( ) ;
bg_cv_ . Wait ( ) ;
} else {
bg_cv_ . TimedWait ( expiration_time ) ;
}
stall = sw . ElapsedMicros ( ) ;
stall = sw . ElapsedMicros ( ) ;
}
}
RecordTick ( options_ . statistics . get ( ) ,
RecordTick ( options_ . statistics . get ( ) ,
@ -4078,10 +4160,15 @@ Status DBImpl::MakeRoomForWrite(
{
{
StopWatch sw ( env_ , options_ . statistics . get ( ) ,
StopWatch sw ( env_ , options_ . statistics . get ( ) ,
STALL_L0_NUM_FILES_COUNT ) ;
STALL_L0_NUM_FILES_COUNT ) ;
if ( ! has_timeout ) {
bg_cv_ . Wait ( ) ;
bg_cv_ . Wait ( ) ;
} else {
bg_cv_ . TimedWait ( expiration_time ) ;
}
stall = sw . ElapsedMicros ( ) ;
stall = sw . ElapsedMicros ( ) ;
}
}
RecordTick ( options_ . statistics . get ( ) , STALL_L0_NUM_FILES_MICROS , stall ) ;
RecordTick ( options_ . statistics . get ( ) ,
STALL_L0_NUM_FILES_MICROS , stall ) ;
cfd - > internal_stats ( ) - > RecordWriteStall ( InternalStats : : LEVEL0_NUM_FILES ,
cfd - > internal_stats ( ) - > RecordWriteStall ( InternalStats : : LEVEL0_NUM_FILES ,
stall ) ;
stall ) ;
} else if ( allow_hard_rate_limit_delay & & cfd - > ExceedsHardRateLimit ( ) ) {
} else if ( allow_hard_rate_limit_delay & & cfd - > ExceedsHardRateLimit ( ) ) {
@ -4112,18 +4199,18 @@ Status DBImpl::MakeRoomForWrite(
score = cfd - > current ( ) - > MaxCompactionScore ( ) ;
score = cfd - > current ( ) - > MaxCompactionScore ( ) ;
// Delay a write when the compaction score for any level is too large.
// Delay a write when the compaction score for any level is too large.
// TODO: add statistics
// TODO: add statistics
uint64_t slowdown = SlowdownAmount ( score , cfd - > options ( ) - > soft_rate_limit ,
cfd - > options ( ) - > hard_rate_limit ) ;
mutex_ . Unlock ( ) ;
mutex_ . Unlock ( ) ;
{
{
StopWatch sw ( env_ , options_ . statistics . get ( ) ,
StopWatch sw ( env_ , options_ . statistics . get ( ) ,
SOFT_RATE_LIMIT_DELAY_COUNT ) ;
SOFT_RATE_LIMIT_DELAY_COUNT ) ;
env_ - > SleepForMicroseconds (
env_ - > SleepForMicroseconds ( slowdown ) ;
SlowdownAmount ( score , cfd - > options ( ) - > soft_rate_limit ,
slowdown = sw . ElapsedMicros ( ) ;
cfd - > options ( ) - > hard_rate_limit ) ) ;
rate_limit_delay_millis + = slowdown ;
rate_limit_delay_millis + = sw . ElapsedMicros ( ) ;
}
}
allow_soft_rate_limit_delay = false ;
allow_soft_rate_limit_delay = false ;
mutex_ . Lock ( ) ;
mutex_ . Lock ( ) ;
} else {
} else {
unique_ptr < WritableFile > lfile ;
unique_ptr < WritableFile > lfile ;
log : : Writer * new_log = nullptr ;
log : : Writer * new_log = nullptr ;