@@ -195,7 +195,7 @@ void FlushJob::PickMemTable() {
   // If mempurge feature is activated, keep track of any potential
   // memtables coming from a previous mempurge operation.
   // Used for mempurge policy.
-  if (db_options_.experimental_allow_mempurge) {
+  if (db_options_.experimental_mempurge_threshold > 0.0) {
     contains_mempurge_outcome_ = false;
     for (MemTable* mt : mems_) {
       if (cfd_->imm()->IsMemPurgeOutput(mt->GetID())) {
@@ -241,7 +241,7 @@ Status FlushJob::Run(LogsWithPrepTracker* prep_tracker,
     prev_cpu_read_nanos = IOSTATS(cpu_read_nanos);
   }
   Status mempurge_s = Status::NotFound("No MemPurge.");
-  if (db_options_.experimental_allow_mempurge &&
+  if ((db_options_.experimental_mempurge_threshold > 0.0) &&
       (cfd_->GetFlushReason() == FlushReason::kWriteBufferFull) &&
       (!mems_.empty()) && MemPurgeDecider()) {
     mempurge_s = MemPurge();
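For reference, the new gate replaces the experimental_allow_mempurge flag and the
MemPurgePolicy enum with a single double, experimental_mempurge_threshold (0.0 by
default, which keeps mempurge disabled). Below is a minimal usage sketch of how a
caller would opt in, assuming the option is exposed on DBOptions/Options as this
diff implies; the database path and threshold value are illustrative only.

// Illustrative sketch, not part of the diff: enabling mempurge through the
// new threshold option. A strictly positive value lets MemPurgeDecider() run;
// larger values make the decider more willing to purge in memory.
#include <rocksdb/db.h>
#include <rocksdb/options.h>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Hypothetical choice: 1.0 means "purge when the surviving payload would
  // fit in one write buffer" (see MemPurgeDecider() below).
  options.experimental_mempurge_threshold = 1.0;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/mempurge_demo", &db);
  if (s.ok()) {
    s = db->Put(rocksdb::WriteOptions(), "key", "value");
    delete db;
  }
  return s.ok() ? 0 : 1;
}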
@@ -580,8 +580,6 @@ Status FlushJob::MemPurge() {
       // This addition will not trigger another flush, because
       // we do not call SchedulePendingFlush().
       cfd_->imm()->Add(new_mem, &job_context_->memtables_to_free);
-      new_mem_capacity = (new_mem->ApproximateMemoryUsage()) * 1.0 /
-                         mutable_cf_options_.write_buffer_size;
       new_mem->Ref();
       db_mutex_->Unlock();
     } else {
@@ -622,16 +620,129 @@ Status FlushJob::MemPurge() {
 }
 
 bool FlushJob::MemPurgeDecider() {
-  MemPurgePolicy policy = db_options_.experimental_mempurge_policy;
-  if (policy == MemPurgePolicy::kAlways) {
+  double threshold = db_options_.experimental_mempurge_threshold;
+  // Never trigger mempurge if threshold is not a strictly positive value.
+  if (!(threshold > 0.0)) {
+    return false;
+  }
+  if (threshold > (1.0 * mems_.size())) {
     return true;
-  } else if (policy == MemPurgePolicy::kAlternate) {
-    // Note: if at least one of the flushed memtables is
-    // an output of a previous mempurge process, then flush
-    // to storage.
-    return !(contains_mempurge_outcome_);
   }
-  return false;
+  // Payload and useful_payload (in bytes).
+  // The useful payload ratio of a given MemTable
+  // is estimated to be useful_payload/payload.
+  uint64_t payload = 0, useful_payload = 0;
+  // If estimated_useful_payload is > threshold,
+  // then flush to storage, else MemPurge.
+  double estimated_useful_payload = 0.0;
+  // Cochran formula for determining sample size.
+  // 95% confidence interval, 7% precision.
+  //    n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0
+  double n0 = 196.0;
+  ReadOptions ro;
+  ro.total_order_seek = true;
+
+  // Iterate over each memtable of the set.
+  for (MemTable* mt : mems_) {
+    // If the memtable is the output of a previous mempurge,
+    // its approximate useful payload ratio is already calculated.
+    if (cfd_->imm()->IsMemPurgeOutput(mt->GetID())) {
+      // We make the assumption that this memtable is already
+      // free of garbage (garbage underestimation).
+      estimated_useful_payload += mt->ApproximateMemoryUsage();
+    } else {
+      // Else sample from the table.
+      uint64_t nentries = mt->num_entries();
+      // Corrected Cochran formula for small populations
+      // (converges to n0 for large populations).
+      uint64_t target_sample_size =
+          static_cast<uint64_t>(ceil(n0 / (1.0 + (n0 / nentries))));
+      std::unordered_set<const char*> sentries = {};
+      // Populate sample entries set.
+      mt->UniqueRandomSample(target_sample_size, &sentries);
+
+      // Estimate the garbage ratio by comparing if
+      // each sample corresponds to a valid entry.
+      for (const char* ss : sentries) {
+        ParsedInternalKey res;
+        Slice entry_slice = GetLengthPrefixedSlice(ss);
+        Status parse_s =
+            ParseInternalKey(entry_slice, &res, true /*log_err_key*/);
+        if (!parse_s.ok()) {
+          ROCKS_LOG_WARN(db_options_.info_log,
+                         "Memtable Decider: ParseInternalKey did not parse "
+                         "entry_slice %s "
+                         "successfully.",
+                         entry_slice.data());
+        }
+        LookupKey lkey(res.user_key, kMaxSequenceNumber);
+        std::string vget;
+        Status s;
+        MergeContext merge_context;
+        SequenceNumber max_covering_tombstone_seq = 0, sqno = 0;
+
+        // Pick the oldest existing snapshot that is more recent
+        // than the sequence number of the sampled entry.
+        SequenceNumber min_seqno_snapshot = kMaxSequenceNumber;
+        SnapshotImpl min_snapshot;
+        for (SequenceNumber seq_num : existing_snapshots_) {
+          if (seq_num > res.sequence && seq_num < min_seqno_snapshot) {
+            min_seqno_snapshot = seq_num;
+          }
+        }
+        min_snapshot.number_ = min_seqno_snapshot;
+        ro.snapshot =
+            min_seqno_snapshot < kMaxSequenceNumber ? &min_snapshot : nullptr;
+
+        // Estimate if the sample entry is valid or not.
+        bool gres = mt->Get(lkey, &vget, nullptr, &s, &merge_context,
+                            &max_covering_tombstone_seq, &sqno, ro);
+        if (!gres) {
+          ROCKS_LOG_WARN(
+              db_options_.info_log,
+              "Memtable Get returned false when Get(sampled entry). "
+              "Yet each sample entry should exist somewhere in the memtable, "
+              "unrelated to whether it has been deleted or not.");
+        }
+        payload += entry_slice.size();
+
+        // TODO(bjlemaire): evaluate typeMerge.
+        // This is where the sampled entry is estimated to be
+        // garbage or not. Note that this is a garbage *estimation*
+        // because we do not include certain items such as
+        // CompactionFilters triggered at flush, or if the same delete
+        // has been inserted twice or more in the memtable.
+        if (res.type == kTypeValue && gres && s.ok() && sqno == res.sequence) {
+          useful_payload += entry_slice.size();
+        } else if (((res.type == kTypeDeletion) ||
+                    (res.type == kTypeSingleDeletion)) &&
+                   s.IsNotFound() && gres) {
+          useful_payload += entry_slice.size();
+        }
+      }
+      if (payload > 0) {
+        // We use the estimated useful payload ratio
+        // to evaluate how much of the total memtable is useful bytes.
+        estimated_useful_payload +=
+            (mt->ApproximateMemoryUsage()) * (useful_payload * 1.0 / payload);
+        ROCKS_LOG_INFO(
+            db_options_.info_log,
+            "Mempurge sampling - found garbage ratio from sampling: %f.\n",
+            (payload - useful_payload) * 1.0 / payload);
+      } else {
+        ROCKS_LOG_WARN(
+            db_options_.info_log,
+            "Mempurge kSampling policy: null payload measured, and collected "
+            "sample size is %zu\n.",
+            sentries.size());
+      }
+    }
+  }
+  // We convert the total number of useful payload bytes
+  // into the proportion of memtable necessary to store all these bytes.
+  // We compare this proportion with the threshold value.
+  return (estimated_useful_payload / mutable_cf_options_.write_buffer_size) <
+         threshold;
 }
 
 Status FlushJob::WriteLevel0Table() {
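As a side note on the math in MemPurgeDecider(): the sample size comes from
Cochran's formula at 95% confidence and 7% precision (n0 = 196), corrected for
the finite number of entries in the memtable, and the final decision compares
the estimated surviving payload against threshold * write_buffer_size. The
standalone sketch below simply replays those two calculations with made-up
numbers; none of the names are RocksDB APIs.

// Standalone illustration of the sampling and decision arithmetic above.
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // Cochran sample size for 95% confidence, 7% precision:
  //   n0 = (1.96 * 1.96) * 0.25 / (0.07 * 0.07) = 196.0
  const double n0 = 196.0;

  // Finite-population correction; converges to n0 for large memtables.
  auto target_sample_size = [n0](uint64_t nentries) {
    return static_cast<uint64_t>(std::ceil(n0 / (1.0 + (n0 / nentries))));
  };
  std::printf("1,000 entries   -> %llu samples\n",
              static_cast<unsigned long long>(target_sample_size(1000)));
  std::printf("100,000 entries -> %llu samples\n",
              static_cast<unsigned long long>(target_sample_size(100000)));

  // Decision rule from the diff: mempurge when the bytes estimated to survive
  // fit under threshold * write_buffer_size; otherwise flush to storage.
  const double write_buffer_size = 64.0 * 1024 * 1024;         // hypothetical
  const double estimated_useful_payload = 20.0 * 1024 * 1024;  // ~31% useful
  const double threshold = 1.0;
  const bool do_mempurge =
      (estimated_useful_payload / write_buffer_size) < threshold;
  std::printf("mempurge: %s\n", do_mempurge ? "yes" : "no");
  return 0;
}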
@@ -843,7 +954,7 @@ Status FlushJob::WriteLevel0Table() {
     stats.num_output_files_blob = static_cast<int>(blobs.size());
   }
 
-  if (db_options_.experimental_allow_mempurge && s.ok()) {
+  if ((db_options_.experimental_mempurge_threshold > 0.0) && s.ok()) {
     // The db_mutex is held at this point.
     for (MemTable* mt : mems_) {
       // Note: if m is not a previous mempurge output memtable,