@ -9,6 +9,7 @@
# include "db/builder.h"
# include "db/builder.h"
# include <algorithm>
# include <deque>
# include <deque>
# include <vector>
# include <vector>
@ -31,6 +32,28 @@
namespace rocksdb {
namespace rocksdb {
namespace {
inline SequenceNumber EarliestVisibleSnapshot (
SequenceNumber in , const std : : vector < SequenceNumber > & snapshots ,
SequenceNumber * prev_snapshot ) {
if ( snapshots . empty ( ) ) {
* prev_snapshot = 0 ; // 0 means no previous snapshot
return kMaxSequenceNumber ;
}
SequenceNumber prev = 0 ;
for ( const auto cur : snapshots ) {
assert ( prev < = cur ) ;
if ( cur > = in ) {
* prev_snapshot = prev ;
return cur ;
}
prev = cur ; // assignment
}
* prev_snapshot = prev ;
return kMaxSequenceNumber ;
}
} // namespace
class TableFactory ;
class TableFactory ;
TableBuilder * NewTableBuilder (
TableBuilder * NewTableBuilder (
@ -53,9 +76,7 @@ Status BuildTable(
FileMetaData * meta , const InternalKeyComparator & internal_comparator ,
FileMetaData * meta , const InternalKeyComparator & internal_comparator ,
const std : : vector < std : : unique_ptr < IntTblPropCollectorFactory > > *
const std : : vector < std : : unique_ptr < IntTblPropCollectorFactory > > *
int_tbl_prop_collector_factories ,
int_tbl_prop_collector_factories ,
const SequenceNumber newest_snapshot ,
std : : vector < SequenceNumber > snapshots , const CompressionType compression ,
const SequenceNumber earliest_seqno_in_memtable ,
const CompressionType compression ,
const CompressionOptions & compression_opts , bool paranoid_file_checks ,
const CompressionOptions & compression_opts , bool paranoid_file_checks ,
InternalStats * internal_stats , const Env : : IOPriority io_priority ,
InternalStats * internal_stats , const Env : : IOPriority io_priority ,
TableProperties * table_properties ) {
TableProperties * table_properties ) {
@ -66,14 +87,6 @@ Status BuildTable(
meta - > smallest_seqno = meta - > largest_seqno = 0 ;
meta - > smallest_seqno = meta - > largest_seqno = 0 ;
iter - > SeekToFirst ( ) ;
iter - > SeekToFirst ( ) ;
// If the sequence number of the smallest entry in the memtable is
// smaller than the most recent snapshot, then we do not trigger
// removal of duplicate/deleted keys as part of this builder.
bool purge = true ;
if ( earliest_seqno_in_memtable < = newest_snapshot ) {
purge = false ;
}
std : : string fname = TableFileName ( ioptions . db_paths , meta - > fd . GetNumber ( ) ,
std : : string fname = TableFileName ( ioptions . db_paths , meta - > fd . GetNumber ( ) ,
meta - > fd . GetPathId ( ) ) ;
meta - > fd . GetPathId ( ) ) ;
if ( iter - > Valid ( ) ) {
if ( iter - > Valid ( ) ) {
@ -107,112 +120,112 @@ Status BuildTable(
ioptions . min_partial_merge_operands ,
ioptions . min_partial_merge_operands ,
true /* internal key corruption is not ok */ ) ;
true /* internal key corruption is not ok */ ) ;
if ( purge ) {
IterKey current_user_key ;
bool has_current_user_key = false ;
// If has_current_user_key == true, this variable remembers the earliest
// snapshot in which this current key already exists. If two internal keys
// have the same user key AND the earlier one should be visible in the
// snapshot in which we already have a user key, we can drop the earlier
// user key
SequenceNumber current_user_key_exists_in_snapshot = kMaxSequenceNumber ;
while ( iter - > Valid ( ) ) {
// Get current key
ParsedInternalKey ikey ;
Slice key = iter - > key ( ) ;
Slice value = iter - > value ( ) ;
// In-memory key corruption is not ok;
// TODO: find a clean way to treat in memory key corruption
// Ugly walkaround to avoid compiler error for release build
// Ugly walkaround to avoid compiler error for release build
bool ok __attribute__ ( ( unused ) ) = true ;
bool ok __attribute__ ( ( unused ) ) = true ;
ok = ParseInternalKey ( key , & ikey ) ;
assert ( ok ) ;
meta - > smallest_seqno = std : : min ( meta - > smallest_seqno , ikey . sequence ) ;
meta - > largest_seqno = std : : max ( meta - > largest_seqno , ikey . sequence ) ;
// If the key is the same as the previous key (and it is not the
// first key), then we skip it, since it is an older version.
// Otherwise we output the key and mark it as the "new" previous key.
if ( ! has_current_user_key | |
internal_comparator . user_comparator ( ) - > Compare (
ikey . user_key , current_user_key . GetKey ( ) ) ! = 0 ) {
// First occurrence of this user key
current_user_key . SetKey ( ikey . user_key ) ;
has_current_user_key = true ;
current_user_key_exists_in_snapshot = 0 ;
}
// Will write to builder if current key != prev key
// If there are no snapshots, then this kv affect visibility at tip.
ParsedInternalKey prev_ikey ;
// Otherwise, search though all existing snapshots to find
std : : string prev_key ;
// the earlist snapshot that is affected by this kv.
bool is_first_key = true ; // Also write if this is the very first key
SequenceNumber prev_snapshot = 0 ; // 0 means no previous snapshot
SequenceNumber key_needs_to_exist_in_snapshot =
while ( iter - > Valid ( ) ) {
EarliestVisibleSnapshot ( ikey . sequence , snapshots , & prev_snapshot ) ;
bool iterator_at_next = false ;
if ( current_user_key_exists_in_snapshot = =
// Get current key
key_needs_to_exist_in_snapshot ) {
ParsedInternalKey this_ikey ;
// If this user key already exists in snapshot in which it needs to
Slice key = iter - > key ( ) ;
// exist, we can drop it.
Slice value = iter - > value ( ) ;
// In other words, if the earliest snapshot is which this key is visible
// in is the same as the visibily of a previous instance of the
// In-memory key corruption is not ok;
// same key, then this kv is not visible in any snapshot.
// TODO: find a clean way to treat in memory key corruption
// Hidden by an newer entry for same user key
ok = ParseInternalKey ( key , & this_ikey ) ;
iter - > Next ( ) ;
assert ( ok ) ;
} else if ( ikey . type = = kTypeMerge ) {
assert ( this_ikey . sequence > = earliest_seqno_in_memtable ) ;
meta - > largest . DecodeFrom ( key ) ;
// If the key is the same as the previous key (and it is not the
// first key), then we skip it, since it is an older version.
// Otherwise we output the key and mark it as the "new" previous key.
if ( ! is_first_key & & ! internal_comparator . user_comparator ( ) - > Compare (
prev_ikey . user_key , this_ikey . user_key ) ) {
// seqno within the same key are in decreasing order
assert ( this_ikey . sequence < prev_ikey . sequence ) ;
} else {
is_first_key = false ;
if ( this_ikey . type = = kTypeMerge ) {
// TODO(tbd): Add a check here to prevent RocksDB from crash when
// reopening a DB w/o properly specifying the merge operator. But
// currently we observed a memory leak on failing in RocksDB
// recovery, so we decide to let it crash instead of causing
// memory leak for now before we have identified the real cause
// of the memory leak.
// Handle merge-type keys using the MergeHelper
// TODO: pass statistics to MergeUntil
merge . MergeUntil ( iter , 0 /* don't worry about snapshot */ ) ;
iterator_at_next = true ;
// Write them out one-by-one. (Proceed back() to front())
// If the merge successfully merged the input into
// a kTypeValue, the list contains a single element.
const std : : deque < std : : string > & keys = merge . keys ( ) ;
const std : : deque < std : : string > & values = merge . values ( ) ;
assert ( keys . size ( ) = = values . size ( ) & & keys . size ( ) > = 1 ) ;
std : : deque < std : : string > : : const_reverse_iterator key_iter ;
std : : deque < std : : string > : : const_reverse_iterator value_iter ;
for ( key_iter = keys . rbegin ( ) , value_iter = values . rbegin ( ) ;
key_iter ! = keys . rend ( ) & & value_iter ! = values . rend ( ) ;
+ + key_iter , + + value_iter ) {
builder - > Add ( Slice ( * key_iter ) , Slice ( * value_iter ) ) ;
}
// Sanity check. Both iterators should end at the same time
assert ( key_iter = = keys . rend ( ) & & value_iter = = values . rend ( ) ) ;
prev_key . assign ( keys . front ( ) ) ;
ok = ParseInternalKey ( Slice ( prev_key ) , & prev_ikey ) ;
assert ( ok ) ;
} else {
// Handle Put/Delete-type keys by simply writing them
builder - > Add ( key , value ) ;
prev_key . assign ( key . data ( ) , key . size ( ) ) ;
ok = ParseInternalKey ( Slice ( prev_key ) , & prev_ikey ) ;
assert ( ok ) ;
}
}
if ( io_priority = = Env : : IO_HIGH & &
// TODO(tbd): Add a check here to prevent RocksDB from crash when
IOSTATS ( bytes_written ) > = kReportFlushIOStatsEvery ) {
// reopening a DB w/o properly specifying the merge operator. But
ThreadStatusUtil : : IncreaseThreadOperationProperty (
// currently we observed a memory leak on failing in RocksDB
ThreadStatus : : FLUSH_BYTES_WRITTEN ,
// recovery, so we decide to let it crash instead of causing
IOSTATS ( bytes_written ) ) ;
// memory leak for now before we have identified the real cause
IOSTATS_RESET ( bytes_written ) ;
// of the memory leak.
// Handle merge-type keys using the MergeHelper
// TODO: pass statistics to MergeUntil
merge . MergeUntil ( iter , prev_snapshot , false , nullptr , env ) ;
// IMPORTANT: Slice key doesn't point to a valid value anymore!!
const auto & keys = merge . keys ( ) ;
const auto & values = merge . values ( ) ;
assert ( ! keys . empty ( ) ) ;
assert ( keys . size ( ) = = values . size ( ) ) ;
// largest possible sequence number in a merge queue is already stored
// in ikey.sequence.
// we additionally have to consider the front of the merge queue, which
// might have the smallest sequence number (out of all the merges with
// the same key)
meta - > smallest_seqno =
std : : min ( meta - > smallest_seqno , GetInternalKeySeqno ( keys . front ( ) ) ) ;
// We have a list of keys to write, write all keys in the list.
for ( auto key_iter = keys . rbegin ( ) , value_iter = values . rbegin ( ) ;
key_iter ! = keys . rend ( ) ; key_iter + + , value_iter + + ) {
key = Slice ( * key_iter ) ;
value = Slice ( * value_iter ) ;
bool valid_key __attribute__ ( ( __unused__ ) ) =
ParseInternalKey ( key , & ikey ) ;
// MergeUntil stops when it encounters a corrupt key and does not
// include them in the result, so we expect the keys here to valid.
assert ( valid_key ) ;
builder - > Add ( key , value ) ;
}
}
if ( ! iterator_at_next ) iter - > Next ( ) ;
} else { // just write out the key-value
builder - > Add ( key , value ) ;
meta - > largest . DecodeFrom ( key ) ;
iter - > Next ( ) ;
}
}
// The last key is the largest key
current_user_key_exists_in_snapshot = key_needs_to_exist_in_snapshot ;
meta - > largest . DecodeFrom ( Slice ( prev_key ) ) ;
SequenceNumber seqno = GetInternalKeySeqno ( Slice ( prev_key ) ) ;
meta - > smallest_seqno = std : : min ( meta - > smallest_seqno , seqno ) ;
meta - > largest_seqno = std : : max ( meta - > largest_seqno , seqno ) ;
} else {
if ( io_priority = = Env : : IO_HIGH & &
for ( ; iter - > Valid ( ) ; iter - > Next ( ) ) {
IOSTATS ( bytes_written ) > = kReportFlushIOStatsEvery ) {
Slice key = iter - > key ( ) ;
ThreadStatusUtil : : IncreaseThreadOperationProperty (
meta - > largest . DecodeFrom ( key ) ;
ThreadStatus : : FLUSH_BYTES_WRITTEN , IOSTATS ( bytes_written ) ) ;
builder - > Add ( key , iter - > value ( ) ) ;
IOSTATS_RESET ( bytes_written ) ;
SequenceNumber seqno = GetInternalKeySeqno ( key ) ;
meta - > smallest_seqno = std : : min ( meta - > smallest_seqno , seqno ) ;
meta - > largest_seqno = std : : max ( meta - > largest_seqno , seqno ) ;
if ( io_priority = = Env : : IO_HIGH & &
IOSTATS ( bytes_written ) > = kReportFlushIOStatsEvery ) {
ThreadStatusUtil : : IncreaseThreadOperationProperty (
ThreadStatus : : FLUSH_BYTES_WRITTEN ,
IOSTATS ( bytes_written ) ) ;
IOSTATS_RESET ( bytes_written ) ;
}
}
}
}
}