@ -4872,84 +4872,134 @@ Status VersionSet::WriteSnapshot(log::Writer* log) {
// (a,b) then (b,c) then (c,d). Knowing this, an optimization is possible where
// we avoid doing binary search for the keys b and c twice and instead somehow
// maintain state of where they first appear in the files.
uint64_t VersionSet : : ApproximateSize ( Version * v , const Slice & start ,
uint64_t VersionSet : : ApproximateSize ( const SizeApproximationOptions & options ,
Version * v , const Slice & start ,
const Slice & end , int start_level ,
const Slice & end , int start_level ,
int end_level , TableReaderCaller caller ) {
int end_level , TableReaderCaller caller ) {
const auto & icmp = v - > cfd_ - > internal_comparator ( ) ;
// pre-condition
// pre-condition
assert ( v - > cfd_ - > internal_comparator ( ) . Compare ( start , end ) < = 0 ) ;
assert ( icmp . Compare ( start , end ) < = 0 ) ;
uint64_t size = 0 ;
uint64_t total_full_ size = 0 ;
const auto * vstorage = v - > storage_info ( ) ;
const auto * vstorage = v - > storage_info ( ) ;
end_level = end_level = = - 1
const int num_non_empty_levels = vstorage - > num_non_empty_levels ( ) ;
? vstorage - > num_non_empty_levels ( )
end_level = ( end_level = = - 1 ) ? num_non_empty_levels
: std : : min ( end_level , vstorage - > num_non_empty_levels ( ) ) ;
: std : : min ( end_level , num_non_empty_levels ) ;
assert ( start_level < = end_level ) ;
assert ( start_level < = end_level ) ;
for ( int level = start_level ; level < end_level ; level + + ) {
// Outline of the optimization that uses options.files_size_error_margin.
// When approximating the files total size that is used to store a keys range,
// we first sum up the sizes of the files that fully fall into the range.
// Then we sum up the sizes of all the files that may intersect with the range
// (this includes all files in L0 as well). Then, if total_intersecting_size
// is smaller than total_full_size * options.files_size_error_margin - we can
// infer that the intersecting files have a sufficiently negligible
// contribution to the total size, and we can approximate the storage required
// for the keys in range as just half of the intersecting_files_size.
// E.g., if the value of files_size_error_margin is 0.1, then the error of the
// approximation is limited to only ~10% of the total size of files that fully
// fall into the keys range. In such case, this helps to avoid a costly
// process of binary searching the intersecting files that is required only
// for a more precise calculation of the total size.
autovector < FdWithKeyRange * , 32 > first_files ;
autovector < FdWithKeyRange * , 16 > last_files ;
// scan all the levels
for ( int level = start_level ; level < end_level ; + + level ) {
const LevelFilesBrief & files_brief = vstorage - > LevelFilesBrief ( level ) ;
const LevelFilesBrief & files_brief = vstorage - > LevelFilesBrief ( level ) ;
if ( ! files_brief . num_files ) {
if ( files_brief . num_files = = 0 ) {
// empty level, skip exploration
// empty level, skip exploration
continue ;
continue ;
}
}
if ( ! level ) {
if ( level = = 0 ) {
// level 0 data is sorted order, handle the use case explicitly
// level 0 files are not in sorted order, we need to iterate through
size + = ApproximateSizeLevel0 ( v , files_brief , start , end , caller ) ;
// the list to compute the total bytes that require scanning,
// so handle the case explicitly (similarly to first_files case)
for ( size_t i = 0 ; i < files_brief . num_files ; i + + ) {
first_files . push_back ( & files_brief . files [ i ] ) ;
}
continue ;
continue ;
}
}
assert ( level > 0 ) ;
assert ( level > 0 ) ;
assert ( files_brief . num_files > 0 ) ;
assert ( files_brief . num_files > 0 ) ;
// identify the file position for starting key
// identify the file position for start key
const uint64_t idx_start = FindFileInRange (
const int idx_start =
v - > cfd_ - > internal_comparator ( ) , files_brief , start ,
FindFileInRange ( icmp , files_brief , start , 0 ,
/*start=*/ 0 , static_cast < uint32_t > ( files_brief . num_files - 1 ) ) ;
static_cast < uint32_t > ( files_brief . num_files - 1 ) ) ;
assert ( idx_start < files_brief . num_files ) ;
assert ( static_cast < size_t > ( idx_start ) < files_brief . num_files ) ;
// scan all files from the starting position until the ending position
// inferred from the sorted order
for ( uint64_t i = idx_start ; i < files_brief . num_files ; i + + ) {
uint64_t val ;
val = ApproximateSize ( v , files_brief . files [ i ] , end , caller ) ;
if ( ! val ) {
// the files after this will not have the range
break ;
}
size + = val ;
// identify the file position for end key
int idx_end = idx_start ;
if ( icmp . Compare ( files_brief . files [ idx_end ] . largest_key , end ) < 0 ) {
idx_end =
FindFileInRange ( icmp , files_brief , end , idx_start ,
static_cast < uint32_t > ( files_brief . num_files - 1 ) ) ;
}
assert ( idx_end > = idx_start & &
static_cast < size_t > ( idx_end ) < files_brief . num_files ) ;
if ( i = = idx_start ) {
// scan all files from the starting index to the ending index
// subtract the bytes needed to be scanned to get to the starting
// (inferred from the sorted order)
// key
val = ApproximateSize ( v , files_brief . files [ i ] , start , caller ) ;
// first scan all the intermediate full files (excluding first and last)
assert ( size > = val ) ;
for ( int i = idx_start + 1 ; i < idx_end ; + + i ) {
size - = val ;
uint64_t file_size = files_brief . files [ i ] . fd . GetFileSize ( ) ;
}
// The entire file falls into the range, so we can just take its size.
assert ( file_size = =
ApproximateSize ( v , files_brief . files [ i ] , end , caller ) ) ;
total_full_size + = file_size ;
}
// save the first and the last files (which may be the same file), so we
// can scan them later.
first_files . push_back ( & files_brief . files [ idx_start ] ) ;
if ( idx_start ! = idx_end ) {
// we need to estimate size for both files, only if they are different
last_files . push_back ( & files_brief . files [ idx_end ] ) ;
}
}
}
}
return size ;
// The sum of all file sizes that intersect the [start, end] keys range.
}
uint64_t total_intersecting_size = 0 ;
for ( const auto * file_ptr : first_files ) {
total_intersecting_size + = file_ptr - > fd . GetFileSize ( ) ;
}
for ( const auto * file_ptr : last_files ) {
total_intersecting_size + = file_ptr - > fd . GetFileSize ( ) ;
}
uint64_t VersionSet : : ApproximateSizeLevel0 ( Version * v ,
// Now scan all the first & last files at each level, and estimate their size.
const LevelFilesBrief & files_brief ,
// If the total_intersecting_size is less than X% of the total_full_size - we
const Slice & key_start ,
// want to approximate the result in order to avoid the costly binary search
const Slice & key_end ,
// inside ApproximateSize. We use half of file size as an approximation below.
TableReaderCaller caller ) {
// level 0 files are not in sorted order, we need to iterate through
const double margin = options . files_size_error_margin ;
// the list to compute the total bytes that require scanning
if ( margin > 0 & & total_intersecting_size <
uint64_t size = 0 ;
static_cast < uint64_t > ( total_full_size * margin ) ) {
for ( size_t i = 0 ; i < files_brief . num_files ; i + + ) {
total_full_size + = total_intersecting_size / 2 ;
const uint64_t start =
} else {
ApproximateSize ( v , files_brief . files [ i ] , key_start , caller ) ;
// Estimate for all the first files, at each level
const uint64_t end =
for ( const auto file_ptr : first_files ) {
ApproximateSize ( v , files_brief . files [ i ] , key_end , caller ) ;
total_full_size + = ApproximateSize ( v , * file_ptr , end , caller ) ;
assert ( end > = start ) ;
// subtract the bytes needed to be scanned to get to the starting key
size + = end - start ;
uint64_t val = ApproximateSize ( v , * file_ptr , start , caller ) ;
assert ( total_full_size > = val ) ;
total_full_size - = val ;
}
// Estimate for all the last files, at each level
for ( const auto file_ptr : last_files ) {
total_full_size + = ApproximateSize ( v , * file_ptr , end , caller ) ;
}
}
}
return size ;
return total_full_size ;
}
}
uint64_t VersionSet : : ApproximateSize ( Version * v , const FdWithKeyRange & f ,
uint64_t VersionSet : : ApproximateSize ( Version * v , const FdWithKeyRange & f ,
@ -4957,12 +5007,13 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
TableReaderCaller caller ) {
TableReaderCaller caller ) {
// pre-condition
// pre-condition
assert ( v ) ;
assert ( v ) ;
const auto & icmp = v - > cfd_ - > internal_comparator ( ) ;
uint64_t result = 0 ;
uint64_t result = 0 ;
if ( v - > cfd_ - > internal_ co mparator ( ) . Compare ( f . largest_key , key ) < = 0 ) {
if ( icmp . Compare ( f . largest_key , key ) < = 0 ) {
// Entire file is before "key", so just add the file size
// Entire file is before "key", so just add the file size
result = f . fd . GetFileSize ( ) ;
result = f . fd . GetFileSize ( ) ;
} else if ( v - > cfd_ - > internal_ co mparator ( ) . Compare ( f . smallest_key , key ) > 0 ) {
} else if ( icmp . Compare ( f . smallest_key , key ) > 0 ) {
// Entire file is after "key", so ignore
// Entire file is after "key", so ignore
result = 0 ;
result = 0 ;
} else {
} else {
@ -4971,7 +5022,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f,
TableCache * table_cache = v - > cfd_ - > table_cache ( ) ;
TableCache * table_cache = v - > cfd_ - > table_cache ( ) ;
if ( table_cache ! = nullptr ) {
if ( table_cache ! = nullptr ) {
result = table_cache - > ApproximateOffsetOf (
result = table_cache - > ApproximateOffsetOf (
key , f . file_metadata - > fd , caller , v - > cfd ( ) - > internal_ co mparator ( ) ,
key , f . file_metadata - > fd , caller , icmp ,
v - > GetMutableCFOptions ( ) . prefix_extractor . get ( ) ) ;
v - > GetMutableCFOptions ( ) . prefix_extractor . get ( ) ) ;
}
}
}
}