diff --git a/HISTORY.md b/HISTORY.md
index 7a183d654..5bdf168c3 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,6 +2,10 @@
 ## Unreleased
 ### Public API Change
 * Iterator::SeekForPrev is now a pure virtual method. This is to prevent user who implement the Iterator interface fail to implement SeekForPrev by mistake.
+
+### New Features
+* Improve the performance of iterators doing long range scans by using readahead.
+
 ### Bug Fixes
 * Fix `DisableFileDeletions()` followed by `GetSortedWalFiles()` to not return obsolete WAL files that `PurgeObsoleteFiles()` is going to delete.
 * Fix DB::Flush() keep waiting after flush finish under certain condition.
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index 0c0a6a1c6..808c5fd06 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -1594,6 +1594,9 @@ BlockBasedTable::BlockEntryIteratorState::BlockEntryIteratorState(
       is_index_(is_index),
       block_map_(block_map) {}
 
+const size_t BlockBasedTable::BlockEntryIteratorState::kMaxReadaheadSize =
+    256 * 1024;
+
 InternalIterator*
 BlockBasedTable::BlockEntryIteratorState::NewSecondaryIterator(
     const Slice& index_value) {
@@ -1618,6 +1621,28 @@ BlockBasedTable::BlockEntryIteratorState::NewSecondaryIterator(
           &rep->internal_comparator, nullptr, true, rep->ioptions.statistics);
     }
   }
+
+  // Automatically prefetch additional data when a range scan (iterator) does
+  // more than 2 sequential IOs. This is enabled only when
+  // ReadOptions.readahead_size is 0.
+  if (read_options_.readahead_size == 0) {
+    if (num_file_reads_ < 2) {
+      num_file_reads_++;
+    } else if (handle.offset() + static_cast<size_t>(handle.size()) +
+                   kBlockTrailerSize >
+               readahead_limit_) {
+      num_file_reads_++;
+      // Do not readahead more than kMaxReadaheadSize.
+      readahead_size_ =
+          std::min(BlockBasedTable::BlockEntryIteratorState::kMaxReadaheadSize,
+                   readahead_size_);
+      table_->rep_->file->Prefetch(handle.offset(), readahead_size_);
+      readahead_limit_ = handle.offset() + readahead_size_;
+      // Keep exponentially increasing readahead size until kMaxReadaheadSize.
+      readahead_size_ *= 2;
+    }
+  }
+
   return NewDataBlockIterator(rep, read_options_, handle,
                               /* input_iter */ nullptr, is_index_,
                               /* get_context */ nullptr, s);
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
index 886fec6c5..5574c5d50 100644
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -376,6 +376,14 @@ class BlockBasedTable::BlockEntryIteratorState : public TwoLevelIteratorState {
   bool is_index_;
   std::unordered_map<uint64_t, CachableEntry<Block>>* block_map_;
   port::RWMutex cleaner_mu;
+
+  static const size_t kInitReadaheadSize = 8 * 1024;
+  // Found that 256 KB readahead size provides the best performance, based on
+  // experiments.
+  static const size_t kMaxReadaheadSize;
+  size_t readahead_size_ = kInitReadaheadSize;
+  size_t readahead_limit_ = 0;
+  int num_file_reads_ = 0;
 };
 
 // CachableEntry represents the entries that *may* be fetched from block cache.
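
For readers skimming the patch, below is a minimal, standalone sketch of the heuristic the new code implements: do nothing for the first two block reads, then start prefetching and double the readahead window on each further miss, capped at 256 KB. This is not RocksDB code; the class name `ReadaheadPolicy`, the `OnBlockRead` method, and the offsets simulated in `main()` are invented for illustration, while the constants (8 KB initial window, 256 KB cap, 5-byte block trailer, 2-read threshold) and the growth/clamp logic mirror the diff above.

```cpp
// Illustrative sketch of the auto-readahead heuristic (not RocksDB code).
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>

constexpr size_t kInitReadaheadSize = 8 * 1024;   // 8 KB starting window
constexpr size_t kMaxReadaheadSize = 256 * 1024;  // 256 KB cap
constexpr size_t kBlockTrailerSize = 5;  // 1-byte compression type + checksum

class ReadaheadPolicy {
 public:
  // Called once per data-block read issued by a range-scan iterator.
  void OnBlockRead(uint64_t offset, size_t block_size) {
    if (num_file_reads_ < 2) {
      // Do not prefetch for the first two reads, so short scans and point
      // lookups do not pay the readahead cost.
      num_file_reads_++;
      return;
    }
    if (offset + block_size + kBlockTrailerSize <= readahead_limit_) {
      // Block already covered by a previous prefetch window; nothing to do.
      return;
    }
    num_file_reads_++;
    // Clamp to the cap before issuing the prefetch.
    readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_);
    // In the patch this is table_->rep_->file->Prefetch(offset, readahead_size_).
    std::printf("prefetch offset=%llu size=%zu\n",
                static_cast<unsigned long long>(offset), readahead_size_);
    readahead_limit_ = offset + readahead_size_;
    // Keep exponentially increasing the window until the cap is reached.
    readahead_size_ *= 2;
  }

 private:
  size_t readahead_size_ = kInitReadaheadSize;
  uint64_t readahead_limit_ = 0;
  int num_file_reads_ = 0;
};

int main() {
  ReadaheadPolicy policy;
  // Simulate a forward scan over 4 KB data blocks laid out back to back.
  for (uint64_t offset = 0; offset < 512 * 1024; offset += 4 * 1024) {
    policy.OnBlockRead(offset, 4 * 1024);
  }
  return 0;
}
```

Two design points carried over from the patch: the heuristic only runs when `ReadOptions::readahead_size` is 0, so an explicit user-supplied readahead is never overridden, and the window is capped at 256 KB because, per the in-code comment, that size gave the best performance in experiments.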