diff --git a/HISTORY.md b/HISTORY.md
index 272435d8e..66dd73965 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -7,6 +7,7 @@
 * Block-based table index now contains exact highest key in the file, rather than an upper bound. This may improve Get() and iterator Seek() performance in some situations, especially when direct IO is enabled and block cache is disabled. A setting BlockBasedTableOptions::index_shortening is introduced to control this behavior. Set it to kShortenSeparatorsAndSuccessor to get the old behavior.
 * When reading from option file/string/map, customized envs can be filled according to object registry.
 * Add an option `snap_refresh_nanos` (default to 0.5s) to periodically refresh the snapshot list in compaction jobs. Assign to 0 to disable the feature.
+* Improve range scan performance when using explicit user readahead by not creating new table readers for every iterator.
 ### Public API Change
 * Change the behavior of OptimizeForPointLookup(): move away from hash-based block-based-table index, and use whole key memtable filtering.
diff --git a/db/db_iterator_test.cc b/db/db_iterator_test.cc
index d41d417eb..ec5fc8006 100644
--- a/db/db_iterator_test.cc
+++ b/db/db_iterator_test.cc
@@ -1943,8 +1943,8 @@ TEST_P(DBIteratorTest, ReadAhead) {
   delete iter;
   int64_t num_file_closes_readahead =
       TestGetTickerCount(options, NO_FILE_CLOSES);
-  ASSERT_EQ(num_file_opens + 3, num_file_opens_readahead);
-  ASSERT_EQ(num_file_closes + 3, num_file_closes_readahead);
+  ASSERT_EQ(num_file_opens, num_file_opens_readahead);
+  ASSERT_EQ(num_file_closes, num_file_closes_readahead);
   ASSERT_GT(bytes_read_readahead, bytes_read);
   ASSERT_GT(bytes_read_readahead, read_options.readahead_size * 3);
 
diff --git a/db/db_test_util.h b/db/db_test_util.h
index 36f3813c9..50109e0a4 100644
--- a/db/db_test_util.h
+++ b/db/db_test_util.h
@@ -438,6 +438,12 @@ class SpecialEnv : public EnvWrapper {
         return s;
       }
 
+      virtual Status Prefetch(uint64_t offset, size_t n) override {
+        Status s = target_->Prefetch(offset, n);
+        *bytes_read_ += n;
+        return s;
+      }
+
      private:
       std::unique_ptr<RandomAccessFile> target_;
       anon::AtomicCounter* counter_;
diff --git a/db/table_cache.cc b/db/table_cache.cc
index 2eb742e24..06255d6a3 100644
--- a/db/table_cache.cc
+++ b/db/table_cache.cc
@@ -213,9 +213,6 @@ InternalIterator* TableCache::NewIterator(
       readahead = env_options.compaction_readahead_size;
       create_new_table_reader = true;
     }
-  } else {
-    readahead = options.readahead_size;
-    create_new_table_reader = readahead > 0;
-  }
 
   auto& fd = file_meta.fd;
diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index 4cf4be8c1..ab856bee8 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1131,9 +1131,14 @@ struct ReadOptions {
   // Default: nullptr
   const Slice* iterate_upper_bound;
 
-  // If non-zero, NewIterator will create a new table reader which
-  // performs reads of the given size. Using a large size (> 2MB) can
-  // improve the performance of forward iteration on spinning disks.
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file. The readahead starts at 8KB and doubles on every
+  // additional read up to 256KB.
+  // This option can help if most of the range scans are large, and if it is
+  // determined that a larger readahead than that enabled by auto-readahead is
+  // needed.
+  // Using a large readahead size (> 2MB) can typically improve the performance
+  // of forward iteration on spinning disks.
   // Default: 0
   size_t readahead_size;
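The comment above covers both readahead modes from the API side. As a quick illustration (not part of the patch), here is a minimal sketch of a range scan that opts into explicit user readahead; the database path and the 2 MB readahead size are illustrative only. With this change, such an iterator reuses the table readers already present in the table cache and prefetches through a FilePrefetchBuffer instead of opening a dedicated table reader per file.

```cpp
#include <cassert>

#include "rocksdb/db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  rocksdb::DB* db = nullptr;
  // Hypothetical database path, used only for this sketch.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/readahead_example", &db);
  assert(s.ok());

  rocksdb::ReadOptions read_options;
  // Explicit user readahead: 2 MB per prefetch, chosen only as an example.
  read_options.readahead_size = 2 * 1024 * 1024;

  rocksdb::Iterator* it = db->NewIterator(read_options);
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // Consume it->key() and it->value() here.
  }
  assert(it->status().ok());

  delete it;
  delete db;
  return 0;
}
```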
diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc
index 4aefbe7c5..ad088337a 100644
--- a/table/block_based_table_reader.cc
+++ b/table/block_based_table_reader.cc
@@ -2167,10 +2167,6 @@ BlockBasedTable::PartitionedIndexIteratorState::PartitionedIndexIteratorState(
       index_key_includes_seq_(index_key_includes_seq),
       index_key_is_full_(index_key_is_full) {}
 
-template <class TBlockIter, typename TValue>
-const size_t BlockBasedTableIterator<TBlockIter, TValue>::kMaxReadaheadSize =
-    256 * 1024;
-
 InternalIteratorBase<BlockHandle>*
 BlockBasedTable::PartitionedIndexIteratorState::NewSecondaryIterator(
     const BlockHandle& handle) {
@@ -2453,6 +2449,13 @@ void BlockBasedTableIterator<TBlockIter, TValue>::Prev() {
   FindKeyBackward();
 }
 
+// Found that 256 KB readahead size provides the best performance, based on
+// experiments, for auto readahead. Experiment data is in PR #3282.
+template <class TBlockIter, typename TValue>
+const size_t
+    BlockBasedTableIterator<TBlockIter, TValue>::kMaxAutoReadaheadSize =
+        256 * 1024;
+
 template <class TBlockIter, typename TValue>
 void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
   BlockHandle data_block_handle = index_iter_->value();
@@ -2465,32 +2468,47 @@ void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
   }
   auto* rep = table_->get_rep();
 
-  // Automatically prefetch additional data when a range scan (iterator) does
-  // more than 2 sequential IOs. This is enabled only for user reads and when
-  // ReadOptions.readahead_size is 0.
-  if (!for_compaction_ && read_options_.readahead_size == 0) {
-    num_file_reads_++;
-    if (num_file_reads_ > 2) {
-      if (!rep->file->use_direct_io() &&
-          (data_block_handle.offset() +
-               static_cast<size_t>(data_block_handle.size()) +
-               kBlockTrailerSize >
-           readahead_limit_)) {
-        // Buffered I/O
-        // Discarding the return status of Prefetch calls intentionally, as we
-        // can fallback to reading from disk if Prefetch fails.
-        rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
-        readahead_limit_ =
-            static_cast<size_t>(data_block_handle.offset() + readahead_size_);
-        // Keep exponentially increasing readahead size until
-        // kMaxReadaheadSize.
-        readahead_size_ = std::min(kMaxReadaheadSize, readahead_size_ * 2);
-      } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
-        // Direct I/O
-        // Let FilePrefetchBuffer take care of the readahead.
-        prefetch_buffer_.reset(new FilePrefetchBuffer(
-            rep->file.get(), kInitReadaheadSize, kMaxReadaheadSize));
+  // Prefetch additional data for range scans (iterators). Enabled only for
+  // user reads.
+  // Implicit auto readahead:
+  //   Enabled after 2 sequential IOs when ReadOptions.readahead_size == 0.
+  // Explicit user requested readahead:
+  //   Enabled from the very first IO when ReadOptions.readahead_size is set.
+  if (!for_compaction_) {
+    if (read_options_.readahead_size == 0) {
+      // Implicit auto readahead
+      num_file_reads_++;
+      if (num_file_reads_ > kMinNumFileReadsToStartAutoReadahead) {
+        if (!rep->file->use_direct_io() &&
+            (data_block_handle.offset() +
+                 static_cast<size_t>(data_block_handle.size()) +
+                 kBlockTrailerSize >
+             readahead_limit_)) {
+          // Buffered I/O
+          // Discarding the return status of Prefetch calls intentionally, as
+          // we can fall back to reading from disk if Prefetch fails.
+          rep->file->Prefetch(data_block_handle.offset(), readahead_size_);
+          readahead_limit_ = static_cast<size_t>(data_block_handle.offset() +
+                                                 readahead_size_);
+          // Keep exponentially increasing readahead size until
+          // kMaxAutoReadaheadSize.
+          readahead_size_ =
+              std::min(kMaxAutoReadaheadSize, readahead_size_ * 2);
+        } else if (rep->file->use_direct_io() && !prefetch_buffer_) {
+          // Direct I/O
+          // Let FilePrefetchBuffer take care of the readahead.
+          prefetch_buffer_.reset(
+              new FilePrefetchBuffer(rep->file.get(), kInitAutoReadaheadSize,
                                     kMaxAutoReadaheadSize));
+        }
       }
+    } else if (!prefetch_buffer_) {
+      // Explicit user requested readahead
+      // The actual condition is:
+      // if (read_options_.readahead_size != 0 && !prefetch_buffer_)
+      prefetch_buffer_.reset(new FilePrefetchBuffer(
+          rep->file.get(), read_options_.readahead_size,
+          read_options_.readahead_size));
     }
   }
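To make the implicit auto-readahead sizing above easier to follow, here is a standalone sketch (not RocksDB code) of the schedule used on the buffered I/O path: nothing is prefetched for the first two reads of a file, then the prefetch size doubles from kInitAutoReadaheadSize (8 KB) up to the kMaxAutoReadaheadSize (256 KB) cap. The readahead_limit_ bookkeeping that skips offsets already covered by an earlier prefetch is omitted here.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  // Constants mirror the ones declared in block_based_table_reader.h below.
  const size_t kInitAutoReadaheadSize = 8 * 1024;
  const size_t kMaxAutoReadaheadSize = 256 * 1024;
  const int kMinNumFileReadsToStartAutoReadahead = 2;

  size_t readahead_size = kInitAutoReadaheadSize;
  for (int num_file_reads = 1; num_file_reads <= 10; ++num_file_reads) {
    if (num_file_reads <= kMinNumFileReadsToStartAutoReadahead) {
      std::printf("read %2d: no readahead yet\n", num_file_reads);
      continue;
    }
    // Issue a prefetch of the current size, then grow it for the next read.
    std::printf("read %2d: prefetch %3zu KB\n", num_file_reads,
                readahead_size / 1024);
    readahead_size = std::min(kMaxAutoReadaheadSize, readahead_size * 2);
  }
  return 0;
}
```

The printed sizes are 8, 16, 32, 64 and 128 KB for reads three through seven, and 256 KB for every read after that.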
diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h
index ea8ba62c5..1fcc8cbfa 100644
--- a/table/block_based_table_reader.h
+++ b/table/block_based_table_reader.h
@@ -717,13 +717,15 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
   bool for_compaction_;
   BlockHandle prev_index_value_;
 
-  static const size_t kInitReadaheadSize = 8 * 1024;
+  // All the below fields control iterator readahead
+  static const size_t kInitAutoReadaheadSize = 8 * 1024;
   // Found that 256 KB readahead size provides the best performance, based on
-  // experiments.
-  static const size_t kMaxReadaheadSize;
-  size_t readahead_size_ = kInitReadaheadSize;
+  // experiments, for auto readahead. Experiment data is in PR #3282.
+  static const size_t kMaxAutoReadaheadSize;
+  static const int kMinNumFileReadsToStartAutoReadahead = 2;
+  size_t readahead_size_ = kInitAutoReadaheadSize;
   size_t readahead_limit_ = 0;
-  int num_file_reads_ = 0;
+  int64_t num_file_reads_ = 0;
   std::unique_ptr<FilePrefetchBuffer> prefetch_buffer_;
 };
 
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc
index 24643ff76..f0f1d879b 100644
--- a/tools/db_bench_tool.cc
+++ b/tools/db_bench_tool.cc
@@ -1172,6 +1172,7 @@ DEFINE_int32(skip_list_lookahead, 0, "Used with skip_list memtablerep; try "
              "position");
 DEFINE_bool(report_file_operations, false, "if report number of file "
             "operations");
+DEFINE_int32(readahead_size, 0, "Iterator readahead size");
 
 static const bool FLAGS_soft_rate_limit_dummy __attribute__((__unused__)) =
     RegisterFlagValidator(&FLAGS_soft_rate_limit, &ValidateRateLimit);
@@ -4987,6 +4988,7 @@ void VerifyDBFromDB(std::string& truth_db_name) {
     options.total_order_seek = FLAGS_total_order_seek;
     options.prefix_same_as_start = FLAGS_prefix_same_as_start;
     options.tailing = FLAGS_use_tailing_iterator;
+    options.readahead_size = FLAGS_readahead_size;
 
     Iterator* single_iter = nullptr;
     std::vector<Iterator*> multi_iters;
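Finally, a self-contained sketch (not part of the patch) of how the behavior asserted in DBIteratorTest.ReadAhead can be observed from application code: after this change, a scan with an explicit readahead_size should not open and close a separate table reader per file, which is visible through the NO_FILE_OPENS statistics ticker. The database path and readahead size below are illustrative, and the scan is assumed to run against an already-populated database.

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>

#include "rocksdb/db.h"
#include "rocksdb/statistics.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  options.statistics = rocksdb::CreateDBStatistics();

  rocksdb::DB* db = nullptr;
  // Hypothetical database path, used only for this sketch.
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/readahead_stats_example", &db);
  assert(s.ok());

  const uint64_t opens_before =
      options.statistics->getTickerCount(rocksdb::NO_FILE_OPENS);

  rocksdb::ReadOptions read_options;
  read_options.readahead_size = 2 * 1024 * 1024;  // explicit user readahead
  rocksdb::Iterator* it = db->NewIterator(read_options);
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // Drain the range scan; keys and values are not inspected in this sketch.
  }
  assert(it->status().ok());
  delete it;

  const uint64_t opens_after =
      options.statistics->getTickerCount(rocksdb::NO_FILE_OPENS);
  // With the table readers reused from the table cache, the scan itself
  // should not add file opens beyond those done while opening the DB.
  std::printf("file opens during scan: %llu\n",
              static_cast<unsigned long long>(opens_after - opens_before));

  delete db;
  return 0;
}
```

The same effect can be exercised from the command line through the db_bench --readahead_size flag added above, paired with an iterator-based benchmark such as seekrandom.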