diff --git a/HISTORY.md b/HISTORY.md index ab56b975b..fb5fff4d0 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -20,6 +20,7 @@ * Enable async prefetching if ReadOptions.readahead_size is set along with ReadOptions.async_io in FilePrefetchBuffer. * Add event listener support on remote compaction compactor side. * Added a dedicated integer DB property `rocksdb.live-blob-file-garbage-size` that exposes the total amount of garbage in the blob files in the current version. +* RocksDB does internal auto prefetching if it notices sequential reads. It starts with readahead size `initial_auto_readahead_size` which now can be configured through BlockBasedTableOptions. ### Behavior changes * Disallow usage of commit-time-write-batch for write-prepared/write-unprepared transactions if TransactionOptions::use_only_the_last_commit_time_batch_for_recovery is false to prevent two (or more) uncommitted versions of the same key in the database. Otherwise, bottommost compaction may violate the internal key uniqueness invariant of SSTs if the sequence numbers of both internal keys are zeroed out (#9794). diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index 8f2f09c73..78ed3153c 100644 --- a/file/file_prefetch_buffer.h +++ b/file/file_prefetch_buffer.h @@ -36,7 +36,6 @@ struct BufferInfo { class FilePrefetchBuffer { public: static const int kMinNumFileReadsToStartAutoReadahead = 2; - static const size_t kInitAutoReadaheadSize = 8 * 1024; // Constructor. 
// @@ -68,6 +67,7 @@ class FilePrefetchBuffer { bool async_io = false, FileSystem* fs = nullptr) : curr_(0), readahead_size_(readahead_size), + initial_auto_readahead_size_(readahead_size), max_readahead_size_(max_readahead_size), min_offset_read_(port::kMaxSizet), enable_(enable), @@ -184,9 +184,8 @@ class FilePrefetchBuffer { bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) && IsBlockSequential(offset) && (num_file_reads_ + 1 > kMinNumFileReadsToStartAutoReadahead)) { - size_t initial_auto_readahead_size = kInitAutoReadaheadSize; readahead_size_ = - std::max(initial_auto_readahead_size, + std::max(initial_auto_readahead_size_, (readahead_size_ >= value ? readahead_size_ - value : 0)); } } @@ -238,7 +237,7 @@ class FilePrefetchBuffer { // Called in case of implicit auto prefetching. void ResetValues() { num_file_reads_ = 1; - readahead_size_ = kInitAutoReadaheadSize; + readahead_size_ = initial_auto_readahead_size_; } std::vector bufs_; @@ -246,6 +245,7 @@ class FilePrefetchBuffer { // consumed currently. uint32_t curr_; size_t readahead_size_; + size_t initial_auto_readahead_size_; // FilePrefetchBuffer object won't be created from Iterator flow if // max_readahead_size_ = 0. size_t max_readahead_size_; diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index 579d15d6b..20e569568 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -275,8 +275,8 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { break; case 1: // max_auto_readahead_size is set less than - // BlockBasedTable::kInitAutoReadaheadSize. So readahead_size remains - // equal to max_auto_readahead_size. + // initial_auto_readahead_size. So readahead_size remains equal to + // max_auto_readahead_size. 
ASSERT_OK(db_->SetOptions({{"block_based_table_factory", "{max_auto_readahead_size=4096;}"}})); break; @@ -321,6 +321,145 @@ TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { SyncPoint::GetInstance()->ClearAllCallBacks(); Close(); } + +TEST_P(PrefetchTest, ConfigureInternalAutoReadaheadSize) { + // First param is if the mockFS support_prefetch or not + bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + + // Second param is if directIO is enabled or not + bool use_direct_io = std::get<1>(GetParam()); + + std::shared_ptr<MockFS> fs = + std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch); + std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs)); + + Options options = CurrentOptions(); + options.write_buffer_size = 1024; + options.create_if_missing = true; + options.compression = kNoCompression; + options.env = env.get(); + options.disable_auto_compactions = true; + if (use_direct_io) { + options.use_direct_reads = true; + options.use_direct_io_for_flush_and_compaction = true; + } + BlockBasedTableOptions table_options; + table_options.no_block_cache = true; + table_options.cache_index_and_filter_blocks = false; + table_options.metadata_block_size = 1024; + table_options.index_type = + BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch; + table_options.initial_auto_readahead_size = 0; + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + + int buff_prefetch_count = 0; + // DB open will create table readers unless we reduce the table cache + // capacity. SanitizeOptions will set max_open_files to minimum of 20. + // Table cache is allocated with max_open_files - 10 as capacity. So + // override max_open_files to 11 so table cache capacity will become 1. + // This will prevent file open during DB open and force the file to be + // opened during Iteration. 
+ SyncPoint::GetInstance()->SetCallBack( + "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) { + int* max_open_files = (int*)arg; + *max_open_files = 11; + }); + + SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start", + [&](void*) { buff_prefetch_count++; }); + + SyncPoint::GetInstance()->EnableProcessing(); + + SyncPoint::GetInstance()->EnableProcessing(); + + Status s = TryReopen(options); + + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + return; + } else { + ASSERT_OK(s); + } + + Random rnd(309); + int key_count = 0; + const int num_keys_per_level = 100; + // Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299]. + for (int level = 2; level >= 0; level--) { + key_count = level * num_keys_per_level; + for (int i = 0; i < num_keys_per_level; ++i) { + ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500))); + } + ASSERT_OK(Flush()); + MoveFilesToLevel(level); + } + Close(); + + TryReopen(options); + { + auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions())); + fs->ClearPrefetchCount(); + buff_prefetch_count = 0; + std::vector<int> buff_prefetch_level_count = {0, 0, 0}; + + for (int level = 2; level >= 0; level--) { + key_count = level * num_keys_per_level; + switch (level) { + case 0: + // initial_auto_readahead_size is set 0 so data and index blocks are + // not prefetched. + ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{initial_auto_readahead_size=0;}"}})); + break; + case 1: + // initial_auto_readahead_size and max_auto_readahead_size are set same + // so readahead_size remains same. 
+ ASSERT_OK(db_->SetOptions({{"block_based_table_factory", + "{initial_auto_readahead_size=4096;max_" + "auto_readahead_size=4096;}"}})); + break; + case 2: + ASSERT_OK( + db_->SetOptions({{"block_based_table_factory", + "{initial_auto_readahead_size=65536;}"}})); + break; + default: + assert(false); + } + + for (int i = 0; i < num_keys_per_level; ++i) { + iter->Seek(Key(key_count++)); + iter->Next(); + } + + buff_prefetch_level_count[level] = buff_prefetch_count; + if (support_prefetch && !use_direct_io) { + if (level == 0) { + ASSERT_FALSE(fs->IsPrefetchCalled()); + } else { + ASSERT_TRUE(fs->IsPrefetchCalled()); + } + fs->ClearPrefetchCount(); + } else { + ASSERT_FALSE(fs->IsPrefetchCalled()); + if (level == 0) { + ASSERT_EQ(buff_prefetch_count, 0); + } else { + ASSERT_GT(buff_prefetch_count, 0); + } + buff_prefetch_count = 0; + } + } + if (!support_prefetch) { + ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]); + } + } + SyncPoint::GetInstance()->DisableProcessing(); + SyncPoint::GetInstance()->ClearAllCallBacks(); + Close(); +} #endif // !ROCKSDB_LITE TEST_P(PrefetchTest, PrefetchWhenReseek) { diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index 53eb47d78..f4898d087 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -501,14 +501,15 @@ struct BlockBasedTableOptions { // RocksDB does auto-readahead for iterators on noticing more than two reads // for a table file if user doesn't provide readahead_size. The readahead - // starts at 8KB and doubles on every additional read upto - // max_auto_readahead_size and max_auto_readahead_size can be configured. + // starts at BlockBasedTableOptions.initial_auto_readahead_size (default: 8KB) + // and doubles on every additional read upto max_auto_readahead_size and + // max_auto_readahead_size can be configured. // - // Special Value: 0 - If max_auto_readahead_size is set 0 then no implicit - // auto prefetching will be done. 
If max_auto_readahead_size provided is less - than 8KB (which is initial readahead size used by rocksdb in case of - auto-readahead), readahead size will remain same as - max_auto_readahead_size. + // Special Value: 0 - If max_auto_readahead_size is set 0 then it will disable + // the implicit auto prefetching. + // If max_auto_readahead_size provided is less + // than initial_auto_readahead_size, then RocksDB will sanitize the + // initial_auto_readahead_size and set it to max_auto_readahead_size. + // + // Value should be provided along with KB i.e. 256 * 1024 as it will prefetch + // the blocks. @@ -547,6 +548,35 @@ struct BlockBasedTableOptions { PrepopulateBlockCache prepopulate_block_cache = PrepopulateBlockCache::kDisable; + + // RocksDB does auto-readahead for iterators on noticing more than two reads + // for a table file if user doesn't provide readahead_size. The readahead size + // starts at initial_auto_readahead_size and doubles on every additional read + // up to BlockBasedTableOptions.max_auto_readahead_size. + // max_auto_readahead_size can also be configured. + // + // Scenarios: + // - If initial_auto_readahead_size is set 0 then it will disable the + // implicit auto prefetching irrespective of max_auto_readahead_size. + // - If max_auto_readahead_size is set 0, it will disable the internal + // prefetching irrespective of initial_auto_readahead_size. + // - If initial_auto_readahead_size > max_auto_readahead_size, then RocksDB + // will sanitize the value of initial_auto_readahead_size to + // max_auto_readahead_size and readahead_size will be + // max_auto_readahead_size. + // + // Value should be provided along with KB i.e. 8 * 1024 as it will prefetch + // the blocks. + // + // This parameter can be changed dynamically by + // DB::SetOptions({{"block_based_table_factory", + // "{initial_auto_readahead_size=0;}"}})); + // + // Changing the value dynamically will only affect files opened after the + // change. 
+ // + // Default: 8 KB (8 * 1024). + size_t initial_auto_readahead_size = 8 * 1024; }; // Table Properties that are specific to block-based table properties. diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 8786e3607..d399947e7 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -195,7 +195,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) { "enable_index_compression=false;" "block_align=true;" "max_auto_readahead_size=0;" - "prepopulate_block_cache=kDisable", + "prepopulate_block_cache=kDisable;" + "initial_auto_readahead_size=0", new_bbto)); ASSERT_EQ(unset_bytes_base, diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index dad82cac1..2a2258a40 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -413,6 +413,10 @@ static std::unordered_map offsetof(struct BlockBasedTableOptions, prepopulate_block_cache), &block_base_table_prepopulate_block_cache_string_map, OptionTypeFlags::kMutable)}, + {"initial_auto_readahead_size", + {offsetof(struct BlockBasedTableOptions, initial_auto_readahead_size), + OptionType::kSizeT, OptionVerificationType::kNormal, + OptionTypeFlags::kMutable}}, #endif // ROCKSDB_LITE }; @@ -815,6 +819,10 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const { snprintf(buffer, kBufferSize, " prepopulate_block_cache: %d\n", static_cast(table_options_.prepopulate_block_cache)); ret.append(buffer); + snprintf(buffer, kBufferSize, + " initial_auto_readahead_size: %" ROCKSDB_PRIszt "\n", + table_options_.initial_auto_readahead_size); + ret.append(buffer); return ret; } diff --git a/table/block_based/block_based_table_iterator.h b/table/block_based/block_based_table_iterator.h index 6228f73ef..fe659d9d0 100644 --- a/table/block_based/block_based_table_iterator.h +++ b/table/block_based/block_based_table_iterator.h @@ -35,7 
+35,9 @@ class BlockBasedTableIterator : public InternalIteratorBase { pinned_iters_mgr_(nullptr), prefix_extractor_(prefix_extractor), lookup_context_(caller), - block_prefetcher_(compaction_readahead_size), + block_prefetcher_( + compaction_readahead_size, + table_->get_rep()->table_options.initial_auto_readahead_size), allow_unprepared_value_(allow_unprepared_value), block_iter_points_to_real_block_(false), check_filter_(check_filter), diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index e702de427..0671fc09f 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -71,7 +71,6 @@ class BlockBasedTable : public TableReader { static const std::string kPartitionedFilterBlockPrefix; // All the below fields control iterator readahead - static const size_t kInitAutoReadaheadSize = 8 * 1024; static const int kMinNumFileReadsToStartAutoReadahead = 2; // 1-byte compression type + 32-bit checksum diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc index 924a35194..c4a637550 100644 --- a/table/block_based/block_prefetcher.cc +++ b/table/block_based/block_prefetcher.cc @@ -34,7 +34,7 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep, // If max_auto_readahead_size is set to be 0 by user, no data will be // prefetched. 
size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size; - if (max_auto_readahead_size == 0) { + if (max_auto_readahead_size == 0 || initial_auto_readahead_size_ == 0) { return; } @@ -50,7 +50,7 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep, if (!IsBlockSequential(offset)) { UpdateReadPattern(offset, len); - ResetValues(); + ResetValues(rep->table_options.initial_auto_readahead_size); return; } UpdateReadPattern(offset, len); diff --git a/table/block_based/block_prefetcher.h b/table/block_based/block_prefetcher.h index 85c52be21..e7c11532a 100644 --- a/table/block_based/block_prefetcher.h +++ b/table/block_based/block_prefetcher.h @@ -12,8 +12,12 @@ namespace ROCKSDB_NAMESPACE { class BlockPrefetcher { public: - explicit BlockPrefetcher(size_t compaction_readahead_size) - : compaction_readahead_size_(compaction_readahead_size) {} + explicit BlockPrefetcher(size_t compaction_readahead_size, + size_t initial_auto_readahead_size) + : compaction_readahead_size_(compaction_readahead_size), + readahead_size_(initial_auto_readahead_size), + initial_auto_readahead_size_(initial_auto_readahead_size) {} + void PrefetchIfNeeded(const BlockBasedTable::Rep* rep, const BlockHandle& handle, size_t readahead_size, bool is_for_compaction, bool async_io); @@ -28,12 +32,13 @@ class BlockPrefetcher { return (prev_len_ == 0 || (prev_offset_ + prev_len_ == offset)); } - void ResetValues() { + void ResetValues(size_t initial_auto_readahead_size) { num_file_reads_ = 1; // Since initial_auto_readahead_size_ can be different from - // kInitAutoReadaheadSize in case of adaptive_readahead, so fallback the - // readahead_size_ to kInitAutoReadaheadSize in case of reset. - initial_auto_readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; + // the value passed to BlockBasedTableOptions.initial_auto_readahead_size in + // case of adaptive_readahead, so fallback the readahead_size_ to that value + // in case of reset. 
+ initial_auto_readahead_size_ = initial_auto_readahead_size; readahead_size_ = initial_auto_readahead_size_; readahead_limit_ = 0; return; @@ -52,12 +57,11 @@ class BlockPrefetcher { size_t compaction_readahead_size_; // readahead_size_ is used if underlying FS supports prefetching. - size_t readahead_size_ = BlockBasedTable::kInitAutoReadaheadSize; + size_t readahead_size_; size_t readahead_limit_ = 0; // initial_auto_readahead_size_ is used if RocksDB uses internal prefetch // buffer. - uint64_t initial_auto_readahead_size_ = - BlockBasedTable::kInitAutoReadaheadSize; + uint64_t initial_auto_readahead_size_; int64_t num_file_reads_ = 0; uint64_t prev_offset_ = 0; size_t prev_len_ = 0; diff --git a/table/block_based/partitioned_index_iterator.h b/table/block_based/partitioned_index_iterator.h index a2cbefe80..6532edc4b 100644 --- a/table/block_based/partitioned_index_iterator.h +++ b/table/block_based/partitioned_index_iterator.h @@ -36,7 +36,9 @@ class PartitionedIndexIterator : public InternalIteratorBase { user_comparator_(icomp.user_comparator()), block_iter_points_to_real_block_(false), lookup_context_(caller), - block_prefetcher_(compaction_readahead_size) { + block_prefetcher_( + compaction_readahead_size, + table_->get_rep()->table_options.initial_auto_readahead_size) { } ~PartitionedIndexIterator() override {}