diff --git a/HISTORY.md b/HISTORY.md
index a4557c330..5e535f7c4 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,7 @@
 # Rocksdb Change Log
 ## Unreleased
+### Public API change
+* Add a new option BlockBasedTableOptions::max_auto_readahead_size. RocksDB does auto-readahead for iterators when it notices more than two reads for a table file and the user hasn't provided readahead_size. The readahead starts at 8KB and doubles on every additional read up to max_auto_readahead_size, which can now also be configured dynamically. Based on experiments, a 256 KB readahead size provides the best performance for auto-readahead; experiment data is in PR #3282. If the value is set to 0, RocksDB does no automatic prefetching. Changing the value only affects files opened after the change.
 ## 6.18.0 (02/19/2021)
 ### Behavior Changes
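For context, here is a minimal sketch (not part of this PR) of how the new option could be used, both at table-factory construction time and dynamically on a live DB. The DB path and the 64 KB cap are arbitrary illustration values:

```cpp
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/table.h"

int main() {
  ROCKSDB_NAMESPACE::Options options;
  options.create_if_missing = true;

  // Cap auto-readahead at 64 KB for files opened from now on.
  ROCKSDB_NAMESPACE::BlockBasedTableOptions table_options;
  table_options.max_auto_readahead_size = 64 * 1024;
  options.table_factory.reset(
      ROCKSDB_NAMESPACE::NewBlockBasedTableFactory(table_options));

  ROCKSDB_NAMESPACE::DB* db = nullptr;
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::DB::Open(options, "/tmp/readahead_demo", &db);
  assert(s.ok());

  // Later, disable auto-readahead dynamically. Only files opened after
  // this call observe the new value.
  s = db->SetOptions(
      {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}});
  assert(s.ok());

  delete db;
  return 0;
}
```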
diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc
index 3dabb2094..954fcdeaa 100644
--- a/file/prefetch_test.cc
+++ b/file/prefetch_test.cc
@@ -175,9 +175,154 @@ TEST_P(PrefetchTest, Basic) {
   Close();
 }
 
+#ifndef ROCKSDB_LITE
+TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
+  // First param is if the mockFS supports prefetch or not
+  bool support_prefetch =
+      std::get<0>(GetParam()) &&
+      test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
+
+  // Second param is if directIO is enabled or not
+  bool use_direct_io = std::get<1>(GetParam());
+
+  std::shared_ptr<MockFS> fs =
+      std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
+  std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
+
+  Options options = CurrentOptions();
+  options.write_buffer_size = 1024;
+  options.create_if_missing = true;
+  options.compression = kNoCompression;
+  options.env = env.get();
+  options.disable_auto_compactions = true;
+  if (use_direct_io) {
+    options.use_direct_reads = true;
+    options.use_direct_io_for_flush_and_compaction = true;
+  }
+  BlockBasedTableOptions table_options;
+  table_options.no_block_cache = true;
+  table_options.cache_index_and_filter_blocks = false;
+  table_options.metadata_block_size = 1024;
+  table_options.index_type =
+      BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
+  table_options.max_auto_readahead_size = 0;
+  options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+
+  int buff_prefetch_count = 0;
+  SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
+                                        [&](void*) { buff_prefetch_count++; });
+
+  // DB open will create table readers unless we reduce the table cache
+  // capacity. SanitizeOptions will set max_open_files to minimum of 20.
+  // Table cache is allocated with max_open_files - 10 as capacity. So
+  // override max_open_files to 11 so table cache capacity will become 1.
+  // This will prevent file open during DB open and force the file to be
+  // opened during iteration.
+  SyncPoint::GetInstance()->SetCallBack(
+      "SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
+        int* max_open_files = static_cast<int*>(arg);
+        *max_open_files = 11;
+      });
+
+  SyncPoint::GetInstance()->EnableProcessing();
+
+  Status s = TryReopen(options);
+
+  if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
+    // If direct IO is not supported, skip the test
+    return;
+  } else {
+    ASSERT_OK(s);
+  }
+
+  Random rnd(309);
+  int key_count = 0;
+  const int num_keys_per_level = 100;
+  // Level 0: Keys in range [0, 99], Level 1: [100, 199], Level 2: [200, 299].
+  for (int level = 2; level >= 0; level--) {
+    key_count = level * num_keys_per_level;
+    for (int i = 0; i < num_keys_per_level; ++i) {
+      ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
+    }
+    ASSERT_OK(Flush());
+    MoveFilesToLevel(level);
+  }
+  Close();
+  std::vector<int> buff_prefetch_level_count = {0, 0, 0};
+  ASSERT_OK(TryReopen(options));
+  {
+    auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
+    fs->ClearPrefetchCount();
+    buff_prefetch_count = 0;
+
+    for (int level = 2; level >= 0; level--) {
+      key_count = level * num_keys_per_level;
+      switch (level) {
+        case 0:
+          // max_auto_readahead_size is set to 0, so data and index blocks are
+          // not prefetched.
+          ASSERT_OK(db_->SetOptions(
+              {{"block_based_table_factory", "{max_auto_readahead_size=0;}"}}));
+          break;
+        case 1:
+          // max_auto_readahead_size is set less than
+          // BlockBasedTable::kInitAutoReadaheadSize. So readahead_size remains
+          // equal to max_auto_readahead_size.
+          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                                      "{max_auto_readahead_size=4096;}"}}));
+          break;
+        case 2:
+          ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
+                                      "{max_auto_readahead_size=65536;}"}}));
+          break;
+        default:
+          assert(false);
+      }
+
+      for (int i = 0; i < num_keys_per_level; ++i) {
+        iter->Seek(Key(key_count++));
+        iter->Next();
+      }
+
+      buff_prefetch_level_count[level] = buff_prefetch_count;
+      if (support_prefetch && !use_direct_io) {
+        if (level == 0) {
+          ASSERT_FALSE(fs->IsPrefetchCalled());
+        } else {
+          ASSERT_TRUE(fs->IsPrefetchCalled());
+        }
+        fs->ClearPrefetchCount();
+      } else {
+        ASSERT_FALSE(fs->IsPrefetchCalled());
+        if (level == 0) {
+          ASSERT_EQ(buff_prefetch_count, 0);
+        } else {
+          ASSERT_GT(buff_prefetch_count, 0);
+        }
+        buff_prefetch_count = 0;
+      }
+    }
+  }
+
+  if (!support_prefetch) {
+    ASSERT_GT(buff_prefetch_level_count[1], buff_prefetch_level_count[2]);
+  }
+
+  SyncPoint::GetInstance()->DisableProcessing();
+  SyncPoint::GetInstance()->ClearAllCallBacks();
+  Close();
+}
+
 INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest,
                         ::testing::Combine(::testing::Bool(),
                                            ::testing::Bool()));
+#endif  // !ROCKSDB_LITE
+
+class PrefetchTest1 : public DBTestBase,
+                      public ::testing::WithParamInterface<bool> {
+ public:
+  PrefetchTest1() : DBTestBase("/prefetch_test1", true) {}
+};
 }  // namespace ROCKSDB_NAMESPACE
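The three SetOptions cases above map onto the clamping logic added in block_prefetcher.cc further down. As a standalone sketch (not RocksDB code; the function and output format are invented for illustration), this is the readahead progression each cap produces, assuming the 8 KB initial size of BlockBasedTable::kInitAutoReadaheadSize:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

// Model of the auto-readahead sizing: start at 8 KB, double on each
// additional read, never exceed the configured cap; a cap of 0 turns
// prefetching off entirely.
void PrintReadaheadProgression(size_t max_auto_readahead_size, int num_reads) {
  const size_t kInit = 8 * 1024;
  if (max_auto_readahead_size == 0) {
    std::printf("cap=0: no prefetching\n");
    return;
  }
  size_t readahead = std::min(kInit, max_auto_readahead_size);
  for (int i = 0; i < num_reads; i++) {
    std::printf("cap=%zu read %d: readahead=%zu\n", max_auto_readahead_size, i,
                readahead);
    readahead = std::min(max_auto_readahead_size, readahead * 2);
  }
}

int main() {
  PrintReadaheadProgression(0, 4);      // level 0 case: disabled
  PrintReadaheadProgression(4096, 4);   // level 1 case: stays at 4 KB
  PrintReadaheadProgression(65536, 6);  // level 2 case: 8K doubling up to 64K
  return 0;
}
```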
diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h
index a2bfe3cb4..1cb7290fe 100644
--- a/include/rocksdb/table.h
+++ b/include/rocksdb/table.h
@@ -435,6 +435,33 @@ struct BlockBasedTableOptions {
   IndexShorteningMode index_shortening =
       IndexShorteningMode::kShortenSeparators;
+
+  // RocksDB does auto-readahead for iterators on noticing more than two reads
+  // for a table file if the user doesn't provide readahead_size. The readahead
+  // starts at 8KB and doubles on every additional read up to
+  // max_auto_readahead_size, which can be configured.
+  //
+  // Special value: 0 - If max_auto_readahead_size is set to 0, no implicit
+  // auto prefetching will be done. If the max_auto_readahead_size provided is
+  // less than 8KB (the initial readahead size used by RocksDB for
+  // auto-readahead), the readahead size will remain the same as
+  // max_auto_readahead_size.
+  //
+  // The value should be provided in bytes, e.g. 256 * 1024 for a readahead of
+  // 256 KB.
+  //
+  // Based on experiments, a 256 KB readahead size provides the best
+  // performance for auto-readahead. Experiment data is in PR #3282.
+  //
+  // This parameter can be changed dynamically by
+  // DB::SetOptions({{"block_based_table_factory",
+  //                  "{max_auto_readahead_size=0;}"}});
+  //
+  // Changing the value dynamically will only affect files opened after the
+  // change.
+  //
+  // Default: 256 KB (256 * 1024).
+  size_t max_auto_readahead_size = 256 * 1024;
 };
 
 // Table Properties that are specific to block-based table properties.
diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc
index c2ff17c7b..a9f46cd92 100644
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@@ -179,7 +179,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
           "hash_index_allow_collision=false;"
           "verify_compression=true;read_amp_bytes_per_bit=0;"
           "enable_index_compression=false;"
-          "block_align=true",
+          "block_align=true;"
+          "max_auto_readahead_size=0",
           new_bbto));
 
   ASSERT_EQ(unset_bytes_base,
diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc
index 16f74cf55..53e139a2d 100644
--- a/table/block_based/block_based_table_factory.cc
+++ b/table/block_based/block_based_table_factory.cc
@@ -415,6 +415,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
             auto* cache = reinterpret_cast<std::shared_ptr<Cache>*>(addr);
             return Cache::CreateFromString(opts, value, cache);
           }}},
+        {"max_auto_readahead_size",
+         {offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
+          OptionType::kSizeT, OptionVerificationType::kNormal,
+          OptionTypeFlags::kMutable}},
 #endif  // ROCKSDB_LITE
 };
 
@@ -687,6 +691,10 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const {
   snprintf(buffer, kBufferSize, "  block_align: %d\n",
            table_options_.block_align);
   ret.append(buffer);
+  snprintf(buffer, kBufferSize,
+           "  max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
+           table_options_.max_auto_readahead_size);
+  ret.append(buffer);
   return ret;
 }
diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc
index 8758f7263..e210455fa 100644
--- a/table/block_based/block_based_table_reader.cc
+++ b/table/block_based/block_based_table_reader.cc
@@ -67,11 +67,6 @@ extern const uint64_t kBlockBasedTableMagicNumber;
 extern const std::string kHashIndexPrefixesBlock;
 extern const std::string kHashIndexPrefixesMetadataBlock;
-
-// Found that 256 KB readahead size provides the best performance, based on
-// experiments, for auto readahead. Experiment data is in PR #3282.
-const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;
-
 BlockBasedTable::~BlockBasedTable() { delete rep_; }
 
@@ -2921,7 +2916,7 @@ Status BlockBasedTable::VerifyChecksumInBlocks(
   // increasing of the buffer size.
   size_t readahead_size = (read_options.readahead_size != 0)
                               ? read_options.readahead_size
-                              : kMaxAutoReadaheadSize;
+                              : rep_->table_options.max_auto_readahead_size;
   // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
   // needed there.
   FilePrefetchBuffer prefetch_buffer(
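Because the factory registers max_auto_readahead_size as a kSizeT option with OptionTypeFlags::kMutable, it participates in options-string parsing, which is what the settable test above exercises. A minimal sketch, assuming the non-LITE convenience API GetBlockBasedTableOptionsFromString from rocksdb/convenience.h; the 131072 value is arbitrary:

```cpp
#include <cassert>

#include "rocksdb/convenience.h"
#include "rocksdb/table.h"

int main() {
  ROCKSDB_NAMESPACE::BlockBasedTableOptions base, parsed;
  // Parse the new field from an options string into a fresh options struct.
  ROCKSDB_NAMESPACE::Status s =
      ROCKSDB_NAMESPACE::GetBlockBasedTableOptionsFromString(
          base, "max_auto_readahead_size=131072", &parsed);
  assert(s.ok());
  assert(parsed.max_auto_readahead_size == 128 * 1024);
  return 0;
}
```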
diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h
index 37a325dbc..031206761 100644
--- a/table/block_based/block_based_table_reader.h
+++ b/table/block_based/block_based_table_reader.h
@@ -65,9 +65,6 @@ class BlockBasedTable : public TableReader {
   // All the below fields control iterator readahead
   static const size_t kInitAutoReadaheadSize = 8 * 1024;
-  // Found that 256 KB readahead size provides the best performance, based on
-  // experiments, for auto readahead. Experiment data is in PR #3282.
-  static const size_t kMaxAutoReadaheadSize;
   static const int kMinNumFileReadsToStartAutoReadahead = 2;
 
   // Attempt to open the table that is stored in bytes [0..file_size)
diff --git a/table/block_based/block_prefetcher.cc b/table/block_based/block_prefetcher.cc
index aa3fc3610..aab555a0c 100644
--- a/table/block_based/block_prefetcher.cc
+++ b/table/block_based/block_prefetcher.cc
@@ -35,10 +35,23 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
     return;
   }
 
+  size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size;
+  size_t initial_auto_readahead_size = BlockBasedTable::kInitAutoReadaheadSize;
+
+  // If max_auto_readahead_size is set to 0 by the user, no data will be
+  // prefetched.
+  if (max_auto_readahead_size == 0) {
+    return;
+  }
+
+  if (initial_auto_readahead_size > max_auto_readahead_size) {
+    initial_auto_readahead_size = max_auto_readahead_size;
+  }
+
   if (rep->file->use_direct_io()) {
-    rep->CreateFilePrefetchBufferIfNotExists(
-        BlockBasedTable::kInitAutoReadaheadSize,
-        BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
+    rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size,
+                                             max_auto_readahead_size,
+                                             &prefetch_buffer_);
     return;
   }
 
@@ -47,20 +60,24 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
     return;
   }
 
+  if (readahead_size_ > max_auto_readahead_size) {
+    readahead_size_ = max_auto_readahead_size;
+  }
+
   // If prefetch is not supported, fall back to use internal prefetch buffer.
   // Discarding other return status of Prefetch calls intentionally, as
   // we can fallback to reading from disk if Prefetch fails.
   Status s = rep->file->Prefetch(handle.offset(), readahead_size_);
   if (s.IsNotSupported()) {
-    rep->CreateFilePrefetchBufferIfNotExists(
-        BlockBasedTable::kInitAutoReadaheadSize,
-        BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_);
+    rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size,
+                                             max_auto_readahead_size,
+                                             &prefetch_buffer_);
     return;
   }
 
   readahead_limit_ = static_cast<size_t>(handle.offset() + readahead_size_);
+
   // Keep exponentially increasing readahead size until
-  // kMaxAutoReadaheadSize.
-  readahead_size_ =
-      std::min(BlockBasedTable::kMaxAutoReadaheadSize, readahead_size_ * 2);
+  // max_auto_readahead_size.
+  readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
 }
 }  // namespace ROCKSDB_NAMESPACE
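Note that both the reader and prefetcher changes only apply on the auto-readahead path, i.e. when ReadOptions::readahead_size is left at its default of 0; a caller-provided readahead_size bypasses the doubling logic and its cap entirely. A small sketch of the two modes (assumes an already-open db handle; the 2 MB value is arbitrary):

```cpp
#include <memory>

#include "rocksdb/db.h"

// Shows the two readahead modes for an iterator scan.
void ScanWithReadahead(ROCKSDB_NAMESPACE::DB* db) {
  // Auto-readahead: readahead_size stays 0, so after
  // kMinNumFileReadsToStartAutoReadahead reads per file, RocksDB prefetches
  // 8 KB and doubles up to BlockBasedTableOptions::max_auto_readahead_size.
  ROCKSDB_NAMESPACE::ReadOptions auto_ra;
  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(db->NewIterator(auto_ra));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
  }

  // Explicit readahead: a non-zero readahead_size skips auto-readahead, so
  // max_auto_readahead_size is never consulted.
  ROCKSDB_NAMESPACE::ReadOptions fixed_ra;
  fixed_ra.readahead_size = 2 * 1024 * 1024;  // 2 MB
  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it2(db->NewIterator(fixed_ra));
  for (it2->SeekToFirst(); it2->Valid(); it2->Next()) {
  }
}
```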