Make BlockBasedTable::kMaxAutoReadAheadSize configurable (#7951)

Summary:
RocksDB does auto-readahead for iterators on noticing more
than two reads for a table file. The readahead starts at 8KB and doubles on every
additional read upto BlockBasedTable::kMaxAutoReadAheadSize which is
256*1024.
This PR adds a new option BlockBasedTableOptions::max_auto_readahead_size which
replaces BlockBasedTable::kMaxAutoReadAheadSize and the new option can be
configured.
If max_auto_readahead_size is set 0 then no implicit auto prefetching will
be done. If max_auto_readahead_size provided is less than
8KB (which is initial readahead size used by rocksdb in case of
auto-readahead), readahead size will remain same as max_auto_readahead_size.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/7951

Test Plan: Add new unit test case.

Reviewed By: anand1976

Differential Revision: D26568085

Pulled By: akankshamahajan15

fbshipit-source-id: b6543520fc74e97d859f2002328d4c5254d417af
main
Akanksha Mahajan 4 years ago committed by Facebook GitHub Bot
parent e017af15c1
commit cd79a00903
  1. 2
      HISTORY.md
  2. 145
      file/prefetch_test.cc
  3. 27
      include/rocksdb/table.h
  4. 3
      options/options_settable_test.cc
  5. 7
      table/block_based/block_based_table_factory.cc
  6. 7
      table/block_based/block_based_table_reader.cc
  7. 3
      table/block_based/block_based_table_reader.h
  8. 35
      table/block_based/block_prefetcher.cc

@ -1,5 +1,7 @@
# Rocksdb Change Log # Rocksdb Change Log
## Unreleased ## Unreleased
### Public API change
* Add a new option BlockBasedTableOptions::max_auto_readahead_size. RocksDB does auto-readahead for iterators on noticing more than two reads for a table file if user doesn't provide readahead_size. The readahead starts at 8KB and doubles on every additional read upto max_auto_readahead_size and now max_auto_readahead_size can be configured dynamically as well. Found that 256 KB readahead size provides the best performance, based on experiments, for auto readahead. Experiment data is in PR #3282. If value is set 0 then no automatic prefetching will be done by rocksdb. Also changing the value will only affect files opened after the change.
## 6.18.0 (02/19/2021) ## 6.18.0 (02/19/2021)
### Behavior Changes ### Behavior Changes

@ -175,9 +175,154 @@ TEST_P(PrefetchTest, Basic) {
Close(); Close();
} }
#ifndef ROCKSDB_LITE
TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) {
// First param is if the mockFS support_prefetch or not
bool support_prefetch =
std::get<0>(GetParam()) &&
test::IsPrefetchSupported(env_->GetFileSystem(), dbname_);
// Second param is if directIO is enabled or not
bool use_direct_io = std::get<1>(GetParam());
std::shared_ptr<MockFS> fs =
std::make_shared<MockFS>(env_->GetFileSystem(), support_prefetch);
std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
Options options = CurrentOptions();
options.write_buffer_size = 1024;
options.create_if_missing = true;
options.compression = kNoCompression;
options.env = env.get();
options.disable_auto_compactions = true;
if (use_direct_io) {
options.use_direct_reads = true;
options.use_direct_io_for_flush_and_compaction = true;
}
BlockBasedTableOptions table_options;
table_options.no_block_cache = true;
table_options.cache_index_and_filter_blocks = false;
table_options.metadata_block_size = 1024;
table_options.index_type =
BlockBasedTableOptions::IndexType::kTwoLevelIndexSearch;
table_options.max_auto_readahead_size = 0;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
int buff_prefetch_count = 0;
SyncPoint::GetInstance()->SetCallBack("FilePrefetchBuffer::Prefetch:Start",
[&](void*) { buff_prefetch_count++; });
// DB open will create table readers unless we reduce the table cache
// capacity. SanitizeOptions will set max_open_files to minimum of 20. Table
// cache is allocated with max_open_files - 10 as capacity. So override
// max_open_files to 10 so table cache capacity will become 0. This will
// prevent file open during DB open and force the file to be opened during
// Iteration.
SyncPoint::GetInstance()->SetCallBack(
"SanitizeOptions::AfterChangeMaxOpenFiles", [&](void* arg) {
int* max_open_files = (int*)arg;
*max_open_files = 11;
});
SyncPoint::GetInstance()->EnableProcessing();
Status s = TryReopen(options);
if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) {
// If direct IO is not supported, skip the test
return;
} else {
ASSERT_OK(s);
}
Random rnd(309);
int key_count = 0;
const int num_keys_per_level = 100;
// Level 0 : Keys in range [0, 99], Level 1:[100, 199], Level 2:[200, 299].
for (int level = 2; level >= 0; level--) {
key_count = level * num_keys_per_level;
for (int i = 0; i < num_keys_per_level; ++i) {
ASSERT_OK(Put(Key(key_count++), rnd.RandomString(500)));
}
ASSERT_OK(Flush());
MoveFilesToLevel(level);
}
Close();
std::vector<int> buff_prefectch_level_count = {0, 0, 0};
TryReopen(options);
{
auto iter = std::unique_ptr<Iterator>(db_->NewIterator(ReadOptions()));
fs->ClearPrefetchCount();
buff_prefetch_count = 0;
for (int level = 2; level >= 0; level--) {
key_count = level * num_keys_per_level;
switch (level) {
case 0:
// max_auto_readahead_size is set 0 so data and index blocks are not
// prefetched.
ASSERT_OK(db_->SetOptions(
{{"block_based_table_factory", "{max_auto_readahead_size=0;}"}}));
break;
case 1:
// max_auto_readahead_size is set less than
// BlockBasedTable::kInitAutoReadaheadSize. So readahead_size remains
// equal to max_auto_readahead_size.
ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
"{max_auto_readahead_size=4096;}"}}));
break;
case 2:
ASSERT_OK(db_->SetOptions({{"block_based_table_factory",
"{max_auto_readahead_size=65536;}"}}));
break;
default:
assert(false);
}
for (int i = 0; i < num_keys_per_level; ++i) {
iter->Seek(Key(key_count++));
iter->Next();
}
buff_prefectch_level_count[level] = buff_prefetch_count;
if (support_prefetch && !use_direct_io) {
if (level == 0) {
ASSERT_FALSE(fs->IsPrefetchCalled());
} else {
ASSERT_TRUE(fs->IsPrefetchCalled());
}
fs->ClearPrefetchCount();
} else {
ASSERT_FALSE(fs->IsPrefetchCalled());
if (level == 0) {
ASSERT_EQ(buff_prefetch_count, 0);
} else {
ASSERT_GT(buff_prefetch_count, 0);
}
buff_prefetch_count = 0;
}
}
}
if (!support_prefetch) {
ASSERT_GT(buff_prefectch_level_count[1], buff_prefectch_level_count[2]);
}
SyncPoint::GetInstance()->DisableProcessing();
SyncPoint::GetInstance()->ClearAllCallBacks();
Close();
}
INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest, INSTANTIATE_TEST_CASE_P(PrefetchTest, PrefetchTest,
::testing::Combine(::testing::Bool(), ::testing::Combine(::testing::Bool(),
::testing::Bool())); ::testing::Bool()));
#endif // !ROCKSDB_LITE
class PrefetchTest1 : public DBTestBase,
public ::testing::WithParamInterface<bool> {
public:
PrefetchTest1() : DBTestBase("/prefetch_test1", true) {}
};
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

@ -435,6 +435,33 @@ struct BlockBasedTableOptions {
IndexShorteningMode index_shortening = IndexShorteningMode index_shortening =
IndexShorteningMode::kShortenSeparators; IndexShorteningMode::kShortenSeparators;
// RocksDB does auto-readahead for iterators on noticing more than two reads
// for a table file if user doesn't provide readahead_size. The readahead
// starts at 8KB and doubles on every additional read upto
// max_auto_readahead_size and max_auto_readahead_size can be configured.
//
// Special Value: 0 - If max_auto_readahead_size is set 0 then no implicit
// auto prefetching will be done. If max_auto_readahead_size provided is less
// than 8KB (which is initial readahead size used by rocksdb in case of
// auto-readahead), readahead size will remain same as
// max_auto_readahead_size.
//
// Value should be provided along with KB i.e. 256 * 1024 as it will prefetch
// the blocks.
//
// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
//
// This parameter can be changed dynamically by
// DB::SetOptions({{"block_based_table_factory",
// "{max_auto_readahead_size=0;}"}}));
//
// Changing the value dynamically will only affect files opened after the
// change.
//
// Default: 256 KB (256 * 1024).
size_t max_auto_readahead_size = 256 * 1024;
}; };
// Table Properties that are specific to block-based table properties. // Table Properties that are specific to block-based table properties.

@ -179,7 +179,8 @@ TEST_F(OptionsSettableTest, BlockBasedTableOptionsAllFieldsSettable) {
"hash_index_allow_collision=false;" "hash_index_allow_collision=false;"
"verify_compression=true;read_amp_bytes_per_bit=0;" "verify_compression=true;read_amp_bytes_per_bit=0;"
"enable_index_compression=false;" "enable_index_compression=false;"
"block_align=true", "block_align=true;"
"max_auto_readahead_size=0",
new_bbto)); new_bbto));
ASSERT_EQ(unset_bytes_base, ASSERT_EQ(unset_bytes_base,

@ -415,6 +415,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
auto* cache = reinterpret_cast<std::shared_ptr<Cache>*>(addr); auto* cache = reinterpret_cast<std::shared_ptr<Cache>*>(addr);
return Cache::CreateFromString(opts, value, cache); return Cache::CreateFromString(opts, value, cache);
}}}, }}},
{"max_auto_readahead_size",
{offsetof(struct BlockBasedTableOptions, max_auto_readahead_size),
OptionType::kSizeT, OptionVerificationType::kNormal,
OptionTypeFlags::kMutable}},
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
}; };
@ -687,6 +691,9 @@ std::string BlockBasedTableFactory::GetPrintableOptions() const {
snprintf(buffer, kBufferSize, " block_align: %d\n", snprintf(buffer, kBufferSize, " block_align: %d\n",
table_options_.block_align); table_options_.block_align);
ret.append(buffer); ret.append(buffer);
snprintf(buffer, kBufferSize,
" max_auto_readahead_size: %" ROCKSDB_PRIszt "\n",
table_options_.max_auto_readahead_size);
return ret; return ret;
} }

@ -67,11 +67,6 @@ extern const uint64_t kBlockBasedTableMagicNumber;
extern const std::string kHashIndexPrefixesBlock; extern const std::string kHashIndexPrefixesBlock;
extern const std::string kHashIndexPrefixesMetadataBlock; extern const std::string kHashIndexPrefixesMetadataBlock;
// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
const size_t BlockBasedTable::kMaxAutoReadaheadSize = 256 * 1024;
BlockBasedTable::~BlockBasedTable() { BlockBasedTable::~BlockBasedTable() {
delete rep_; delete rep_;
} }
@ -2921,7 +2916,7 @@ Status BlockBasedTable::VerifyChecksumInBlocks(
// increasing of the buffer size. // increasing of the buffer size.
size_t readahead_size = (read_options.readahead_size != 0) size_t readahead_size = (read_options.readahead_size != 0)
? read_options.readahead_size ? read_options.readahead_size
: kMaxAutoReadaheadSize; : rep_->table_options.max_auto_readahead_size;
// FilePrefetchBuffer doesn't work in mmap mode and readahead is not // FilePrefetchBuffer doesn't work in mmap mode and readahead is not
// needed there. // needed there.
FilePrefetchBuffer prefetch_buffer( FilePrefetchBuffer prefetch_buffer(

@ -65,9 +65,6 @@ class BlockBasedTable : public TableReader {
// All the below fields control iterator readahead // All the below fields control iterator readahead
static const size_t kInitAutoReadaheadSize = 8 * 1024; static const size_t kInitAutoReadaheadSize = 8 * 1024;
// Found that 256 KB readahead size provides the best performance, based on
// experiments, for auto readahead. Experiment data is in PR #3282.
static const size_t kMaxAutoReadaheadSize;
static const int kMinNumFileReadsToStartAutoReadahead = 2; static const int kMinNumFileReadsToStartAutoReadahead = 2;
// Attempt to open the table that is stored in bytes [0..file_size) // Attempt to open the table that is stored in bytes [0..file_size)

@ -35,10 +35,23 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
return; return;
} }
size_t max_auto_readahead_size = rep->table_options.max_auto_readahead_size;
size_t initial_auto_readahead_size = BlockBasedTable::kInitAutoReadaheadSize;
// If max_auto_readahead_size is set to be 0 by user, no data will be
// prefetched.
if (max_auto_readahead_size == 0) {
return;
}
if (initial_auto_readahead_size > max_auto_readahead_size) {
initial_auto_readahead_size = max_auto_readahead_size;
}
if (rep->file->use_direct_io()) { if (rep->file->use_direct_io()) {
rep->CreateFilePrefetchBufferIfNotExists( rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size,
BlockBasedTable::kInitAutoReadaheadSize, max_auto_readahead_size,
BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_); &prefetch_buffer_);
return; return;
} }
@ -47,20 +60,24 @@ void BlockPrefetcher::PrefetchIfNeeded(const BlockBasedTable::Rep* rep,
return; return;
} }
if (readahead_size_ > max_auto_readahead_size) {
readahead_size_ = max_auto_readahead_size;
}
// If prefetch is not supported, fall back to use internal prefetch buffer. // If prefetch is not supported, fall back to use internal prefetch buffer.
// Discarding other return status of Prefetch calls intentionally, as // Discarding other return status of Prefetch calls intentionally, as
// we can fallback to reading from disk if Prefetch fails. // we can fallback to reading from disk if Prefetch fails.
Status s = rep->file->Prefetch(handle.offset(), readahead_size_); Status s = rep->file->Prefetch(handle.offset(), readahead_size_);
if (s.IsNotSupported()) { if (s.IsNotSupported()) {
rep->CreateFilePrefetchBufferIfNotExists( rep->CreateFilePrefetchBufferIfNotExists(initial_auto_readahead_size,
BlockBasedTable::kInitAutoReadaheadSize, max_auto_readahead_size,
BlockBasedTable::kMaxAutoReadaheadSize, &prefetch_buffer_); &prefetch_buffer_);
return; return;
} }
readahead_limit_ = static_cast<size_t>(handle.offset() + readahead_size_); readahead_limit_ = static_cast<size_t>(handle.offset() + readahead_size_);
// Keep exponentially increasing readahead size until // Keep exponentially increasing readahead size until
// kMaxAutoReadaheadSize. // max_auto_readahead_size.
readahead_size_ = readahead_size_ = std::min(max_auto_readahead_size, readahead_size_ * 2);
std::min(BlockBasedTable::kMaxAutoReadaheadSize, readahead_size_ * 2);
} }
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

Loading…
Cancel
Save