Combine the read-ahead logic for user reads and compaction reads (#5431)

Summary:
Currently, the read-ahead logic for user reads and compaction reads goes through different code paths: compaction reads create new table readers and wrap the input file in a `ReadaheadRandomAccessFile`. This change unifies the read-ahead logic so that both paths use the read-ahead done in BlockBasedTableIterator::InitDataBlock(). As a result, the `ReadaheadRandomAccessFile` class and the `new_table_reader_for_compaction_inputs` option are no longer used.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5431

Test Plan:
make check

Benchmark results: https://gist.github.com/vjnadimpalli/083cf423f7b6aa12dcdb14c858bc18a5

Differential Revision: D15772533

Pulled By: vjnadimpalli

fbshipit-source-id: b71dca710590471ede6fb37553388654e2e479b9
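
For orientation before the diff, here is a toy sketch (our illustration, not code from this patch) of the buffering model the unified path uses: compaction reads are served from a per-file prefetch buffer that is refilled in `compaction_readahead_size`-sized chunks, instead of going through a wrapped `ReadaheadRandomAccessFile`.

```cpp
#include <cstddef>
#include <cstdint>
#include <string>
#include <utility>
#include <vector>

// Toy model (not RocksDB code) of the readahead scheme this change
// standardizes on: the reader keeps a FilePrefetchBuffer-style buffer
// that is refilled in large chunks sized by compaction_readahead_size.
class PrefetchBufferModel {
 public:
  explicit PrefetchBufferModel(size_t readahead_size)
      : readahead_size_(readahead_size) {}

  // Serves [offset, offset + n) from the buffer when possible. On a miss
  // the real code issues one large read of about readahead_size_ bytes
  // and retries, so compaction scans become a few big sequential reads.
  bool TryReadFromCache(uint64_t offset, size_t n, std::string* out) const {
    if (offset < buffer_offset_ ||
        offset + n > buffer_offset_ + buffer_.size()) {
      return false;  // miss: caller refills via one readahead_size_ read
    }
    out->assign(buffer_.data() + (offset - buffer_offset_), n);
    return true;
  }

  size_t readahead_size() const { return readahead_size_; }

  void Refill(uint64_t offset, std::vector<char> data) {
    buffer_offset_ = offset;
    buffer_ = std::move(data);
  }

 private:
  size_t readahead_size_;
  uint64_t buffer_offset_ = 0;
  std::vector<char> buffer_;
};
```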
Branch: main
Commit: 24b118ad98 (parent fe90ed7a70)
Author: Vijay Nadimpalli, committed by Facebook GitHub Bot
Files changed (lines changed in parentheses):
  1. db/db_compaction_test.cc (21)
  2. db/table_cache.cc (76)
  3. db/table_cache.h (3)
  4. include/rocksdb/options.h (2)
  5. table/block_based/block_based_table_reader.cc (39)
  6. table/block_based/block_based_table_reader.h (19)
  7. table/block_fetcher.cc (5)
  8. table/block_fetcher.h (8)
  9. table/cuckoo/cuckoo_table_reader.cc (3)
 10. table/cuckoo/cuckoo_table_reader.h (6)
 11. table/meta_blocks.cc (11)
 12. table/mock_table.cc (3)
 13. table/mock_table.h (3)
 14. table/plain/plain_table_reader.cc (3)
 15. table/plain/plain_table_reader.h (6)
 16. table/table_reader.h (11)
 17. table/table_test.cc (4)
 18. util/file_reader_writer.cc (16)
 19. util/file_reader_writer.h (11)

@@ -497,14 +497,14 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
   // Create new iterator for:
   // (1) 1 for verifying flush results
-  // (2) 3 for compaction input files
-  // (3) 1 for verifying compaction results.
-  ASSERT_EQ(num_new_table_reader, 5);
+  // (2) 1 for verifying compaction results.
+  // (3) New TableReaders will not be created for compaction inputs
+  ASSERT_EQ(num_new_table_reader, 2);
   num_table_cache_lookup = 0;
   num_new_table_reader = 0;
   ASSERT_EQ(Key(1), Get(Key(1)));
-  ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3);
+  ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 5);
   ASSERT_EQ(num_new_table_reader, 0);
   num_table_cache_lookup = 0;
@@ -519,13 +519,14 @@ TEST_F(DBCompactionTest, TestTableReaderForCompaction) {
   // May preload table cache too.
   ASSERT_GE(num_table_cache_lookup, 1);
   old_num_table_cache_lookup2 = num_table_cache_lookup;
-  // One for compaction input, one for verifying compaction results.
-  ASSERT_EQ(num_new_table_reader, 2);
+  // One for verifying compaction results.
+  // No new iterator created for compaction.
+  ASSERT_EQ(num_new_table_reader, 1);
   num_table_cache_lookup = 0;
   num_new_table_reader = 0;
   ASSERT_EQ(Key(1), Get(Key(1)));
-  ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 2);
+  ASSERT_EQ(num_table_cache_lookup + old_num_table_cache_lookup2, 3);
   ASSERT_EQ(num_new_table_reader, 0);
   rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks();
@@ -4339,12 +4340,6 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) {
   options.env = new MockEnv(Env::Default());
   Reopen(options);
   bool readahead = false;
-  SyncPoint::GetInstance()->SetCallBack(
-      "TableCache::NewIterator:for_compaction", [&](void* arg) {
-        bool* use_direct_reads = static_cast<bool*>(arg);
-        ASSERT_EQ(*use_direct_reads,
-                  options.use_direct_reads);
-      });
   SyncPoint::GetInstance()->SetCallBack(
       "CompactionJob::OpenCompactionOutputFile", [&](void* arg) {
         bool* use_direct_writes = static_cast<bool*>(arg);

@@ -16,6 +16,7 @@
 #include "monitoring/perf_context_imp.h"
 #include "rocksdb/statistics.h"
+#include "table/block_based/block_based_table_reader.h"
 #include "table/get_context.h"
 #include "table/internal_iterator.h"
 #include "table/iterator_wrapper.h"
@@ -43,13 +44,6 @@ static void UnrefEntry(void* arg1, void* arg2) {
   cache->Release(h);
 }
-static void DeleteTableReader(void* arg1, void* arg2) {
-  TableReader* table_reader = reinterpret_cast<TableReader*>(arg1);
-  Statistics* stats = reinterpret_cast<Statistics*>(arg2);
-  RecordTick(stats, NO_FILE_CLOSES);
-  delete table_reader;
-}
 static Slice GetSliceForFileNumber(const uint64_t* file_number) {
   return Slice(reinterpret_cast<const char*>(file_number),
                sizeof(*file_number));
@@ -96,8 +90,8 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) {
 Status TableCache::GetTableReader(
     const EnvOptions& env_options,
     const InternalKeyComparator& internal_comparator, const FileDescriptor& fd,
-    bool sequential_mode, size_t readahead, bool record_read_stats,
-    HistogramImpl* file_read_hist, std::unique_ptr<TableReader>* table_reader,
+    bool sequential_mode, bool record_read_stats, HistogramImpl* file_read_hist,
+    std::unique_ptr<TableReader>* table_reader,
     const SliceTransform* prefix_extractor, bool skip_filters, int level,
     bool prefetch_index_and_filter_in_cache, bool for_compaction) {
   std::string fname =
@@ -107,13 +101,6 @@ Status TableCache::GetTableReader(
   RecordTick(ioptions_.statistics, NO_FILE_OPENS);
   if (s.ok()) {
-    if (readahead > 0 && !env_options.use_mmap_reads) {
-      // Not compatible with mmap files since ReadaheadRandomAccessFile requires
-      // its wrapped file's Read() to copy data into the provided scratch
-      // buffer, which mmap files don't use.
-      // TODO(ajkr): try madvise for mmap files in place of buffered readahead.
-      file = NewReadaheadRandomAccessFile(std::move(file), readahead);
-    }
     if (!sequential_mode && ioptions_.advise_random_on_open) {
       file->Hint(RandomAccessFile::RANDOM);
     }
@@ -164,10 +151,9 @@ Status TableCache::FindTable(const EnvOptions& env_options,
   }
   std::unique_ptr<TableReader> table_reader;
   s = GetTableReader(env_options, internal_comparator, fd,
-                     false /* sequential mode */, 0 /* readahead */,
-                     record_read_stats, file_read_hist, &table_reader,
-                     prefix_extractor, skip_filters, level,
-                     prefetch_index_and_filter_in_cache);
+                     false /* sequential mode */, record_read_stats,
+                     file_read_hist, &table_reader, prefix_extractor,
+                     skip_filters, level, prefetch_index_and_filter_in_cache);
   if (!s.ok()) {
     assert(table_reader == nullptr);
     RecordTick(ioptions_.statistics, NO_FILE_ERRORS);
@@ -196,48 +182,21 @@ InternalIterator* TableCache::NewIterator(
   PERF_TIMER_GUARD(new_table_iterator_nanos);
   Status s;
-  bool create_new_table_reader = false;
   TableReader* table_reader = nullptr;
   Cache::Handle* handle = nullptr;
   if (table_reader_ptr != nullptr) {
     *table_reader_ptr = nullptr;
   }
-  size_t readahead = 0;
-  if (for_compaction) {
-#ifndef NDEBUG
-    bool use_direct_reads_for_compaction = env_options.use_direct_reads;
-    TEST_SYNC_POINT_CALLBACK("TableCache::NewIterator:for_compaction",
-                             &use_direct_reads_for_compaction);
-#endif  // !NDEBUG
-    if (ioptions_.new_table_reader_for_compaction_inputs) {
-      // get compaction_readahead_size from env_options allows us to set the
-      // value dynamically
-      readahead = env_options.compaction_readahead_size;
-      create_new_table_reader = true;
-    }
-  }
   auto& fd = file_meta.fd;
-  if (create_new_table_reader) {
-    std::unique_ptr<TableReader> table_reader_unique_ptr;
-    s = GetTableReader(
-        env_options, icomparator, fd, true /* sequential_mode */, readahead,
-        !for_compaction /* record stats */, nullptr, &table_reader_unique_ptr,
-        prefix_extractor, false /* skip_filters */, level,
-        true /* prefetch_index_and_filter_in_cache */, for_compaction);
-    if (s.ok()) {
-      table_reader = table_reader_unique_ptr.release();
-    }
-  } else {
-    table_reader = fd.table_reader;
-    if (table_reader == nullptr) {
-      s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor,
-                    options.read_tier == kBlockCacheTier /* no_io */,
-                    !for_compaction /* record read_stats */, file_read_hist,
-                    skip_filters, level);
-      if (s.ok()) {
-        table_reader = GetTableReaderFromHandle(handle);
-      }
-    }
-  }
+  table_reader = fd.table_reader;
+  if (table_reader == nullptr) {
+    s = FindTable(env_options, icomparator, fd, &handle, prefix_extractor,
+                  options.read_tier == kBlockCacheTier /* no_io */,
+                  !for_compaction /* record read_stats */, file_read_hist,
+                  skip_filters, level);
+    if (s.ok()) {
+      table_reader = GetTableReaderFromHandle(handle);
+    }
+  }
   InternalIterator* result = nullptr;
@@ -247,13 +206,10 @@ InternalIterator* TableCache::NewIterator(
     result = NewEmptyInternalIterator<Slice>(arena);
   } else {
     result = table_reader->NewIterator(options, prefix_extractor, arena,
-                                       skip_filters, for_compaction);
+                                       skip_filters, for_compaction,
+                                       env_options.compaction_readahead_size);
   }
-  if (create_new_table_reader) {
-    assert(handle == nullptr);
-    result->RegisterCleanup(&DeleteTableReader, table_reader,
-                            ioptions_.statistics);
-  } else if (handle != nullptr) {
+  if (handle != nullptr) {
     result->RegisterCleanup(&UnrefEntry, cache_, handle);
     handle = nullptr;  // prevent from releasing below
   }

@@ -177,8 +177,7 @@ class TableCache {
   Status GetTableReader(const EnvOptions& env_options,
                         const InternalKeyComparator& internal_comparator,
                         const FileDescriptor& fd, bool sequential_mode,
-                        size_t readahead, bool record_read_stats,
-                        HistogramImpl* file_read_hist,
+                        bool record_read_stats, HistogramImpl* file_read_hist,
                         std::unique_ptr<TableReader>* table_reader,
                         const SliceTransform* prefix_extractor = nullptr,
                         bool skip_filters = false, int level = -1,

@@ -760,6 +760,8 @@ struct DBOptions {
   // for this mode if using block-based table.
   //
   // Default: false
+  // This flag has no effect on the behavior of compaction and is planned
+  // for removal in the future.
   bool new_table_reader_for_compaction_inputs = false;
   // If non-zero, we perform bigger reads when doing compaction. If you're
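
Since the flag above becomes a no-op for compaction, the knob that still governs compaction read-ahead is `compaction_readahead_size`, documented just below it. A minimal sketch of relying on it (illustrative value):

```cpp
#include "rocksdb/options.h"

// Sketch: after this change, compaction read-ahead is controlled by
// compaction_readahead_size alone; new_table_reader_for_compaction_inputs
// no longer changes how compaction reads its inputs.
rocksdb::Options MakeCompactionReadaheadOptions() {
  rocksdb::Options options;
  options.compaction_readahead_size = 2 * 1024 * 1024;  // e.g. 2 MB chunks
  return options;
}
```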

@@ -83,12 +83,13 @@ Status ReadBlockFromFile(
     bool do_uncompress, bool maybe_compressed, BlockType block_type,
     const UncompressionDict& uncompression_dict,
     const PersistentCacheOptions& cache_options, SequenceNumber global_seqno,
-    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator) {
+    size_t read_amp_bytes_per_bit, MemoryAllocator* memory_allocator,
+    bool for_compaction = false) {
   BlockContents contents;
-  BlockFetcher block_fetcher(file, prefetch_buffer, footer, options, handle,
-                             &contents, ioptions, do_uncompress,
-                             maybe_compressed, block_type, uncompression_dict,
-                             cache_options, memory_allocator);
+  BlockFetcher block_fetcher(
+      file, prefetch_buffer, footer, options, handle, &contents, ioptions,
+      do_uncompress, maybe_compressed, block_type, uncompression_dict,
+      cache_options, memory_allocator, nullptr, for_compaction);
   Status s = block_fetcher.ReadBlockContents();
   if (s.ok()) {
     result->reset(new Block(std::move(contents), global_seqno,
@@ -1906,7 +1907,7 @@ CachableEntry<FilterBlockReader> BlockBasedTable::GetFilter(
   if (!is_a_filter_partition && rep_->filter_entry.IsCached()) {
     return {rep_->filter_entry.GetValue(), /*cache=*/nullptr,
             /*cache_handle=*/nullptr, /*own_value=*/false};
   }
   PERF_TIMER_GUARD(read_filter_block_nanos);
@@ -2075,7 +2076,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
     const ReadOptions& ro, const BlockHandle& handle, TBlockIter* input_iter,
     BlockType block_type, bool key_includes_seq, bool index_key_is_full,
     GetContext* get_context, BlockCacheLookupContext* lookup_context, Status s,
-    FilePrefetchBuffer* prefetch_buffer) const {
+    FilePrefetchBuffer* prefetch_buffer, bool for_compaction) const {
   PERF_TIMER_GUARD(new_table_block_iter_nanos);
   TBlockIter* iter = input_iter != nullptr ? input_iter : new TBlockIter;
@@ -2094,7 +2095,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
     CachableEntry<Block> block;
     s = RetrieveBlock(prefetch_buffer, ro, handle, uncompression_dict, &block,
-                      block_type, get_context, lookup_context);
+                      block_type, get_context, lookup_context, for_compaction);
     if (!s.ok()) {
       assert(block.IsEmpty());
@@ -2144,6 +2145,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator(
         s = block_cache->Insert(unique_key, nullptr,
                                 block.GetValue()->ApproximateMemoryUsage(),
                                 nullptr, &cache_handle);
         if (s.ok()) {
           assert(cache_handle != nullptr);
           iter->RegisterCleanup(&ForceReleaseCachedEntry, block_cache,
@@ -2297,7 +2299,8 @@ Status BlockBasedTable::RetrieveBlock(
     FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro,
     const BlockHandle& handle, const UncompressionDict& uncompression_dict,
     CachableEntry<Block>* block_entry, BlockType block_type,
-    GetContext* get_context, BlockCacheLookupContext* lookup_context) const {
+    GetContext* get_context, BlockCacheLookupContext* lookup_context,
+    bool for_compaction) const {
   assert(block_entry);
   assert(block_entry->IsEmpty());
@@ -2340,7 +2343,7 @@ Status BlockBasedTable::RetrieveBlock(
         block_type == BlockType::kData
             ? rep_->table_options.read_amp_bytes_per_bit
             : 0,
-        GetMemoryAllocator(rep_->table_options));
+        GetMemoryAllocator(rep_->table_options), for_compaction);
   }
   if (!s.ok()) {
@@ -2714,13 +2717,18 @@ void BlockBasedTableIterator<TBlockIter, TValue>::InitDataBlock() {
           rep->file.get(), read_options_.readahead_size,
           read_options_.readahead_size));
     }
+  } else if (!prefetch_buffer_) {
+    prefetch_buffer_.reset(
+        new FilePrefetchBuffer(rep->file.get(), compaction_readahead_size_,
+                               compaction_readahead_size_));
   }
   Status s;
   table_->NewDataBlockIterator<TBlockIter>(
       read_options_, data_block_handle, &block_iter_, block_type_,
       key_includes_seq_, index_key_is_full_,
-      /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get());
+      /*get_context=*/nullptr, &lookup_context_, s, prefetch_buffer_.get(),
+      for_compaction_);
   block_iter_points_to_real_block_ = true;
 }
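
A note on the hunk above: the `else if (!prefetch_buffer_)` arm means a compaction iterator allocates its `FilePrefetchBuffer` lazily on the first data block and then reuses it for every later block of the same file, so each compaction input carries exactly one readahead window. A condensed sketch of that lifetime (hypothetical stand-in types, not the patch itself):

```cpp
#include <cstddef>
#include <memory>

// Stand-in for util/file_reader_writer.h's FilePrefetchBuffer; the two
// arguments model its initial and maximum readahead sizes.
struct PrefetchBufferStandIn {
  PrefetchBufferStandIn(size_t readahead_size, size_t max_readahead_size) {}
};

struct CompactionIteratorSketch {
  size_t compaction_readahead_size_ = 2 << 20;  // from EnvOptions
  std::unique_ptr<PrefetchBufferStandIn> prefetch_buffer_;

  void InitDataBlock() {
    // First block of the file: create the buffer once...
    if (!prefetch_buffer_) {
      prefetch_buffer_.reset(new PrefetchBufferStandIn(
          compaction_readahead_size_, compaction_readahead_size_));
    }
    // ...subsequent blocks reuse it, keeping one readahead window per file.
  }
};
```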
@@ -2806,7 +2814,8 @@ void BlockBasedTableIterator<TBlockIter, TValue>::CheckOutOfBound() {
 InternalIterator* BlockBasedTable::NewIterator(
     const ReadOptions& read_options, const SliceTransform* prefix_extractor,
-    Arena* arena, bool skip_filters, bool for_compaction) {
+    Arena* arena, bool skip_filters, bool for_compaction,
+    size_t compaction_readahead_size) {
   BlockCacheLookupContext lookup_context{
       for_compaction ? BlockCacheLookupCaller::kCompaction
                      : BlockCacheLookupCaller::kUserIterator};
@@ -2823,7 +2832,8 @@ InternalIterator* BlockBasedTable::NewIterator(
         !skip_filters && !read_options.total_order_seek &&
             prefix_extractor != nullptr,
         need_upper_bound_check, prefix_extractor, BlockType::kData,
-        true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction);
+        true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction,
+        compaction_readahead_size);
   } else {
     auto* mem =
         arena->AllocateAligned(sizeof(BlockBasedTableIterator<DataBlockIter>));
@@ -2835,7 +2845,8 @@ InternalIterator* BlockBasedTable::NewIterator(
         !skip_filters && !read_options.total_order_seek &&
             prefix_extractor != nullptr,
         need_upper_bound_check, prefix_extractor, BlockType::kData,
-        true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction);
+        true /*key_includes_seq*/, true /*index_key_is_full*/, for_compaction,
+        compaction_readahead_size);
   }
 }

@@ -123,6 +123,8 @@ class BlockBasedTable : public TableReader {
   // The result of NewIterator() is initially invalid (caller must
   // call one of the Seek methods on the iterator before using it).
   // @param skip_filters Disables loading/accessing the filter block
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
   InternalIterator* NewIterator(
       const ReadOptions&, const SliceTransform* prefix_extractor,
       Arena* arena = nullptr, bool skip_filters = false,
@@ -131,7 +133,8 @@ class BlockBasedTable : public TableReader {
       // i.e., it will populate the block cache with blocks in the new SST
       // files. We treat those as a user is calling iterator for now. We should
       // differentiate the callers.
-      bool for_compaction = false) override;
+      bool for_compaction = false,
+      size_t compaction_readahead_size = 0) override;
   FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
       const ReadOptions& read_options) override;
@@ -234,7 +237,7 @@ class BlockBasedTable : public TableReader {
       TBlockIter* input_iter, BlockType block_type, bool key_includes_seq,
       bool index_key_is_full, GetContext* get_context,
       BlockCacheLookupContext* lookup_context, Status s,
-      FilePrefetchBuffer* prefetch_buffer) const;
+      FilePrefetchBuffer* prefetch_buffer, bool for_compaction = false) const;
   class PartitionedIndexIteratorState;
@@ -283,7 +286,8 @@ class BlockBasedTable : public TableReader {
                        const UncompressionDict& uncompression_dict,
                        CachableEntry<Block>* block_entry, BlockType block_type,
                        GetContext* get_context,
-                       BlockCacheLookupContext* lookup_context) const;
+                       BlockCacheLookupContext* lookup_context,
+                       bool for_compaction = false) const;
   // For the following two functions:
   // if `no_io == true`, we will not try to read filter/index from sst file
@@ -596,6 +600,8 @@ struct BlockBasedTable::Rep {
 // Iterates over the contents of BlockBasedTable.
 template <class TBlockIter, typename TValue = Slice>
 class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
  public:
   BlockBasedTableIterator(const BlockBasedTable* table,
                           const ReadOptions& read_options,
@@ -605,7 +611,8 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
                           const SliceTransform* prefix_extractor,
                           BlockType block_type, bool key_includes_seq = true,
                           bool index_key_is_full = true,
-                          bool for_compaction = false)
+                          bool for_compaction = false,
+                          size_t compaction_readahead_size = 0)
       : InternalIteratorBase<TValue>(false),
         table_(table),
         read_options_(read_options),
@@ -621,6 +628,7 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
         key_includes_seq_(key_includes_seq),
         index_key_is_full_(index_key_is_full),
         for_compaction_(for_compaction),
+        compaction_readahead_size_(compaction_readahead_size),
         lookup_context_(for_compaction
                             ? BlockCacheLookupCaller::kCompaction
                             : BlockCacheLookupCaller::kUserIterator) {}
@@ -734,6 +742,9 @@ class BlockBasedTableIterator : public InternalIteratorBase<TValue> {
   bool index_key_is_full_;
   // If this iterator is created for compaction
   bool for_compaction_;
+  // Readahead size used in compaction, its value is used only if
+  // for_compaction_ = true
+  size_t compaction_readahead_size_;
   BlockHandle prev_index_value_;
   BlockCacheLookupContext lookup_context_;

@@ -93,7 +93,8 @@ inline bool BlockFetcher::TryGetFromPrefetchBuffer() {
   if (prefetch_buffer_ != nullptr &&
       prefetch_buffer_->TryReadFromCache(
          handle_.offset(),
-          static_cast<size_t>(handle_.size()) + kBlockTrailerSize, &slice_)) {
+          static_cast<size_t>(handle_.size()) + kBlockTrailerSize, &slice_,
+          for_compaction_)) {
     block_size_ = static_cast<size_t>(handle_.size());
     CheckBlockChecksum();
     if (!status_.ok()) {
@@ -217,7 +218,7 @@ Status BlockFetcher::ReadBlockContents() {
       PERF_TIMER_GUARD(block_read_time);
       // Actual file read
       status_ = file_->Read(handle_.offset(), block_size_ + kBlockTrailerSize,
-                            &slice_, used_buf_);
+                            &slice_, used_buf_, for_compaction_);
     }
     PERF_COUNTER_ADD(block_read_count, 1);

@@ -44,7 +44,8 @@ class BlockFetcher {
               const UncompressionDict& uncompression_dict,
               const PersistentCacheOptions& cache_options,
               MemoryAllocator* memory_allocator = nullptr,
-              MemoryAllocator* memory_allocator_compressed = nullptr)
+              MemoryAllocator* memory_allocator_compressed = nullptr,
+              bool for_compaction = false)
       : file_(file),
         prefetch_buffer_(prefetch_buffer),
         footer_(footer),
@@ -58,7 +59,9 @@ class BlockFetcher {
         uncompression_dict_(uncompression_dict),
         cache_options_(cache_options),
         memory_allocator_(memory_allocator),
-        memory_allocator_compressed_(memory_allocator_compressed) {}
+        memory_allocator_compressed_(memory_allocator_compressed),
+        for_compaction_(for_compaction) {}
   Status ReadBlockContents();
   CompressionType get_compression_type() const { return compression_type_; }
@@ -88,6 +91,7 @@ class BlockFetcher {
   char stack_buf_[kDefaultStackBufferSize];
   bool got_from_prefetch_buffer_ = false;
   rocksdb::CompressionType compression_type_;
+  bool for_compaction_ = false;
   // return true if found
   bool TryGetUncompressBlockFromPersistentCache();

@@ -377,7 +377,8 @@ Slice CuckooTableIterator::value() const {
 InternalIterator* CuckooTableReader::NewIterator(
     const ReadOptions& /*read_options*/,
     const SliceTransform* /* prefix_extractor */, Arena* arena,
-    bool /*skip_filters*/, bool /*for_compaction*/) {
+    bool /*skip_filters*/, bool /*for_compaction*/,
+    size_t /*compaction_readahead_size*/) {
   if (!status().ok()) {
     return NewErrorInternalIterator<Slice>(
         Status::Corruption("CuckooTableReader status is not okay."), arena);

@@ -45,11 +45,15 @@ class CuckooTableReader: public TableReader {
              GetContext* get_context, const SliceTransform* prefix_extractor,
              bool skip_filters = false) override;
+  // Returns a new iterator over table contents
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
   InternalIterator* NewIterator(const ReadOptions&,
                                 const SliceTransform* prefix_extractor,
                                 Arena* arena = nullptr,
                                 bool skip_filters = false,
-                                bool for_compaction = false) override;
+                                bool for_compaction = false,
+                                size_t compaction_readahead_size = 0) override;
   void Prepare(const Slice& target) override;
   // Report an approximation of how much memory has been used.

@@ -487,12 +487,11 @@ Status ReadMetaBlock(RandomAccessFileReader* file,
   read_options.verify_checksums = false;
   PersistentCacheOptions cache_options;
-  BlockFetcher block_fetcher(file, prefetch_buffer, footer, read_options,
-                             metaindex_handle, &metaindex_contents, ioptions,
-                             false /* decompress */, false /*maybe_compressed*/,
-                             BlockType::kMetaIndex,
-                             UncompressionDict::GetEmptyDict(), cache_options,
-                             memory_allocator);
+  BlockFetcher block_fetcher(
+      file, prefetch_buffer, footer, read_options, metaindex_handle,
+      &metaindex_contents, ioptions, false /* decompress */,
+      false /*maybe_compressed*/, BlockType::kMetaIndex,
+      UncompressionDict::GetEmptyDict(), cache_options, memory_allocator);
   status = block_fetcher.ReadBlockContents();
   if (!status.ok()) {
     return status;

@@ -34,7 +34,8 @@ stl_wrappers::KVMap MakeMockFile(
 InternalIterator* MockTableReader::NewIterator(
     const ReadOptions&, const SliceTransform* /* prefix_extractor */,
-    Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/) {
+    Arena* /*arena*/, bool /*skip_filters*/, bool /*for_compaction*/,
+    size_t /*compaction_readahead_size*/) {
   return new MockTableIterator(table_);
 }

@@ -44,7 +44,8 @@ class MockTableReader : public TableReader {
                                 const SliceTransform* prefix_extractor,
                                 Arena* arena = nullptr,
                                 bool skip_filters = false,
-                                bool for_compaction = false) override;
+                                bool for_compaction = false,
+                                size_t compaction_readahead_size = 0) override;
   Status Get(const ReadOptions& readOptions, const Slice& key,
              GetContext* get_context, const SliceTransform* prefix_extractor,

@@ -196,7 +196,8 @@ void PlainTableReader::SetupForCompaction() {
 InternalIterator* PlainTableReader::NewIterator(
     const ReadOptions& options, const SliceTransform* /* prefix_extractor */,
-    Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/) {
+    Arena* arena, bool /*skip_filters*/, bool /*for_compaction*/,
+    size_t /*compaction_readahead_size*/) {
   bool use_prefix_seek = !IsTotalOrderMode() && !options.total_order_seek;
   if (arena == nullptr) {
     return new PlainTableIterator(this, use_prefix_seek);

@@ -77,11 +77,15 @@ class PlainTableReader: public TableReader {
                 bool full_scan_mode, const bool immortal_table = false,
                 const SliceTransform* prefix_extractor = nullptr);
+  // Returns new iterator over table contents
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
   InternalIterator* NewIterator(const ReadOptions&,
                                 const SliceTransform* prefix_extractor,
                                 Arena* arena = nullptr,
                                 bool skip_filters = false,
-                                bool for_compaction = false) override;
+                                bool for_compaction = false,
+                                size_t compaction_readahead_size = 0) override;
   void Prepare(const Slice& target) override;

@@ -44,11 +44,12 @@ class TableReader {
   // all the states but those allocated in arena.
   // skip_filters: disables checking the bloom filters even if they exist. This
   // option is effective only for block-based table format.
-  virtual InternalIterator* NewIterator(const ReadOptions&,
-                                        const SliceTransform* prefix_extractor,
-                                        Arena* arena = nullptr,
-                                        bool skip_filters = false,
-                                        bool for_compaction = false) = 0;
+  // compaction_readahead_size: its value will only be used if for_compaction =
+  // true
+  virtual InternalIterator* NewIterator(
+      const ReadOptions&, const SliceTransform* prefix_extractor,
+      Arena* arena = nullptr, bool skip_filters = false,
+      bool for_compaction = false, size_t compaction_readahead_size = 0) = 0;
   virtual FragmentedRangeTombstoneIterator* NewRangeTombstoneIterator(
       const ReadOptions& /*read_options*/) {
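
A caller then threads the readahead size through this widened signature; the `TableCache::NewIterator` hunk earlier shows the real call site. A sketch of the pattern (a fragment, with `table_reader`, `options`, and the other arguments assumed to be in scope):

```cpp
// Sketch mirroring the call in TableCache::NewIterator above: user reads
// leave compaction_readahead_size at its default of 0, while compaction
// passes the size configured in EnvOptions.
InternalIterator* iter = table_reader->NewIterator(
    options, prefix_extractor, arena, skip_filters, for_compaction,
    env_options.compaction_readahead_size);
```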

@@ -3590,7 +3590,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
     BlockContents metaindex_contents;
     BlockFetchHelper(metaindex_handle, BlockType::kMetaIndex,
                      &metaindex_contents);
     Block metaindex_block(std::move(metaindex_contents),
                           kDisableGlobalSequenceNumber);
@@ -3608,7 +3608,7 @@ TEST_P(BlockBasedTableTest, PropertiesBlockRestartPointTest) {
     BlockContents properties_contents;
     BlockFetchHelper(properties_handle, BlockType::kProperties,
                      &properties_contents);
     Block properties_block(std::move(properties_contents),
                            kDisableGlobalSequenceNumber);

@@ -70,7 +70,7 @@ Status SequentialFileReader::Skip(uint64_t n) {
 }
 Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
-                                    char* scratch) const {
+                                    char* scratch, bool for_compaction) const {
   Status s;
   uint64_t elapsed = 0;
   {
@@ -90,7 +90,7 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
       buf.AllocateNewBuffer(read_size);
       while (buf.CurrentSize() < read_size) {
         size_t allowed;
-        if (for_compaction_ && rate_limiter_ != nullptr) {
+        if (for_compaction && rate_limiter_ != nullptr) {
           allowed = rate_limiter_->RequestToken(
               buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
               Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead);
@@ -134,7 +134,7 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
     const char* res_scratch = nullptr;
     while (pos < n) {
       size_t allowed;
-      if (for_compaction_ && rate_limiter_ != nullptr) {
+      if (for_compaction && rate_limiter_ != nullptr) {
        if (rate_limiter_->IsRateLimited(RateLimiter::OpType::kRead)) {
          sw.DelayStart();
        }
@@ -711,7 +711,8 @@ private:
 }  // namespace
 Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader,
-                                    uint64_t offset, size_t n) {
+                                    uint64_t offset, size_t n,
+                                    bool for_compaction) {
   size_t alignment = reader->file()->GetRequiredBufferAlignment();
   size_t offset_ = static_cast<size_t>(offset);
   uint64_t rounddown_offset = Rounddown(offset_, alignment);
@@ -771,7 +772,7 @@ Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader,
     Slice result;
     s = reader->Read(rounddown_offset + chunk_len,
                      static_cast<size_t>(roundup_len - chunk_len), &result,
-                     buffer_.BufferStart() + chunk_len);
+                     buffer_.BufferStart() + chunk_len, for_compaction);
     if (s.ok()) {
       buffer_offset_ = rounddown_offset;
       buffer_.Size(static_cast<size_t>(chunk_len) + result.size());
@@ -780,7 +781,7 @@ Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader,
 }
 bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
-                                          Slice* result) {
+                                          Slice* result, bool for_compaction) {
   if (track_min_offset_ && offset < min_offset_read_) {
     min_offset_read_ = static_cast<size_t>(offset);
   }
@@ -797,7 +798,8 @@ bool FilePrefetchBuffer::TryReadFromCache(uint64_t offset, size_t n,
     assert(file_reader_ != nullptr);
     assert(max_readahead_size_ >= readahead_size_);
-    Status s = Prefetch(file_reader_, offset, n + readahead_size_);
+    Status s =
+        Prefetch(file_reader_, offset, n + readahead_size_, for_compaction);
     if (!s.ok()) {
       return false;
     }
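
Taken together, these hunks thread `for_compaction` down the whole read path, so a prefetch-buffer miss during compaction becomes one large read that is charged to the rate limiter at `IO_LOW` priority. The chain, condensed from the hunks in this patch:

```cpp
// Condensed call chain (from this patch), shown as comments:
//
//   BlockFetcher::TryGetFromPrefetchBuffer()
//     -> FilePrefetchBuffer::TryReadFromCache(offset, n, &slice_,
//                                             for_compaction_)
//       -> FilePrefetchBuffer::Prefetch(reader, offset, n + readahead_size_,
//                                       for_compaction)
//         -> RandomAccessFileReader::Read(..., scratch,
//                                         /*for_compaction=*/true)
//           -> rate_limiter_->RequestToken(bytes, alignment,
//                                          Env::IOPriority::IO_LOW, stats_,
//                                          RateLimiter::OpType::kRead)
```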

@@ -158,7 +158,8 @@ class RandomAccessFileReader {
   RandomAccessFileReader(const RandomAccessFileReader&) = delete;
   RandomAccessFileReader& operator=(const RandomAccessFileReader&) = delete;
-  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const;
+  Status Read(uint64_t offset, size_t n, Slice* result, char* scratch,
+              bool for_compaction = false) const;
   Status Prefetch(uint64_t offset, size_t n) const {
     return file_->Prefetch(offset, n);
@@ -343,7 +344,9 @@ class FilePrefetchBuffer {
   // reader : the file reader.
   // offset : the file offset to start reading from.
   // n : the number of bytes to read.
-  Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n);
+  // for_compaction : if prefetch is done for compaction read.
+  Status Prefetch(RandomAccessFileReader* reader, uint64_t offset, size_t n,
+                  bool for_compaction = false);
   // Tries returning the data for a file read from this buffer, if that data is
   // in the buffer.
@@ -354,7 +357,9 @@ class FilePrefetchBuffer {
   // offset : the file offset.
   // n : the number of bytes.
   // result : output buffer to put the data into.
-  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result);
+  // for_compaction : if cache read is done for compaction read.
+  bool TryReadFromCache(uint64_t offset, size_t n, Slice* result,
+                        bool for_compaction = false);
   // The minimum `offset` ever passed to TryReadFromCache(). This will only be
   // tracked if track_min_offset = true.
