From 8f763bdeab5f22f94935bf2d080a544a61e1268e Mon Sep 17 00:00:00 2001 From: Hui Xiao Date: Mon, 8 May 2023 13:14:28 -0700 Subject: [PATCH] Record and use the tail size to prefetch table tail (#11406) Summary: **Context:** We prefetch the tail part of an SST file (i.e., the blocks after the data blocks till the end of the file) during each SST file open, in the hope of prefetching everything at once ahead of time for later reads, e.g., footer, meta index, filter/index etc. The existing approach to estimating the tail size to prefetch is through `TailPrefetchStats` heuristics introduced in https://github.com/facebook/rocksdb/pull/4156, which has caused small reads in unlucky cases (e.g., a small read into the tail buffer during table open in thread 1 under the same BlockBasedTableFactory object can make thread 2's tail prefetching use a small size that it shouldn't) and is hard to debug. Therefore we decided to record the exact tail size and use it directly to prefetch the tail of the SST instead of relying on heuristics. **Summary:** - Obtain and record in the manifest the tail size in `BlockBasedTableBuilder::Finish()` - For backward compatibility, we fall back to TailPrefetchStats and last to a simple heuristic that the tail size is a linear portion of the file size - see the PR conversation for more. - Make `tail_start_offset` part of the table properties and deduce the tail size to record in the manifest for external files (e.g., file ingestion, import CF) and db repair (with no access to the manifest). Pull Request resolved: https://github.com/facebook/rocksdb/pull/11406 Test Plan: 1. New UT 2. db bench Note: db bench on /tmp/ where direct read is supported is too slow to finish, and the default pinning setting in db bench is not helpful for profiling the number of SST reads per Get. Therefore I hacked the following to obtain the following comparison. 
``` diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index bd5669f0f..791484c1f 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -838,7 +838,7 @@ Status BlockBasedTable::PrefetchTail( &tail_prefetch_size); // Try file system prefetch - if (!file->use_direct_io() && !force_direct_prefetch) { + if (false && !file->use_direct_io() && !force_direct_prefetch) { if (!file->Prefetch(prefetch_off, prefetch_len, ro.rate_limiter_priority) .IsNotSupported()) { prefetch_buffer->reset(new FilePrefetchBuffer( diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index ea40f5fa0..39a0ac385 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -4191,6 +4191,8 @@ class Benchmark { std::shared_ptr(NewCuckooTableFactory(table_options)); } else { BlockBasedTableOptions block_based_options; + block_based_options.metadata_cache_options.partition_pinning = + PinningTier::kAll; block_based_options.checksum = static_cast(FLAGS_checksum_type); if (FLAGS_use_hash_search) { ``` Create DB ``` ./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none ``` ReadRandom ``` ./db_bench --bloom_bits=3 --use_existing_db=1 --seed=1682546046158958 --partition_index_and_filters=1 --statistics=1 -db=/dev/shm/testdb/ -benchmarks=readrandom -key_size=3200 -value_size=512 -num=1000000 -write_buffer_size=6550000 -disable_auto_compactions=false -target_file_size_base=6550000 -compression_type=none ``` (a) Existing (Use TailPrefetchStats for tail size + use seperate prefetch buffer in PartitionedFilter/IndexReader::CacheDependencies()) ``` rocksdb.table.open.prefetch.tail.hit COUNT : 3395 rocksdb.sst.read.micros P50 
: 5.655570 P95 : 9.931396 P99 : 14.845454 P100 : 585.000000 COUNT : 999905 SUM : 6590614 ``` (b) This PR (Record tail size + use the same tail buffer in PartitionedFilter/IndexReader::CacheDependencies()) ``` rocksdb.table.open.prefetch.tail.hit COUNT : 14257 rocksdb.sst.read.micros P50 : 5.173347 P95 : 9.015017 P99 : 12.912610 P100 : 228.000000 COUNT : 998547 SUM : 5976540 ``` As we can see, we increase the prefetch tail hit count and decrease SST read count with this PR 3. Test backward compatibility by stepping through reading with post-PR code on a db generated pre-PR. Reviewed By: pdillinger Differential Revision: D45413346 Pulled By: hx235 fbshipit-source-id: 7d5e36a60a72477218f79905168d688452a4c064 --- HISTORY.md | 3 + db/builder.cc | 1 + db/compaction/compaction_job_test.cc | 2 +- db/compaction/compaction_outputs.cc | 1 + db/compaction/compaction_picker_test.cc | 2 +- db/db_impl/db_impl_compaction_flush.cc | 4 +- db/db_impl/db_impl_experimental.cc | 2 +- db/db_impl/db_impl_open.cc | 19 +- db/db_test2.cc | 82 ------ db/experimental.cc | 2 +- db/external_sst_file_ingestion_job.cc | 12 +- db/flush_job.cc | 3 +- db/import_column_family_job.cc | 12 +- db/repair.cc | 13 +- db/table_cache.cc | 14 +- db/version_builder_test.cc | 260 ++++++++++-------- db/version_edit.cc | 14 + db/version_edit.h | 16 +- db/version_edit_test.cc | 18 +- db/version_set.cc | 2 +- db/version_set_test.cc | 10 +- file/file_prefetch_buffer.h | 2 + file/prefetch_test.cc | 87 ++++++ include/rocksdb/table_properties.h | 5 + .../block_based/block_based_table_builder.cc | 10 + table/block_based/block_based_table_builder.h | 4 + .../block_based/block_based_table_factory.cc | 2 +- table/block_based/block_based_table_factory.h | 3 + table/block_based/block_based_table_reader.cc | 61 ++-- table/block_based/block_based_table_reader.h | 10 +- table/block_based/filter_block.h | 4 +- table/block_based/partitioned_filter_block.cc | 36 +-- table/block_based/partitioned_filter_block.h | 3 +- 
table/block_based/partitioned_index_reader.cc | 36 +-- table/block_based/partitioned_index_reader.h | 3 +- table/block_fetcher_test.cc | 7 +- table/meta_blocks.cc | 3 + table/table_builder.h | 10 +- table/table_properties.cc | 2 + 39 files changed, 466 insertions(+), 314 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 9b5e60279..96170b41b 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -10,6 +10,9 @@ ### Bug Fixes * Delete an empty WAL file on DB open if the log number is less than the min log number to keep +### Performance Improvements +* Improved the I/O efficiency of prefetching SST metadata by recording more information in the DB manifest. Opening files written with previous versions will still rely on heuristics for how much to prefetch (#11406). + ## 8.2.0 (04/24/2023) ### Public API Changes * `SstFileWriter::DeleteRange()` now returns `Status::InvalidArgument` if the range's end key comes before its start key according to the user comparator. Previously the behavior was undefined. 
diff --git a/db/builder.cc b/db/builder.cc index eadc315c9..a99bb57e8 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -291,6 +291,7 @@ Status BuildTable( if (s.ok() && !empty) { uint64_t file_size = builder->FileSize(); meta->fd.file_size = file_size; + meta->tail_size = builder->GetTailSize(); meta->marked_for_compaction = builder->NeedCompact(); assert(meta->fd.GetFileSize() > 0); tp = builder diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 3f7966404..a7cf65f01 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -386,7 +386,7 @@ class CompactionJobTestBase : public testing::Test { kUnknownFileCreationTime, versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(), kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, - 0); + 0, 0); mutex_.Lock(); EXPECT_OK(versions_->LogAndApply( diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index cf5105e41..f6e0dbd7d 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -43,6 +43,7 @@ Status CompactionOutputs::Finish(const Status& intput_status, const uint64_t current_bytes = builder_->FileSize(); if (s.ok()) { meta->fd.file_size = current_bytes; + meta->tail_size = builder_->GetTailSize(); meta->marked_for_compaction = builder_->NeedCompact(); } current_output().finished = true; diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index c9cbb09ed..a5a14ac21 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -148,7 +148,7 @@ class CompactionPickerTestBase : public testing::Test { smallest_seq, largest_seq, marked_for_compact, temperature, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, 
kNullUniqueId64x2, 0, 0); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; f->oldest_ancester_time = oldest_ancestor_time; diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index b8ab5eb7e..39a833919 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1758,7 +1758,7 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -3457,7 +3457,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); ROCKS_LOG_BUFFER( log_buffer, diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 5e3b7ba61..8d958ffc1 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -138,7 +138,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index a9b42d62a..5bad1e12b 100644 --- a/db/db_impl/db_impl_open.cc +++ 
b/db/db_impl/db_impl_open.cc @@ -616,7 +616,8 @@ Status DBImpl::Recover( f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, - f->unique_id, f->compensated_range_deletion_size); + f->unique_id, f->compensated_range_deletion_size, + f->tail_size); ROCKS_LOG_WARN(immutable_db_options_.info_log, "[%s] Moving #%" PRIu64 " from from_level-%d to from_level-%d %" PRIu64 @@ -1673,14 +1674,14 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, constexpr int level = 0; if (s.ok() && has_output) { - edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), - meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.fd.smallest_seqno, meta.fd.largest_seqno, - meta.marked_for_compaction, meta.temperature, - meta.oldest_blob_file_number, meta.oldest_ancester_time, - meta.file_creation_time, meta.epoch_number, - meta.file_checksum, meta.file_checksum_func_name, - meta.unique_id, meta.compensated_range_deletion_size); + edit->AddFile( + level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), + meta.smallest, meta.largest, meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature, + meta.oldest_blob_file_number, meta.oldest_ancester_time, + meta.file_creation_time, meta.epoch_number, meta.file_checksum, + meta.file_checksum_func_name, meta.unique_id, + meta.compensated_range_deletion_size, meta.tail_size); for (const auto& blob : blob_file_additions) { edit->AddBlobFile(blob); diff --git a/db/db_test2.cc b/db/db_test2.cc index 544d9b299..43aea2181 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -5298,88 +5298,6 @@ TEST_F(DBTest2, DISABLED_IteratorPinnedMemory) { ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); } -TEST_F(DBTest2, TestBBTTailPrefetch) { - std::atomic called(false); - size_t expected_lower_bound = 512 * 1024; - size_t expected_higher_bound = 512 * 1024; - 
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { - size_t* prefetch_size = static_cast(arg); - EXPECT_LE(expected_lower_bound, *prefetch_size); - EXPECT_GE(expected_higher_bound, *prefetch_size); - called = true; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - expected_lower_bound = 0; - expected_higher_bound = 8 * 1024; - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - // Full compaction to make sure there is no L0 file after the open. - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - ASSERT_TRUE(called.load()); - called = false; - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); - - std::atomic first_call(true); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "BlockBasedTable::Open::TailPrefetchLen", [&](void* arg) { - size_t* prefetch_size = static_cast(arg); - if (first_call) { - EXPECT_EQ(4 * 1024, *prefetch_size); - first_call = false; - } else { - EXPECT_GE(4 * 1024, *prefetch_size); - } - called = true; - }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - - Options options = CurrentOptions(); - options.max_file_opening_threads = 1; // one thread - BlockBasedTableOptions table_options; - table_options.cache_index_and_filter_blocks = true; - options.table_factory.reset(NewBlockBasedTableFactory(table_options)); - options.max_open_files = -1; - Reopen(options); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_OK(Put("1", "1")); - ASSERT_OK(Put("9", "1")); - ASSERT_OK(Flush()); - - ASSERT_TRUE(called.load()); - called = false; - - // Parallel loading SST files - 
options.max_file_opening_threads = 16; - Reopen(options); - - ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); - - ASSERT_TRUE(called.load()); - - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); -} - TEST_F(DBTest2, TestGetColumnFamilyHandleUnlocked) { // Setup sync point dependency to reproduce the race condition of // DBImpl::GetColumnFamilyHandleUnlocked diff --git a/db/experimental.cc b/db/experimental.cc index c2dce7fde..cb5fb3179 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -102,7 +102,7 @@ Status UpdateManifestForFilesState( lf->oldest_blob_file_number, lf->oldest_ancester_time, lf->file_creation_time, lf->epoch_number, lf->file_checksum, lf->file_checksum_func_name, lf->unique_id, - lf->compensated_range_deletion_size); + lf->compensated_range_deletion_size, lf->tail_size); } } } else { diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index ca9b6fb9b..f8716e5f4 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -464,6 +464,16 @@ Status ExternalSstFileIngestionJob::Run() { current_time = oldest_ancester_time = static_cast(temp_current_time); } + uint64_t tail_size = 0; + bool contain_no_data_blocks = f.table_properties.num_entries > 0 && + (f.table_properties.num_entries == + f.table_properties.num_range_deletions); + if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) { + uint64_t file_size = f.fd.GetFileSize(); + assert(f.table_properties.tail_start_offset <= file_size); + tail_size = file_size - f.table_properties.tail_start_offset; + } + FileMetaData f_metadata( f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, @@ -472,7 +482,7 @@ Status ExternalSstFileIngestionJob::Run() { ingestion_options_.ingest_behind ? 
kReservedEpochNumberForFileIngestedBehind : cfd_->NewEpochNumber(), - f.file_checksum, f.file_checksum_func_name, f.unique_id, 0); + f.file_checksum, f.file_checksum_func_name, f.unique_id, 0, tail_size); f_metadata.temperature = f.file_temperature; edit_.AddFile(f.picked_level, f_metadata); } diff --git a/db/flush_job.cc b/db/flush_job.cc index a3ffc2707..8b152ee3e 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -1007,7 +1007,8 @@ Status FlushJob::WriteLevel0Table() { meta_.oldest_blob_file_number, meta_.oldest_ancester_time, meta_.file_creation_time, meta_.epoch_number, meta_.file_checksum, meta_.file_checksum_func_name, - meta_.unique_id, meta_.compensated_range_deletion_size); + meta_.unique_id, meta_.compensated_range_deletion_size, + meta_.tail_size); edit_->SetBlobFileAdditions(std::move(blob_file_additions)); } // Piggyback FlushJobInfo on the first first flushed memtable. diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index 9a8b48dd0..f76d0ac1b 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -138,6 +138,16 @@ Status ImportColumnFamilyJob::Run() { const auto& f = files_to_import_[i]; const auto& file_metadata = metadata_[i]; + uint64_t tail_size = 0; + bool contain_no_data_blocks = f.table_properties.num_entries > 0 && + (f.table_properties.num_entries == + f.table_properties.num_range_deletions); + if (f.table_properties.tail_start_offset > 0 || contain_no_data_blocks) { + uint64_t file_size = f.fd.GetFileSize(); + assert(f.table_properties.tail_start_offset <= file_size); + tail_size = file_size - f.table_properties.tail_start_offset; + } + VersionEdit dummy_version_edit; dummy_version_edit.AddFile( file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), @@ -145,7 +155,7 @@ Status ImportColumnFamilyJob::Run() { file_metadata.smallest_seqno, file_metadata.largest_seqno, false, file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time, current_time, 
file_metadata.epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, f.unique_id, 0); + kUnknownFileChecksumFuncName, f.unique_id, 0, tail_size); s = dummy_version_builder.Apply(&dummy_version_edit); } if (s.ok()) { diff --git a/db/repair.cc b/db/repair.cc index 633c348a5..67513cacc 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -551,6 +551,17 @@ class Repairer { } t->meta.oldest_ancester_time = props->creation_time; } + if (status.ok()) { + uint64_t tail_size = 0; + bool contain_no_data_blocks = + props->num_entries > 0 && + (props->num_entries == props->num_range_deletions); + if (props->tail_start_offset > 0 || contain_no_data_blocks) { + assert(props->tail_start_offset <= file_size); + tail_size = file_size - props->tail_start_offset; + } + t->meta.tail_size = tail_size; + } ColumnFamilyData* cfd = nullptr; if (status.ok()) { cfd = vset_.GetColumnFamilySet()->GetColumnFamily(t->column_family_id); @@ -682,7 +693,7 @@ class Repairer { table->meta.oldest_ancester_time, table->meta.file_creation_time, table->meta.epoch_number, table->meta.file_checksum, table->meta.file_checksum_func_name, table->meta.unique_id, - table->meta.compensated_range_deletion_size); + table->meta.compensated_range_deletion_size, table->meta.tail_size); } s = dummy_version_builder.Apply(&dummy_edit); if (s.ok()) { diff --git a/db/table_cache.cc b/db/table_cache.cc index c288ec8c7..73fa07c6d 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -140,13 +140,13 @@ Status TableCache::GetTableReader( } s = ioptions_.table_factory->NewTableReader( ro, - TableReaderOptions(ioptions_, prefix_extractor, file_options, - internal_comparator, block_protection_bytes_per_key, - skip_filters, immortal_tables_, - false /* force_direct_prefetch */, level, - block_cache_tracer_, max_file_size_for_l0_meta_pin, - db_session_id_, file_meta.fd.GetNumber(), - expected_unique_id, file_meta.fd.largest_seqno), + TableReaderOptions( + ioptions_, prefix_extractor, file_options, internal_comparator, 
+ block_protection_bytes_per_key, skip_filters, immortal_tables_, + false /* force_direct_prefetch */, level, block_cache_tracer_, + max_file_size_for_l0_meta_pin, db_session_id_, + file_meta.fd.GetNumber(), expected_unique_id, + file_meta.fd.largest_seqno, file_meta.tail_size), std::move(file_reader), file_meta.fd.GetFileSize(), table_reader, prefetch_index_and_filter_in_cache); TEST_SYNC_POINT("TableCache::GetTableReader:0"); diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index 611dee774..af41f00f6 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -73,7 +73,7 @@ class VersionBuilderTest : public testing::Test { /* marked_for_compact */ false, Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); f->compensated_file_size = file_size; f->num_entries = num_entries; f->num_deletions = num_deletions; @@ -136,7 +136,7 @@ class VersionBuilderTest : public testing::Test { Temperature::kUnknown, blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); } void UpdateVersionStorageInfo(VersionStorageInfo* vstorage) { @@ -183,11 +183,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, 
kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit.DeleteFile(3, 27U); EnvOptions env_options; @@ -230,11 +231,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(3, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -281,11 +283,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(4, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); version_edit.DeleteFile(4, 6U); @@ -317,31 +320,36 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { 
UpdateVersionStorageInfo(); VersionEdit version_edit; - version_edit.AddFile( - 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), + GetInternalKey("450"), 
200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), + GetInternalKey("650"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), + GetInternalKey("550"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), + GetInternalKey("750"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); EnvOptions env_options; constexpr TableCache* table_cache = nullptr; @@ -376,46 +384,53 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { kCompactionStyleLevel, nullptr, false); VersionEdit version_edit; - version_edit.AddFile( - 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - 
version_edit.AddFile( - 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); - version_edit.AddFile( - 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 666, 0, 100U, GetInternalKey("301"), + GetInternalKey("350"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 676, 0, 100U, GetInternalKey("401"), + GetInternalKey("450"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 636, 0, 100U, GetInternalKey("601"), + GetInternalKey("650"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 616, 0, 100U, GetInternalKey("501"), + GetInternalKey("550"), 200, 200, false, + Temperature::kUnknown, 
kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); + version_edit.AddFile(2, 606, 0, 100U, GetInternalKey("701"), + GetInternalKey("750"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); ASSERT_OK(version_builder.Apply(&version_edit)); VersionEdit version_edit2; - version_edit.AddFile( - 2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 808, 0, 100U, GetInternalKey("901"), + GetInternalKey("950"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); - version_edit.AddFile( - 2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200, - false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + version_edit.AddFile(2, 806, 0, 100U, GetInternalKey("801"), + GetInternalKey("850"), 200, 200, false, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); ASSERT_OK(version_builder.Apply(&version_edit2)); ASSERT_OK(version_builder.SaveTo(&new_vstorage)); @@ -520,13 
+535,14 @@ TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { constexpr bool marked_for_compaction = false; - addition.AddFile( - level, file_number, path_id, file_size, - GetInternalKey(smallest, smallest_seq), - GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest, smallest_seq), + GetInternalKey(largest, largest_seq), smallest_seqno, + largest_seqno, marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0, 0); ASSERT_OK(builder.Apply(&addition)); @@ -570,12 +586,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { constexpr SequenceNumber largest_seqno = 1000; constexpr bool marked_for_compaction = false; - edit.AddFile( - new_level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); const Status s = builder.Apply(&edit); ASSERT_TRUE(s.IsCorruption()); @@ -606,12 +623,13 @@ 
TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { constexpr SequenceNumber largest_seqno = 1000; constexpr bool marked_for_compaction = false; - edit.AddFile( - level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, + kInvalidBlobFileNumber, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2, 0, 0); ASSERT_OK(builder.Apply(&edit)); @@ -619,12 +637,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { constexpr int new_level = 2; - other_edit.AddFile( - new_level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + other_edit.AddFile(new_level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); const Status s = builder.Apply(&other_edit); ASSERT_TRUE(s.IsCorruption()); @@ -655,12 +674,13 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { VersionEdit addition; - addition.AddFile( 
- level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + addition.AddFile(level, file_number, path_id, file_size, + GetInternalKey(smallest), GetInternalKey(largest), + smallest_seqno, largest_seqno, marked_for_compaction, + Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); ASSERT_OK(builder.Apply(&addition)); @@ -1233,7 +1253,7 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /*epoch_number*/, - checksum_value, checksum_method, kNullUniqueId64x2, 0); + checksum_value, checksum_method, kNullUniqueId64x2, 0, 0); edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, checksum_value); @@ -1321,7 +1341,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("801"), @@ -1331,7 +1351,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); 
edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000, /* total_blob_bytes */ 200000, /* checksum_method */ std::string(), @@ -1552,7 +1572,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { Temperature::kUnknown, /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); // Add an SST that does not reference any blob files. edit.AddFile( @@ -1562,7 +1582,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 2200, /* marked_for_compaction */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); // Delete a file that references a blob file. edit.DeleteFile(/* level */ 1, /* file_number */ 6); @@ -1585,7 +1605,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); // Trivially move a file that does not reference any blob files. edit.DeleteFile(/* level */ 1, /* file_number */ 13); @@ -1597,7 +1617,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); // Add one more SST file that references a blob file, then promptly // delete it in a second version edit before the new version gets saved. 
@@ -1611,7 +1631,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); VersionEdit edit2; @@ -1712,7 +1732,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); version_edit_1.AddFile( /* level */ 0, /* file_number */ 2U, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), @@ -1722,7 +1742,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); VersionBuilder version_builder_1(EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_, @@ -1749,7 +1769,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); version_edit_2.AddFile( /* level */ 0, /* file_number */ 2U, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), @@ -1759,7 +1779,7 @@ TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { /* oldest_blob_file_number */ kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, 0); + 
kNullUniqueId64x2, 0, 0); VersionBuilder version_builder_2(EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_, diff --git a/db/version_edit.cc b/db/version_edit.cc index ecddaa49c..4f1ae80d2 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -238,6 +238,12 @@ bool VersionEdit::EncodeTo(std::string* dst) const { f.compensated_range_deletion_size); PutLengthPrefixedSlice(dst, Slice(compensated_range_deletion_size)); } + if (f.tail_size) { + PutVarint32(dst, NewFileCustomTag::kTailSize); + std::string varint_tail_size; + PutVarint64(&varint_tail_size, f.tail_size); + PutLengthPrefixedSlice(dst, Slice(varint_tail_size)); + } TEST_SYNC_POINT_CALLBACK("VersionEdit::EncodeTo:NewFile4:CustomizeFields", dst); @@ -416,6 +422,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "Invalid compensated range deletion size"; } break; + case kTailSize: + if (!GetVarint64(&field, &f.tail_size)) { + return "invalid tail start offset"; + } + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -851,6 +862,8 @@ std::string VersionEdit::DebugString(bool hex_key) const { InternalUniqueIdToExternal(&id); r.append(UniqueIdToHumanString(EncodeUniqueIdBytes(&id))); } + r.append(" tail size:"); + AppendNumberTo(&r, f.tail_size); } for (const auto& blob_file_addition : blob_file_additions_) { @@ -966,6 +979,7 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { // permanent jw << "Temperature" << static_cast(f.temperature); } + jw << "TailSize" << f.tail_size; jw.EndArrayedObject(); } diff --git a/db/version_edit.h b/db/version_edit.h index 65c7fc43a..07e8f3774 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -90,6 +90,7 @@ enum NewFileCustomTag : uint32_t { kUniqueId = 12, kEpochNumber = 13, kCompensatedRangeDeletionSize = 14, + kTailSize = 15, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field. 
@@ -238,6 +239,10 @@ struct FileMetaData { // SST unique id UniqueId64x2 unique_id{}; + // Size of the "tail" part of a SST file + // "Tail" refers to all blocks after data blocks till the end of the SST file + uint64_t tail_size = 0; + FileMetaData() = default; FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size, @@ -249,7 +254,8 @@ struct FileMetaData { uint64_t _epoch_number, const std::string& _file_checksum, const std::string& _file_checksum_func_name, UniqueId64x2 _unique_id, - const uint64_t _compensated_range_deletion_size) + const uint64_t _compensated_range_deletion_size, + uint64_t _tail_size) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), smallest(smallest_key), largest(largest_key), @@ -262,7 +268,8 @@ struct FileMetaData { epoch_number(_epoch_number), file_checksum(_file_checksum), file_checksum_func_name(_file_checksum_func_name), - unique_id(std::move(_unique_id)) { + unique_id(std::move(_unique_id)), + tail_size(_tail_size) { TEST_SYNC_POINT_CALLBACK("FileMetaData::FileMetaData", this); } @@ -446,7 +453,8 @@ class VersionEdit { uint64_t epoch_number, const std::string& file_checksum, const std::string& file_checksum_func_name, const UniqueId64x2& unique_id, - const uint64_t compensated_range_deletion_size) { + const uint64_t compensated_range_deletion_size, + uint64_t tail_size) { assert(smallest_seqno <= largest_seqno); new_files_.emplace_back( level, @@ -455,7 +463,7 @@ class VersionEdit { temperature, oldest_blob_file_number, oldest_ancester_time, file_creation_time, epoch_number, file_checksum, file_checksum_func_name, unique_id, - compensated_range_deletion_size)); + compensated_range_deletion_size, tail_size)); if (!HasLastSequence() || largest_seqno > GetLastSequence()) { SetLastSequence(largest_seqno); } diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 1fa6c0054..da1a85999 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -45,7 +45,7 @@ TEST_F(VersionEditTest, 
EncodeDecode) { kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown, kInvalidBlobFileNumber, 888, 678, kBig + 300 + i /* epoch_number */, "234", "crc32c", - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.DeleteFile(4, kBig + 700 + i); } @@ -65,24 +65,24 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 301 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber, 666, 888, 302 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, kBig + 603, true, Temperature::kUnknown, 1001, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 303 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.DeleteFile(4, 700); @@ -123,12 +123,12 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 
/* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, 686, 868, 301 /* epoch_number */, "234", "crc32c", - kNullUniqueId64x2, 0); + kNullUniqueId64x2, 0, 0); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -177,7 +177,7 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 300 /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -208,7 +208,7 @@ TEST_F(VersionEditTest, EncodeEmptyFile) { Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, 1 /*epoch_number*/, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } diff --git a/db/version_set.cc b/db/version_set.cc index cfe8d6173..436389b81 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -6353,7 +6353,7 @@ Status VersionSet::WriteCurrentStateToManifest( f->oldest_blob_file_number, f->oldest_ancester_time, f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id, - f->compensated_range_deletion_size); + f->compensated_range_deletion_size, f->tail_size); } } diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 481dd46d9..c0f6d1340 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -51,7 +51,7 @@ class GenerateLevelFilesBriefTest : public testing::Test { largest_seq, 
/* marked_for_compact */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); files_.push_back(f); } @@ -163,7 +163,7 @@ class VersionStorageInfoTestBase : public testing::Test { Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2, compensated_range_deletion_size); + kNullUniqueId64x2, compensated_range_deletion_size, 0); vstorage_.AddFile(level, f); } @@ -3296,7 +3296,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2, - 0); + 0, 0); } } @@ -3353,7 +3353,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, file_num /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3414,7 +3414,7 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, file_num /* epoch_number */, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0); + kUnknownFileChecksumFuncName, kNullUniqueId64x2, 0, 0); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( diff --git a/file/file_prefetch_buffer.h b/file/file_prefetch_buffer.h index 304d1c3bb..77286ddcf 100644 --- a/file/file_prefetch_buffer.h +++ 
b/file/file_prefetch_buffer.h @@ -180,6 +180,8 @@ class FilePrefetchBuffer { RecordInHistogram(stats_, PREFETCHED_BYTES_DISCARDED, bytes_discarded); } + bool Enabled() const { return enable_; } + // Load data into the buffer from a file. // reader : the file reader. // offset : the file offset to start reading from. diff --git a/file/prefetch_test.cc b/file/prefetch_test.cc index fb3114449..053caacd0 100644 --- a/file/prefetch_test.cc +++ b/file/prefetch_test.cc @@ -236,6 +236,93 @@ TEST_P(PrefetchTest, Basic) { Close(); } +TEST_P(PrefetchTest, BlockBasedTableTailPrefetch) { + const bool support_prefetch = + std::get<0>(GetParam()) && + test::IsPrefetchSupported(env_->GetFileSystem(), dbname_); + // Second param is if directIO is enabled or not + const bool use_direct_io = std::get<1>(GetParam()); + const bool use_file_prefetch_buffer = !support_prefetch || use_direct_io; + + std::shared_ptr fs = + std::make_shared(env_->GetFileSystem(), support_prefetch); + std::unique_ptr env(new CompositeEnvWrapper(env_, fs)); + + Options options; + SetGenericOptions(env.get(), use_direct_io, options); + options.statistics = CreateDBStatistics(); + + BlockBasedTableOptions bbto; + bbto.index_type = BlockBasedTableOptions::kTwoLevelIndexSearch; + bbto.partition_filters = true; + bbto.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(bbto)); + + Status s = TryReopen(options); + if (use_direct_io && (s.IsNotSupported() || s.IsInvalidArgument())) { + // If direct IO is not supported, skip the test + ROCKSDB_GTEST_BYPASS("Direct IO is not supported"); + return; + } else { + ASSERT_OK(s); + } + + ASSERT_OK(Put("k1", "v1")); + + HistogramData pre_flush_file_read; + options.statistics->histogramData(FILE_READ_FLUSH_MICROS, + &pre_flush_file_read); + ASSERT_OK(Flush()); + HistogramData post_flush_file_read; + options.statistics->histogramData(FILE_READ_FLUSH_MICROS, + &post_flush_file_read); + if (use_file_prefetch_buffer) { + 
// `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // should read from the prefetched tail in file prefetch buffer instead of + // initiating extra SST reads. Therefore `BlockBasedTable::PrefetchTail()` + // should be the only SST read in table verification during flush. + ASSERT_EQ(post_flush_file_read.count - pre_flush_file_read.count, 1); + } else { + // Without the prefetched tail in file prefetch buffer, + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // will initiate extra SST reads + ASSERT_GT(post_flush_file_read.count - pre_flush_file_read.count, 1); + } + ASSERT_OK(Put("k1", "v2")); + ASSERT_OK(Put("k2", "v2")); + ASSERT_OK(Flush()); + + CompactRangeOptions cro; + HistogramData pre_compaction_file_read; + options.statistics->histogramData(FILE_READ_COMPACTION_MICROS, + &pre_compaction_file_read); + ASSERT_OK(db_->CompactRange(cro, nullptr, nullptr)); + HistogramData post_compaction_file_read; + options.statistics->histogramData(FILE_READ_COMPACTION_MICROS, + &post_compaction_file_read); + if (use_file_prefetch_buffer) { + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // should read from the prefetched tail in file prefetch buffer instead of + // initiating extra SST reads. + // + // Therefore the 3 reads are + // (1) `ProcessKeyValueCompaction()` of input file 1 + // (2) `ProcessKeyValueCompaction()` of input file 2 + // (3) `BlockBasedTable::PrefetchTail()` of output file during table + // verification in compaction + ASSERT_EQ(post_compaction_file_read.count - pre_compaction_file_read.count, + 3); + } else { + // Without the prefetched tail in file prefetch buffer, + // `PartitionedFilterBlockReader/PartitionIndexReader::CacheDependencies()` + // as well as reading other parts of the tail (e.g, footer, table + // properties..) 
will initiate extra SST reads + ASSERT_GT(post_compaction_file_read.count - pre_compaction_file_read.count, + 3); + } + Close(); +} + // This test verifies BlockBasedTableOptions.max_auto_readahead_size is // configured dynamically. TEST_P(PrefetchTest, ConfigureAutoMaxReadaheadSize) { diff --git a/include/rocksdb/table_properties.h b/include/rocksdb/table_properties.h index cbe87fa3a..ab259f930 100644 --- a/include/rocksdb/table_properties.h +++ b/include/rocksdb/table_properties.h @@ -70,6 +70,7 @@ struct TablePropertiesNames { static const std::string kSlowCompressionEstimatedDataSize; static const std::string kFastCompressionEstimatedDataSize; static const std::string kSequenceNumberTimeMapping; + static const std::string kTailStartOffset; }; // `TablePropertiesCollector` provides the mechanism for users to collect @@ -239,6 +240,10 @@ struct TableProperties { // 0 means not exists. uint64_t external_sst_file_global_seqno_offset = 0; + // Offset where the "tail" part of SST file starts + // "Tail" refers to all blocks after data blocks till the end of the SST file + uint64_t tail_start_offset = 0; + // DB identity // db_id is an identifier generated the first time the DB is created // If DB identity is unset or unassigned, `db_id` will be an empty string. diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 5121d1b43..3d7a08e77 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -341,6 +341,10 @@ struct BlockBasedTableBuilder::Rep { std::unique_ptr pc_rep; BlockCreateContext create_context; + // The size of the "tail" part of a SST file. "Tail" refers to + // all blocks after data blocks till the end of the SST file. 
+ uint64_t tail_size; + uint64_t get_offset() { return offset.load(std::memory_order_relaxed); } void set_offset(uint64_t o) { offset.store(o, std::memory_order_relaxed); } @@ -456,6 +460,7 @@ struct BlockBasedTableBuilder::Rep { !use_delta_encoding_for_index_values, table_opt.index_type == BlockBasedTableOptions::kBinarySearchWithFirstKey), + tail_size(0), status_ok(true), io_status_ok(true) { if (tbo.target_file_size == 0) { @@ -1908,6 +1913,8 @@ Status BlockBasedTableBuilder::Finish() { } } + r->props.tail_start_offset = r->offset; + // Write meta blocks, metaindex block and footer in the following order. // 1. [meta block: filter] // 2. [meta block: index] @@ -1935,6 +1942,7 @@ Status BlockBasedTableBuilder::Finish() { r->SetStatus(r->CopyIOStatus()); Status ret_status = r->CopyStatus(); assert(!ret_status.ok() || io_status().ok()); + r->tail_size = r->offset - r->props.tail_start_offset; return ret_status; } @@ -1968,6 +1976,8 @@ uint64_t BlockBasedTableBuilder::EstimatedFileSize() const { } } +uint64_t BlockBasedTableBuilder::GetTailSize() const { return rep_->tail_size; } + bool BlockBasedTableBuilder::NeedCompact() const { for (const auto& collector : rep_->table_properties_collectors) { if (collector->NeedCompact()) { diff --git a/table/block_based/block_based_table_builder.h b/table/block_based/block_based_table_builder.h index 32ca30d1b..3949474c5 100644 --- a/table/block_based/block_based_table_builder.h +++ b/table/block_based/block_based_table_builder.h @@ -91,6 +91,10 @@ class BlockBasedTableBuilder : public TableBuilder { // is enabled. uint64_t EstimatedFileSize() const override; + // Get the size of the "tail" part of a SST file. "Tail" refers to + // all blocks after data blocks till the end of the SST file. 
+ uint64_t GetTailSize() const override; + bool NeedCompact() const override; // Get table properties diff --git a/table/block_based/block_based_table_factory.cc b/table/block_based/block_based_table_factory.cc index 2bca07033..653c222d5 100644 --- a/table/block_based/block_based_table_factory.cc +++ b/table/block_based/block_based_table_factory.cc @@ -568,7 +568,7 @@ Status BlockBasedTableFactory::NewTableReader( ro, table_reader_options.ioptions, table_reader_options.env_options, table_options_, table_reader_options.internal_comparator, std::move(file), file_size, table_reader_options.block_protection_bytes_per_key, - table_reader, table_reader_cache_res_mgr_, + table_reader, table_reader_options.tail_size, table_reader_cache_res_mgr_, table_reader_options.prefix_extractor, prefetch_index_and_filter_in_cache, table_reader_options.skip_filters, table_reader_options.level, table_reader_options.immortal, table_reader_options.largest_seqno, diff --git a/table/block_based/block_based_table_factory.h b/table/block_based/block_based_table_factory.h index 1bf870ba6..1f7876977 100644 --- a/table/block_based/block_based_table_factory.h +++ b/table/block_based/block_based_table_factory.h @@ -28,6 +28,9 @@ class BlockBasedTableBuilder; class RandomAccessFileReader; class WritableFileWriter; +// TODO: deprecate this class as it can be replaced with +// `FileMetaData::tail_size` +// // A class used to track actual bytes written from the tail in the recent SST // file opens, and provide a suggestion for following open. 
class TailPrefetchStats { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index b51441661..a5e8e7014 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -561,7 +561,7 @@ Status BlockBasedTable::Open( const InternalKeyComparator& internal_comparator, std::unique_ptr&& file, uint64_t file_size, uint8_t block_protection_bytes_per_key, - std::unique_ptr* table_reader, + std::unique_ptr* table_reader, uint64_t tail_size, std::shared_ptr table_reader_cache_res_mgr, const std::shared_ptr& prefix_extractor, const bool prefetch_index_and_filter_in_cache, const bool skip_filters, @@ -593,7 +593,8 @@ Status BlockBasedTable::Open( if (!ioptions.allow_mmap_reads) { s = PrefetchTail(ro, file.get(), file_size, force_direct_prefetch, tail_prefetch_stats, prefetch_all, preload_all, - &prefetch_buffer, ioptions.stats); + &prefetch_buffer, ioptions.stats, tail_size, + ioptions.logger); // Return error in prefetch path to users. if (!s.ok()) { return s; @@ -807,21 +808,40 @@ Status BlockBasedTable::PrefetchTail( const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, - std::unique_ptr* prefetch_buffer, Statistics* stats) { + std::unique_ptr* prefetch_buffer, Statistics* stats, + uint64_t tail_size, Logger* const logger) { + assert(tail_size <= file_size); + size_t tail_prefetch_size = 0; - if (tail_prefetch_stats != nullptr) { - // Multiple threads may get a 0 (no history) when running in parallel, - // but it will get cleared after the first of them finishes. - tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); - } - if (tail_prefetch_size == 0) { - // Before read footer, readahead backwards to prefetch data. Do more - // readahead if we're going to read index/filter. 
- // TODO: This may incorrectly select small readahead in case partitioned - // index/filter is enabled and top-level partition pinning is enabled. - // That's because we need to issue readahead before we read the properties, - // at which point we don't yet know the index type. - tail_prefetch_size = prefetch_all || preload_all ? 512 * 1024 : 4 * 1024; + if (tail_size != 0) { + tail_prefetch_size = tail_size; + } else { + if (tail_prefetch_stats != nullptr) { + // Multiple threads may get a 0 (no history) when running in parallel, + // but it will get cleared after the first of them finishes. + tail_prefetch_size = tail_prefetch_stats->GetSuggestedPrefetchSize(); + } + if (tail_prefetch_size == 0) { + // Before read footer, readahead backwards to prefetch data. Do more + // readahead if we're going to read index/filter. + // TODO: This may incorrectly select small readahead in case partitioned + // index/filter is enabled and top-level partition pinning is enabled. + // That's because we need to issue readahead before we read the + // properties, at which point we don't yet know the index type. + tail_prefetch_size = + prefetch_all || preload_all + ? 
static_cast(4 * 1024 + 0.01 * file_size) + : 4 * 1024; + + ROCKS_LOG_WARN(logger, + "Tail prefetch size %zu is calculated based on heuristics", + tail_prefetch_size); + } else { + ROCKS_LOG_WARN( + logger, + "Tail prefetch size %zu is calculated based on TailPrefetchStats", + tail_prefetch_size); + } } size_t prefetch_off; size_t prefetch_len; @@ -1140,7 +1160,8 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( // are hence follow the configuration for pin and prefetch regardless of // the value of cache_index_and_filter_blocks if (prefetch_all || pin_partition) { - s = rep_->index_reader->CacheDependencies(ro, pin_partition); + s = rep_->index_reader->CacheDependencies(ro, pin_partition, + prefetch_buffer); } if (!s.ok()) { return s; @@ -1164,7 +1185,7 @@ Status BlockBasedTable::PrefetchIndexAndFilterBlocks( if (filter) { // Refer to the comment above about paritioned indexes always being cached if (prefetch_all || pin_partition) { - s = filter->CacheDependencies(ro, pin_partition); + s = filter->CacheDependencies(ro, pin_partition, prefetch_buffer); if (!s.ok()) { return s; } @@ -1984,8 +2005,8 @@ Status BlockBasedTable::ApproximateKeyAnchors(const ReadOptions& read_options, // `CacheDependencies()` brings all the blocks into cache using one I/O. That // way the full index scan usually finds the index data it is looking for in // cache rather than doing an I/O for each "dependency" (partition). 
- Status s = - rep_->index_reader->CacheDependencies(read_options, false /* pin */); + Status s = rep_->index_reader->CacheDependencies( + read_options, false /* pin */, nullptr /* prefetch_buffer */); if (!s.ok()) { return s; } diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index dafaa4ebf..0f14e05dd 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -99,7 +99,7 @@ class BlockBasedTable : public TableReader { const InternalKeyComparator& internal_key_comparator, std::unique_ptr&& file, uint64_t file_size, uint8_t block_protection_bytes_per_key, - std::unique_ptr* table_reader, + std::unique_ptr* table_reader, uint64_t tail_size, std::shared_ptr table_reader_cache_res_mgr = nullptr, const std::shared_ptr& prefix_extractor = nullptr, @@ -225,8 +225,9 @@ class BlockBasedTable : public TableReader { virtual size_t ApproximateMemoryUsage() const = 0; // Cache the dependencies of the index reader (e.g. the partitions // of a partitioned index). 
- virtual Status CacheDependencies(const ReadOptions& /*ro*/, - bool /* pin */) { + virtual Status CacheDependencies( + const ReadOptions& /*ro*/, bool /* pin */, + FilePrefetchBuffer* /* tail_prefetch_buffer */) { return Status::OK(); } }; @@ -458,7 +459,8 @@ class BlockBasedTable : public TableReader { const ReadOptions& ro, RandomAccessFileReader* file, uint64_t file_size, bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, const bool prefetch_all, const bool preload_all, - std::unique_ptr* prefetch_buffer, Statistics* stats); + std::unique_ptr* prefetch_buffer, Statistics* stats, + uint64_t tail_size, Logger* const logger); Status ReadMetaIndexBlock(const ReadOptions& ro, FilePrefetchBuffer* prefetch_buffer, std::unique_ptr* metaindex_block, diff --git a/table/block_based/filter_block.h b/table/block_based/filter_block.h index 957413d10..b14858c02 100644 --- a/table/block_based/filter_block.h +++ b/table/block_based/filter_block.h @@ -163,7 +163,9 @@ class FilterBlockReader { return error_msg; } - virtual Status CacheDependencies(const ReadOptions& /*ro*/, bool /*pin*/) { + virtual Status CacheDependencies( + const ReadOptions& /*ro*/, bool /*pin*/, + FilePrefetchBuffer* /* tail_prefetch_buffer */) { return Status::OK(); } diff --git a/table/block_based/partitioned_filter_block.cc b/table/block_based/partitioned_filter_block.cc index 569b58fef..9c2b61e87 100644 --- a/table/block_based/partitioned_filter_block.cc +++ b/table/block_based/partitioned_filter_block.cc @@ -439,8 +439,8 @@ size_t PartitionedFilterBlockReader::ApproximateMemoryUsage() const { } // TODO(myabandeh): merge this with the same function in IndexReader -Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, - bool pin) { +Status PartitionedFilterBlockReader::CacheDependencies( + const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) { assert(table()); const BlockBasedTable::Rep* const rep = table()->get_rep(); @@ -484,21 +484,22 @@ 
Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, handle.offset() + handle.size() + BlockBasedTable::kBlockTrailerSize; uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - rep->CreateFilePrefetchBuffer( - 0, 0, &prefetch_buffer, false /* Implicit autoreadahead */, - 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); - - IOOptions opts; - s = rep->file->PrepareIOOptions(ro, opts); - if (s.ok()) { - s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, - static_cast(prefetch_len), - ro.rate_limiter_priority); - } - if (!s.ok()) { - return s; + if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled()) { + rep->CreateFilePrefetchBuffer( + 0, 0, &prefetch_buffer, false /* Implicit autoreadahead */, + 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); + + IOOptions opts; + s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len), + ro.rate_limiter_priority); + } + if (!s.ok()) { + return s; + } } - // After prefetch, read the partitions one by one for (biter.SeekToFirst(); biter.Valid(); biter.Next()) { handle = biter.value().handle; @@ -507,7 +508,8 @@ Status PartitionedFilterBlockReader::CacheDependencies(const ReadOptions& ro, // TODO: Support counter batch update for partitioned index and // filter blocks s = table()->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + prefetch_buffer ? 
prefetch_buffer.get() : tail_prefetch_buffer, ro, + handle, UncompressionDict::GetEmptyDict(), /* for_compaction */ false, &block, nullptr /* get_context */, &lookup_context, nullptr /* contents */, false); if (!s.ok()) { diff --git a/table/block_based/partitioned_filter_block.h b/table/block_based/partitioned_filter_block.h index 9f7a7be7b..0e3b4a5b3 100644 --- a/table/block_based/partitioned_filter_block.h +++ b/table/block_based/partitioned_filter_block.h @@ -166,7 +166,8 @@ class PartitionedFilterBlockReader BlockCacheLookupContext* lookup_context, const ReadOptions& read_options, FilterManyFunction filter_function) const; - Status CacheDependencies(const ReadOptions& ro, bool pin) override; + Status CacheDependencies(const ReadOptions& ro, bool pin, + FilePrefetchBuffer* tail_prefetch_buffer) override; const InternalKeyComparator* internal_comparator() const; bool index_key_includes_seq() const; diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index d1cc88834..f322cc910 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -112,8 +112,8 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( // the first level iter is always on heap and will attempt to delete it // in its destructor. } -Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, - bool pin) { +Status PartitionIndexReader::CacheDependencies( + const ReadOptions& ro, bool pin, FilePrefetchBuffer* tail_prefetch_buffer) { if (!partition_map_.empty()) { // The dependencies are already cached since `partition_map_` is filled in // an all-or-nothing manner. 
@@ -162,22 +162,23 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, handle.offset() + BlockBasedTable::BlockSizeWithTrailer(handle); uint64_t prefetch_len = last_off - prefetch_off; std::unique_ptr prefetch_buffer; - rep->CreateFilePrefetchBuffer( - 0, 0, &prefetch_buffer, false /*Implicit auto readahead*/, - 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); - IOOptions opts; - { - Status s = rep->file->PrepareIOOptions(ro, opts); - if (s.ok()) { - s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, - static_cast(prefetch_len), - ro.rate_limiter_priority); - } - if (!s.ok()) { - return s; + if (tail_prefetch_buffer == nullptr || !tail_prefetch_buffer->Enabled()) { + rep->CreateFilePrefetchBuffer( + 0, 0, &prefetch_buffer, false /*Implicit auto readahead*/, + 0 /*num_reads_*/, 0 /*num_file_reads_for_auto_readahead*/); + IOOptions opts; + { + Status s = rep->file->PrepareIOOptions(ro, opts); + if (s.ok()) { + s = prefetch_buffer->Prefetch(opts, rep->file.get(), prefetch_off, + static_cast(prefetch_len), + ro.rate_limiter_priority); + } + if (!s.ok()) { + return s; + } } } - // For saving "all or nothing" to partition_map_ UnorderedMap> map_in_progress; @@ -191,7 +192,8 @@ Status PartitionIndexReader::CacheDependencies(const ReadOptions& ro, // TODO: Support counter batch update for partitioned index and // filter blocks Status s = table()->MaybeReadBlockAndLoadToCache( - prefetch_buffer.get(), ro, handle, UncompressionDict::GetEmptyDict(), + prefetch_buffer ? 
prefetch_buffer.get() : tail_prefetch_buffer, ro, + handle, UncompressionDict::GetEmptyDict(), /*for_compaction=*/false, &block.As(), /*get_context=*/nullptr, &lookup_context, /*contents=*/nullptr, /*async_read=*/false); diff --git a/table/block_based/partitioned_index_reader.h b/table/block_based/partitioned_index_reader.h index 58a7877ab..9482fd6b4 100644 --- a/table/block_based/partitioned_index_reader.h +++ b/table/block_based/partitioned_index_reader.h @@ -30,7 +30,8 @@ class PartitionIndexReader : public BlockBasedTable::IndexReaderCommon { IndexBlockIter* iter, GetContext* get_context, BlockCacheLookupContext* lookup_context) override; - Status CacheDependencies(const ReadOptions& ro, bool pin) override; + Status CacheDependencies(const ReadOptions& ro, bool pin, + FilePrefetchBuffer* tail_prefetch_buffer) override; size_t ApproximateMemoryUsage() const override { size_t usage = ApproximateIndexBlockMemoryUsage(); #ifdef ROCKSDB_MALLOC_USABLE_SIZE diff --git a/table/block_fetcher_test.cc b/table/block_fetcher_test.cc index 9696a509d..c2f6552cc 100644 --- a/table/block_fetcher_test.cc +++ b/table/block_fetcher_test.cc @@ -266,9 +266,10 @@ class BlockFetcherTest : public testing::Test { const auto* table_options = table_factory_.GetOptions(); ASSERT_NE(table_options, nullptr); - ASSERT_OK(BlockBasedTable::Open( - ro, ioptions, EnvOptions(), *table_options, comparator, std::move(file), - file_size, 0 /* block_protection_bytes_per_key */, &table_reader)); + ASSERT_OK(BlockBasedTable::Open(ro, ioptions, EnvOptions(), *table_options, + comparator, std::move(file), file_size, + 0 /* block_protection_bytes_per_key */, + &table_reader, 0 /* tail_size */)); table->reset(reinterpret_cast(table_reader.release())); } diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 2c58ff9c7..6fea536d6 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -115,6 +115,7 @@ void PropertyBlockBuilder::AddTableProperty(const TableProperties& props) { 
Add(TablePropertiesNames::kFastCompressionEstimatedDataSize, props.fast_compression_estimated_data_size); } + Add(TablePropertiesNames::kTailStartOffset, props.tail_start_offset); if (!props.db_id.empty()) { Add(TablePropertiesNames::kDbId, props.db_id); } @@ -307,6 +308,8 @@ Status ReadTablePropertiesHelper( &new_table_properties->slow_compression_estimated_data_size}, {TablePropertiesNames::kFastCompressionEstimatedDataSize, &new_table_properties->fast_compression_estimated_data_size}, + {TablePropertiesNames::kTailStartOffset, + &new_table_properties->tail_start_offset}, }; std::string last_key; diff --git a/table/table_builder.h b/table/table_builder.h index e1bb4b557..6d98bce70 100644 --- a/table/table_builder.h +++ b/table/table_builder.h @@ -42,7 +42,8 @@ struct TableReaderOptions { int _level = -1, BlockCacheTracer* const _block_cache_tracer = nullptr, size_t _max_file_size_for_l0_meta_pin = 0, const std::string& _cur_db_session_id = "", uint64_t _cur_file_num = 0, - UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0) + UniqueId64x2 _unique_id = {}, SequenceNumber _largest_seqno = 0, + uint64_t _tail_size = 0) : ioptions(_ioptions), prefix_extractor(_prefix_extractor), env_options(_env_options), @@ -57,7 +58,8 @@ struct TableReaderOptions { cur_db_session_id(_cur_db_session_id), cur_file_num(_cur_file_num), unique_id(_unique_id), - block_protection_bytes_per_key(_block_protection_bytes_per_key) {} + block_protection_bytes_per_key(_block_protection_bytes_per_key), + tail_size(_tail_size) {} const ImmutableOptions& ioptions; const std::shared_ptr& prefix_extractor; @@ -89,6 +91,8 @@ struct TableReaderOptions { UniqueId64x2 unique_id; uint8_t block_protection_bytes_per_key; + + uint64_t tail_size; }; struct TableBuilderOptions { @@ -200,6 +204,8 @@ class TableBuilder { // is enabled. 
virtual uint64_t EstimatedFileSize() const { return FileSize(); } + virtual uint64_t GetTailSize() const { return 0; } + // If the user defined table properties collector suggest the file to // be further compacted. virtual bool NeedCompact() const { return false; } diff --git a/table/table_properties.cc b/table/table_properties.cc index b382281f8..819d4da2a 100644 --- a/table/table_properties.cc +++ b/table/table_properties.cc @@ -303,6 +303,8 @@ const std::string TablePropertiesNames::kFastCompressionEstimatedDataSize = "rocksdb.sample_for_compression.fast.data.size"; const std::string TablePropertiesNames::kSequenceNumberTimeMapping = "rocksdb.seqno.time.map"; +const std::string TablePropertiesNames::kTailStartOffset = + "rocksdb.tail.start.offset"; #ifndef NDEBUG // WARNING: TEST_SetRandomTableProperties assumes the following layout of