From 151242ce46872dd080f99e0bd42e40db38686e25 Mon Sep 17 00:00:00 2001
From: Hui Xiao
Date: Fri, 21 Apr 2023 09:07:18 -0700
Subject: [PATCH] Group rocksdb.sst.read.micros stat by IOActivity flush and compaction (#11288)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:
**Context:**
The existing stat rocksdb.sst.read.micros aggregates reads from compaction and flush instead of reporting them separately, which makes it hard to understand the IO read behavior of each.

**Summary**
- Update `StopWatch` and `RandomAccessFileReader` to record `rocksdb.sst.read.micros` and `rocksdb.file.read.{flush|compaction}.micros`
- Fix the default histogram in `RandomAccessFileReader`
- New field `ReadOptions/IOOptions::io_activity`; pass `ReadOptions` through the db open, flush and compaction paths to where `IOOptions` can be prepared and passed to `RandomAccessFileReader`
- Use `thread_status_util` for assertions in `DbStressFSWrapper` so continuous testing verifies that the correct `io_activity` is passed under db open, flush and compaction

(A short usage sketch for the new histograms appears after the diff excerpt below.)

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11288

Test Plan:
- **Stress test**
- **Db bench 1: rocksdb.sst.read.micros COUNT ≈ rocksdb.file.read.flush.micros COUNT + rocksdb.file.read.compaction.micros COUNT** (without blob)
  - May not be exactly the same because `HistogramStat::Add` only guarantees atomicity, not accuracy, across threads.
```
./db_bench -db=/dev/shm/testdb/ -statistics=true -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -target_file_size_base=655 -disable_auto_compactions=false -compression_type=none -bloom_bits=3 (-use_plain_table=1 -prefix_size=10)
```
```
// BlockBasedTable
rocksdb.sst.read.micros P50 : 2.009374 P95 : 4.968548 P99 : 8.110362 P100 : 43.000000 COUNT : 40456 SUM : 114805
rocksdb.file.read.flush.micros P50 : 1.871841 P95 : 3.872407 P99 : 5.540541 P100 : 43.000000 COUNT : 2250 SUM : 6116
rocksdb.file.read.compaction.micros P50 : 2.023109 P95 : 5.029149 P99 : 8.196910 P100 : 26.000000 COUNT : 38206 SUM : 108689

// PlainTable
Does not apply
```
- **Db bench 2: performance**

**Read**

SETUP: db with 900 files
```
./db_bench -db=/dev/shm/testdb/ -benchmarks="fillseq" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=true -target_file_size_base=655 -compression_type=none
```
Run till convergence:
```
./db_bench -seed=1678564177044286 -use_existing_db=true -db=/dev/shm/testdb -benchmarks=readrandom[-X60] -statistics=true -num=1000000 -disable_auto_compactions=true -compression_type=none -bloom_bits=3
```
Pre-change: `readrandom [AVG 60 runs] : 21568 (± 248) ops/sec`
Post-change (no regression, -0.3%): `readrandom [AVG 60 runs] : 21486 (± 236) ops/sec`

**Compaction/Flush**

Run till convergence:
```
./db_bench -db=/dev/shm/testdb2/ -seed=1678564177044286 -benchmarks="fillseq[-X60]" -key_size=32 -value_size=512 -num=50000 -write_buffer_size=655 -disable_auto_compactions=false -target_file_size_base=655 -compression_type=none

rocksdb.sst.read.micros COUNT : 33820
rocksdb.sst.read.flush.micros COUNT : 1800
rocksdb.sst.read.compaction.micros COUNT : 32020
```
Pre-change: `fillseq [AVG 46 runs] : 1391 (± 214) ops/sec; 0.7 (± 0.1) MB/sec`
Post-change (no regression, ~-0.4%): `fillseq [AVG 46 runs] : 1385 (± 216) ops/sec; 0.7 (± 0.1) MB/sec`

Reviewed By: ajkr

Differential Revision: D44007011

Pulled By: hx235

fbshipit-source-id: a54c89e4846dfc9a135389edf3f3eedfea257132
--- HISTORY.md | 1 + db/blob/blob_file_cache.cc | 4 +-
db/blob/blob_file_cache.h | 3 +- db/blob/blob_file_cache_test.cc | 25 ++- db/blob/blob_file_reader.cc | 59 +++++--- db/blob/blob_file_reader.h | 6 +- db/blob/blob_file_reader_test.cc | 83 +++++----- db/blob/blob_source.cc | 7 +- db/blob/blob_source.h | 4 +- db/blob/blob_source_test.cc | 8 +- db/builder.cc | 9 +- db/builder.h | 4 +- db/column_family.cc | 1 + db/compaction/compaction_iterator.cc | 1 + db/compaction/compaction_job.cc | 32 ++-- db/compaction/compaction_job_test.cc | 8 +- db/compaction/compaction_outputs.cc | 3 +- db/convenience.cc | 10 +- db/corruption_test.cc | 9 +- db/db_impl/compacted_db_impl.cc | 5 + db/db_impl/db_impl.cc | 127 +++++++++++++--- db/db_impl/db_impl.h | 2 +- db/db_impl/db_impl_compaction_flush.cc | 44 ++++-- db/db_impl/db_impl_experimental.cc | 6 +- db/db_impl/db_impl_open.cc | 18 ++- db/db_impl/db_impl_readonly.cc | 10 ++ db/db_impl/db_impl_secondary.cc | 15 ++ db/db_impl/db_impl_write.cc | 16 +- db/db_iter.cc | 1 + db/db_properties_test.cc | 5 +- db/db_rate_limiter_test.cc | 5 +- db/experimental.cc | 4 +- db/external_sst_file_ingestion_job.cc | 4 +- db/flush_job.cc | 32 ++-- db/import_column_family_job.cc | 2 +- db/internal_stats.cc | 27 +++- db/memtable.cc | 1 + db/memtable_list.cc | 12 +- db/perf_context_test.cc | 3 +- db/plain_table_db_test.cc | 9 +- db/repair.cc | 28 +++- db/table_cache.cc | 24 +-- db/table_cache.h | 11 +- db/table_properties_collector_test.cc | 9 +- db/version_builder.cc | 13 +- db/version_builder.h | 2 +- db/version_edit_handler.cc | 28 ++-- db/version_edit_handler.h | 42 +++-- db/version_set.cc | 143 +++++++++++------- db/version_set.h | 70 +++++---- db/version_set_test.cc | 43 +++--- db/version_util.h | 9 +- db/write_batch.cc | 2 + db_stress_tool/db_stress_common.h | 2 + db_stress_tool/db_stress_driver.cc | 9 +- db_stress_tool/db_stress_env_wrapper.h | 34 +++++ db_stress_tool/db_stress_gflags.cc | 5 + db_stress_tool/db_stress_test_base.cc | 2 + db_stress_tool/db_stress_tool.cc | 7 - db_stress_tool/multi_ops_txns_stress.cc | 4 +- db_stress_tool/multi_ops_txns_stress.h | 8 +- file/file_util.cc | 6 +- file/file_util.h | 2 + file/random_access_file_reader.cc | 26 +++- file/random_access_file_reader.h | 5 +- include/rocksdb/env.h | 8 + include/rocksdb/file_system.h | 2 + include/rocksdb/options.h | 3 + include/rocksdb/statistics.h | 6 + include/rocksdb/thread_status.h | 1 + include/rocksdb/utilities/transaction.h | 1 - java/rocksjni/portal.h | 12 ++ .../main/java/org/rocksdb/HistogramType.java | 4 + .../main/java/org/rocksdb/OperationType.java | 3 +- microbench/db_basic_bench.cc | 3 +- monitoring/statistics.cc | 2 + monitoring/thread_status_updater.cc | 24 ++- monitoring/thread_status_updater.h | 17 ++- monitoring/thread_status_util.cc | 37 ++--- monitoring/thread_status_util.h | 12 +- monitoring/thread_status_util_debug.cc | 14 ++ options/options.cc | 35 ++++- table/block_based/block_based_table_reader.cc | 52 ++++--- table/block_based/block_based_table_reader.h | 16 +- .../block_based_table_reader_impl.h | 2 +- .../block_based_table_reader_sync_and_async.h | 2 +- table/block_based/hash_index_reader.cc | 10 +- table/block_based/partitioned_index_reader.cc | 1 + .../block_based/uncompression_dict_reader.cc | 6 +- table/block_based/uncompression_dict_reader.h | 5 +- table/cuckoo/cuckoo_table_builder_test.cc | 5 +- table/cuckoo/cuckoo_table_reader.cc | 7 +- table/cuckoo/cuckoo_table_reader.h | 6 +- table/meta_blocks.cc | 52 ++++--- table/meta_blocks.h | 4 + table/mock_table.cc | 6 +- table/plain/plain_table_reader.cc | 28 ++-- 
table/plain/plain_table_reader.h | 7 +- table/sst_file_dumper.cc | 6 +- table/sst_file_reader.cc | 3 +- table/table_reader.h | 9 +- table/table_test.cc | 21 ++- tools/db_crashtest.py | 1 + tools/ldb_cmd.cc | 4 +- util/file_checksum_helper.cc | 5 +- util/stop_watch.h | 40 +++-- util/thread_list_test.cc | 2 + util/thread_operation.h | 3 +- utilities/blob_db/blob_db_impl.cc | 10 ++ .../transactions/pessimistic_transaction.cc | 6 +- utilities/transactions/transaction_base.cc | 30 +++- utilities/transactions/write_prepared_txn.cc | 7 +- .../transactions/write_prepared_txn_db.cc | 10 ++ .../transactions/write_unprepared_txn.cc | 7 +- .../transactions/write_unprepared_txn_db.cc | 5 + utilities/ttl/db_ttl_impl.cc | 5 + 116 files changed, 1181 insertions(+), 545 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ebb95663d..1ca433e8e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -17,6 +17,7 @@ ### New Features * Add experimental `PerfContext` counters `iter_{next|prev|seek}_count` for db iterator, each counting the times of corresponding API being called. * Allow runtime changes to whether `WriteBufferManager` allows stall or not by calling `SetAllowStall()` +* New statistics `rocksdb.file.read.{flush|compaction}.micros` that measure read time of block-based SST tables or blob files during flush or compaction. ### Bug Fixes * In block cache tracing, fixed some cases of bad hit/miss information (and more) with MultiGet. diff --git a/db/blob/blob_file_cache.cc b/db/blob/blob_file_cache.cc index 19757946d..deebf8d34 100644 --- a/db/blob/blob_file_cache.cc +++ b/db/blob/blob_file_cache.cc @@ -37,7 +37,7 @@ BlobFileCache::BlobFileCache(Cache* cache, } Status BlobFileCache::GetBlobFileReader( - uint64_t blob_file_number, + const ReadOptions& read_options, uint64_t blob_file_number, CacheHandleGuard* blob_file_reader) { assert(blob_file_reader); assert(blob_file_reader->IsEmpty()); @@ -73,7 +73,7 @@ Status BlobFileCache::GetBlobFileReader( { assert(file_options_); const Status s = BlobFileReader::Create( - *immutable_options_, *file_options_, column_family_id_, + *immutable_options_, read_options, *file_options_, column_family_id_, blob_file_read_hist_, blob_file_number, io_tracer_, &reader); if (!s.ok()) { RecordTick(statistics, NO_FILE_ERRORS); diff --git a/db/blob/blob_file_cache.h b/db/blob/blob_file_cache.h index 6281897d6..a80be7c55 100644 --- a/db/blob/blob_file_cache.h +++ b/db/blob/blob_file_cache.h @@ -32,7 +32,8 @@ class BlobFileCache { BlobFileCache(const BlobFileCache&) = delete; BlobFileCache& operator=(const BlobFileCache&) = delete; - Status GetBlobFileReader(uint64_t blob_file_number, + Status GetBlobFileReader(const ReadOptions& read_options, + uint64_t blob_file_number, CacheHandleGuard* blob_file_reader); private: diff --git a/db/blob/blob_file_cache_test.cc b/db/blob/blob_file_cache_test.cc index d3a61b3c5..8c3c56de9 100644 --- a/db/blob/blob_file_cache_test.cc +++ b/db/blob/blob_file_cache_test.cc @@ -118,7 +118,9 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) { // First try: reader should be opened and put in cache CacheHandleGuard first; - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + const ReadOptions read_options; + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number, + &first)); ASSERT_NE(first.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); @@ -126,7 +128,8 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) { // Second try: 
reader should be served from cache CacheHandleGuard second; - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number, + &second)); ASSERT_NE(second.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); @@ -163,19 +166,21 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) { CacheHandleGuard first; CacheHandleGuard second; + const ReadOptions read_options; SyncPoint::GetInstance()->SetCallBack( "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) { // Disabling sync points to prevent infinite recursion SyncPoint::GetInstance()->DisableProcessing(); - - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, + blob_file_number, &second)); ASSERT_NE(second.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); }); SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number, + &first)); ASSERT_NE(first.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); @@ -213,8 +218,10 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) { CacheHandleGuard reader; + const ReadOptions read_options; ASSERT_TRUE( - blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError()); + blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader) + .IsIOError()); ASSERT_EQ(reader.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); @@ -253,8 +260,10 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) { // strict_capacity_limit is set CacheHandleGuard reader; - ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader) - .IsMemoryLimit()); + const ReadOptions read_options; + ASSERT_TRUE( + blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader) + .IsMemoryLimit()); ASSERT_EQ(reader.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc index da7f2bb12..79c0bf50d 100644 --- a/db/blob/blob_file_reader.cc +++ b/db/blob/blob_file_reader.cc @@ -26,9 +26,10 @@ namespace ROCKSDB_NAMESPACE { Status BlobFileReader::Create( - const ImmutableOptions& immutable_options, const FileOptions& file_options, - uint32_t column_family_id, HistogramImpl* blob_file_read_hist, - uint64_t blob_file_number, const std::shared_ptr& io_tracer, + const ImmutableOptions& immutable_options, const ReadOptions& read_options, + const FileOptions& file_options, uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& io_tracer, std::unique_ptr* blob_file_reader) { assert(blob_file_reader); assert(!*blob_file_reader); @@ -52,15 +53,17 @@ Status BlobFileReader::Create( CompressionType compression_type = kNoCompression; { - const Status s = ReadHeader(file_reader.get(), column_family_id, statistics, - &compression_type); + const Status s = + 
ReadHeader(file_reader.get(), read_options, column_family_id, + statistics, &compression_type); if (!s.ok()) { return s; } } { - const Status s = ReadFooter(file_reader.get(), file_size, statistics); + const Status s = + ReadFooter(file_reader.get(), read_options, file_size, statistics); if (!s.ok()) { return s; } @@ -134,6 +137,7 @@ Status BlobFileReader::OpenFile( } Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint32_t column_family_id, Statistics* statistics, CompressionType* compression_type) { @@ -151,9 +155,10 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, constexpr size_t read_size = BlobLogHeader::kSize; // TODO: rate limit reading headers from blob files. - const Status s = ReadFromFile(file_reader, read_offset, read_size, - statistics, &header_slice, &buf, &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */); + const Status s = + ReadFromFile(file_reader, read_options, read_offset, read_size, + statistics, &header_slice, &buf, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); if (!s.ok()) { return s; } @@ -187,6 +192,7 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, } Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint64_t file_size, Statistics* statistics) { assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize); assert(file_reader); @@ -202,9 +208,10 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, constexpr size_t read_size = BlobLogFooter::kSize; // TODO: rate limit reading footers from blob files. - const Status s = ReadFromFile(file_reader, read_offset, read_size, - statistics, &footer_slice, &buf, &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */); + const Status s = + ReadFromFile(file_reader, read_options, read_offset, read_size, + statistics, &footer_slice, &buf, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); if (!s.ok()) { return s; } @@ -232,6 +239,7 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, } Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint64_t read_offset, size_t read_size, Statistics* statistics, Slice* slice, Buffer* buf, AlignedBuf* aligned_buf, @@ -246,17 +254,23 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, Status s; + IOOptions io_options; + s = file_reader->PrepareIOOptions(read_options, io_options); + if (!s.ok()) { + return s; + } + if (file_reader->use_direct_io()) { constexpr char* scratch = nullptr; - s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch, + s = file_reader->Read(io_options, read_offset, read_size, slice, scratch, aligned_buf, rate_limiter_priority); } else { buf->reset(new char[read_size]); constexpr AlignedBuf* aligned_scratch = nullptr; - s = file_reader->Read(IOOptions(), read_offset, read_size, slice, - buf->get(), aligned_scratch, rate_limiter_priority); + s = file_reader->Read(io_options, read_offset, read_size, slice, buf->get(), + aligned_scratch, rate_limiter_priority); } if (!s.ok()) { @@ -324,8 +338,13 @@ Status BlobFileReader::GetBlob( Status s; constexpr bool for_compaction = true; + IOOptions io_options; + s = file_reader_->PrepareIOOptions(read_options, io_options); + if (!s.ok()) { + return s; + } prefetched = prefetch_buffer->TryReadFromCache( - IOOptions(), file_reader_.get(), 
record_offset, + io_options, file_reader_.get(), record_offset, static_cast(record_size), &record_slice, &s, read_options.rate_limiter_priority, for_compaction); if (!s.ok()) { @@ -338,10 +357,10 @@ Status BlobFileReader::GetBlob( PERF_COUNTER_ADD(blob_read_count, 1); PERF_COUNTER_ADD(blob_read_byte, record_size); PERF_TIMER_GUARD(blob_read_time); - const Status s = ReadFromFile(file_reader_.get(), record_offset, - static_cast(record_size), statistics_, - &record_slice, &buf, &aligned_buf, - read_options.rate_limiter_priority); + const Status s = ReadFromFile( + file_reader_.get(), read_options, record_offset, + static_cast(record_size), statistics_, &record_slice, &buf, + &aligned_buf, read_options.rate_limiter_priority); if (!s.ok()) { return s; } diff --git a/db/blob/blob_file_reader.h b/db/blob/blob_file_reader.h index 75b756da1..990e32540 100644 --- a/db/blob/blob_file_reader.h +++ b/db/blob/blob_file_reader.h @@ -29,6 +29,7 @@ class Statistics; class BlobFileReader { public: static Status Create(const ImmutableOptions& immutable_options, + const ReadOptions& read_options, const FileOptions& file_options, uint32_t column_family_id, HistogramImpl* blob_file_read_hist, @@ -74,15 +75,18 @@ class BlobFileReader { std::unique_ptr* file_reader); static Status ReadHeader(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint32_t column_family_id, Statistics* statistics, CompressionType* compression_type); static Status ReadFooter(const RandomAccessFileReader* file_reader, - uint64_t file_size, Statistics* statistics); + const ReadOptions& read_options, uint64_t file_size, + Statistics* statistics); using Buffer = std::unique_ptr; static Status ReadFromFile(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint64_t read_offset, size_t read_size, Statistics* statistics, Slice* slice, Buffer* buf, AlignedBuf* aligned_buf, diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc index 03458e2b5..c8e4e5954 100644 --- a/db/blob/blob_file_reader_test.cc +++ b/db/blob/blob_file_reader_test.cc @@ -172,12 +172,12 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { std::unique_ptr reader; + ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); // Make sure the blob can be retrieved with and without checksum verification - ReadOptions read_options; read_options.verify_checksums = false; constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; @@ -479,11 +479,11 @@ TEST_F(BlobFileReaderTest, Malformed) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -513,11 +513,11 @@ TEST_F(BlobFileReaderTest, TTL) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr 
/*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -552,11 +552,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -591,11 +591,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -629,9 +629,9 @@ TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { std::unique_ptr reader; constexpr uint32_t incorrect_column_family_id = 2; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - incorrect_column_family_id, + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), incorrect_column_family_id, blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader) .IsCorruption()); @@ -664,10 +664,10 @@ TEST_F(BlobFileReaderTest, BlobCRCError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) { @@ -728,13 +728,12 @@ TEST_F(BlobFileReaderTest, Compression) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); // Make sure the blob can be retrieved with and without checksum verification - ReadOptions read_options; read_options.verify_checksums = false; constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; @@ -803,10 +802,10 @@ TEST_F(BlobFileReaderTest, UncompressionError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + 
blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) { @@ -895,10 +894,10 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; const Status s = BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader); const bool fail_during_create = (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile"); @@ -983,10 +982,10 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; const Status s = BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader); const bool fail_during_create = sync_point_ != "BlobFileReader::GetBlob:TamperWithResult"; diff --git a/db/blob/blob_source.cc b/db/blob/blob_source.cc index 1e866c7dd..5e5e81355 100644 --- a/db/blob/blob_source.cc +++ b/db/blob/blob_source.cc @@ -209,7 +209,8 @@ Status BlobSource::GetBlob(const ReadOptions& read_options, { CacheHandleGuard blob_file_reader; - s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader); + s = blob_file_cache_->GetBlobFileReader(read_options, file_number, + &blob_file_reader); if (!s.ok()) { return s; } @@ -372,8 +373,8 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options, } CacheHandleGuard blob_file_reader; - Status s = - blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader); + Status s = blob_file_cache_->GetBlobFileReader(read_options, file_number, + &blob_file_reader); if (!s.ok()) { for (size_t i = 0; i < _blob_reqs.size(); ++i) { BlobReadRequest* const req = _blob_reqs[i].first; diff --git a/db/blob/blob_source.h b/db/blob/blob_source.h index cdc218747..d5e009b54 100644 --- a/db/blob/blob_source.h +++ b/db/blob/blob_source.h @@ -95,9 +95,9 @@ class BlobSource { uint64_t* bytes_read); inline Status GetBlobFileReader( - uint64_t blob_file_number, + const ReadOptions& read_options, uint64_t blob_file_number, CacheHandleGuard* blob_file_reader) { - return blob_file_cache_->GetBlobFileReader(blob_file_number, + return blob_file_cache_->GetBlobFileReader(read_options, blob_file_number, blob_file_reader); } diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc index a82d5dd0d..a9771565a 100644 --- a/db/blob/blob_source_test.cc +++ b/db/blob/blob_source_test.cc @@ -517,7 +517,8 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) { compression, blob_offsets, blob_sizes); CacheHandleGuard blob_file_reader; - ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader)); + ASSERT_OK(blob_source.GetBlobFileReader(read_options, file_number, + &blob_file_reader)); ASSERT_NE(blob_file_reader.GetValue(), nullptr); const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize(); @@ -1139,12 +1140,13 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { blob_file_cache.get()); CacheHandleGuard file_reader; - 
ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader)); + ReadOptions read_options; + ASSERT_OK( + blob_source.GetBlobFileReader(read_options, file_number, &file_reader)); ASSERT_NE(file_reader.GetValue(), nullptr); const uint64_t file_size = file_reader.GetValue()->GetFileSize(); ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression); - ReadOptions read_options; read_options.verify_checksums = true; auto blob_cache = options_.blob_cache; diff --git a/db/builder.cc b/db/builder.cc index b86dd6b9c..be1ec29bf 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -56,8 +56,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, - const FileOptions& file_options, TableCache* table_cache, - InternalIterator* iter, + const FileOptions& file_options, const ReadOptions& read_options, + TableCache* table_cache, InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, @@ -255,8 +255,8 @@ Status BuildTable( SizeApproximationOptions approx_opts; approx_opts.files_size_error_margin = 0.1; meta->compensated_range_deletion_size += versions->ApproximateSize( - approx_opts, version, kv.first.Encode(), tombstone_end.Encode(), - 0 /* start_level */, -1 /* end_level */, + approx_opts, read_options, version, kv.first.Encode(), + tombstone_end.Encode(), 0 /* start_level */, -1 /* end_level */, TableReaderCaller::kFlush); } last_tombstone_start_user_key = range_del_it->start_key(); @@ -369,7 +369,6 @@ Status BuildTable( // here because this is a special case after we finish the table building. // No matter whether use_direct_io_for_flush_and_compaction is true, // the goal is to cache it here for further user reads. 
- ReadOptions read_options; std::unique_ptr it(table_cache->NewIterator( read_options, file_options, tboptions.internal_comparator, *meta, nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor, diff --git a/db/builder.h b/db/builder.h index 063da5ca9..6a6a1866a 100644 --- a/db/builder.h +++ b/db/builder.h @@ -53,8 +53,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, extern Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, - const FileOptions& file_options, TableCache* table_cache, - InternalIterator* iter, + const FileOptions& file_options, const ReadOptions& read_options, + TableCache* table_cache, InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, diff --git a/db/column_family.cc b/db/column_family.cc index b3d04dc6a..24ea46ac4 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1141,6 +1141,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( *overlap = false; // Create an InternalIterator over all unflushed memtables Arena arena; + // TODO: plumb Env::IOActivity ReadOptions read_opts; read_opts.total_order_seek = true; MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index c2ac7f692..5be7b565a 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -1412,6 +1412,7 @@ std::unique_ptr CompactionIterator::CreateBlobFetcherIfNeeded( } ReadOptions read_options; + read_options.io_activity = Env::IOActivity::kCompaction; read_options.fill_cache = false; return std::unique_ptr(new BlobFetcher(version, read_options)); diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 331be915e..8a326a508 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -192,8 +192,8 @@ CompactionJob::CompactionJob( assert(log_buffer_ != nullptr); const auto* cfd = compact_->compaction->column_family_data(); - ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, - db_options_.enable_thread_tracking); + ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking); + ThreadStatusUtil::SetColumnFamily(cfd); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); ReportStartedCompaction(compaction); } @@ -204,10 +204,6 @@ CompactionJob::~CompactionJob() { } void CompactionJob::ReportStartedCompaction(Compaction* compaction) { - const auto* cfd = compact_->compaction->column_family_data(); - ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, - db_options_.enable_thread_tracking); - ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, job_id_); @@ -291,12 +287,14 @@ void CompactionJob::Prepare() { c->immutable_options()->preclude_last_level_data_seconds); if (preserve_time_duration > 0) { + const ReadOptions read_options(Env::IOActivity::kCompaction); // setup seqno_time_mapping_ seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration); for (const auto& each_level : *c->inputs()) { for (const auto& fmd : each_level.files) { std::shared_ptr tp; - Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr); + Status s = + cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr); if (s.ok()) { seqno_time_mapping_.Add(tp->seqno_to_time_mapping) .PermitUncheckedError(); @@ -472,7 +470,7 @@ void 
CompactionJob::GenSubcompactionBoundaries() { // overlap with N-1 other ranges. Since we requested a relatively large number // (128) of ranges from each input files, even N range overlapping would // cause relatively small inaccuracy. - + const ReadOptions read_options(Env::IOActivity::kCompaction); auto* c = compact_->compaction; if (c->max_subcompactions() <= 1 && !(c->immutable_options()->compaction_pri == kRoundRobin && @@ -506,7 +504,7 @@ void CompactionJob::GenSubcompactionBoundaries() { FileMetaData* f = flevel->files[i].file_metadata; std::vector my_anchors; Status s = cfd->table_cache()->ApproximateKeyAnchors( - ReadOptions(), icomp, *f, my_anchors); + read_options, icomp, *f, my_anchors); if (!s.ok() || my_anchors.empty()) { my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize()); } @@ -722,11 +720,12 @@ Status CompactionJob::Run() { // use_direct_io_for_flush_and_compaction is true, we will regard this // verification as user reads since the goal is to cache it here for // further user reads - ReadOptions read_options; + const ReadOptions verify_table_read_options( + Env::IOActivity::kCompaction); InternalIterator* iter = cfd->table_cache()->NewIterator( - read_options, file_options_, cfd->internal_comparator(), - files_output[file_idx]->meta, /*range_del_agg=*/nullptr, - prefix_extractor, + verify_table_read_options, file_options_, + cfd->internal_comparator(), files_output[file_idx]->meta, + /*range_del_agg=*/nullptr, prefix_extractor, /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), @@ -1032,7 +1031,6 @@ void CompactionJob::NotifyOnSubcompactionCompleted( void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { assert(sub_compact); assert(sub_compact->compaction); - if (db_options_.compaction_service) { CompactionServiceJobStatus comp_status = ProcessKeyValueCompactionWithCompactionService(sub_compact); @@ -1083,6 +1081,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { read_options.verify_checksums = true; read_options.fill_cache = false; read_options.rate_limiter_priority = GetRateLimiterPriority(); + read_options.io_activity = Env::IOActivity::kCompaction; // Compaction iterators shouldn't be confined to a single prefix. 
// Compactions use Seek() for // (a) concurrent compactions, @@ -1640,6 +1639,7 @@ Status CompactionJob::InstallCompactionResults( db_mutex_->AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kCompaction); auto* compaction = compact_->compaction; assert(compaction); @@ -1717,8 +1717,8 @@ Status CompactionJob::InstallCompactionResults( } return versions_->LogAndApply(compaction->column_family_data(), - mutable_cf_options, edit, db_mutex_, - db_directory_); + mutable_cf_options, read_options, edit, + db_mutex_, db_directory_); } void CompactionJob::RecordCompactionIOStats() { diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 578d7067c..9c5784d5e 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -389,9 +389,9 @@ class CompactionJobTestBase : public testing::Test { 0); mutex_.Lock(); - EXPECT_OK( - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_, nullptr)); + EXPECT_OK(versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, &edit, &mutex_, nullptr)); mutex_.Unlock(); } @@ -727,6 +727,7 @@ class CompactionJobTestBase : public testing::Test { ColumnFamilyOptions cf_options_; MutableCFOptions mutable_cf_options_; MutableDBOptions mutable_db_options_; + const ReadOptions read_options_; std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; @@ -2440,4 +2441,3 @@ int main(int argc, char** argv) { RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } - diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index 3aedc3fe1..cf5105e41 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -574,6 +574,7 @@ Status CompactionOutputs::AddRangeDels( auto it = range_del_agg_->NewIterator(lower_bound, upper_bound); Slice last_tombstone_start_user_key{}; bool reached_lower_bound = false; + const ReadOptions read_options(Env::IOActivity::kCompaction); for (it->SeekToFirst(); it->Valid(); it->Next()) { auto tombstone = it->Tombstone(); auto kv = tombstone.Serialize(); @@ -713,7 +714,7 @@ Status CompactionOutputs::AddRangeDels( approx_opts.files_size_error_margin = 0.1; auto approximate_covered_size = compaction_->input_version()->version_set()->ApproximateSize( - approx_opts, compaction_->input_version(), + approx_opts, read_options, compaction_->input_version(), tombstone_start.Encode(), tombstone_end.Encode(), compaction_->output_level() + 1 /* start_level */, -1 /* end_level */, kCompaction); diff --git a/db/convenience.cc b/db/convenience.cc index f18473feb..8ab7cbc13 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -33,7 +33,9 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path) { - return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + return VerifySstFileChecksum(options, env_options, read_options, file_path); } Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, @@ -56,8 +58,9 @@ Status VerifySstFileChecksum(const Options& options, std::unique_ptr file_reader( new RandomAccessFileReader( std::move(file), file_path, ioptions.clock, nullptr /* io_tracer */, - nullptr /* stats */, 0 /* hist_type */, 
nullptr /* file_read_hist */, - ioptions.rate_limiter.get())); + ioptions.stats /* stats */, + Histograms::SST_READ_MICROS /* hist_type */, + nullptr /* file_read_hist */, ioptions.rate_limiter.get())); const bool kImmortal = true; auto reader_options = TableReaderOptions( ioptions, options.prefix_extractor, env_options, internal_comparator, @@ -76,4 +79,3 @@ Status VerifySstFileChecksum(const Options& options, } } // namespace ROCKSDB_NAMESPACE - diff --git a/db/corruption_test.cc b/db/corruption_test.cc index ab506cdb7..7027181eb 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -762,9 +762,11 @@ TEST_F(CorruptionTest, RangeDeletionCorrupted) { fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr)); BlockHandle range_del_handle; - ASSERT_OK(FindMetaBlockInFile( - file_reader.get(), file_size, kBlockBasedTableMagicNumber, - ImmutableOptions(options_), kRangeDelBlockName, &range_del_handle)); + const ReadOptions read_options; + ASSERT_OK(FindMetaBlockInFile(file_reader.get(), file_size, + kBlockBasedTableMagicNumber, + ImmutableOptions(options_), read_options, + kRangeDelBlockName, &range_del_handle)); ASSERT_OK(TryReopen()); ASSERT_OK(test::CorruptFile(env_.get(), filename, @@ -1666,4 +1668,3 @@ int main(int argc, char** argv) { RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } - diff --git a/db/db_impl/compacted_db_impl.cc b/db/db_impl/compacted_db_impl.cc index 70de79858..3d824baf2 100644 --- a/db/db_impl/compacted_db_impl.cc +++ b/db/db_impl/compacted_db_impl.cc @@ -46,6 +46,11 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, const Slice& key, PinnableSlice* value, std::string* timestamp) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } assert(user_comparator_); if (options.timestamp) { const Status s = FailIfTsMismatchCf( diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 69350af34..fcfb77731 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -327,6 +327,9 @@ Status DBImpl::Resume() { // means a new super version wouldn't have been installed Status DBImpl::ResumeImpl(DBRecoverContext context) { mutex_.AssertHeld(); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; WaitForBackgroundWork(); Status s; @@ -368,7 +371,7 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { assert(cfh); ColumnFamilyData* cfd = cfh->cfd(); const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); - s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_, + s = versions_->LogAndApply(cfd, cf_opts, read_options, &edit, &mutex_, directories_.GetDbDir()); if (!s.ok()) { io_s = versions_->io_status(); @@ -1147,6 +1150,8 @@ FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { Status DBImpl::SetOptions( ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto* cfd = static_cast_with_check(column_family)->cfd(); if (options_map.empty()) { @@ -1168,8 +1173,8 @@ Status DBImpl::SetOptions( new_options = *cfd->GetLatestMutableCFOptions(); // Append new version to recompute compaction score. 
VersionEdit dummy_edit; - s = versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, new_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); // Trigger possible flush/compactions. This has to be before we persist // options to file, otherwise there will be a deadlock with writer // thread. @@ -1507,7 +1512,9 @@ Status DBImpl::SyncWAL() { } if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - status = ApplyWALToManifest(&synced_wals); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = ApplyWALToManifest(read_options, &synced_wals); } TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); @@ -1515,11 +1522,13 @@ Status DBImpl::SyncWAL() { return status; } -Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) { +Status DBImpl::ApplyWALToManifest(const ReadOptions& read_options, + VersionEdit* synced_wals) { // not empty, write to MANIFEST. mutex_.AssertHeld(); + Status status = versions_->LogAndApplyToDefaultColumnFamily( - synced_wals, &mutex_, directories_.GetDbDir()); + read_options, synced_wals, &mutex_, directories_.GetDbDir()); if (!status.ok() && versions_->io_status().IsIOError()) { status = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); @@ -1936,6 +1945,12 @@ Status DBImpl::GetEntity(const ReadOptions& read_options, "Cannot call GetEntity without a PinnableWideColumns object"); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetEntity with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + columns->Reset(); GetImplOptions get_impl_options; @@ -1981,6 +1996,12 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, assert(get_impl_options.column_family); + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + if (read_options.timestamp) { const Status s = FailIfTsMismatchCf(get_impl_options.column_family, *(read_options.timestamp), @@ -2931,6 +2952,11 @@ Status DBImpl::MultiGetImpl( autovector* sorted_keys, SuperVersion* super_version, SequenceNumber snapshot, ReadCallback* callback) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call MultiGet with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); @@ -3129,6 +3155,8 @@ Status DBImpl::CreateColumnFamilies( Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status s; *handle = nullptr; @@ -3169,9 +3197,9 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, write_thread_.EnterUnbatched(&w, &mutex_); // LogAndApply will both write the creation in MANIFEST and create // ColumnFamilyData object - s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit, - &mutex_, directories_.GetDbDir(), false, - &cf_options); + s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), + read_options, &edit, &mutex_, + directories_.GetDbDir(), false, &cf_options); write_thread_.ExitUnbatched(&w); } 
if (s.ok()) { @@ -3250,6 +3278,8 @@ Status DBImpl::DropColumnFamilies( } Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (cfd->GetID() == 0) { @@ -3272,8 +3302,9 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { // we drop column family from a single write thread WriteThread::Writer w; write_thread_.EnterUnbatched(&w, &mutex_); - s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, - &mutex_, directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, + directories_.GetDbDir()); write_thread_.ExitUnbatched(&w); } if (s.ok()) { @@ -3324,10 +3355,13 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, std::string* value, std::string* timestamp, bool* value_found) { assert(value != nullptr); + assert(read_options.io_activity == Env::IOActivity::kUnknown); + if (value_found != nullptr) { // falsify later if key-may-exist but can't fetch value *value_found = true; } + // TODO: plumb Env::IOActivity ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only PinnableSlice pinnable_val; @@ -3356,6 +3390,11 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, return NewErrorIterator(Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators.")); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } assert(column_family); @@ -3491,6 +3530,11 @@ Status DBImpl::NewIterators( return Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call NewIterators with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } if (read_options.timestamp) { for (auto* cf : column_families) { @@ -3808,7 +3852,9 @@ Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, version->Ref(); mutex_.Unlock(); - auto s = version->GetPropertiesOfAllTables(props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = version->GetPropertiesOfAllTables(read_options, props); // Decrement the ref count mutex_.Lock(); @@ -3830,7 +3876,9 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, version->Ref(); mutex_.Unlock(); - auto s = version->GetPropertiesOfTablesInRange(range, n, props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = version->GetPropertiesOfTablesInRange(read_options, range, n, props); // Decrement the ref count mutex_.Lock(); @@ -4163,6 +4211,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, SuperVersion* sv = GetAndRefSuperVersion(cfd); v = sv->current; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; for (int i = 0; i < n; i++) { Slice start = range[i].start; Slice limit = range[i].limit; @@ -4184,7 +4234,7 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, sizes[i] = 0; if (options.include_files) { sizes[i] += versions_->ApproximateSize( - options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, + options, read_options, v, k1.Encode(), k2.Encode(), 
/*start_level=*/0, /*end_level=*/-1, TableReaderCaller::kUserApproximateSize); } if (options.include_memtables) { @@ -4232,6 +4282,8 @@ Status DBImpl::GetUpdatesSince( } Status DBImpl::DeleteFile(std::string name) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; uint64_t number; FileType type; WalFileType log_type; @@ -4311,7 +4363,8 @@ Status DBImpl::DeleteFile(std::string name) { edit.SetColumnFamily(cfd->GetID()); edit.DeleteFile(level, number); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_contexts[0], @@ -4333,6 +4386,8 @@ Status DBImpl::DeleteFile(std::string name) { Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status status = Status::OK(); auto cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); @@ -4398,7 +4453,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, } input_version->Ref(); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_contexts[0], @@ -5034,6 +5090,7 @@ Status DBImpl::GetLatestSequenceForKey( MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; + // TODO: plumb Env::IOActivity ReadOptions read_options; SequenceNumber current_seq = versions_->LastSequence(); @@ -5189,6 +5246,8 @@ Status DBImpl::IngestExternalFile( Status DBImpl::IngestExternalFiles( const std::vector& args) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; if (args.empty()) { return Status::InvalidArgument("ingestion arg list is empty"); } @@ -5406,9 +5465,9 @@ Status DBImpl::IngestExternalFiles( } assert(0 == num_entries); } - status = - versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, - edit_lists, &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, + read_options, edit_lists, &mutex_, + directories_.GetDbDir()); // It is safe to update VersionSet last seqno here after LogAndApply since // LogAndApply persists last sequence number from VersionEdits, // which are from file's largest seqno and not from VersionSet. @@ -5509,6 +5568,8 @@ Status DBImpl::CreateColumnFamilyWithImport( const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) { assert(handle != nullptr); assert(*handle == nullptr); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::string cf_comparator_name = options.comparator->Name(); if (cf_comparator_name != metadata.db_comparator_name) { return Status::InvalidArgument("Comparator name mismatch"); @@ -5550,8 +5611,9 @@ Status DBImpl::CreateColumnFamilyWithImport( // file, we have to make sure the file number will never being reused. 
next_file_number = versions_->FetchAddFileNumber(metadata.files.size()); auto cf_options = cfd->GetLatestMutableCFOptions(); - status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + status = + versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); } @@ -5587,8 +5649,9 @@ Status DBImpl::CreateColumnFamilyWithImport( // Install job edit [Mutex will be unlocked here] if (status.ok()) { auto cf_options = cfd->GetLatestMutableCFOptions(); - status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(), - &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply(cfd, *cf_options, read_options, + import_job.edit(), &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options); } @@ -5648,6 +5711,12 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, Status s; + if (read_options.io_activity != Env::IOActivity::kUnknown) { + s = Status::InvalidArgument( + "Cannot verify file checksum with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + return s; + } if (use_file_checksum) { FileChecksumGenFactory* const file_checksum_gen_factory = immutable_db_options_.file_checksum_gen_factory.get(); @@ -5761,6 +5830,12 @@ Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, const std::string& func_name_expected, const std::string& fname, const ReadOptions& read_options) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call VerifyChecksum with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + Status s; if (file_checksum_expected == kUnknownFileChecksum) { return s; @@ -5893,6 +5968,8 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( ColumnFamilyData* cfd, uint64_t num, std::unique_ptr::iterator>& pending_output_elem, uint64_t* next_file_number) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status s; SuperVersionContext dummy_sv_ctx(true /* create_superversion */); assert(nullptr != next_file_number); @@ -5910,8 +5987,8 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( // reuse the file number that has already assigned to the internal file, // and this will overwrite the external file. To protect the external // file, we have to make sure the file number will never being reused. - s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); if (s.ok()) { InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 28a6a4f31..50f9a8ca5 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -2127,7 +2127,7 @@ class DBImpl : public DB { // helper function to call after some of the logs_ were synced void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit); - Status ApplyWALToManifest(VersionEdit* edit); + Status ApplyWALToManifest(const ReadOptions& read_options, VersionEdit* edit); // WALs with log number up to up_to are not synced successfully. 
void MarkLogsNotSynced(uint64_t up_to); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 47ce69aeb..06cc2e6a8 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -229,7 +229,9 @@ Status DBImpl::FlushMemTableToOutputFile( log_io_s = SyncClosedLogs(job_context, &synced_wals); mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { - log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals)); + const ReadOptions read_options(Env::IOActivity::kFlush); + log_io_s = + status_to_io_status(ApplyWALToManifest(read_options, &synced_wals)); TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1", nullptr); } @@ -492,7 +494,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( log_io_s = SyncClosedLogs(job_context, &synced_wals); mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { - log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals)); + const ReadOptions read_options(Env::IOActivity::kFlush); + log_io_s = + status_to_io_status(ApplyWALToManifest(read_options, &synced_wals)); } if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && @@ -956,6 +960,9 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); edit.SetFullHistoryTsLow(ts_low); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit", &edit); @@ -969,7 +976,8 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, } Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (!s.ok()) { return s; } @@ -1080,6 +1088,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = Env::IOActivity::kCompaction; bool overlap; for (int level = 0; level < current_version->storage_info()->num_non_empty_levels(); @@ -1639,6 +1648,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { return Status::InvalidArgument("Target level exceeds number of levels"); } + const ReadOptions read_options(Env::IOActivity::kCompaction); + SuperVersionContext sv_context(/* create_superversion */ true); InstrumentedMutexLock guard_lock(&mutex_); @@ -1753,8 +1764,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, - &mutex_, directories_.GetDbDir()); + Status status = + versions_->LogAndApply(cfd, mutable_cf_options, read_options, &edit, + &mutex_, directories_.GetDbDir()); cfd->compaction_picker()->UnregisterCompaction(c.get()); c.reset(); @@ -3189,6 +3201,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, mutex_.AssertHeld(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); + const ReadOptions read_options(Env::IOActivity::kCompaction); + bool is_manual = (manual_compaction != nullptr); std::unique_ptr c; if (prepicked_compaction != nullptr && @@ -3399,9 +3413,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } - status = versions_->LogAndApply(c->column_family_data(), - *c->mutable_cf_options(), c->edit(), - 
&mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply( + c->column_family_data(), *c->mutable_cf_options(), read_options, + c->edit(), &mutex_, directories_.GetDbDir()); io_s = versions_->io_status(); InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], @@ -3418,9 +3432,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->column_family_data()); // Instrument for event update // TODO(yhchiang): add op details for showing trivial-move. - ThreadStatusUtil::SetColumnFamily( - c->column_family_data(), c->column_family_data()->ioptions()->env, - immutable_db_options_.enable_thread_tracking); + ThreadStatusUtil::SetColumnFamily(c->column_family_data()); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); compaction_job_stats.num_input_files = c->num_input_files(0); @@ -3466,9 +3478,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, vstorage->GetNextCompactCursor(start_level, c->num_input_files(0))); } } - status = versions_->LogAndApply(c->column_family_data(), - *c->mutable_cf_options(), c->edit(), - &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply( + c->column_family_data(), *c->mutable_cf_options(), read_options, + c->edit(), &mutex_, directories_.GetDbDir()); io_s = versions_->io_status(); // Use latest MutableCFOptions InstallSuperVersionAndScheduleWork(c->column_family_data(), @@ -3782,6 +3794,8 @@ void DBImpl::BuildCompactionJobInfo( compaction_job_info->table_properties = c->GetOutputTableProperties(); compaction_job_info->compaction_reason = c->compaction_reason(); compaction_job_info->compression = c->output_compression(); + + const ReadOptions read_options(Env::IOActivity::kCompaction); for (size_t i = 0; i < c->num_input_levels(); ++i) { for (const auto fmd : *c->inputs(i)) { const FileDescriptor& desc = fmd->fd; @@ -3793,7 +3807,7 @@ void DBImpl::BuildCompactionJobInfo( static_cast(i), file_number, fmd->oldest_blob_file_number}); if (compaction_job_info->table_properties.count(fn) == 0) { std::shared_ptr tp; - auto s = current->GetTableProperties(&tp, fmd, &fn); + auto s = current->GetTableProperties(read_options, &tp, fmd, &fn); if (s.ok()) { compaction_job_info->table_properties[fn] = tp; } diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 3e43fe498..5e3b7ba61 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -61,7 +61,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { "PromoteL0 FAILED. 
Invalid target level %d\n", target_level); return Status::InvalidArgument("Invalid target level"); } - + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status status; VersionEdit edit; JobContext job_context(next_job_id_.fetch_add(1), true); @@ -141,7 +142,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_contexts[0], diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 1758f4cc6..91ef84266 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -19,6 +19,7 @@ #include "file/writable_file_writer.h" #include "logging/logging.h" #include "monitoring/persistent_stats_history.h" +#include "monitoring/thread_status_util.h" #include "options/options_helper.h" #include "rocksdb/table.h" #include "rocksdb/wal_filter.h" @@ -923,8 +924,9 @@ Status DBImpl::InitPersistStatsColumnFamily() { Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) { mutex_.AssertHeld(); assert(versions_->descriptor_log_ == nullptr); + const ReadOptions read_options(Env::IOActivity::kDBOpen); Status s = versions_->LogAndApply( - recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_, + recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_, read_options, recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir()); if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) { mutex_.Unlock(); @@ -1577,6 +1579,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = Env::IOActivity::kDBOpen; Arena arena; Status s; TableProperties table_properties; @@ -1635,10 +1638,11 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, SeqnoToTimeMapping empty_seqno_time_mapping; Version* version = cfd->current(); version->Ref(); + const ReadOptions read_option(Env::IOActivity::kDBOpen); s = BuildTable( dbname_, versions_.get(), immutable_db_options_, tboptions, - file_options_for_compaction_, cfd->table_cache(), iter.get(), - std::move(range_del_iters), &meta, &blob_file_additions, + file_options_for_compaction_, read_option, cfd->table_cache(), + iter.get(), std::move(range_del_iters), &meta, &blob_file_additions, snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber, snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s, io_tracer_, BlobFileCreationReason::kRecovery, @@ -1739,8 +1743,12 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, std::vector* handles, DB** dbptr) { const bool kSeqPerBatch = true; const bool kBatchPerTxn = true; - return DBImpl::Open(db_options, dbname, column_families, handles, dbptr, - !kSeqPerBatch, kBatchPerTxn); + ThreadStatusUtil::SetEnableTracking(db_options.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_DBOPEN); + Status s = DBImpl::Open(db_options, dbname, column_families, handles, dbptr, + !kSeqPerBatch, kBatchPerTxn); + ThreadStatusUtil::ResetThreadStatus(); + return s; } // TODO: Implement the trimming in flush code path. 
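The `DB::Open` wrapper above sets the thread operation to `OP_DBOPEN` before calling `DBImpl::Open` and resets it afterwards, while recovery helpers such as `LogAndApplyForRecovery` and `WriteLevel0TableForRecovery` construct `ReadOptions(Env::IOActivity::kDBOpen)` for their reads. The sketch below mirrors that shape with illustrative stand-ins (a thread-local operation flag and a simplified `ReadOptions`), not the actual `ThreadStatusUtil` API:
```
#include <iostream>

enum class IOActivity { kUnknown, kFlush, kCompaction, kDBOpen };
enum class OperationType { kUnknown, kDBOpen };

thread_local OperationType tls_operation = OperationType::kUnknown;

struct ReadOptions {
  IOActivity io_activity = IOActivity::kUnknown;
  ReadOptions() = default;
  explicit ReadOptions(IOActivity activity) : io_activity(activity) {}
};

// Stand-in for a recovery step such as replaying the manifest: it prepares a
// ReadOptions tagged with kDBOpen so any read it triggers is attributed to
// the open path rather than to user activity.
void RecoverManifest() {
  const ReadOptions read_options(IOActivity::kDBOpen);
  std::cout << "recovering, thread op=" << static_cast<int>(tls_operation)
            << " io_activity=" << static_cast<int>(read_options.io_activity)
            << "\n";
}

bool Open() {
  // Mirrors wrapping the open call: set the thread operation first and reset
  // it afterwards regardless of the outcome.
  tls_operation = OperationType::kDBOpen;
  RecoverManifest();
  tls_operation = OperationType::kUnknown;
  return true;
}

int main() {
  std::cout << (Open() ? "opened" : "failed") << "\n";
  return 0;
}
```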
diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index 13bc37edb..871cf8085 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -40,6 +40,11 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val, std::string* timestamp) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } assert(pinnable_val != nullptr); // TODO: stopwatch DB_GET needed?, perf timer needed? PERF_TIMER_GUARD(get_snapshot_time); @@ -112,6 +117,11 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options, Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } assert(column_family); if (read_options.timestamp) { const Status s = FailIfTsMismatchCf( diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index f4ee4afbc..c6fcefddc 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -345,6 +345,11 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val, std::string* timestamp) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } assert(pinnable_val != nullptr); PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); @@ -445,6 +450,11 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& read_options, return NewErrorIterator(Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators.")); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } assert(column_family); if (read_options.timestamp) { @@ -511,6 +521,11 @@ Status DBImplSecondary::NewIterators( return Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call NewIterators with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } ReadCallback* read_callback = nullptr; // No read callback provided. if (iterators == nullptr) { return Status::InvalidArgument("iterators not allowed to be nullptr"); diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 89a054e4c..fb74434dd 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -606,7 +606,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, log_write_mutex_.Unlock(); if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - status = ApplyWALToManifest(&synced_wals); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = ApplyWALToManifest(read_options, &synced_wals); } // Requesting sync with two_write_queues_ is expected to be very rare. 
We @@ -767,7 +769,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } if (w.status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - w.status = ApplyWALToManifest(&synced_wals); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + w.status = ApplyWALToManifest(read_options, &synced_wals); } write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); } @@ -1805,7 +1809,7 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread, bool delayed = false; { StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL, - &time_delayed); + Histograms::HISTOGRAM_ENUM_MAX, &time_delayed); // To avoid parallel timed delays (bad throttling), only support them // on the primary write queue. uint64_t delay; @@ -2086,6 +2090,8 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, // two_write_queues_ is true (This is to simplify the reasoning.) Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutex_.AssertHeld(); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; IOStatus io_s; @@ -2237,8 +2243,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { VersionEdit wal_deletion; wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); - s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApplyToDefaultColumnFamily( + read_options, &wal_deletion, &mutex_, directories_.GetDbDir()); if (!s.ok() && versions_->io_status().IsIOError()) { s = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); diff --git a/db/db_iter.cc b/db/db_iter.cc index d564a7ac1..efa5746ff 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -196,6 +196,7 @@ bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to // avoid having to copy options back and forth. + // TODO: plumb Env::IOActivity ReadOptions read_options; read_options.read_tier = read_tier_; read_options.fill_cache = fill_cache_; diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 074f4e9a8..085ee064c 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -2333,8 +2333,9 @@ TEST_F(DBPropertiesTest, TableMetaIndexKeys) { // Read metaindex BlockContents bc; - ASSERT_OK(ReadMetaIndexBlockInFile(r.get(), file_size, 0U, - ImmutableOptions(options), &bc)); + const ReadOptions read_options; + ASSERT_OK(ReadMetaIndexBlockInFile( + r.get(), file_size, 0U, ImmutableOptions(options), read_options, &bc)); Block metaindex_block(std::move(bc)); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewMetaIterator()); diff --git a/db/db_rate_limiter_test.cc b/db/db_rate_limiter_test.cc index acea673cb..84c2df230 100644 --- a/db/db_rate_limiter_test.cc +++ b/db/db_rate_limiter_test.cc @@ -235,8 +235,9 @@ TEST_P(DBRateLimiterOnReadTest, VerifyChecksum) { ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER)); ASSERT_OK(db_->VerifyChecksum(GetReadOptions())); - // The files are tiny so there should have just been one read per file. 
- int expected = kNumFiles; + // There are 3 reads per file: ReadMetaIndexBlock, + // VerifyChecksumInMetaBlocks, VerifyChecksumInBlocks + int expected = kNumFiles * 3; ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER)); } diff --git a/db/experimental.cc b/db/experimental.cc index 0e49eeca0..c2dce7fde 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -38,6 +38,8 @@ Status UpdateManifestForFilesState( const DBOptions& db_opts, const std::string& db_name, const std::vector& column_families, const UpdateManifestForFilesStateOptions& opts) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; OfflineManifestWriter w(db_opts, db_name); Status s = w.Recover(column_families); @@ -114,7 +116,7 @@ Status UpdateManifestForFilesState( std::unique_ptr db_dir; s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = w.LogAndApply(cfd, &edit, db_dir.get()); + s = w.LogAndApply(read_options, cfd, &edit, db_dir.get()); } if (s.ok()) { ++cfs_updated; diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 428c8bc6a..98bd6050a 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -692,6 +692,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( // If customized readahead size is needed, we can pass a user option // all the way to here. Right now we just rely on the default readahead // to keep things simple. + // TODO: plumb Env::IOActivity ReadOptions ro; ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; status = table_reader->VerifyChecksum( @@ -745,6 +746,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( file_to_ingest->num_range_deletions = props->num_range_deletions; ParsedInternalKey key; + // TODO: plumb Env::IOActivity ReadOptions ro; std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, @@ -855,6 +857,7 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( bool overlap_with_db = false; Arena arena; + // TODO: plumb Env::IOActivity ReadOptions ro; ro.total_order_seek = true; int target_level = 0; @@ -1088,4 +1091,3 @@ Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) { } } // namespace ROCKSDB_NAMESPACE - diff --git a/db/flush_job.cc b/db/flush_job.cc index 8193f594f..a3ffc2707 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -141,11 +141,12 @@ FlushJob::FlushJob( FlushJob::~FlushJob() { ThreadStatusUtil::ResetThreadStatus(); } void FlushJob::ReportStartedFlush() { - ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env, - db_options_.enable_thread_tracking); + ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking); + ThreadStatusUtil::SetColumnFamily(cfd_); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH); ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, job_context_->job_id); + IOSTATS_RESET(bytes_written); } @@ -379,6 +380,7 @@ Status FlushJob::MemPurge() { // Create two iterators, one for the memtable data (contains // info from puts + deletes), and one for the memtable // Range Tombstones (from DeleteRanges). + // TODO: plumb Env::IOActivity ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -669,6 +671,7 @@ bool FlushJob::MemPurgeDecider(double threshold) { // Cochran formula for determining sample size. // 95% confidence interval, 7% precision. 
// n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0 + // TODO: plumb Env::IOActivity double n0 = 196.0; ReadOptions ro; ro.total_order_seek = true; @@ -841,6 +844,7 @@ Status FlushJob::WriteLevel0Table() { range_del_iters; ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = Env::IOActivity::kFlush; Arena arena; uint64_t total_num_entries = 0, total_num_deletes = 0; uint64_t total_data_size = 0; @@ -930,17 +934,19 @@ Status FlushJob::WriteLevel0Table() { meta_.fd.GetNumber()); const SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence(); - s = BuildTable( - dbname_, versions_, db_options_, tboptions, file_options_, - cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, - &blob_file_additions, existing_snapshots_, - earliest_write_conflict_snapshot_, job_snapshot_seq, - snapshot_checker_, mutable_cf_options_.paranoid_file_checks, - cfd_->internal_stats(), &io_s, io_tracer_, - BlobFileCreationReason::kFlush, seqno_to_time_mapping_, event_logger_, - job_context_->job_id, io_priority, &table_properties_, write_hint, - full_history_ts_low, blob_callback_, base_, &num_input_entries, - &memtable_payload_bytes, &memtable_garbage_bytes); + const ReadOptions read_options(Env::IOActivity::kFlush); + s = BuildTable(dbname_, versions_, db_options_, tboptions, file_options_, + read_options, cfd_->table_cache(), iter.get(), + std::move(range_del_iters), &meta_, &blob_file_additions, + existing_snapshots_, earliest_write_conflict_snapshot_, + job_snapshot_seq, snapshot_checker_, + mutable_cf_options_.paranoid_file_checks, + cfd_->internal_stats(), &io_s, io_tracer_, + BlobFileCreationReason::kFlush, seqno_to_time_mapping_, + event_logger_, job_context_->job_id, io_priority, + &table_properties_, write_hint, full_history_ts_low, + blob_callback_, base_, &num_input_entries, + &memtable_payload_bytes, &memtable_garbage_bytes); // TODO: Cleanup io_status in BuildTable and table builders assert(!s.ok() || io_s.ok()); io_s.PermitUncheckedError(); diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index e637cb01d..12d2519e9 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -275,6 +275,7 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( // in file_meta. 
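In `FlushJob::WriteLevel0Table` above, the flush path now tags the `ReadOptions` it hands to table building with `Env::IOActivity::kFlush`, in addition to setting `total_order_seek`. A compact stand-in of that hand-off follows, with a fake `BuildTable` that only reports which activity its reads would carry (none of these types are the real RocksDB ones):
```
#include <iostream>
#include <vector>

enum class IOActivity { kUnknown, kFlush, kCompaction, kDBOpen };

struct ReadOptions {
  bool total_order_seek = false;
  IOActivity io_activity = IOActivity::kUnknown;
};

// Fake table builder: it only reports which activity its reads would carry.
int BuildTable(const ReadOptions& read_options,
               const std::vector<int>& memtable_entries) {
  std::cout << "building L0 table, io_activity="
            << static_cast<int>(read_options.io_activity) << "\n";
  return static_cast<int>(memtable_entries.size());
}

int WriteLevel0Table(const std::vector<int>& memtable_entries) {
  ReadOptions ro;
  ro.total_order_seek = true;           // the flush scans everything in order
  ro.io_activity = IOActivity::kFlush;  // reads below count as flush I/O
  return BuildTable(ro, memtable_entries);
}

int main() {
  std::cout << WriteLevel0Table({1, 2, 3}) << " entries flushed\n";
  return 0;
}
```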
if (file_meta.smallest.empty()) { assert(file_meta.largest.empty()); + // TODO: plumb Env::IOActivity ReadOptions ro; std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, @@ -350,4 +351,3 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( return status; } } // namespace ROCKSDB_NAMESPACE - diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 5b76a7883..c75668c0d 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -1148,7 +1148,9 @@ bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) { bool InternalStats::HandleAggregatedTableProperties(std::string* value, Slice /*suffix*/) { std::shared_ptr tp; - auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); if (!s.ok()) { return false; } @@ -1168,7 +1170,9 @@ static std::map MapUint64ValuesToString( bool InternalStats::HandleAggregatedTablePropertiesMap( std::map* values, Slice /*suffix*/) { std::shared_ptr tp; - auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); if (!s.ok()) { return false; } @@ -1184,8 +1188,10 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values, return false; } std::shared_ptr tp; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties( - &tp, static_cast(level)); + read_options, &tp, static_cast(level)); if (!s.ok()) { return false; } @@ -1201,8 +1207,10 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap( return false; } std::shared_ptr tp; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties( - &tp, static_cast(level)); + read_options, &tp, static_cast(level)); if (!s.ok()) { return false; } @@ -1397,7 +1405,11 @@ bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, DBImpl* /*db*/, Version* version) { - *value = (version == nullptr) ? 0 : version->GetMemoryUsageByTableReaders(); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + *value = (version == nullptr) + ? 
0 + : version->GetMemoryUsageByTableReaders(read_options); return true; } @@ -1448,9 +1460,10 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, ->compaction_options_fifo.allow_compaction) { return false; } - + // TODO: plumb Env::IOActivity + const ReadOptions read_options; TablePropertiesCollection collection; - auto s = cfd_->current()->GetPropertiesOfAllTables(&collection); + auto s = cfd_->current()->GetPropertiesOfAllTables(read_options, &collection); if (!s.ok()) { return false; } diff --git a/db/memtable.cc b/db/memtable.cc index b99e1d345..e61ddc9db 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -599,6 +599,7 @@ void MemTable::ConstructFragmentedRangeTombstones() { assert(!IsFragmentedRangeTombstonesConstructed(false)); // There should be no concurrent Construction if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { + // TODO: plumb Env::IOActivity auto* unfragmented_iter = new MemTableIterator(*this, ReadOptions(), nullptr /* arena */, true /* use_range_del_table */); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index ebcdf9b8e..ee1563f01 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -467,6 +467,8 @@ Status MemTableList::TryInstallMemtableFlushResults( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); mu->AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kFlush); + // Flush was successful // Record the status on the memtable object. Either this call or a call by a // concurrent flush thread will read the status and write it to manifest. @@ -578,8 +580,8 @@ Status MemTableList::TryInstallMemtableFlushResults( }; if (write_edits) { // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, - db_directory, /*new_descriptor_log=*/false, + s = vset->LogAndApply(cfd, mutable_cf_options, read_options, edit_list, + mu, db_directory, /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, manifest_write_cb); } else { @@ -798,6 +800,8 @@ Status InstallMemtableAtomicFlushResults( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); mu->AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kFlush); + size_t num = mems_list.size(); assert(cfds.size() == num); if (imm_lists != nullptr) { @@ -875,8 +879,8 @@ Status InstallMemtableAtomicFlushResults( } // this can release and reacquire the mutex. - s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - db_directory); + s = vset->LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, db_directory); for (size_t k = 0; k != cfds.size(); ++k) { auto* imm = (imm_lists == nullptr) ? 
cfds[k]->imm() : imm_lists->at(k); diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 3e78dbe27..bb8691b96 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -187,7 +187,8 @@ TEST_F(PerfContextTest, StopWatchOverhead) { uint64_t elapsed = 0; std::vector timings(kTotalIterations); - StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed); + StopWatch timer(SystemClock::Default().get(), nullptr, 0, + Histograms::HISTOGRAM_ENUM_MAX, &elapsed); for (auto& timing : timings) { timing = elapsed; } diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 737ad4ed2..cc28b9f19 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -329,21 +329,23 @@ class TestPlainTableFactory : public PlainTableFactory { std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const override { std::unique_ptr props; + const ReadOptions read_options; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, &props); + table_reader_options.ioptions, read_options, + &props); EXPECT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, + table_reader_options.ioptions, read_options, BloomBlockBuilder::kBloomBlock, &bloom_block_handle); EXPECT_TRUE(s.ok()); BlockHandle index_block_handle; s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, + table_reader_options.ioptions, read_options, PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); EXPECT_TRUE(s.ok()); @@ -1344,4 +1346,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - diff --git a/db/repair.cc b/db/repair.cc index 0b3e120c9..b4b9d0c5f 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -145,6 +145,8 @@ class Repairer { // Adds a column family to the VersionSet with cf_options_ and updates // manifest. 
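The `StopWatch` call sites above now pass an explicit histogram slot, with `Histograms::HISTOGRAM_ENUM_MAX` serving as the "do not record" value for callers that only want the elapsed time. Below is a simplified scoped timer with the same contract; the sentinel and the histogram storage are stand-ins, not the RocksDB `Statistics` interface:
```
#include <chrono>
#include <cstdint>
#include <iostream>
#include <vector>

constexpr uint32_t kNoHistogram = UINT32_MAX;  // sentinel: do not record

class ScopedTimer {
 public:
  ScopedTimer(std::vector<uint64_t>* histograms, uint32_t hist_index,
              uint64_t* elapsed_micros)
      : histograms_(histograms),
        hist_index_(hist_index),
        elapsed_micros_(elapsed_micros),
        start_(std::chrono::steady_clock::now()) {}

  ~ScopedTimer() {
    const auto end = std::chrono::steady_clock::now();
    const uint64_t micros = static_cast<uint64_t>(
        std::chrono::duration_cast<std::chrono::microseconds>(end - start_)
            .count());
    if (elapsed_micros_ != nullptr) {
      *elapsed_micros_ = micros;  // always report to the caller if asked
    }
    if (histograms_ != nullptr && hist_index_ != kNoHistogram) {
      (*histograms_)[hist_index_] += micros;  // record only with a real slot
    }
  }

 private:
  std::vector<uint64_t>* histograms_;
  uint32_t hist_index_;
  uint64_t* elapsed_micros_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  std::vector<uint64_t> histograms(4, 0);
  uint64_t elapsed = 0;
  {
    // Measurement-only caller: passes the sentinel, nothing is recorded.
    ScopedTimer t(&histograms, kNoHistogram, &elapsed);
  }
  {
    // Recording caller: names a histogram slot, elapsed pointer is optional.
    ScopedTimer t(&histograms, /*hist_index=*/2, nullptr);
  }
  std::cout << "elapsed=" << elapsed << " hist[2]=" << histograms[2] << "\n";
  return 0;
}
```
Keeping the sentinel explicit at each call site makes it visible which timers feed a histogram and which are measurement-only.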
Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) { + // TODO: plumb Env::IOActivity; + const ReadOptions read_options; const auto* cf_opts = GetColumnFamilyOptions(cf_name); if (cf_opts == nullptr) { return Status::Corruption("Encountered unknown column family with name=" + @@ -166,8 +168,9 @@ class Repairer { Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, nullptr); if (status.ok()) { - status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_, db_dir.get(), - false /* new_descriptor_log */, cf_opts); + status = vset_.LogAndApply(cfd, mut_cf_opts, read_options, &edit, &mutex_, + db_dir.get(), false /* new_descriptor_log */, + cf_opts); } mutex_.Unlock(); return status; @@ -357,6 +360,9 @@ class Repairer { } }; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + // Open the log file std::string logname = LogFileName(wal_dir, log); const auto& fs = env_->GetFileSystem(); @@ -422,6 +428,7 @@ class Repairer { FileMetaData meta; meta.fd = FileDescriptor(next_file_number_++, 0, 0); + // TODO: plumb Env::IOActivity ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -456,7 +463,7 @@ class Repairer { SeqnoToTimeMapping empty_seqno_time_mapping; status = BuildTable( dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, - file_options_, table_cache_.get(), iter.get(), + file_options_, read_options, table_cache_.get(), iter.get(), std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker, false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, @@ -509,8 +516,10 @@ class Repairer { file_size); std::shared_ptr props; if (status.ok()) { - status = table_cache_->GetTableProperties(file_options_, icmp_, t->meta, - &props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = table_cache_->GetTableProperties(file_options_, read_options, + icmp_, t->meta, &props); } if (status.ok()) { auto s = @@ -556,6 +565,7 @@ class Repairer { } } if (status.ok()) { + // TODO: plumb Env::IOActivity ReadOptions ropts; ropts.total_order_seek = true; InternalIterator* iter = table_cache_->NewIterator( @@ -603,6 +613,7 @@ class Repairer { // an SST file is a full sorted run. This probably needs the extra logic // from compaction_job.cc around call to UpdateBoundariesForRange (to // handle range tombstones extendingg beyond range of other entries). 
+ // TODO: plumb Env::IOActivity ReadOptions ropts; std::unique_ptr r_iter; status = table_cache_->GetRangeTombstoneIterator( @@ -625,6 +636,8 @@ class Repairer { } Status AddTables() { + // TODO: plumb Env::IOActivity; + const ReadOptions read_options; std::unordered_map> cf_id_to_tables; SequenceNumber max_sequence = 0; for (size_t i = 0; i < tables_.size(); i++) { @@ -706,8 +719,8 @@ class Repairer { s = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, - &mutex_, db_dir.get(), + s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, db_dir.get(), false /* new_descriptor_log */); } mutex_.Unlock(); @@ -809,4 +822,3 @@ Status RepairDB(const std::string& dbname, const Options& options) { } } // namespace ROCKSDB_NAMESPACE - diff --git a/db/table_cache.cc b/db/table_cache.cc index f456260bc..28206ed35 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -560,7 +560,7 @@ Status TableCache::MultiGetFilter( } Status TableCache::GetTableProperties( - const FileOptions& file_options, + const FileOptions& file_options, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, @@ -574,7 +574,7 @@ Status TableCache::GetTableProperties( } TypedHandle* table_handle = nullptr; - Status s = FindTable(ReadOptions(), file_options, internal_comparator, + Status s = FindTable(read_options, file_options, internal_comparator, file_meta, &table_handle, prefix_extractor, no_io); if (!s.ok()) { return s; @@ -608,7 +608,7 @@ Status TableCache::ApproximateKeyAnchors( } size_t TableCache::GetMemoryUsageByTableReader( - const FileOptions& file_options, + const FileOptions& file_options, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor) { @@ -619,7 +619,7 @@ size_t TableCache::GetMemoryUsageByTableReader( } TypedHandle* table_handle = nullptr; - Status s = FindTable(ReadOptions(), file_options, internal_comparator, + Status s = FindTable(read_options, file_options, internal_comparator, file_meta, &table_handle, prefix_extractor, true); if (!s.ok()) { return 0; @@ -636,7 +636,8 @@ void TableCache::Evict(Cache* cache, uint64_t file_number) { } uint64_t TableCache::ApproximateOffsetOf( - const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, + const ReadOptions& read_options, const Slice& key, + const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, const std::shared_ptr& prefix_extractor) { uint64_t result = 0; @@ -645,7 +646,7 @@ uint64_t TableCache::ApproximateOffsetOf( if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); Status s = - FindTable(ReadOptions(), file_options_, internal_comparator, file_meta, + FindTable(read_options, file_options_, internal_comparator, file_meta, &table_handle, prefix_extractor, false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { @@ -654,7 +655,7 @@ uint64_t TableCache::ApproximateOffsetOf( } if (table_reader != nullptr) { - result = table_reader->ApproximateOffsetOf(key, caller); + result = table_reader->ApproximateOffsetOf(read_options, key, caller); } if (table_handle != nullptr) { cache_.Release(table_handle); @@ -664,8 +665,9 @@ uint64_t TableCache::ApproximateOffsetOf( } uint64_t 
TableCache::ApproximateSize( - const Slice& start, const Slice& end, const FileMetaData& file_meta, - TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + const ReadOptions& read_options, const Slice& start, const Slice& end, + const FileMetaData& file_meta, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; @@ -673,7 +675,7 @@ uint64_t TableCache::ApproximateSize( if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); Status s = - FindTable(ReadOptions(), file_options_, internal_comparator, file_meta, + FindTable(read_options, file_options_, internal_comparator, file_meta, &table_handle, prefix_extractor, false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { @@ -682,7 +684,7 @@ uint64_t TableCache::ApproximateSize( } if (table_reader != nullptr) { - result = table_reader->ApproximateSize(start, end, caller); + result = table_reader->ApproximateSize(read_options, start, end, caller); } if (table_handle != nullptr) { cache_.Release(table_handle); diff --git a/db/table_cache.h b/db/table_cache.h index 66282bf41..609e67498 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -179,7 +179,7 @@ class TableCache { // return Status::Incomplete() if table is not present in cache and // we set `no_io` to be true. Status GetTableProperties( - const FileOptions& toptions, + const FileOptions& toptions, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, @@ -194,22 +194,23 @@ class TableCache { // Return total memory usage of the table reader of the file. // 0 if table reader of the file is not loaded. size_t GetMemoryUsageByTableReader( - const FileOptions& toptions, + const FileOptions& toptions, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated offset of a key in a file represented by fd. uint64_t ApproximateOffsetOf( - const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, + const ReadOptions& read_options, const Slice& key, + const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated data size between start and end keys in a file // represented by fd (the start key must not be greater than the end key). 
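The `TableCache` changes above replace internally default-constructed `ReadOptions()` with the caller's `read_options`, so an activity tag set higher up survives down to `FindTable` and the table reader. A before/after stand-in of why that matters (simplified types, not the real API):
```
#include <iostream>

enum class IOActivity { kUnknown, kFlush, kCompaction, kDBOpen };

struct ReadOptions {
  IOActivity io_activity = IOActivity::kUnknown;
};

// Before: the helper built its own ReadOptions, so whatever tag the caller
// had set was dropped at this boundary.
IOActivity ApproximateOffsetBefore() {
  ReadOptions read_options;  // always kUnknown here
  return read_options.io_activity;
}

// After: the helper takes the caller's ReadOptions, so a compaction read
// stays attributed to compaction all the way down.
IOActivity ApproximateOffsetAfter(const ReadOptions& read_options) {
  return read_options.io_activity;
}

int main() {
  ReadOptions compaction_opts;
  compaction_opts.io_activity = IOActivity::kCompaction;
  std::cout << static_cast<int>(ApproximateOffsetBefore()) << " "
            << static_cast<int>(ApproximateOffsetAfter(compaction_opts))
            << "\n";  // prints "0 2": the tag survives only in the second case
  return 0;
}
```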
uint64_t ApproximateSize( - const Slice& start, const Slice& end, const FileMetaData& file_meta, - TableReaderCaller caller, + const ReadOptions& read_options, const Slice& start, const Slice& end, + const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, const std::shared_ptr& prefix_extractor = nullptr); diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 20f37e0c9..437b7e309 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -292,8 +292,9 @@ void TestCustomizedTablePropertiesCollector( new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; + const ReadOptions read_options; Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), - magic_number, ioptions, &props); + magic_number, ioptions, read_options, &props); ASSERT_OK(s); auto user_collected = props->user_collected_properties; @@ -429,8 +430,10 @@ void TestInternalKeyPropertiesCollector( new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; - Status s = ReadTableProperties(reader.get(), fwf->contents().size(), - magic_number, ioptions, &props); + const ReadOptions read_options; + Status s = + ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, + ioptions, read_options, &props); ASSERT_OK(s); auto user_collected = props->user_collected_properties; diff --git a/db/version_builder.cc b/db/version_builder.cc index 4f0e3a841..64590db5c 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -1257,7 +1257,7 @@ class VersionBuilder::Rep { InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin) { + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) { assert(table_cache_ != nullptr); size_t table_cache_capacity = @@ -1324,7 +1324,7 @@ class VersionBuilder::Rep { int level = files_meta[file_idx].second; TableCache::TypedHandle* handle = nullptr; statuses[file_idx] = table_cache_->FindTable( - ReadOptions(), file_options_, + read_options, file_options_, *(base_vstorage_->InternalComparator()), *file_meta, &handle, prefix_extractor, false /*no_io */, true /* record_read_stats */, internal_stats->GetFileReadHist(level), false, level, @@ -1384,10 +1384,11 @@ Status VersionBuilder::LoadTableHandlers( InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin) { - return rep_->LoadTableHandlers( - internal_stats, max_threads, prefetch_index_and_filter_in_cache, - is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin); + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) { + return rep_->LoadTableHandlers(internal_stats, max_threads, + prefetch_index_and_filter_in_cache, + is_initial_load, prefix_extractor, + max_file_size_for_l0_meta_pin, read_options); } uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { diff --git a/db/version_builder.h b/db/version_builder.h index 682d60524..8e7dd9e66 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -48,7 +48,7 @@ class VersionBuilder { InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin); + size_t 
max_file_size_for_l0_meta_pin, const ReadOptions& read_options); uint64_t GetMinOldestBlobFileNumber() const; private: diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index f7a148968..7ea176e01 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -155,8 +155,9 @@ VersionEditHandler::VersionEditHandler( bool read_only, std::vector column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, const std::shared_ptr& io_tracer, - bool skip_load_table_files, EpochNumberRequirement epoch_number_requirement) - : VersionEditHandlerBase(), + const ReadOptions& read_options, bool skip_load_table_files, + EpochNumberRequirement epoch_number_requirement) + : VersionEditHandlerBase(read_options), read_only_(read_only), column_families_(std::move(column_families)), version_set_(version_set), @@ -480,7 +481,8 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, ColumnFamilyData* VersionEditHandler::CreateCfAndInit( const ColumnFamilyOptions& cf_options, const VersionEdit& edit) { - ColumnFamilyData* cfd = version_set_->CreateColumnFamily(cf_options, &edit); + ColumnFamilyData* cfd = + version_set_->CreateColumnFamily(cf_options, read_options_, &edit); assert(cfd != nullptr); cfd->set_initialized(); assert(builders_.find(edit.column_family_) == builders_.end()); @@ -537,7 +539,7 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, if (s.ok()) { // Install new version v->PrepareAppend( - *cfd->GetLatestMutableCFOptions(), + *cfd->GetLatestMutableCFOptions(), read_options_, !(version_set_->db_options_->skip_stats_update_on_db_open)); version_set_->AppendVersion(cfd, v); } else { @@ -569,7 +571,8 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, version_set_->db_options_->max_file_opening_threads, prefetch_index_and_filter_in_cache, is_initial_load, cfd->GetLatestMutableCFOptions()->prefix_extractor, - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()), + read_options_); if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { s = Status::OK(); } @@ -647,11 +650,12 @@ Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement) : VersionEditHandler(read_only, column_families, version_set, /*track_missing_files=*/true, /*no_error_if_files_missing=*/true, io_tracer, - epoch_number_requirement) {} + read_options, epoch_number_requirement) {} VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { for (const auto& elem : versions_) { @@ -816,7 +820,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, false, true, cfd->GetLatestMutableCFOptions()->prefix_extractor, - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()), + read_options_); if (!s.ok()) { delete version; if (s.IsCorruption()) { @@ -827,7 +832,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( s = builder->SaveTo(version->storage_info()); if (s.ok()) { version->PrepareAppend( - *cfd->GetLatestMutableCFOptions(), + *cfd->GetLatestMutableCFOptions(), read_options_, 
!version_set_->db_options_->skip_stats_update_on_db_open); auto v_iter = versions_.find(cfd->GetID()); if (v_iter != versions_.end()) { @@ -847,7 +852,8 @@ Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& fmeta) { - return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta); + return version_set_->VerifyFileMetadata(read_options_, cfd, fpath, level, + fmeta); } Status VersionEditHandlerPointInTime::VerifyBlobFile( @@ -856,7 +862,9 @@ Status VersionEditHandlerPointInTime::VerifyBlobFile( BlobSource* blob_source = cfd->blob_source(); assert(blob_source); CacheHandleGuard blob_file_reader; - Status s = blob_source->GetBlobFileReader(blob_file_num, &blob_file_reader); + + Status s = blob_source->GetBlobFileReader(read_options_, blob_file_num, + &blob_file_reader); if (!s.ok()) { return s; } diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index fc3fe7c6b..4b9f19542 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -19,8 +19,9 @@ struct FileMetaData; class VersionEditHandlerBase { public: - explicit VersionEditHandlerBase() - : max_manifest_read_size_(std::numeric_limits::max()) {} + explicit VersionEditHandlerBase(const ReadOptions& read_options) + : read_options_(read_options), + max_manifest_read_size_(std::numeric_limits::max()) {} virtual ~VersionEditHandlerBase() {} @@ -31,8 +32,9 @@ class VersionEditHandlerBase { AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; } protected: - explicit VersionEditHandlerBase(uint64_t max_read_size) - : max_manifest_read_size_(max_read_size) {} + explicit VersionEditHandlerBase(const ReadOptions& read_options, + uint64_t max_read_size) + : read_options_(read_options), max_manifest_read_size_(max_read_size) {} virtual Status Initialize() { return Status::OK(); } virtual Status ApplyVersionEdit(VersionEdit& edit, @@ -45,6 +47,8 @@ class VersionEditHandlerBase { Status status_; + const ReadOptions& read_options_; + private: AtomicGroupReadBuffer read_buffer_; const uint64_t max_manifest_read_size_; @@ -52,7 +56,8 @@ class VersionEditHandlerBase { class ListColumnFamiliesHandler : public VersionEditHandlerBase { public: - ListColumnFamiliesHandler() : VersionEditHandlerBase() {} + explicit ListColumnFamiliesHandler(const ReadOptions& read_options) + : VersionEditHandlerBase(read_options) {} ~ListColumnFamiliesHandler() override {} @@ -72,9 +77,9 @@ class ListColumnFamiliesHandler : public VersionEditHandlerBase { class FileChecksumRetriever : public VersionEditHandlerBase { public: - FileChecksumRetriever(uint64_t max_read_size, + FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size, FileChecksumList& file_checksum_list) - : VersionEditHandlerBase(max_read_size), + : VersionEditHandlerBase(read_options, max_read_size), file_checksum_list_(file_checksum_list) {} ~FileChecksumRetriever() override {} @@ -111,12 +116,13 @@ class VersionEditHandler : public VersionEditHandlerBase { VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent) - : VersionEditHandler(read_only, column_families, version_set, - track_missing_files, no_error_if_files_missing, - io_tracer, /*skip_load_table_files=*/false, - epoch_number_requirement) {} + : VersionEditHandler( + read_only, column_families, version_set, track_missing_files, + 
no_error_if_files_missing, io_tracer, read_options, + /*skip_load_table_files=*/false, epoch_number_requirement) {} ~VersionEditHandler() override {} @@ -137,7 +143,8 @@ class VersionEditHandler : public VersionEditHandlerBase { bool read_only, std::vector column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, - const std::shared_ptr& io_tracer, bool skip_load_table_files, + const std::shared_ptr& io_tracer, + const ReadOptions& read_options, bool skip_load_table_files, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent); @@ -212,6 +219,7 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { VersionEditHandlerPointInTime( bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent); ~VersionEditHandlerPointInTime() override; @@ -238,10 +246,11 @@ class ManifestTailer : public VersionEditHandlerPointInTime { explicit ManifestTailer(std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent) : VersionEditHandlerPointInTime(/*read_only=*/false, column_families, - version_set, io_tracer, + version_set, io_tracer, read_options, epoch_number_requirement), mode_(Mode::kRecovery) {} @@ -281,12 +290,13 @@ class DumpManifestHandler : public VersionEditHandler { public: DumpManifestHandler(std::vector column_families, VersionSet* version_set, - const std::shared_ptr& io_tracer, bool verbose, - bool hex, bool json) + const std::shared_ptr& io_tracer, + const ReadOptions& read_options, bool verbose, bool hex, + bool json) : VersionEditHandler( /*read_only=*/true, column_families, version_set, /*track_missing_files=*/false, - /*no_error_if_files_missing=*/false, io_tracer, + /*no_error_if_files_missing=*/false, io_tracer, read_options, /*skip_load_table_files=*/true), verbose_(verbose), hex_(hex), diff --git a/db/version_set.cc b/db/version_set.cc index 125f03c9b..9f1888c78 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1527,13 +1527,14 @@ void LevelIterator::InitFileIterator(size_t new_file_index) { } } // anonymous namespace -Status Version::GetTableProperties(std::shared_ptr* tp, +Status Version::GetTableProperties(const ReadOptions& read_options, + std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname) const { auto table_cache = cfd_->table_cache(); auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( - file_options_, cfd_->internal_comparator(), *file_meta, tp, + file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp, mutable_cf_options_.prefix_extractor, true /* no io */); if (s.ok()) { return s; @@ -1565,14 +1566,16 @@ Status Version::GetTableProperties(std::shared_ptr* tp, // the magic number check in the footer. 
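`VersionEditHandlerBase` above now stores `const ReadOptions& read_options_`, initialized from the constructor argument, and every derived handler forwards it. A stripped-down mirror of that base-class shape follows (stand-in types only); the lifetime comment is the important part, since a reference member must not outlive the options it refers to:
```
#include <iostream>

enum class IOActivity { kUnknown, kFlush, kCompaction, kDBOpen };

struct ReadOptions {
  IOActivity io_activity = IOActivity::kUnknown;
};

class HandlerBase {
 public:
  explicit HandlerBase(const ReadOptions& read_options)
      : read_options_(read_options) {}
  virtual ~HandlerBase() = default;

 protected:
  // Held by reference: the ReadOptions must outlive the handler, which holds
  // for manifest replay where both live in the same caller frame.
  const ReadOptions& read_options_;
};

class RecoveryHandler : public HandlerBase {
 public:
  using HandlerBase::HandlerBase;
  void Iterate() const {
    std::cout << "replaying manifest, io_activity="
              << static_cast<int>(read_options_.io_activity) << "\n";
  }
};

int main() {
  ReadOptions read_options;
  read_options.io_activity = IOActivity::kDBOpen;
  RecoveryHandler handler(read_options);  // options outlive the handler
  handler.Iterate();
  return 0;
}
```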
std::unique_ptr file_reader( new RandomAccessFileReader( - std::move(file), file_name, nullptr /* env */, io_tracer_, - nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, - nullptr /* rate_limiter */, ioptions->listeners)); + std::move(file), file_name, ioptions->clock /* clock */, io_tracer_, + ioptions->stats /* stats */, + Histograms::SST_READ_MICROS /* hist_type */, + nullptr /* file_read_hist */, nullptr /* rate_limiter */, + ioptions->listeners)); std::unique_ptr props; s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), Footer::kNullTableMagicNumber /* table's magic number */, *ioptions, - &props); + read_options, &props); if (!s.ok()) { return s; } @@ -1581,10 +1584,11 @@ Status Version::GetTableProperties(std::shared_ptr* tp, return s; } -Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { +Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props) { Status s; for (int level = 0; level < storage_info_.num_levels_; level++) { - s = GetPropertiesOfAllTables(props, level); + s = GetPropertiesOfAllTables(read_options, props, level); if (!s.ok()) { return s; } @@ -1602,6 +1606,8 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, std::stringstream ss; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; for (int level = 0; level < storage_info_.num_levels_; level++) { for (const auto& file_meta : storage_info_.files_[level]) { auto fname = @@ -1614,7 +1620,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, std::unique_ptr tombstone_iter; Status s = table_cache->GetRangeTombstoneIterator( - ReadOptions(), cfd_->internal_comparator(), *file_meta, + read_options, cfd_->internal_comparator(), *file_meta, &tombstone_iter); if (!s.ok()) { return s; @@ -1648,7 +1654,8 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, return Status::OK(); } -Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, +Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props, int level) { for (const auto& file_meta : storage_info_.files_[level]) { auto fname = @@ -1657,7 +1664,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, // 1. If the table is already present in table cache, load table // properties from there. std::shared_ptr table_properties; - Status s = GetTableProperties(&table_properties, file_meta, &fname); + Status s = + GetTableProperties(read_options, &table_properties, file_meta, &fname); if (s.ok()) { props->insert({fname, table_properties}); } else { @@ -1669,7 +1677,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, } Status Version::GetPropertiesOfTablesInRange( - const Range* range, std::size_t n, TablePropertiesCollection* props) const { + const ReadOptions& read_options, const Range* range, std::size_t n, + TablePropertiesCollection* props) const { for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { for (decltype(n) i = 0; i < n; i++) { // Convert user_key into a corresponding internal key. @@ -1686,7 +1695,8 @@ Status Version::GetPropertiesOfTablesInRange( // 1. If the table is already present in table cache, load table // properties from there. 
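The `RandomAccessFileReader` constructed above for the fallback property read now receives `ioptions->clock`, `ioptions->stats` and `Histograms::SST_READ_MICROS` instead of null placeholders, so its reads are timed into the read-latency histogram. A stand-in reader that records only when a stats object is supplied (the `Stats` type and histogram name here are illustrative, not the RocksDB classes):
```
#include <chrono>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <thread>

struct Stats {
  std::map<std::string, uint64_t> histogram_sums;
  std::map<std::string, uint64_t> histogram_counts;
  void Record(const std::string& name, uint64_t micros) {
    histogram_sums[name] += micros;
    histogram_counts[name] += 1;
  }
};

class TimedReader {
 public:
  TimedReader(Stats* stats, std::string hist_name)
      : stats_(stats), hist_name_(std::move(hist_name)) {}

  // "Read" that just sleeps; the point is the timing and recording logic.
  void Read() const {
    const auto start = std::chrono::steady_clock::now();
    std::this_thread::sleep_for(std::chrono::microseconds(50));
    const auto micros = std::chrono::duration_cast<std::chrono::microseconds>(
                            std::chrono::steady_clock::now() - start)
                            .count();
    if (stats_ != nullptr) {  // null stats means "do not record"
      stats_->Record(hist_name_, static_cast<uint64_t>(micros));
    }
  }

 private:
  Stats* stats_;
  std::string hist_name_;
};

int main() {
  Stats stats;
  TimedReader reader(&stats, "sst.read.micros");
  reader.Read();
  reader.Read();
  std::cout << "count=" << stats.histogram_counts["sst.read.micros"]
            << " sum=" << stats.histogram_sums["sst.read.micros"] << "\n";
  return 0;
}
```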
std::shared_ptr table_properties; - Status s = GetTableProperties(&table_properties, file_meta, &fname); + Status s = GetTableProperties(read_options, &table_properties, + file_meta, &fname); if (s.ok()) { props->insert({fname, table_properties}); } else { @@ -1701,13 +1711,14 @@ Status Version::GetPropertiesOfTablesInRange( } Status Version::GetAggregatedTableProperties( - std::shared_ptr* tp, int level) { + const ReadOptions& read_options, std::shared_ptr* tp, + int level) { TablePropertiesCollection props; Status s; if (level < 0) { - s = GetPropertiesOfAllTables(&props); + s = GetPropertiesOfAllTables(read_options, &props); } else { - s = GetPropertiesOfAllTables(&props, level); + s = GetPropertiesOfAllTables(read_options, &props, level); } if (!s.ok()) { return s; @@ -1721,12 +1732,12 @@ Status Version::GetAggregatedTableProperties( return Status::OK(); } -size_t Version::GetMemoryUsageByTableReaders() { +size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) { size_t total_usage = 0; for (auto& file_level : storage_info_.level_files_brief_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( - file_options_, cfd_->internal_comparator(), + file_options_, read_options, cfd_->internal_comparator(), *file_level.files[i].file_metadata, mutable_cf_options_.prefix_extractor); } @@ -2984,24 +2995,26 @@ void VersionStorageInfo::PrepareForVersionAppend( } void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options, + const ReadOptions& read_options, bool update_stats) { TEST_SYNC_POINT_CALLBACK( "Version::PrepareAppend:forced_check", reinterpret_cast(&storage_info_.force_consistency_checks_)); if (update_stats) { - UpdateAccumulatedStats(); + UpdateAccumulatedStats(read_options); } storage_info_.PrepareForVersionAppend(*cfd_->ioptions(), mutable_cf_options); } -bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { +bool Version::MaybeInitializeFileMetaData(const ReadOptions& read_options, + FileMetaData* file_meta) { if (file_meta->init_stats_from_file || file_meta->compensated_file_size > 0) { return false; } std::shared_ptr tp; - Status s = GetTableProperties(&tp, file_meta); + Status s = GetTableProperties(read_options, &tp, file_meta); file_meta->init_stats_from_file = true; if (!s.ok()) { ROCKS_LOG_ERROR(vset_->db_options_->info_log, @@ -3046,7 +3059,7 @@ void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) { } } -void Version::UpdateAccumulatedStats() { +void Version::UpdateAccumulatedStats(const ReadOptions& read_options) { // maximum number of table properties loaded from files. const int kMaxInitCount = 20; int init_count = 0; @@ -3064,7 +3077,7 @@ void Version::UpdateAccumulatedStats() { level < storage_info_.num_levels_ && init_count < kMaxInitCount; ++level) { for (auto* file_meta : storage_info_.files_[level]) { - if (MaybeInitializeFileMetaData(file_meta)) { + if (MaybeInitializeFileMetaData(read_options, file_meta)) { // each FileMeta will be initialized only once. 
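`Version::UpdateAccumulatedStats` above threads `read_options` into `MaybeInitializeFileMetaData` while keeping the existing cap of `kMaxInitCount` table reads at this point. A simplified version of that capped, lazy initialization loop (everything except the constant is a stand-in):
```
#include <cstdint>
#include <iostream>
#include <vector>

enum class IOActivity { kUnknown, kFlush, kCompaction, kDBOpen };

struct ReadOptions {
  IOActivity io_activity = IOActivity::kUnknown;
};

struct FileMeta {
  bool init_stats_from_file = false;
  uint64_t compensated_file_size = 0;
};

// Returns true only when this call actually loaded properties for the file;
// a real implementation would read the table using read_options.
bool MaybeInitializeFileMetaData(const ReadOptions& /*read_options*/,
                                 FileMeta* meta) {
  if (meta->init_stats_from_file || meta->compensated_file_size > 0) {
    return false;  // already initialized earlier
  }
  meta->init_stats_from_file = true;
  return true;
}

void UpdateAccumulatedStats(const ReadOptions& read_options,
                            std::vector<FileMeta>& files) {
  const int kMaxInitCount = 20;  // bound the table reads done at this point
  int init_count = 0;
  for (FileMeta& meta : files) {
    if (init_count >= kMaxInitCount) {
      break;
    }
    if (MaybeInitializeFileMetaData(read_options, &meta)) {
      ++init_count;
    }
  }
  std::cout << "initialized " << init_count << " files\n";
}

int main() {
  std::vector<FileMeta> files(30);
  ReadOptions read_options;
  read_options.io_activity = IOActivity::kDBOpen;
  UpdateAccumulatedStats(read_options, files);  // prints "initialized 20 files"
  return 0;
}
```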
storage_info_.UpdateAccumulatedStats(file_meta); // when option "max_open_files" is -1, all the file metadata has @@ -3089,7 +3102,8 @@ void Version::UpdateAccumulatedStats() { storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) { for (int i = static_cast(storage_info_.files_[level].size()) - 1; storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) { - if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) { + if (MaybeInitializeFileMetaData(read_options, + storage_info_.files_[level][i])) { storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]); } } @@ -4971,7 +4985,8 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Status VersionSet::ProcessManifestWrites( std::deque& writers, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options) { + const ColumnFamilyOptions* new_cf_options, + const ReadOptions& read_options) { mu->AssertHeld(); assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); @@ -5202,7 +5217,7 @@ Status VersionSet::ProcessManifestWrites( true /* prefetch_index_and_filter_in_cache */, false /* is_initial_load */, mutable_cf_options_ptrs[i]->prefix_extractor, - MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i])); + MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options); if (!s.ok()) { if (db_options_->paranoid_checks) { break; @@ -5247,7 +5262,8 @@ Status VersionSet::ProcessManifestWrites( constexpr bool update_stats = true; for (int i = 0; i < static_cast(versions.size()); ++i) { - versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], update_stats); + versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], read_options, + update_stats); } } @@ -5359,7 +5375,8 @@ Status VersionSet::ProcessManifestWrites( assert(batch_edits.size() == 1); assert(new_cf_options != nullptr); assert(max_last_sequence == descriptor_last_sequence_); - CreateColumnFamily(*new_cf_options, first_writer.edit_list.front()); + CreateColumnFamily(*new_cf_options, read_options, + first_writer.edit_list.front()); } else if (first_writer.edit_list.front()->is_column_family_drop_) { assert(batch_edits.size() == 1); assert(max_last_sequence == descriptor_last_sequence_); @@ -5528,6 +5545,7 @@ void VersionSet::WakeUpWaitingManifestWriters() { Status VersionSet::LogAndApply( const autovector& column_family_datas, const autovector& mutable_cf_options_list, + const ReadOptions& read_options, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options, @@ -5605,7 +5623,8 @@ Status VersionSet::LogAndApply( return Status::ColumnFamilyDropped(); } return ProcessManifestWrites(writers, mu, dir_contains_current_file, - new_descriptor_log, new_cf_options); + new_descriptor_log, new_cf_options, + read_options); } void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, @@ -5689,6 +5708,7 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname, Status VersionSet::Recover( const std::vector& column_families, bool read_only, std::string* db_id, bool no_error_if_files_missing) { + const ReadOptions read_options(Env::IOActivity::kDBOpen); // Read "CURRENT" file, which contains a pointer to the current manifest // file std::string manifest_path; @@ -5725,7 +5745,7 @@ Status VersionSet::Recover( VersionEditHandler handler( read_only, column_families, const_cast(this), /*track_missing_files=*/false, no_error_if_files_missing, 
io_tracer_, - EpochNumberRequirement::kMightMissing); + read_options, EpochNumberRequirement::kMightMissing); handler.Iterate(reader, &log_read_status); s = handler.status(); if (s.ok()) { @@ -5873,6 +5893,7 @@ Status VersionSet::TryRecoverFromOneManifest( const std::string& manifest_path, const std::vector& column_families, bool read_only, std::string* db_id, bool* has_missing_table_file) { + const ReadOptions read_options(Env::IOActivity::kDBOpen); ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n", manifest_path.c_str()); std::unique_ptr manifest_file_reader; @@ -5897,7 +5918,7 @@ Status VersionSet::TryRecoverFromOneManifest( /*checksum=*/true, /*log_num=*/0); VersionEditHandlerPointInTime handler_pit( read_only, column_families, const_cast(this), io_tracer_, - EpochNumberRequirement::kMightMissing); + read_options, EpochNumberRequirement::kMightMissing); handler_pit.Iterate(reader, &s); @@ -5940,6 +5961,8 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, Status VersionSet::ListColumnFamiliesFromManifest( const std::string& manifest_path, FileSystem* fs, std::vector* column_families) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::unique_ptr file_reader; Status s; { @@ -5959,7 +5982,7 @@ Status VersionSet::ListColumnFamiliesFromManifest( log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - ListColumnFamiliesHandler handler; + ListColumnFamiliesHandler handler(read_options); handler.Iterate(reader, &s); assert(column_families); @@ -5982,6 +6005,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, "Number of levels needs to be bigger than 1"); } + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + ImmutableDBOptions db_options(*options); ColumnFamilyOptions cf_options(*options); std::shared_ptr tc(NewLRUCache(options->max_open_files - 10, @@ -6069,8 +6095,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, InstrumentedMutex dummy_mutex; InstrumentedMutexLock l(&dummy_mutex); return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), - mutable_cf_options, &ve, &dummy_mutex, nullptr, - true); + mutable_cf_options, read_options, &ve, + &dummy_mutex, nullptr, true); } // Get the checksum information including the checksum and checksum function @@ -6143,6 +6169,9 @@ Status VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex, bool json) { assert(options.env); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + std::vector column_families; Status s = ListColumnFamiliesFromManifest( dscname, options.env->GetFileSystem().get(), &column_families); @@ -6169,7 +6198,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, cf_descs.emplace_back(cf, options); } - DumpManifestHandler handler(cf_descs, this, io_tracer_, verbose, hex, json); + DumpManifestHandler handler(cf_descs, this, io_tracer_, read_options, verbose, + hex, json); { VersionSet::LogReporter reporter; reporter.status = &s; @@ -6372,6 +6402,7 @@ Status VersionSet::WriteCurrentStateToManifest( // we avoid doing binary search for the keys b and c twice and instead somehow // maintain state of where they first appear in the files. 
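For reference, the recovery paths above construct their ReadOptions with the new explicit constructor so that every read issued while opening the DB carries an activity tag. A minimal sketch of that pattern, assuming an existing RandomAccessFileReader named `reader` and eliding error handling:

```
// Sketch only (not part of the patch): tag reads issued during DB open.
const ReadOptions read_options(Env::IOActivity::kDBOpen);

IOOptions opts;
// PrepareIOOptions() forwards to PrepareIOFromReadOptions(), which now copies
// read_options.io_activity into opts.io_activity, so the tag reaches
// Read()/MultiRead() and the underlying FileSystem.
IOStatus io_s = reader->PrepareIOOptions(read_options, opts);
```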
uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, + const ReadOptions& read_options, Version* v, const Slice& start, const Slice& end, int start_level, int end_level, TableReaderCaller caller) { @@ -6451,8 +6482,8 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, for (int i = idx_start + 1; i < idx_end; ++i) { uint64_t file_size = files_brief.files[i].fd.GetFileSize(); // The entire file falls into the range, so we can just take its size. - assert(file_size == - ApproximateSize(v, files_brief.files[i], start, end, caller)); + assert(file_size == ApproximateSize(read_options, v, files_brief.files[i], + start, end, caller)); total_full_size += file_size; } @@ -6487,21 +6518,24 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, // Estimate for all the first files (might also be last files), at each // level for (const auto file_ptr : first_files) { - total_full_size += ApproximateSize(v, *file_ptr, start, end, caller); + total_full_size += + ApproximateSize(read_options, v, *file_ptr, start, end, caller); } // Estimate for all the last files, at each level for (const auto file_ptr : last_files) { // We could use ApproximateSize here, but calling ApproximateOffsetOf // directly is just more efficient. - total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller); + total_full_size += + ApproximateOffsetOf(read_options, v, *file_ptr, end, caller); } } return total_full_size; } -uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, +uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options, + Version* v, const FdWithKeyRange& f, const Slice& key, TableReaderCaller caller) { // pre-condition @@ -6521,14 +6555,15 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, TableCache* table_cache = v->cfd_->table_cache(); if (table_cache != nullptr) { result = table_cache->ApproximateOffsetOf( - key, *f.file_metadata, caller, icmp, + read_options, key, *f.file_metadata, caller, icmp, v->GetMutableCFOptions().prefix_extractor); } } return result; } -uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, +uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options, + Version* v, const FdWithKeyRange& f, const Slice& start, const Slice& end, TableReaderCaller caller) { // pre-condition @@ -6544,13 +6579,14 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, if (icmp.Compare(f.smallest_key, start) >= 0) { // Start of the range is before the file start - approximate by end offset - return ApproximateOffsetOf(v, f, end, caller); + return ApproximateOffsetOf(read_options, v, f, end, caller); } if (icmp.Compare(f.largest_key, end) < 0) { // End of the range is after the file end - approximate by subtracting // start offset from the file size - uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller); + uint64_t start_offset = + ApproximateOffsetOf(read_options, v, f, start, caller); assert(f.fd.GetFileSize() >= start_offset); return f.fd.GetFileSize() - start_offset; } @@ -6561,7 +6597,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, return 0; } return table_cache->ApproximateSize( - start, end, *f.file_metadata, caller, icmp, + read_options, start, end, *f.file_metadata, caller, icmp, v->GetMutableCFOptions().prefix_extractor); } @@ -6852,7 +6888,8 @@ void VersionSet::GetObsoleteFiles(std::vector* files, } ColumnFamilyData* VersionSet::CreateColumnFamily( - const 
ColumnFamilyOptions& cf_options, const VersionEdit* edit) { + const ColumnFamilyOptions& cf_options, const ReadOptions& read_options, + const VersionEdit* edit) { assert(edit->is_column_family_add_); MutableCFOptions dummy_cf_options; @@ -6871,7 +6908,8 @@ ColumnFamilyData* VersionSet::CreateColumnFamily( constexpr bool update_stats = false; - v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), update_stats); + v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), read_options, + update_stats); AppendVersion(new_cfd, v); // GetLatestMutableCFOptions() is safe here without mutex since the @@ -6936,7 +6974,8 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) { return all_versions_blob_file_size; } -Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd, +Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options, + ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& meta) { uint64_t fsize = 0; @@ -6969,7 +7008,7 @@ Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd, TableCache::TypedHandle* handle = nullptr; FileMetaData meta_copy = meta; status = table_cache->FindTable( - ReadOptions(), file_opts, *icmp, meta_copy, &handle, pe, + read_options, file_opts, *icmp, meta_copy, &handle, pe, /*no_io=*/false, /*record_read_stats=*/true, internal_stats->GetFileReadHist(level), false, level, /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, @@ -7013,9 +7052,9 @@ Status ReactiveVersionSet::Recover( log::Reader* reader = manifest_reader->get(); assert(reader); - manifest_tailer_.reset( - new ManifestTailer(column_families, const_cast(this), - io_tracer_, EpochNumberRequirement::kMightMissing)); + manifest_tailer_.reset(new ManifestTailer( + column_families, const_cast(this), io_tracer_, + read_options_, EpochNumberRequirement::kMightMissing)); manifest_tailer_->Iterate(*reader, manifest_reader_status->get()); diff --git a/db/version_set.h b/db/version_set.h index 8d0633ea1..e7e96bc6c 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -913,7 +913,7 @@ class Version { // populates derived data structures. Call without mutex held. It needs to be // called before appending the version to the version set. void PrepareAppend(const MutableCFOptions& mutable_cf_options, - bool update_stats); + const ReadOptions& read_options, bool update_stats); // Reference count management (so Versions do not disappear out from // under live iterators) @@ -943,7 +943,8 @@ class Version { // specified in "file_meta". If the file name of "file_meta" is // known ahead, passing it by a non-null "fname" can save a // file-name conversion. - Status GetTableProperties(std::shared_ptr* tp, + Status GetTableProperties(const ReadOptions& read_options, + std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname = nullptr) const; @@ -951,9 +952,12 @@ class Version { // On success, *props will be populated with all SSTables' table properties. // The keys of `props` are the sst file name, the values of `props` are the // tables' properties, represented as std::shared_ptr. 
- Status GetPropertiesOfAllTables(TablePropertiesCollection* props); - Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); - Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, + Status GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props); + Status GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props, int level); + Status GetPropertiesOfTablesInRange(const ReadOptions& read_options, + const Range* range, std::size_t n, TablePropertiesCollection* props) const; // Print summary of range delete tombstones in SST files into out_str, @@ -965,13 +969,14 @@ class Version { // On success, "tp" will contains the aggregated table property among // the table properties of all sst files in this version. Status GetAggregatedTableProperties( + const ReadOptions& read_options, std::shared_ptr* tp, int level = -1); uint64_t GetEstimatedActiveKeys() { return storage_info_.GetEstimatedActiveKeys(); } - size_t GetMemoryUsageByTableReaders(); + size_t GetMemoryUsageByTableReaders(const ReadOptions& read_options); ColumnFamilyData* cfd() const { return cfd_; } @@ -1024,11 +1029,12 @@ class Version { // The helper function of UpdateAccumulatedStats, which may fill the missing // fields of file_meta from its associated TableProperties. // Returns true if it does initialize FileMetaData. - bool MaybeInitializeFileMetaData(FileMetaData* file_meta); + bool MaybeInitializeFileMetaData(const ReadOptions& read_options, + FileMetaData* file_meta); // Update the accumulated stats associated with the current version. // This accumulated stats will be used in compaction. - void UpdateAccumulatedStats(); + void UpdateAccumulatedStats(const ReadOptions& read_options); DECLARE_SYNC_AND_ASYNC( /* ret_type */ Status, /* func_name */ MultiGetFromSST, @@ -1136,13 +1142,13 @@ class VersionSet { virtual ~VersionSet(); Status LogAndApplyToDefaultColumnFamily( - VersionEdit* edit, InstrumentedMutex* mu, + const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr) { ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); const MutableCFOptions* cf_options = default_cf->GetLatestMutableCFOptions(); - return LogAndApply(default_cf, *cf_options, edit, mu, + return LogAndApply(default_cf, *cf_options, read_options, edit, mu, dir_contains_current_file, new_descriptor_log, column_family_options); } @@ -1155,9 +1161,9 @@ class VersionSet { // REQUIRES: no other thread concurrently calls LogAndApply() Status LogAndApply( ColumnFamilyData* column_family_data, - const MutableCFOptions& mutable_cf_options, VersionEdit* edit, - InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, - bool new_descriptor_log = false, + const MutableCFOptions& mutable_cf_options, + const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, + FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr) { autovector cfds; cfds.emplace_back(column_family_data); @@ -1167,8 +1173,8 @@ class VersionSet { autovector edit_list; edit_list.emplace_back(edit); edit_lists.emplace_back(edit_list); - return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - dir_contains_current_file, new_descriptor_log, + return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, 
dir_contains_current_file, new_descriptor_log, column_family_options); } // The batch version. If edit_list.size() > 1, caller must ensure that @@ -1176,6 +1182,7 @@ class VersionSet { Status LogAndApply( ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, + const ReadOptions& read_options, const autovector& edit_list, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr, @@ -1186,8 +1193,8 @@ class VersionSet { mutable_cf_options_list.emplace_back(&mutable_cf_options); autovector> edit_lists; edit_lists.emplace_back(edit_list); - return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - dir_contains_current_file, new_descriptor_log, + return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, dir_contains_current_file, new_descriptor_log, column_family_options, {manifest_wcb}); } @@ -1197,6 +1204,7 @@ class VersionSet { virtual Status LogAndApply( const autovector& cfds, const autovector& mutable_cf_options_list, + const ReadOptions& read_options, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, @@ -1427,7 +1435,8 @@ class VersionSet { // Return the approximate size of data to be scanned for range [start, end) // in levels [start_level, end_level). If end_level == -1 it will search // through all non-empty levels - uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v, + uint64_t ApproximateSize(const SizeApproximationOptions& options, + const ReadOptions& read_options, Version* v, const Slice& start, const Slice& end, int start_level, int end_level, TableReaderCaller caller); @@ -1487,7 +1496,8 @@ class VersionSet { new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_); constexpr bool update_stats = false; - version->PrepareAppend(mutable_cf_options, update_stats); + const ReadOptions read_options; + version->PrepareAppend(mutable_cf_options, read_options, update_stats); AppendVersion(cfd, version); } @@ -1516,14 +1526,15 @@ class VersionSet { void Reset(); // Returns approximated offset of a key in a file for a given version. - uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, - const Slice& key, TableReaderCaller caller); + uint64_t ApproximateOffsetOf(const ReadOptions& read_options, Version* v, + const FdWithKeyRange& f, const Slice& key, + TableReaderCaller caller); // Returns approximated data size between start and end keys in a file // for a given version. - uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& start, const Slice& end, - TableReaderCaller caller); + uint64_t ApproximateSize(const ReadOptions& read_options, Version* v, + const FdWithKeyRange& f, const Slice& start, + const Slice& end, TableReaderCaller caller); struct MutableCFState { uint64_t log_number; @@ -1542,9 +1553,11 @@ class VersionSet { void AppendVersion(ColumnFamilyData* column_family_data, Version* v); ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, + const ReadOptions& read_options, const VersionEdit* edit); - Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath, + Status VerifyFileMetadata(const ReadOptions& read_options, + ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& meta); // Protected by DB mutex. 
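With the overloads above, internal call sites now supply a ReadOptions tagged with the activity they run under. A rough sketch of what a flush-side caller looks like after this change (the surrounding objects are assumed to exist; the actual call sites live elsewhere in this patch):

```
// Rough sketch of a post-change call site; versions, cfd, mutable_cf_options,
// edit, mu and db_directory are assumed from the surrounding code.
const ReadOptions read_options(Env::IOActivity::kFlush);
Status s = versions->LogAndApply(cfd, mutable_cf_options, read_options, &edit,
                                 mu, db_directory);
```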
@@ -1620,7 +1633,8 @@ class VersionSet { InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options); + const ColumnFamilyOptions* new_cf_options, + const ReadOptions& read_options); void LogAndApplyCFHelper(VersionEdit* edit, SequenceNumber* max_last_sequence); @@ -1672,13 +1686,15 @@ class ReactiveVersionSet : public VersionSet { private: std::unique_ptr manifest_tailer_; - + // TODO: plumb Env::IOActivity + const ReadOptions read_options_; using VersionSet::LogAndApply; using VersionSet::Recover; Status LogAndApply( const autovector& /*cfds*/, const autovector& /*mutable_cf_options_list*/, + const ReadOptions& /* read_options */, const autovector>& /*edit_lists*/, InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/, bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/, diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 0815d4cab..481dd46d9 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -1307,9 +1307,9 @@ class VersionSetTestBase { Status LogAndApplyToDefaultCF(VersionEdit& edit) { mutex_.Lock(); - Status s = - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_, nullptr); + Status s = versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, &edit, &mutex_, nullptr); mutex_.Unlock(); return s; } @@ -1321,9 +1321,9 @@ class VersionSetTestBase { vedits.push_back(e.get()); } mutex_.Lock(); - Status s = - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, vedits, &mutex_, nullptr); + Status s = versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, vedits, &mutex_, nullptr); mutex_.Unlock(); return s; } @@ -1335,7 +1335,7 @@ class VersionSetTestBase { VersionEdit dummy; ASSERT_OK(versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - &dummy, &mutex_, db_directory, new_descriptor_log)); + read_options_, &dummy, &mutex_, db_directory, new_descriptor_log)); mutex_.Unlock(); } @@ -1350,7 +1350,8 @@ class VersionSetTestBase { Status s; mutex_.Lock(); s = versions_->LogAndApply(/*column_family_data=*/nullptr, - MutableCFOptions(cf_options), &new_cf, &mutex_, + MutableCFOptions(cf_options), read_options_, + &new_cf, &mutex_, /*db_directory=*/nullptr, /*new_descriptor_log=*/false, &cf_options); mutex_.Unlock(); @@ -1372,6 +1373,7 @@ class VersionSetTestBase { ColumnFamilyOptions cf_options_; ImmutableOptions immutable_options_; MutableCFOptions mutable_cf_options_; + const ReadOptions read_options_; std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; @@ -1395,6 +1397,8 @@ class VersionSetTest : public VersionSetTestBase, public testing::Test { TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { NewDB(); const int kGroupSize = 5; + const ReadOptions read_options; + autovector edits; for (int i = 0; i != kGroupSize; ++i) { edits.emplace_back(VersionEdit()); @@ -1421,8 +1425,8 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { }); SyncPoint::GetInstance()->EnableProcessing(); mutex_.Lock(); - Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists, - &mutex_, nullptr); + Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, read_options, + edit_lists, &mutex_, nullptr); mutex_.Unlock(); EXPECT_OK(s); EXPECT_EQ(kGroupSize - 
1, count); @@ -1622,9 +1626,9 @@ TEST_F(VersionSetTest, ObsoleteBlobFile) { edit.AddBlobFileGarbage(blob_file_number, total_blob_count, total_blob_bytes); mutex_.Lock(); - Status s = - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_, nullptr); + Status s = versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, &edit, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); @@ -2242,7 +2246,7 @@ class VersionSetWithTimestampTest : public VersionSetTest { Status s; mutex_.Lock(); s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()), - edits_, &mutex_, nullptr); + read_options_, edits_, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end())); @@ -2252,6 +2256,9 @@ class VersionSetWithTimestampTest : public VersionSetTest { ColumnFamilyData* cfd_{nullptr}; // edits_ must contain and own pointers to heap-alloc VersionEdit objects. autovector edits_; + + private: + const ReadOptions read_options_; }; const std::string VersionSetWithTimestampTest::kNewCfName("new_cf"); @@ -2680,6 +2687,8 @@ class VersionSetTestDropOneCF : public VersionSetTestBase, // Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and // last column family in an atomic group. TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { + const ReadOptions read_options; + std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; @@ -2709,7 +2718,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { mutex_.Lock(); s = versions_->LogAndApply(cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), - &drop_cf_edit, &mutex_, nullptr); + read_options, &drop_cf_edit, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); @@ -2758,8 +2767,8 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { }); SyncPoint::GetInstance()->EnableProcessing(); mutex_.Lock(); - s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists, &mutex_, - nullptr); + s = versions_->LogAndApply(cfds, mutable_cf_options_list, read_options, + edit_lists, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); ASSERT_EQ(1, called); diff --git a/db/version_util.h b/db/version_util.h index 5ec6fda11..e39f25571 100644 --- a/db/version_util.h +++ b/db/version_util.h @@ -33,14 +33,15 @@ class OfflineManifestWriter { /*no_error_if_files_missing*/ true); } - Status LogAndApply(ColumnFamilyData* cfd, VersionEdit* edit, + Status LogAndApply(const ReadOptions& read_options, ColumnFamilyData* cfd, + VersionEdit* edit, FSDirectory* dir_contains_current_file) { // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`. InstrumentedMutex mutex; mutex.Lock(); - Status s = versions_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - edit, &mutex, dir_contains_current_file, - false /* new_descriptor_log */); + Status s = versions_.LogAndApply( + cfd, *cfd->GetLatestMutableCFOptions(), read_options, edit, &mutex, + dir_contains_current_file, false /* new_descriptor_log */); mutex.Unlock(); return s; } diff --git a/db/write_batch.cc b/db/write_batch.cc index 5f5c0bfcd..7fa0ed694 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -2036,6 +2036,7 @@ class MemTableInserter : public WriteBatch::Handler { // key not found in memtable. 
Do sst get, update, add SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; + // TODO: plumb Env::IOActivity ReadOptions ropts; // it's going to be overwritten for sure, so no point caching data block // containing the old version @@ -2480,6 +2481,7 @@ class MemTableInserter : public WriteBatch::Handler { // operations in the same batch. SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; + // TODO: plumb Env::IOActivity ReadOptions read_options; read_options.snapshot = &read_from_snapshot; diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index dbe912fe9..d16fefe4c 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -311,6 +311,8 @@ DECLARE_int32(create_timestamped_snapshot_one_in); DECLARE_bool(allow_data_in_errors); +DECLARE_bool(enable_thread_tracking); + // Tiered storage DECLARE_bool(enable_tiered_storage); // set last_level_temperature DECLARE_int64(preclude_last_level_data_seconds); diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc index 2c8dcf610..4bf82c9d1 100644 --- a/db_stress_tool/db_stress_driver.cc +++ b/db_stress_tool/db_stress_driver.cc @@ -55,8 +55,7 @@ void ThreadBody(void* v) { } } } - -bool RunStressTest(SharedState* shared) { +bool RunStressTestImpl(SharedState* shared) { SystemClock* clock = db_stress_env->GetSystemClock().get(); StressTest* stress = shared->GetStressTest(); @@ -207,5 +206,11 @@ bool RunStressTest(SharedState* shared) { } return true; } +bool RunStressTest(SharedState* shared) { + ThreadStatusUtil::RegisterThread(db_stress_env, ThreadStatus::USER); + bool result = RunStressTestImpl(shared); + ThreadStatusUtil::UnregisterThread(); + return result; +} } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h index af60df9bc..612d9fc6b 100644 --- a/db_stress_tool/db_stress_env_wrapper.h +++ b/db_stress_tool/db_stress_env_wrapper.h @@ -10,8 +10,30 @@ #ifdef GFLAGS #pragma once #include "db_stress_tool/db_stress_common.h" +#include "monitoring/thread_status_util.h" namespace ROCKSDB_NAMESPACE { +class DbStressRandomAccessFileWrapper : public FSRandomAccessFileOwnerWrapper { + public: + explicit DbStressRandomAccessFileWrapper( + std::unique_ptr&& target) + : FSRandomAccessFileOwnerWrapper(std::move(target)) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Read(offset, n, options, result, scratch, dbg); + } +}; + class DbStressFSWrapper : public FileSystemWrapper { public: explicit DbStressFSWrapper(const std::shared_ptr& t) @@ -19,6 +41,18 @@ class DbStressFSWrapper : public FileSystemWrapper { static const char* kClassName() { return "DbStressFS"; } const char* Name() const override { return kClassName(); } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* dbg) override { + std::unique_ptr file; + IOStatus s = target()->NewRandomAccessFile(f, file_opts, &file, dbg); + if (s.ok()) { + r->reset(new DbStressRandomAccessFileWrapper(std::move(file))); 
+ } + return s; + } + IOStatus DeleteFile(const std::string& f, const IOOptions& opts, IODebugContext* dbg) override { // We determine whether it is a manifest file by searching a strong, diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index dd6d04916..b6ee67269 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -1051,6 +1051,11 @@ DEFINE_bool(allow_data_in_errors, ROCKSDB_NAMESPACE::Options().allow_data_in_errors, "If true, allow logging data, e.g. key, value in LOG files."); +DEFINE_bool(enable_thread_tracking, + ROCKSDB_NAMESPACE::Options().enable_thread_tracking, + "If true, the status of the threads involved in this DB will be " + "tracked and available via GetThreadList() API."); + DEFINE_int32(verify_iterator_with_expected_state_one_in, 0, "If non-zero, when TestIterate() is to be called, there is a " "1/verify_iterator_with_expected_state_one_in " diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index e508dadb5..60a12b331 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -3207,6 +3207,8 @@ void InitializeOptionsFromFlags( } options.allow_data_in_errors = FLAGS_allow_data_in_errors; + + options.enable_thread_tracking = FLAGS_enable_thread_tracking; } void InitializeOptionsGeneral( diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index c37117921..c41c5051f 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -99,13 +99,6 @@ int db_stress_tool(int argc, char** argv) { env_wrapper_guard = std::make_shared( raw_env, std::make_shared(raw_env->GetFileSystem())); - if (!env_opts && !FLAGS_use_io_uring) { - // If using the default Env (Posix), wrap DbStressEnvWrapper with the - // legacy EnvWrapper. This is a workaround to prevent MultiGet and scans - // from failing when IO uring is disabled. The EnvWrapper - // has a default implementation of ReadAsync that redirects to Read. - env_wrapper_guard = std::make_shared(env_wrapper_guard); - } db_stress_env = env_wrapper_guard.get(); FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); diff --git a/db_stress_tool/multi_ops_txns_stress.cc b/db_stress_tool/multi_ops_txns_stress.cc index 4c05879aa..89b061004 100644 --- a/db_stress_tool/multi_ops_txns_stress.cc +++ b/db_stress_tool/multi_ops_txns_stress.cc @@ -1220,7 +1220,8 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { // which can be called before TransactionDB::Open() returns to caller. // Therefore, at that time, db_ and txn_db_ may still be nullptr. // Caller has to make sure that the race condition does not happen. 
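The stress-test wrapper above also serves as a template for how any FileSystem implementation can consume the new IOOptions::io_activity field. A minimal sketch, not part of this patch, that counts compaction reads separately:

```
// Minimal sketch (not part of this patch): count compaction reads by looking
// at the io_activity now carried in IOOptions.
#include <atomic>
#include <cstdint>
#include <memory>

#include "rocksdb/file_system.h"

namespace ROCKSDB_NAMESPACE {

class CountingRandomAccessFile : public FSRandomAccessFileOwnerWrapper {
 public:
  CountingRandomAccessFile(std::unique_ptr<FSRandomAccessFile>&& target,
                           std::atomic<uint64_t>* compaction_reads)
      : FSRandomAccessFileOwnerWrapper(std::move(target)),
        compaction_reads_(compaction_reads) {}

  IOStatus Read(uint64_t offset, size_t n, const IOOptions& options,
                Slice* result, char* scratch,
                IODebugContext* dbg) const override {
    if (options.io_activity == Env::IOActivity::kCompaction) {
      compaction_reads_->fetch_add(1, std::memory_order_relaxed);
    }
    return target()->Read(offset, n, options, result, scratch, dbg);
  }

 private:
  std::atomic<uint64_t>* compaction_reads_;
};

}  // namespace ROCKSDB_NAMESPACE
```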
-void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) { +void MultiOpsTxnsStressTest::VerifyPkSkFast(const ReadOptions& read_options, + int job_id) { DB* const db = db_aptr_.load(std::memory_order_acquire); if (db == nullptr) { return; @@ -1249,6 +1250,7 @@ void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) { ReadOptions ropts; ropts.snapshot = snapshot; ropts.total_order_seek = true; + ropts.io_activity = read_options.io_activity; std::unique_ptr it(db_->NewIterator(ropts)); for (it->Seek(start_key); it->Valid(); it->Next()) { diff --git a/db_stress_tool/multi_ops_txns_stress.h b/db_stress_tool/multi_ops_txns_stress.h index 26744df66..12c45aaa3 100644 --- a/db_stress_tool/multi_ops_txns_stress.h +++ b/db_stress_tool/multi_ops_txns_stress.h @@ -288,7 +288,7 @@ class MultiOpsTxnsStressTest : public StressTest { VerifyDb(thread); } - void VerifyPkSkFast(int job_id); + void VerifyPkSkFast(const ReadOptions& read_options, int job_id); protected: class Counter { @@ -424,7 +424,8 @@ class MultiOpsTxnsStressListener : public EventListener { (void)db; #endif assert(info.cf_id == 0); - stress_test_->VerifyPkSkFast(info.job_id); + const ReadOptions read_options(Env::IOActivity::kFlush); + stress_test_->VerifyPkSkFast(read_options, info.job_id); } void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override { @@ -433,7 +434,8 @@ class MultiOpsTxnsStressListener : public EventListener { (void)db; #endif assert(info.cf_id == 0); - stress_test_->VerifyPkSkFast(info.job_id); + const ReadOptions read_options(Env::IOActivity::kCompaction); + stress_test_->VerifyPkSkFast(read_options, info.job_id); } private: diff --git a/file/file_util.cc b/file/file_util.cc index 43608fcdc..46faac67c 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -185,9 +185,9 @@ IOStatus GenerateOneFileChecksum( if (!io_s.ok()) { return io_s; } - reader.reset(new RandomAccessFileReader(std::move(r_file), file_path, - nullptr /*Env*/, io_tracer, nullptr, - 0, nullptr, rate_limiter)); + reader.reset(new RandomAccessFileReader( + std::move(r_file), file_path, nullptr /*Env*/, io_tracer, nullptr, + Histograms::HISTOGRAM_ENUM_MAX, nullptr, rate_limiter)); } // Found that 256 KB readahead size provides the best performance, based on diff --git a/file/file_util.h b/file/file_util.h index d46a7ba0e..e279cfba0 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -80,6 +80,8 @@ inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, } opts.rate_limiter_priority = ro.rate_limiter_priority; + opts.io_activity = ro.io_activity; + return IOStatus::OK(); } diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index 226970641..2f8b51667 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -22,7 +22,11 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { - +const std::array + kReadHistograms{{ + FILE_READ_FLUSH_MICROS, + FILE_READ_COMPACTION_MICROS, + }}; inline void RecordIOStats(Statistics* stats, Temperature file_temperature, bool is_last_level, size_t size) { IOSTATS_ADD(bytes_read, size); @@ -94,6 +98,9 @@ IOStatus RandomAccessFileReader::Read( uint64_t elapsed = 0; { StopWatch sw(clock_, stats_, hist_type_, + (opts.io_activity != Env::IOActivity::kUnknown) + ? kReadHistograms[(std::size_t)(opts.io_activity)] + : Histograms::HISTOGRAM_ENUM_MAX, (stats_ != nullptr) ? 
&elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -288,6 +295,9 @@ IOStatus RandomAccessFileReader::MultiRead( uint64_t elapsed = 0; { StopWatch sw(clock_, stats_, hist_type_, + (opts.io_activity != Env::IOActivity::kUnknown) + ? kReadHistograms[(std::size_t)(opts.io_activity)] + : Histograms::HISTOGRAM_ENUM_MAX, (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -425,7 +435,7 @@ IOStatus RandomAccessFileReader::MultiRead( } IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro, - IOOptions& opts) { + IOOptions& opts) const { if (clock_ != nullptr) { return PrepareIOFromReadOptions(ro, clock_, opts); } else { @@ -476,13 +486,17 @@ IOStatus RandomAccessFileReader::ReadAsync( assert(read_async_info->buf_.CurrentSize() == 0); - StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed, - true /*overwrite*/, true /*delay_enabled*/); + StopWatch sw(clock_, nullptr /*stats*/, + Histograms::HISTOGRAM_ENUM_MAX /*hist_type*/, + Histograms::HISTOGRAM_ENUM_MAX, &elapsed, true /*overwrite*/, + true /*delay_enabled*/); s = file_->ReadAsync(aligned_req, opts, read_async_callback, read_async_info, io_handle, del_fn, nullptr /*dbg*/); } else { - StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed, - true /*overwrite*/, true /*delay_enabled*/); + StopWatch sw(clock_, nullptr /*stats*/, + Histograms::HISTOGRAM_ENUM_MAX /*hist_type*/, + Histograms::HISTOGRAM_ENUM_MAX, &elapsed, true /*overwrite*/, + true /*delay_enabled*/); s = file_->ReadAsync(req, opts, read_async_callback, read_async_info, io_handle, del_fn, nullptr /*dbg*/); } diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index 50baa0318..ab4d1e797 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -122,7 +122,8 @@ class RandomAccessFileReader { std::unique_ptr&& raf, const std::string& _file_name, SystemClock* clock = nullptr, const std::shared_ptr& io_tracer = nullptr, - Statistics* stats = nullptr, uint32_t hist_type = 0, + Statistics* stats = nullptr, + uint32_t hist_type = Histograms::HISTOGRAM_ENUM_MAX, HistogramImpl* file_read_hist = nullptr, RateLimiter* rate_limiter = nullptr, const std::vector>& listeners = {}, @@ -197,7 +198,7 @@ class RandomAccessFileReader { bool use_direct_io() const { return file_->use_direct_io(); } - IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts); + IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts) const; IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, std::function cb, diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 62af602c6..c6523f063 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -436,6 +436,14 @@ class Env : public Customizable { IO_TOTAL = 4 }; + // EXPERIMENTAL + enum class IOActivity : uint8_t { + kFlush = 0, + kCompaction = 1, + kDBOpen = 2, + kUnknown, + }; + // Arrange to run "(*function)(arg)" once in a background thread, in // the thread pool specified by pri. By default, jobs go to the 'LOW' // priority thread pool. diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 97b21e286..ae59ef800 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -116,6 +116,8 @@ struct IOOptions { // directories and list only files in GetChildren API. 
bool do_not_recurse; + Env::IOActivity io_activity = Env::IOActivity::kUnknown; + IOOptions() : IOOptions(false) {} explicit IOOptions(bool force_dir_fsync_) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 669afc1d4..611ba8b79 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1696,8 +1696,11 @@ struct ReadOptions { // Default: true bool optimize_multiget_for_io; + Env::IOActivity io_activity; + ReadOptions(); ReadOptions(bool cksum, bool cache); + explicit ReadOptions(Env::IOActivity io_activity); }; // Options that control write operations diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index f25e02ebe..b7a8905ff 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -472,7 +472,13 @@ enum Histograms : uint32_t { NUM_FILES_IN_SINGLE_COMPACTION, DB_SEEK, WRITE_STALL, + // Time spent in reading block-based or plain SST table SST_READ_MICROS, + // Time spent in reading SST table (currently only block-based table) or blob + // file for flush or compaction + FILE_READ_FLUSH_MICROS, + FILE_READ_COMPACTION_MICROS, + // The number of subcompactions actually scheduled during a compaction NUM_SUBCOMPACTIONS_SCHEDULED, // Value size distribution in each operation diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h index f37e45f97..beecdfd25 100644 --- a/include/rocksdb/thread_status.h +++ b/include/rocksdb/thread_status.h @@ -56,6 +56,7 @@ struct ThreadStatus { OP_UNKNOWN = 0, OP_COMPACTION, OP_FLUSH, + OP_DBOPEN, NUM_OP_TYPES }; diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index 91c088a60..947fcec55 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -681,4 +681,3 @@ class Transaction { }; } // namespace ROCKSDB_NAMESPACE - diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index ee87f8947..32dcca9df 100644 --- a/java/rocksjni/portal.h +++ b/java/rocksjni/portal.h @@ -5619,6 +5619,10 @@ class HistogramTypeJni { return 0x38; case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES: return 0x39; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS: + return 0x3A; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS: + return 0x3B; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. return 0x1F; @@ -5738,6 +5742,10 @@ class HistogramTypeJni { case 0x39: return ROCKSDB_NAMESPACE::Histograms:: TABLE_OPEN_PREFETCH_TAIL_READ_BYTES; + case 0x3A: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS; + case 0x3B: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS; case 0x1F: // 0x1F for backwards compatibility on current minor version. 
return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; @@ -6777,6 +6785,8 @@ class OperationTypeJni { return 0x1; case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_FLUSH: return 0x2; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_DBOPEN: + return 0x3; default: return 0x7F; // undefined } @@ -6793,6 +6803,8 @@ class OperationTypeJni { return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_COMPACTION; case 0x2: return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_FLUSH; + case 0x3: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_DBOPEN; default: // undefined/default return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_UNKNOWN; diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 20c54422c..c5da68d16 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -169,6 +169,10 @@ public enum HistogramType { */ TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x39), + FILE_READ_FLUSH_MICROS((byte) 0x3A), + + FILE_READ_COMPACTION_MICROS((byte) 0x3B), + // 0x1F for backwards compatibility on current minor version. HISTOGRAM_ENUM_MAX((byte) 0x1F); diff --git a/java/src/main/java/org/rocksdb/OperationType.java b/java/src/main/java/org/rocksdb/OperationType.java index 7cc9b65cd..301caea32 100644 --- a/java/src/main/java/org/rocksdb/OperationType.java +++ b/java/src/main/java/org/rocksdb/OperationType.java @@ -14,7 +14,8 @@ package org.rocksdb; public enum OperationType { OP_UNKNOWN((byte)0x0), OP_COMPACTION((byte)0x1), - OP_FLUSH((byte)0x2); + OP_FLUSH((byte) 0x2), + OP_DBOPEN((byte) 0x3); private final byte value; diff --git a/microbench/db_basic_bench.cc b/microbench/db_basic_bench.cc index 3f9ed8e63..fbcab5391 100644 --- a/microbench/db_basic_bench.cc +++ b/microbench/db_basic_bench.cc @@ -1548,7 +1548,8 @@ static void RandomAccessFileReaderRead(benchmark::State& state) { : Temperature::kCold; readers.emplace_back(new RandomAccessFileReader( std::move(f), fname, env->GetSystemClock().get(), nullptr, statistics, - 0, nullptr, nullptr, {}, temperature, rand_num == 1)); + Histograms::HISTOGRAM_ENUM_MAX, nullptr, nullptr, {}, temperature, + rand_num == 1)); } IOOptions io_options; diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 05de681a2..90e3fbda7 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -243,6 +243,8 @@ const std::vector> HistogramsNameMap = { {DB_SEEK, "rocksdb.db.seek.micros"}, {WRITE_STALL, "rocksdb.db.write.stall"}, {SST_READ_MICROS, "rocksdb.sst.read.micros"}, + {FILE_READ_FLUSH_MICROS, "rocksdb.file.read.flush.micros"}, + {FILE_READ_COMPACTION_MICROS, "rocksdb.file.read.compaction.micros"}, {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, {BYTES_PER_READ, "rocksdb.bytes.per.read"}, {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, diff --git a/monitoring/thread_status_updater.cc b/monitoring/thread_status_updater.cc index 9707d2265..37fcef62b 100644 --- a/monitoring/thread_status_updater.cc +++ b/monitoring/thread_status_updater.cc @@ -47,15 +47,19 @@ void ThreadStatusUpdater::ResetThreadStatus() { SetColumnFamilyInfoKey(nullptr); } +void ThreadStatusUpdater::SetEnableTracking(bool enable_tracking) { + auto* data = Get(); + if (data == nullptr) { + return; + } + data->enable_tracking.store(enable_tracking, std::memory_order_relaxed); +} + void ThreadStatusUpdater::SetColumnFamilyInfoKey(const void* cf_key) { auto* data = Get(); if (data == nullptr) { 
return; } - // set the tracking flag based on whether cf_key is non-null or not. - // If enable_thread_tracking is set to false, the input cf_key - // would be nullptr. - data->enable_tracking = (cf_key != nullptr); data->cf_key.store(const_cast(cf_key), std::memory_order_relaxed); } @@ -86,6 +90,14 @@ void ThreadStatusUpdater::SetThreadOperation( } } +ThreadStatus::OperationType ThreadStatusUpdater::GetThreadOperation() { + ThreadStatusData* data = GetLocalThreadStatus(); + if (data == nullptr) { + return ThreadStatus::OperationType::OP_UNKNOWN; + } + return data->operation_type.load(std::memory_order_relaxed); +} + void ThreadStatusUpdater::SetThreadOperationProperty(int i, uint64_t value) { auto* data = GetLocalThreadStatus(); if (data == nullptr) { @@ -211,9 +223,7 @@ ThreadStatusData* ThreadStatusUpdater::GetLocalThreadStatus() { if (thread_status_data_ == nullptr) { return nullptr; } - if (!thread_status_data_->enable_tracking) { - assert(thread_status_data_->cf_key.load(std::memory_order_relaxed) == - nullptr); + if (!thread_status_data_->enable_tracking.load(std::memory_order_relaxed)) { return nullptr; } return thread_status_data_; diff --git a/monitoring/thread_status_updater.h b/monitoring/thread_status_updater.h index 762c73ae2..696063cb4 100644 --- a/monitoring/thread_status_updater.h +++ b/monitoring/thread_status_updater.h @@ -62,7 +62,8 @@ struct ConstantColumnFamilyInfo { // status of a thread using a set of atomic pointers. struct ThreadStatusData { #ifdef ROCKSDB_USING_THREAD_STATUS - explicit ThreadStatusData() : enable_tracking(false) { + explicit ThreadStatusData() { + enable_tracking.store(false); thread_id.store(0); thread_type.store(ThreadStatus::USER); cf_key.store(nullptr); @@ -72,13 +73,10 @@ struct ThreadStatusData { } // A flag to indicate whether the thread tracking is enabled - // in the current thread. This value will be updated based on whether - // the associated Options::enable_thread_tracking is set to true - // in ThreadStatusUtil::SetColumnFamily(). - // + // in the current thread. // If set to false, then SetThreadOperation and SetThreadState // will be no-op. - bool enable_tracking; + std::atomic enable_tracking; std::atomic thread_id; std::atomic thread_type; @@ -119,8 +117,10 @@ class ThreadStatusUpdater { // Register the current thread for tracking. void RegisterThread(ThreadStatus::ThreadType ttype, uint64_t thread_id); + void SetEnableTracking(bool enable_tracking); + // Update the column-family info of the current thread by setting - // its thread-local pointer of ThreadStateInfo to the correct entry. + // its thread-local pointer of ThreadStatusData to the correct entry. void SetColumnFamilyInfoKey(const void* cf_key); // returns the column family info key. @@ -129,6 +129,9 @@ class ThreadStatusUpdater { // Update the thread operation of the current thread. void SetThreadOperation(const ThreadStatus::OperationType type); + // Return the thread operation of the current thread. + ThreadStatus::OperationType GetThreadOperation(); + // The start time of the current thread operation. It is in the format // of micro-seconds since some fixed point in time. 
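Taken together, the updater changes above make the per-thread tracking flag explicit and let other components read back what a thread is doing. An illustrative sequence under stated assumptions (the helpers are the ones added in this patch; TEST_GetExpectedIOActivity is debug-only, and GetThreadOperation only returns something other than OP_UNKNOWN when tracking is enabled):

```
// Illustrative only: register a background thread, enable tracking, and later
// map its recorded operation back to the IO activity expected on that thread.
ThreadStatusUtil::RegisterThread(env, ThreadStatus::HIGH_PRIORITY);
ThreadStatusUtil::SetEnableTracking(options.enable_thread_tracking);
ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH);

// e.g. inside a debug assertion in an IO wrapper:
ThreadStatus::OperationType op = ThreadStatusUtil::GetThreadOperation();
assert(ThreadStatusUtil::TEST_GetExpectedIOActivity(op) ==
       Env::IOActivity::kFlush);
```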
void SetOperationStartTime(const uint64_t start_time); diff --git a/monitoring/thread_status_util.cc b/monitoring/thread_status_util.cc index c07b85fa8..9b66dc28e 100644 --- a/monitoring/thread_status_util.cc +++ b/monitoring/thread_status_util.cc @@ -33,27 +33,23 @@ void ThreadStatusUtil::UnregisterThread() { } } -void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd, - const Env* env, - bool enable_thread_tracking) { - if (!MaybeInitThreadLocalUpdater(env)) { +void ThreadStatusUtil::SetEnableTracking(bool enable_tracking) { + if (thread_updater_local_cache_ == nullptr) { return; } - assert(thread_updater_local_cache_); - if (cfd != nullptr && enable_thread_tracking) { - thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); - } else { - // When cfd == nullptr or enable_thread_tracking == false, we set - // ColumnFamilyInfoKey to nullptr, which makes SetThreadOperation - // and SetThreadState become no-op. - thread_updater_local_cache_->SetColumnFamilyInfoKey(nullptr); + thread_updater_local_cache_->SetEnableTracking(enable_tracking); +} + +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { + if (thread_updater_local_cache_ == nullptr) { + return; } + assert(cfd); + thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); } void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { if (thread_updater_local_cache_ == nullptr) { - // thread_updater_local_cache_ must be set in SetColumnFamily - // or other ThreadStatusUtil functions. return; } @@ -68,6 +64,13 @@ void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { thread_updater_local_cache_->SetThreadOperation(op); } +ThreadStatus::OperationType ThreadStatusUtil::GetThreadOperation() { + if (thread_updater_local_cache_ == nullptr) { + return ThreadStatus::OperationType::OP_UNKNOWN; + } + return thread_updater_local_cache_->GetThreadOperation(); +} + ThreadStatus::OperationStage ThreadStatusUtil::SetThreadOperationStage( ThreadStatus::OperationStage stage) { if (thread_updater_local_cache_ == nullptr) { @@ -172,9 +175,7 @@ bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) { return false; } -void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* /*cfd*/, - const Env* /*env*/, - bool /*enable_thread_tracking*/) {} +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* /*cfd*/) {} void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType /*op*/) {} @@ -189,7 +190,7 @@ void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType /*state*/) {} void ThreadStatusUtil::NewColumnFamilyInfo(const DB* /*db*/, const ColumnFamilyData* /*cfd*/, const std::string& /*cf_name*/, - const Env* /*env*/) {} + const Env* env) {} void ThreadStatusUtil::EraseColumnFamilyInfo(const ColumnFamilyData* /*cfd*/) {} diff --git a/monitoring/thread_status_util.h b/monitoring/thread_status_util.h index 0137d2682..df148a039 100644 --- a/monitoring/thread_status_util.h +++ b/monitoring/thread_status_util.h @@ -52,13 +52,18 @@ class ThreadStatusUtil { // the current thread does not hold db_mutex. static void EraseDatabaseInfo(const DB* db); + static void SetEnableTracking(bool enable_tracking); + // Update the thread status to indicate the current thread is doing // something related to the specified column family. 
- static void SetColumnFamily(const ColumnFamilyData* cfd, const Env* env, - bool enable_thread_tracking); + // + // REQUIRES: cfd != nullptr + static void SetColumnFamily(const ColumnFamilyData* cfd); static void SetThreadOperation(ThreadStatus::OperationType type); + static ThreadStatus::OperationType GetThreadOperation(); + static ThreadStatus::OperationStage SetThreadOperationStage( ThreadStatus::OperationStage stage); @@ -74,6 +79,9 @@ class ThreadStatusUtil { static void TEST_SetStateDelay(const ThreadStatus::StateType state, int micro); static void TEST_StateDelay(const ThreadStatus::StateType state); + + static Env::IOActivity TEST_GetExpectedIOActivity( + ThreadStatus::OperationType thread_op); #endif protected: diff --git a/monitoring/thread_status_util_debug.cc b/monitoring/thread_status_util_debug.cc index f7a94355d..6e4fe8a9f 100644 --- a/monitoring/thread_status_util_debug.cc +++ b/monitoring/thread_status_util_debug.cc @@ -27,6 +27,20 @@ void ThreadStatusUtil::TEST_StateDelay(const ThreadStatus::StateType state) { } } +Env::IOActivity ThreadStatusUtil::TEST_GetExpectedIOActivity( + ThreadStatus::OperationType thread_op) { + switch (thread_op) { + case ThreadStatus::OperationType::OP_FLUSH: + return Env::IOActivity::kFlush; + case ThreadStatus::OperationType::OP_COMPACTION: + return Env::IOActivity::kCompaction; + case ThreadStatus::OperationType::OP_DBOPEN: + return Env::IOActivity::kDBOpen; + default: + return Env::IOActivity::kUnknown; + } +} + #endif // !NDEBUG } // namespace ROCKSDB_NAMESPACE diff --git a/options/options.cc b/options/options.cc index 4eeb7138b..4faddf5a2 100644 --- a/options/options.cc +++ b/options/options.cc @@ -682,8 +682,6 @@ DBOptions* DBOptions::IncreaseParallelism(int total_threads) { env->SetBackgroundThreads(1, Env::HIGH); return this; } - - ReadOptions::ReadOptions() : snapshot(nullptr), iterate_lower_bound(nullptr), @@ -708,7 +706,8 @@ ReadOptions::ReadOptions() value_size_soft_limit(std::numeric_limits::max()), adaptive_readahead(false), async_io(false), - optimize_multiget_for_io(true) {} + optimize_multiget_for_io(true), + io_activity(Env::IOActivity::kUnknown) {} ReadOptions::ReadOptions(bool cksum, bool cache) : snapshot(nullptr), @@ -734,6 +733,34 @@ ReadOptions::ReadOptions(bool cksum, bool cache) value_size_soft_limit(std::numeric_limits::max()), adaptive_readahead(false), async_io(false), - optimize_multiget_for_io(true) {} + optimize_multiget_for_io(true), + io_activity(Env::IOActivity::kUnknown) {} + +ReadOptions::ReadOptions(Env::IOActivity _io_activity) + : snapshot(nullptr), + iterate_lower_bound(nullptr), + iterate_upper_bound(nullptr), + readahead_size(0), + max_skippable_internal_keys(0), + read_tier(kReadAllTier), + verify_checksums(true), + fill_cache(true), + tailing(false), + managed(false), + total_order_seek(false), + auto_prefix_mode(false), + prefix_same_as_start(false), + pin_data(false), + background_purge_on_iterator_cleanup(false), + ignore_range_deletions(false), + timestamp(nullptr), + iter_start_ts(nullptr), + deadline(std::chrono::microseconds::zero()), + io_timeout(std::chrono::microseconds::zero()), + value_size_soft_limit(std::numeric_limits::max()), + adaptive_readahead(false), + async_io(false), + optimize_multiget_for_io(true), + io_activity(_io_activity) {} } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 76aa9cec6..0ed42348f 100644 --- a/table/block_based/block_based_table_reader.cc +++ 
b/table/block_based/block_based_table_reader.cc @@ -583,6 +583,7 @@ Status BlockBasedTable::Open( ro.io_timeout = read_options.io_timeout; ro.rate_limiter_priority = read_options.rate_limiter_priority; ro.verify_checksums = read_options.verify_checksums; + ro.io_activity = read_options.io_activity; // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; @@ -2231,7 +2232,8 @@ Status BlockBasedTable::MultiGetFilter(const ReadOptions& read_options, return Status::OK(); } -Status BlockBasedTable::Prefetch(const Slice* const begin, +Status BlockBasedTable::Prefetch(const ReadOptions& read_options, + const Slice* const begin, const Slice* const end) { auto& comparator = rep_->internal_comparator; UserComparatorWrapper user_comparator(comparator.user_comparator()); @@ -2241,7 +2243,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, } BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter iiter_on_stack; - auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + auto iiter = NewIndexIterator(read_options, /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, &lookup_context); std::unique_ptr> iiter_unique_ptr; @@ -2278,7 +2280,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, DataBlockIter biter; Status tmp_status; NewDataBlockIterator( - ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, + read_options, block_handle, &biter, /*type=*/BlockType::kData, /*get_context=*/nullptr, &lookup_context, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, /*async_read=*/false, tmp_status); @@ -2298,11 +2300,10 @@ Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, // Check Meta blocks std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - ReadOptions ro; - s = ReadMetaIndexBlock(ro, nullptr /* prefetch buffer */, &metaindex, - &metaindex_iter); + s = ReadMetaIndexBlock(read_options, nullptr /* prefetch buffer */, + &metaindex, &metaindex_iter); if (s.ok()) { - s = VerifyChecksumInMetaBlocks(metaindex_iter.get()); + s = VerifyChecksumInMetaBlocks(read_options, metaindex_iter.get()); if (!s.ok()) { return s; } @@ -2409,7 +2410,7 @@ BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( } Status BlockBasedTable::VerifyChecksumInMetaBlocks( - InternalIteratorBase* index_iter) { + const ReadOptions& read_options, InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); @@ -2425,14 +2426,14 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks( // Unfortunate special handling for properties block checksum w/ // global seqno std::unique_ptr table_properties; - s = ReadTablePropertiesHelper(ReadOptions(), handle, rep_->file.get(), + s = ReadTablePropertiesHelper(read_options, handle, rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, rep_->ioptions, &table_properties, nullptr /* memory_allocator */); } else { s = BlockFetcher( rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - ReadOptions(), handle, &contents, rep_->ioptions, + read_options, handle, &contents, rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, GetBlockTypeForMetaBlockByName(meta_block_name), UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options) @@ -2544,7 +2545,8 @@ uint64_t BlockBasedTable::GetApproximateDataSize() { return rep_->footer.metaindex_handle().offset(); } -uint64_t 
BlockBasedTable::ApproximateOffsetOf(const Slice& key, +uint64_t BlockBasedTable::ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) { uint64_t data_size = GetApproximateDataSize(); if (UNLIKELY(data_size == 0)) { @@ -2558,6 +2560,7 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, IndexBlockIter iiter_on_stack; ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = read_options.io_activity; auto index_iter = NewIndexIterator(ro, /*disable_prefix_seek=*/true, /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, @@ -2586,7 +2589,8 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, static_cast(rep_->file_size)); } -uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, +uint64_t BlockBasedTable::ApproximateSize(const ReadOptions& read_options, + const Slice& start, const Slice& end, TableReaderCaller caller) { assert(rep_->internal_comparator.Compare(start, end) <= 0); @@ -2603,6 +2607,7 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, IndexBlockIter iiter_on_stack; ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = read_options.io_activity; auto index_iter = NewIndexIterator(ro, /*disable_prefix_seek=*/true, /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, @@ -2654,9 +2659,9 @@ bool BlockBasedTable::TEST_IndexBlockInCache() const { } Status BlockBasedTable::GetKVPairsFromDataBlocks( - std::vector* kv_pair_blocks) { + const ReadOptions& read_options, std::vector* kv_pair_blocks) { std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -2677,7 +2682,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( std::unique_ptr datablock_iter; Status tmp_status; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value().handle, + read_options, blockhandles_iter->value().handle, /*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, @@ -2723,7 +2728,8 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { "--------------------------------------\n"; std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - ReadOptions ro; + // TODO: plumb Env::IOActivity + const ReadOptions ro; Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex, &metaindex_iter); if (s.ok()) { @@ -2779,7 +2785,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { if (rep_->uncompression_dict_reader) { CachableEntry uncompression_dict; s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - nullptr /* prefetch_buffer */, false /* no_io */, + nullptr /* prefetch_buffer */, ro, false /* no_io */, false, /* verify_checksums */ nullptr /* get_context */, nullptr /* lookup_context */, &uncompression_dict); @@ -2797,7 +2803,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { } // Output range deletions block - auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); + auto* range_del_iter = NewRangeTombstoneIterator(ro); if (range_del_iter != nullptr) { range_del_iter->SeekToFirst(); if (range_del_iter->Valid()) { @@ -2827,8 +2833,10 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { 
out_stream << "Index Details:\n" "--------------------------------------\n"; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); Status s = blockhandles_iter->status(); @@ -2876,8 +2884,10 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { } Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); Status s = blockhandles_iter->status(); @@ -2911,7 +2921,7 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { std::unique_ptr datablock_iter; Status tmp_status; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value().handle, + read_options, blockhandles_iter->value().handle, /*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 2108416f1..df296a0d3 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -153,7 +153,8 @@ class BlockBasedTable : public TableReader { // Pre-fetch the disk blocks that correspond to the key range specified by // (kbegin, kend). The call will return error status in the event of // IO or iteration error. - Status Prefetch(const Slice* begin, const Slice* end) override; + Status Prefetch(const ReadOptions& read_options, const Slice* begin, + const Slice* end) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -161,15 +162,16 @@ class BlockBasedTable : public TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - uint64_t ApproximateOffsetOf(const Slice& key, + uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) override; // Given start and end keys, return the approximate data size in the file // between the keys. The returned value is in terms of file bytes, and so // includes effects like compression of the underlying data. // The start key must not be greater than the end key. - uint64_t ApproximateSize(const Slice& start, const Slice& end, - TableReaderCaller caller) override; + uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start, + const Slice& end, TableReaderCaller caller) override; Status ApproximateKeyAnchors(const ReadOptions& read_options, std::vector& anchors) override; @@ -265,7 +267,8 @@ class BlockBasedTable : public TableReader { // Retrieve all key value pairs from data blocks in the table. // The key retrieved are internal keys. 
- Status GetKVPairsFromDataBlocks(std::vector* kv_pair_blocks); + Status GetKVPairsFromDataBlocks(const ReadOptions& read_options, + std::vector* kv_pair_blocks); struct Rep; @@ -477,7 +480,8 @@ class BlockBasedTable : public TableReader { static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); - Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInMetaBlocks(const ReadOptions& read_options, + InternalIteratorBase* index_iter); Status VerifyChecksumInBlocks(const ReadOptions& read_options, InternalIteratorBase* index_iter); diff --git a/table/block_based/block_based_table_reader_impl.h b/table/block_based/block_based_table_reader_impl.h index 50d147712..801b4614f 100644 --- a/table/block_based/block_based_table_reader_impl.h +++ b/table/block_based/block_based_table_reader_impl.h @@ -68,7 +68,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( // uncompression dict is typically at the end of the file and would // most likely break the sequentiality of the access pattern. s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - ro.async_io ? nullptr : prefetch_buffer, no_io, ro.verify_checksums, + ro.async_io ? nullptr : prefetch_buffer, ro, no_io, ro.verify_checksums, get_context, lookup_context, &uncompression_dict); if (!s.ok()) { iter->Invalidate(s); diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h index 5efb279f4..e033b688b 100644 --- a/table/block_based/block_based_table_reader_sync_and_async.h +++ b/table/block_based/block_based_table_reader_sync_and_async.h @@ -421,7 +421,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) uncompression_dict_status = rep_->uncompression_dict_reader ->GetOrReadUncompressionDictionary( - nullptr /* prefetch_buffer */, no_io, + nullptr /* prefetch_buffer */, read_options, no_io, read_options.verify_checksums, get_context, &metadata_lookup_context, &uncompression_dict); uncompression_dict_inited = true; diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc index 83232dc7e..e158d8039 100644 --- a/table/block_based/hash_index_reader.cc +++ b/table/block_based/hash_index_reader.cc @@ -74,17 +74,17 @@ Status HashIndexReader::Create(const BlockBasedTable* table, // Read contents for the blocks BlockContents prefixes_contents; BlockFetcher prefixes_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, - &prefixes_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + file, prefetch_buffer, footer, ro, prefixes_handle, &prefixes_contents, + ioptions, true /*decompress*/, true /*maybe_compressed*/, + BlockType::kHashIndexPrefixes, UncompressionDict::GetEmptyDict(), + cache_options, memory_allocator); s = prefixes_block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; } BlockContents prefixes_meta_contents; BlockFetcher prefixes_meta_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + file, prefetch_buffer, footer, ro, prefixes_meta_handle, &prefixes_meta_contents, ioptions, true /*decompress*/, true /*maybe_compressed*/, BlockType::kHashIndexMetadata, UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index d901f9ca3..d1cc88834 100644 --- 
a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -85,6 +85,7 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( ro.async_io = read_options.async_io; ro.rate_limiter_priority = read_options.rate_limiter_priority; ro.verify_checksums = read_options.verify_checksums; + ro.io_activity = read_options.io_activity; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc index ba1908720..4ac442b6b 100644 --- a/table/block_based/uncompression_dict_reader.cc +++ b/table/block_based/uncompression_dict_reader.cc @@ -77,8 +77,9 @@ Status UncompressionDictReader::ReadUncompressionDictionary( } Status UncompressionDictReader::GetOrReadUncompressionDictionary( - FilePrefetchBuffer* prefetch_buffer, bool no_io, bool verify_checksums, - GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, bool no_io, + bool verify_checksums, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* uncompression_dict) const { assert(uncompression_dict); @@ -92,6 +93,7 @@ Status UncompressionDictReader::GetOrReadUncompressionDictionary( read_options.read_tier = kBlockCacheTier; } read_options.verify_checksums = verify_checksums; + read_options.io_activity = ro.io_activity; return ReadUncompressionDictionary(table_, prefetch_buffer, read_options, cache_dictionary_blocks(), get_context, diff --git a/table/block_based/uncompression_dict_reader.h b/table/block_based/uncompression_dict_reader.h index 416d25e2d..c78800d8a 100644 --- a/table/block_based/uncompression_dict_reader.h +++ b/table/block_based/uncompression_dict_reader.h @@ -32,8 +32,9 @@ class UncompressionDictReader { std::unique_ptr* uncompression_dict_reader); Status GetOrReadUncompressionDictionary( - FilePrefetchBuffer* prefetch_buffer, bool no_io, bool verify_checksums, - GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, bool no_io, + bool verify_checksums, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* uncompression_dict) const; size_t ApproximateMemoryUsage() const; diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc index ec3ec4206..ceddbf37a 100644 --- a/table/cuckoo/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -70,8 +70,10 @@ class CuckooBuilderTest : public testing::Test { // Assert Table Properties. std::unique_ptr props; + const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, - kCuckooTableMagicNumber, ioptions, &props)); + kCuckooTableMagicNumber, ioptions, + read_options, &props)); // Check unused bucket. 
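The plumbing of ReadOptions through the index and dictionary readers above terminates where an IOOptions is prepared for the file reader, as the meta_blocks.cc hunk below does. A hedged sketch of that hand-off, using only the PrepareIOOptions call already visible in this patch (the wrapper function itself is hypothetical):

```
// Sketch of the ReadOptions -> IOOptions hand-off, assuming the caller holds
// a valid RandomAccessFileReader*. The wrapper is illustrative only.
#include "file/random_access_file_reader.h"
#include "rocksdb/file_system.h"
#include "rocksdb/options.h"

namespace ROCKSDB_NAMESPACE {

Status PrepareTaggedIOOptions(RandomAccessFileReader* file,
                              const ReadOptions& read_options,
                              IOOptions* opts) {
  // Fills the IOOptions handed to the FileSystem layer from the ReadOptions,
  // including the new io_activity tag, so the read below the reader is
  // attributed to the right activity.
  Status s = file->PrepareIOOptions(read_options, *opts);
  return s;
}

}  // namespace ROCKSDB_NAMESPACE
```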
std::string unused_key = props->user_collected_properties[CuckooTablePropertyNames::kEmptyKey]; @@ -627,4 +629,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index fa3e77b2e..d64761962 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -59,8 +59,11 @@ CuckooTableReader::CuckooTableReader( } { std::unique_ptr props; - status_ = ReadTableProperties(file_.get(), file_size, - kCuckooTableMagicNumber, ioptions, &props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status_ = + ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, + ioptions, read_options, &props); if (!status_.ok()) { return; } diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index 7e154769d..d17011ed8 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -58,12 +58,14 @@ class CuckooTableReader : public TableReader { size_t ApproximateMemoryUsage() const override; // Following methods are not implemented for Cuckoo Table Reader - uint64_t ApproximateOffsetOf(const Slice& /*key*/, + uint64_t ApproximateOffsetOf(const ReadOptions& /*read_options*/, + const Slice& /*key*/, TableReaderCaller /*caller*/) override { return 0; } - uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + uint64_t ApproximateSize(const ReadOptions& /* read_options */, + const Slice& /*start*/, const Slice& /*end*/, TableReaderCaller /*caller*/) override { return 0; } diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 6530f6a80..2c58ff9c7 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -412,20 +412,22 @@ Status ReadTablePropertiesHelper( Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, std::unique_ptr* properties, MemoryAllocator* memory_allocator, FilePrefetchBuffer* prefetch_buffer) { BlockHandle block_handle; Footer footer; - Status s = FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, - kPropertiesBlockName, &block_handle, - memory_allocator, prefetch_buffer, &footer); + Status s = + FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, + read_options, kPropertiesBlockName, &block_handle, + memory_allocator, prefetch_buffer, &footer); if (!s.ok()) { return s; } if (!block_handle.IsNull()) { - s = ReadTablePropertiesHelper(ReadOptions(), block_handle, file, + s = ReadTablePropertiesHelper(read_options, block_handle, file, prefetch_buffer, footer, ioptions, properties, memory_allocator); } else { @@ -473,14 +475,20 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, BlockContents* metaindex_contents, MemoryAllocator* memory_allocator, FilePrefetchBuffer* prefetch_buffer, Footer* footer_out) { Footer footer; IOOptions opts; - auto s = ReadFooterFromFile(opts, file, *ioptions.fs, prefetch_buffer, - file_size, &footer, table_magic_number); + Status s; + s = file->PrepareIOOptions(read_options, opts); + if (!s.ok()) { + return s; + } + s = ReadFooterFromFile(opts, file, *ioptions.fs, prefetch_buffer, file_size, + &footer, table_magic_number); if (!s.ok()) { return s; } @@ -489,7 
+497,7 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, } auto metaindex_handle = footer.metaindex_handle(); - return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), + return BlockFetcher(file, prefetch_buffer, footer, read_options, metaindex_handle, metaindex_contents, ioptions, false /* do decompression */, false /*maybe_compressed*/, BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), @@ -497,18 +505,16 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, .ReadBlockContents(); } -Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, - const ImmutableOptions& ioptions, - const std::string& meta_block_name, - BlockHandle* block_handle, - MemoryAllocator* memory_allocator, - FilePrefetchBuffer* prefetch_buffer, - Footer* footer_out) { +Status FindMetaBlockInFile( + RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, + BlockHandle* block_handle, MemoryAllocator* memory_allocator, + FilePrefetchBuffer* prefetch_buffer, Footer* footer_out) { BlockContents metaindex_contents; auto s = ReadMetaIndexBlockInFile( - file, file_size, table_magic_number, ioptions, &metaindex_contents, - memory_allocator, prefetch_buffer, footer_out); + file, file_size, table_magic_number, ioptions, read_options, + &metaindex_contents, memory_allocator, prefetch_buffer, footer_out); if (!s.ok()) { return s; } @@ -526,6 +532,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, MemoryAllocator* memory_allocator) { @@ -535,15 +542,16 @@ Status ReadMetaBlock(RandomAccessFileReader* file, BlockHandle block_handle; Footer footer; - Status status = FindMetaBlockInFile( - file, file_size, table_magic_number, ioptions, meta_block_name, - &block_handle, memory_allocator, prefetch_buffer, &footer); + Status status = + FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, + read_options, meta_block_name, &block_handle, + memory_allocator, prefetch_buffer, &footer); if (!status.ok()) { return status; } - return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), - block_handle, contents, ioptions, false /* decompress */, + return BlockFetcher(file, prefetch_buffer, footer, read_options, block_handle, + contents, ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, UncompressionDict::GetEmptyDict(), PersistentCacheOptions::kEmpty, memory_allocator) diff --git a/table/meta_blocks.h b/table/meta_blocks.h index b867dd01d..962a31638 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -119,6 +119,7 @@ Status ReadTablePropertiesHelper( Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, std::unique_ptr* properties, MemoryAllocator* memory_allocator = nullptr, FilePrefetchBuffer* prefetch_buffer = nullptr); @@ -139,6 +140,7 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, BlockHandle* 
block_handle, MemoryAllocator* memory_allocator = nullptr, @@ -149,6 +151,7 @@ Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, BlockContents* block_contents, MemoryAllocator* memory_allocator = nullptr, FilePrefetchBuffer* prefetch_buffer = nullptr, @@ -161,6 +164,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, MemoryAllocator* memory_allocator = nullptr); diff --git a/table/mock_table.cc b/table/mock_table.cc index 130889eaa..c251ea108 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -41,12 +41,14 @@ class MockTableReader : public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& /*key*/, + uint64_t ApproximateOffsetOf(const ReadOptions& /*read_options*/, + const Slice& /*key*/, TableReaderCaller /*caller*/) override { return 0; } - uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + uint64_t ApproximateSize(const ReadOptions& /*read_options*/, + const Slice& /*start*/, const Slice& /*end*/, TableReaderCaller /*caller*/) override { return 0; } diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 3e51c2275..2f0379f72 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -126,8 +126,10 @@ Status PlainTableReader::Open( } std::unique_ptr props; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions, &props); + ioptions, read_options, &props); if (!s.ok()) { return s; } @@ -297,10 +299,14 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, assert(props != nullptr); BlockContents index_block_contents; - Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, - file_size_, kPlainTableMagicNumber, ioptions_, - PlainTableIndexBuilder::kPlainTableIndexBlock, - BlockType::kIndex, &index_block_contents); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + Status s = + ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, read_options, + PlainTableIndexBuilder::kPlainTableIndexBlock, + BlockType::kIndex, &index_block_contents); bool index_in_file = s.ok(); @@ -310,8 +316,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, if (index_in_file) { s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, - BloomBlockBuilder::kBloomBlock, BlockType::kFilter, - &bloom_block_contents); + read_options, BloomBlockBuilder::kBloomBlock, + BlockType::kFilter, &bloom_block_contents); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } @@ -614,12 +620,14 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, return Status::OK(); } -uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, - TableReaderCaller /*caller*/) { +uint64_t PlainTableReader::ApproximateOffsetOf( + const ReadOptions& /*read_options*/, const 
Slice& /*key*/, + TableReaderCaller /*caller*/) { return 0; } -uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/, +uint64_t PlainTableReader::ApproximateSize(const ReadOptions& /* read_options*/, + const Slice& /*start*/, const Slice& /*end*/, TableReaderCaller /*caller*/) { return 0; diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index 51500c3ee..0f5f7f3ce 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -92,11 +92,12 @@ class PlainTableReader : public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& key, + uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) override; - uint64_t ApproximateSize(const Slice& start, const Slice& end, - TableReaderCaller caller) override; + uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start, + const Slice& end, TableReaderCaller caller) override; uint32_t GetIndexSize() const { return index_.GetIndexSize(); } void SetupForCompaction() override; diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index 59e62486a..e9916eb5b 100644 --- a/table/sst_file_dumper.cc +++ b/table/sst_file_dumper.cc @@ -355,8 +355,11 @@ Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number, RandomAccessFileReader* file, uint64_t file_size, FilePrefetchBuffer* prefetch_buffer) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status s = ROCKSDB_NAMESPACE::ReadTableProperties( - file, file_size, table_magic_number, ioptions_, &table_properties_, + file, file_size, table_magic_number, ioptions_, read_options, + &table_properties_, /* memory_allocator= */ nullptr, prefetch_buffer); if (!s.ok()) { if (!silent_) { @@ -514,4 +517,3 @@ Status SstFileDumper::ReadTableProperties( return init_result_; } } // namespace ROCKSDB_NAMESPACE - diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc index 5573d941c..c95c91743 100644 --- a/table/sst_file_reader.cc +++ b/table/sst_file_reader.cc @@ -66,6 +66,7 @@ Status SstFileReader::Open(const std::string& file_path) { } Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) { + assert(roptions.io_activity == Env::IOActivity::kUnknown); auto r = rep_.get(); auto sequence = roptions.snapshot != nullptr ? roptions.snapshot->GetSequenceNumber() @@ -91,9 +92,9 @@ std::shared_ptr SstFileReader::GetTableProperties() } Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) { + assert(read_options.io_activity == Env::IOActivity::kUnknown); return rep_->table_reader->VerifyChecksum(read_options, TableReaderCaller::kSSTFileReader); } } // namespace ROCKSDB_NAMESPACE - diff --git a/table/table_reader.h b/table/table_reader.h index 391072eec..53c522052 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -76,7 +76,8 @@ class TableReader { // function and letting ApproximateSize take optional start and end, so // that absolute start and end can be specified and optimized without // key / index work. 
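The assertions added to SstFileReader above encode the user-facing contract: ReadOptions::io_activity is reserved for internal use and must stay at its default for public read APIs. An illustrative caller, with the function name and iteration body being assumptions:

```
// User-side expectation implied by the assertions above. Function name and
// loop body are illustrative, not part of the patch.
#include <cassert>
#include <memory>

#include "rocksdb/options.h"
#include "rocksdb/sst_file_reader.h"

void ScanSstFile(ROCKSDB_NAMESPACE::SstFileReader* reader) {
  ROCKSDB_NAMESPACE::ReadOptions ro;  // io_activity == kUnknown by default
  // Setting ro.io_activity here would violate the new contract and trip the
  // assert in SstFileReader::NewIterator in debug builds.
  std::unique_ptr<ROCKSDB_NAMESPACE::Iterator> it(reader->NewIterator(ro));
  for (it->SeekToFirst(); it->Valid(); it->Next()) {
    // ... consume it->key() / it->value() ...
  }
  assert(it->status().ok());
}
```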
- virtual uint64_t ApproximateOffsetOf(const Slice& key, + virtual uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) = 0; // Given start and end keys, return the approximate data size in the file @@ -84,7 +85,8 @@ class TableReader { // includes effects like compression of the underlying data and applicable // portions of metadata including filters and indexes. Nullptr for start or // end (or both) indicates absolute start or end of the table. - virtual uint64_t ApproximateSize(const Slice& start, const Slice& end, + virtual uint64_t ApproximateSize(const ReadOptions& read_options, + const Slice& start, const Slice& end, TableReaderCaller caller) = 0; struct Anchor { @@ -160,7 +162,8 @@ class TableReader { // Prefetch data corresponding to a give range of keys // Typically this functionality is required for table implementations that // persists the data on a non volatile storage medium like disk/SSD - virtual Status Prefetch(const Slice* begin = nullptr, + virtual Status Prefetch(const ReadOptions& /* read_options */, + const Slice* begin = nullptr, const Slice* end = nullptr) { (void)begin; (void)end; diff --git a/table/table_test.cc b/table/table_test.cc index 5bdac8bc2..a701eda01 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -425,14 +425,15 @@ class TableConstructor : public Constructor { } uint64_t ApproximateOffsetOf(const Slice& key) const { + const ReadOptions read_options; if (convert_to_internal_key_) { InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); const Slice skey = ikey.Encode(); return table_reader_->ApproximateOffsetOf( - skey, TableReaderCaller::kUncategorized); + read_options, skey, TableReaderCaller::kUncategorized); } return table_reader_->ApproximateOffsetOf( - key, TableReaderCaller::kUncategorized); + read_options, key, TableReaderCaller::kUncategorized); } virtual Status Reopen(const ImmutableOptions& ioptions, @@ -1979,7 +1980,8 @@ void PrefetchRange(TableConstructor* c, Options* opt, end.reset(new Slice(key_end)); } } - s = table_reader->Prefetch(begin.get(), end.get()); + const ReadOptions read_options; + s = table_reader->Prefetch(read_options, begin.get(), end.get()); ASSERT_TRUE(s.code() == expected_status.code()); @@ -3335,11 +3337,12 @@ TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); + const ReadOptions read_options; for (uint32_t i = 1; i <= 2; i++) { InternalKey internal_key(auto_add_key1, 0, kTypeValue); std::string encoded_key = internal_key.Encode().ToString(); c.GetTableReader()->ApproximateOffsetOf( - encoded_key, TableReaderCaller::kUserApproximateSize); + read_options, encoded_key, TableReaderCaller::kUserApproximateSize); } // Verify traces. 
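Outside the test above, callers of the TableReader interface need the same one-line change: pass a ReadOptions as the new first argument. A compact sketch (the wrapper function is hypothetical; the caller enum value is the one the test above uses):

```
// Illustrative caller update for the new TableReader signatures.
#include "rocksdb/options.h"
#include "rocksdb/slice.h"
#include "table/table_reader.h"

namespace ROCKSDB_NAMESPACE {

uint64_t ApproximateRangeSize(TableReader* reader, const Slice& start,
                              const Slice& end) {
  // io_activity stays kUnknown outside flush/compaction/db-open paths.
  const ReadOptions read_options;
  return reader->ApproximateSize(read_options, start, end,
                                 TableReaderCaller::kUserApproximateSize);
}

}  // namespace ROCKSDB_NAMESPACE
```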
std::vector expected_records; @@ -4079,8 +4082,10 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; + const ReadOptions read_options; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), - kPlainTableMagicNumber, ioptions, &props); + kPlainTableMagicNumber, ioptions, read_options, + &props); ASSERT_OK(s); ASSERT_EQ(0ul, props->index_size); @@ -4756,9 +4761,10 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { new RandomAccessFileReader(std::move(source), "")); std::unique_ptr props; + const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props)); + read_options, &props)); UserCollectedProperties user_props = props->user_collected_properties; version = DecodeFixed32( @@ -4933,9 +4939,10 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { // Helper function to get version, global_seqno, global_seqno_offset std::function VerifyBlockAlignment = [&]() { std::unique_ptr props; + const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props)); + read_options, &props)); uint64_t data_block_size = props->data_size / props->num_data_blocks; ASSERT_EQ(data_block_size, 4096); diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 037f27f4a..ac478e7ad 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -200,6 +200,7 @@ default_params = { ] ), "allow_data_in_errors": True, + "enable_thread_tracking": lambda: random.choice([0, 1]), "readahead_size": lambda: random.choice([0, 16384, 524288]), "initial_auto_readahead_size": lambda: random.choice([0, 16384, 524288]), "max_auto_readahead_size": lambda: random.choice([0, 16384, 524288]), diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index b7b0e9909..f0119b31e 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -4168,6 +4168,8 @@ UnsafeRemoveSstFileCommand::UnsafeRemoveSstFileCommand( } void UnsafeRemoveSstFileCommand::DoCommand() { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; PrepareOptions(); OfflineManifestWriter w(options_, db_path_); @@ -4192,7 +4194,7 @@ void UnsafeRemoveSstFileCommand::DoCommand() { s = options_.env->GetFileSystem()->NewDirectory(db_path_, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = w.LogAndApply(cfd, &edit, db_dir.get()); + s = w.LogAndApply(read_options, cfd, &edit, db_dir.get()); } } diff --git a/util/file_checksum_helper.cc b/util/file_checksum_helper.cc index 0c072e1e3..b8c4099b8 100644 --- a/util/file_checksum_helper.cc +++ b/util/file_checksum_helper.cc @@ -98,6 +98,8 @@ Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, return Status::InvalidArgument("checksum_list is nullptr"); } assert(checksum_list); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; checksum_list->reset(); Status s; @@ -125,7 +127,8 @@ Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, reporter.status_ptr = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - FileChecksumRetriever retriever(manifest_file_size, *checksum_list); + FileChecksumRetriever retriever(read_options, manifest_file_size, + *checksum_list); retriever.Iterate(reader, &s); assert(!retriever.status().ok() || manifest_file_size == std::numeric_limits::max() || diff --git a/util/stop_watch.h b/util/stop_watch.h 
index e26380d97..0ecd1bb11 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -9,23 +9,33 @@ namespace ROCKSDB_NAMESPACE { // Auto-scoped. -// Records the measure time into the corresponding histogram if statistics -// is not nullptr. It is also saved into *elapsed if the pointer is not nullptr -// and overwrite is true, it will be added to *elapsed if overwrite is false. +// When statistics is not nullptr, records the measured time into any enabled +// histograms supplied to the constructor. A histogram argument may be omitted +// by setting it to Histograms::HISTOGRAM_ENUM_MAX. It is also saved into +// *elapsed if the pointer is not nullptr and overwrite is true, it will be +// added to *elapsed if overwrite is false. class StopWatch { public: StopWatch(SystemClock* clock, Statistics* statistics, - const uint32_t hist_type, uint64_t* elapsed = nullptr, - bool overwrite = true, bool delay_enabled = false) + const uint32_t hist_type_1, + const uint32_t hist_type_2 = Histograms::HISTOGRAM_ENUM_MAX, + uint64_t* elapsed = nullptr, bool overwrite = true, + bool delay_enabled = false) : clock_(clock), statistics_(statistics), - hist_type_(hist_type), + hist_type_1_(statistics && statistics->HistEnabledForType(hist_type_1) + ? hist_type_1 + : Histograms::HISTOGRAM_ENUM_MAX), + hist_type_2_(statistics && statistics->HistEnabledForType(hist_type_2) + ? hist_type_2 + : Histograms::HISTOGRAM_ENUM_MAX), elapsed_(elapsed), overwrite_(overwrite), stats_enabled_(statistics && statistics->get_stats_level() >= StatsLevel::kExceptTimers && - statistics->HistEnabledForType(hist_type)), + (hist_type_1_ != Histograms::HISTOGRAM_ENUM_MAX || + hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX)), delay_enabled_(delay_enabled), total_delay_(0), delay_start_time_(0), @@ -44,10 +54,15 @@ class StopWatch { *elapsed_ -= total_delay_; } if (stats_enabled_) { - statistics_->reportTimeToHistogram( - hist_type_, (elapsed_ != nullptr) - ? *elapsed_ - : (clock_->NowMicros() - start_time_)); + const auto time = (elapsed_ != nullptr) + ? 
*elapsed_ + : (clock_->NowMicros() - start_time_); + if (hist_type_1_ != Histograms::HISTOGRAM_ENUM_MAX) { + statistics_->reportTimeToHistogram(hist_type_1_, time); + } + if (hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX) { + statistics_->reportTimeToHistogram(hist_type_2_, time); + } } } @@ -75,7 +90,8 @@ class StopWatch { private: SystemClock* clock_; Statistics* statistics_; - const uint32_t hist_type_; + const uint32_t hist_type_1_; + const uint32_t hist_type_2_; uint64_t* elapsed_; bool overwrite_; bool stats_enabled_; diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index af4e62355..b5b3378fa 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -42,6 +42,8 @@ class SimulatedBackgroundTask { std::unique_lock l(mutex_); running_count_++; bg_cv_.notify_all(); + assert(cf_key_); + Env::Default()->GetThreadStatusUpdater()->SetEnableTracking(true); Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_); Env::Default()->GetThreadStatusUpdater()->SetThreadOperation( operation_type_); diff --git a/util/thread_operation.h b/util/thread_operation.h index c24fccd5c..b6c106279 100644 --- a/util/thread_operation.h +++ b/util/thread_operation.h @@ -38,7 +38,8 @@ struct OperationInfo { static OperationInfo global_operation_table[] = { {ThreadStatus::OP_UNKNOWN, ""}, {ThreadStatus::OP_COMPACTION, "Compaction"}, - {ThreadStatus::OP_FLUSH, "Flush"}}; + {ThreadStatus::OP_FLUSH, "Flush"}, + {ThreadStatus::OP_DBOPEN, "DBOpen"}}; struct OperationStageInfo { const ThreadStatus::OperationStage stage; diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 3a5e337b6..cfbbd7458 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1631,6 +1631,11 @@ Status BlobDBImpl::GetImpl(const ReadOptions& read_options, return Status::NotSupported( "Blob DB doesn't support non-default column family."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } // Get a snapshot to avoid blob file get deleted between we // fetch and index entry and reading from the file. // TODO(yiwu): For Get() retry if file not found would be a simpler strategy. @@ -2036,6 +2041,11 @@ void BlobDBImpl::CopyBlobFiles( } Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } auto* cfd = static_cast_with_check(DefaultColumnFamily()) ->cfd(); diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index 63c689efd..1771497a6 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -166,6 +166,11 @@ template inline Status WriteCommittedTxn::GetForUpdateImpl( const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, TValue* value, bool exclusive, const bool do_validate) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } column_family = column_family ? 
column_family : db_impl_->DefaultColumnFamily(); assert(column_family); @@ -1170,4 +1175,3 @@ Status PessimisticTransaction::SetName(const TransactionName& name) { } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index 10d5d02a1..5963f7429 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -235,6 +235,11 @@ Status TransactionBaseImpl::PopSavePoint() { Status TransactionBaseImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } assert(value != nullptr); PinnableSlice pinnable_val(value); assert(!pinnable_val.IsPinned()); @@ -262,6 +267,11 @@ Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, "If do_validate is false then GetForUpdate with snapshot is not " "defined."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } Status s = TryLock(column_family, key, true /* read_only */, exclusive, do_validate); @@ -288,6 +298,11 @@ Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, "If do_validate is false then GetForUpdate with snapshot is not " "defined."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } Status s = TryLock(column_family, key, true /* read_only */, exclusive, do_validate); @@ -302,6 +317,13 @@ std::vector TransactionBaseImpl::MultiGet( const std::vector& column_family, const std::vector& keys, std::vector* values) { size_t num_keys = keys.size(); + if (read_options.io_activity != Env::IOActivity::kUnknown) { + Status s = Status::InvalidArgument( + "Cannot call MultiGet with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + return std::vector(num_keys, s); + } + values->resize(num_keys); std::vector stat_list(num_keys); @@ -317,6 +339,7 @@ void TransactionBaseImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + assert(read_options.io_activity == Env::IOActivity::kUnknown); write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family, num_keys, keys, values, statuses, sorted_input); @@ -328,6 +351,12 @@ std::vector TransactionBaseImpl::MultiGetForUpdate( const std::vector& keys, std::vector* values) { // Regardless of whether the MultiGet succeeded, track these keys. 
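The transaction and blob read paths in the surrounding hunks all repeat the same early check. Written once as a standalone helper purely for illustration (the patch inlines the check at each call site and does not add such a helper):

```
// Hypothetical helper, not part of the patch: the check each user-facing
// read API now performs before doing any work.
#include <string>

#include "rocksdb/env.h"
#include "rocksdb/options.h"
#include "rocksdb/status.h"

namespace ROCKSDB_NAMESPACE {

Status CheckUserIOActivity(const ReadOptions& read_options,
                           const std::string& api_name) {
  if (read_options.io_activity != Env::IOActivity::kUnknown) {
    return Status::InvalidArgument(
        "Cannot call " + api_name +
        " with `ReadOptions::io_activity` != `Env::IOActivity::kUnknown`");
  }
  return Status::OK();
}

}  // namespace ROCKSDB_NAMESPACE
```

Iterator factories apply the same idea but wrap the status in NewErrorIterator instead of returning it directly, as the NewIterator hunks above and below show.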
size_t num_keys = keys.size(); + if (read_options.io_activity != Env::IOActivity::kUnknown) { + Status s = Status::InvalidArgument( + "Cannot call MultiGetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + return std::vector(num_keys, s); + } values->resize(num_keys); // Lock all keys @@ -726,4 +755,3 @@ WriteBatch* TransactionBaseImpl::GetCommitTimeWriteBatch() { return &commit_time_batch_; } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index ededb6250..c27a679e4 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -44,6 +44,7 @@ void WritePreparedTxn::MultiGet(const ReadOptions& options, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + assert(options.io_activity == Env::IOActivity::kUnknown); SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -64,6 +65,11 @@ void WritePreparedTxn::MultiGet(const ReadOptions& options, Status WritePreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -507,4 +513,3 @@ Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) { } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 84c45b7e4..6118c3549 100644 --- a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -250,6 +250,11 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, Status WritePreparedTxnDB::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -343,6 +348,11 @@ static void CleanupWritePreparedTxnDBIterator(void* arg1, void* /*arg2*/) { Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& options, ColumnFamilyHandle* column_family) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } constexpr bool expose_blob_index = false; constexpr bool allow_refresh = false; std::shared_ptr own_snapshot = nullptr; diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 11e04824f..845b117cf 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -948,6 +948,7 @@ void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, const size_t num_keys, const Slice* keys, PinnableSlice* 
values, Status* statuses, const bool sorted_input) { + assert(options.io_activity == Env::IOActivity::kUnknown); SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -968,6 +969,11 @@ void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, Status WriteUnpreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -1048,4 +1054,3 @@ WriteUnpreparedTxn::GetUnpreparedSequenceNumbers() { } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 26a03c77d..fd0ba0aed 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -388,6 +388,11 @@ static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) { Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, ColumnFamilyHandle* column_family, WriteUnpreparedTxn* txn) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } // TODO(lth): Refactor so that this logic is shared with WritePrepared. constexpr bool expose_blob_index = false; constexpr bool allow_refresh = false; diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc index 5b0486fc1..2b261ec6f 100644 --- a/utilities/ttl/db_ttl_impl.cc +++ b/utilities/ttl/db_ttl_impl.cc @@ -596,6 +596,11 @@ Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) { Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& opts, ColumnFamilyHandle* column_family) { + if (opts.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } return new TtlIterator(db_->NewIterator(opts, column_family)); }
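Tying the plumbing back to the timing side of the change: once the io_activity tag reaches the file reader, the two-histogram StopWatch from util/stop_watch.h records a single elapsed time under both the aggregate SST-read histogram and the per-activity one. A minimal sketch, assuming the per-activity histogram enum is named FILE_READ_FLUSH_MICROS (defined elsewhere in this patch, not in the hunks shown here) and that the caller owns a valid clock and statistics object:

```
// Minimal sketch of the dual-histogram timing. FILE_READ_FLUSH_MICROS is an
// assumption about the enum name; the read being timed is elided.
#include <cstdint>

#include "rocksdb/statistics.h"
#include "rocksdb/system_clock.h"
#include "util/stop_watch.h"

namespace ROCKSDB_NAMESPACE {

void TimedFlushRead(SystemClock* clock, Statistics* stats) {
  uint64_t elapsed = 0;
  {
    // On scope exit the destructor reports the elapsed time to every enabled
    // histogram; passing HISTOGRAM_ENUM_MAX as the second type skips the
    // per-activity histogram entirely.
    StopWatch sw(clock, stats, SST_READ_MICROS, FILE_READ_FLUSH_MICROS,
                 &elapsed);
    // ... perform the SST read being timed ...
  }
  (void)elapsed;
}

}  // namespace ROCKSDB_NAMESPACE
```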