diff --git a/HISTORY.md b/HISTORY.md index ebb95663d..1ca433e8e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -17,6 +17,7 @@ ### New Features * Add experimental `PerfContext` counters `iter_{next|prev|seek}_count` for db iterator, each counting the times of corresponding API being called. * Allow runtime changes to whether `WriteBufferManager` allows stall or not by calling `SetAllowStall()` +* New statistics `rocksdb.file.read.{flush|compaction}.micros` that measure read time of block-based SST tables or blob files during flush or compaction. ### Bug Fixes * In block cache tracing, fixed some cases of bad hit/miss information (and more) with MultiGet. diff --git a/db/blob/blob_file_cache.cc b/db/blob/blob_file_cache.cc index 19757946d..deebf8d34 100644 --- a/db/blob/blob_file_cache.cc +++ b/db/blob/blob_file_cache.cc @@ -37,7 +37,7 @@ BlobFileCache::BlobFileCache(Cache* cache, } Status BlobFileCache::GetBlobFileReader( - uint64_t blob_file_number, + const ReadOptions& read_options, uint64_t blob_file_number, CacheHandleGuard* blob_file_reader) { assert(blob_file_reader); assert(blob_file_reader->IsEmpty()); @@ -73,7 +73,7 @@ Status BlobFileCache::GetBlobFileReader( { assert(file_options_); const Status s = BlobFileReader::Create( - *immutable_options_, *file_options_, column_family_id_, + *immutable_options_, read_options, *file_options_, column_family_id_, blob_file_read_hist_, blob_file_number, io_tracer_, &reader); if (!s.ok()) { RecordTick(statistics, NO_FILE_ERRORS); diff --git a/db/blob/blob_file_cache.h b/db/blob/blob_file_cache.h index 6281897d6..a80be7c55 100644 --- a/db/blob/blob_file_cache.h +++ b/db/blob/blob_file_cache.h @@ -32,7 +32,8 @@ class BlobFileCache { BlobFileCache(const BlobFileCache&) = delete; BlobFileCache& operator=(const BlobFileCache&) = delete; - Status GetBlobFileReader(uint64_t blob_file_number, + Status GetBlobFileReader(const ReadOptions& read_options, + uint64_t blob_file_number, CacheHandleGuard* blob_file_reader); private: diff 
--git a/db/blob/blob_file_cache_test.cc b/db/blob/blob_file_cache_test.cc index d3a61b3c5..8c3c56de9 100644 --- a/db/blob/blob_file_cache_test.cc +++ b/db/blob/blob_file_cache_test.cc @@ -118,7 +118,9 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) { // First try: reader should be opened and put in cache CacheHandleGuard first; - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &first)); + const ReadOptions read_options; + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number, + &first)); ASSERT_NE(first.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); @@ -126,7 +128,8 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader) { // Second try: reader should be served from cache CacheHandleGuard second; - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number, + &second)); ASSERT_NE(second.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); @@ -163,19 +166,21 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_Race) { CacheHandleGuard first; CacheHandleGuard second; + const ReadOptions read_options; SyncPoint::GetInstance()->SetCallBack( "BlobFileCache::GetBlobFileReader:DoubleCheck", [&](void* /* arg */) { // Disabling sync points to prevent infinite recursion SyncPoint::GetInstance()->DisableProcessing(); - - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, &second)); + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, + blob_file_number, &second)); ASSERT_NE(second.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); }); SyncPoint::GetInstance()->EnableProcessing(); - ASSERT_OK(blob_file_cache.GetBlobFileReader(blob_file_number, 
&first)); + ASSERT_OK(blob_file_cache.GetBlobFileReader(read_options, blob_file_number, + &first)); ASSERT_NE(first.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 0); @@ -213,8 +218,10 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_IOError) { CacheHandleGuard reader; + const ReadOptions read_options; ASSERT_TRUE( - blob_file_cache.GetBlobFileReader(blob_file_number, &reader).IsIOError()); + blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader) + .IsIOError()); ASSERT_EQ(reader.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); @@ -253,8 +260,10 @@ TEST_F(BlobFileCacheTest, GetBlobFileReader_CacheFull) { // strict_capacity_limit is set CacheHandleGuard reader; - ASSERT_TRUE(blob_file_cache.GetBlobFileReader(blob_file_number, &reader) - .IsMemoryLimit()); + const ReadOptions read_options; + ASSERT_TRUE( + blob_file_cache.GetBlobFileReader(read_options, blob_file_number, &reader) + .IsMemoryLimit()); ASSERT_EQ(reader.GetValue(), nullptr); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_OPENS), 1); ASSERT_EQ(options.statistics->getTickerCount(NO_FILE_ERRORS), 1); diff --git a/db/blob/blob_file_reader.cc b/db/blob/blob_file_reader.cc index da7f2bb12..79c0bf50d 100644 --- a/db/blob/blob_file_reader.cc +++ b/db/blob/blob_file_reader.cc @@ -26,9 +26,10 @@ namespace ROCKSDB_NAMESPACE { Status BlobFileReader::Create( - const ImmutableOptions& immutable_options, const FileOptions& file_options, - uint32_t column_family_id, HistogramImpl* blob_file_read_hist, - uint64_t blob_file_number, const std::shared_ptr& io_tracer, + const ImmutableOptions& immutable_options, const ReadOptions& read_options, + const FileOptions& file_options, uint32_t column_family_id, + HistogramImpl* blob_file_read_hist, uint64_t blob_file_number, + const std::shared_ptr& 
io_tracer, std::unique_ptr* blob_file_reader) { assert(blob_file_reader); assert(!*blob_file_reader); @@ -52,15 +53,17 @@ Status BlobFileReader::Create( CompressionType compression_type = kNoCompression; { - const Status s = ReadHeader(file_reader.get(), column_family_id, statistics, - &compression_type); + const Status s = + ReadHeader(file_reader.get(), read_options, column_family_id, + statistics, &compression_type); if (!s.ok()) { return s; } } { - const Status s = ReadFooter(file_reader.get(), file_size, statistics); + const Status s = + ReadFooter(file_reader.get(), read_options, file_size, statistics); if (!s.ok()) { return s; } @@ -134,6 +137,7 @@ Status BlobFileReader::OpenFile( } Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint32_t column_family_id, Statistics* statistics, CompressionType* compression_type) { @@ -151,9 +155,10 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, constexpr size_t read_size = BlobLogHeader::kSize; // TODO: rate limit reading headers from blob files. 
- const Status s = ReadFromFile(file_reader, read_offset, read_size, - statistics, &header_slice, &buf, &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */); + const Status s = + ReadFromFile(file_reader, read_options, read_offset, read_size, + statistics, &header_slice, &buf, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); if (!s.ok()) { return s; } @@ -187,6 +192,7 @@ Status BlobFileReader::ReadHeader(const RandomAccessFileReader* file_reader, } Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint64_t file_size, Statistics* statistics) { assert(file_size >= BlobLogHeader::kSize + BlobLogFooter::kSize); assert(file_reader); @@ -202,9 +208,10 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, constexpr size_t read_size = BlobLogFooter::kSize; // TODO: rate limit reading footers from blob files. - const Status s = ReadFromFile(file_reader, read_offset, read_size, - statistics, &footer_slice, &buf, &aligned_buf, - Env::IO_TOTAL /* rate_limiter_priority */); + const Status s = + ReadFromFile(file_reader, read_options, read_offset, read_size, + statistics, &footer_slice, &buf, &aligned_buf, + Env::IO_TOTAL /* rate_limiter_priority */); if (!s.ok()) { return s; } @@ -232,6 +239,7 @@ Status BlobFileReader::ReadFooter(const RandomAccessFileReader* file_reader, } Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint64_t read_offset, size_t read_size, Statistics* statistics, Slice* slice, Buffer* buf, AlignedBuf* aligned_buf, @@ -246,17 +254,23 @@ Status BlobFileReader::ReadFromFile(const RandomAccessFileReader* file_reader, Status s; + IOOptions io_options; + s = file_reader->PrepareIOOptions(read_options, io_options); + if (!s.ok()) { + return s; + } + if (file_reader->use_direct_io()) { constexpr char* scratch = nullptr; - s = file_reader->Read(IOOptions(), read_offset, read_size, slice, scratch, + 
s = file_reader->Read(io_options, read_offset, read_size, slice, scratch, aligned_buf, rate_limiter_priority); } else { buf->reset(new char[read_size]); constexpr AlignedBuf* aligned_scratch = nullptr; - s = file_reader->Read(IOOptions(), read_offset, read_size, slice, - buf->get(), aligned_scratch, rate_limiter_priority); + s = file_reader->Read(io_options, read_offset, read_size, slice, buf->get(), + aligned_scratch, rate_limiter_priority); } if (!s.ok()) { @@ -324,8 +338,13 @@ Status BlobFileReader::GetBlob( Status s; constexpr bool for_compaction = true; + IOOptions io_options; + s = file_reader_->PrepareIOOptions(read_options, io_options); + if (!s.ok()) { + return s; + } prefetched = prefetch_buffer->TryReadFromCache( - IOOptions(), file_reader_.get(), record_offset, + io_options, file_reader_.get(), record_offset, static_cast(record_size), &record_slice, &s, read_options.rate_limiter_priority, for_compaction); if (!s.ok()) { @@ -338,10 +357,10 @@ Status BlobFileReader::GetBlob( PERF_COUNTER_ADD(blob_read_count, 1); PERF_COUNTER_ADD(blob_read_byte, record_size); PERF_TIMER_GUARD(blob_read_time); - const Status s = ReadFromFile(file_reader_.get(), record_offset, - static_cast(record_size), statistics_, - &record_slice, &buf, &aligned_buf, - read_options.rate_limiter_priority); + const Status s = ReadFromFile( + file_reader_.get(), read_options, record_offset, + static_cast(record_size), statistics_, &record_slice, &buf, + &aligned_buf, read_options.rate_limiter_priority); if (!s.ok()) { return s; } diff --git a/db/blob/blob_file_reader.h b/db/blob/blob_file_reader.h index 75b756da1..990e32540 100644 --- a/db/blob/blob_file_reader.h +++ b/db/blob/blob_file_reader.h @@ -29,6 +29,7 @@ class Statistics; class BlobFileReader { public: static Status Create(const ImmutableOptions& immutable_options, + const ReadOptions& read_options, const FileOptions& file_options, uint32_t column_family_id, HistogramImpl* blob_file_read_hist, @@ -74,15 +75,18 @@ class 
BlobFileReader { std::unique_ptr* file_reader); static Status ReadHeader(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint32_t column_family_id, Statistics* statistics, CompressionType* compression_type); static Status ReadFooter(const RandomAccessFileReader* file_reader, - uint64_t file_size, Statistics* statistics); + const ReadOptions& read_options, uint64_t file_size, + Statistics* statistics); using Buffer = std::unique_ptr; static Status ReadFromFile(const RandomAccessFileReader* file_reader, + const ReadOptions& read_options, uint64_t read_offset, size_t read_size, Statistics* statistics, Slice* slice, Buffer* buf, AlignedBuf* aligned_buf, diff --git a/db/blob/blob_file_reader_test.cc b/db/blob/blob_file_reader_test.cc index 03458e2b5..c8e4e5954 100644 --- a/db/blob/blob_file_reader_test.cc +++ b/db/blob/blob_file_reader_test.cc @@ -172,12 +172,12 @@ TEST_F(BlobFileReaderTest, CreateReaderAndGetBlob) { std::unique_ptr reader; + ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); // Make sure the blob can be retrieved with and without checksum verification - ReadOptions read_options; read_options.verify_checksums = false; constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; @@ -479,11 +479,11 @@ TEST_F(BlobFileReaderTest, Malformed) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, 
blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -513,11 +513,11 @@ TEST_F(BlobFileReaderTest, TTL) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -552,11 +552,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInHeader) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -591,11 +591,11 @@ TEST_F(BlobFileReaderTest, ExpirationRangeInFooter) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, - &reader) + const ReadOptions read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, + nullptr /*IOTracer*/, &reader) .IsCorruption()); } @@ -629,9 +629,9 @@ TEST_F(BlobFileReaderTest, IncorrectColumnFamily) { std::unique_ptr reader; constexpr uint32_t incorrect_column_family_id = 2; - - ASSERT_TRUE(BlobFileReader::Create(immutable_options, FileOptions(), - incorrect_column_family_id, + const ReadOptions 
read_options; + ASSERT_TRUE(BlobFileReader::Create(immutable_options, read_options, + FileOptions(), incorrect_column_family_id, blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader) .IsCorruption()); @@ -664,10 +664,10 @@ TEST_F(BlobFileReaderTest, BlobCRCError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::VerifyBlob:CheckBlobCRC", [](void* arg) { @@ -728,13 +728,12 @@ TEST_F(BlobFileReaderTest, Compression) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); // Make sure the blob can be retrieved with and without checksum verification - ReadOptions read_options; read_options.verify_checksums = false; constexpr FilePrefetchBuffer* prefetch_buffer = nullptr; @@ -803,10 +802,10 @@ TEST_F(BlobFileReaderTest, UncompressionError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; ASSERT_OK(BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader)); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader)); 
SyncPoint::GetInstance()->SetCallBack( "BlobFileReader::UncompressBlobIfNeeded:TamperWithResult", [](void* arg) { @@ -895,10 +894,10 @@ TEST_P(BlobFileReaderIOErrorTest, IOError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; const Status s = BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader); const bool fail_during_create = (sync_point_ != "BlobFileReader::GetBlob:ReadFromFile"); @@ -983,10 +982,10 @@ TEST_P(BlobFileReaderDecodingErrorTest, DecodingError) { constexpr HistogramImpl* blob_file_read_hist = nullptr; std::unique_ptr reader; - + const ReadOptions read_options; const Status s = BlobFileReader::Create( - immutable_options, FileOptions(), column_family_id, blob_file_read_hist, - blob_file_number, nullptr /*IOTracer*/, &reader); + immutable_options, read_options, FileOptions(), column_family_id, + blob_file_read_hist, blob_file_number, nullptr /*IOTracer*/, &reader); const bool fail_during_create = sync_point_ != "BlobFileReader::GetBlob:TamperWithResult"; diff --git a/db/blob/blob_source.cc b/db/blob/blob_source.cc index 1e866c7dd..5e5e81355 100644 --- a/db/blob/blob_source.cc +++ b/db/blob/blob_source.cc @@ -209,7 +209,8 @@ Status BlobSource::GetBlob(const ReadOptions& read_options, { CacheHandleGuard blob_file_reader; - s = blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader); + s = blob_file_cache_->GetBlobFileReader(read_options, file_number, + &blob_file_reader); if (!s.ok()) { return s; } @@ -372,8 +373,8 @@ void BlobSource::MultiGetBlobFromOneFile(const ReadOptions& read_options, } CacheHandleGuard blob_file_reader; - Status s = - blob_file_cache_->GetBlobFileReader(file_number, &blob_file_reader); + Status s = 
blob_file_cache_->GetBlobFileReader(read_options, file_number, + &blob_file_reader); if (!s.ok()) { for (size_t i = 0; i < _blob_reqs.size(); ++i) { BlobReadRequest* const req = _blob_reqs[i].first; diff --git a/db/blob/blob_source.h b/db/blob/blob_source.h index cdc218747..d5e009b54 100644 --- a/db/blob/blob_source.h +++ b/db/blob/blob_source.h @@ -95,9 +95,9 @@ class BlobSource { uint64_t* bytes_read); inline Status GetBlobFileReader( - uint64_t blob_file_number, + const ReadOptions& read_options, uint64_t blob_file_number, CacheHandleGuard* blob_file_reader) { - return blob_file_cache_->GetBlobFileReader(blob_file_number, + return blob_file_cache_->GetBlobFileReader(read_options, blob_file_number, blob_file_reader); } diff --git a/db/blob/blob_source_test.cc b/db/blob/blob_source_test.cc index a82d5dd0d..a9771565a 100644 --- a/db/blob/blob_source_test.cc +++ b/db/blob/blob_source_test.cc @@ -517,7 +517,8 @@ TEST_F(BlobSourceTest, GetCompressedBlobs) { compression, blob_offsets, blob_sizes); CacheHandleGuard blob_file_reader; - ASSERT_OK(blob_source.GetBlobFileReader(file_number, &blob_file_reader)); + ASSERT_OK(blob_source.GetBlobFileReader(read_options, file_number, + &blob_file_reader)); ASSERT_NE(blob_file_reader.GetValue(), nullptr); const uint64_t file_size = blob_file_reader.GetValue()->GetFileSize(); @@ -1139,12 +1140,13 @@ TEST_F(BlobSecondaryCacheTest, GetBlobsFromSecondaryCache) { blob_file_cache.get()); CacheHandleGuard file_reader; - ASSERT_OK(blob_source.GetBlobFileReader(file_number, &file_reader)); + ReadOptions read_options; + ASSERT_OK( + blob_source.GetBlobFileReader(read_options, file_number, &file_reader)); ASSERT_NE(file_reader.GetValue(), nullptr); const uint64_t file_size = file_reader.GetValue()->GetFileSize(); ASSERT_EQ(file_reader.GetValue()->GetCompressionType(), kNoCompression); - ReadOptions read_options; read_options.verify_checksums = true; auto blob_cache = options_.blob_cache; diff --git a/db/builder.cc b/db/builder.cc index 
b86dd6b9c..be1ec29bf 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -56,8 +56,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, - const FileOptions& file_options, TableCache* table_cache, - InternalIterator* iter, + const FileOptions& file_options, const ReadOptions& read_options, + TableCache* table_cache, InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, @@ -255,8 +255,8 @@ Status BuildTable( SizeApproximationOptions approx_opts; approx_opts.files_size_error_margin = 0.1; meta->compensated_range_deletion_size += versions->ApproximateSize( - approx_opts, version, kv.first.Encode(), tombstone_end.Encode(), - 0 /* start_level */, -1 /* end_level */, + approx_opts, read_options, version, kv.first.Encode(), + tombstone_end.Encode(), 0 /* start_level */, -1 /* end_level */, TableReaderCaller::kFlush); } last_tombstone_start_user_key = range_del_it->start_key(); @@ -369,7 +369,6 @@ Status BuildTable( // here because this is a special case after we finish the table building. // No matter whether use_direct_io_for_flush_and_compaction is true, // the goal is to cache it here for further user reads. 
- ReadOptions read_options; std::unique_ptr it(table_cache->NewIterator( read_options, file_options, tboptions.internal_comparator, *meta, nullptr /* range_del_agg */, mutable_cf_options.prefix_extractor, diff --git a/db/builder.h b/db/builder.h index 063da5ca9..6a6a1866a 100644 --- a/db/builder.h +++ b/db/builder.h @@ -53,8 +53,8 @@ TableBuilder* NewTableBuilder(const TableBuilderOptions& tboptions, extern Status BuildTable( const std::string& dbname, VersionSet* versions, const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions, - const FileOptions& file_options, TableCache* table_cache, - InternalIterator* iter, + const FileOptions& file_options, const ReadOptions& read_options, + TableCache* table_cache, InternalIterator* iter, std::vector> range_del_iters, FileMetaData* meta, std::vector* blob_file_additions, diff --git a/db/column_family.cc b/db/column_family.cc index b3d04dc6a..24ea46ac4 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -1141,6 +1141,7 @@ Status ColumnFamilyData::RangesOverlapWithMemtables( *overlap = false; // Create an InternalIterator over all unflushed memtables Arena arena; + // TODO: plumb Env::IOActivity ReadOptions read_opts; read_opts.total_order_seek = true; MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena); diff --git a/db/compaction/compaction_iterator.cc b/db/compaction/compaction_iterator.cc index c2ac7f692..5be7b565a 100644 --- a/db/compaction/compaction_iterator.cc +++ b/db/compaction/compaction_iterator.cc @@ -1412,6 +1412,7 @@ std::unique_ptr CompactionIterator::CreateBlobFetcherIfNeeded( } ReadOptions read_options; + read_options.io_activity = Env::IOActivity::kCompaction; read_options.fill_cache = false; return std::unique_ptr(new BlobFetcher(version, read_options)); diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 331be915e..8a326a508 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -192,8 +192,8 @@ 
CompactionJob::CompactionJob( assert(log_buffer_ != nullptr); const auto* cfd = compact_->compaction->column_family_data(); - ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, - db_options_.enable_thread_tracking); + ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking); + ThreadStatusUtil::SetColumnFamily(cfd); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); ReportStartedCompaction(compaction); } @@ -204,10 +204,6 @@ CompactionJob::~CompactionJob() { } void CompactionJob::ReportStartedCompaction(Compaction* compaction) { - const auto* cfd = compact_->compaction->column_family_data(); - ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env, - db_options_.enable_thread_tracking); - ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, job_id_); @@ -291,12 +287,14 @@ void CompactionJob::Prepare() { c->immutable_options()->preclude_last_level_data_seconds); if (preserve_time_duration > 0) { + const ReadOptions read_options(Env::IOActivity::kCompaction); // setup seqno_time_mapping_ seqno_time_mapping_.SetMaxTimeDuration(preserve_time_duration); for (const auto& each_level : *c->inputs()) { for (const auto& fmd : each_level.files) { std::shared_ptr tp; - Status s = cfd->current()->GetTableProperties(&tp, fmd, nullptr); + Status s = + cfd->current()->GetTableProperties(read_options, &tp, fmd, nullptr); if (s.ok()) { seqno_time_mapping_.Add(tp->seqno_to_time_mapping) .PermitUncheckedError(); @@ -472,7 +470,7 @@ void CompactionJob::GenSubcompactionBoundaries() { // overlap with N-1 other ranges. Since we requested a relatively large number // (128) of ranges from each input files, even N range overlapping would // cause relatively small inaccuracy. 
- + const ReadOptions read_options(Env::IOActivity::kCompaction); auto* c = compact_->compaction; if (c->max_subcompactions() <= 1 && !(c->immutable_options()->compaction_pri == kRoundRobin && @@ -506,7 +504,7 @@ void CompactionJob::GenSubcompactionBoundaries() { FileMetaData* f = flevel->files[i].file_metadata; std::vector my_anchors; Status s = cfd->table_cache()->ApproximateKeyAnchors( - ReadOptions(), icomp, *f, my_anchors); + read_options, icomp, *f, my_anchors); if (!s.ok() || my_anchors.empty()) { my_anchors.emplace_back(f->largest.user_key(), f->fd.GetFileSize()); } @@ -722,11 +720,12 @@ Status CompactionJob::Run() { // use_direct_io_for_flush_and_compaction is true, we will regard this // verification as user reads since the goal is to cache it here for // further user reads - ReadOptions read_options; + const ReadOptions verify_table_read_options( + Env::IOActivity::kCompaction); InternalIterator* iter = cfd->table_cache()->NewIterator( - read_options, file_options_, cfd->internal_comparator(), - files_output[file_idx]->meta, /*range_del_agg=*/nullptr, - prefix_extractor, + verify_table_read_options, file_options_, + cfd->internal_comparator(), files_output[file_idx]->meta, + /*range_del_agg=*/nullptr, prefix_extractor, /*table_reader_ptr=*/nullptr, cfd->internal_stats()->GetFileReadHist( compact_->compaction->output_level()), @@ -1032,7 +1031,6 @@ void CompactionJob::NotifyOnSubcompactionCompleted( void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { assert(sub_compact); assert(sub_compact->compaction); - if (db_options_.compaction_service) { CompactionServiceJobStatus comp_status = ProcessKeyValueCompactionWithCompactionService(sub_compact); @@ -1083,6 +1081,7 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) { read_options.verify_checksums = true; read_options.fill_cache = false; read_options.rate_limiter_priority = GetRateLimiterPriority(); + read_options.io_activity = 
Env::IOActivity::kCompaction; // Compaction iterators shouldn't be confined to a single prefix. // Compactions use Seek() for // (a) concurrent compactions, @@ -1640,6 +1639,7 @@ Status CompactionJob::InstallCompactionResults( db_mutex_->AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kCompaction); auto* compaction = compact_->compaction; assert(compaction); @@ -1717,8 +1717,8 @@ Status CompactionJob::InstallCompactionResults( } return versions_->LogAndApply(compaction->column_family_data(), - mutable_cf_options, edit, db_mutex_, - db_directory_); + mutable_cf_options, read_options, edit, + db_mutex_, db_directory_); } void CompactionJob::RecordCompactionIOStats() { diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index 578d7067c..9c5784d5e 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -389,9 +389,9 @@ class CompactionJobTestBase : public testing::Test { 0); mutex_.Lock(); - EXPECT_OK( - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_, nullptr)); + EXPECT_OK(versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, &edit, &mutex_, nullptr)); mutex_.Unlock(); } @@ -727,6 +727,7 @@ class CompactionJobTestBase : public testing::Test { ColumnFamilyOptions cf_options_; MutableCFOptions mutable_cf_options_; MutableDBOptions mutable_db_options_; + const ReadOptions read_options_; std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; @@ -2440,4 +2441,3 @@ int main(int argc, char** argv) { RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } - diff --git a/db/compaction/compaction_outputs.cc b/db/compaction/compaction_outputs.cc index 3aedc3fe1..cf5105e41 100644 --- a/db/compaction/compaction_outputs.cc +++ b/db/compaction/compaction_outputs.cc @@ -574,6 +574,7 @@ Status 
CompactionOutputs::AddRangeDels( auto it = range_del_agg_->NewIterator(lower_bound, upper_bound); Slice last_tombstone_start_user_key{}; bool reached_lower_bound = false; + const ReadOptions read_options(Env::IOActivity::kCompaction); for (it->SeekToFirst(); it->Valid(); it->Next()) { auto tombstone = it->Tombstone(); auto kv = tombstone.Serialize(); @@ -713,7 +714,7 @@ Status CompactionOutputs::AddRangeDels( approx_opts.files_size_error_margin = 0.1; auto approximate_covered_size = compaction_->input_version()->version_set()->ApproximateSize( - approx_opts, compaction_->input_version(), + approx_opts, read_options, compaction_->input_version(), tombstone_start.Encode(), tombstone_end.Encode(), compaction_->output_level() + 1 /* start_level */, -1 /* end_level */, kCompaction); diff --git a/db/convenience.cc b/db/convenience.cc index f18473feb..8ab7cbc13 100644 --- a/db/convenience.cc +++ b/db/convenience.cc @@ -33,7 +33,9 @@ Status DeleteFilesInRanges(DB* db, ColumnFamilyHandle* column_family, Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, const std::string& file_path) { - return VerifySstFileChecksum(options, env_options, ReadOptions(), file_path); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + return VerifySstFileChecksum(options, env_options, read_options, file_path); } Status VerifySstFileChecksum(const Options& options, const EnvOptions& env_options, @@ -56,8 +58,9 @@ Status VerifySstFileChecksum(const Options& options, std::unique_ptr file_reader( new RandomAccessFileReader( std::move(file), file_path, ioptions.clock, nullptr /* io_tracer */, - nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, - ioptions.rate_limiter.get())); + ioptions.stats /* stats */, + Histograms::SST_READ_MICROS /* hist_type */, + nullptr /* file_read_hist */, ioptions.rate_limiter.get())); const bool kImmortal = true; auto reader_options = TableReaderOptions( ioptions, options.prefix_extractor, env_options, 
internal_comparator, @@ -76,4 +79,3 @@ Status VerifySstFileChecksum(const Options& options, } } // namespace ROCKSDB_NAMESPACE - diff --git a/db/corruption_test.cc b/db/corruption_test.cc index ab506cdb7..7027181eb 100644 --- a/db/corruption_test.cc +++ b/db/corruption_test.cc @@ -762,9 +762,11 @@ TEST_F(CorruptionTest, RangeDeletionCorrupted) { fs->GetFileSize(filename, file_opts.io_options, &file_size, nullptr)); BlockHandle range_del_handle; - ASSERT_OK(FindMetaBlockInFile( - file_reader.get(), file_size, kBlockBasedTableMagicNumber, - ImmutableOptions(options_), kRangeDelBlockName, &range_del_handle)); + const ReadOptions read_options; + ASSERT_OK(FindMetaBlockInFile(file_reader.get(), file_size, + kBlockBasedTableMagicNumber, + ImmutableOptions(options_), read_options, + kRangeDelBlockName, &range_del_handle)); ASSERT_OK(TryReopen()); ASSERT_OK(test::CorruptFile(env_.get(), filename, @@ -1666,4 +1668,3 @@ int main(int argc, char** argv) { RegisterCustomObjects(argc, argv); return RUN_ALL_TESTS(); } - diff --git a/db/db_impl/compacted_db_impl.cc b/db/db_impl/compacted_db_impl.cc index 70de79858..3d824baf2 100644 --- a/db/db_impl/compacted_db_impl.cc +++ b/db/db_impl/compacted_db_impl.cc @@ -46,6 +46,11 @@ Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*, const Slice& key, PinnableSlice* value, std::string* timestamp) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } assert(user_comparator_); if (options.timestamp) { const Status s = FailIfTsMismatchCf( diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 69350af34..fcfb77731 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -327,6 +327,9 @@ Status DBImpl::Resume() { // means a new super version wouldn't have been installed Status 
DBImpl::ResumeImpl(DBRecoverContext context) { mutex_.AssertHeld(); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; WaitForBackgroundWork(); Status s; @@ -368,7 +371,7 @@ Status DBImpl::ResumeImpl(DBRecoverContext context) { assert(cfh); ColumnFamilyData* cfd = cfh->cfd(); const MutableCFOptions& cf_opts = *cfd->GetLatestMutableCFOptions(); - s = versions_->LogAndApply(cfd, cf_opts, &edit, &mutex_, + s = versions_->LogAndApply(cfd, cf_opts, read_options, &edit, &mutex_, directories_.GetDbDir()); if (!s.ok()) { io_s = versions_->io_status(); @@ -1147,6 +1150,8 @@ FSDirectory* DBImpl::GetDataDir(ColumnFamilyData* cfd, size_t path_id) const { Status DBImpl::SetOptions( ColumnFamilyHandle* column_family, const std::unordered_map& options_map) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto* cfd = static_cast_with_check(column_family)->cfd(); if (options_map.empty()) { @@ -1168,8 +1173,8 @@ Status DBImpl::SetOptions( new_options = *cfd->GetLatestMutableCFOptions(); // Append new version to recompute compaction score. VersionEdit dummy_edit; - s = versions_->LogAndApply(cfd, new_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, new_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); // Trigger possible flush/compactions. This has to be before we persist // options to file, otherwise there will be a deadlock with writer // thread. 
@@ -1507,7 +1512,9 @@ Status DBImpl::SyncWAL() { } if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - status = ApplyWALToManifest(&synced_wals); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = ApplyWALToManifest(read_options, &synced_wals); } TEST_SYNC_POINT("DBImpl::SyncWAL:BeforeMarkLogsSynced:2"); @@ -1515,11 +1522,13 @@ Status DBImpl::SyncWAL() { return status; } -Status DBImpl::ApplyWALToManifest(VersionEdit* synced_wals) { +Status DBImpl::ApplyWALToManifest(const ReadOptions& read_options, + VersionEdit* synced_wals) { // not empty, write to MANIFEST. mutex_.AssertHeld(); + Status status = versions_->LogAndApplyToDefaultColumnFamily( - synced_wals, &mutex_, directories_.GetDbDir()); + read_options, synced_wals, &mutex_, directories_.GetDbDir()); if (!status.ok() && versions_->io_status().IsIOError()) { status = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); @@ -1936,6 +1945,12 @@ Status DBImpl::GetEntity(const ReadOptions& read_options, "Cannot call GetEntity without a PinnableWideColumns object"); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetEntity with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + columns->Reset(); GetImplOptions get_impl_options; @@ -1981,6 +1996,12 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key, assert(get_impl_options.column_family); + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + if (read_options.timestamp) { const Status s = FailIfTsMismatchCf(get_impl_options.column_family, *(read_options.timestamp), @@ -2931,6 +2952,11 @@ Status DBImpl::MultiGetImpl( autovector* sorted_keys, SuperVersion* super_version, SequenceNumber snapshot, ReadCallback* callback) { + if 
(read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call MultiGet with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); StopWatch sw(immutable_db_options_.clock, stats_, DB_MULTIGET); @@ -3129,6 +3155,8 @@ Status DBImpl::CreateColumnFamilies( Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, const std::string& column_family_name, ColumnFamilyHandle** handle) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status s; *handle = nullptr; @@ -3169,9 +3197,9 @@ Status DBImpl::CreateColumnFamilyImpl(const ColumnFamilyOptions& cf_options, write_thread_.EnterUnbatched(&w, &mutex_); // LogAndApply will both write the creation in MANIFEST and create // ColumnFamilyData object - s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), &edit, - &mutex_, directories_.GetDbDir(), false, - &cf_options); + s = versions_->LogAndApply(nullptr, MutableCFOptions(cf_options), + read_options, &edit, &mutex_, + directories_.GetDbDir(), false, &cf_options); write_thread_.ExitUnbatched(&w); } if (s.ok()) { @@ -3250,6 +3278,8 @@ Status DBImpl::DropColumnFamilies( } Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto cfh = static_cast_with_check(column_family); auto cfd = cfh->cfd(); if (cfd->GetID() == 0) { @@ -3272,8 +3302,9 @@ Status DBImpl::DropColumnFamilyImpl(ColumnFamilyHandle* column_family) { // we drop column family from a single write thread WriteThread::Writer w; write_thread_.EnterUnbatched(&w, &mutex_); - s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, - &mutex_, directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, + directories_.GetDbDir()); write_thread_.ExitUnbatched(&w); } if (s.ok()) { @@ 
-3324,10 +3355,13 @@ bool DBImpl::KeyMayExist(const ReadOptions& read_options, std::string* value, std::string* timestamp, bool* value_found) { assert(value != nullptr); + assert(read_options.io_activity == Env::IOActivity::kUnknown); + if (value_found != nullptr) { // falsify later if key-may-exist but can't fetch value *value_found = true; } + // TODO: plumb Env::IOActivity ReadOptions roptions = read_options; roptions.read_tier = kBlockCacheTier; // read from block cache only PinnableSlice pinnable_val; @@ -3356,6 +3390,11 @@ Iterator* DBImpl::NewIterator(const ReadOptions& read_options, return NewErrorIterator(Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators.")); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } assert(column_family); @@ -3491,6 +3530,11 @@ Status DBImpl::NewIterators( return Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call NewIterators with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } if (read_options.timestamp) { for (auto* cf : column_families) { @@ -3808,7 +3852,9 @@ Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, version->Ref(); mutex_.Unlock(); - auto s = version->GetPropertiesOfAllTables(props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = version->GetPropertiesOfAllTables(read_options, props); // Decrement the ref count mutex_.Lock(); @@ -3830,7 +3876,9 @@ Status DBImpl::GetPropertiesOfTablesInRange(ColumnFamilyHandle* column_family, version->Ref(); mutex_.Unlock(); - auto s = version->GetPropertiesOfTablesInRange(range, n, props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s 
= version->GetPropertiesOfTablesInRange(read_options, range, n, props); // Decrement the ref count mutex_.Lock(); @@ -4163,6 +4211,8 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, SuperVersion* sv = GetAndRefSuperVersion(cfd); v = sv->current; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; for (int i = 0; i < n; i++) { Slice start = range[i].start; Slice limit = range[i].limit; @@ -4184,7 +4234,7 @@ Status DBImpl::GetApproximateSizes(const SizeApproximationOptions& options, sizes[i] = 0; if (options.include_files) { sizes[i] += versions_->ApproximateSize( - options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, + options, read_options, v, k1.Encode(), k2.Encode(), /*start_level=*/0, /*end_level=*/-1, TableReaderCaller::kUserApproximateSize); } if (options.include_memtables) { @@ -4232,6 +4282,8 @@ Status DBImpl::GetUpdatesSince( } Status DBImpl::DeleteFile(std::string name) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; uint64_t number; FileType type; WalFileType log_type; @@ -4311,7 +4363,8 @@ Status DBImpl::DeleteFile(std::string name) { edit.SetColumnFamily(cfd->GetID()); edit.DeleteFile(level, number); status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_contexts[0], @@ -4333,6 +4386,8 @@ Status DBImpl::DeleteFile(std::string name) { Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, const RangePtr* ranges, size_t n, bool include_end) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status status = Status::OK(); auto cfh = static_cast_with_check(column_family); ColumnFamilyData* cfd = cfh->cfd(); @@ -4398,7 +4453,8 @@ Status DBImpl::DeleteFilesInRanges(ColumnFamilyHandle* column_family, } input_version->Ref(); status = 
versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_contexts[0], @@ -5034,6 +5090,7 @@ Status DBImpl::GetLatestSequenceForKey( MergeContext merge_context; SequenceNumber max_covering_tombstone_seq = 0; + // TODO: plumb Env::IOActivity ReadOptions read_options; SequenceNumber current_seq = versions_->LastSequence(); @@ -5189,6 +5246,8 @@ Status DBImpl::IngestExternalFile( Status DBImpl::IngestExternalFiles( const std::vector& args) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; if (args.empty()) { return Status::InvalidArgument("ingestion arg list is empty"); } @@ -5406,9 +5465,9 @@ Status DBImpl::IngestExternalFiles( } assert(0 == num_entries); } - status = - versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, - edit_lists, &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply(cfds_to_commit, mutable_cf_options_list, + read_options, edit_lists, &mutex_, + directories_.GetDbDir()); // It is safe to update VersionSet last seqno here after LogAndApply since // LogAndApply persists last sequence number from VersionEdits, // which are from file's largest seqno and not from VersionSet. @@ -5509,6 +5568,8 @@ Status DBImpl::CreateColumnFamilyWithImport( const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) { assert(handle != nullptr); assert(*handle == nullptr); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::string cf_comparator_name = options.comparator->Name(); if (cf_comparator_name != metadata.db_comparator_name) { return Status::InvalidArgument("Comparator name mismatch"); @@ -5550,8 +5611,9 @@ Status DBImpl::CreateColumnFamilyWithImport( // file, we have to make sure the file number will never being reused. 
next_file_number = versions_->FetchAddFileNumber(metadata.files.size()); auto cf_options = cfd->GetLatestMutableCFOptions(); - status = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + status = + versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); } @@ -5587,8 +5649,9 @@ Status DBImpl::CreateColumnFamilyWithImport( // Install job edit [Mutex will be unlocked here] if (status.ok()) { auto cf_options = cfd->GetLatestMutableCFOptions(); - status = versions_->LogAndApply(cfd, *cf_options, import_job.edit(), - &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply(cfd, *cf_options, read_options, + import_job.edit(), &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &sv_context, *cf_options); } @@ -5648,6 +5711,12 @@ Status DBImpl::VerifyChecksumInternal(const ReadOptions& read_options, Status s; + if (read_options.io_activity != Env::IOActivity::kUnknown) { + s = Status::InvalidArgument( + "Cannot verify file checksum with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + return s; + } if (use_file_checksum) { FileChecksumGenFactory* const file_checksum_gen_factory = immutable_db_options_.file_checksum_gen_factory.get(); @@ -5761,6 +5830,12 @@ Status DBImpl::VerifyFullFileChecksum(const std::string& file_checksum_expected, const std::string& func_name_expected, const std::string& fname, const ReadOptions& read_options) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call VerifyChecksum with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } + Status s; if (file_checksum_expected == kUnknownFileChecksum) { return s; @@ -5893,6 +5968,8 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( ColumnFamilyData* cfd, uint64_t num, 
std::unique_ptr::iterator>& pending_output_elem, uint64_t* next_file_number) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status s; SuperVersionContext dummy_sv_ctx(true /* create_superversion */); assert(nullptr != next_file_number); @@ -5910,8 +5987,8 @@ Status DBImpl::ReserveFileNumbersBeforeIngestion( // reuse the file number that has already assigned to the internal file, // and this will overwrite the external file. To protect the external // file, we have to make sure the file number will never being reused. - s = versions_->LogAndApply(cfd, *cf_options, &dummy_edit, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApply(cfd, *cf_options, read_options, &dummy_edit, + &mutex_, directories_.GetDbDir()); if (s.ok()) { InstallSuperVersionAndScheduleWork(cfd, &dummy_sv_ctx, *cf_options); } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 28a6a4f31..50f9a8ca5 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -2127,7 +2127,7 @@ class DBImpl : public DB { // helper function to call after some of the logs_ were synced void MarkLogsSynced(uint64_t up_to, bool synced_dir, VersionEdit* edit); - Status ApplyWALToManifest(VersionEdit* edit); + Status ApplyWALToManifest(const ReadOptions& read_options, VersionEdit* edit); // WALs with log number up to up_to are not synced successfully. 
void MarkLogsNotSynced(uint64_t up_to); diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index 47ce69aeb..06cc2e6a8 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -229,7 +229,9 @@ Status DBImpl::FlushMemTableToOutputFile( log_io_s = SyncClosedLogs(job_context, &synced_wals); mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { - log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals)); + const ReadOptions read_options(Env::IOActivity::kFlush); + log_io_s = + status_to_io_status(ApplyWALToManifest(read_options, &synced_wals)); TEST_SYNC_POINT_CALLBACK("DBImpl::FlushMemTableToOutputFile:CommitWal:1", nullptr); } @@ -492,7 +494,9 @@ Status DBImpl::AtomicFlushMemTablesToOutputFiles( log_io_s = SyncClosedLogs(job_context, &synced_wals); mutex_.Lock(); if (log_io_s.ok() && synced_wals.IsWalAddition()) { - log_io_s = status_to_io_status(ApplyWALToManifest(&synced_wals)); + const ReadOptions read_options(Env::IOActivity::kFlush); + log_io_s = + status_to_io_status(ApplyWALToManifest(read_options, &synced_wals)); } if (!log_io_s.ok() && !log_io_s.IsShutdownInProgress() && @@ -956,6 +960,9 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); edit.SetFullHistoryTsLow(ts_low); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; TEST_SYNC_POINT_CALLBACK("DBImpl::IncreaseFullHistoryTsLowImpl:BeforeEdit", &edit); @@ -969,7 +976,8 @@ Status DBImpl::IncreaseFullHistoryTsLowImpl(ColumnFamilyData* cfd, } Status s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (!s.ok()) { return s; } @@ -1080,6 +1088,7 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options, ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = 
Env::IOActivity::kCompaction; bool overlap; for (int level = 0; level < current_version->storage_info()->num_non_empty_levels(); @@ -1639,6 +1648,8 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { return Status::InvalidArgument("Target level exceeds number of levels"); } + const ReadOptions read_options(Env::IOActivity::kCompaction); + SuperVersionContext sv_context(/* create_superversion */ true); InstrumentedMutexLock guard_lock(&mutex_); @@ -1753,8 +1764,9 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); - Status status = versions_->LogAndApply(cfd, mutable_cf_options, &edit, - &mutex_, directories_.GetDbDir()); + Status status = + versions_->LogAndApply(cfd, mutable_cf_options, read_options, &edit, + &mutex_, directories_.GetDbDir()); cfd->compaction_picker()->UnregisterCompaction(c.get()); c.reset(); @@ -3189,6 +3201,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, mutex_.AssertHeld(); TEST_SYNC_POINT("DBImpl::BackgroundCompaction:Start"); + const ReadOptions read_options(Env::IOActivity::kCompaction); + bool is_manual = (manual_compaction != nullptr); std::unique_ptr c; if (prepicked_compaction != nullptr && @@ -3399,9 +3413,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, for (const auto& f : *c->inputs(0)) { c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } - status = versions_->LogAndApply(c->column_family_data(), - *c->mutable_cf_options(), c->edit(), - &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply( + c->column_family_data(), *c->mutable_cf_options(), read_options, + c->edit(), &mutex_, directories_.GetDbDir()); io_s = versions_->io_status(); InstallSuperVersionAndScheduleWork(c->column_family_data(), &job_context->superversion_contexts[0], @@ -3418,9 +3432,7 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, c->column_family_data()); // 
Instrument for event update // TODO(yhchiang): add op details for showing trivial-move. - ThreadStatusUtil::SetColumnFamily( - c->column_family_data(), c->column_family_data()->ioptions()->env, - immutable_db_options_.enable_thread_tracking); + ThreadStatusUtil::SetColumnFamily(c->column_family_data()); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION); compaction_job_stats.num_input_files = c->num_input_files(0); @@ -3466,9 +3478,9 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, vstorage->GetNextCompactCursor(start_level, c->num_input_files(0))); } } - status = versions_->LogAndApply(c->column_family_data(), - *c->mutable_cf_options(), c->edit(), - &mutex_, directories_.GetDbDir()); + status = versions_->LogAndApply( + c->column_family_data(), *c->mutable_cf_options(), read_options, + c->edit(), &mutex_, directories_.GetDbDir()); io_s = versions_->io_status(); // Use latest MutableCFOptions InstallSuperVersionAndScheduleWork(c->column_family_data(), @@ -3782,6 +3794,8 @@ void DBImpl::BuildCompactionJobInfo( compaction_job_info->table_properties = c->GetOutputTableProperties(); compaction_job_info->compaction_reason = c->compaction_reason(); compaction_job_info->compression = c->output_compression(); + + const ReadOptions read_options(Env::IOActivity::kCompaction); for (size_t i = 0; i < c->num_input_levels(); ++i) { for (const auto fmd : *c->inputs(i)) { const FileDescriptor& desc = fmd->fd; @@ -3793,7 +3807,7 @@ void DBImpl::BuildCompactionJobInfo( static_cast(i), file_number, fmd->oldest_blob_file_number}); if (compaction_job_info->table_properties.count(fn) == 0) { std::shared_ptr tp; - auto s = current->GetTableProperties(&tp, fmd, &fn); + auto s = current->GetTableProperties(read_options, &tp, fmd, &fn); if (s.ok()) { compaction_job_info->table_properties[fn] = tp; } diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index 3e43fe498..5e3b7ba61 100644 --- a/db/db_impl/db_impl_experimental.cc +++ 
b/db/db_impl/db_impl_experimental.cc @@ -61,7 +61,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { "PromoteL0 FAILED. Invalid target level %d\n", target_level); return Status::InvalidArgument("Invalid target level"); } - + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status status; VersionEdit edit; JobContext job_context(next_job_id_.fetch_add(1), true); @@ -141,7 +142,8 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { } status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, directories_.GetDbDir()); + read_options, &edit, &mutex_, + directories_.GetDbDir()); if (status.ok()) { InstallSuperVersionAndScheduleWork(cfd, &job_context.superversion_contexts[0], diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 1758f4cc6..91ef84266 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -19,6 +19,7 @@ #include "file/writable_file_writer.h" #include "logging/logging.h" #include "monitoring/persistent_stats_history.h" +#include "monitoring/thread_status_util.h" #include "options/options_helper.h" #include "rocksdb/table.h" #include "rocksdb/wal_filter.h" @@ -923,8 +924,9 @@ Status DBImpl::InitPersistStatsColumnFamily() { Status DBImpl::LogAndApplyForRecovery(const RecoveryContext& recovery_ctx) { mutex_.AssertHeld(); assert(versions_->descriptor_log_ == nullptr); + const ReadOptions read_options(Env::IOActivity::kDBOpen); Status s = versions_->LogAndApply( - recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_, + recovery_ctx.cfds_, recovery_ctx.mutable_cf_opts_, read_options, recovery_ctx.edit_lists_, &mutex_, directories_.GetDbDir()); if (s.ok() && !(recovery_ctx.files_to_delete_.empty())) { mutex_.Unlock(); @@ -1577,6 +1579,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); ReadOptions ro; ro.total_order_seek = 
true; + ro.io_activity = Env::IOActivity::kDBOpen; Arena arena; Status s; TableProperties table_properties; @@ -1635,10 +1638,11 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, SeqnoToTimeMapping empty_seqno_time_mapping; Version* version = cfd->current(); version->Ref(); + const ReadOptions read_option(Env::IOActivity::kDBOpen); s = BuildTable( dbname_, versions_.get(), immutable_db_options_, tboptions, - file_options_for_compaction_, cfd->table_cache(), iter.get(), - std::move(range_del_iters), &meta, &blob_file_additions, + file_options_for_compaction_, read_option, cfd->table_cache(), + iter.get(), std::move(range_del_iters), &meta, &blob_file_additions, snapshot_seqs, earliest_write_conflict_snapshot, kMaxSequenceNumber, snapshot_checker, paranoid_file_checks, cfd->internal_stats(), &io_s, io_tracer_, BlobFileCreationReason::kRecovery, @@ -1739,8 +1743,12 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, std::vector* handles, DB** dbptr) { const bool kSeqPerBatch = true; const bool kBatchPerTxn = true; - return DBImpl::Open(db_options, dbname, column_families, handles, dbptr, - !kSeqPerBatch, kBatchPerTxn); + ThreadStatusUtil::SetEnableTracking(db_options.enable_thread_tracking); + ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType::OP_DBOPEN); + Status s = DBImpl::Open(db_options, dbname, column_families, handles, dbptr, + !kSeqPerBatch, kBatchPerTxn); + ThreadStatusUtil::ResetThreadStatus(); + return s; } // TODO: Implement the trimming in flush code path. 
diff --git a/db/db_impl/db_impl_readonly.cc b/db/db_impl/db_impl_readonly.cc index 13bc37edb..871cf8085 100644 --- a/db/db_impl/db_impl_readonly.cc +++ b/db/db_impl/db_impl_readonly.cc @@ -40,6 +40,11 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val, std::string* timestamp) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } assert(pinnable_val != nullptr); // TODO: stopwatch DB_GET needed?, perf timer needed? PERF_TIMER_GUARD(get_snapshot_time); @@ -112,6 +117,11 @@ Status DBImplReadOnly::Get(const ReadOptions& read_options, Iterator* DBImplReadOnly::NewIterator(const ReadOptions& read_options, ColumnFamilyHandle* column_family) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } assert(column_family); if (read_options.timestamp) { const Status s = FailIfTsMismatchCf( diff --git a/db/db_impl/db_impl_secondary.cc b/db/db_impl/db_impl_secondary.cc index f4ee4afbc..c6fcefddc 100644 --- a/db/db_impl/db_impl_secondary.cc +++ b/db/db_impl/db_impl_secondary.cc @@ -345,6 +345,11 @@ Status DBImplSecondary::GetImpl(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val, std::string* timestamp) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } assert(pinnable_val != nullptr); PERF_CPU_TIMER_GUARD(get_cpu_nanos, immutable_db_options_.clock); StopWatch sw(immutable_db_options_.clock, stats_, DB_GET); @@ -445,6 +450,11 @@ Iterator* DBImplSecondary::NewIterator(const ReadOptions& 
read_options, return NewErrorIterator(Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators.")); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } assert(column_family); if (read_options.timestamp) { @@ -511,6 +521,11 @@ Status DBImplSecondary::NewIterators( return Status::NotSupported( "ReadTier::kPersistedData is not yet supported in iterators."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call NewIterators with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } ReadCallback* read_callback = nullptr; // No read callback provided. if (iterators == nullptr) { return Status::InvalidArgument("iterators not allowed to be nullptr"); diff --git a/db/db_impl/db_impl_write.cc b/db/db_impl/db_impl_write.cc index 89a054e4c..fb74434dd 100644 --- a/db/db_impl/db_impl_write.cc +++ b/db/db_impl/db_impl_write.cc @@ -606,7 +606,9 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, log_write_mutex_.Unlock(); if (status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - status = ApplyWALToManifest(&synced_wals); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = ApplyWALToManifest(read_options, &synced_wals); } // Requesting sync with two_write_queues_ is expected to be very rare. 
We @@ -767,7 +769,9 @@ Status DBImpl::PipelinedWriteImpl(const WriteOptions& write_options, } if (w.status.ok() && synced_wals.IsWalAddition()) { InstrumentedMutexLock l(&mutex_); - w.status = ApplyWALToManifest(&synced_wals); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + w.status = ApplyWALToManifest(read_options, &synced_wals); } write_thread_.ExitAsBatchGroupLeader(wal_write_group, w.status); } @@ -1805,7 +1809,7 @@ Status DBImpl::DelayWrite(uint64_t num_bytes, WriteThread& write_thread, bool delayed = false; { StopWatch sw(immutable_db_options_.clock, stats_, WRITE_STALL, - &time_delayed); + Histograms::HISTOGRAM_ENUM_MAX, &time_delayed); // To avoid parallel timed delays (bad throttling), only support them // on the primary write queue. uint64_t delay; @@ -2086,6 +2090,8 @@ void DBImpl::NotifyOnMemTableSealed(ColumnFamilyData* /*cfd*/, // two_write_queues_ is true (This is to simplify the reasoning.) Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutex_.AssertHeld(); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; log::Writer* new_log = nullptr; MemTable* new_mem = nullptr; IOStatus io_s; @@ -2237,8 +2243,8 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { VersionEdit wal_deletion; wal_deletion.DeleteWalsBefore(min_wal_number_to_keep); - s = versions_->LogAndApplyToDefaultColumnFamily(&wal_deletion, &mutex_, - directories_.GetDbDir()); + s = versions_->LogAndApplyToDefaultColumnFamily( + read_options, &wal_deletion, &mutex_, directories_.GetDbDir()); if (!s.ok() && versions_->io_status().IsIOError()) { s = error_handler_.SetBGError(versions_->io_status(), BackgroundErrorReason::kManifestWrite); diff --git a/db/db_iter.cc b/db/db_iter.cc index d564a7ac1..efa5746ff 100644 --- a/db/db_iter.cc +++ b/db/db_iter.cc @@ -196,6 +196,7 @@ bool DBIter::SetBlobValueIfNeeded(const Slice& user_key, // TODO: consider moving ReadOptions from ArenaWrappedDBIter to DBIter to 
// avoid having to copy options back and forth. + // TODO: plumb Env::IOActivity ReadOptions read_options; read_options.read_tier = read_tier_; read_options.fill_cache = fill_cache_; diff --git a/db/db_properties_test.cc b/db/db_properties_test.cc index 074f4e9a8..085ee064c 100644 --- a/db/db_properties_test.cc +++ b/db/db_properties_test.cc @@ -2333,8 +2333,9 @@ TEST_F(DBPropertiesTest, TableMetaIndexKeys) { // Read metaindex BlockContents bc; - ASSERT_OK(ReadMetaIndexBlockInFile(r.get(), file_size, 0U, - ImmutableOptions(options), &bc)); + const ReadOptions read_options; + ASSERT_OK(ReadMetaIndexBlockInFile( + r.get(), file_size, 0U, ImmutableOptions(options), read_options, &bc)); Block metaindex_block(std::move(bc)); std::unique_ptr meta_iter; meta_iter.reset(metaindex_block.NewMetaIterator()); diff --git a/db/db_rate_limiter_test.cc b/db/db_rate_limiter_test.cc index acea673cb..84c2df230 100644 --- a/db/db_rate_limiter_test.cc +++ b/db/db_rate_limiter_test.cc @@ -235,8 +235,9 @@ TEST_P(DBRateLimiterOnReadTest, VerifyChecksum) { ASSERT_EQ(0, options_.rate_limiter->GetTotalRequests(Env::IO_USER)); ASSERT_OK(db_->VerifyChecksum(GetReadOptions())); - // The files are tiny so there should have just been one read per file. 
- int expected = kNumFiles; + // There are 3 reads per file: ReadMetaIndexBlock, + // VerifyChecksumInMetaBlocks, VerifyChecksumInBlocks + int expected = kNumFiles * 3; ASSERT_EQ(expected, options_.rate_limiter->GetTotalRequests(Env::IO_USER)); } diff --git a/db/experimental.cc b/db/experimental.cc index 0e49eeca0..c2dce7fde 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -38,6 +38,8 @@ Status UpdateManifestForFilesState( const DBOptions& db_opts, const std::string& db_name, const std::vector& column_families, const UpdateManifestForFilesStateOptions& opts) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; OfflineManifestWriter w(db_opts, db_name); Status s = w.Recover(column_families); @@ -114,7 +116,7 @@ Status UpdateManifestForFilesState( std::unique_ptr db_dir; s = fs->NewDirectory(db_name, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = w.LogAndApply(cfd, &edit, db_dir.get()); + s = w.LogAndApply(read_options, cfd, &edit, db_dir.get()); } if (s.ok()) { ++cfs_updated; diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index 428c8bc6a..98bd6050a 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -692,6 +692,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( // If customized readahead size is needed, we can pass a user option // all the way to here. Right now we just rely on the default readahead // to keep things simple. 
+ // TODO: plumb Env::IOActivity ReadOptions ro; ro.readahead_size = ingestion_options_.verify_checksums_readahead_size; status = table_reader->VerifyChecksum( @@ -745,6 +746,7 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo( file_to_ingest->num_range_deletions = props->num_range_deletions; ParsedInternalKey key; + // TODO: plumb Env::IOActivity ReadOptions ro; std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, @@ -855,6 +857,7 @@ Status ExternalSstFileIngestionJob::AssignLevelAndSeqnoForIngestedFile( bool overlap_with_db = false; Arena arena; + // TODO: plumb Env::IOActivity ReadOptions ro; ro.total_order_seek = true; int target_level = 0; @@ -1088,4 +1091,3 @@ Status ExternalSstFileIngestionJob::SyncIngestedFile(TWritableFile* file) { } } // namespace ROCKSDB_NAMESPACE - diff --git a/db/flush_job.cc b/db/flush_job.cc index 8193f594f..a3ffc2707 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -141,11 +141,12 @@ FlushJob::FlushJob( FlushJob::~FlushJob() { ThreadStatusUtil::ResetThreadStatus(); } void FlushJob::ReportStartedFlush() { - ThreadStatusUtil::SetColumnFamily(cfd_, cfd_->ioptions()->env, - db_options_.enable_thread_tracking); + ThreadStatusUtil::SetEnableTracking(db_options_.enable_thread_tracking); + ThreadStatusUtil::SetColumnFamily(cfd_); ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_FLUSH); ThreadStatusUtil::SetThreadOperationProperty(ThreadStatus::COMPACTION_JOB_ID, job_context_->job_id); + IOSTATS_RESET(bytes_written); } @@ -379,6 +380,7 @@ Status FlushJob::MemPurge() { // Create two iterators, one for the memtable data (contains // info from puts + deletes), and one for the memtable // Range Tombstones (from DeleteRanges). + // TODO: plumb Env::IOActivity ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -669,6 +671,7 @@ bool FlushJob::MemPurgeDecider(double threshold) { // Cochran formula for determining sample size. 
// 95% confidence interval, 7% precision. // n0 = (1.96*1.96)*0.25/(0.07*0.07) = 196.0 + // TODO: plumb Env::IOActivity double n0 = 196.0; ReadOptions ro; ro.total_order_seek = true; @@ -841,6 +844,7 @@ Status FlushJob::WriteLevel0Table() { range_del_iters; ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = Env::IOActivity::kFlush; Arena arena; uint64_t total_num_entries = 0, total_num_deletes = 0; uint64_t total_data_size = 0; @@ -930,17 +934,19 @@ Status FlushJob::WriteLevel0Table() { meta_.fd.GetNumber()); const SequenceNumber job_snapshot_seq = job_context_->GetJobSnapshotSequence(); - s = BuildTable( - dbname_, versions_, db_options_, tboptions, file_options_, - cfd_->table_cache(), iter.get(), std::move(range_del_iters), &meta_, - &blob_file_additions, existing_snapshots_, - earliest_write_conflict_snapshot_, job_snapshot_seq, - snapshot_checker_, mutable_cf_options_.paranoid_file_checks, - cfd_->internal_stats(), &io_s, io_tracer_, - BlobFileCreationReason::kFlush, seqno_to_time_mapping_, event_logger_, - job_context_->job_id, io_priority, &table_properties_, write_hint, - full_history_ts_low, blob_callback_, base_, &num_input_entries, - &memtable_payload_bytes, &memtable_garbage_bytes); + const ReadOptions read_options(Env::IOActivity::kFlush); + s = BuildTable(dbname_, versions_, db_options_, tboptions, file_options_, + read_options, cfd_->table_cache(), iter.get(), + std::move(range_del_iters), &meta_, &blob_file_additions, + existing_snapshots_, earliest_write_conflict_snapshot_, + job_snapshot_seq, snapshot_checker_, + mutable_cf_options_.paranoid_file_checks, + cfd_->internal_stats(), &io_s, io_tracer_, + BlobFileCreationReason::kFlush, seqno_to_time_mapping_, + event_logger_, job_context_->job_id, io_priority, + &table_properties_, write_hint, full_history_ts_low, + blob_callback_, base_, &num_input_entries, + &memtable_payload_bytes, &memtable_garbage_bytes); // TODO: Cleanup io_status in BuildTable and table builders assert(!s.ok() || 
io_s.ok()); io_s.PermitUncheckedError(); diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index e637cb01d..12d2519e9 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -275,6 +275,7 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( // in file_meta. if (file_meta.smallest.empty()) { assert(file_meta.largest.empty()); + // TODO: plumb Env::IOActivity ReadOptions ro; std::unique_ptr iter(table_reader->NewIterator( ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr, @@ -350,4 +351,3 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( return status; } } // namespace ROCKSDB_NAMESPACE - diff --git a/db/internal_stats.cc b/db/internal_stats.cc index 5b76a7883..c75668c0d 100644 --- a/db/internal_stats.cc +++ b/db/internal_stats.cc @@ -1148,7 +1148,9 @@ bool InternalStats::HandleSsTables(std::string* value, Slice /*suffix*/) { bool InternalStats::HandleAggregatedTableProperties(std::string* value, Slice /*suffix*/) { std::shared_ptr tp; - auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); if (!s.ok()) { return false; } @@ -1168,7 +1170,9 @@ static std::map MapUint64ValuesToString( bool InternalStats::HandleAggregatedTablePropertiesMap( std::map* values, Slice /*suffix*/) { std::shared_ptr tp; - auto s = cfd_->current()->GetAggregatedTableProperties(&tp); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + auto s = cfd_->current()->GetAggregatedTableProperties(read_options, &tp); if (!s.ok()) { return false; } @@ -1184,8 +1188,10 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevel(std::string* values, return false; } std::shared_ptr tp; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties( - &tp, static_cast(level)); + read_options, &tp, 
static_cast(level)); if (!s.ok()) { return false; } @@ -1201,8 +1207,10 @@ bool InternalStats::HandleAggregatedTablePropertiesAtLevelMap( return false; } std::shared_ptr tp; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto s = cfd_->current()->GetAggregatedTableProperties( - &tp, static_cast(level)); + read_options, &tp, static_cast(level)); if (!s.ok()) { return false; } @@ -1397,7 +1405,11 @@ bool InternalStats::HandleEstimatePendingCompactionBytes(uint64_t* value, bool InternalStats::HandleEstimateTableReadersMem(uint64_t* value, DBImpl* /*db*/, Version* version) { - *value = (version == nullptr) ? 0 : version->GetMemoryUsageByTableReaders(); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + *value = (version == nullptr) + ? 0 + : version->GetMemoryUsageByTableReaders(read_options); return true; } @@ -1448,9 +1460,10 @@ bool InternalStats::HandleEstimateOldestKeyTime(uint64_t* value, DBImpl* /*db*/, ->compaction_options_fifo.allow_compaction) { return false; } - + // TODO: plumb Env::IOActivity + const ReadOptions read_options; TablePropertiesCollection collection; - auto s = cfd_->current()->GetPropertiesOfAllTables(&collection); + auto s = cfd_->current()->GetPropertiesOfAllTables(read_options, &collection); if (!s.ok()) { return false; } diff --git a/db/memtable.cc b/db/memtable.cc index b99e1d345..e61ddc9db 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -599,6 +599,7 @@ void MemTable::ConstructFragmentedRangeTombstones() { assert(!IsFragmentedRangeTombstonesConstructed(false)); // There should be no concurrent Construction if (!is_range_del_table_empty_.load(std::memory_order_relaxed)) { + // TODO: plumb Env::IOActivity auto* unfragmented_iter = new MemTableIterator(*this, ReadOptions(), nullptr /* arena */, true /* use_range_del_table */); diff --git a/db/memtable_list.cc b/db/memtable_list.cc index ebcdf9b8e..ee1563f01 100644 --- a/db/memtable_list.cc +++ b/db/memtable_list.cc @@ -467,6 +467,8 @@ Status 
MemTableList::TryInstallMemtableFlushResults( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); mu->AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kFlush); + // Flush was successful // Record the status on the memtable object. Either this call or a call by a // concurrent flush thread will read the status and write it to manifest. @@ -578,8 +580,8 @@ Status MemTableList::TryInstallMemtableFlushResults( }; if (write_edits) { // this can release and reacquire the mutex. - s = vset->LogAndApply(cfd, mutable_cf_options, edit_list, mu, - db_directory, /*new_descriptor_log=*/false, + s = vset->LogAndApply(cfd, mutable_cf_options, read_options, edit_list, + mu, db_directory, /*new_descriptor_log=*/false, /*column_family_options=*/nullptr, manifest_write_cb); } else { @@ -798,6 +800,8 @@ Status InstallMemtableAtomicFlushResults( ThreadStatus::STAGE_MEMTABLE_INSTALL_FLUSH_RESULTS); mu->AssertHeld(); + const ReadOptions read_options(Env::IOActivity::kFlush); + size_t num = mems_list.size(); assert(cfds.size() == num); if (imm_lists != nullptr) { @@ -875,8 +879,8 @@ Status InstallMemtableAtomicFlushResults( } // this can release and reacquire the mutex. - s = vset->LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - db_directory); + s = vset->LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, db_directory); for (size_t k = 0; k != cfds.size(); ++k) { auto* imm = (imm_lists == nullptr) ? 
cfds[k]->imm() : imm_lists->at(k); diff --git a/db/perf_context_test.cc b/db/perf_context_test.cc index 3e78dbe27..bb8691b96 100644 --- a/db/perf_context_test.cc +++ b/db/perf_context_test.cc @@ -187,7 +187,8 @@ TEST_F(PerfContextTest, StopWatchOverhead) { uint64_t elapsed = 0; std::vector timings(kTotalIterations); - StopWatch timer(SystemClock::Default().get(), nullptr, 0, &elapsed); + StopWatch timer(SystemClock::Default().get(), nullptr, 0, + Histograms::HISTOGRAM_ENUM_MAX, &elapsed); for (auto& timing : timings) { timing = elapsed; } diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 737ad4ed2..cc28b9f19 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -329,21 +329,23 @@ class TestPlainTableFactory : public PlainTableFactory { std::unique_ptr* table, bool /*prefetch_index_and_filter_in_cache*/) const override { std::unique_ptr props; + const ReadOptions read_options; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, &props); + table_reader_options.ioptions, read_options, + &props); EXPECT_TRUE(s.ok()); if (store_index_in_file_) { BlockHandle bloom_block_handle; s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, + table_reader_options.ioptions, read_options, BloomBlockBuilder::kBloomBlock, &bloom_block_handle); EXPECT_TRUE(s.ok()); BlockHandle index_block_handle; s = FindMetaBlockInFile(file.get(), file_size, kPlainTableMagicNumber, - table_reader_options.ioptions, + table_reader_options.ioptions, read_options, PlainTableIndexBuilder::kPlainTableIndexBlock, &index_block_handle); EXPECT_TRUE(s.ok()); @@ -1344,4 +1346,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - diff --git a/db/repair.cc b/db/repair.cc index 0b3e120c9..b4b9d0c5f 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -145,6 +145,8 @@ class Repairer { // Adds a column family to the 
VersionSet with cf_options_ and updates // manifest. Status AddColumnFamily(const std::string& cf_name, uint32_t cf_id) { + // TODO: plumb Env::IOActivity; + const ReadOptions read_options; const auto* cf_opts = GetColumnFamilyOptions(cf_name); if (cf_opts == nullptr) { return Status::Corruption("Encountered unknown column family with name=" + @@ -166,8 +168,9 @@ class Repairer { Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, nullptr); if (status.ok()) { - status = vset_.LogAndApply(cfd, mut_cf_opts, &edit, &mutex_, db_dir.get(), - false /* new_descriptor_log */, cf_opts); + status = vset_.LogAndApply(cfd, mut_cf_opts, read_options, &edit, &mutex_, + db_dir.get(), false /* new_descriptor_log */, + cf_opts); } mutex_.Unlock(); return status; @@ -357,6 +360,9 @@ class Repairer { } }; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + // Open the log file std::string logname = LogFileName(wal_dir, log); const auto& fs = env_->GetFileSystem(); @@ -422,6 +428,7 @@ class Repairer { FileMetaData meta; meta.fd = FileDescriptor(next_file_number_++, 0, 0); + // TODO: plumb Env::IOActivity ReadOptions ro; ro.total_order_seek = true; Arena arena; @@ -456,7 +463,7 @@ class Repairer { SeqnoToTimeMapping empty_seqno_time_mapping; status = BuildTable( dbname_, /* versions */ nullptr, immutable_db_options_, tboptions, - file_options_, table_cache_.get(), iter.get(), + file_options_, read_options, table_cache_.get(), iter.get(), std::move(range_del_iters), &meta, nullptr /* blob_file_additions */, {}, kMaxSequenceNumber, kMaxSequenceNumber, snapshot_checker, false /* paranoid_file_checks*/, nullptr /* internal_stats */, &io_s, @@ -509,8 +516,10 @@ class Repairer { file_size); std::shared_ptr props; if (status.ok()) { - status = table_cache_->GetTableProperties(file_options_, icmp_, t->meta, - &props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status = table_cache_->GetTableProperties(file_options_, 
read_options, + icmp_, t->meta, &props); } if (status.ok()) { auto s = @@ -556,6 +565,7 @@ class Repairer { } } if (status.ok()) { + // TODO: plumb Env::IOActivity ReadOptions ropts; ropts.total_order_seek = true; InternalIterator* iter = table_cache_->NewIterator( @@ -603,6 +613,7 @@ class Repairer { // an SST file is a full sorted run. This probably needs the extra logic // from compaction_job.cc around call to UpdateBoundariesForRange (to // handle range tombstones extendingg beyond range of other entries). + // TODO: plumb Env::IOActivity ReadOptions ropts; std::unique_ptr r_iter; status = table_cache_->GetRangeTombstoneIterator( @@ -625,6 +636,8 @@ class Repairer { } Status AddTables() { + // TODO: plumb Env::IOActivity; + const ReadOptions read_options; std::unordered_map> cf_id_to_tables; SequenceNumber max_sequence = 0; for (size_t i = 0; i < tables_.size(); i++) { @@ -706,8 +719,8 @@ class Repairer { s = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, - &mutex_, db_dir.get(), + s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), + read_options, &edit, &mutex_, db_dir.get(), false /* new_descriptor_log */); } mutex_.Unlock(); @@ -809,4 +822,3 @@ Status RepairDB(const std::string& dbname, const Options& options) { } } // namespace ROCKSDB_NAMESPACE - diff --git a/db/table_cache.cc b/db/table_cache.cc index f456260bc..28206ed35 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -560,7 +560,7 @@ Status TableCache::MultiGetFilter( } Status TableCache::GetTableProperties( - const FileOptions& file_options, + const FileOptions& file_options, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, @@ -574,7 +574,7 @@ Status TableCache::GetTableProperties( } TypedHandle* table_handle = nullptr; - Status s = FindTable(ReadOptions(), file_options, 
internal_comparator, + Status s = FindTable(read_options, file_options, internal_comparator, file_meta, &table_handle, prefix_extractor, no_io); if (!s.ok()) { return s; @@ -608,7 +608,7 @@ Status TableCache::ApproximateKeyAnchors( } size_t TableCache::GetMemoryUsageByTableReader( - const FileOptions& file_options, + const FileOptions& file_options, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor) { @@ -619,7 +619,7 @@ size_t TableCache::GetMemoryUsageByTableReader( } TypedHandle* table_handle = nullptr; - Status s = FindTable(ReadOptions(), file_options, internal_comparator, + Status s = FindTable(read_options, file_options, internal_comparator, file_meta, &table_handle, prefix_extractor, true); if (!s.ok()) { return 0; @@ -636,7 +636,8 @@ void TableCache::Evict(Cache* cache, uint64_t file_number) { } uint64_t TableCache::ApproximateOffsetOf( - const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, + const ReadOptions& read_options, const Slice& key, + const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, const std::shared_ptr& prefix_extractor) { uint64_t result = 0; @@ -645,7 +646,7 @@ uint64_t TableCache::ApproximateOffsetOf( if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); Status s = - FindTable(ReadOptions(), file_options_, internal_comparator, file_meta, + FindTable(read_options, file_options_, internal_comparator, file_meta, &table_handle, prefix_extractor, false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { @@ -654,7 +655,7 @@ uint64_t TableCache::ApproximateOffsetOf( } if (table_reader != nullptr) { - result = table_reader->ApproximateOffsetOf(key, caller); + result = table_reader->ApproximateOffsetOf(read_options, key, caller); } if (table_handle != nullptr) { cache_.Release(table_handle); @@ -664,8 
+665,9 @@ uint64_t TableCache::ApproximateOffsetOf( } uint64_t TableCache::ApproximateSize( - const Slice& start, const Slice& end, const FileMetaData& file_meta, - TableReaderCaller caller, const InternalKeyComparator& internal_comparator, + const ReadOptions& read_options, const Slice& start, const Slice& end, + const FileMetaData& file_meta, TableReaderCaller caller, + const InternalKeyComparator& internal_comparator, const std::shared_ptr& prefix_extractor) { uint64_t result = 0; TableReader* table_reader = file_meta.fd.table_reader; @@ -673,7 +675,7 @@ uint64_t TableCache::ApproximateSize( if (table_reader == nullptr) { const bool for_compaction = (caller == TableReaderCaller::kCompaction); Status s = - FindTable(ReadOptions(), file_options_, internal_comparator, file_meta, + FindTable(read_options, file_options_, internal_comparator, file_meta, &table_handle, prefix_extractor, false /* no_io */, !for_compaction /* record_read_stats */); if (s.ok()) { @@ -682,7 +684,7 @@ uint64_t TableCache::ApproximateSize( } if (table_reader != nullptr) { - result = table_reader->ApproximateSize(start, end, caller); + result = table_reader->ApproximateSize(read_options, start, end, caller); } if (table_handle != nullptr) { cache_.Release(table_handle); diff --git a/db/table_cache.h b/db/table_cache.h index 66282bf41..609e67498 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -179,7 +179,7 @@ class TableCache { // return Status::Incomplete() if table is not present in cache and // we set `no_io` to be true. Status GetTableProperties( - const FileOptions& toptions, + const FileOptions& toptions, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, std::shared_ptr* properties, @@ -194,22 +194,23 @@ class TableCache { // Return total memory usage of the table reader of the file. // 0 if table reader of the file is not loaded. 
size_t GetMemoryUsageByTableReader( - const FileOptions& toptions, + const FileOptions& toptions, const ReadOptions& read_options, const InternalKeyComparator& internal_comparator, const FileMetaData& file_meta, const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated offset of a key in a file represented by fd. uint64_t ApproximateOffsetOf( - const Slice& key, const FileMetaData& file_meta, TableReaderCaller caller, + const ReadOptions& read_options, const Slice& key, + const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, const std::shared_ptr& prefix_extractor = nullptr); // Returns approximated data size between start and end keys in a file // represented by fd (the start key must not be greater than the end key). uint64_t ApproximateSize( - const Slice& start, const Slice& end, const FileMetaData& file_meta, - TableReaderCaller caller, + const ReadOptions& read_options, const Slice& start, const Slice& end, + const FileMetaData& file_meta, TableReaderCaller caller, const InternalKeyComparator& internal_comparator, const std::shared_ptr& prefix_extractor = nullptr); diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 20f37e0c9..437b7e309 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -292,8 +292,9 @@ void TestCustomizedTablePropertiesCollector( new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; + const ReadOptions read_options; Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), - magic_number, ioptions, &props); + magic_number, ioptions, read_options, &props); ASSERT_OK(s); auto user_collected = props->user_collected_properties; @@ -429,8 +430,10 @@ void TestInternalKeyPropertiesCollector( new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; - Status s = ReadTableProperties(reader.get(), fwf->contents().size(), - 
magic_number, ioptions, &props); + const ReadOptions read_options; + Status s = + ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, + ioptions, read_options, &props); ASSERT_OK(s); auto user_collected = props->user_collected_properties; diff --git a/db/version_builder.cc b/db/version_builder.cc index 4f0e3a841..64590db5c 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -1257,7 +1257,7 @@ class VersionBuilder::Rep { InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin) { + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) { assert(table_cache_ != nullptr); size_t table_cache_capacity = @@ -1324,7 +1324,7 @@ class VersionBuilder::Rep { int level = files_meta[file_idx].second; TableCache::TypedHandle* handle = nullptr; statuses[file_idx] = table_cache_->FindTable( - ReadOptions(), file_options_, + read_options, file_options_, *(base_vstorage_->InternalComparator()), *file_meta, &handle, prefix_extractor, false /*no_io */, true /* record_read_stats */, internal_stats->GetFileReadHist(level), false, level, @@ -1384,10 +1384,11 @@ Status VersionBuilder::LoadTableHandlers( InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin) { - return rep_->LoadTableHandlers( - internal_stats, max_threads, prefetch_index_and_filter_in_cache, - is_initial_load, prefix_extractor, max_file_size_for_l0_meta_pin); + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options) { + return rep_->LoadTableHandlers(internal_stats, max_threads, + prefetch_index_and_filter_in_cache, + is_initial_load, prefix_extractor, + max_file_size_for_l0_meta_pin, read_options); } uint64_t VersionBuilder::GetMinOldestBlobFileNumber() const { diff --git 
a/db/version_builder.h b/db/version_builder.h index 682d60524..8e7dd9e66 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -48,7 +48,7 @@ class VersionBuilder { InternalStats* internal_stats, int max_threads, bool prefetch_index_and_filter_in_cache, bool is_initial_load, const std::shared_ptr& prefix_extractor, - size_t max_file_size_for_l0_meta_pin); + size_t max_file_size_for_l0_meta_pin, const ReadOptions& read_options); uint64_t GetMinOldestBlobFileNumber() const; private: diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index f7a148968..7ea176e01 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -155,8 +155,9 @@ VersionEditHandler::VersionEditHandler( bool read_only, std::vector column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, const std::shared_ptr& io_tracer, - bool skip_load_table_files, EpochNumberRequirement epoch_number_requirement) - : VersionEditHandlerBase(), + const ReadOptions& read_options, bool skip_load_table_files, + EpochNumberRequirement epoch_number_requirement) + : VersionEditHandlerBase(read_options), read_only_(read_only), column_families_(std::move(column_families)), version_set_(version_set), @@ -480,7 +481,8 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, ColumnFamilyData* VersionEditHandler::CreateCfAndInit( const ColumnFamilyOptions& cf_options, const VersionEdit& edit) { - ColumnFamilyData* cfd = version_set_->CreateColumnFamily(cf_options, &edit); + ColumnFamilyData* cfd = + version_set_->CreateColumnFamily(cf_options, read_options_, &edit); assert(cfd != nullptr); cfd->set_initialized(); assert(builders_.find(edit.column_family_) == builders_.end()); @@ -537,7 +539,7 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, if (s.ok()) { // Install new version v->PrepareAppend( - *cfd->GetLatestMutableCFOptions(), + *cfd->GetLatestMutableCFOptions(), read_options_, 
!(version_set_->db_options_->skip_stats_update_on_db_open)); version_set_->AppendVersion(cfd, v); } else { @@ -569,7 +571,8 @@ Status VersionEditHandler::LoadTables(ColumnFamilyData* cfd, version_set_->db_options_->max_file_opening_threads, prefetch_index_and_filter_in_cache, is_initial_load, cfd->GetLatestMutableCFOptions()->prefix_extractor, - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()), + read_options_); if ((s.IsPathNotFound() || s.IsCorruption()) && no_error_if_files_missing_) { s = Status::OK(); } @@ -647,11 +650,12 @@ Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement) : VersionEditHandler(read_only, column_families, version_set, /*track_missing_files=*/true, /*no_error_if_files_missing=*/true, io_tracer, - epoch_number_requirement) {} + read_options, epoch_number_requirement) {} VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { for (const auto& elem : versions_) { @@ -816,7 +820,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, false, true, cfd->GetLatestMutableCFOptions()->prefix_extractor, - MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions())); + MaxFileSizeForL0MetaPin(*cfd->GetLatestMutableCFOptions()), + read_options_); if (!s.ok()) { delete version; if (s.IsCorruption()) { @@ -827,7 +832,7 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( s = builder->SaveTo(version->storage_info()); if (s.ok()) { version->PrepareAppend( - *cfd->GetLatestMutableCFOptions(), + *cfd->GetLatestMutableCFOptions(), read_options_, !version_set_->db_options_->skip_stats_update_on_db_open); auto v_iter = 
versions_.find(cfd->GetID()); if (v_iter != versions_.end()) { @@ -847,7 +852,8 @@ Status VersionEditHandlerPointInTime::VerifyFile(ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& fmeta) { - return version_set_->VerifyFileMetadata(cfd, fpath, level, fmeta); + return version_set_->VerifyFileMetadata(read_options_, cfd, fpath, level, + fmeta); } Status VersionEditHandlerPointInTime::VerifyBlobFile( @@ -856,7 +862,9 @@ Status VersionEditHandlerPointInTime::VerifyBlobFile( BlobSource* blob_source = cfd->blob_source(); assert(blob_source); CacheHandleGuard blob_file_reader; - Status s = blob_source->GetBlobFileReader(blob_file_num, &blob_file_reader); + + Status s = blob_source->GetBlobFileReader(read_options_, blob_file_num, + &blob_file_reader); if (!s.ok()) { return s; } diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index fc3fe7c6b..4b9f19542 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -19,8 +19,9 @@ struct FileMetaData; class VersionEditHandlerBase { public: - explicit VersionEditHandlerBase() - : max_manifest_read_size_(std::numeric_limits::max()) {} + explicit VersionEditHandlerBase(const ReadOptions& read_options) + : read_options_(read_options), + max_manifest_read_size_(std::numeric_limits::max()) {} virtual ~VersionEditHandlerBase() {} @@ -31,8 +32,9 @@ class VersionEditHandlerBase { AtomicGroupReadBuffer& GetReadBuffer() { return read_buffer_; } protected: - explicit VersionEditHandlerBase(uint64_t max_read_size) - : max_manifest_read_size_(max_read_size) {} + explicit VersionEditHandlerBase(const ReadOptions& read_options, + uint64_t max_read_size) + : read_options_(read_options), max_manifest_read_size_(max_read_size) {} virtual Status Initialize() { return Status::OK(); } virtual Status ApplyVersionEdit(VersionEdit& edit, @@ -45,6 +47,8 @@ class VersionEditHandlerBase { Status status_; + const ReadOptions& read_options_; + private: AtomicGroupReadBuffer read_buffer_; const 
uint64_t max_manifest_read_size_; @@ -52,7 +56,8 @@ class VersionEditHandlerBase { class ListColumnFamiliesHandler : public VersionEditHandlerBase { public: - ListColumnFamiliesHandler() : VersionEditHandlerBase() {} + explicit ListColumnFamiliesHandler(const ReadOptions& read_options) + : VersionEditHandlerBase(read_options) {} ~ListColumnFamiliesHandler() override {} @@ -72,9 +77,9 @@ class ListColumnFamiliesHandler : public VersionEditHandlerBase { class FileChecksumRetriever : public VersionEditHandlerBase { public: - FileChecksumRetriever(uint64_t max_read_size, + FileChecksumRetriever(const ReadOptions& read_options, uint64_t max_read_size, FileChecksumList& file_checksum_list) - : VersionEditHandlerBase(max_read_size), + : VersionEditHandlerBase(read_options, max_read_size), file_checksum_list_(file_checksum_list) {} ~FileChecksumRetriever() override {} @@ -111,12 +116,13 @@ class VersionEditHandler : public VersionEditHandlerBase { VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent) - : VersionEditHandler(read_only, column_families, version_set, - track_missing_files, no_error_if_files_missing, - io_tracer, /*skip_load_table_files=*/false, - epoch_number_requirement) {} + : VersionEditHandler( + read_only, column_families, version_set, track_missing_files, + no_error_if_files_missing, io_tracer, read_options, + /*skip_load_table_files=*/false, epoch_number_requirement) {} ~VersionEditHandler() override {} @@ -137,7 +143,8 @@ class VersionEditHandler : public VersionEditHandlerBase { bool read_only, std::vector column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, - const std::shared_ptr& io_tracer, bool skip_load_table_files, + const std::shared_ptr& io_tracer, + const ReadOptions& read_options, bool skip_load_table_files, 
EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent); @@ -212,6 +219,7 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { VersionEditHandlerPointInTime( bool read_only, std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent); ~VersionEditHandlerPointInTime() override; @@ -238,10 +246,11 @@ class ManifestTailer : public VersionEditHandlerPointInTime { explicit ManifestTailer(std::vector column_families, VersionSet* version_set, const std::shared_ptr& io_tracer, + const ReadOptions& read_options, EpochNumberRequirement epoch_number_requirement = EpochNumberRequirement::kMustPresent) : VersionEditHandlerPointInTime(/*read_only=*/false, column_families, - version_set, io_tracer, + version_set, io_tracer, read_options, epoch_number_requirement), mode_(Mode::kRecovery) {} @@ -281,12 +290,13 @@ class DumpManifestHandler : public VersionEditHandler { public: DumpManifestHandler(std::vector column_families, VersionSet* version_set, - const std::shared_ptr& io_tracer, bool verbose, - bool hex, bool json) + const std::shared_ptr& io_tracer, + const ReadOptions& read_options, bool verbose, bool hex, + bool json) : VersionEditHandler( /*read_only=*/true, column_families, version_set, /*track_missing_files=*/false, - /*no_error_if_files_missing=*/false, io_tracer, + /*no_error_if_files_missing=*/false, io_tracer, read_options, /*skip_load_table_files=*/true), verbose_(verbose), hex_(hex), diff --git a/db/version_set.cc b/db/version_set.cc index 125f03c9b..9f1888c78 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1527,13 +1527,14 @@ void LevelIterator::InitFileIterator(size_t new_file_index) { } } // anonymous namespace -Status Version::GetTableProperties(std::shared_ptr* tp, +Status Version::GetTableProperties(const ReadOptions& read_options, + std::shared_ptr* 
tp, const FileMetaData* file_meta, const std::string* fname) const { auto table_cache = cfd_->table_cache(); auto ioptions = cfd_->ioptions(); Status s = table_cache->GetTableProperties( - file_options_, cfd_->internal_comparator(), *file_meta, tp, + file_options_, read_options, cfd_->internal_comparator(), *file_meta, tp, mutable_cf_options_.prefix_extractor, true /* no io */); if (s.ok()) { return s; @@ -1565,14 +1566,16 @@ Status Version::GetTableProperties(std::shared_ptr* tp, // the magic number check in the footer. std::unique_ptr file_reader( new RandomAccessFileReader( - std::move(file), file_name, nullptr /* env */, io_tracer_, - nullptr /* stats */, 0 /* hist_type */, nullptr /* file_read_hist */, - nullptr /* rate_limiter */, ioptions->listeners)); + std::move(file), file_name, ioptions->clock /* clock */, io_tracer_, + ioptions->stats /* stats */, + Histograms::SST_READ_MICROS /* hist_type */, + nullptr /* file_read_hist */, nullptr /* rate_limiter */, + ioptions->listeners)); std::unique_ptr props; s = ReadTableProperties( file_reader.get(), file_meta->fd.GetFileSize(), Footer::kNullTableMagicNumber /* table's magic number */, *ioptions, - &props); + read_options, &props); if (!s.ok()) { return s; } @@ -1581,10 +1584,11 @@ Status Version::GetTableProperties(std::shared_ptr* tp, return s; } -Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { +Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props) { Status s; for (int level = 0; level < storage_info_.num_levels_; level++) { - s = GetPropertiesOfAllTables(props, level); + s = GetPropertiesOfAllTables(read_options, props, level); if (!s.ok()) { return s; } @@ -1602,6 +1606,8 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, std::stringstream ss; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; for (int level = 0; level < storage_info_.num_levels_; level++) { for (const auto& file_meta : 
storage_info_.files_[level]) { auto fname = @@ -1614,7 +1620,7 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, std::unique_ptr tombstone_iter; Status s = table_cache->GetRangeTombstoneIterator( - ReadOptions(), cfd_->internal_comparator(), *file_meta, + read_options, cfd_->internal_comparator(), *file_meta, &tombstone_iter); if (!s.ok()) { return s; @@ -1648,7 +1654,8 @@ Status Version::TablesRangeTombstoneSummary(int max_entries_to_print, return Status::OK(); } -Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, +Status Version::GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props, int level) { for (const auto& file_meta : storage_info_.files_[level]) { auto fname = @@ -1657,7 +1664,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, // 1. If the table is already present in table cache, load table // properties from there. std::shared_ptr table_properties; - Status s = GetTableProperties(&table_properties, file_meta, &fname); + Status s = + GetTableProperties(read_options, &table_properties, file_meta, &fname); if (s.ok()) { props->insert({fname, table_properties}); } else { @@ -1669,7 +1677,8 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props, } Status Version::GetPropertiesOfTablesInRange( - const Range* range, std::size_t n, TablePropertiesCollection* props) const { + const ReadOptions& read_options, const Range* range, std::size_t n, + TablePropertiesCollection* props) const { for (int level = 0; level < storage_info_.num_non_empty_levels(); level++) { for (decltype(n) i = 0; i < n; i++) { // Convert user_key into a corresponding internal key. @@ -1686,7 +1695,8 @@ Status Version::GetPropertiesOfTablesInRange( // 1. If the table is already present in table cache, load table // properties from there. 
std::shared_ptr table_properties; - Status s = GetTableProperties(&table_properties, file_meta, &fname); + Status s = GetTableProperties(read_options, &table_properties, + file_meta, &fname); if (s.ok()) { props->insert({fname, table_properties}); } else { @@ -1701,13 +1711,14 @@ Status Version::GetPropertiesOfTablesInRange( } Status Version::GetAggregatedTableProperties( - std::shared_ptr* tp, int level) { + const ReadOptions& read_options, std::shared_ptr* tp, + int level) { TablePropertiesCollection props; Status s; if (level < 0) { - s = GetPropertiesOfAllTables(&props); + s = GetPropertiesOfAllTables(read_options, &props); } else { - s = GetPropertiesOfAllTables(&props, level); + s = GetPropertiesOfAllTables(read_options, &props, level); } if (!s.ok()) { return s; @@ -1721,12 +1732,12 @@ Status Version::GetAggregatedTableProperties( return Status::OK(); } -size_t Version::GetMemoryUsageByTableReaders() { +size_t Version::GetMemoryUsageByTableReaders(const ReadOptions& read_options) { size_t total_usage = 0; for (auto& file_level : storage_info_.level_files_brief_) { for (size_t i = 0; i < file_level.num_files; i++) { total_usage += cfd_->table_cache()->GetMemoryUsageByTableReader( - file_options_, cfd_->internal_comparator(), + file_options_, read_options, cfd_->internal_comparator(), *file_level.files[i].file_metadata, mutable_cf_options_.prefix_extractor); } @@ -2984,24 +2995,26 @@ void VersionStorageInfo::PrepareForVersionAppend( } void Version::PrepareAppend(const MutableCFOptions& mutable_cf_options, + const ReadOptions& read_options, bool update_stats) { TEST_SYNC_POINT_CALLBACK( "Version::PrepareAppend:forced_check", reinterpret_cast(&storage_info_.force_consistency_checks_)); if (update_stats) { - UpdateAccumulatedStats(); + UpdateAccumulatedStats(read_options); } storage_info_.PrepareForVersionAppend(*cfd_->ioptions(), mutable_cf_options); } -bool Version::MaybeInitializeFileMetaData(FileMetaData* file_meta) { +bool 
Version::MaybeInitializeFileMetaData(const ReadOptions& read_options, + FileMetaData* file_meta) { if (file_meta->init_stats_from_file || file_meta->compensated_file_size > 0) { return false; } std::shared_ptr tp; - Status s = GetTableProperties(&tp, file_meta); + Status s = GetTableProperties(read_options, &tp, file_meta); file_meta->init_stats_from_file = true; if (!s.ok()) { ROCKS_LOG_ERROR(vset_->db_options_->info_log, @@ -3046,7 +3059,7 @@ void VersionStorageInfo::RemoveCurrentStats(FileMetaData* file_meta) { } } -void Version::UpdateAccumulatedStats() { +void Version::UpdateAccumulatedStats(const ReadOptions& read_options) { // maximum number of table properties loaded from files. const int kMaxInitCount = 20; int init_count = 0; @@ -3064,7 +3077,7 @@ void Version::UpdateAccumulatedStats() { level < storage_info_.num_levels_ && init_count < kMaxInitCount; ++level) { for (auto* file_meta : storage_info_.files_[level]) { - if (MaybeInitializeFileMetaData(file_meta)) { + if (MaybeInitializeFileMetaData(read_options, file_meta)) { // each FileMeta will be initialized only once. 
storage_info_.UpdateAccumulatedStats(file_meta); // when option "max_open_files" is -1, all the file metadata has @@ -3089,7 +3102,8 @@ void Version::UpdateAccumulatedStats() { storage_info_.accumulated_raw_value_size_ == 0 && level >= 0; --level) { for (int i = static_cast(storage_info_.files_[level].size()) - 1; storage_info_.accumulated_raw_value_size_ == 0 && i >= 0; --i) { - if (MaybeInitializeFileMetaData(storage_info_.files_[level][i])) { + if (MaybeInitializeFileMetaData(read_options, + storage_info_.files_[level][i])) { storage_info_.UpdateAccumulatedStats(storage_info_.files_[level][i]); } } @@ -4971,7 +4985,8 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, Status VersionSet::ProcessManifestWrites( std::deque& writers, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options) { + const ColumnFamilyOptions* new_cf_options, + const ReadOptions& read_options) { mu->AssertHeld(); assert(!writers.empty()); ManifestWriter& first_writer = writers.front(); @@ -5202,7 +5217,7 @@ Status VersionSet::ProcessManifestWrites( true /* prefetch_index_and_filter_in_cache */, false /* is_initial_load */, mutable_cf_options_ptrs[i]->prefix_extractor, - MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i])); + MaxFileSizeForL0MetaPin(*mutable_cf_options_ptrs[i]), read_options); if (!s.ok()) { if (db_options_->paranoid_checks) { break; @@ -5247,7 +5262,8 @@ Status VersionSet::ProcessManifestWrites( constexpr bool update_stats = true; for (int i = 0; i < static_cast(versions.size()); ++i) { - versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], update_stats); + versions[i]->PrepareAppend(*mutable_cf_options_ptrs[i], read_options, + update_stats); } } @@ -5359,7 +5375,8 @@ Status VersionSet::ProcessManifestWrites( assert(batch_edits.size() == 1); assert(new_cf_options != nullptr); assert(max_last_sequence == descriptor_last_sequence_); - CreateColumnFamily(*new_cf_options, 
first_writer.edit_list.front()); + CreateColumnFamily(*new_cf_options, read_options, + first_writer.edit_list.front()); } else if (first_writer.edit_list.front()->is_column_family_drop_) { assert(batch_edits.size() == 1); assert(max_last_sequence == descriptor_last_sequence_); @@ -5528,6 +5545,7 @@ void VersionSet::WakeUpWaitingManifestWriters() { Status VersionSet::LogAndApply( const autovector& column_family_datas, const autovector& mutable_cf_options_list, + const ReadOptions& read_options, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, const ColumnFamilyOptions* new_cf_options, @@ -5605,7 +5623,8 @@ Status VersionSet::LogAndApply( return Status::ColumnFamilyDropped(); } return ProcessManifestWrites(writers, mu, dir_contains_current_file, - new_descriptor_log, new_cf_options); + new_descriptor_log, new_cf_options, + read_options); } void VersionSet::LogAndApplyCFHelper(VersionEdit* edit, @@ -5689,6 +5708,7 @@ Status VersionSet::GetCurrentManifestPath(const std::string& dbname, Status VersionSet::Recover( const std::vector& column_families, bool read_only, std::string* db_id, bool no_error_if_files_missing) { + const ReadOptions read_options(Env::IOActivity::kDBOpen); // Read "CURRENT" file, which contains a pointer to the current manifest // file std::string manifest_path; @@ -5725,7 +5745,7 @@ Status VersionSet::Recover( VersionEditHandler handler( read_only, column_families, const_cast(this), /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_, - EpochNumberRequirement::kMightMissing); + read_options, EpochNumberRequirement::kMightMissing); handler.Iterate(reader, &log_read_status); s = handler.status(); if (s.ok()) { @@ -5873,6 +5893,7 @@ Status VersionSet::TryRecoverFromOneManifest( const std::string& manifest_path, const std::vector& column_families, bool read_only, std::string* db_id, bool* has_missing_table_file) { + const ReadOptions 
read_options(Env::IOActivity::kDBOpen); ROCKS_LOG_INFO(db_options_->info_log, "Trying to recover from manifest: %s\n", manifest_path.c_str()); std::unique_ptr manifest_file_reader; @@ -5897,7 +5918,7 @@ Status VersionSet::TryRecoverFromOneManifest( /*checksum=*/true, /*log_num=*/0); VersionEditHandlerPointInTime handler_pit( read_only, column_families, const_cast(this), io_tracer_, - EpochNumberRequirement::kMightMissing); + read_options, EpochNumberRequirement::kMightMissing); handler_pit.Iterate(reader, &s); @@ -5940,6 +5961,8 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, Status VersionSet::ListColumnFamiliesFromManifest( const std::string& manifest_path, FileSystem* fs, std::vector* column_families) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::unique_ptr file_reader; Status s; { @@ -5959,7 +5982,7 @@ Status VersionSet::ListColumnFamiliesFromManifest( log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - ListColumnFamiliesHandler handler; + ListColumnFamiliesHandler handler(read_options); handler.Iterate(reader, &s); assert(column_families); @@ -5982,6 +6005,9 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, "Number of levels needs to be bigger than 1"); } + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + ImmutableDBOptions db_options(*options); ColumnFamilyOptions cf_options(*options); std::shared_ptr tc(NewLRUCache(options->max_open_files - 10, @@ -6069,8 +6095,8 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, InstrumentedMutex dummy_mutex; InstrumentedMutexLock l(&dummy_mutex); return versions.LogAndApply(versions.GetColumnFamilySet()->GetDefault(), - mutable_cf_options, &ve, &dummy_mutex, nullptr, - true); + mutable_cf_options, read_options, &ve, + &dummy_mutex, nullptr, true); } // Get the checksum information including the checksum and checksum function @@ -6143,6 +6169,9 @@ Status 
VersionSet::GetLiveFilesChecksumInfo(FileChecksumList* checksum_list) { Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex, bool json) { assert(options.env); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + std::vector column_families; Status s = ListColumnFamiliesFromManifest( dscname, options.env->GetFileSystem().get(), &column_families); @@ -6169,7 +6198,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, cf_descs.emplace_back(cf, options); } - DumpManifestHandler handler(cf_descs, this, io_tracer_, verbose, hex, json); + DumpManifestHandler handler(cf_descs, this, io_tracer_, read_options, verbose, + hex, json); { VersionSet::LogReporter reporter; reporter.status = &s; @@ -6372,6 +6402,7 @@ Status VersionSet::WriteCurrentStateToManifest( // we avoid doing binary search for the keys b and c twice and instead somehow // maintain state of where they first appear in the files. uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, + const ReadOptions& read_options, Version* v, const Slice& start, const Slice& end, int start_level, int end_level, TableReaderCaller caller) { @@ -6451,8 +6482,8 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, for (int i = idx_start + 1; i < idx_end; ++i) { uint64_t file_size = files_brief.files[i].fd.GetFileSize(); // The entire file falls into the range, so we can just take its size. 
- assert(file_size == - ApproximateSize(v, files_brief.files[i], start, end, caller)); + assert(file_size == ApproximateSize(read_options, v, files_brief.files[i], + start, end, caller)); total_full_size += file_size; } @@ -6487,21 +6518,24 @@ uint64_t VersionSet::ApproximateSize(const SizeApproximationOptions& options, // Estimate for all the first files (might also be last files), at each // level for (const auto file_ptr : first_files) { - total_full_size += ApproximateSize(v, *file_ptr, start, end, caller); + total_full_size += + ApproximateSize(read_options, v, *file_ptr, start, end, caller); } // Estimate for all the last files, at each level for (const auto file_ptr : last_files) { // We could use ApproximateSize here, but calling ApproximateOffsetOf // directly is just more efficient. - total_full_size += ApproximateOffsetOf(v, *file_ptr, end, caller); + total_full_size += + ApproximateOffsetOf(read_options, v, *file_ptr, end, caller); } } return total_full_size; } -uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, +uint64_t VersionSet::ApproximateOffsetOf(const ReadOptions& read_options, + Version* v, const FdWithKeyRange& f, const Slice& key, TableReaderCaller caller) { // pre-condition @@ -6521,14 +6555,15 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, TableCache* table_cache = v->cfd_->table_cache(); if (table_cache != nullptr) { result = table_cache->ApproximateOffsetOf( - key, *f.file_metadata, caller, icmp, + read_options, key, *f.file_metadata, caller, icmp, v->GetMutableCFOptions().prefix_extractor); } } return result; } -uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, +uint64_t VersionSet::ApproximateSize(const ReadOptions& read_options, + Version* v, const FdWithKeyRange& f, const Slice& start, const Slice& end, TableReaderCaller caller) { // pre-condition @@ -6544,13 +6579,14 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, if 
(icmp.Compare(f.smallest_key, start) >= 0) { // Start of the range is before the file start - approximate by end offset - return ApproximateOffsetOf(v, f, end, caller); + return ApproximateOffsetOf(read_options, v, f, end, caller); } if (icmp.Compare(f.largest_key, end) < 0) { // End of the range is after the file end - approximate by subtracting // start offset from the file size - uint64_t start_offset = ApproximateOffsetOf(v, f, start, caller); + uint64_t start_offset = + ApproximateOffsetOf(read_options, v, f, start, caller); assert(f.fd.GetFileSize() >= start_offset); return f.fd.GetFileSize() - start_offset; } @@ -6561,7 +6597,7 @@ uint64_t VersionSet::ApproximateSize(Version* v, const FdWithKeyRange& f, return 0; } return table_cache->ApproximateSize( - start, end, *f.file_metadata, caller, icmp, + read_options, start, end, *f.file_metadata, caller, icmp, v->GetMutableCFOptions().prefix_extractor); } @@ -6852,7 +6888,8 @@ void VersionSet::GetObsoleteFiles(std::vector* files, } ColumnFamilyData* VersionSet::CreateColumnFamily( - const ColumnFamilyOptions& cf_options, const VersionEdit* edit) { + const ColumnFamilyOptions& cf_options, const ReadOptions& read_options, + const VersionEdit* edit) { assert(edit->is_column_family_add_); MutableCFOptions dummy_cf_options; @@ -6871,7 +6908,8 @@ ColumnFamilyData* VersionSet::CreateColumnFamily( constexpr bool update_stats = false; - v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), update_stats); + v->PrepareAppend(*new_cfd->GetLatestMutableCFOptions(), read_options, + update_stats); AppendVersion(new_cfd, v); // GetLatestMutableCFOptions() is safe here without mutex since the @@ -6936,7 +6974,8 @@ uint64_t VersionSet::GetTotalBlobFileSize(Version* dummy_versions) { return all_versions_blob_file_size; } -Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd, +Status VersionSet::VerifyFileMetadata(const ReadOptions& read_options, + ColumnFamilyData* cfd, const std::string& fpath, int level, const 
FileMetaData& meta) { uint64_t fsize = 0; @@ -6969,7 +7008,7 @@ Status VersionSet::VerifyFileMetadata(ColumnFamilyData* cfd, TableCache::TypedHandle* handle = nullptr; FileMetaData meta_copy = meta; status = table_cache->FindTable( - ReadOptions(), file_opts, *icmp, meta_copy, &handle, pe, + read_options, file_opts, *icmp, meta_copy, &handle, pe, /*no_io=*/false, /*record_read_stats=*/true, internal_stats->GetFileReadHist(level), false, level, /*prefetch_index_and_filter_in_cache*/ false, max_sz_for_l0_meta_pin, @@ -7013,9 +7052,9 @@ Status ReactiveVersionSet::Recover( log::Reader* reader = manifest_reader->get(); assert(reader); - manifest_tailer_.reset( - new ManifestTailer(column_families, const_cast(this), - io_tracer_, EpochNumberRequirement::kMightMissing)); + manifest_tailer_.reset(new ManifestTailer( + column_families, const_cast(this), io_tracer_, + read_options_, EpochNumberRequirement::kMightMissing)); manifest_tailer_->Iterate(*reader, manifest_reader_status->get()); diff --git a/db/version_set.h b/db/version_set.h index 8d0633ea1..e7e96bc6c 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -913,7 +913,7 @@ class Version { // populates derived data structures. Call without mutex held. It needs to be // called before appending the version to the version set. void PrepareAppend(const MutableCFOptions& mutable_cf_options, - bool update_stats); + const ReadOptions& read_options, bool update_stats); // Reference count management (so Versions do not disappear out from // under live iterators) @@ -943,7 +943,8 @@ class Version { // specified in "file_meta". If the file name of "file_meta" is // known ahead, passing it by a non-null "fname" can save a // file-name conversion. 
- Status GetTableProperties(std::shared_ptr* tp, + Status GetTableProperties(const ReadOptions& read_options, + std::shared_ptr* tp, const FileMetaData* file_meta, const std::string* fname = nullptr) const; @@ -951,9 +952,12 @@ class Version { // On success, *props will be populated with all SSTables' table properties. // The keys of `props` are the sst file name, the values of `props` are the // tables' properties, represented as std::shared_ptr. - Status GetPropertiesOfAllTables(TablePropertiesCollection* props); - Status GetPropertiesOfAllTables(TablePropertiesCollection* props, int level); - Status GetPropertiesOfTablesInRange(const Range* range, std::size_t n, + Status GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props); + Status GetPropertiesOfAllTables(const ReadOptions& read_options, + TablePropertiesCollection* props, int level); + Status GetPropertiesOfTablesInRange(const ReadOptions& read_options, + const Range* range, std::size_t n, TablePropertiesCollection* props) const; // Print summary of range delete tombstones in SST files into out_str, @@ -965,13 +969,14 @@ class Version { // On success, "tp" will contains the aggregated table property among // the table properties of all sst files in this version. Status GetAggregatedTableProperties( + const ReadOptions& read_options, std::shared_ptr* tp, int level = -1); uint64_t GetEstimatedActiveKeys() { return storage_info_.GetEstimatedActiveKeys(); } - size_t GetMemoryUsageByTableReaders(); + size_t GetMemoryUsageByTableReaders(const ReadOptions& read_options); ColumnFamilyData* cfd() const { return cfd_; } @@ -1024,11 +1029,12 @@ class Version { // The helper function of UpdateAccumulatedStats, which may fill the missing // fields of file_meta from its associated TableProperties. // Returns true if it does initialize FileMetaData. 
- bool MaybeInitializeFileMetaData(FileMetaData* file_meta); + bool MaybeInitializeFileMetaData(const ReadOptions& read_options, + FileMetaData* file_meta); // Update the accumulated stats associated with the current version. // This accumulated stats will be used in compaction. - void UpdateAccumulatedStats(); + void UpdateAccumulatedStats(const ReadOptions& read_options); DECLARE_SYNC_AND_ASYNC( /* ret_type */ Status, /* func_name */ MultiGetFromSST, @@ -1136,13 +1142,13 @@ class VersionSet { virtual ~VersionSet(); Status LogAndApplyToDefaultColumnFamily( - VersionEdit* edit, InstrumentedMutex* mu, + const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr) { ColumnFamilyData* default_cf = GetColumnFamilySet()->GetDefault(); const MutableCFOptions* cf_options = default_cf->GetLatestMutableCFOptions(); - return LogAndApply(default_cf, *cf_options, edit, mu, + return LogAndApply(default_cf, *cf_options, read_options, edit, mu, dir_contains_current_file, new_descriptor_log, column_family_options); } @@ -1155,9 +1161,9 @@ class VersionSet { // REQUIRES: no other thread concurrently calls LogAndApply() Status LogAndApply( ColumnFamilyData* column_family_data, - const MutableCFOptions& mutable_cf_options, VersionEdit* edit, - InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, - bool new_descriptor_log = false, + const MutableCFOptions& mutable_cf_options, + const ReadOptions& read_options, VersionEdit* edit, InstrumentedMutex* mu, + FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr) { autovector cfds; cfds.emplace_back(column_family_data); @@ -1167,8 +1173,8 @@ class VersionSet { autovector edit_list; edit_list.emplace_back(edit); edit_lists.emplace_back(edit_list); - return LogAndApply(cfds, mutable_cf_options_list, edit_lists, 
mu, - dir_contains_current_file, new_descriptor_log, + return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, dir_contains_current_file, new_descriptor_log, column_family_options); } // The batch version. If edit_list.size() > 1, caller must ensure that @@ -1176,6 +1182,7 @@ class VersionSet { Status LogAndApply( ColumnFamilyData* column_family_data, const MutableCFOptions& mutable_cf_options, + const ReadOptions& read_options, const autovector& edit_list, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, const ColumnFamilyOptions* column_family_options = nullptr, @@ -1186,8 +1193,8 @@ class VersionSet { mutable_cf_options_list.emplace_back(&mutable_cf_options); autovector> edit_lists; edit_lists.emplace_back(edit_list); - return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu, - dir_contains_current_file, new_descriptor_log, + return LogAndApply(cfds, mutable_cf_options_list, read_options, edit_lists, + mu, dir_contains_current_file, new_descriptor_log, column_family_options, {manifest_wcb}); } @@ -1197,6 +1204,7 @@ class VersionSet { virtual Status LogAndApply( const autovector& cfds, const autovector& mutable_cf_options_list, + const ReadOptions& read_options, const autovector>& edit_lists, InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log = false, @@ -1427,7 +1435,8 @@ class VersionSet { // Return the approximate size of data to be scanned for range [start, end) // in levels [start_level, end_level). 
If end_level == -1 it will search // through all non-empty levels - uint64_t ApproximateSize(const SizeApproximationOptions& options, Version* v, + uint64_t ApproximateSize(const SizeApproximationOptions& options, + const ReadOptions& read_options, Version* v, const Slice& start, const Slice& end, int start_level, int end_level, TableReaderCaller caller); @@ -1487,7 +1496,8 @@ class VersionSet { new Version(cfd, this, file_options_, mutable_cf_options, io_tracer_); constexpr bool update_stats = false; - version->PrepareAppend(mutable_cf_options, update_stats); + const ReadOptions read_options; + version->PrepareAppend(mutable_cf_options, read_options, update_stats); AppendVersion(cfd, version); } @@ -1516,14 +1526,15 @@ class VersionSet { void Reset(); // Returns approximated offset of a key in a file for a given version. - uint64_t ApproximateOffsetOf(Version* v, const FdWithKeyRange& f, - const Slice& key, TableReaderCaller caller); + uint64_t ApproximateOffsetOf(const ReadOptions& read_options, Version* v, + const FdWithKeyRange& f, const Slice& key, + TableReaderCaller caller); // Returns approximated data size between start and end keys in a file // for a given version. 
- uint64_t ApproximateSize(Version* v, const FdWithKeyRange& f, - const Slice& start, const Slice& end, - TableReaderCaller caller); + uint64_t ApproximateSize(const ReadOptions& read_options, Version* v, + const FdWithKeyRange& f, const Slice& start, + const Slice& end, TableReaderCaller caller); struct MutableCFState { uint64_t log_number; @@ -1542,9 +1553,11 @@ class VersionSet { void AppendVersion(ColumnFamilyData* column_family_data, Version* v); ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, + const ReadOptions& read_options, const VersionEdit* edit); - Status VerifyFileMetadata(ColumnFamilyData* cfd, const std::string& fpath, + Status VerifyFileMetadata(const ReadOptions& read_options, + ColumnFamilyData* cfd, const std::string& fpath, int level, const FileMetaData& meta); // Protected by DB mutex. @@ -1620,7 +1633,8 @@ class VersionSet { InstrumentedMutex* mu, FSDirectory* dir_contains_current_file, bool new_descriptor_log, - const ColumnFamilyOptions* new_cf_options); + const ColumnFamilyOptions* new_cf_options, + const ReadOptions& read_options); void LogAndApplyCFHelper(VersionEdit* edit, SequenceNumber* max_last_sequence); @@ -1672,13 +1686,15 @@ class ReactiveVersionSet : public VersionSet { private: std::unique_ptr manifest_tailer_; - + // TODO: plumb Env::IOActivity + const ReadOptions read_options_; using VersionSet::LogAndApply; using VersionSet::Recover; Status LogAndApply( const autovector& /*cfds*/, const autovector& /*mutable_cf_options_list*/, + const ReadOptions& /* read_options */, const autovector>& /*edit_lists*/, InstrumentedMutex* /*mu*/, FSDirectory* /*dir_contains_current_file*/, bool /*new_descriptor_log*/, const ColumnFamilyOptions* /*new_cf_option*/, diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 0815d4cab..481dd46d9 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -1307,9 +1307,9 @@ class VersionSetTestBase { Status LogAndApplyToDefaultCF(VersionEdit& edit) { 
mutex_.Lock(); - Status s = - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_, nullptr); + Status s = versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, &edit, &mutex_, nullptr); mutex_.Unlock(); return s; } @@ -1321,9 +1321,9 @@ class VersionSetTestBase { vedits.push_back(e.get()); } mutex_.Lock(); - Status s = - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, vedits, &mutex_, nullptr); + Status s = versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, vedits, &mutex_, nullptr); mutex_.Unlock(); return s; } @@ -1335,7 +1335,7 @@ class VersionSetTestBase { VersionEdit dummy; ASSERT_OK(versions_->LogAndApply( versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, - &dummy, &mutex_, db_directory, new_descriptor_log)); + read_options_, &dummy, &mutex_, db_directory, new_descriptor_log)); mutex_.Unlock(); } @@ -1350,7 +1350,8 @@ class VersionSetTestBase { Status s; mutex_.Lock(); s = versions_->LogAndApply(/*column_family_data=*/nullptr, - MutableCFOptions(cf_options), &new_cf, &mutex_, + MutableCFOptions(cf_options), read_options_, + &new_cf, &mutex_, /*db_directory=*/nullptr, /*new_descriptor_log=*/false, &cf_options); mutex_.Unlock(); @@ -1372,6 +1373,7 @@ class VersionSetTestBase { ColumnFamilyOptions cf_options_; ImmutableOptions immutable_options_; MutableCFOptions mutable_cf_options_; + const ReadOptions read_options_; std::shared_ptr table_cache_; WriteController write_controller_; WriteBufferManager write_buffer_manager_; @@ -1395,6 +1397,8 @@ class VersionSetTest : public VersionSetTestBase, public testing::Test { TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { NewDB(); const int kGroupSize = 5; + const ReadOptions read_options; + autovector edits; for (int i = 0; i != kGroupSize; ++i) { edits.emplace_back(VersionEdit()); 
@@ -1421,8 +1425,8 @@ TEST_F(VersionSetTest, SameColumnFamilyGroupCommit) { }); SyncPoint::GetInstance()->EnableProcessing(); mutex_.Lock(); - Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists, - &mutex_, nullptr); + Status s = versions_->LogAndApply(cfds, all_mutable_cf_options, read_options, + edit_lists, &mutex_, nullptr); mutex_.Unlock(); EXPECT_OK(s); EXPECT_EQ(kGroupSize - 1, count); @@ -1622,9 +1626,9 @@ TEST_F(VersionSetTest, ObsoleteBlobFile) { edit.AddBlobFileGarbage(blob_file_number, total_blob_count, total_blob_bytes); mutex_.Lock(); - Status s = - versions_->LogAndApply(versions_->GetColumnFamilySet()->GetDefault(), - mutable_cf_options_, &edit, &mutex_, nullptr); + Status s = versions_->LogAndApply( + versions_->GetColumnFamilySet()->GetDefault(), mutable_cf_options_, + read_options_, &edit, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); @@ -2242,7 +2246,7 @@ class VersionSetWithTimestampTest : public VersionSetTest { Status s; mutex_.Lock(); s = versions_->LogAndApply(cfd_, *(cfd_->GetLatestMutableCFOptions()), - edits_, &mutex_, nullptr); + read_options_, edits_, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); VerifyFullHistoryTsLow(*std::max_element(ts_lbs.begin(), ts_lbs.end())); @@ -2252,6 +2256,9 @@ class VersionSetWithTimestampTest : public VersionSetTest { ColumnFamilyData* cfd_{nullptr}; // edits_ must contain and own pointers to heap-alloc VersionEdit objects. autovector edits_; + + private: + const ReadOptions read_options_; }; const std::string VersionSetWithTimestampTest::kNewCfName("new_cf"); @@ -2680,6 +2687,8 @@ class VersionSetTestDropOneCF : public VersionSetTestBase, // Repeat the test for i = 1, 2, 3 to simulate dropping the first, middle and // last column family in an atomic group. 
TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { + const ReadOptions read_options; + std::vector column_families; SequenceNumber last_seqno; std::unique_ptr log_writer; @@ -2709,7 +2718,7 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { mutex_.Lock(); s = versions_->LogAndApply(cfd_to_drop, *cfd_to_drop->GetLatestMutableCFOptions(), - &drop_cf_edit, &mutex_, nullptr); + read_options, &drop_cf_edit, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); @@ -2758,8 +2767,8 @@ TEST_P(VersionSetTestDropOneCF, HandleDroppedColumnFamilyInAtomicGroup) { }); SyncPoint::GetInstance()->EnableProcessing(); mutex_.Lock(); - s = versions_->LogAndApply(cfds, mutable_cf_options_list, edit_lists, &mutex_, - nullptr); + s = versions_->LogAndApply(cfds, mutable_cf_options_list, read_options, + edit_lists, &mutex_, nullptr); mutex_.Unlock(); ASSERT_OK(s); ASSERT_EQ(1, called); diff --git a/db/version_util.h b/db/version_util.h index 5ec6fda11..e39f25571 100644 --- a/db/version_util.h +++ b/db/version_util.h @@ -33,14 +33,15 @@ class OfflineManifestWriter { /*no_error_if_files_missing*/ true); } - Status LogAndApply(ColumnFamilyData* cfd, VersionEdit* edit, + Status LogAndApply(const ReadOptions& read_options, ColumnFamilyData* cfd, + VersionEdit* edit, FSDirectory* dir_contains_current_file) { // Use `mutex` to imitate a locked DB mutex when calling `LogAndApply()`. 
InstrumentedMutex mutex; mutex.Lock(); - Status s = versions_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - edit, &mutex, dir_contains_current_file, - false /* new_descriptor_log */); + Status s = versions_.LogAndApply( + cfd, *cfd->GetLatestMutableCFOptions(), read_options, edit, &mutex, + dir_contains_current_file, false /* new_descriptor_log */); mutex.Unlock(); return s; } diff --git a/db/write_batch.cc b/db/write_batch.cc index 5f5c0bfcd..7fa0ed694 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -2036,6 +2036,7 @@ class MemTableInserter : public WriteBatch::Handler { // key not found in memtable. Do sst get, update, add SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; + // TODO: plumb Env::IOActivity ReadOptions ropts; // it's going to be overwritten for sure, so no point caching data block // containing the old version @@ -2480,6 +2481,7 @@ class MemTableInserter : public WriteBatch::Handler { // operations in the same batch. SnapshotImpl read_from_snapshot; read_from_snapshot.number_ = sequence_; + // TODO: plumb Env::IOActivity ReadOptions read_options; read_options.snapshot = &read_from_snapshot; diff --git a/db_stress_tool/db_stress_common.h b/db_stress_tool/db_stress_common.h index dbe912fe9..d16fefe4c 100644 --- a/db_stress_tool/db_stress_common.h +++ b/db_stress_tool/db_stress_common.h @@ -311,6 +311,8 @@ DECLARE_int32(create_timestamped_snapshot_one_in); DECLARE_bool(allow_data_in_errors); +DECLARE_bool(enable_thread_tracking); + // Tiered storage DECLARE_bool(enable_tiered_storage); // set last_level_temperature DECLARE_int64(preclude_last_level_data_seconds); diff --git a/db_stress_tool/db_stress_driver.cc b/db_stress_tool/db_stress_driver.cc index 2c8dcf610..4bf82c9d1 100644 --- a/db_stress_tool/db_stress_driver.cc +++ b/db_stress_tool/db_stress_driver.cc @@ -55,8 +55,7 @@ void ThreadBody(void* v) { } } } - -bool RunStressTest(SharedState* shared) { +bool RunStressTestImpl(SharedState* shared) { 
SystemClock* clock = db_stress_env->GetSystemClock().get(); StressTest* stress = shared->GetStressTest(); @@ -207,5 +206,11 @@ bool RunStressTest(SharedState* shared) { } return true; } +bool RunStressTest(SharedState* shared) { + ThreadStatusUtil::RegisterThread(db_stress_env, ThreadStatus::USER); + bool result = RunStressTestImpl(shared); + ThreadStatusUtil::UnregisterThread(); + return result; +} } // namespace ROCKSDB_NAMESPACE #endif // GFLAGS diff --git a/db_stress_tool/db_stress_env_wrapper.h b/db_stress_tool/db_stress_env_wrapper.h index af60df9bc..612d9fc6b 100644 --- a/db_stress_tool/db_stress_env_wrapper.h +++ b/db_stress_tool/db_stress_env_wrapper.h @@ -10,8 +10,30 @@ #ifdef GFLAGS #pragma once #include "db_stress_tool/db_stress_common.h" +#include "monitoring/thread_status_util.h" namespace ROCKSDB_NAMESPACE { +class DbStressRandomAccessFileWrapper : public FSRandomAccessFileOwnerWrapper { + public: + explicit DbStressRandomAccessFileWrapper( + std::unique_ptr&& target) + : FSRandomAccessFileOwnerWrapper(std::move(target)) {} + + IOStatus Read(uint64_t offset, size_t n, const IOOptions& options, + Slice* result, char* scratch, + IODebugContext* dbg) const override { +#ifndef NDEBUG + const ThreadStatus::OperationType thread_op = + ThreadStatusUtil::GetThreadOperation(); + Env::IOActivity io_activity = + ThreadStatusUtil::TEST_GetExpectedIOActivity(thread_op); + assert(io_activity == Env::IOActivity::kUnknown || + io_activity == options.io_activity); +#endif + return target()->Read(offset, n, options, result, scratch, dbg); + } +}; + class DbStressFSWrapper : public FileSystemWrapper { public: explicit DbStressFSWrapper(const std::shared_ptr& t) @@ -19,6 +41,18 @@ class DbStressFSWrapper : public FileSystemWrapper { static const char* kClassName() { return "DbStressFS"; } const char* Name() const override { return kClassName(); } + IOStatus NewRandomAccessFile(const std::string& f, + const FileOptions& file_opts, + std::unique_ptr* r, + IODebugContext* 
dbg) override { + std::unique_ptr file; + IOStatus s = target()->NewRandomAccessFile(f, file_opts, &file, dbg); + if (s.ok()) { + r->reset(new DbStressRandomAccessFileWrapper(std::move(file))); + } + return s; + } + IOStatus DeleteFile(const std::string& f, const IOOptions& opts, IODebugContext* dbg) override { // We determine whether it is a manifest file by searching a strong, diff --git a/db_stress_tool/db_stress_gflags.cc b/db_stress_tool/db_stress_gflags.cc index dd6d04916..b6ee67269 100644 --- a/db_stress_tool/db_stress_gflags.cc +++ b/db_stress_tool/db_stress_gflags.cc @@ -1051,6 +1051,11 @@ DEFINE_bool(allow_data_in_errors, ROCKSDB_NAMESPACE::Options().allow_data_in_errors, "If true, allow logging data, e.g. key, value in LOG files."); +DEFINE_bool(enable_thread_tracking, + ROCKSDB_NAMESPACE::Options().enable_thread_tracking, + "If true, the status of the threads involved in this DB will be " + "tracked and available via GetThreadList() API."); + DEFINE_int32(verify_iterator_with_expected_state_one_in, 0, "If non-zero, when TestIterate() is to be called, there is a " "1/verify_iterator_with_expected_state_one_in " diff --git a/db_stress_tool/db_stress_test_base.cc b/db_stress_tool/db_stress_test_base.cc index e508dadb5..60a12b331 100644 --- a/db_stress_tool/db_stress_test_base.cc +++ b/db_stress_tool/db_stress_test_base.cc @@ -3207,6 +3207,8 @@ void InitializeOptionsFromFlags( } options.allow_data_in_errors = FLAGS_allow_data_in_errors; + + options.enable_thread_tracking = FLAGS_enable_thread_tracking; } void InitializeOptionsGeneral( diff --git a/db_stress_tool/db_stress_tool.cc b/db_stress_tool/db_stress_tool.cc index c37117921..c41c5051f 100644 --- a/db_stress_tool/db_stress_tool.cc +++ b/db_stress_tool/db_stress_tool.cc @@ -99,13 +99,6 @@ int db_stress_tool(int argc, char** argv) { env_wrapper_guard = std::make_shared( raw_env, std::make_shared(raw_env->GetFileSystem())); - if (!env_opts && !FLAGS_use_io_uring) { - // If using the default Env (Posix), 
wrap DbStressEnvWrapper with the - // legacy EnvWrapper. This is a workaround to prevent MultiGet and scans - // from failing when IO uring is disabled. The EnvWrapper - // has a default implementation of ReadAsync that redirects to Read. - env_wrapper_guard = std::make_shared(env_wrapper_guard); - } db_stress_env = env_wrapper_guard.get(); FLAGS_rep_factory = StringToRepFactory(FLAGS_memtablerep.c_str()); diff --git a/db_stress_tool/multi_ops_txns_stress.cc b/db_stress_tool/multi_ops_txns_stress.cc index 4c05879aa..89b061004 100644 --- a/db_stress_tool/multi_ops_txns_stress.cc +++ b/db_stress_tool/multi_ops_txns_stress.cc @@ -1220,7 +1220,8 @@ void MultiOpsTxnsStressTest::VerifyDb(ThreadState* thread) const { // which can be called before TransactionDB::Open() returns to caller. // Therefore, at that time, db_ and txn_db_ may still be nullptr. // Caller has to make sure that the race condition does not happen. -void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) { +void MultiOpsTxnsStressTest::VerifyPkSkFast(const ReadOptions& read_options, + int job_id) { DB* const db = db_aptr_.load(std::memory_order_acquire); if (db == nullptr) { return; @@ -1249,6 +1250,7 @@ void MultiOpsTxnsStressTest::VerifyPkSkFast(int job_id) { ReadOptions ropts; ropts.snapshot = snapshot; ropts.total_order_seek = true; + ropts.io_activity = read_options.io_activity; std::unique_ptr it(db_->NewIterator(ropts)); for (it->Seek(start_key); it->Valid(); it->Next()) { diff --git a/db_stress_tool/multi_ops_txns_stress.h b/db_stress_tool/multi_ops_txns_stress.h index 26744df66..12c45aaa3 100644 --- a/db_stress_tool/multi_ops_txns_stress.h +++ b/db_stress_tool/multi_ops_txns_stress.h @@ -288,7 +288,7 @@ class MultiOpsTxnsStressTest : public StressTest { VerifyDb(thread); } - void VerifyPkSkFast(int job_id); + void VerifyPkSkFast(const ReadOptions& read_options, int job_id); protected: class Counter { @@ -424,7 +424,8 @@ class MultiOpsTxnsStressListener : public EventListener { (void)db; #endif 
assert(info.cf_id == 0); - stress_test_->VerifyPkSkFast(info.job_id); + const ReadOptions read_options(Env::IOActivity::kFlush); + stress_test_->VerifyPkSkFast(read_options, info.job_id); } void OnCompactionCompleted(DB* db, const CompactionJobInfo& info) override { @@ -433,7 +434,8 @@ class MultiOpsTxnsStressListener : public EventListener { (void)db; #endif assert(info.cf_id == 0); - stress_test_->VerifyPkSkFast(info.job_id); + const ReadOptions read_options(Env::IOActivity::kCompaction); + stress_test_->VerifyPkSkFast(read_options, info.job_id); } private: diff --git a/file/file_util.cc b/file/file_util.cc index 43608fcdc..46faac67c 100644 --- a/file/file_util.cc +++ b/file/file_util.cc @@ -185,9 +185,9 @@ IOStatus GenerateOneFileChecksum( if (!io_s.ok()) { return io_s; } - reader.reset(new RandomAccessFileReader(std::move(r_file), file_path, - nullptr /*Env*/, io_tracer, nullptr, - 0, nullptr, rate_limiter)); + reader.reset(new RandomAccessFileReader( + std::move(r_file), file_path, nullptr /*Env*/, io_tracer, nullptr, + Histograms::HISTOGRAM_ENUM_MAX, nullptr, rate_limiter)); } // Found that 256 KB readahead size provides the best performance, based on diff --git a/file/file_util.h b/file/file_util.h index d46a7ba0e..e279cfba0 100644 --- a/file/file_util.h +++ b/file/file_util.h @@ -80,6 +80,8 @@ inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, } opts.rate_limiter_priority = ro.rate_limiter_priority; + opts.io_activity = ro.io_activity; + return IOStatus::OK(); } diff --git a/file/random_access_file_reader.cc b/file/random_access_file_reader.cc index 226970641..2f8b51667 100644 --- a/file/random_access_file_reader.cc +++ b/file/random_access_file_reader.cc @@ -22,7 +22,11 @@ #include "util/rate_limiter.h" namespace ROCKSDB_NAMESPACE { - +const std::array + kReadHistograms{{ + FILE_READ_FLUSH_MICROS, + FILE_READ_COMPACTION_MICROS, + }}; inline void RecordIOStats(Statistics* stats, Temperature file_temperature, bool is_last_level, size_t size) { 
IOSTATS_ADD(bytes_read, size); @@ -94,6 +98,9 @@ IOStatus RandomAccessFileReader::Read( uint64_t elapsed = 0; { StopWatch sw(clock_, stats_, hist_type_, + (opts.io_activity != Env::IOActivity::kUnknown) + ? kReadHistograms[(std::size_t)(opts.io_activity)] + : Histograms::HISTOGRAM_ENUM_MAX, (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -288,6 +295,9 @@ IOStatus RandomAccessFileReader::MultiRead( uint64_t elapsed = 0; { StopWatch sw(clock_, stats_, hist_type_, + (opts.io_activity != Env::IOActivity::kUnknown) + ? kReadHistograms[(std::size_t)(opts.io_activity)] + : Histograms::HISTOGRAM_ENUM_MAX, (stats_ != nullptr) ? &elapsed : nullptr, true /*overwrite*/, true /*delay_enabled*/); auto prev_perf_level = GetPerfLevel(); @@ -425,7 +435,7 @@ IOStatus RandomAccessFileReader::MultiRead( } IOStatus RandomAccessFileReader::PrepareIOOptions(const ReadOptions& ro, - IOOptions& opts) { + IOOptions& opts) const { if (clock_ != nullptr) { return PrepareIOFromReadOptions(ro, clock_, opts); } else { @@ -476,13 +486,17 @@ IOStatus RandomAccessFileReader::ReadAsync( assert(read_async_info->buf_.CurrentSize() == 0); - StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed, - true /*overwrite*/, true /*delay_enabled*/); + StopWatch sw(clock_, nullptr /*stats*/, + Histograms::HISTOGRAM_ENUM_MAX /*hist_type*/, + Histograms::HISTOGRAM_ENUM_MAX, &elapsed, true /*overwrite*/, + true /*delay_enabled*/); s = file_->ReadAsync(aligned_req, opts, read_async_callback, read_async_info, io_handle, del_fn, nullptr /*dbg*/); } else { - StopWatch sw(clock_, nullptr /*stats*/, 0 /*hist_type*/, &elapsed, - true /*overwrite*/, true /*delay_enabled*/); + StopWatch sw(clock_, nullptr /*stats*/, + Histograms::HISTOGRAM_ENUM_MAX /*hist_type*/, + Histograms::HISTOGRAM_ENUM_MAX, &elapsed, true /*overwrite*/, + true /*delay_enabled*/); s = file_->ReadAsync(req, opts, read_async_callback, read_async_info, io_handle, 
del_fn, nullptr /*dbg*/); } diff --git a/file/random_access_file_reader.h b/file/random_access_file_reader.h index 50baa0318..ab4d1e797 100644 --- a/file/random_access_file_reader.h +++ b/file/random_access_file_reader.h @@ -122,7 +122,8 @@ class RandomAccessFileReader { std::unique_ptr&& raf, const std::string& _file_name, SystemClock* clock = nullptr, const std::shared_ptr& io_tracer = nullptr, - Statistics* stats = nullptr, uint32_t hist_type = 0, + Statistics* stats = nullptr, + uint32_t hist_type = Histograms::HISTOGRAM_ENUM_MAX, HistogramImpl* file_read_hist = nullptr, RateLimiter* rate_limiter = nullptr, const std::vector>& listeners = {}, @@ -197,7 +198,7 @@ class RandomAccessFileReader { bool use_direct_io() const { return file_->use_direct_io(); } - IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts); + IOStatus PrepareIOOptions(const ReadOptions& ro, IOOptions& opts) const; IOStatus ReadAsync(FSReadRequest& req, const IOOptions& opts, std::function cb, diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 62af602c6..c6523f063 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -436,6 +436,14 @@ class Env : public Customizable { IO_TOTAL = 4 }; + // EXPERIMENTAL + enum class IOActivity : uint8_t { + kFlush = 0, + kCompaction = 1, + kDBOpen = 2, + kUnknown, + }; + // Arrange to run "(*function)(arg)" once in a background thread, in // the thread pool specified by pri. By default, jobs go to the 'LOW' // priority thread pool. diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 97b21e286..ae59ef800 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -116,6 +116,8 @@ struct IOOptions { // directories and list only files in GetChildren API. 
bool do_not_recurse; + Env::IOActivity io_activity = Env::IOActivity::kUnknown; + IOOptions() : IOOptions(false) {} explicit IOOptions(bool force_dir_fsync_) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 669afc1d4..611ba8b79 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1696,8 +1696,11 @@ struct ReadOptions { // Default: true bool optimize_multiget_for_io; + Env::IOActivity io_activity; + ReadOptions(); ReadOptions(bool cksum, bool cache); + explicit ReadOptions(Env::IOActivity io_activity); }; // Options that control write operations diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index f25e02ebe..b7a8905ff 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -472,7 +472,13 @@ enum Histograms : uint32_t { NUM_FILES_IN_SINGLE_COMPACTION, DB_SEEK, WRITE_STALL, + // Time spent in reading block-based or plain SST table SST_READ_MICROS, + // Time spent in reading SST table (currently only block-based table) or blob + // file for flush or compaction + FILE_READ_FLUSH_MICROS, + FILE_READ_COMPACTION_MICROS, + // The number of subcompactions actually scheduled during a compaction NUM_SUBCOMPACTIONS_SCHEDULED, // Value size distribution in each operation diff --git a/include/rocksdb/thread_status.h b/include/rocksdb/thread_status.h index f37e45f97..beecdfd25 100644 --- a/include/rocksdb/thread_status.h +++ b/include/rocksdb/thread_status.h @@ -56,6 +56,7 @@ struct ThreadStatus { OP_UNKNOWN = 0, OP_COMPACTION, OP_FLUSH, + OP_DBOPEN, NUM_OP_TYPES }; diff --git a/include/rocksdb/utilities/transaction.h b/include/rocksdb/utilities/transaction.h index 91c088a60..947fcec55 100644 --- a/include/rocksdb/utilities/transaction.h +++ b/include/rocksdb/utilities/transaction.h @@ -681,4 +681,3 @@ class Transaction { }; } // namespace ROCKSDB_NAMESPACE - diff --git a/java/rocksjni/portal.h b/java/rocksjni/portal.h index ee87f8947..32dcca9df 100644 --- a/java/rocksjni/portal.h +++ 
b/java/rocksjni/portal.h @@ -5619,6 +5619,10 @@ class HistogramTypeJni { return 0x38; case ROCKSDB_NAMESPACE::Histograms::TABLE_OPEN_PREFETCH_TAIL_READ_BYTES: return 0x39; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS: + return 0x3A; + case ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS: + return 0x3B; case ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX: // 0x1F for backwards compatibility on current minor version. return 0x1F; @@ -5738,6 +5742,10 @@ class HistogramTypeJni { case 0x39: return ROCKSDB_NAMESPACE::Histograms:: TABLE_OPEN_PREFETCH_TAIL_READ_BYTES; + case 0x3A: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_FLUSH_MICROS; + case 0x3B: + return ROCKSDB_NAMESPACE::Histograms::FILE_READ_COMPACTION_MICROS; case 0x1F: // 0x1F for backwards compatibility on current minor version. return ROCKSDB_NAMESPACE::Histograms::HISTOGRAM_ENUM_MAX; @@ -6777,6 +6785,8 @@ class OperationTypeJni { return 0x1; case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_FLUSH: return 0x2; + case ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_DBOPEN: + return 0x3; default: return 0x7F; // undefined } @@ -6793,6 +6803,8 @@ class OperationTypeJni { return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_COMPACTION; case 0x2: return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_FLUSH; + case 0x3: + return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_DBOPEN; default: // undefined/default return ROCKSDB_NAMESPACE::ThreadStatus::OperationType::OP_UNKNOWN; diff --git a/java/src/main/java/org/rocksdb/HistogramType.java b/java/src/main/java/org/rocksdb/HistogramType.java index 20c54422c..c5da68d16 100644 --- a/java/src/main/java/org/rocksdb/HistogramType.java +++ b/java/src/main/java/org/rocksdb/HistogramType.java @@ -169,6 +169,10 @@ public enum HistogramType { */ TABLE_OPEN_PREFETCH_TAIL_READ_BYTES((byte) 0x39), + FILE_READ_FLUSH_MICROS((byte) 0x3A), + + FILE_READ_COMPACTION_MICROS((byte) 0x3B), + // 0x1F for backwards compatibility on 
current minor version. HISTOGRAM_ENUM_MAX((byte) 0x1F); diff --git a/java/src/main/java/org/rocksdb/OperationType.java b/java/src/main/java/org/rocksdb/OperationType.java index 7cc9b65cd..301caea32 100644 --- a/java/src/main/java/org/rocksdb/OperationType.java +++ b/java/src/main/java/org/rocksdb/OperationType.java @@ -14,7 +14,8 @@ package org.rocksdb; public enum OperationType { OP_UNKNOWN((byte)0x0), OP_COMPACTION((byte)0x1), - OP_FLUSH((byte)0x2); + OP_FLUSH((byte) 0x2), + OP_DBOPEN((byte) 0x3); private final byte value; diff --git a/microbench/db_basic_bench.cc b/microbench/db_basic_bench.cc index 3f9ed8e63..fbcab5391 100644 --- a/microbench/db_basic_bench.cc +++ b/microbench/db_basic_bench.cc @@ -1548,7 +1548,8 @@ static void RandomAccessFileReaderRead(benchmark::State& state) { : Temperature::kCold; readers.emplace_back(new RandomAccessFileReader( std::move(f), fname, env->GetSystemClock().get(), nullptr, statistics, - 0, nullptr, nullptr, {}, temperature, rand_num == 1)); + Histograms::HISTOGRAM_ENUM_MAX, nullptr, nullptr, {}, temperature, + rand_num == 1)); } IOOptions io_options; diff --git a/monitoring/statistics.cc b/monitoring/statistics.cc index 05de681a2..90e3fbda7 100644 --- a/monitoring/statistics.cc +++ b/monitoring/statistics.cc @@ -243,6 +243,8 @@ const std::vector> HistogramsNameMap = { {DB_SEEK, "rocksdb.db.seek.micros"}, {WRITE_STALL, "rocksdb.db.write.stall"}, {SST_READ_MICROS, "rocksdb.sst.read.micros"}, + {FILE_READ_FLUSH_MICROS, "rocksdb.file.read.flush.micros"}, + {FILE_READ_COMPACTION_MICROS, "rocksdb.file.read.compaction.micros"}, {NUM_SUBCOMPACTIONS_SCHEDULED, "rocksdb.num.subcompactions.scheduled"}, {BYTES_PER_READ, "rocksdb.bytes.per.read"}, {BYTES_PER_WRITE, "rocksdb.bytes.per.write"}, diff --git a/monitoring/thread_status_updater.cc b/monitoring/thread_status_updater.cc index 9707d2265..37fcef62b 100644 --- a/monitoring/thread_status_updater.cc +++ b/monitoring/thread_status_updater.cc @@ -47,15 +47,19 @@ void 
ThreadStatusUpdater::ResetThreadStatus() { SetColumnFamilyInfoKey(nullptr); } +void ThreadStatusUpdater::SetEnableTracking(bool enable_tracking) { + auto* data = Get(); + if (data == nullptr) { + return; + } + data->enable_tracking.store(enable_tracking, std::memory_order_relaxed); +} + void ThreadStatusUpdater::SetColumnFamilyInfoKey(const void* cf_key) { auto* data = Get(); if (data == nullptr) { return; } - // set the tracking flag based on whether cf_key is non-null or not. - // If enable_thread_tracking is set to false, the input cf_key - // would be nullptr. - data->enable_tracking = (cf_key != nullptr); data->cf_key.store(const_cast(cf_key), std::memory_order_relaxed); } @@ -86,6 +90,14 @@ void ThreadStatusUpdater::SetThreadOperation( } } +ThreadStatus::OperationType ThreadStatusUpdater::GetThreadOperation() { + ThreadStatusData* data = GetLocalThreadStatus(); + if (data == nullptr) { + return ThreadStatus::OperationType::OP_UNKNOWN; + } + return data->operation_type.load(std::memory_order_relaxed); +} + void ThreadStatusUpdater::SetThreadOperationProperty(int i, uint64_t value) { auto* data = GetLocalThreadStatus(); if (data == nullptr) { @@ -211,9 +223,7 @@ ThreadStatusData* ThreadStatusUpdater::GetLocalThreadStatus() { if (thread_status_data_ == nullptr) { return nullptr; } - if (!thread_status_data_->enable_tracking) { - assert(thread_status_data_->cf_key.load(std::memory_order_relaxed) == - nullptr); + if (!thread_status_data_->enable_tracking.load(std::memory_order_relaxed)) { return nullptr; } return thread_status_data_; diff --git a/monitoring/thread_status_updater.h b/monitoring/thread_status_updater.h index 762c73ae2..696063cb4 100644 --- a/monitoring/thread_status_updater.h +++ b/monitoring/thread_status_updater.h @@ -62,7 +62,8 @@ struct ConstantColumnFamilyInfo { // status of a thread using a set of atomic pointers. 
struct ThreadStatusData { #ifdef ROCKSDB_USING_THREAD_STATUS - explicit ThreadStatusData() : enable_tracking(false) { + explicit ThreadStatusData() { + enable_tracking.store(false); thread_id.store(0); thread_type.store(ThreadStatus::USER); cf_key.store(nullptr); @@ -72,13 +73,10 @@ struct ThreadStatusData { } // A flag to indicate whether the thread tracking is enabled - // in the current thread. This value will be updated based on whether - // the associated Options::enable_thread_tracking is set to true - // in ThreadStatusUtil::SetColumnFamily(). - // + // in the current thread. // If set to false, then SetThreadOperation and SetThreadState // will be no-op. - bool enable_tracking; + std::atomic enable_tracking; std::atomic thread_id; std::atomic thread_type; @@ -119,8 +117,10 @@ class ThreadStatusUpdater { // Register the current thread for tracking. void RegisterThread(ThreadStatus::ThreadType ttype, uint64_t thread_id); + void SetEnableTracking(bool enable_tracking); + // Update the column-family info of the current thread by setting - // its thread-local pointer of ThreadStateInfo to the correct entry. + // its thread-local pointer of ThreadStatusData to the correct entry. void SetColumnFamilyInfoKey(const void* cf_key); // returns the column family info key. @@ -129,6 +129,9 @@ class ThreadStatusUpdater { // Update the thread operation of the current thread. void SetThreadOperation(const ThreadStatus::OperationType type); + // Return the thread operation of the current thread. + ThreadStatus::OperationType GetThreadOperation(); + // The start time of the current thread operation. It is in the format // of micro-seconds since some fixed point in time. 
void SetOperationStartTime(const uint64_t start_time); diff --git a/monitoring/thread_status_util.cc b/monitoring/thread_status_util.cc index c07b85fa8..9b66dc28e 100644 --- a/monitoring/thread_status_util.cc +++ b/monitoring/thread_status_util.cc @@ -33,27 +33,23 @@ void ThreadStatusUtil::UnregisterThread() { } } -void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd, - const Env* env, - bool enable_thread_tracking) { - if (!MaybeInitThreadLocalUpdater(env)) { +void ThreadStatusUtil::SetEnableTracking(bool enable_tracking) { + if (thread_updater_local_cache_ == nullptr) { return; } - assert(thread_updater_local_cache_); - if (cfd != nullptr && enable_thread_tracking) { - thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); - } else { - // When cfd == nullptr or enable_thread_tracking == false, we set - // ColumnFamilyInfoKey to nullptr, which makes SetThreadOperation - // and SetThreadState become no-op. - thread_updater_local_cache_->SetColumnFamilyInfoKey(nullptr); + thread_updater_local_cache_->SetEnableTracking(enable_tracking); +} + +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* cfd) { + if (thread_updater_local_cache_ == nullptr) { + return; } + assert(cfd); + thread_updater_local_cache_->SetColumnFamilyInfoKey(cfd); } void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { if (thread_updater_local_cache_ == nullptr) { - // thread_updater_local_cache_ must be set in SetColumnFamily - // or other ThreadStatusUtil functions. 
return; } @@ -68,6 +64,13 @@ void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType op) { thread_updater_local_cache_->SetThreadOperation(op); } +ThreadStatus::OperationType ThreadStatusUtil::GetThreadOperation() { + if (thread_updater_local_cache_ == nullptr) { + return ThreadStatus::OperationType::OP_UNKNOWN; + } + return thread_updater_local_cache_->GetThreadOperation(); +} + ThreadStatus::OperationStage ThreadStatusUtil::SetThreadOperationStage( ThreadStatus::OperationStage stage) { if (thread_updater_local_cache_ == nullptr) { @@ -172,9 +175,7 @@ bool ThreadStatusUtil::MaybeInitThreadLocalUpdater(const Env* /*env*/) { return false; } -void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* /*cfd*/, - const Env* /*env*/, - bool /*enable_thread_tracking*/) {} +void ThreadStatusUtil::SetColumnFamily(const ColumnFamilyData* /*cfd*/) {} void ThreadStatusUtil::SetThreadOperation(ThreadStatus::OperationType /*op*/) {} @@ -189,7 +190,7 @@ void ThreadStatusUtil::SetThreadState(ThreadStatus::StateType /*state*/) {} void ThreadStatusUtil::NewColumnFamilyInfo(const DB* /*db*/, const ColumnFamilyData* /*cfd*/, const std::string& /*cf_name*/, - const Env* /*env*/) {} + const Env* /*env*/) {} void ThreadStatusUtil::EraseColumnFamilyInfo(const ColumnFamilyData* /*cfd*/) {} diff --git a/monitoring/thread_status_util.h b/monitoring/thread_status_util.h index 0137d2682..df148a039 100644 --- a/monitoring/thread_status_util.h +++ b/monitoring/thread_status_util.h @@ -52,13 +52,18 @@ class ThreadStatusUtil { // the current thread does not hold db_mutex. static void EraseDatabaseInfo(const DB* db); + static void SetEnableTracking(bool enable_tracking); + // Update the thread status to indicate the current thread is doing // something related to the specified column family.
- static void SetColumnFamily(const ColumnFamilyData* cfd, const Env* env, - bool enable_thread_tracking); + // + // REQUIRES: cfd != nullptr + static void SetColumnFamily(const ColumnFamilyData* cfd); static void SetThreadOperation(ThreadStatus::OperationType type); + static ThreadStatus::OperationType GetThreadOperation(); + static ThreadStatus::OperationStage SetThreadOperationStage( ThreadStatus::OperationStage stage); @@ -74,6 +79,9 @@ class ThreadStatusUtil { static void TEST_SetStateDelay(const ThreadStatus::StateType state, int micro); static void TEST_StateDelay(const ThreadStatus::StateType state); + + static Env::IOActivity TEST_GetExpectedIOActivity( + ThreadStatus::OperationType thread_op); #endif protected: diff --git a/monitoring/thread_status_util_debug.cc b/monitoring/thread_status_util_debug.cc index f7a94355d..6e4fe8a9f 100644 --- a/monitoring/thread_status_util_debug.cc +++ b/monitoring/thread_status_util_debug.cc @@ -27,6 +27,20 @@ void ThreadStatusUtil::TEST_StateDelay(const ThreadStatus::StateType state) { } } +Env::IOActivity ThreadStatusUtil::TEST_GetExpectedIOActivity( + ThreadStatus::OperationType thread_op) { + switch (thread_op) { + case ThreadStatus::OperationType::OP_FLUSH: + return Env::IOActivity::kFlush; + case ThreadStatus::OperationType::OP_COMPACTION: + return Env::IOActivity::kCompaction; + case ThreadStatus::OperationType::OP_DBOPEN: + return Env::IOActivity::kDBOpen; + default: + return Env::IOActivity::kUnknown; + } +} + #endif // !NDEBUG } // namespace ROCKSDB_NAMESPACE diff --git a/options/options.cc b/options/options.cc index 4eeb7138b..4faddf5a2 100644 --- a/options/options.cc +++ b/options/options.cc @@ -682,8 +682,6 @@ DBOptions* DBOptions::IncreaseParallelism(int total_threads) { env->SetBackgroundThreads(1, Env::HIGH); return this; } - - ReadOptions::ReadOptions() : snapshot(nullptr), iterate_lower_bound(nullptr), @@ -708,7 +706,8 @@ ReadOptions::ReadOptions() value_size_soft_limit(std::numeric_limits::max()), 
adaptive_readahead(false), async_io(false), - optimize_multiget_for_io(true) {} + optimize_multiget_for_io(true), + io_activity(Env::IOActivity::kUnknown) {} ReadOptions::ReadOptions(bool cksum, bool cache) : snapshot(nullptr), @@ -734,6 +733,34 @@ ReadOptions::ReadOptions(bool cksum, bool cache) value_size_soft_limit(std::numeric_limits::max()), adaptive_readahead(false), async_io(false), - optimize_multiget_for_io(true) {} + optimize_multiget_for_io(true), + io_activity(Env::IOActivity::kUnknown) {} + +ReadOptions::ReadOptions(Env::IOActivity _io_activity) + : snapshot(nullptr), + iterate_lower_bound(nullptr), + iterate_upper_bound(nullptr), + readahead_size(0), + max_skippable_internal_keys(0), + read_tier(kReadAllTier), + verify_checksums(true), + fill_cache(true), + tailing(false), + managed(false), + total_order_seek(false), + auto_prefix_mode(false), + prefix_same_as_start(false), + pin_data(false), + background_purge_on_iterator_cleanup(false), + ignore_range_deletions(false), + timestamp(nullptr), + iter_start_ts(nullptr), + deadline(std::chrono::microseconds::zero()), + io_timeout(std::chrono::microseconds::zero()), + value_size_soft_limit(std::numeric_limits::max()), + adaptive_readahead(false), + async_io(false), + optimize_multiget_for_io(true), + io_activity(_io_activity) {} } // namespace ROCKSDB_NAMESPACE diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index 76aa9cec6..0ed42348f 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -583,6 +583,7 @@ Status BlockBasedTable::Open( ro.io_timeout = read_options.io_timeout; ro.rate_limiter_priority = read_options.rate_limiter_priority; ro.verify_checksums = read_options.verify_checksums; + ro.io_activity = read_options.io_activity; // prefetch both index and filters, down to all partitions const bool prefetch_all = prefetch_index_and_filter_in_cache || level == 0; @@ -2231,7 +2232,8 @@ 
Status BlockBasedTable::MultiGetFilter(const ReadOptions& read_options, return Status::OK(); } -Status BlockBasedTable::Prefetch(const Slice* const begin, +Status BlockBasedTable::Prefetch(const ReadOptions& read_options, + const Slice* const begin, const Slice* const end) { auto& comparator = rep_->internal_comparator; UserComparatorWrapper user_comparator(comparator.user_comparator()); @@ -2241,7 +2243,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, } BlockCacheLookupContext lookup_context{TableReaderCaller::kPrefetch}; IndexBlockIter iiter_on_stack; - auto iiter = NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + auto iiter = NewIndexIterator(read_options, /*need_upper_bound_check=*/false, &iiter_on_stack, /*get_context=*/nullptr, &lookup_context); std::unique_ptr> iiter_unique_ptr; @@ -2278,7 +2280,7 @@ Status BlockBasedTable::Prefetch(const Slice* const begin, DataBlockIter biter; Status tmp_status; NewDataBlockIterator( - ReadOptions(), block_handle, &biter, /*type=*/BlockType::kData, + read_options, block_handle, &biter, /*type=*/BlockType::kData, /*get_context=*/nullptr, &lookup_context, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, /*async_read=*/false, tmp_status); @@ -2298,11 +2300,10 @@ Status BlockBasedTable::VerifyChecksum(const ReadOptions& read_options, // Check Meta blocks std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - ReadOptions ro; - s = ReadMetaIndexBlock(ro, nullptr /* prefetch buffer */, &metaindex, - &metaindex_iter); + s = ReadMetaIndexBlock(read_options, nullptr /* prefetch buffer */, + &metaindex, &metaindex_iter); if (s.ok()) { - s = VerifyChecksumInMetaBlocks(metaindex_iter.get()); + s = VerifyChecksumInMetaBlocks(read_options, metaindex_iter.get()); if (!s.ok()) { return s; } @@ -2409,7 +2410,7 @@ BlockType BlockBasedTable::GetBlockTypeForMetaBlockByName( } Status BlockBasedTable::VerifyChecksumInMetaBlocks( - InternalIteratorBase* index_iter) { + const ReadOptions& read_options, 
InternalIteratorBase* index_iter) { Status s; for (index_iter->SeekToFirst(); index_iter->Valid(); index_iter->Next()) { s = index_iter->status(); @@ -2425,14 +2426,14 @@ Status BlockBasedTable::VerifyChecksumInMetaBlocks( // Unfortunate special handling for properties block checksum w/ // global seqno std::unique_ptr table_properties; - s = ReadTablePropertiesHelper(ReadOptions(), handle, rep_->file.get(), + s = ReadTablePropertiesHelper(read_options, handle, rep_->file.get(), nullptr /* prefetch_buffer */, rep_->footer, rep_->ioptions, &table_properties, nullptr /* memory_allocator */); } else { s = BlockFetcher( rep_->file.get(), nullptr /* prefetch buffer */, rep_->footer, - ReadOptions(), handle, &contents, rep_->ioptions, + read_options, handle, &contents, rep_->ioptions, false /* decompress */, false /*maybe_compressed*/, GetBlockTypeForMetaBlockByName(meta_block_name), UncompressionDict::GetEmptyDict(), rep_->persistent_cache_options) @@ -2544,7 +2545,8 @@ uint64_t BlockBasedTable::GetApproximateDataSize() { return rep_->footer.metaindex_handle().offset(); } -uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, +uint64_t BlockBasedTable::ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) { uint64_t data_size = GetApproximateDataSize(); if (UNLIKELY(data_size == 0)) { @@ -2558,6 +2560,7 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, IndexBlockIter iiter_on_stack; ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = read_options.io_activity; auto index_iter = NewIndexIterator(ro, /*disable_prefix_seek=*/true, /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, @@ -2586,7 +2589,8 @@ uint64_t BlockBasedTable::ApproximateOffsetOf(const Slice& key, static_cast(rep_->file_size)); } -uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, +uint64_t BlockBasedTable::ApproximateSize(const ReadOptions& read_options, + const Slice& start, const Slice& 
end, TableReaderCaller caller) { assert(rep_->internal_comparator.Compare(start, end) <= 0); @@ -2603,6 +2607,7 @@ uint64_t BlockBasedTable::ApproximateSize(const Slice& start, const Slice& end, IndexBlockIter iiter_on_stack; ReadOptions ro; ro.total_order_seek = true; + ro.io_activity = read_options.io_activity; auto index_iter = NewIndexIterator(ro, /*disable_prefix_seek=*/true, /*input_iter=*/&iiter_on_stack, /*get_context=*/nullptr, @@ -2654,9 +2659,9 @@ bool BlockBasedTable::TEST_IndexBlockInCache() const { } Status BlockBasedTable::GetKVPairsFromDataBlocks( - std::vector* kv_pair_blocks) { + const ReadOptions& read_options, std::vector* kv_pair_blocks) { std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); @@ -2677,7 +2682,7 @@ Status BlockBasedTable::GetKVPairsFromDataBlocks( std::unique_ptr datablock_iter; Status tmp_status; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value().handle, + read_options, blockhandles_iter->value().handle, /*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, /*prefetch_buffer=*/nullptr, /*for_compaction=*/false, @@ -2723,7 +2728,8 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { "--------------------------------------\n"; std::unique_ptr metaindex; std::unique_ptr metaindex_iter; - ReadOptions ro; + // TODO: plumb Env::IOActivity + const ReadOptions ro; Status s = ReadMetaIndexBlock(ro, nullptr /* prefetch_buffer */, &metaindex, &metaindex_iter); if (s.ok()) { @@ -2779,7 +2785,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { if (rep_->uncompression_dict_reader) { CachableEntry uncompression_dict; s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - nullptr /* prefetch_buffer */, false /* no_io */, + 
nullptr /* prefetch_buffer */, ro, false /* no_io */, false, /* verify_checksums */ nullptr /* get_context */, nullptr /* lookup_context */, &uncompression_dict); @@ -2797,7 +2803,7 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { } // Output range deletions block - auto* range_del_iter = NewRangeTombstoneIterator(ReadOptions()); + auto* range_del_iter = NewRangeTombstoneIterator(ro); if (range_del_iter != nullptr) { range_del_iter->SeekToFirst(); if (range_del_iter->Valid()) { @@ -2827,8 +2833,10 @@ Status BlockBasedTable::DumpTable(WritableFile* out_file) { Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { out_stream << "Index Details:\n" "--------------------------------------\n"; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); Status s = blockhandles_iter->status(); @@ -2876,8 +2884,10 @@ Status BlockBasedTable::DumpIndexBlock(std::ostream& out_stream) { } Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; std::unique_ptr> blockhandles_iter( - NewIndexIterator(ReadOptions(), /*need_upper_bound_check=*/false, + NewIndexIterator(read_options, /*need_upper_bound_check=*/false, /*input_iter=*/nullptr, /*get_context=*/nullptr, /*lookup_contex=*/nullptr)); Status s = blockhandles_iter->status(); @@ -2911,7 +2921,7 @@ Status BlockBasedTable::DumpDataBlocks(std::ostream& out_stream) { std::unique_ptr datablock_iter; Status tmp_status; datablock_iter.reset(NewDataBlockIterator( - ReadOptions(), blockhandles_iter->value().handle, + read_options, blockhandles_iter->value().handle, /*input_iter=*/nullptr, /*type=*/BlockType::kData, /*get_context=*/nullptr, /*lookup_context=*/nullptr, 
/*prefetch_buffer=*/nullptr, /*for_compaction=*/false, diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 2108416f1..df296a0d3 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -153,7 +153,8 @@ class BlockBasedTable : public TableReader { // Pre-fetch the disk blocks that correspond to the key range specified by // (kbegin, kend). The call will return error status in the event of // IO or iteration error. - Status Prefetch(const Slice* begin, const Slice* end) override; + Status Prefetch(const ReadOptions& read_options, const Slice* begin, + const Slice* end) override; // Given a key, return an approximate byte offset in the file where // the data for that key begins (or would begin if the key were @@ -161,15 +162,16 @@ class BlockBasedTable : public TableReader { // bytes, and so includes effects like compression of the underlying data. // E.g., the approximate offset of the last key in the table will // be close to the file length. - uint64_t ApproximateOffsetOf(const Slice& key, + uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) override; // Given start and end keys, return the approximate data size in the file // between the keys. The returned value is in terms of file bytes, and so // includes effects like compression of the underlying data. // The start key must not be greater than the end key. - uint64_t ApproximateSize(const Slice& start, const Slice& end, - TableReaderCaller caller) override; + uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start, + const Slice& end, TableReaderCaller caller) override; Status ApproximateKeyAnchors(const ReadOptions& read_options, std::vector& anchors) override; @@ -265,7 +267,8 @@ class BlockBasedTable : public TableReader { // Retrieve all key value pairs from data blocks in the table. 
// The key retrieved are internal keys. - Status GetKVPairsFromDataBlocks(std::vector* kv_pair_blocks); + Status GetKVPairsFromDataBlocks(const ReadOptions& read_options, + std::vector* kv_pair_blocks); struct Rep; @@ -477,7 +480,8 @@ class BlockBasedTable : public TableReader { static BlockType GetBlockTypeForMetaBlockByName(const Slice& meta_block_name); - Status VerifyChecksumInMetaBlocks(InternalIteratorBase* index_iter); + Status VerifyChecksumInMetaBlocks(const ReadOptions& read_options, + InternalIteratorBase* index_iter); Status VerifyChecksumInBlocks(const ReadOptions& read_options, InternalIteratorBase* index_iter); diff --git a/table/block_based/block_based_table_reader_impl.h b/table/block_based/block_based_table_reader_impl.h index 50d147712..801b4614f 100644 --- a/table/block_based/block_based_table_reader_impl.h +++ b/table/block_based/block_based_table_reader_impl.h @@ -68,7 +68,7 @@ TBlockIter* BlockBasedTable::NewDataBlockIterator( // uncompression dict is typically at the end of the file and would // most likely break the sequentiality of the access pattern. s = rep_->uncompression_dict_reader->GetOrReadUncompressionDictionary( - ro.async_io ? nullptr : prefetch_buffer, no_io, ro.verify_checksums, + ro.async_io ? 
nullptr : prefetch_buffer, ro, no_io, ro.verify_checksums, get_context, lookup_context, &uncompression_dict); if (!s.ok()) { iter->Invalidate(s); diff --git a/table/block_based/block_based_table_reader_sync_and_async.h b/table/block_based/block_based_table_reader_sync_and_async.h index 5efb279f4..e033b688b 100644 --- a/table/block_based/block_based_table_reader_sync_and_async.h +++ b/table/block_based/block_based_table_reader_sync_and_async.h @@ -421,7 +421,7 @@ DEFINE_SYNC_AND_ASYNC(void, BlockBasedTable::MultiGet) uncompression_dict_status = rep_->uncompression_dict_reader ->GetOrReadUncompressionDictionary( - nullptr /* prefetch_buffer */, no_io, + nullptr /* prefetch_buffer */, read_options, no_io, read_options.verify_checksums, get_context, &metadata_lookup_context, &uncompression_dict); uncompression_dict_inited = true; diff --git a/table/block_based/hash_index_reader.cc b/table/block_based/hash_index_reader.cc index 83232dc7e..e158d8039 100644 --- a/table/block_based/hash_index_reader.cc +++ b/table/block_based/hash_index_reader.cc @@ -74,17 +74,17 @@ Status HashIndexReader::Create(const BlockBasedTable* table, // Read contents for the blocks BlockContents prefixes_contents; BlockFetcher prefixes_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_handle, - &prefixes_contents, ioptions, true /*decompress*/, - true /*maybe_compressed*/, BlockType::kHashIndexPrefixes, - UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); + file, prefetch_buffer, footer, ro, prefixes_handle, &prefixes_contents, + ioptions, true /*decompress*/, true /*maybe_compressed*/, + BlockType::kHashIndexPrefixes, UncompressionDict::GetEmptyDict(), + cache_options, memory_allocator); s = prefixes_block_fetcher.ReadBlockContents(); if (!s.ok()) { return s; } BlockContents prefixes_meta_contents; BlockFetcher prefixes_meta_block_fetcher( - file, prefetch_buffer, footer, ReadOptions(), prefixes_meta_handle, + file, prefetch_buffer, footer, ro, 
prefixes_meta_handle, &prefixes_meta_contents, ioptions, true /*decompress*/, true /*maybe_compressed*/, BlockType::kHashIndexMetadata, UncompressionDict::GetEmptyDict(), cache_options, memory_allocator); diff --git a/table/block_based/partitioned_index_reader.cc b/table/block_based/partitioned_index_reader.cc index d901f9ca3..d1cc88834 100644 --- a/table/block_based/partitioned_index_reader.cc +++ b/table/block_based/partitioned_index_reader.cc @@ -85,6 +85,7 @@ InternalIteratorBase* PartitionIndexReader::NewIterator( ro.async_io = read_options.async_io; ro.rate_limiter_priority = read_options.rate_limiter_priority; ro.verify_checksums = read_options.verify_checksums; + ro.io_activity = read_options.io_activity; // We don't return pinned data from index blocks, so no need // to set `block_contents_pinned`. diff --git a/table/block_based/uncompression_dict_reader.cc b/table/block_based/uncompression_dict_reader.cc index ba1908720..4ac442b6b 100644 --- a/table/block_based/uncompression_dict_reader.cc +++ b/table/block_based/uncompression_dict_reader.cc @@ -77,8 +77,9 @@ Status UncompressionDictReader::ReadUncompressionDictionary( } Status UncompressionDictReader::GetOrReadUncompressionDictionary( - FilePrefetchBuffer* prefetch_buffer, bool no_io, bool verify_checksums, - GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, bool no_io, + bool verify_checksums, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* uncompression_dict) const { assert(uncompression_dict); @@ -92,6 +93,7 @@ Status UncompressionDictReader::GetOrReadUncompressionDictionary( read_options.read_tier = kBlockCacheTier; } read_options.verify_checksums = verify_checksums; + read_options.io_activity = ro.io_activity; return ReadUncompressionDictionary(table_, prefetch_buffer, read_options, cache_dictionary_blocks(), get_context, diff --git a/table/block_based/uncompression_dict_reader.h 
b/table/block_based/uncompression_dict_reader.h index 416d25e2d..c78800d8a 100644 --- a/table/block_based/uncompression_dict_reader.h +++ b/table/block_based/uncompression_dict_reader.h @@ -32,8 +32,9 @@ class UncompressionDictReader { std::unique_ptr* uncompression_dict_reader); Status GetOrReadUncompressionDictionary( - FilePrefetchBuffer* prefetch_buffer, bool no_io, bool verify_checksums, - GetContext* get_context, BlockCacheLookupContext* lookup_context, + FilePrefetchBuffer* prefetch_buffer, const ReadOptions& ro, bool no_io, + bool verify_checksums, GetContext* get_context, + BlockCacheLookupContext* lookup_context, CachableEntry* uncompression_dict) const; size_t ApproximateMemoryUsage() const; diff --git a/table/cuckoo/cuckoo_table_builder_test.cc b/table/cuckoo/cuckoo_table_builder_test.cc index ec3ec4206..ceddbf37a 100644 --- a/table/cuckoo/cuckoo_table_builder_test.cc +++ b/table/cuckoo/cuckoo_table_builder_test.cc @@ -70,8 +70,10 @@ class CuckooBuilderTest : public testing::Test { // Assert Table Properties. std::unique_ptr props; + const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, - kCuckooTableMagicNumber, ioptions, &props)); + kCuckooTableMagicNumber, ioptions, + read_options, &props)); // Check unused bucket. 
std::string unused_key = props->user_collected_properties[CuckooTablePropertyNames::kEmptyKey]; @@ -627,4 +629,3 @@ int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); } - diff --git a/table/cuckoo/cuckoo_table_reader.cc b/table/cuckoo/cuckoo_table_reader.cc index fa3e77b2e..d64761962 100644 --- a/table/cuckoo/cuckoo_table_reader.cc +++ b/table/cuckoo/cuckoo_table_reader.cc @@ -59,8 +59,11 @@ CuckooTableReader::CuckooTableReader( } { std::unique_ptr props; - status_ = ReadTableProperties(file_.get(), file_size, - kCuckooTableMagicNumber, ioptions, &props); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + status_ = + ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, + ioptions, read_options, &props); if (!status_.ok()) { return; } diff --git a/table/cuckoo/cuckoo_table_reader.h b/table/cuckoo/cuckoo_table_reader.h index 7e154769d..d17011ed8 100644 --- a/table/cuckoo/cuckoo_table_reader.h +++ b/table/cuckoo/cuckoo_table_reader.h @@ -58,12 +58,14 @@ class CuckooTableReader : public TableReader { size_t ApproximateMemoryUsage() const override; // Following methods are not implemented for Cuckoo Table Reader - uint64_t ApproximateOffsetOf(const Slice& /*key*/, + uint64_t ApproximateOffsetOf(const ReadOptions& /*read_options*/, + const Slice& /*key*/, TableReaderCaller /*caller*/) override { return 0; } - uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + uint64_t ApproximateSize(const ReadOptions& /* read_options */, + const Slice& /*start*/, const Slice& /*end*/, TableReaderCaller /*caller*/) override { return 0; } diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 6530f6a80..2c58ff9c7 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -412,20 +412,22 @@ Status ReadTablePropertiesHelper( Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const 
ReadOptions& read_options, std::unique_ptr* properties, MemoryAllocator* memory_allocator, FilePrefetchBuffer* prefetch_buffer) { BlockHandle block_handle; Footer footer; - Status s = FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, - kPropertiesBlockName, &block_handle, - memory_allocator, prefetch_buffer, &footer); + Status s = + FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, + read_options, kPropertiesBlockName, &block_handle, + memory_allocator, prefetch_buffer, &footer); if (!s.ok()) { return s; } if (!block_handle.IsNull()) { - s = ReadTablePropertiesHelper(ReadOptions(), block_handle, file, + s = ReadTablePropertiesHelper(read_options, block_handle, file, prefetch_buffer, footer, ioptions, properties, memory_allocator); } else { @@ -473,14 +475,20 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, BlockContents* metaindex_contents, MemoryAllocator* memory_allocator, FilePrefetchBuffer* prefetch_buffer, Footer* footer_out) { Footer footer; IOOptions opts; - auto s = ReadFooterFromFile(opts, file, *ioptions.fs, prefetch_buffer, - file_size, &footer, table_magic_number); + Status s; + s = file->PrepareIOOptions(read_options, opts); + if (!s.ok()) { + return s; + } + s = ReadFooterFromFile(opts, file, *ioptions.fs, prefetch_buffer, file_size, + &footer, table_magic_number); if (!s.ok()) { return s; } @@ -489,7 +497,7 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, } auto metaindex_handle = footer.metaindex_handle(); - return BlockFetcher(file, prefetch_buffer, footer, ReadOptions(), + return BlockFetcher(file, prefetch_buffer, footer, read_options, metaindex_handle, metaindex_contents, ioptions, false /* do decompression */, false /*maybe_compressed*/, BlockType::kMetaIndex, UncompressionDict::GetEmptyDict(), @@ -497,18 
+505,16 @@ Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, .ReadBlockContents(); } -Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, - uint64_t table_magic_number, - const ImmutableOptions& ioptions, - const std::string& meta_block_name, - BlockHandle* block_handle, - MemoryAllocator* memory_allocator, - FilePrefetchBuffer* prefetch_buffer, - Footer* footer_out) { +Status FindMetaBlockInFile( + RandomAccessFileReader* file, uint64_t file_size, + uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, + BlockHandle* block_handle, MemoryAllocator* memory_allocator, + FilePrefetchBuffer* prefetch_buffer, Footer* footer_out) { BlockContents metaindex_contents; auto s = ReadMetaIndexBlockInFile( - file, file_size, table_magic_number, ioptions, &metaindex_contents, - memory_allocator, prefetch_buffer, footer_out); + file, file_size, table_magic_number, ioptions, read_options, + &metaindex_contents, memory_allocator, prefetch_buffer, footer_out); if (!s.ok()) { return s; } @@ -526,6 +532,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, MemoryAllocator* memory_allocator) { @@ -535,15 +542,16 @@ Status ReadMetaBlock(RandomAccessFileReader* file, BlockHandle block_handle; Footer footer; - Status status = FindMetaBlockInFile( - file, file_size, table_magic_number, ioptions, meta_block_name, - &block_handle, memory_allocator, prefetch_buffer, &footer); + Status status = + FindMetaBlockInFile(file, file_size, table_magic_number, ioptions, + read_options, meta_block_name, &block_handle, + memory_allocator, prefetch_buffer, &footer); if (!status.ok()) { return status; } - return BlockFetcher(file, prefetch_buffer, footer, 
ReadOptions(), - block_handle, contents, ioptions, false /* decompress */, + return BlockFetcher(file, prefetch_buffer, footer, read_options, block_handle, + contents, ioptions, false /* decompress */, false /*maybe_compressed*/, block_type, UncompressionDict::GetEmptyDict(), PersistentCacheOptions::kEmpty, memory_allocator) diff --git a/table/meta_blocks.h b/table/meta_blocks.h index b867dd01d..962a31638 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -119,6 +119,7 @@ Status ReadTablePropertiesHelper( Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, std::unique_ptr* properties, MemoryAllocator* memory_allocator = nullptr, FilePrefetchBuffer* prefetch_buffer = nullptr); @@ -139,6 +140,7 @@ Status FindMetaBlock(InternalIterator* meta_index_iter, Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, BlockHandle* block_handle, MemoryAllocator* memory_allocator = nullptr, @@ -149,6 +151,7 @@ Status FindMetaBlockInFile(RandomAccessFileReader* file, uint64_t file_size, Status ReadMetaIndexBlockInFile(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, BlockContents* block_contents, MemoryAllocator* memory_allocator = nullptr, FilePrefetchBuffer* prefetch_buffer = nullptr, @@ -161,6 +164,7 @@ Status ReadMetaBlock(RandomAccessFileReader* file, FilePrefetchBuffer* prefetch_buffer, uint64_t file_size, uint64_t table_magic_number, const ImmutableOptions& ioptions, + const ReadOptions& read_options, const std::string& meta_block_name, BlockType block_type, BlockContents* contents, MemoryAllocator* memory_allocator = nullptr); diff --git a/table/mock_table.cc b/table/mock_table.cc index 
130889eaa..c251ea108 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -41,12 +41,14 @@ class MockTableReader : public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& /*key*/, + uint64_t ApproximateOffsetOf(const ReadOptions& /*read_options*/, + const Slice& /*key*/, TableReaderCaller /*caller*/) override { return 0; } - uint64_t ApproximateSize(const Slice& /*start*/, const Slice& /*end*/, + uint64_t ApproximateSize(const ReadOptions& /*read_options*/, + const Slice& /*start*/, const Slice& /*end*/, TableReaderCaller /*caller*/) override { return 0; } diff --git a/table/plain/plain_table_reader.cc b/table/plain/plain_table_reader.cc index 3e51c2275..2f0379f72 100644 --- a/table/plain/plain_table_reader.cc +++ b/table/plain/plain_table_reader.cc @@ -126,8 +126,10 @@ Status PlainTableReader::Open( } std::unique_ptr props; + // TODO: plumb Env::IOActivity + const ReadOptions read_options; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, - ioptions, &props); + ioptions, read_options, &props); if (!s.ok()) { return s; } @@ -297,10 +299,14 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, assert(props != nullptr); BlockContents index_block_contents; - Status s = ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, - file_size_, kPlainTableMagicNumber, ioptions_, - PlainTableIndexBuilder::kPlainTableIndexBlock, - BlockType::kIndex, &index_block_contents); + + // TODO: plumb Env::IOActivity + const ReadOptions read_options; + Status s = + ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, + file_size_, kPlainTableMagicNumber, ioptions_, read_options, + PlainTableIndexBuilder::kPlainTableIndexBlock, + BlockType::kIndex, &index_block_contents); bool index_in_file = s.ok(); @@ -310,8 +316,8 @@ Status PlainTableReader::PopulateIndex(TableProperties* props, if (index_in_file) { s = 
ReadMetaBlock(file_info_.file.get(), nullptr /* prefetch_buffer */, file_size_, kPlainTableMagicNumber, ioptions_, - BloomBlockBuilder::kBloomBlock, BlockType::kFilter, - &bloom_block_contents); + read_options, BloomBlockBuilder::kBloomBlock, + BlockType::kFilter, &bloom_block_contents); bloom_in_file = s.ok() && bloom_block_contents.data.size() > 0; } @@ -614,12 +620,14 @@ Status PlainTableReader::Get(const ReadOptions& /*ro*/, const Slice& target, return Status::OK(); } -uint64_t PlainTableReader::ApproximateOffsetOf(const Slice& /*key*/, - TableReaderCaller /*caller*/) { +uint64_t PlainTableReader::ApproximateOffsetOf( + const ReadOptions& /*read_options*/, const Slice& /*key*/, + TableReaderCaller /*caller*/) { return 0; } -uint64_t PlainTableReader::ApproximateSize(const Slice& /*start*/, +uint64_t PlainTableReader::ApproximateSize(const ReadOptions& /* read_options*/, + const Slice& /*start*/, const Slice& /*end*/, TableReaderCaller /*caller*/) { return 0; diff --git a/table/plain/plain_table_reader.h b/table/plain/plain_table_reader.h index 51500c3ee..0f5f7f3ce 100644 --- a/table/plain/plain_table_reader.h +++ b/table/plain/plain_table_reader.h @@ -92,11 +92,12 @@ class PlainTableReader : public TableReader { GetContext* get_context, const SliceTransform* prefix_extractor, bool skip_filters = false) override; - uint64_t ApproximateOffsetOf(const Slice& key, + uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) override; - uint64_t ApproximateSize(const Slice& start, const Slice& end, - TableReaderCaller caller) override; + uint64_t ApproximateSize(const ReadOptions& read_options, const Slice& start, + const Slice& end, TableReaderCaller caller) override; uint32_t GetIndexSize() const { return index_.GetIndexSize(); } void SetupForCompaction() override; diff --git a/table/sst_file_dumper.cc b/table/sst_file_dumper.cc index 59e62486a..e9916eb5b 100644 --- a/table/sst_file_dumper.cc +++ 
b/table/sst_file_dumper.cc @@ -355,8 +355,11 @@ Status SstFileDumper::ReadTableProperties(uint64_t table_magic_number, RandomAccessFileReader* file, uint64_t file_size, FilePrefetchBuffer* prefetch_buffer) { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; Status s = ROCKSDB_NAMESPACE::ReadTableProperties( - file, file_size, table_magic_number, ioptions_, &table_properties_, + file, file_size, table_magic_number, ioptions_, read_options, + &table_properties_, /* memory_allocator= */ nullptr, prefetch_buffer); if (!s.ok()) { if (!silent_) { @@ -514,4 +517,3 @@ Status SstFileDumper::ReadTableProperties( return init_result_; } } // namespace ROCKSDB_NAMESPACE - diff --git a/table/sst_file_reader.cc b/table/sst_file_reader.cc index 5573d941c..c95c91743 100644 --- a/table/sst_file_reader.cc +++ b/table/sst_file_reader.cc @@ -66,6 +66,7 @@ Status SstFileReader::Open(const std::string& file_path) { } Iterator* SstFileReader::NewIterator(const ReadOptions& roptions) { + assert(roptions.io_activity == Env::IOActivity::kUnknown); auto r = rep_.get(); auto sequence = roptions.snapshot != nullptr ? roptions.snapshot->GetSequenceNumber() @@ -91,9 +92,9 @@ std::shared_ptr SstFileReader::GetTableProperties() } Status SstFileReader::VerifyChecksum(const ReadOptions& read_options) { + assert(read_options.io_activity == Env::IOActivity::kUnknown); return rep_->table_reader->VerifyChecksum(read_options, TableReaderCaller::kSSTFileReader); } } // namespace ROCKSDB_NAMESPACE - diff --git a/table/table_reader.h b/table/table_reader.h index 391072eec..53c522052 100644 --- a/table/table_reader.h +++ b/table/table_reader.h @@ -76,7 +76,8 @@ class TableReader { // function and letting ApproximateSize take optional start and end, so // that absolute start and end can be specified and optimized without // key / index work. 
- virtual uint64_t ApproximateOffsetOf(const Slice& key, + virtual uint64_t ApproximateOffsetOf(const ReadOptions& read_options, + const Slice& key, TableReaderCaller caller) = 0; // Given start and end keys, return the approximate data size in the file @@ -84,7 +85,8 @@ class TableReader { // includes effects like compression of the underlying data and applicable // portions of metadata including filters and indexes. Nullptr for start or // end (or both) indicates absolute start or end of the table. - virtual uint64_t ApproximateSize(const Slice& start, const Slice& end, + virtual uint64_t ApproximateSize(const ReadOptions& read_options, + const Slice& start, const Slice& end, TableReaderCaller caller) = 0; struct Anchor { @@ -160,7 +162,8 @@ class TableReader { // Prefetch data corresponding to a give range of keys // Typically this functionality is required for table implementations that // persists the data on a non volatile storage medium like disk/SSD - virtual Status Prefetch(const Slice* begin = nullptr, + virtual Status Prefetch(const ReadOptions& /* read_options */, + const Slice* begin = nullptr, const Slice* end = nullptr) { (void)begin; (void)end; diff --git a/table/table_test.cc b/table/table_test.cc index 5bdac8bc2..a701eda01 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -425,14 +425,15 @@ class TableConstructor : public Constructor { } uint64_t ApproximateOffsetOf(const Slice& key) const { + const ReadOptions read_options; if (convert_to_internal_key_) { InternalKey ikey(key, kMaxSequenceNumber, kTypeValue); const Slice skey = ikey.Encode(); return table_reader_->ApproximateOffsetOf( - skey, TableReaderCaller::kUncategorized); + read_options, skey, TableReaderCaller::kUncategorized); } return table_reader_->ApproximateOffsetOf( - key, TableReaderCaller::kUncategorized); + read_options, key, TableReaderCaller::kUncategorized); } virtual Status Reopen(const ImmutableOptions& ioptions, @@ -1979,7 +1980,8 @@ void 
PrefetchRange(TableConstructor* c, Options* opt, end.reset(new Slice(key_end)); } } - s = table_reader->Prefetch(begin.get(), end.get()); + const ReadOptions read_options; + s = table_reader->Prefetch(read_options, begin.get(), end.get()); ASSERT_TRUE(s.code() == expected_status.code()); @@ -3335,11 +3337,12 @@ TEST_P(BlockBasedTableTest, TracingApproximateOffsetOfTest) { MutableCFOptions moptions(options); c.Finish(options, ioptions, moptions, table_options, GetPlainInternalComparator(options.comparator), &keys, &kvmap); + const ReadOptions read_options; for (uint32_t i = 1; i <= 2; i++) { InternalKey internal_key(auto_add_key1, 0, kTypeValue); std::string encoded_key = internal_key.Encode().ToString(); c.GetTableReader()->ApproximateOffsetOf( - encoded_key, TableReaderCaller::kUserApproximateSize); + read_options, encoded_key, TableReaderCaller::kUserApproximateSize); } // Verify traces. std::vector expected_records; @@ -4079,8 +4082,10 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { new RandomAccessFileReader(std::move(source), "test")); std::unique_ptr props; + const ReadOptions read_options; auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), - kPlainTableMagicNumber, ioptions, &props); + kPlainTableMagicNumber, ioptions, read_options, + &props); ASSERT_OK(s); ASSERT_EQ(0ul, props->index_size); @@ -4756,9 +4761,10 @@ TEST_P(BlockBasedTableTest, DISABLED_TableWithGlobalSeqno) { new RandomAccessFileReader(std::move(source), "")); std::unique_ptr props; + const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), ss_rw.contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props)); + read_options, &props)); UserCollectedProperties user_props = props->user_collected_properties; version = DecodeFixed32( @@ -4933,9 +4939,10 @@ TEST_P(BlockBasedTableTest, BlockAlignTest) { // Helper function to get version, global_seqno, global_seqno_offset std::function VerifyBlockAlignment = [&]() { std::unique_ptr props; + 
const ReadOptions read_options; ASSERT_OK(ReadTableProperties(file_reader.get(), sink->contents().size(), kBlockBasedTableMagicNumber, ioptions, - &props)); + read_options, &props)); uint64_t data_block_size = props->data_size / props->num_data_blocks; ASSERT_EQ(data_block_size, 4096); diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 037f27f4a..ac478e7ad 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -200,6 +200,7 @@ default_params = { ] ), "allow_data_in_errors": True, + "enable_thread_tracking": lambda: random.choice([0, 1]), "readahead_size": lambda: random.choice([0, 16384, 524288]), "initial_auto_readahead_size": lambda: random.choice([0, 16384, 524288]), "max_auto_readahead_size": lambda: random.choice([0, 16384, 524288]), diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index b7b0e9909..f0119b31e 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -4168,6 +4168,8 @@ UnsafeRemoveSstFileCommand::UnsafeRemoveSstFileCommand( } void UnsafeRemoveSstFileCommand::DoCommand() { + // TODO: plumb Env::IOActivity + const ReadOptions read_options; PrepareOptions(); OfflineManifestWriter w(options_, db_path_); @@ -4192,7 +4194,7 @@ void UnsafeRemoveSstFileCommand::DoCommand() { s = options_.env->GetFileSystem()->NewDirectory(db_path_, IOOptions(), &db_dir, nullptr); if (s.ok()) { - s = w.LogAndApply(cfd, &edit, db_dir.get()); + s = w.LogAndApply(read_options, cfd, &edit, db_dir.get()); } } diff --git a/util/file_checksum_helper.cc b/util/file_checksum_helper.cc index 0c072e1e3..b8c4099b8 100644 --- a/util/file_checksum_helper.cc +++ b/util/file_checksum_helper.cc @@ -98,6 +98,8 @@ Status GetFileChecksumsFromManifest(Env* src_env, const std::string& abs_path, return Status::InvalidArgument("checksum_list is nullptr"); } assert(checksum_list); + // TODO: plumb Env::IOActivity + const ReadOptions read_options; checksum_list->reset(); Status s; @@ -125,7 +127,8 @@ Status GetFileChecksumsFromManifest(Env* src_env, const std::string& 
abs_path, reporter.status_ptr = &s; log::Reader reader(nullptr, std::move(file_reader), &reporter, true /* checksum */, 0 /* log_number */); - FileChecksumRetriever retriever(manifest_file_size, *checksum_list); + FileChecksumRetriever retriever(read_options, manifest_file_size, + *checksum_list); retriever.Iterate(reader, &s); assert(!retriever.status().ok() || manifest_file_size == std::numeric_limits::max() || diff --git a/util/stop_watch.h b/util/stop_watch.h index e26380d97..0ecd1bb11 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -9,23 +9,33 @@ namespace ROCKSDB_NAMESPACE { // Auto-scoped. -// Records the measure time into the corresponding histogram if statistics -// is not nullptr. It is also saved into *elapsed if the pointer is not nullptr -// and overwrite is true, it will be added to *elapsed if overwrite is false. +// When statistics is not nullptr, records the measured time into any enabled +// histograms supplied to the constructor. A histogram argument may be omitted +// by setting it to Histograms::HISTOGRAM_ENUM_MAX. It is also saved into +// *elapsed if the pointer is not nullptr and overwrite is true, it will be +// added to *elapsed if overwrite is false. class StopWatch { public: StopWatch(SystemClock* clock, Statistics* statistics, - const uint32_t hist_type, uint64_t* elapsed = nullptr, - bool overwrite = true, bool delay_enabled = false) + const uint32_t hist_type_1, + const uint32_t hist_type_2 = Histograms::HISTOGRAM_ENUM_MAX, + uint64_t* elapsed = nullptr, bool overwrite = true, + bool delay_enabled = false) : clock_(clock), statistics_(statistics), - hist_type_(hist_type), + hist_type_1_(statistics && statistics->HistEnabledForType(hist_type_1) + ? hist_type_1 + : Histograms::HISTOGRAM_ENUM_MAX), + hist_type_2_(statistics && statistics->HistEnabledForType(hist_type_2) + ? 
hist_type_2 + : Histograms::HISTOGRAM_ENUM_MAX), elapsed_(elapsed), overwrite_(overwrite), stats_enabled_(statistics && statistics->get_stats_level() >= StatsLevel::kExceptTimers && - statistics->HistEnabledForType(hist_type)), + (hist_type_1_ != Histograms::HISTOGRAM_ENUM_MAX || + hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX)), delay_enabled_(delay_enabled), total_delay_(0), delay_start_time_(0), @@ -44,10 +54,15 @@ class StopWatch { *elapsed_ -= total_delay_; } if (stats_enabled_) { - statistics_->reportTimeToHistogram( - hist_type_, (elapsed_ != nullptr) - ? *elapsed_ - : (clock_->NowMicros() - start_time_)); + const auto time = (elapsed_ != nullptr) + ? *elapsed_ + : (clock_->NowMicros() - start_time_); + if (hist_type_1_ != Histograms::HISTOGRAM_ENUM_MAX) { + statistics_->reportTimeToHistogram(hist_type_1_, time); + } + if (hist_type_2_ != Histograms::HISTOGRAM_ENUM_MAX) { + statistics_->reportTimeToHistogram(hist_type_2_, time); + } } } @@ -75,7 +90,8 @@ class StopWatch { private: SystemClock* clock_; Statistics* statistics_; - const uint32_t hist_type_; + const uint32_t hist_type_1_; + const uint32_t hist_type_2_; uint64_t* elapsed_; bool overwrite_; bool stats_enabled_; diff --git a/util/thread_list_test.cc b/util/thread_list_test.cc index af4e62355..b5b3378fa 100644 --- a/util/thread_list_test.cc +++ b/util/thread_list_test.cc @@ -42,6 +42,8 @@ class SimulatedBackgroundTask { std::unique_lock l(mutex_); running_count_++; bg_cv_.notify_all(); + assert(cf_key_); + Env::Default()->GetThreadStatusUpdater()->SetEnableTracking(true); Env::Default()->GetThreadStatusUpdater()->SetColumnFamilyInfoKey(cf_key_); Env::Default()->GetThreadStatusUpdater()->SetThreadOperation( operation_type_); diff --git a/util/thread_operation.h b/util/thread_operation.h index c24fccd5c..b6c106279 100644 --- a/util/thread_operation.h +++ b/util/thread_operation.h @@ -38,7 +38,8 @@ struct OperationInfo { static OperationInfo global_operation_table[] = { {ThreadStatus::OP_UNKNOWN, ""}, 
{ThreadStatus::OP_COMPACTION, "Compaction"}, - {ThreadStatus::OP_FLUSH, "Flush"}}; + {ThreadStatus::OP_FLUSH, "Flush"}, + {ThreadStatus::OP_DBOPEN, "DBOpen"}}; struct OperationStageInfo { const ThreadStatus::OperationStage stage; diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc index 3a5e337b6..cfbbd7458 100644 --- a/utilities/blob_db/blob_db_impl.cc +++ b/utilities/blob_db/blob_db_impl.cc @@ -1631,6 +1631,11 @@ Status BlobDBImpl::GetImpl(const ReadOptions& read_options, return Status::NotSupported( "Blob DB doesn't support non-default column family."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } // Get a snapshot to avoid blob file get deleted between we // fetch and index entry and reading from the file. // TODO(yiwu): For Get() retry if file not found would be a simpler strategy. @@ -2036,6 +2041,11 @@ void BlobDBImpl::CopyBlobFiles( } Iterator* BlobDBImpl::NewIterator(const ReadOptions& read_options) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } auto* cfd = static_cast_with_check(DefaultColumnFamily()) ->cfd(); diff --git a/utilities/transactions/pessimistic_transaction.cc b/utilities/transactions/pessimistic_transaction.cc index 63c689efd..1771497a6 100644 --- a/utilities/transactions/pessimistic_transaction.cc +++ b/utilities/transactions/pessimistic_transaction.cc @@ -166,6 +166,11 @@ template inline Status WriteCommittedTxn::GetForUpdateImpl( const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, TValue* value, bool exclusive, const bool do_validate) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate 
with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } column_family = column_family ? column_family : db_impl_->DefaultColumnFamily(); assert(column_family); @@ -1170,4 +1175,3 @@ Status PessimisticTransaction::SetName(const TransactionName& name) { } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/transactions/transaction_base.cc b/utilities/transactions/transaction_base.cc index 10d5d02a1..5963f7429 100644 --- a/utilities/transactions/transaction_base.cc +++ b/utilities/transactions/transaction_base.cc @@ -235,6 +235,11 @@ Status TransactionBaseImpl::PopSavePoint() { Status TransactionBaseImpl::Get(const ReadOptions& read_options, ColumnFamilyHandle* column_family, const Slice& key, std::string* value) { + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } assert(value != nullptr); PinnableSlice pinnable_val(value); assert(!pinnable_val.IsPinned()); @@ -262,6 +267,11 @@ Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, "If do_validate is false then GetForUpdate with snapshot is not " "defined."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } Status s = TryLock(column_family, key, true /* read_only */, exclusive, do_validate); @@ -288,6 +298,11 @@ Status TransactionBaseImpl::GetForUpdate(const ReadOptions& read_options, "If do_validate is false then GetForUpdate with snapshot is not " "defined."); } + if (read_options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call GetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } Status s = TryLock(column_family, key, true /* read_only */, exclusive, do_validate); @@ -302,6 +317,13 @@ std::vector 
TransactionBaseImpl::MultiGet( const std::vector& column_family, const std::vector& keys, std::vector* values) { size_t num_keys = keys.size(); + if (read_options.io_activity != Env::IOActivity::kUnknown) { + Status s = Status::InvalidArgument( + "Cannot call MultiGet with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + return std::vector(num_keys, s); + } + values->resize(num_keys); std::vector stat_list(num_keys); @@ -317,6 +339,7 @@ void TransactionBaseImpl::MultiGet(const ReadOptions& read_options, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + assert(read_options.io_activity == Env::IOActivity::kUnknown); write_batch_.MultiGetFromBatchAndDB(db_, read_options, column_family, num_keys, keys, values, statuses, sorted_input); @@ -328,6 +351,12 @@ std::vector TransactionBaseImpl::MultiGetForUpdate( const std::vector& keys, std::vector* values) { // Regardless of whether the MultiGet succeeded, track these keys. 
size_t num_keys = keys.size(); + if (read_options.io_activity != Env::IOActivity::kUnknown) { + Status s = Status::InvalidArgument( + "Cannot call MultiGetForUpdate with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + return std::vector(num_keys, s); + } values->resize(num_keys); // Lock all keys @@ -726,4 +755,3 @@ WriteBatch* TransactionBaseImpl::GetCommitTimeWriteBatch() { return &commit_time_batch_; } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/transactions/write_prepared_txn.cc b/utilities/transactions/write_prepared_txn.cc index ededb6250..c27a679e4 100644 --- a/utilities/transactions/write_prepared_txn.cc +++ b/utilities/transactions/write_prepared_txn.cc @@ -44,6 +44,7 @@ void WritePreparedTxn::MultiGet(const ReadOptions& options, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + assert(options.io_activity == Env::IOActivity::kUnknown); SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -64,6 +65,11 @@ void WritePreparedTxn::MultiGet(const ReadOptions& options, Status WritePreparedTxn::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* pinnable_val) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = wpt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -507,4 +513,3 @@ Status WritePreparedTxn::RebuildFromWriteBatch(WriteBatch* src_batch) { } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/transactions/write_prepared_txn_db.cc b/utilities/transactions/write_prepared_txn_db.cc index 84c45b7e4..6118c3549 100644 --- 
a/utilities/transactions/write_prepared_txn_db.cc +++ b/utilities/transactions/write_prepared_txn_db.cc @@ -250,6 +250,11 @@ Status WritePreparedTxnDB::WriteInternal(const WriteOptions& write_options_orig, Status WritePreparedTxnDB::Get(const ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -343,6 +348,11 @@ static void CleanupWritePreparedTxnDBIterator(void* arg1, void* /*arg2*/) { Iterator* WritePreparedTxnDB::NewIterator(const ReadOptions& options, ColumnFamilyHandle* column_family) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } constexpr bool expose_blob_index = false; constexpr bool allow_refresh = false; std::shared_ptr own_snapshot = nullptr; diff --git a/utilities/transactions/write_unprepared_txn.cc b/utilities/transactions/write_unprepared_txn.cc index 11e04824f..845b117cf 100644 --- a/utilities/transactions/write_unprepared_txn.cc +++ b/utilities/transactions/write_unprepared_txn.cc @@ -948,6 +948,7 @@ void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, const size_t num_keys, const Slice* keys, PinnableSlice* values, Status* statuses, const bool sorted_input) { + assert(options.io_activity == Env::IOActivity::kUnknown); SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -968,6 +969,11 @@ void WriteUnpreparedTxn::MultiGet(const ReadOptions& options, Status WriteUnpreparedTxn::Get(const 
ReadOptions& options, ColumnFamilyHandle* column_family, const Slice& key, PinnableSlice* value) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return Status::InvalidArgument( + "Cannot call Get with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`"); + } SequenceNumber min_uncommitted, snap_seq; const SnapshotBackup backed_by_snapshot = wupt_db_->AssignMinMaxSeqs(options.snapshot, &min_uncommitted, &snap_seq); @@ -1048,4 +1054,3 @@ WriteUnpreparedTxn::GetUnpreparedSequenceNumbers() { } } // namespace ROCKSDB_NAMESPACE - diff --git a/utilities/transactions/write_unprepared_txn_db.cc b/utilities/transactions/write_unprepared_txn_db.cc index 26a03c77d..fd0ba0aed 100644 --- a/utilities/transactions/write_unprepared_txn_db.cc +++ b/utilities/transactions/write_unprepared_txn_db.cc @@ -388,6 +388,11 @@ static void CleanupWriteUnpreparedTxnDBIterator(void* arg1, void* /*arg2*/) { Iterator* WriteUnpreparedTxnDB::NewIterator(const ReadOptions& options, ColumnFamilyHandle* column_family, WriteUnpreparedTxn* txn) { + if (options.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } // TODO(lth): Refactor so that this logic is shared with WritePrepared. 
constexpr bool expose_blob_index = false; constexpr bool allow_refresh = false; diff --git a/utilities/ttl/db_ttl_impl.cc b/utilities/ttl/db_ttl_impl.cc index 5b0486fc1..2b261ec6f 100644 --- a/utilities/ttl/db_ttl_impl.cc +++ b/utilities/ttl/db_ttl_impl.cc @@ -596,6 +596,11 @@ Status DBWithTTLImpl::Write(const WriteOptions& opts, WriteBatch* updates) { Iterator* DBWithTTLImpl::NewIterator(const ReadOptions& opts, ColumnFamilyHandle* column_family) { + if (opts.io_activity != Env::IOActivity::kUnknown) { + return NewErrorIterator(Status::InvalidArgument( + "Cannot call NewIterator with `ReadOptions::io_activity` != " + "`Env::IOActivity::kUnknown`")); + } return new TtlIterator(db_->NewIterator(opts, column_family)); }