From 0ed8cb666de61d2e18bbb1fc90e05b734cee02de Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 17 May 2021 15:14:34 -0700 Subject: [PATCH] Write file temperature information to manifest (#8284) Summary: As a part of tiered storage, writing tempeature information to manifest is needed so that after DB recovery, RocksDB still has the tiering information, to implement some further necessary functionalities. Also fix some issues in simulated hybrid FS. Pull Request resolved: https://github.com/facebook/rocksdb/pull/8284 Test Plan: Add a new unit test to validate that the information is indeed written and read back. Reviewed By: zhichao-cao Differential Revision: D28335801 fbshipit-source-id: 56aeb2e6ea090be0200181dd968c8a7278037def --- db/compaction/compaction_job.cc | 4 +++- db/db_test2.cc | 23 ++++++++++++++++++++++- db/version_edit.cc | 26 ++++++++++++++++++++++++++ db/version_edit.h | 2 ++ db/version_set.cc | 9 +++++---- include/rocksdb/advanced_options.h | 11 ++++++----- include/rocksdb/metadata.h | 9 ++++++++- tools/simulated_hybrid_file_system.cc | 21 ++++++++++++++------- tools/simulated_hybrid_file_system.h | 2 ++ 9 files changed, 88 insertions(+), 19 deletions(-) diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index a55484bf0..4bbc36247 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -1689,8 +1689,9 @@ Status CompactionJob::OpenCompactionOutputFile( // Pass temperature of botommost files to FileSystem. FileOptions fo_copy = file_options_; + Temperature temperature = Temperature::kUnknown; if (bottommost_level_) { - fo_copy.temperature = + fo_copy.temperature = temperature = sub_compact->compaction->mutable_cf_options()->bottommost_temperature; } @@ -1742,6 +1743,7 @@ Status CompactionJob::OpenCompactionOutputFile( sub_compact->compaction->output_path_id(), 0); meta.oldest_ancester_time = oldest_ancester_time; meta.file_creation_time = current_time; + meta.temperature = temperature; sub_compact->outputs.emplace_back( std::move(meta), cfd->internal_comparator(), /*enable_order_check=*/ diff --git a/db/db_test2.cc b/db/db_test2.cc index 42ec2d103..8a3724307 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -5531,6 +5531,28 @@ TEST_P(RenameCurrentTest, Compaction) { ASSERT_EQ("NOT_FOUND", Get("foo")); ASSERT_EQ("d_value", Get("d")); } + +TEST_F(DBTest2, BottommostTemperature) { + Options options = CurrentOptions(); + options.bottommost_temperature = Temperature::kWarm; + options.level0_file_num_compaction_trigger = 2; + Reopen(options); + + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(Put("foo", "bar")); + ASSERT_OK(Put("bar", "bar")); + ASSERT_OK(Flush()); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + Reopen(options); + + ColumnFamilyMetaData metadata; + db_->GetColumnFamilyMetaData(&metadata); + ASSERT_EQ(1, metadata.file_count); + ASSERT_EQ(Temperature::kWarm, metadata.levels[1].files[0].temperature); +} #endif // ROCKSDB_LITE // WAL recovery mode is WALRecoveryMode::kPointInTimeRecovery. @@ -5586,7 +5608,6 @@ TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) { options.wal_recovery_mode = WALRecoveryMode::kPointInTimeRecovery; ReopenWithColumnFamilies({"default", "test1", "test2"}, options); } - } // namespace ROCKSDB_NAMESPACE #ifdef ROCKSDB_UNITTESTS_WITH_CUSTOM_OBJECTS_FROM_STATIC_LIBS diff --git a/db/version_edit.cc b/db/version_edit.cc index d9246367c..8cb173a2d 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -191,6 +191,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const { char p = static_cast(f.fd.GetPathId()); PutLengthPrefixedSlice(dst, Slice(&p, 1)); } + if (f.temperature != Temperature::kUnknown) { + PutVarint32(dst, NewFileCustomTag::kTemperature); + char p = static_cast(f.temperature); + PutLengthPrefixedSlice(dst, Slice(&p, 1)); + } if (f.marked_for_compaction) { PutVarint32(dst, NewFileCustomTag::kNeedCompaction); char p = static_cast(1); @@ -360,6 +365,16 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "invalid oldest blob file number"; } break; + case kTemperature: + if (field.size() != 1) { + return "temperature field wrong size"; + } else { + Temperature casted_field = static_cast(field[0]); + if (casted_field <= Temperature::kCold) { + f.temperature = casted_field; + } + } + break; default: if ((custom_tag & kCustomTagNonSafeIgnoreMask) != 0) { // Should not proceed if cannot understand it @@ -774,6 +789,12 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append(f.file_checksum); r.append(" file_checksum_func_name: "); r.append(f.file_checksum_func_name); + if (f.temperature != Temperature::kUnknown) { + r.append(" temperature: "); + // Maybe change to human readable format whenthe feature becomes + // permanent + r.append(ToString(static_cast(f.temperature))); + } } for (const auto& blob_file_addition : blob_file_additions_) { @@ -876,6 +897,11 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { if (f.oldest_blob_file_number != kInvalidBlobFileNumber) { jw << "OldestBlobFile" << f.oldest_blob_file_number; } + if (f.temperature != Temperature::kUnknown) { + // Maybe change to human readable format whenthe feature becomes + // permanent + jw << "Temperature" << static_cast(f.temperature); + } jw.EndArrayedObject(); } diff --git a/db/version_edit.h b/db/version_edit.h index 7b6884793..600634927 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -81,6 +81,7 @@ enum NewFileCustomTag : uint32_t { kFileCreationTime = 6, kFileChecksum = 7, kFileChecksumFuncName = 8, + kTemperature = 9, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field. @@ -188,6 +189,7 @@ struct FileMetaData { bool marked_for_compaction = false; // True if client asked us nicely to // compact this file. + Temperature temperature = Temperature::kUnknown; // Used only in BlobDB. The file number of the oldest blob file this SST file // refers to. 0 is an invalid value; BlobDB numbers the files starting from 1. diff --git a/db/version_set.cc b/db/version_set.cc index c9042d727..97eb3d11f 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1485,15 +1485,16 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { file_path = ioptions->cf_paths.back().path; } const uint64_t file_number = file->fd.GetNumber(); - files.emplace_back(SstFileMetaData{ + files.emplace_back( MakeTableFileName("", file_number), file_number, file_path, static_cast(file->fd.GetFileSize()), file->fd.smallest_seqno, file->fd.largest_seqno, file->smallest.user_key().ToString(), file->largest.user_key().ToString(), file->stats.num_reads_sampled.load(std::memory_order_relaxed), - file->being_compacted, file->oldest_blob_file_number, - file->TryGetOldestAncesterTime(), file->TryGetFileCreationTime(), - file->file_checksum, file->file_checksum_func_name}); + file->being_compacted, file->temperature, + file->oldest_blob_file_number, file->TryGetOldestAncesterTime(), + file->TryGetFileCreationTime(), file->file_checksum, + file->file_checksum_func_name); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; level_size += file->fd.GetFileSize(); diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index c682003a4..7804ec46b 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -188,12 +188,13 @@ struct CompressionOptions { // Temperature of a file. Used to pass to FileSystem for a different // placement and/or coding. +// Reserve some numbers in the middle, in case we need to insert new tier +// there. enum class Temperature : uint8_t { - kHot, - kWarm, - kCold, - kTotal, - kUnknown = kTotal, + kUnknown = 0, + kHot = 0x04, + kWarm = 0x08, + kCold = 0x0C, }; enum UpdateStatus { // Return status For inplace update callback diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index a789c75de..b515c51a1 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -11,6 +11,7 @@ #include #include +#include "rocksdb/options.h" #include "rocksdb/types.h" namespace ROCKSDB_NAMESPACE { @@ -62,6 +63,7 @@ struct SstFileMetaData { being_compacted(false), num_entries(0), num_deletions(0), + temperature(Temperature::kUnknown), oldest_blob_file_number(0), oldest_ancester_time(0), file_creation_time(0) {} @@ -71,7 +73,8 @@ struct SstFileMetaData { SequenceNumber _smallest_seqno, SequenceNumber _largest_seqno, const std::string& _smallestkey, const std::string& _largestkey, uint64_t _num_reads_sampled, - bool _being_compacted, uint64_t _oldest_blob_file_number, + bool _being_compacted, Temperature _temperature, + uint64_t _oldest_blob_file_number, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, std::string& _file_checksum, std::string& _file_checksum_func_name) @@ -87,6 +90,7 @@ struct SstFileMetaData { being_compacted(_being_compacted), num_entries(0), num_deletions(0), + temperature(_temperature), oldest_blob_file_number(_oldest_blob_file_number), oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), @@ -112,6 +116,9 @@ struct SstFileMetaData { uint64_t num_entries; uint64_t num_deletions; + // This feature is experimental and subject to change. + Temperature temperature; + uint64_t oldest_blob_file_number; // The id of the oldest blob file // referenced by the file. // An SST file may be generated by compactions whose input files may diff --git a/tools/simulated_hybrid_file_system.cc b/tools/simulated_hybrid_file_system.cc index 771b2e0db..59b4654be 100644 --- a/tools/simulated_hybrid_file_system.cc +++ b/tools/simulated_hybrid_file_system.cc @@ -7,6 +7,7 @@ #include "tools/simulated_hybrid_file_system.h" +#include #include #include @@ -110,8 +111,7 @@ IOStatus SimulatedHybridRaf::Read(uint64_t offset, size_t n, char* scratch, IODebugContext* dbg) const { if (temperature_ == Temperature::kWarm) { Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs); - rate_limiter_->Request(kDummyBytesPerRequest, Env::IOPriority::IO_LOW, - nullptr); + RequestRateLimit(1); } return target()->Read(offset, n, options, result, scratch, dbg); } @@ -120,11 +120,9 @@ IOStatus SimulatedHybridRaf::MultiRead(FSReadRequest* reqs, size_t num_reqs, const IOOptions& options, IODebugContext* dbg) { if (temperature_ == Temperature::kWarm) { + RequestRateLimit(static_cast(num_reqs)); Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs * static_cast(num_reqs)); - rate_limiter_->Request( - static_cast(num_reqs) * kDummyBytesPerRequest, - Env::IOPriority::IO_LOW, nullptr); } return target()->MultiRead(reqs, num_reqs, options, dbg); } @@ -133,13 +131,22 @@ IOStatus SimulatedHybridRaf::Prefetch(uint64_t offset, size_t n, const IOOptions& options, IODebugContext* dbg) { if (temperature_ == Temperature::kWarm) { - rate_limiter_->Request(kDummyBytesPerRequest, Env::IOPriority::IO_LOW, - nullptr); + RequestRateLimit(1); Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs); } return target()->Prefetch(offset, n, options, dbg); } +void SimulatedHybridRaf::RequestRateLimit(int64_t num_requests) const { + int64_t left = num_requests * kDummyBytesPerRequest; + const int64_t kMaxToRequest = kDummyBytesPerRequest / 100; + while (left > 0) { + int64_t to_request = std::min(kMaxToRequest, left); + rate_limiter_->Request(to_request, Env::IOPriority::IO_LOW, nullptr); + left -= to_request; + } +} + } // namespace ROCKSDB_NAMESPACE #endif // ROCKSDB_LITE diff --git a/tools/simulated_hybrid_file_system.h b/tools/simulated_hybrid_file_system.h index 40e68575a..a3d16c4e7 100644 --- a/tools/simulated_hybrid_file_system.h +++ b/tools/simulated_hybrid_file_system.h @@ -83,6 +83,8 @@ class SimulatedHybridRaf : public FSRandomAccessFileWrapper { private: std::shared_ptr rate_limiter_; Temperature temperature_; + + void RequestRateLimit(int64_t num_requests) const; }; } // namespace ROCKSDB_NAMESPACE