From a931bacf5d1ac118235e818f217c7cc936e84660 Mon Sep 17 00:00:00 2001 From: sdong Date: Wed, 29 Dec 2021 11:13:49 -0800 Subject: [PATCH] Improve SimulatedHybridFileSystem (#9301) Summary: Several improvements to SimulatedHybridFileSystem: (1) Allow a mode where all I/Os to all files simulate HDD. This can be enabled in db_bench using -simulate_hdd (2) Latency calculation is slightly more accurate (3) Allow simulating more than one HDD spindle. Pull Request resolved: https://github.com/facebook/rocksdb/pull/9301 Test Plan: Run db_bench and observe the results are reasonable. Reviewed By: jay-zhuang Differential Revision: D33141662 fbshipit-source-id: b736e58c4ba910d06899cc9ccec79b628275f4fa --- HISTORY.md | 1 + tools/db_bench_tool.cc | 11 ++++- tools/simulated_hybrid_file_system.cc | 61 +++++++++++++++++---------- tools/simulated_hybrid_file_system.h | 14 ++++-- 4 files changed, 58 insertions(+), 29 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 6e0c54e53..b672f1160 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -7,6 +7,7 @@ ## 6.28.0 (2021-12-17) ### New Features * Introduced 'CommitWithTimestamp' as a new tag. Currently, there is no API for user to trigger a write with this tag to the WAL. This is part of the efforts to support write-commited transactions with user-defined timestamps. +* Introduce SimulatedHybridFileSystem which can help simulate HDD latency in db_bench. Tiered Storage latency simulation can be enabled using -simulate_hybrid_fs_file (note that it doesn't work if db_bench is interrupted in the middle). -simulate_hdd can also be used to simulate all files on HDD. ### Bug Fixes * Fixed a bug in rocksdb automatic implicit prefetching which got broken because of new feature adaptive_readahead and internal prefetching got disabled when iterator moves from one file to next. 
diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 93b780541..fcd3c157a 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -1154,6 +1154,10 @@ DEFINE_string(simulate_hybrid_fs_file, "", "File for Store Metadata for Simulate hybrid FS. Empty means " "disable the feature. Now, if it is set, " "bottommost_temperature is set to kWarm."); +DEFINE_int32(simulate_hybrid_hdd_multipliers, 1, + "In simulate_hybrid_fs_file or simulate_hdd mode, how many HDDs " + "are simulated."); +DEFINE_bool(simulate_hdd, false, "Simulate read/write latency on HDD."); static std::shared_ptr env_guard; @@ -8135,12 +8139,15 @@ int db_bench_tool(int argc, char** argv) { fprintf(stderr, "Failed creating env: %s\n", s.ToString().c_str()); exit(1); } - } else if (FLAGS_simulate_hybrid_fs_file != "") { + } else if (FLAGS_simulate_hdd || FLAGS_simulate_hybrid_fs_file != "") { //**TODO: Make the simulate fs something that can be loaded // from the ObjectRegistry... static std::shared_ptr composite_env = NewCompositeEnv(std::make_shared( - FileSystem::Default(), FLAGS_simulate_hybrid_fs_file)); + FileSystem::Default(), FLAGS_simulate_hybrid_fs_file, + /*throughput_multiplier=*/ + int{FLAGS_simulate_hybrid_hdd_multipliers}, + /*is_full_fs_warm=*/FLAGS_simulate_hdd)); FLAGS_env = composite_env.get(); } #endif // ROCKSDB_LITE diff --git a/tools/simulated_hybrid_file_system.cc b/tools/simulated_hybrid_file_system.cc index 776dc6623..675d2593f 100644 --- a/tools/simulated_hybrid_file_system.cc +++ b/tools/simulated_hybrid_file_system.cc @@ -3,6 +3,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
+#include "util/stop_watch.h" #ifndef ROCKSDB_LITE #include "tools/simulated_hybrid_file_system.h" @@ -15,7 +16,6 @@ namespace ROCKSDB_NAMESPACE { -const int kLatencyAddedPerRequestUs = 15000; const int64_t kUsPerSec = 1000000; const int64_t kDummyBytesPerUs = 1024; @@ -43,14 +43,17 @@ void RateLimiterRequest(RateLimiter* rater_limiter, int64_t amount) { // warm SimulatedHybridFileSystem::SimulatedHybridFileSystem( const std::shared_ptr& base, - const std::string& metadata_file_name) + const std::string& metadata_file_name, int throughput_multiplier, + bool is_full_fs_warm) : FileSystemWrapper(base), // Limit to 100 requests per second. rate_limiter_(NewGenericRateLimiter( - kDummyBytesPerUs * kUsPerSec /* rate_bytes_per_sec */, + int64_t{throughput_multiplier} * kDummyBytesPerUs * + kUsPerSec /* rate_bytes_per_sec */, 1000 /* refill_period_us */)), metadata_file_name_(metadata_file_name), - name_("SimulatedHybridFileSystem: " + std::string(target()->Name())) { + name_("SimulatedHybridFileSystem: " + std::string(target()->Name())), + is_full_fs_warm_(is_full_fs_warm) { IOStatus s = base->FileExists(metadata_file_name, IOOptions(), nullptr); if (s.IsNotFound()) { return; @@ -77,6 +80,9 @@ SimulatedHybridFileSystem::SimulatedHybridFileSystem( // SimulatedHybridFileSystem::SimulatedHybridFileSystem() for format of the // file. 
SimulatedHybridFileSystem::~SimulatedHybridFileSystem() { + if (metadata_file_name_.empty()) { + return; + } std::string metadata; for (const auto& f : warm_file_set_) { metadata += f; @@ -93,13 +99,15 @@ IOStatus SimulatedHybridFileSystem::NewRandomAccessFile( const std::string& fname, const FileOptions& file_opts, std::unique_ptr* result, IODebugContext* dbg) { Temperature temperature = Temperature::kUnknown; - { + if (is_full_fs_warm_) { + temperature = Temperature::kWarm; + } else { const std::lock_guard lock(mutex_); if (warm_file_set_.find(fname) != warm_file_set_.end()) { temperature = Temperature::kWarm; } + assert(temperature == file_opts.temperature); } - assert(temperature == file_opts.temperature); IOStatus s = target()->NewRandomAccessFile(fname, file_opts, result, dbg); result->reset( new SimulatedHybridRaf(std::move(*result), rate_limiter_, temperature)); @@ -115,7 +123,7 @@ IOStatus SimulatedHybridFileSystem::NewWritableFile( } IOStatus s = target()->NewWritableFile(fname, file_opts, result, dbg); - if (file_opts.temperature == Temperature::kWarm) { + if (file_opts.temperature == Temperature::kWarm || is_full_fs_warm_) { result->reset(new SimulatedWritableFile(std::move(*result), rate_limiter_)); } return s; @@ -135,8 +143,7 @@ IOStatus SimulatedHybridRaf::Read(uint64_t offset, size_t n, const IOOptions& options, Slice* result, char* scratch, IODebugContext* dbg) const { if (temperature_ == Temperature::kWarm) { - Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs); - RequestRateLimit(n); + SimulateIOWait(n); } return target()->Read(offset, n, options, result, scratch, dbg); } @@ -146,10 +153,8 @@ IOStatus SimulatedHybridRaf::MultiRead(FSReadRequest* reqs, size_t num_reqs, IODebugContext* dbg) { if (temperature_ == Temperature::kWarm) { for (size_t i = 0; i < num_reqs; i++) { - RequestRateLimit(reqs[i].len); + SimulateIOWait(reqs[i].len); } - Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs * - static_cast(num_reqs)); } 
return target()->MultiRead(reqs, num_reqs, options, dbg); } @@ -158,24 +163,34 @@ IOStatus SimulatedHybridRaf::Prefetch(uint64_t offset, size_t n, const IOOptions& options, IODebugContext* dbg) { if (temperature_ == Temperature::kWarm) { - RequestRateLimit(n); - Env::Default()->SleepForMicroseconds(kLatencyAddedPerRequestUs); + SimulateIOWait(n); } return target()->Prefetch(offset, n, options, dbg); } -void SimulatedHybridRaf::RequestRateLimit(int64_t bytes) const { - RateLimiterRequest(rate_limiter_.get(), CalculateServeTimeUs(bytes)); +void SimulatedHybridRaf::SimulateIOWait(int64_t bytes) const { + int serve_time = CalculateServeTimeUs(bytes); + { + StopWatchNano stop_watch(Env::Default()->GetSystemClock().get(), + /*auto_start=*/true); + RateLimiterRequest(rate_limiter_.get(), serve_time); + int time_passed_us = static_cast(stop_watch.ElapsedNanos() / 1000); + if (time_passed_us < serve_time) { + Env::Default()->SleepForMicroseconds(serve_time - time_passed_us); + } + } } -void SimulatedWritableFile::RequestRateLimit(int64_t bytes) const { - RateLimiterRequest(rate_limiter_.get(), CalculateServeTimeUs(bytes)); +void SimulatedWritableFile::SimulateIOWait(int64_t bytes) const { + int serve_time = CalculateServeTimeUs(bytes); + Env::Default()->SleepForMicroseconds(serve_time); + RateLimiterRequest(rate_limiter_.get(), serve_time); } IOStatus SimulatedWritableFile::Append(const Slice& data, const IOOptions& ioo, IODebugContext* idc) { if (use_direct_io()) { - RequestRateLimit(data.size()); + SimulateIOWait(data.size()); } else { unsynced_bytes += data.size(); } @@ -186,7 +201,7 @@ IOStatus SimulatedWritableFile::Append( const Slice& data, const IOOptions& options, const DataVerificationInfo& verification_info, IODebugContext* dbg) { if (use_direct_io()) { - RequestRateLimit(data.size()); + SimulateIOWait(data.size()); } else { unsynced_bytes += data.size(); } @@ -198,7 +213,7 @@ IOStatus SimulatedWritableFile::PositionedAppend(const Slice& data, const IOOptions& 
options, IODebugContext* dbg) { if (use_direct_io()) { - RequestRateLimit(data.size()); + SimulateIOWait(data.size()); } else { // This might be overcalculated, but it's probably OK. unsynced_bytes += data.size(); @@ -209,7 +224,7 @@ IOStatus SimulatedWritableFile::PositionedAppend( const Slice& data, uint64_t offset, const IOOptions& options, const DataVerificationInfo& verification_info, IODebugContext* dbg) { if (use_direct_io()) { - RequestRateLimit(data.size()); + SimulateIOWait(data.size()); } else { // This might be overcalculated, but it's probably OK. unsynced_bytes += data.size(); @@ -221,7 +236,7 @@ IOStatus SimulatedWritableFile::PositionedAppend( IOStatus SimulatedWritableFile::Sync(const IOOptions& options, IODebugContext* dbg) { if (unsynced_bytes > 0) { - RequestRateLimit(unsynced_bytes); + SimulateIOWait(unsynced_bytes); unsynced_bytes = 0; } return target()->Sync(options, dbg); diff --git a/tools/simulated_hybrid_file_system.h b/tools/simulated_hybrid_file_system.h index e1b3393b4..251d89df7 100644 --- a/tools/simulated_hybrid_file_system.h +++ b/tools/simulated_hybrid_file_system.h @@ -28,8 +28,13 @@ class SimulatedHybridFileSystem : public FileSystemWrapper { // metadata_file_name stores metadata of the files, so that it can be // loaded after process restarts. If the file doesn't exist, create // one. The file is written when the class is destroyed. - explicit SimulatedHybridFileSystem(const std::shared_ptr& base, - const std::string& metadata_file_name); + // throughput_multiplier: multiplier of throughput. For example, 1 is to + // simulate single disk spindle. 4 is to simulate 4 disk spindles. + // is_full_fs_warm: if true, all files are included in slow I/O + // simulation. 
+ SimulatedHybridFileSystem(const std::shared_ptr& base, + const std::string& metadata_file_name, + int throughput_multiplier, bool is_full_fs_warm); ~SimulatedHybridFileSystem() override; @@ -55,6 +60,7 @@ class SimulatedHybridFileSystem : public FileSystemWrapper { std::unordered_set warm_file_set_; std::string metadata_file_name_; std::string name_; + bool is_full_fs_warm_; }; // Simulated random access file that can control IOPs and latency to simulate @@ -84,7 +90,7 @@ class SimulatedHybridRaf : public FSRandomAccessFileOwnerWrapper { std::shared_ptr rate_limiter_; Temperature temperature_; - void RequestRateLimit(int64_t num_requests) const; + void SimulateIOWait(int64_t num_requests) const; }; class SimulatedWritableFile : public FSWritableFileWrapper { @@ -113,7 +119,7 @@ class SimulatedWritableFile : public FSWritableFileWrapper { std::shared_ptr rate_limiter_; size_t unsynced_bytes = 0; - void RequestRateLimit(int64_t num_requests) const; + void SimulateIOWait(int64_t num_requests) const; }; } // namespace ROCKSDB_NAMESPACE