Pass a timeout to FileSystem for random reads (#6751)

Summary:
Calculate ```IOOptions::timeout``` using ```ReadOptions::deadline``` and pass it to ```FileSystem::Read/FileSystem::MultiRead```. This allows us to impose a tighter bound on the time taken by Get/MultiGet on FileSystem/Envs that support IO timeouts. Even on those that don't support, check in ```RandomAccessFileReader::Read``` and ```MultiRead``` and return ```Status::TimedOut()``` if the deadline is exceeded.

For now, TableReader creation, which might do file opens and reads, are not covered. It will be implemented in another PR.

Tests:
Update existing unit tests to verify the correct timeout value is being passed
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6751

Reviewed By: riversand963

Differential Revision: D21285631

Pulled By: anand1976

fbshipit-source-id: d89af843e5a91ece866e87aa29438b52a65a8567
main
anand76 5 years ago committed by Facebook GitHub Bot
parent eecd8fba46
commit ab13d43e1d
  1. 71
      db/db_basic_test.cc
  2. 5
      db/db_impl/db_impl.cc
  3. 3
      file/file_prefetch_buffer.cc
  4. 16
      file/file_util.h
  5. 30
      file/random_access_file_reader.cc
  6. 9
      file/random_access_file_reader.h
  7. 15
      file/random_access_file_reader_test.cc
  8. 6
      include/rocksdb/file_system.h
  9. 15
      include/rocksdb/options.h
  10. 13
      table/block_based/block_based_table_reader.cc
  11. 47
      table/block_fetcher.cc
  12. 4
      table/cuckoo/cuckoo_table_builder_test.cc
  13. 4
      table/cuckoo/cuckoo_table_reader.cc
  14. 8
      table/format.cc
  15. 2
      table/mock_table.cc
  16. 5
      table/plain/plain_table_key_coding.cc
  17. 3
      table/plain/plain_table_reader.cc
  18. 8
      table/table_test.cc
  19. 10
      utilities/blob_db/blob_db_impl.cc
  20. 3
      utilities/blob_db/blob_dump_tool.cc
  21. 18
      utilities/blob_db/blob_file.cc
  22. 4
      utilities/blob_db/blob_log_reader.cc
  23. 3
      utilities/persistent_cache/block_cache_tier_file.cc
  24. 7
      utilities/trace/file_trace_reader_writer.cc

@ -2387,17 +2387,22 @@ class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet {
class DeadlineRandomAccessFile : public FSRandomAccessFileWrapper { class DeadlineRandomAccessFile : public FSRandomAccessFileWrapper {
public: public:
DeadlineRandomAccessFile(DeadlineFS& fs, DeadlineRandomAccessFile(DeadlineFS& fs, SpecialEnv* env,
std::unique_ptr<FSRandomAccessFile>& file) std::unique_ptr<FSRandomAccessFile>& file)
: FSRandomAccessFileWrapper(file.get()), : FSRandomAccessFileWrapper(file.get()),
fs_(fs), fs_(fs),
file_(std::move(file)) {} file_(std::move(file)),
env_(env) {}
IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts, IOStatus Read(uint64_t offset, size_t len, const IOOptions& opts,
Slice* result, char* scratch, IODebugContext* dbg) const override { Slice* result, char* scratch, IODebugContext* dbg) const override {
int delay; int delay;
const std::chrono::microseconds deadline = fs_.GetDeadline();
if (deadline.count()) {
AssertDeadline(deadline, opts);
}
if (fs_.ShouldDelay(&delay)) { if (fs_.ShouldDelay(&delay)) {
Env::Default()->SleepForMicroseconds(delay); env_->SleepForMicroseconds(delay);
} }
return FSRandomAccessFileWrapper::Read(offset, len, opts, result, scratch, return FSRandomAccessFileWrapper::Read(offset, len, opts, result, scratch,
dbg); dbg);
@ -2406,22 +2411,37 @@ class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet {
IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs, IOStatus MultiRead(FSReadRequest* reqs, size_t num_reqs,
const IOOptions& options, IODebugContext* dbg) override { const IOOptions& options, IODebugContext* dbg) override {
int delay; int delay;
const std::chrono::microseconds deadline = fs_.GetDeadline();
if (deadline.count()) {
AssertDeadline(deadline, options);
}
if (fs_.ShouldDelay(&delay)) { if (fs_.ShouldDelay(&delay)) {
Env::Default()->SleepForMicroseconds(delay); env_->SleepForMicroseconds(delay);
} }
return FSRandomAccessFileWrapper::MultiRead(reqs, num_reqs, options, dbg); return FSRandomAccessFileWrapper::MultiRead(reqs, num_reqs, options, dbg);
} }
private: private:
void AssertDeadline(const std::chrono::microseconds deadline,
const IOOptions& opts) const {
// Give a leeway of +- 10us as it can take some time for the Get/
// MultiGet call to reach here, in order to avoid false alarms
std::chrono::microseconds now =
std::chrono::microseconds(env_->NowMicros());
ASSERT_EQ(deadline - now, opts.timeout);
}
DeadlineFS& fs_; DeadlineFS& fs_;
std::unique_ptr<FSRandomAccessFile> file_; std::unique_ptr<FSRandomAccessFile> file_;
SpecialEnv* env_;
}; };
class DeadlineFS : public FileSystemWrapper { class DeadlineFS : public FileSystemWrapper {
public: public:
DeadlineFS() DeadlineFS(SpecialEnv* env)
: FileSystemWrapper(FileSystem::Default()), : FileSystemWrapper(FileSystem::Default()),
delay_idx_(0) {} delay_idx_(0),
deadline_(std::chrono::microseconds::zero()),
env_(env) {}
~DeadlineFS() = default; ~DeadlineFS() = default;
IOStatus NewRandomAccessFile(const std::string& fname, IOStatus NewRandomAccessFile(const std::string& fname,
@ -2432,13 +2452,14 @@ class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet {
IOStatus s; IOStatus s;
s = target()->NewRandomAccessFile(fname, opts, &file, dbg); s = target()->NewRandomAccessFile(fname, opts, &file, dbg);
result->reset(new DeadlineRandomAccessFile(*this, file)); result->reset(new DeadlineRandomAccessFile(*this, env_, file));
return s; return s;
} }
// Set a vector of {IO counter, delay in microseconds} pairs that control // Set a vector of {IO counter, delay in microseconds} pairs that control
// when to inject a delay and duration of the delay // when to inject a delay and duration of the delay
void SetDelaySequence(const std::vector<std::pair<int, int>>&& seq) { void SetDelaySequence(const std::chrono::microseconds deadline,
const std::vector<std::pair<int, int>>&& seq) {
int total_delay = 0; int total_delay = 0;
for (auto& seq_iter : seq) { for (auto& seq_iter : seq) {
// Ensure no individual delay is > 500ms // Ensure no individual delay is > 500ms
@ -2451,6 +2472,7 @@ class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet {
delay_seq_ = seq; delay_seq_ = seq;
delay_idx_ = 0; delay_idx_ = 0;
io_count_ = 0; io_count_ = 0;
deadline_ = deadline;
} }
// Increment the IO counter and return a delay in microseconds // Increment the IO counter and return a delay in microseconds
@ -2464,10 +2486,14 @@ class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet {
return false; return false;
} }
const std::chrono::microseconds GetDeadline() { return deadline_; }
private: private:
std::vector<std::pair<int, int>> delay_seq_; std::vector<std::pair<int, int>> delay_seq_;
size_t delay_idx_; size_t delay_idx_;
int io_count_; int io_count_;
std::chrono::microseconds deadline_;
SpecialEnv* env_;
}; };
inline void CheckStatus(std::vector<Status>& statuses, size_t num_ok) { inline void CheckStatus(std::vector<Status>& statuses, size_t num_ok) {
@ -2483,8 +2509,10 @@ class DBBasicTestMultiGetDeadline : public DBBasicTestMultiGet {
TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) { TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
std::shared_ptr<DBBasicTestMultiGetDeadline::DeadlineFS> fs( std::shared_ptr<DBBasicTestMultiGetDeadline::DeadlineFS> fs(
new DBBasicTestMultiGetDeadline::DeadlineFS()); new DBBasicTestMultiGetDeadline::DeadlineFS(env_));
std::unique_ptr<Env> env = NewCompositeEnv(fs); std::unique_ptr<Env> env(new CompositeEnvWrapper(env_, fs));
env_->no_slowdown_ = true;
env_->time_elapse_only_sleep_.store(true);
Options options = CurrentOptions(); Options options = CurrentOptions();
std::shared_ptr<Cache> cache = NewLRUCache(1048576); std::shared_ptr<Cache> cache = NewLRUCache(1048576);
@ -2509,13 +2537,13 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
cfs[i] = handles_[i]; cfs[i] = handles_[i];
keys[i] = Slice(key_str[i].data(), key_str[i].size()); keys[i] = Slice(key_str[i].data(), key_str[i].size());
} }
// Delay the first IO by 200ms
fs->SetDelaySequence({{0, 200000}});
ReadOptions ro; ReadOptions ro;
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
// Delay the first IO by 200ms
fs->SetDelaySequence(ro.deadline, {{0, 20000}});
std::vector<Status> statuses = dbfull()->MultiGet(ro, cfs, keys, &values); std::vector<Status> statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
std::cout << "Non-batched MultiGet";
// The first key is successful because we check after the lookup, but // The first key is successful because we check after the lookup, but
// subsequent keys fail due to deadline exceeded // subsequent keys fail due to deadline exceeded
CheckStatus(statuses, 1); CheckStatus(statuses, 1);
@ -2537,10 +2565,9 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
cfs[i] = handles_[i / 2]; cfs[i] = handles_[i / 2];
keys[i] = Slice(key_str[i].data(), key_str[i].size()); keys[i] = Slice(key_str[i].data(), key_str[i].size());
} }
fs->SetDelaySequence({{1, 200000}});
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
fs->SetDelaySequence(ro.deadline, {{1, 20000}});
statuses = dbfull()->MultiGet(ro, cfs, keys, &values); statuses = dbfull()->MultiGet(ro, cfs, keys, &values);
std::cout << "Non-batched 2";
CheckStatus(statuses, 3); CheckStatus(statuses, 3);
// Test batched MultiGet with an IO delay in the first data block read. // Test batched MultiGet with an IO delay in the first data block read.
@ -2552,11 +2579,10 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
cache->SetCapacity(1048576); cache->SetCapacity(1048576);
statuses.clear(); statuses.clear();
statuses.resize(keys.size()); statuses.resize(keys.size());
fs->SetDelaySequence({{0, 200000}});
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
fs->SetDelaySequence(ro.deadline, {{0, 20000}});
dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
pin_values.data(), statuses.data()); pin_values.data(), statuses.data());
std::cout << "Batched 1";
CheckStatus(statuses, 2); CheckStatus(statuses, 2);
// Similar to the previous one, but an IO delay in the third CF data block // Similar to the previous one, but an IO delay in the third CF data block
@ -2568,11 +2594,10 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
cache->SetCapacity(1048576); cache->SetCapacity(1048576);
statuses.clear(); statuses.clear();
statuses.resize(keys.size()); statuses.resize(keys.size());
fs->SetDelaySequence({{2, 200000}});
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
fs->SetDelaySequence(ro.deadline, {{2, 20000}});
dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
pin_values.data(), statuses.data()); pin_values.data(), statuses.data());
std::cout << "Batched 2";
CheckStatus(statuses, 6); CheckStatus(statuses, 6);
// Similar to the previous one, but an IO delay in the last but one CF // Similar to the previous one, but an IO delay in the last but one CF
@ -2583,11 +2608,10 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
cache->SetCapacity(1048576); cache->SetCapacity(1048576);
statuses.clear(); statuses.clear();
statuses.resize(keys.size()); statuses.resize(keys.size());
fs->SetDelaySequence({{3, 200000}});
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
fs->SetDelaySequence(ro.deadline, {{3, 20000}});
dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(), dbfull()->MultiGet(ro, keys.size(), cfs.data(), keys.data(),
pin_values.data(), statuses.data()); pin_values.data(), statuses.data());
std::cout << "Batched 3";
CheckStatus(statuses, 8); CheckStatus(statuses, 8);
// Test batched MultiGet with single CF and lots of keys. Inject delay // Test batched MultiGet with single CF and lots of keys. Inject delay
@ -2610,11 +2634,10 @@ TEST_F(DBBasicTestMultiGetDeadline, MultiGetDeadlineExceeded) {
} }
statuses.clear(); statuses.clear();
statuses.resize(keys.size()); statuses.resize(keys.size());
fs->SetDelaySequence({{1, 200000}});
ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000}; ro.deadline = std::chrono::microseconds{env->NowMicros() + 10000};
fs->SetDelaySequence(ro.deadline, {{1, 20000}});
dbfull()->MultiGet(ro, handles_[0], keys.size(), keys.data(), dbfull()->MultiGet(ro, handles_[0], keys.size(), keys.data(),
pin_values.data(), statuses.data()); pin_values.data(), statuses.data());
std::cout << "Batched single CF";
CheckStatus(statuses, 64); CheckStatus(statuses, 64);
Close(); Close();
} }

@ -1527,11 +1527,6 @@ Status DBImpl::GetImpl(const ReadOptions& read_options, const Slice& key,
GetImplOptions& get_impl_options) { GetImplOptions& get_impl_options) {
assert(get_impl_options.value != nullptr || assert(get_impl_options.value != nullptr ||
get_impl_options.merge_operands != nullptr); get_impl_options.merge_operands != nullptr);
// We will eventually support deadline for Get requests too, but safeguard
// for now
if (read_options.deadline != std::chrono::microseconds::zero()) {
return Status::NotSupported("ReadOptions deadline is not supported");
}
#ifndef NDEBUG #ifndef NDEBUG
assert(get_impl_options.column_family); assert(get_impl_options.column_family);

@ -88,8 +88,7 @@ Status FilePrefetchBuffer::Prefetch(RandomAccessFileReader* reader,
Slice result; Slice result;
size_t read_len = static_cast<size_t>(roundup_len - chunk_len); size_t read_len = static_cast<size_t>(roundup_len - chunk_len);
s = reader->Read(rounddown_offset + chunk_len, s = reader->Read(IOOptions(), rounddown_offset + chunk_len, read_len, &result,
read_len, &result,
buffer_.BufferStart() + chunk_len, nullptr, for_compaction); buffer_.BufferStart() + chunk_len, nullptr, for_compaction);
#ifndef NDEBUG #ifndef NDEBUG
if (!s.ok() || result.size() < read_len) { if (!s.ok() || result.size() < read_len) {

@ -30,4 +30,20 @@ extern Status DeleteDBFile(const ImmutableDBOptions* db_options,
extern bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options); extern bool IsWalDirSameAsDBPath(const ImmutableDBOptions* db_options);
inline IOStatus PrepareIOFromReadOptions(const ReadOptions& ro, Env* env,
IOOptions& opts) {
if (!env) {
env = Env::Default();
}
if (ro.deadline.count()) {
std::chrono::microseconds now = std::chrono::microseconds(env->NowMicros());
if (now > ro.deadline) {
return IOStatus::TimedOut("Deadline exceeded");
}
opts.timeout = ro.deadline - now;
}
return IOStatus::OK();
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

@ -15,14 +15,16 @@
#include "monitoring/histogram.h" #include "monitoring/histogram.h"
#include "monitoring/iostats_context_imp.h" #include "monitoring/iostats_context_imp.h"
#include "port/port.h" #include "port/port.h"
#include "table/format.h"
#include "test_util/sync_point.h" #include "test_util/sync_point.h"
#include "util/random.h" #include "util/random.h"
#include "util/rate_limiter.h" #include "util/rate_limiter.h"
namespace ROCKSDB_NAMESPACE { namespace ROCKSDB_NAMESPACE {
Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, Status RandomAccessFileReader::Read(const IOOptions& opts, uint64_t offset,
char* scratch, AlignedBuf* aligned_buf, size_t n, Slice* result, char* scratch,
AlignedBuf* aligned_buf,
bool for_compaction) const { bool for_compaction) const {
(void)aligned_buf; (void)aligned_buf;
Status s; Status s;
@ -62,10 +64,16 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
start_ts = std::chrono::system_clock::now(); start_ts = std::chrono::system_clock::now();
orig_offset = aligned_offset + buf.CurrentSize(); orig_offset = aligned_offset + buf.CurrentSize();
} }
{ {
IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, // Only user reads are expected to specify a timeout. And user reads
IOOptions(), &tmp, buf.Destination(), nullptr); // are not subjected to rate_limiter and should go through only
// one iteration of this loop, so we don't need to check and adjust
// the opts.timeout before calling file_->Read
assert(!opts.timeout.count() || allowed == read_size);
s = file_->Read(aligned_offset + buf.CurrentSize(), allowed, opts,
&tmp, buf.Destination(), nullptr);
} }
if (ShouldNotifyListeners()) { if (ShouldNotifyListeners()) {
auto finish_ts = std::chrono::system_clock::now(); auto finish_ts = std::chrono::system_clock::now();
@ -116,9 +124,15 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
start_ts = std::chrono::system_clock::now(); start_ts = std::chrono::system_clock::now();
} }
#endif #endif
{ {
IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
s = file_->Read(offset + pos, allowed, IOOptions(), &tmp_result, // Only user reads are expected to specify a timeout. And user reads
// are not subjected to rate_limiter and should go through only
// one iteration of this loop, so we don't need to check and adjust
// the opts.timeout before calling file_->Read
assert(!opts.timeout.count() || allowed == n);
s = file_->Read(offset + pos, allowed, opts, &tmp_result,
scratch + pos, nullptr); scratch + pos, nullptr);
} }
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
@ -186,7 +200,8 @@ bool TryMerge(FSReadRequest* dest, const FSReadRequest& src) {
return true; return true;
} }
Status RandomAccessFileReader::MultiRead(FSReadRequest* read_reqs, Status RandomAccessFileReader::MultiRead(const IOOptions& opts,
FSReadRequest* read_reqs,
size_t num_reqs, size_t num_reqs,
AlignedBuf* aligned_buf) const { AlignedBuf* aligned_buf) const {
(void)aligned_buf; // suppress warning of unused variable in LITE mode (void)aligned_buf; // suppress warning of unused variable in LITE mode
@ -244,9 +259,10 @@ Status RandomAccessFileReader::MultiRead(FSReadRequest* read_reqs,
start_ts = std::chrono::system_clock::now(); start_ts = std::chrono::system_clock::now();
} }
#endif // ROCKSDB_LITE #endif // ROCKSDB_LITE
{ {
IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_); IOSTATS_CPU_TIMER_GUARD(cpu_read_nanos, env_);
s = file_->MultiRead(fs_reqs, num_fs_reqs, IOOptions(), nullptr); s = file_->MultiRead(fs_reqs, num_fs_reqs, opts, nullptr);
} }
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE

@ -114,15 +114,16 @@ class RandomAccessFileReader {
// 2. Otherwise, scratch is not used and can be null, the aligned_buf owns // 2. Otherwise, scratch is not used and can be null, the aligned_buf owns
// the internally allocated buffer on return, and the result refers to a // the internally allocated buffer on return, and the result refers to a
// region in aligned_buf. // region in aligned_buf.
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch, Status Read(const IOOptions& opts, uint64_t offset, size_t n, Slice* result,
AlignedBuf* aligned_buf, bool for_compaction = false) const; char* scratch, AlignedBuf* aligned_buf,
bool for_compaction = false) const;
// REQUIRES: // REQUIRES:
// num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing. // num_reqs > 0, reqs do not overlap, and offsets in reqs are increasing.
// In non-direct IO mode, aligned_buf should be null; // In non-direct IO mode, aligned_buf should be null;
// In direct IO mode, aligned_buf stores the aligned buffer allocated inside // In direct IO mode, aligned_buf stores the aligned buffer allocated inside
// MultiRead, the result Slices in reqs refer to aligned_buf. // MultiRead, the result Slices in reqs refer to aligned_buf.
Status MultiRead(FSReadRequest* reqs, size_t num_reqs, Status MultiRead(const IOOptions& opts, FSReadRequest* reqs, size_t num_reqs,
AlignedBuf* aligned_buf) const; AlignedBuf* aligned_buf) const;
Status Prefetch(uint64_t offset, size_t n) const { Status Prefetch(uint64_t offset, size_t n) const {
@ -134,5 +135,7 @@ class RandomAccessFileReader {
std::string file_name() const { return file_name_; } std::string file_name() const { return file_name_; }
bool use_direct_io() const { return file_->use_direct_io(); } bool use_direct_io() const { return file_->use_direct_io(); }
Env* env() const { return env_; }
}; };
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE

@ -106,7 +106,8 @@ TEST_F(RandomAccessFileReaderTest, ReadDirectIO) {
Slice result; Slice result;
AlignedBuf buf; AlignedBuf buf;
for (bool for_compaction : {true, false}) { for (bool for_compaction : {true, false}) {
ASSERT_OK(r->Read(offset, len, &result, nullptr, &buf, for_compaction)); ASSERT_OK(r->Read(IOOptions(), offset, len, &result, nullptr, &buf,
for_compaction));
ASSERT_EQ(result.ToString(), content.substr(offset, len)); ASSERT_EQ(result.ToString(), content.substr(offset, len));
} }
} }
@ -152,7 +153,8 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
reqs.push_back(std::move(r0)); reqs.push_back(std::move(r0));
reqs.push_back(std::move(r1)); reqs.push_back(std::move(r1));
AlignedBuf aligned_buf; AlignedBuf aligned_buf;
ASSERT_OK(r->MultiRead(reqs.data(), reqs.size(), &aligned_buf)); ASSERT_OK(
r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
AssertResult(content, reqs); AssertResult(content, reqs);
} }
@ -189,7 +191,8 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
reqs.push_back(std::move(r1)); reqs.push_back(std::move(r1));
reqs.push_back(std::move(r2)); reqs.push_back(std::move(r2));
AlignedBuf aligned_buf; AlignedBuf aligned_buf;
ASSERT_OK(r->MultiRead(reqs.data(), reqs.size(), &aligned_buf)); ASSERT_OK(
r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
AssertResult(content, reqs); AssertResult(content, reqs);
} }
@ -226,7 +229,8 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
reqs.push_back(std::move(r1)); reqs.push_back(std::move(r1));
reqs.push_back(std::move(r2)); reqs.push_back(std::move(r2));
AlignedBuf aligned_buf; AlignedBuf aligned_buf;
ASSERT_OK(r->MultiRead(reqs.data(), reqs.size(), &aligned_buf)); ASSERT_OK(
r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
AssertResult(content, reqs); AssertResult(content, reqs);
} }
@ -255,7 +259,8 @@ TEST_F(RandomAccessFileReaderTest, MultiReadDirectIO) {
reqs.push_back(std::move(r0)); reqs.push_back(std::move(r0));
reqs.push_back(std::move(r1)); reqs.push_back(std::move(r1));
AlignedBuf aligned_buf; AlignedBuf aligned_buf;
ASSERT_OK(r->MultiRead(reqs.data(), reqs.size(), &aligned_buf)); ASSERT_OK(
r->MultiRead(IOOptions(), reqs.data(), reqs.size(), &aligned_buf));
AssertResult(content, reqs); AssertResult(content, reqs);
} }

@ -77,14 +77,16 @@ enum class IOType : uint8_t {
// honored. More hints can be added here in the future to indicate things like // honored. More hints can be added here in the future to indicate things like
// storage media (HDD/SSD) to be used, replication level etc. // storage media (HDD/SSD) to be used, replication level etc.
struct IOOptions { struct IOOptions {
// Timeout for the operation in milliseconds // Timeout for the operation in microseconds
std::chrono::milliseconds timeout; std::chrono::microseconds timeout;
// Priority - high or low // Priority - high or low
IOPriority prio; IOPriority prio;
// Type of data being read/written // Type of data being read/written
IOType type; IOType type;
IOOptions() : timeout(0), prio(IOPriority::kIOLow), type(IOType::kUnknown) {}
}; };
// File scope options that control how a file is opened/created and accessed // File scope options that control how a file is opened/created and accessed

@ -1346,13 +1346,14 @@ struct ReadOptions {
const Slice* timestamp; const Slice* timestamp;
const Slice* iter_start_ts; const Slice* iter_start_ts;
// Deadline for completing the read request (only MultiGet for now) in us. // Deadline for completing the read request (only Get/MultiGet for now) in us.
// It should be set to some number of microseconds since a fixed point in // It should be set to microseconds since epoch, i.e, gettimeofday or
// time, identical to that used by system time. The best way is to use // equivalent plus allowed duration in microseconds. The best way is to use
// env->NowMicros() + some timeout. This is best efforts. The call may // env->NowMicros() + some timeout.
// exceed the deadline if there is IO involved and the file system doesn't // This is best efforts. The call may exceed the deadline if there is IO
// support deadlines, or due to checking for deadline periodically rather // involved and the file system doesn't support deadlines, or due to
// than for every key if processing a batch // checking for deadline periodically rather than for every key if
// processing a batch
std::chrono::microseconds deadline; std::chrono::microseconds deadline;
ReadOptions(); ReadOptions();

@ -20,6 +20,7 @@
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/pinned_iterators_manager.h" #include "db/pinned_iterators_manager.h"
#include "file/file_prefetch_buffer.h" #include "file/file_prefetch_buffer.h"
#include "file/file_util.h"
#include "file/random_access_file_reader.h" #include "file/random_access_file_reader.h"
#include "monitoring/perf_context_imp.h" #include "monitoring/perf_context_imp.h"
#include "options/options_helper.h" #include "options/options_helper.h"
@ -1671,7 +1672,17 @@ void BlockBasedTable::RetrieveMultipleBlocks(
read_reqs.emplace_back(req); read_reqs.emplace_back(req);
} }
file->MultiRead(&read_reqs[0], read_reqs.size(), nullptr); {
IOOptions opts;
IOStatus s = PrepareIOFromReadOptions(options, file->env(), opts);
if (s.IsTimedOut()) {
for (FSReadRequest& req : read_reqs) {
req.status = s;
}
} else {
file->MultiRead(opts, &read_reqs[0], read_reqs.size(), nullptr);
}
}
idx_in_batch = 0; idx_in_batch = 0;
size_t valid_batch_idx = 0; size_t valid_batch_idx = 0;

@ -12,6 +12,7 @@
#include <cinttypes> #include <cinttypes>
#include <string> #include <string>
#include "file/file_util.h"
#include "logging/logging.h" #include "logging/logging.h"
#include "memory/memory_allocator.h" #include "memory/memory_allocator.h"
#include "monitoring/perf_context_imp.h" #include "monitoring/perf_context_imp.h"
@ -235,29 +236,33 @@ Status BlockFetcher::ReadBlockContents() {
return status_; return status_;
} }
} else if (!TryGetCompressedBlockFromPersistentCache()) { } else if (!TryGetCompressedBlockFromPersistentCache()) {
// Actual file read IOOptions opts;
if (file_->use_direct_io()) { status_ = PrepareIOFromReadOptions(read_options_, file_->env(), opts);
PERF_TIMER_GUARD(block_read_time); // Actual file read
status_ = if (status_.ok()) {
file_->Read(handle_.offset(), block_size_with_trailer_, if (file_->use_direct_io()) {
&slice_, nullptr, &direct_io_buf_, for_compaction_); PERF_TIMER_GUARD(block_read_time);
PERF_COUNTER_ADD(block_read_count, 1); status_ =
used_buf_ = const_cast<char*>(slice_.data()); file_->Read(opts, handle_.offset(), block_size_with_trailer_,
} else { &slice_, nullptr, &direct_io_buf_, for_compaction_);
PrepareBufferForBlockFromFile(); PERF_COUNTER_ADD(block_read_count, 1);
PERF_TIMER_GUARD(block_read_time); used_buf_ = const_cast<char*>(slice_.data());
status_ = file_->Read(handle_.offset(), block_size_with_trailer_, } else {
&slice_, used_buf_, nullptr, for_compaction_); PrepareBufferForBlockFromFile();
PERF_COUNTER_ADD(block_read_count, 1); PERF_TIMER_GUARD(block_read_time);
status_ = file_->Read(opts, handle_.offset(), block_size_with_trailer_,
&slice_, used_buf_, nullptr, for_compaction_);
PERF_COUNTER_ADD(block_read_count, 1);
#ifndef NDEBUG #ifndef NDEBUG
if (used_buf_ == &stack_buf_[0]) { if (used_buf_ == &stack_buf_[0]) {
num_stack_buf_memcpy_++; num_stack_buf_memcpy_++;
} else if (used_buf_ == heap_buf_.get()) { } else if (used_buf_ == heap_buf_.get()) {
num_heap_buf_memcpy_++; num_heap_buf_memcpy_++;
} else if (used_buf_ == compressed_buf_.get()) { } else if (used_buf_ == compressed_buf_.get()) {
num_compressed_buf_memcpy_++; num_compressed_buf_memcpy_++;
} }
#endif #endif
}
} }
// TODO: introduce dedicated perf counter for range tombstones // TODO: introduce dedicated perf counter for range tombstones

@ -112,8 +112,8 @@ class CuckooBuilderTest : public testing::Test {
size_t bucket_size = expected_unused_bucket.size(); size_t bucket_size = expected_unused_bucket.size();
for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) { for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) {
Slice read_slice; Slice read_slice;
ASSERT_OK(file_reader->Read(i * bucket_size, bucket_size, &read_slice, ASSERT_OK(file_reader->Read(IOOptions(), i * bucket_size, bucket_size,
nullptr, nullptr)); &read_slice, nullptr, nullptr));
size_t key_idx = size_t key_idx =
std::find(expected_locations.begin(), expected_locations.end(), i) - std::find(expected_locations.begin(), expected_locations.end(), i) -
expected_locations.begin(); expected_locations.begin();

@ -137,8 +137,8 @@ CuckooTableReader::CuckooTableReader(
cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>( cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>(
cuckoo_block_size->second.data()); cuckoo_block_size->second.data());
cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
status_ = file_->Read(0, static_cast<size_t>(file_size), &file_data_, nullptr, status_ = file_->Read(IOOptions(), 0, static_cast<size_t>(file_size),
nullptr); &file_data_, nullptr, nullptr);
} }
Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/, Status CuckooTableReader::Get(const ReadOptions& /*readOptions*/,

@ -304,12 +304,12 @@ Status ReadFooterFromFile(RandomAccessFileReader* file,
!prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength, !prefetch_buffer->TryReadFromCache(read_offset, Footer::kMaxEncodedLength,
&footer_input)) { &footer_input)) {
if (file->use_direct_io()) { if (file->use_direct_io()) {
s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, s = file->Read(IOOptions(), read_offset, Footer::kMaxEncodedLength,
nullptr, &internal_buf); &footer_input, nullptr, &internal_buf);
} else { } else {
footer_buf.reserve(Footer::kMaxEncodedLength); footer_buf.reserve(Footer::kMaxEncodedLength);
s = file->Read(read_offset, Footer::kMaxEncodedLength, &footer_input, s = file->Read(IOOptions(), read_offset, Footer::kMaxEncodedLength,
&footer_buf[0], nullptr); &footer_input, &footer_buf[0], nullptr);
} }
if (!s.ok()) return s; if (!s.ok()) return s;
} }

@ -114,7 +114,7 @@ uint32_t MockTableFactory::GetAndWriteNextID(WritableFileWriter* file) const {
uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const { uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const {
char buf[4]; char buf[4];
Slice result; Slice result;
file->Read(0, 4, &result, buf, nullptr); file->Read(IOOptions(), 0, 4, &result, buf, nullptr);
assert(result.size() == 4); assert(result.size() == 4);
return DecodeFixed32(buf); return DecodeFixed32(buf);
} }

@ -210,8 +210,9 @@ bool PlainTableFileReader::ReadNonMmap(uint32_t file_offset, uint32_t len,
new_buffer->buf_len = 0; new_buffer->buf_len = 0;
} }
Slice read_result; Slice read_result;
Status s = file_info_->file->Read(file_offset, size_to_read, &read_result, Status s =
new_buffer->buf.get(), nullptr); file_info_->file->Read(IOOptions(), file_offset, size_to_read,
&read_result, new_buffer->buf.get(), nullptr);
if (!s.ok()) { if (!s.ok()) {
status_ = s; status_ = s;
return false; return false;

@ -289,7 +289,8 @@ void PlainTableReader::FillBloom(const std::vector<uint32_t>& prefix_hashes) {
Status PlainTableReader::MmapDataIfNeeded() { Status PlainTableReader::MmapDataIfNeeded() {
if (file_info_.is_mmap_mode) { if (file_info_.is_mmap_mode) {
// Get mmapped memory. // Get mmapped memory.
return file_info_.file->Read(0, static_cast<size_t>(file_size_), return file_info_.file->Read(IOOptions(), 0,
static_cast<size_t>(file_size_),
&file_info_.file_data, nullptr, nullptr); &file_info_.file_data, nullptr, nullptr);
} }
return Status::OK(); return Status::OK();

@ -1264,16 +1264,16 @@ class FileChecksumTestHelper {
Slice result; Slice result;
uint64_t offset = 0; uint64_t offset = 0;
Status s; Status s;
s = file_reader_->Read(offset, 2048, &result, scratch.get(), nullptr, s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(),
false); nullptr, false);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
while (result.size() != 0) { while (result.size() != 0) {
file_checksum_generator->Update(scratch.get(), result.size()); file_checksum_generator->Update(scratch.get(), result.size());
offset += static_cast<uint64_t>(result.size()); offset += static_cast<uint64_t>(result.size());
s = file_reader_->Read(offset, 2048, &result, scratch.get(), nullptr, s = file_reader_->Read(IOOptions(), offset, 2048, &result, scratch.get(),
false); nullptr, false);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }

@ -1491,12 +1491,14 @@ Status BlobDBImpl::GetRawBlobFromFile(const Slice& key, uint64_t file_number,
{ {
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
if (reader->use_direct_io()) { if (reader->use_direct_io()) {
s = reader->Read(record_offset, static_cast<size_t>(record_size), s = reader->Read(IOOptions(), record_offset,
&blob_record, nullptr, &aligned_buf); static_cast<size_t>(record_size), &blob_record, nullptr,
&aligned_buf);
} else { } else {
buf.reserve(static_cast<size_t>(record_size)); buf.reserve(static_cast<size_t>(record_size));
s = reader->Read(record_offset, static_cast<size_t>(record_size), s = reader->Read(IOOptions(), record_offset,
&blob_record, &buf[0], nullptr); static_cast<size_t>(record_size), &blob_record, &buf[0],
nullptr);
} }
RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size()); RecordTick(statistics_, BLOB_DB_BLOB_FILE_BYTES_READ, blob_record.size());
} }

@ -101,7 +101,8 @@ Status BlobDumpTool::Read(uint64_t offset, size_t size, Slice* result) {
} }
buffer_.reset(new char[buffer_size_]); buffer_.reset(new char[buffer_size_]);
} }
Status s = reader_->Read(offset, size, result, buffer_.get(), nullptr); Status s =
reader_->Read(IOOptions(), offset, size, result, buffer_.get(), nullptr);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }

@ -142,12 +142,12 @@ Status BlobFile::ReadFooter(BlobLogFooter* bf) {
AlignedBuf aligned_buf; AlignedBuf aligned_buf;
Status s; Status s;
if (ra_file_reader_->use_direct_io()) { if (ra_file_reader_->use_direct_io()) {
s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kSize, &result, s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize,
nullptr, &aligned_buf); &result, nullptr, &aligned_buf);
} else { } else {
buf.reserve(BlobLogFooter::kSize + 10); buf.reserve(BlobLogFooter::kSize + 10);
s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kSize, &result, s = ra_file_reader_->Read(IOOptions(), footer_offset, BlobLogFooter::kSize,
&buf[0], nullptr); &result, &buf[0], nullptr);
} }
if (!s.ok()) return s; if (!s.ok()) return s;
if (result.size() != BlobLogFooter::kSize) { if (result.size() != BlobLogFooter::kSize) {
@ -266,11 +266,11 @@ Status BlobFile::ReadMetadata(Env* env, const EnvOptions& env_options) {
AlignedBuf aligned_buf; AlignedBuf aligned_buf;
Slice header_slice; Slice header_slice;
if (file_reader->use_direct_io()) { if (file_reader->use_direct_io()) {
s = file_reader->Read(0, BlobLogHeader::kSize, &header_slice, nullptr, s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice,
&aligned_buf); nullptr, &aligned_buf);
} else { } else {
header_buf.reserve(BlobLogHeader::kSize); header_buf.reserve(BlobLogHeader::kSize);
s = file_reader->Read(0, BlobLogHeader::kSize, &header_slice, s = file_reader->Read(IOOptions(), 0, BlobLogHeader::kSize, &header_slice,
&header_buf[0], nullptr); &header_buf[0], nullptr);
} }
if (!s.ok()) { if (!s.ok()) {
@ -306,12 +306,12 @@ Status BlobFile::ReadMetadata(Env* env, const EnvOptions& env_options) {
std::string footer_buf; std::string footer_buf;
Slice footer_slice; Slice footer_slice;
if (file_reader->use_direct_io()) { if (file_reader->use_direct_io()) {
s = file_reader->Read(file_size - BlobLogFooter::kSize, s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize,
BlobLogFooter::kSize, &footer_slice, nullptr, BlobLogFooter::kSize, &footer_slice, nullptr,
&aligned_buf); &aligned_buf);
} else { } else {
footer_buf.reserve(BlobLogFooter::kSize); footer_buf.reserve(BlobLogFooter::kSize);
s = file_reader->Read(file_size - BlobLogFooter::kSize, s = file_reader->Read(IOOptions(), file_size - BlobLogFooter::kSize,
BlobLogFooter::kSize, &footer_slice, &footer_buf[0], BlobLogFooter::kSize, &footer_slice, &footer_buf[0],
nullptr); nullptr);
} }

@ -26,8 +26,8 @@ Reader::Reader(std::unique_ptr<RandomAccessFileReader>&& file_reader, Env* env,
Status Reader::ReadSlice(uint64_t size, Slice* slice, char* buf) { Status Reader::ReadSlice(uint64_t size, Slice* slice, char* buf) {
StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS); StopWatch read_sw(env_, statistics_, BLOB_DB_BLOB_FILE_READ_MICROS);
Status s = Status s = file_->Read(IOOptions(), next_byte_, static_cast<size_t>(size),
file_->Read(next_byte_, static_cast<size_t>(size), slice, buf, nullptr); slice, buf, nullptr);
next_byte_ += size; next_byte_ += size;
if (!s.ok()) { if (!s.ok()) {
return s; return s;

@ -235,7 +235,8 @@ bool RandomAccessCacheFile::Read(const LBA& lba, Slice* key, Slice* val,
} }
Slice result; Slice result;
Status s = freader_->Read(lba.off_, lba.size_, &result, scratch, nullptr); Status s = freader_->Read(IOOptions(), lba.off_, lba.size_, &result, scratch,
nullptr);
if (!s.ok()) { if (!s.ok()) {
Error(log_, "Error reading from file %s. %s", Path().c_str(), Error(log_, "Error reading from file %s. %s", Path().c_str(),
s.ToString().c_str()); s.ToString().c_str());

@ -33,8 +33,8 @@ Status FileTraceReader::Close() {
Status FileTraceReader::Read(std::string* data) { Status FileTraceReader::Read(std::string* data) {
assert(file_reader_ != nullptr); assert(file_reader_ != nullptr);
Status s = file_reader_->Read(offset_, kTraceMetadataSize, &result_, buffer_, Status s = file_reader_->Read(IOOptions(), offset_, kTraceMetadataSize,
nullptr); &result_, buffer_, nullptr);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }
@ -58,7 +58,8 @@ Status FileTraceReader::Read(std::string* data) {
unsigned int to_read = unsigned int to_read =
bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read; bytes_to_read > kBufferSize ? kBufferSize : bytes_to_read;
while (to_read > 0) { while (to_read > 0) {
s = file_reader_->Read(offset_, to_read, &result_, buffer_, nullptr); s = file_reader_->Read(IOOptions(), offset_, to_read, &result_, buffer_,
nullptr);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
} }

Loading…
Cancel
Save