From f6c4d7a57690af1cf28a628998242fe2cc49e9e7 Mon Sep 17 00:00:00 2001 From: Akanksha Mahajan Date: Mon, 18 Jul 2022 15:37:29 -0700 Subject: [PATCH] Fix hang in MultiRead with O_DIRECT and io_uring (#10368) Summary: Fix bug in O_DIRECT and io_uring when its EOF and bytes_read = 0 because of wrong check, it got added into incomplete list and gets stuck in an infinite loop as it will always return bytes_read = 0. The bug was introduced by PR https://github.com/facebook/rocksdb/pull/10197 and that PR is not released yet in any release branch. Pull Request resolved: https://github.com/facebook/rocksdb/pull/10368 Test Plan: Added new unit test Reviewed By: siying Differential Revision: D37885184 Pulled By: akankshamahajan15 fbshipit-source-id: 35b36a44b696d29b2f6f25301aa1b19547b4e03b --- env/env_test.cc | 75 ++++++++++++++++++++++++++++++++++++++++++++++++- env/io_posix.cc | 19 +++++++------ 2 files changed, 85 insertions(+), 9 deletions(-) diff --git a/env/env_test.cc b/env/env_test.cc index 91e29627f..4945dbf53 100644 --- a/env/env_test.cc +++ b/env/env_test.cc @@ -1401,7 +1401,7 @@ TEST_P(EnvPosixTestWithParam, MultiRead) { } }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); std::unique_ptr file; std::vector reqs(3); std::vector> data; @@ -1522,6 +1522,79 @@ TEST_F(EnvPosixTest, MultiReadNonAlignedLargeNum) { } } +#ifndef ROCKSDB_LITE +TEST_F(EnvPosixTest, NonAlignedDirectIOMultiReadBeyondFileSize) { + EnvOptions soptions; + soptions.use_direct_reads = true; + soptions.use_direct_writes = false; + std::string fname = test::PerThreadDBPath(env_, "testfile"); + + Random rnd(301); + std::unique_ptr wfile; + size_t alignment = 0; + // Create file. + { + ASSERT_OK(env_->NewWritableFile(fname, &wfile, soptions)); + auto data_ptr = NewAligned(4095, 'b'); + Slice data_b(data_ptr.get(), 4095); + ASSERT_OK(wfile->PositionedAppend(data_b, 0U)); + ASSERT_OK(wfile->Close()); + } + +#if !defined(OS_MACOSX) && !defined(OS_WIN) && !defined(OS_SOLARIS) && \ + !defined(OS_AIX) && !defined(OS_OPENBSD) && !defined(OS_FREEBSD) + if (soptions.use_direct_reads) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "NewRandomAccessFile:O_DIRECT", [&](void* arg) { + int* val = static_cast(arg); + *val &= ~O_DIRECT; + }); + } +#endif + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + + const int num_reads = 2; + // Create requests + std::vector scratches; + scratches.reserve(num_reads); + std::vector reqs(num_reads); + + std::unique_ptr file; + ASSERT_OK(env_->NewRandomAccessFile(fname, &file, soptions)); + alignment = file->GetRequiredBufferAlignment(); + ASSERT_EQ(num_reads, reqs.size()); + + std::vector> data; + + std::vector offsets = {0, 2047}; + std::vector lens = {2047, 4096 - 2047}; + + for (size_t i = 0; i < num_reads; i++) { + // Do alignment + reqs[i].offset = static_cast(TruncateToPageBoundary( + alignment, static_cast(/*offset=*/offsets[i]))); + reqs[i].len = + Roundup(static_cast(/*offset=*/offsets[i]) + /*length=*/lens[i], + alignment) - + reqs[i].offset; + + size_t new_capacity = Roundup(reqs[i].len, alignment); + data.emplace_back(NewAligned(new_capacity, 0)); + reqs[i].scratch = data.back().get(); + } + + // Query the data + ASSERT_OK(file->MultiRead(reqs.data(), reqs.size())); + + // Validate results + for (size_t i = 0; i < num_reads; ++i) { + ASSERT_OK(reqs[i].status); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); +} +#endif // ROCKSDB_LITE + #if defined(ROCKSDB_IOURING_PRESENT) void GenerateFilesAndRequest(Env* env, const std::string& fname, std::vector* ret_reqs, diff --git a/env/io_posix.cc b/env/io_posix.cc index e7dfe8fa8..0ea30803c 100644 --- a/env/io_posix.cc +++ b/env/io_posix.cc @@ -750,14 +750,17 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, bytes_read, read_again); int32_t res = cqe->res; if (res >= 0) { - if (bytes_read == 0 && read_again) { - Slice tmp_slice; - req->status = - Read(req->offset + req_wrap->finished_len, - req->len - req_wrap->finished_len, options, &tmp_slice, - req->scratch + req_wrap->finished_len, dbg); - req->result = - Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); + if (bytes_read == 0) { + if (read_again) { + Slice tmp_slice; + req->status = + Read(req->offset + req_wrap->finished_len, + req->len - req_wrap->finished_len, options, &tmp_slice, + req->scratch + req_wrap->finished_len, dbg); + req->result = + Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); + } + // else It means EOF so no need to do anything. } else if (bytes_read < req_wrap->iov.iov_len) { incomplete_rq_list.push_back(req_wrap); }