Handle io_uring partial results (#6441)

Summary:
The logic that handles io_uring partial results was wrong. Fix the logic by putting it into a queue and continue reading.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6441

Test Plan: Make sure this patch fixes the application test case where the bug was discovered; in env_test, add a unit test that simulates partial results and make sure the results are still correct.

Differential Revision: D20018616

fbshipit-source-id: 5398a7e34d74c26d52aa69dfd604e93e95d99c62
main
sdong 5 years ago committed by Facebook Github Bot
parent 890d87fadc
commit 942eaba091
  1. 20
      env/env_test.cc
  2. 78
      env/io_posix.cc

20
env/env_test.cc vendored

@ -1137,8 +1137,25 @@ TEST_P(EnvPosixTestWithParam, MultiRead) {
ASSERT_OK(wfile->Close()); ASSERT_OK(wfile->Close());
} }
// More attempts to simulate more partial result sequences.
for (uint32_t attempt = 0; attempt < 20; attempt++) {
// Random Read // Random Read
{ Random rnd(301 + attempt);
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"PosixRandomAccessFile::MultiRead:io_uring_result", [&](void* arg) {
if (attempt > 0) {
// No failure in the first attempt.
size_t& bytes_read = *static_cast<size_t*>(arg);
if (rnd.OneIn(4)) {
bytes_read = 0;
} else if (rnd.OneIn(3)) {
bytes_read = static_cast<size_t>(
rnd.Uniform(static_cast<int>(bytes_read)));
}
}
});
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing();
std::unique_ptr<RandomAccessFile> file; std::unique_ptr<RandomAccessFile> file;
std::vector<ReadRequest> reqs(3); std::vector<ReadRequest> reqs(3);
std::vector<std::unique_ptr<char, Deleter>> data; std::vector<std::unique_ptr<char, Deleter>> data;
@ -1163,6 +1180,7 @@ TEST_P(EnvPosixTestWithParam, MultiRead) {
ASSERT_OK(reqs[i].status); ASSERT_OK(reqs[i].status);
ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0); ASSERT_EQ(memcmp(reqs[i].scratch, buf.get(), kSectorSize), 0);
} }
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
} }
} }

78
env/io_posix.cc vendored

@ -482,9 +482,6 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
const IOOptions& options, const IOOptions& options,
IODebugContext* dbg) { IODebugContext* dbg) {
#if defined(ROCKSDB_IOURING_PRESENT) #if defined(ROCKSDB_IOURING_PRESENT)
size_t reqs_off;
ssize_t ret __attribute__((__unused__));
struct io_uring* iu = nullptr; struct io_uring* iu = nullptr;
if (thread_local_io_urings_) { if (thread_local_io_urings_) {
iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get()); iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
@ -505,35 +502,49 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
struct WrappedReadRequest { struct WrappedReadRequest {
FSReadRequest* req; FSReadRequest* req;
struct iovec iov; struct iovec iov;
explicit WrappedReadRequest(FSReadRequest* r) : req(r) {} size_t finished_len;
explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
}; };
autovector<WrappedReadRequest, 32> req_wraps; autovector<WrappedReadRequest, 32> req_wraps;
autovector<WrappedReadRequest*, 4> incomplete_rq_list;
for (size_t i = 0; i < num_reqs; i++) { for (size_t i = 0; i < num_reqs; i++) {
req_wraps.emplace_back(&reqs[i]); req_wraps.emplace_back(&reqs[i]);
} }
reqs_off = 0; size_t reqs_off = 0;
while (num_reqs) { while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
size_t this_reqs = num_reqs; size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();
// If requests exceed depth, split it into batches // If requests exceed depth, split it into batches
if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth; if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth;
assert(incomplete_rq_list.size() <= this_reqs);
for (size_t i = 0; i < this_reqs; i++) { for (size_t i = 0; i < this_reqs; i++) {
size_t index = i + reqs_off; WrappedReadRequest* rep_to_submit;
struct io_uring_sqe* sqe; if (i < incomplete_rq_list.size()) {
rep_to_submit = incomplete_rq_list[i];
} else {
rep_to_submit = &req_wraps[reqs_off++];
}
assert(rep_to_submit->req->len > rep_to_submit->finished_len);
rep_to_submit->iov.iov_base =
rep_to_submit->req->scratch + rep_to_submit->finished_len;
rep_to_submit->iov.iov_len =
rep_to_submit->req->len - rep_to_submit->finished_len;
struct io_uring_sqe* sqe;
sqe = io_uring_get_sqe(iu); sqe = io_uring_get_sqe(iu);
req_wraps[index].iov.iov_base = reqs[index].scratch; io_uring_prep_readv(
req_wraps[index].iov.iov_len = reqs[index].len; sqe, fd_, &rep_to_submit->iov, 1,
io_uring_prep_readv(sqe, fd_, &req_wraps[index].iov, 1, rep_to_submit->req->offset + rep_to_submit->finished_len);
reqs[index].offset); io_uring_sqe_set_data(sqe, rep_to_submit);
io_uring_sqe_set_data(sqe, &req_wraps[index]);
} }
incomplete_rq_list.clear();
ret = io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs)); ssize_t ret =
io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
if (static_cast<size_t>(ret) != this_reqs) { if (static_cast<size_t>(ret) != this_reqs) {
fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs); fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
} }
@ -547,21 +558,44 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
// of our initial wait not reaping all completions // of our initial wait not reaping all completions
ret = io_uring_wait_cqe(iu, &cqe); ret = io_uring_wait_cqe(iu, &cqe);
assert(!ret); assert(!ret);
req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe)); req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
FSReadRequest* req = req_wrap->req; FSReadRequest* req = req_wrap->req;
if (static_cast<size_t>(cqe->res) == req_wrap->iov.iov_len) { if (cqe->res < 0) {
req->result = Slice(req->scratch, cqe->res); req->result = Slice(req->scratch, 0);
req->status = IOError("Req failed", filename_, cqe->res);
} else {
size_t bytes_read = static_cast<size_t>(cqe->res);
TEST_SYNC_POINT_CALLBACK(
"PosixRandomAccessFile::MultiRead:io_uring_result", &bytes_read);
if (bytes_read == req_wrap->iov.iov_len) {
req->result = Slice(req->scratch, req->len);
req->status = IOStatus::OK(); req->status = IOStatus::OK();
} else if (cqe->res >= 0) { } else if (bytes_read == 0) {
req->result = Slice(req->scratch, req_wrap->iov.iov_len - cqe->res); // cqe->res == 0 can means EOF, or can mean partial results. See
// comment
// https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435
// Fall back to pread in this case.
Slice tmp_slice;
req->status =
Read(req->offset + req_wrap->finished_len,
req->len - req_wrap->finished_len, options, &tmp_slice,
req->scratch + req_wrap->finished_len, dbg);
req->result =
Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
} else if (bytes_read < req_wrap->iov.iov_len) {
assert(bytes_read > 0);
assert(bytes_read + req_wrap->finished_len < req->len);
req_wrap->finished_len += bytes_read;
incomplete_rq_list.push_back(req_wrap);
} else { } else {
req->result = Slice(req->scratch, 0); req->result = Slice(req->scratch, 0);
req->status = IOError("Req failed", filename_, cqe->res); req->status = IOError("Req returned more bytes than requested",
filename_, cqe->res);
}
} }
io_uring_cqe_seen(iu, cqe); io_uring_cqe_seen(iu, cqe);
} }
num_reqs -= this_reqs;
reqs_off += this_reqs;
} }
return IOStatus::OK(); return IOStatus::OK();
#else #else

Loading…
Cancel
Save