@ -482,9 +482,6 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
const IOOptions & options ,
const IOOptions & options ,
IODebugContext * dbg ) {
IODebugContext * dbg ) {
# if defined(ROCKSDB_IOURING_PRESENT)
# if defined(ROCKSDB_IOURING_PRESENT)
size_t reqs_off ;
ssize_t ret __attribute__ ( ( __unused__ ) ) ;
struct io_uring * iu = nullptr ;
struct io_uring * iu = nullptr ;
if ( thread_local_io_urings_ ) {
if ( thread_local_io_urings_ ) {
iu = static_cast < struct io_uring * > ( thread_local_io_urings_ - > Get ( ) ) ;
iu = static_cast < struct io_uring * > ( thread_local_io_urings_ - > Get ( ) ) ;
@ -505,35 +502,49 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
struct WrappedReadRequest {
struct WrappedReadRequest {
FSReadRequest * req ;
FSReadRequest * req ;
struct iovec iov ;
struct iovec iov ;
explicit WrappedReadRequest ( FSReadRequest * r ) : req ( r ) { }
size_t finished_len ;
explicit WrappedReadRequest ( FSReadRequest * r ) : req ( r ) , finished_len ( 0 ) { }
} ;
} ;
autovector < WrappedReadRequest , 32 > req_wraps ;
autovector < WrappedReadRequest , 32 > req_wraps ;
autovector < WrappedReadRequest * , 4 > incomplete_rq_list ;
for ( size_t i = 0 ; i < num_reqs ; i + + ) {
for ( size_t i = 0 ; i < num_reqs ; i + + ) {
req_wraps . emplace_back ( & reqs [ i ] ) ;
req_wraps . emplace_back ( & reqs [ i ] ) ;
}
}
reqs_off = 0 ;
size_t reqs_off = 0 ;
while ( num_reqs ) {
while ( num_reqs > reqs_off | | ! incomplete_rq_list . empty ( ) ) {
size_t this_reqs = num_reqs ;
size_t this_reqs = ( num_reqs - reqs_off ) + incomplete_rq_list . size ( ) ;
// If requests exceed depth, split it into batches
// If requests exceed depth, split it into batches
if ( this_reqs > kIoUringDepth ) this_reqs = kIoUringDepth ;
if ( this_reqs > kIoUringDepth ) this_reqs = kIoUringDepth ;
assert ( incomplete_rq_list . size ( ) < = this_reqs ) ;
for ( size_t i = 0 ; i < this_reqs ; i + + ) {
for ( size_t i = 0 ; i < this_reqs ; i + + ) {
size_t index = i + reqs_off ;
WrappedReadRequest * rep_to_submit ;
struct io_uring_sqe * sqe ;
if ( i < incomplete_rq_list . size ( ) ) {
rep_to_submit = incomplete_rq_list [ i ] ;
} else {
rep_to_submit = & req_wraps [ reqs_off + + ] ;
}
assert ( rep_to_submit - > req - > len > rep_to_submit - > finished_len ) ;
rep_to_submit - > iov . iov_base =
rep_to_submit - > req - > scratch + rep_to_submit - > finished_len ;
rep_to_submit - > iov . iov_len =
rep_to_submit - > req - > len - rep_to_submit - > finished_len ;
struct io_uring_sqe * sqe ;
sqe = io_uring_get_sqe ( iu ) ;
sqe = io_uring_get_sqe ( iu ) ;
req_wraps [ index ] . iov . iov_base = reqs [ index ] . scratch ;
io_uring_prep_readv (
req_wraps [ index ] . iov . iov_len = reqs [ index ] . len ;
sqe , fd_ , & rep_to_submit - > iov , 1 ,
io_uring_prep_readv ( sqe , fd_ , & req_wraps [ index ] . iov , 1 ,
rep_to_submit - > req - > offset + rep_to_submit - > finished_len ) ;
reqs [ index ] . offset ) ;
io_uring_sqe_set_data ( sqe , rep_to_submit ) ;
io_uring_sqe_set_data ( sqe , & req_wraps [ index ] ) ;
}
}
incomplete_rq_list . clear ( ) ;
ret = io_uring_submit_and_wait ( iu , static_cast < unsigned int > ( this_reqs ) ) ;
ssize_t ret =
io_uring_submit_and_wait ( iu , static_cast < unsigned int > ( this_reqs ) ) ;
if ( static_cast < size_t > ( ret ) ! = this_reqs ) {
if ( static_cast < size_t > ( ret ) ! = this_reqs ) {
fprintf ( stderr , " ret = %ld this_reqs: %ld \n " , ( long ) ret , ( long ) this_reqs ) ;
fprintf ( stderr , " ret = %ld this_reqs: %ld \n " , ( long ) ret , ( long ) this_reqs ) ;
}
}
@ -547,21 +558,44 @@ IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
// of our initial wait not reaping all completions
// of our initial wait not reaping all completions
ret = io_uring_wait_cqe ( iu , & cqe ) ;
ret = io_uring_wait_cqe ( iu , & cqe ) ;
assert ( ! ret ) ;
assert ( ! ret ) ;
req_wrap = static_cast < WrappedReadRequest * > ( io_uring_cqe_get_data ( cqe ) ) ;
req_wrap = static_cast < WrappedReadRequest * > ( io_uring_cqe_get_data ( cqe ) ) ;
FSReadRequest * req = req_wrap - > req ;
FSReadRequest * req = req_wrap - > req ;
if ( static_cast < size_t > ( cqe - > res ) = = req_wrap - > iov . iov_len ) {
if ( cqe - > res < 0 ) {
req - > result = Slice ( req - > scratch , cqe - > res ) ;
req - > result = Slice ( req - > scratch , 0 ) ;
req - > status = IOError ( " Req failed " , filename_ , cqe - > res ) ;
} else {
size_t bytes_read = static_cast < size_t > ( cqe - > res ) ;
TEST_SYNC_POINT_CALLBACK (
" PosixRandomAccessFile::MultiRead:io_uring_result " , & bytes_read ) ;
if ( bytes_read = = req_wrap - > iov . iov_len ) {
req - > result = Slice ( req - > scratch , req - > len ) ;
req - > status = IOStatus : : OK ( ) ;
req - > status = IOStatus : : OK ( ) ;
} else if ( cqe - > res > = 0 ) {
} else if ( bytes_read = = 0 ) {
req - > result = Slice ( req - > scratch , req_wrap - > iov . iov_len - cqe - > res ) ;
// cqe->res == 0 can means EOF, or can mean partial results. See
// comment
// https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435
// Fall back to pread in this case.
Slice tmp_slice ;
req - > status =
Read ( req - > offset + req_wrap - > finished_len ,
req - > len - req_wrap - > finished_len , options , & tmp_slice ,
req - > scratch + req_wrap - > finished_len , dbg ) ;
req - > result =
Slice ( req - > scratch , req_wrap - > finished_len + tmp_slice . size ( ) ) ;
} else if ( bytes_read < req_wrap - > iov . iov_len ) {
assert ( bytes_read > 0 ) ;
assert ( bytes_read + req_wrap - > finished_len < req - > len ) ;
req_wrap - > finished_len + = bytes_read ;
incomplete_rq_list . push_back ( req_wrap ) ;
} else {
} else {
req - > result = Slice ( req - > scratch , 0 ) ;
req - > result = Slice ( req - > scratch , 0 ) ;
req - > status = IOError ( " Req failed " , filename_ , cqe - > res ) ;
req - > status = IOError ( " Req returned more bytes than requested " ,
filename_ , cqe - > res ) ;
}
}
}
io_uring_cqe_seen ( iu , cqe ) ;
io_uring_cqe_seen ( iu , cqe ) ;
}
}
num_reqs - = this_reqs ;
reqs_off + = this_reqs ;
}
}
return IOStatus : : OK ( ) ;
return IOStatus : : OK ( ) ;
# else
# else