@@ -10,6 +10,7 @@
#include "file/file_prefetch_buffer.h"
#include <algorithm>
#include <cassert>
#include "file/random_access_file_reader.h"
#include "monitoring/histogram.h"
@@ -23,8 +24,8 @@ namespace ROCKSDB_NAMESPACE {
void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment,
                                               uint64_t offset,
                                               size_t roundup_len, size_t index,
                                               bool refit_tail,
                                               size_t roundup_len,
                                               uint32_t index, bool refit_tail,
                                               uint64_t& chunk_len) {
  uint64_t chunk_offset_in_buffer = 0;
  bool copy_data_to_new_buffer = false;
@@ -32,9 +33,7 @@ void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment,
  // If only a few bytes exist -- reuse them & read only what is really needed.
  // This is typically the case of incremental reading of data.
  // If no bytes exist in buffer -- full pread.
  if (bufs_[index].buffer_.CurrentSize() > 0 &&
      offset >= bufs_[index].offset_ &&
      offset <= bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize()) {
  if (DoesBufferContainData(index) && IsOffsetInBuffer(offset, index)) {
    // Only a few requested bytes are in the buffer. memmove those chunk of
    // bytes to the beginning, and memcpy them back into the new buffer if a
    // new buffer is created.
@@ -43,7 +42,7 @@ void FilePrefetchBuffer::CalculateOffsetAndLen(size_t alignment,
    chunk_len = static_cast<uint64_t>(bufs_[index].buffer_.CurrentSize()) -
                chunk_offset_in_buffer;
    assert(chunk_offset_in_buffer % alignment == 0);
    // assert(chunk_len % alignment == 0);
    assert(chunk_len % alignment == 0);
    assert(chunk_offset_in_buffer + chunk_len <=
           bufs_[index].offset_ + bufs_[index].buffer_.CurrentSize());
    if (chunk_len > 0) {
@@ -108,7 +107,7 @@ Status FilePrefetchBuffer::Read(const IOOptions& opts,
Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts,
                                     RandomAccessFileReader* reader,
                                     uint64_t read_len, uint64_t chunk_len,
                                     uint64_t read_len,
                                     uint64_t rounddown_start, uint32_t index) {
  // callback for async read request.
  auto fp = std::bind(&FilePrefetchBuffer::PrefetchAsyncCallback, this,
@@ -116,15 +115,18 @@ Status FilePrefetchBuffer::ReadAsync(const IOOptions& opts,
  FSReadRequest req;
  Slice result;
  req.len = read_len;
  req.offset = rounddown_start + chunk_len;
  req.offset = rounddown_start;
  req.result = result;
  req.scratch = bufs_[index].buffer_.BufferStart() + chunk_len;
  Status s = reader->ReadAsync(req, opts, fp,
                               /*cb_arg=*/nullptr, &io_handle_, &del_fn_,
  req.scratch = bufs_[index].buffer_.BufferStart();
  bufs_[index].async_req_len_ = req.len;
  Status s =
      reader->ReadAsync(req, opts, fp, &(bufs_[index].pos_),
                        &(bufs_[index].io_handle_), &(bufs_[index].del_fn_),
                        /*aligned_buf=*/nullptr);
  req.status.PermitUncheckedError();
  if (s.ok()) {
    async_read_in_progress_ = true;
    bufs_[index].async_read_in_progress_ = true;
  }
  return s;
}
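The rewritten `ReadAsync` above threads the target buffer through the request itself: `req.offset` and `req.scratch` now start at `rounddown_start` with no `chunk_len` shift, `async_req_len_` records the in-flight length, and the slot index is handed to the completion callback via `cb_arg` (`&bufs_[index].pos_`) instead of the callback assuming the second buffer. Below is a minimal sketch of that `cb_arg` plumbing, using made-up simplified types (`SlotSketch`, `TwoSlotReader`) rather than the real RocksDB classes:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>

// Hypothetical stand-ins; only the cb_arg round trip is illustrated.
struct SlotSketch {
  uint32_t pos_ = 0;                    // index of this slot, passed as cb_arg
  uint64_t async_req_len_ = 0;          // length of the submitted request
  bool async_read_in_progress_ = false;
};

struct TwoSlotReader {
  SlotSketch bufs_[2];

  TwoSlotReader() {
    bufs_[0].pos_ = 0;
    bufs_[1].pos_ = 1;
  }

  // Mimics ReadAsync: record the request on slot `index` and hand the address
  // of bufs_[index].pos_ to the completion callback as its opaque argument.
  void Submit(uint32_t index, uint64_t len,
              const std::function<void(void*)>& on_complete) {
    bufs_[index].async_req_len_ = len;
    bufs_[index].async_read_in_progress_ = true;
    on_complete(&bufs_[index].pos_);  // a real FS would defer this call
  }

  // Mimics PrefetchAsyncCallback: recover the slot index from cb_arg.
  void OnComplete(void* cb_arg) {
    uint32_t index = *static_cast<uint32_t*>(cb_arg);
    bufs_[index].async_read_in_progress_ = false;
    std::cout << "read finished for slot " << index << "\n";
  }
};

int main() {
  TwoSlotReader r;
  r.Submit(/*index=*/1, /*len=*/4096, [&r](void* arg) { r.OnComplete(arg); });
  return 0;
}
```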
@@ -170,8 +172,7 @@ void FilePrefetchBuffer::CopyDataToBuffer(uint32_t src, uint64_t& offset,
  }
  uint64_t copy_offset = (offset - bufs_[src].offset_);
  size_t copy_len = 0;
  if (offset + length <=
      bufs_[src].offset_ + bufs_[src].buffer_.CurrentSize()) {
  if (IsDataBlockInBuffer(offset, length, src)) {
    // All the bytes are in src.
    copy_len = length;
  } else {
@@ -194,65 +195,121 @@
  }
}

void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
  if (async_read_in_progress_ && fs_ != nullptr) {
    // Wait for prefetch data to complete.
    // No mutex is needed as PrefetchAsyncCallback updates the result in second
    // buffer and FilePrefetchBuffer should wait for Poll before accessing the
    // second buffer.
// Clear the buffers if they contain outdated data. Outdated data can be there
// because previous sequential reads were read from the cache instead of these
// buffers. In that case the outdated IOs should be aborted.
void FilePrefetchBuffer::AbortIOIfNeeded(uint64_t offset) {
  uint32_t second = curr_ ^ 1;
  std::vector<void*> handles;
    handles.emplace_back(io_handle_);
    StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
    fs_->Poll(handles, 1).PermitUncheckedError();
  autovector<uint32_t> buf_pos;
  if (IsBufferOutdatedWithAsyncProgress(offset, curr_)) {
    handles.emplace_back(bufs_[curr_].io_handle_);
    buf_pos.emplace_back(curr_);
  }
  if (IsBufferOutdatedWithAsyncProgress(offset, second)) {
    handles.emplace_back(bufs_[second].io_handle_);
    buf_pos.emplace_back(second);
  }
  if (!handles.empty()) {
    StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
    Status s = fs_->AbortIO(handles);
    assert(s.ok());
  }
  // Reset and release io_handle_ after the Poll API as the request has been
  // completed.
  async_read_in_progress_ = false;
  if (io_handle_ != nullptr && del_fn_ != nullptr) {
    del_fn_(io_handle_);
    io_handle_ = nullptr;
    del_fn_ = nullptr;
  for (auto& pos : buf_pos) {
    // Release io_handle.
    DestroyAndClearIOHandle(pos);
  }
  if (bufs_[second].io_handle_ == nullptr) {
    bufs_[second].async_read_in_progress_ = false;
  }
  if (bufs_[curr_].io_handle_ == nullptr &&
      bufs_[curr_].async_read_in_progress_) {
    bufs_[curr_].async_read_in_progress_ = false;
    curr_ = curr_ ^ 1;
  }
}
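`AbortIOIfNeeded` only aborts reads whose target range the reader has already moved past. The predicates it relies on (`IsBufferOutdatedWithAsyncProgress` here, `IsBufferOutdated` below) are declared in the header rather than in this diff, so the following is only a sketch of the arithmetic they are expected to perform, written against a simplified, hypothetical `BufInfoSketch`:

```cpp
#include <cstdint>

// Simplified stand-in for the per-buffer bookkeeping used in this file.
struct BufInfoSketch {
  uint64_t offset_ = 0;         // file offset of the first byte in the buffer
  uint64_t current_size_ = 0;   // bytes of valid data already in the buffer
  uint64_t async_req_len_ = 0;  // length of the in-flight async request
  bool async_read_in_progress_ = false;
  void* io_handle_ = nullptr;
};

// The data that is already buffered ends before `offset`, so this buffer can
// never serve the current read.
inline bool IsBufferOutdatedSketch(const BufInfoSketch& buf, uint64_t offset) {
  return buf.current_size_ > 0 && offset >= buf.offset_ + buf.current_size_;
}

// Same idea while a read is still in flight: the pending request will fill
// [offset_, offset_ + async_req_len_), and `offset` already lies beyond that
// range, so the IO is useless and is worth aborting.
inline bool IsBufferOutdatedWithAsyncProgressSketch(const BufInfoSketch& buf,
                                                    uint64_t offset) {
  return buf.async_read_in_progress_ && buf.io_handle_ != nullptr &&
         offset >= buf.offset_ + buf.async_req_len_;
}
```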
// Index of second buffer.
void FilePrefetchBuffer::AbortAllIOs() {
  uint32_t second = curr_ ^ 1;
  std::vector<void*> handles;
  for (uint32_t i = 0; i < 2; i++) {
    if (bufs_[i].async_read_in_progress_ && bufs_[i].io_handle_ != nullptr) {
      handles.emplace_back(bufs_[i].io_handle_);
    }
  }
  if (!handles.empty()) {
    StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
    Status s = fs_->AbortIO(handles);
    assert(s.ok());
  }
  // First clear the buffers if they contain outdated data. Outdated data can
  // be there because previous sequential reads were read from the cache
  // instead of these buffers.
  {
    if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
        offset >= bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
  // Release io_handles.
  if (bufs_[curr_].io_handle_ != nullptr && bufs_[curr_].del_fn_ != nullptr) {
    DestroyAndClearIOHandle(curr_);
  }
  if (bufs_[second].io_handle_ != nullptr && bufs_[second].del_fn_ != nullptr) {
    DestroyAndClearIOHandle(second);
  }
}

// Clear the buffers if they contain outdated data. Outdated data can be there
// because previous sequential reads were read from the cache instead of these
// buffers.
void FilePrefetchBuffer::UpdateBuffersIfNeeded(uint64_t offset) {
  uint32_t second = curr_ ^ 1;
  if (IsBufferOutdated(offset, curr_)) {
    bufs_[curr_].buffer_.Clear();
  }
  if (bufs_[second].buffer_.CurrentSize() > 0 &&
      offset >= bufs_[second].offset_ + bufs_[second].buffer_.CurrentSize()) {
  if (IsBufferOutdated(offset, second)) {
    bufs_[second].buffer_.Clear();
  }
  }

  // If data is in second buffer, make it curr_. Second buffer can be either
  // partially filled or full.
  if (bufs_[second].buffer_.CurrentSize() > 0 &&
      offset >= bufs_[second].offset_ &&
      offset < bufs_[second].offset_ + bufs_[second].buffer_.CurrentSize()) {
  // If data starts from second buffer, make it curr_. Second buffer can be
  // either partially filled or full.
  if (!bufs_[second].async_read_in_progress_ && DoesBufferContainData(second) &&
      IsOffsetInBuffer(offset, second)) {
    // Clear curr_ as the buffers have been swapped and curr_ contains the
    // outdated data, then switch the buffers.
    if (!bufs_[curr_].async_read_in_progress_) {
      bufs_[curr_].buffer_.Clear();
    }
    curr_ = curr_ ^ 1;
  }
}
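Since the swap in `UpdateBuffersIfNeeded` is the hinge of the double-buffer scheme, here is a self-contained walk-through of the same decision using plain ranges instead of the real `BufferInfo` objects (`RangeSketch` and the concrete numbers below are made up for illustration):

```cpp
#include <cstdint>
#include <iostream>

// Illustrative only: ranges are [offset, offset + size), size 0 means empty.
struct RangeSketch {
  uint64_t offset;
  uint64_t size;
  bool Contains(uint64_t off) const {
    return size > 0 && off >= offset && off < offset + size;
  }
  bool Outdated(uint64_t off) const { return size > 0 && off >= offset + size; }
};

int main() {
  RangeSketch bufs[2] = {{0, 4096}, {4096, 4096}};  // curr_ = 0, second = 1
  uint32_t curr = 0;
  uint64_t offset = 5000;  // the read has moved past the first buffer

  uint32_t second = curr ^ 1;
  if (bufs[curr].Outdated(offset)) bufs[curr].size = 0;    // clear curr_
  if (bufs[second].Outdated(offset)) bufs[second].size = 0;
  if (bufs[second].Contains(offset)) {                     // data starts in second
    bufs[curr].size = 0;                                   // drop stale curr_
    curr = curr ^ 1;                                       // swap the buffers
  }
  std::cout << "curr_ is now buffer " << curr << "\n";     // prints 1
  return 0;
}
```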
// If async_read = true:
// async_read is enabled in case of sequential reads. So when
// buffers are switched, we clear the curr_ buffer as we assume the data has
// been consumed because of sequential reads.
void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
  if (bufs_[curr_].async_read_in_progress_ && fs_ != nullptr) {
    if (bufs_[curr_].io_handle_ != nullptr) {
      // Wait for prefetch data to complete.
      // No mutex is needed as async_read_in_progress behaves as a mutex and is
      // updated by the main thread only.
      std::vector<void*> handles;
      handles.emplace_back(bufs_[curr_].io_handle_);
      StopWatch sw(clock_, stats_, POLL_WAIT_MICROS);
      fs_->Poll(handles, 1).PermitUncheckedError();
    }
    // Reset and release io_handle after the Poll API as the request has been
    // completed.
    DestroyAndClearIOHandle(curr_);
  }
  UpdateBuffersIfNeeded(offset);
}
// If async_io is enabled in case of sequential reads, PrefetchAsyncInternal is
// called. When buffers are switched, we clear the curr_ buffer as we assume the
// data has been consumed because of sequential reads.
// Data in buffers will always be sequential with curr_ following second and
// not vice versa.
//
// Scenarios for prefetching asynchronously:
// Case1: If both buffers are empty, prefetch n bytes
//        synchronously in curr_
//        and prefetch readahead_size_/2 async in second buffer.
// Case1: If both buffers are empty, prefetch n + readahead_size_/2 bytes
//        synchronously in curr_ and prefetch readahead_size_/2 async in second
//        buffer.
// Case2: If second buffer has partial or full data, make it current and
//        prefetch readahead_size_/2 async in second buffer. In case of
//        partial data, prefetch remaining bytes from size n synchronously to
@@ -260,9 +317,10 @@ void FilePrefetchBuffer::PollAndUpdateBuffersIfNeeded(uint64_t offset) {
// Case3: If curr_ has partial data, prefetch remaining bytes from size n
//        synchronously in curr_ to fulfill the requested bytes request and
//        prefetch readahead_size_/2 bytes async in second buffer.
// Case4: If data is in both buffers, copy requested data from curr_ and second
//        buffer to third buffer. If all requested bytes have been copied, do
//        the asynchronous prefetching in second buffer.
// Case4: (Special case) If data is in both buffers, copy requested data from
//        curr_, send async request on curr_, wait for poll to fill second
//        buffer (if any), and copy remaining data from second buffer to third
//        buffer.
Status FilePrefetchBuffer::PrefetchAsyncInternal(
    const IOOptions& opts, RandomAccessFileReader* reader, uint64_t offset,
    size_t length, size_t readahead_size, Env::IOPriority rate_limiter_priority,
@@ -273,39 +331,30 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(
  TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsyncInternal:Start");
  PollAndUpdateBuffersIfNeeded(offset);
  // If all the requested bytes are in curr_, it will go for async prefetching
  // only.
  if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
      offset + length <=
          bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
    offset += length;
    length = 0;
  size_t alignment = reader->file()->GetRequiredBufferAlignment();
  Status s;
  uint64_t tmp_offset = offset;
  size_t tmp_length = length;
    // Since async request was submitted directly by calling PrefetchAsync in
    // last call, we don't need to prefetch further as this call is to poll the
    // data submitted in previous call.
    if (async_request_submitted_) {
      return Status::OK();
  // 1. Abort IO and swap buffers if needed to point curr_ to first buffer with
  //    data.
  {
    if (!explicit_prefetch_submitted_) {
      AbortIOIfNeeded(offset);
    }
    UpdateBuffersIfNeeded(offset);
  }
  async_request_submitted_ = false;
  Status s;
  size_t prefetch_size = length + readahead_size;
  size_t alignment = reader->file()->GetRequiredBufferAlignment();
  // Index of second buffer.
  uint32_t second = curr_ ^ 1;
  // Data is overlapping i.e. some of the data is in curr_ buffer and remaining
  // in second buffer.
  if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
      bufs_[second].buffer_.CurrentSize() > 0 &&
      offset >= bufs_[curr_].offset_ &&
      offset < bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize() &&
      offset + length > bufs_[second].offset_) {
  // 2. If data is overlapping over two buffers, copy the data from curr_ and
  //    call ReadAsync on curr_.
  if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) &&
      IsOffsetInBuffer(offset, curr_) &&
      (/*Data extends over curr_ buffer and second buffer either has data or in
         process of population = */
       (offset + length > bufs_[second].offset_) &&
       (bufs_[second].async_read_in_progress_ ||
        DoesBufferContainData(second)))) {
    // Allocate new buffer to third buffer;
    bufs_[2].buffer_.Clear();
    bufs_[2].buffer_.Alignment(alignment);
@@ -313,25 +362,92 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(
    bufs_[2].offset_ = offset;
    copy_to_third_buffer = true;

    // Move data from curr_ buffer to third.
    CopyDataToBuffer(curr_, offset, length);
    if (length == 0) {
      // Requested data has been copied and curr_ still has unconsumed data.
    CopyDataToBuffer(curr_, tmp_offset, tmp_length);
    // Call async prefetching on curr_ since data has been consumed in curr_
    // only if data lies within second buffer.
    size_t second_size = bufs_[second].async_read_in_progress_
                             ? bufs_[second].async_req_len_
                             : bufs_[second].buffer_.CurrentSize();
    if (tmp_offset + tmp_length <= bufs_[second].offset_ + second_size) {
      uint64_t rounddown_start = bufs_[second].offset_ + second_size;
      uint64_t roundup_end =
          Roundup(rounddown_start + readahead_size, alignment);
      uint64_t roundup_len = roundup_end - rounddown_start;
      uint64_t chunk_len = 0;
      CalculateOffsetAndLen(alignment, rounddown_start, roundup_len, curr_,
                            false, chunk_len);
      assert(chunk_len == 0);
      assert(roundup_len >= chunk_len);
      bufs_[curr_].offset_ = rounddown_start;
      uint64_t read_len = static_cast<size_t>(roundup_len - chunk_len);
      s = ReadAsync(opts, reader, read_len, rounddown_start, curr_);
      if (!s.ok()) {
        DestroyAndClearIOHandle(curr_);
        bufs_[curr_].buffer_.Clear();
        return s;
      }
    }
    curr_ = curr_ ^ 1;
  }

  // 3. Call Poll only if data is needed for the second buffer.
  //    - Return if whole data is in curr_ and second buffer in progress.
  //    - If second buffer is empty, it will go for ReadAsync for second buffer.
  if (!bufs_[curr_].async_read_in_progress_ && DoesBufferContainData(curr_) &&
      IsDataBlockInBuffer(offset, length, curr_)) {
    // Whole data is in curr_.
    UpdateBuffersIfNeeded(offset);
    second = curr_ ^ 1;
    if (bufs_[second].async_read_in_progress_) {
      return s;
    }
      CopyDataToBuffer(second, offset, length);
      // Length == 0: All the requested data has been copied to third buffer.
      // It should go for only async prefetching.
  } else {
    PollAndUpdateBuffersIfNeeded(offset);
    second = curr_ ^ 1;
  }
  if (copy_to_third_buffer) {
    offset = tmp_offset;
    length = tmp_length;
  }

  // 4. After polling and swapping buffers, if all the requested bytes are in
  //    curr_, it will only go for async prefetching.
  //    copy_to_third_buffer is a special case so it will be handled separately.
  if (!copy_to_third_buffer && DoesBufferContainData(curr_) &&
      IsDataBlockInBuffer(offset, length, curr_)) {
    offset += length;
    length = 0;
    // Since async request was submitted directly by calling PrefetchAsync in
    // last call, we don't need to prefetch further as this call is to poll
    // the data submitted in previous call.
    if (explicit_prefetch_submitted_) {
      return s;
    }
  }

  // 5. Data is overlapping i.e. some of the data has been copied to third
  //    buffer and remaining will be updated below.
  if (copy_to_third_buffer) {
    CopyDataToBuffer(curr_, offset, length);
    // Length == 0: All the requested data has been copied to third buffer and
    // it has already gone for async prefetching. It can return without doing
    // anything further.
    // Length > 0: More data needs to be consumed so it will continue async and
    // sync prefetching and copy the remaining data to third buffer in the end.
    // swap the buffers.
    curr_ = curr_ ^ 1;
    // Update prefetch_size as length has been updated in CopyDataToBuffer.
    prefetch_size = length + readahead_size;
    if (length == 0) {
      return s;
    }
  }

  // 6. Go for ReadAsync and Read (if needed).
  size_t prefetch_size = length + readahead_size;
  size_t _offset = static_cast<size_t>(offset);
  second = curr_ ^ 1;

  // offset and size alignment for curr_ buffer with synchronous prefetching
  uint64_t rounddown_start1 = Rounddown(_offset, alignment);
@@ -368,19 +484,34 @@ Status FilePrefetchBuffer::PrefetchAsyncInternal(
    uint64_t chunk_len2 = 0;
    CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second,
                          false /*refit_tail*/, chunk_len2);
    assert(chunk_len2 == 0);
    // Update the buffer offset.
    bufs_[second].offset_ = rounddown_start2;
    assert(roundup_len2 >= chunk_len2);
    uint64_t read_len2 = static_cast<size_t>(roundup_len2 - chunk_len2);
    ReadAsync(opts, reader, read_len2, chunk_len2, rounddown_start2, second)
        .PermitUncheckedError();
    Status tmp_s = ReadAsync(opts, reader, read_len2, rounddown_start2, second);
    if (!tmp_s.ok()) {
      DestroyAndClearIOHandle(second);
      bufs_[second].buffer_.Clear();
    }
  }

  if (read_len1 > 0) {
    s = Read(opts, reader, rate_limiter_priority, read_len1, chunk_len1,
             rounddown_start1, curr_);
    if (!s.ok()) {
      if (bufs_[second].io_handle_ != nullptr) {
        std::vector<void*> handles;
        handles.emplace_back(bufs_[second].io_handle_);
        {
          StopWatch sw(clock_, stats_, ASYNC_PREFETCH_ABORT_MICROS);
          Status status = fs_->AbortIO(handles);
          assert(status.ok());
        }
      }
      DestroyAndClearIOHandle(second);
      bufs_[second].buffer_.Clear();
      bufs_[curr_].buffer_.Clear();
      return s;
    }
}
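The synchronous and asynchronous ranges above are built purely from alignment arithmetic: round the requested offset down to an aligned boundary, round the end of the sync range up, and start the async range where the sync range ends. The sketch below works through that with concrete numbers; `RounddownSketch`/`RoundupSketch` are local stand-ins for RocksDB's own rounding helpers, and the exact expressions vary slightly between `PrefetchAsyncInternal` and `PrefetchAsync`.

```cpp
#include <cassert>
#include <cstdint>

// Local illustrations of power-of-two rounding; not the RocksDB helpers.
inline uint64_t RounddownSketch(uint64_t x, uint64_t align) {
  return x - (x % align);
}
inline uint64_t RoundupSketch(uint64_t x, uint64_t align) {
  return RounddownSketch(x + align - 1, align);
}

int main() {
  const uint64_t alignment = 4096;  // direct-IO style alignment
  const uint64_t offset = 5000;     // requested offset (not aligned)
  const uint64_t length = 8192;     // requested bytes
  const uint64_t readahead = 8192;  // extra bytes to prefetch

  // Synchronous part, mirroring rounddown_start1 / roundup_end1 above.
  uint64_t prefetch_size = length + readahead;
  uint64_t rounddown_start1 = RounddownSketch(offset, alignment);
  uint64_t roundup_end1 =
      RoundupSketch(rounddown_start1 + prefetch_size, alignment);
  assert(rounddown_start1 == 4096);
  assert(roundup_end1 == 20480);  // 4096 + 16384, already 4 KiB aligned

  // Asynchronous part starts where the synchronous range ends, mirroring
  // rounddown_start2 / roundup_end2.
  uint64_t rounddown_start2 = roundup_end1;
  uint64_t roundup_end2 =
      RoundupSketch(rounddown_start2 + readahead, alignment);
  assert(roundup_end2 - rounddown_start2 == 8192);
  return 0;
}
```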
@@ -462,12 +593,18 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
    return false;
  }

  // In case of async_io_, offset can be less than bufs_[curr_].offset_ because
  // the reads are not sequential and PrefetchAsync can be called for any block,
  // and RocksDB will call TryReadFromCacheAsync after PrefetchAsync to Poll for
  // the requested bytes.
  if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_ &&
      prev_len_ != 0) {
  if (explicit_prefetch_submitted_) {
    if (prev_offset_ != offset) {
      // Random offset called. So abort the IOs.
      AbortAllIOs();
      bufs_[curr_].buffer_.Clear();
      bufs_[curr_ ^ 1].buffer_.Clear();
      explicit_prefetch_submitted_ = false;
      return false;
    }
  }

  if (!explicit_prefetch_submitted_ && offset < bufs_[curr_].offset_) {
    return false;
  }
@@ -479,8 +616,11 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
  // If readahead is not enabled: return false.
  TEST_SYNC_POINT_CALLBACK("FilePrefetchBuffer::TryReadFromCache",
                           &readahead_size_);
  if (offset < bufs_[curr_].offset_ ||
      offset + n > bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
  if (explicit_prefetch_submitted_ ||
      (bufs_[curr_].async_read_in_progress_ ||
       offset + n >
           bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize())) {
    if (readahead_size_ > 0) {
      Status s;
      assert(reader != nullptr);
@@ -493,11 +633,11 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
        return false;
      }
    }
      // Prefetch n + readahead_size_/2 synchronously as remaining
      // readahead_size_/2 will be prefetched asynchronously.
      s = PrefetchAsyncInternal(opts, reader, offset, n, readahead_size_ / 2,
                                rate_limiter_priority, copy_to_third_buffer);
      explicit_prefetch_submitted_ = false;
      if (!s.ok()) {
        if (status) {
          *status = s;
@@ -507,11 +647,12 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
#endif
        return false;
      }
      prefetched = async_request_submitted_ ? false : true;
      prefetched = explicit_prefetch_submitted_ ? false : true;
    } else {
      return false;
    }
  }
  UpdateReadPattern(offset, n, false /*decrease_readaheadsize*/);
  uint32_t index = curr_;
@@ -523,14 +664,12 @@ bool FilePrefetchBuffer::TryReadFromCacheAsync(
  if (prefetched) {
    readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
  }
  async_request_submitted_ = false;
  return true;
}
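On a successful prefetch, `TryReadFromCacheAsync` doubles `readahead_size_` up to `max_readahead_size_`. A tiny illustration of that growth schedule follows; the starting and maximum values are made up, the real ones come from the options that configure the prefetch buffer.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>

int main() {
  size_t readahead_size = 8 * 1024;              // assumed initial readahead
  const size_t max_readahead_size = 256 * 1024;  // assumed cap

  // Each time a prefetch is actually issued, the window doubles until it
  // saturates at the cap, mirroring the std::min(max, size * 2) update above.
  for (int prefetch = 1; prefetch <= 8; prefetch++) {
    readahead_size = std::min(max_readahead_size, readahead_size * 2);
    std::cout << "after prefetch " << prefetch << ": " << readahead_size
              << " bytes\n";
  }
  // Prints 16K, 32K, 64K, 128K, 256K, 256K, 256K, 256K (in bytes).
  return 0;
}
```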
void FilePrefetchBuffer::PrefetchAsyncCallback(const FSReadRequest& req,
                                               void* /*cb_arg*/) {
  uint32_t index = curr_ ^ 1;
                                               void* cb_arg) {
  uint32_t index = *(static_cast<uint32_t*>(cb_arg));
#ifndef NDEBUG
  if (req.result.size() < req.len) {
    // Fake an IO error to force db_stress fault injection to ignore
@@ -565,82 +704,133 @@ Status FilePrefetchBuffer::PrefetchAsync(const IOOptions& opts,
  if (!enable_) {
    return Status::NotSupported();
  }

  TEST_SYNC_POINT("FilePrefetchBuffer::PrefetchAsync:Start");
  PollAndUpdateBuffersIfNeeded(offset);

  num_file_reads_ = 0;
  explicit_prefetch_submitted_ = false;
  bool is_eligible_for_prefetching = false;
  if (readahead_size_ > 0 &&
      (!implicit_auto_readahead_ ||
       num_file_reads_ + 1 >= num_file_reads_for_auto_readahead_)) {
    is_eligible_for_prefetching = true;
  }

  // Index of second buffer.
  uint32_t second = curr_ ^ 1;
  // 1. Cancel any pending async read to make the code simpler as buffers can
  //    be out of sync.
  AbortAllIOs();

  // 2. Clear outdated data.
  UpdateBuffersIfNeeded(offset);
  uint32_t second = curr_ ^ 1;
  // Since PrefetchAsync can be called on non-sequential reads, the offset can
  // be less than buffers' offset. In that case it clears the buffer and
  // prefetch that block.
  if (bufs_[curr_].buffer_.CurrentSize() > 0 && offset < bufs_[curr_].offset_) {
  // be less than curr_ buffer's offset. In that case it also clears both
  // buffers.
  if (DoesBufferContainData(curr_) && !IsOffsetInBuffer(offset, curr_)) {
    bufs_[curr_].buffer_.Clear();
    bufs_[second].buffer_.Clear();
  }

  // All requested bytes are already in the curr_ buffer. So no need to Read
  // again.
  if (bufs_[curr_].buffer_.CurrentSize() > 0 &&
      offset + n <= bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize()) {
  UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);

  bool data_found = false;
  // 3. If curr_ has full data.
  if (DoesBufferContainData(curr_) && IsDataBlockInBuffer(offset, n, curr_)) {
    uint64_t offset_in_buffer = offset - bufs_[curr_].offset_;
    *result = Slice(bufs_[curr_].buffer_.BufferStart() + offset_in_buffer, n);
    data_found = true;
    // Update num_file_reads_ as TryReadFromCacheAsync won't be called to poll
    // and update num_file_reads_ when data is found here.
    num_file_reads_++;
    // 3.1 If second also has some data or is not eligible for prefetching,
    //     return.
    if (!is_eligible_for_prefetching || DoesBufferContainData(second)) {
      return Status::OK();
    }
  } else {
    // Partial data in curr_.
    bufs_[curr_].buffer_.Clear();
  }
  bufs_[second].buffer_.Clear();

  Status s;
  size_t alignment = reader->file()->GetRequiredBufferAlignment();
  // TODO akanksha: Handle the scenario when data is overlapping in 2 buffers.
  // Currently, it covers 2 scenarios. Either one buffer (curr_) has no data or
  // it has partial data. It ignores the contents in the second buffer
  // (overlapping data in 2 buffers) and sends the request to re-read that data
  // again.
  // Clear the second buffer in order to do asynchronous prefetching.
  bufs_[second].buffer_.Clear();
  size_t prefetch_size = is_eligible_for_prefetching ? readahead_size_ / 2 : 0;
  size_t offset_to_read = static_cast<size_t>(offset);
  uint64_t rounddown_start = 0;
  uint64_t roundup_end = 0;
  uint64_t rounddown_start1 = 0;
  uint64_t roundup_end1 = 0;
  uint64_t rounddown_start2 = 0;
  uint64_t roundup_end2 = 0;
  uint64_t chunk_len1 = 0;
  uint64_t chunk_len2 = 0;
  size_t read_len1 = 0;
  size_t read_len2 = 0;

  // - If curr_ is empty.
  //   - Call async read for full data + prefetch_size on curr_.
  //   - Call async read for prefetch_size on second if eligible.
  // - If curr_ is filled.
  //   - prefetch_size on second.
  // Calculate length and offsets for reading.
  if (!DoesBufferContainData(curr_)) {
    // Prefetch full data + prefetch_size in curr_.
    rounddown_start1 = Rounddown(offset_to_read, alignment);
    roundup_end1 = Roundup(offset_to_read + n + prefetch_size, alignment);
    uint64_t roundup_len1 = roundup_end1 - rounddown_start1;
    assert(roundup_len1 >= alignment);
    assert(roundup_len1 % alignment == 0);
  if (bufs_[curr_].buffer_.CurrentSize() == 0) {
    // Prefetch full data.
    rounddown_start = Rounddown(offset_to_read, alignment);
    roundup_end = Roundup(offset_to_read + n, alignment);
  } else {
    // Prefetch remaining data.
    size_t rem_length = n - (bufs_[curr_].buffer_.CurrentSize() -
                             (offset - bufs_[curr_].offset_));
    rounddown_start = bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize();
    roundup_end = Roundup(rounddown_start + rem_length, alignment);
    CalculateOffsetAndLen(alignment, rounddown_start1, roundup_len1, curr_,
                          false, chunk_len1);
    assert(chunk_len1 == 0);
    assert(roundup_len1 >= chunk_len1);
    read_len1 = static_cast<size_t>(roundup_len1 - chunk_len1);
    bufs_[curr_].offset_ = rounddown_start1;
  }
  uint64_t roundup_len = roundup_end - rounddown_start;
  assert(roundup_len >= alignment);
  assert(roundup_len % alignment == 0);

  if (is_eligible_for_prefetching) {
    if (DoesBufferContainData(curr_)) {
      rounddown_start2 =
          bufs_[curr_].offset_ + bufs_[curr_].buffer_.CurrentSize();
    } else {
      rounddown_start2 = roundup_end1;
    }

  uint64_t chunk_len = 0;
  CalculateOffsetAndLen(alignment, rounddown_start, roundup_len, second, false,
                        chunk_len);
    roundup_end2 = Roundup(rounddown_start2 + prefetch_size, alignment);
    uint64_t roundup_len2 = roundup_end2 - rounddown_start2;
    assert(roundup_len2 >= alignment);
    CalculateOffsetAndLen(alignment, rounddown_start2, roundup_len2, second,
                          false, chunk_len2);
    assert(chunk_len2 == 0);
    assert(roundup_len2 >= chunk_len2);
    read_len2 = static_cast<size_t>(roundup_len2 - chunk_len2);
    // Update the buffer offset.
  bufs_[second].offset_ = rounddown_start;
  assert(roundup_len >= chunk_len);
  size_t read_len = static_cast<size_t>(roundup_len - chunk_len);
  s = ReadAsync(opts, reader, read_len, chunk_len, rounddown_start, second);
    bufs_[second].offset_ = rounddown_start2;
  }

  if (read_len1) {
    s = ReadAsync(opts, reader, read_len1, rounddown_start1, curr_);
    if (!s.ok()) {
      DestroyAndClearIOHandle(curr_);
      bufs_[curr_].buffer_.Clear();
      return s;
    }
    // Update the read pattern so that TryReadFromCacheAsync can be called to
    // Poll the data. It will return without polling if blocks are not
    // sequential.
    UpdateReadPattern(offset, n, /*decrease_readaheadsize=*/false);
    explicit_prefetch_submitted_ = true;
    prev_len_ = 0;
  async_request_submitted_ = true;
  return Status::TryAgain();
  }

  if (read_len2) {
    s = ReadAsync(opts, reader, read_len2, rounddown_start2, second);
    if (!s.ok()) {
      DestroyAndClearIOHandle(second);
      bufs_[second].buffer_.Clear();
      return s;
    }
    readahead_size_ = std::min(max_readahead_size_, readahead_size_ * 2);
  }
  return (data_found ? Status::OK() : Status::TryAgain());
}
} // namespace ROCKSDB_NAMESPACE
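For orientation, here is a hedged sketch of the calling pattern the return values above imply: `PrefetchAsync` answers `Status::TryAgain()` once it has submitted a read, and the caller later re-requests the same block through `TryReadFromCacheAsync`, which polls for and serves the bytes. Every name below (`FakePrefetchAsync` and friends) is a stand-in written for this example, not a RocksDB API.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>

enum class FakeStatus { kOK, kTryAgain };

// Stand-in for PrefetchAsync: schedules the IO and reports TryAgain.
FakeStatus FakePrefetchAsync(uint64_t offset, size_t n) {
  std::cout << "submit async read for [" << offset << ", " << offset + n
            << ")\n";
  return FakeStatus::kTryAgain;  // data not ready yet, an IO was submitted
}

// Stand-in for TryReadFromCacheAsync: polls the pending IO and copies out.
bool FakeTryReadFromCacheAsync(uint64_t offset, size_t n) {
  std::cout << "poll + copy " << n << " bytes at offset " << offset << "\n";
  return true;  // the real code may also kick off further prefetching here
}

int main() {
  const uint64_t offset = 4096;
  const size_t n = 8192;
  // First attempt: the prefetch buffer has nothing, so the caller schedules
  // the read and retries later instead of blocking here.
  if (FakePrefetchAsync(offset, n) == FakeStatus::kTryAgain) {
    // ... caller does other work, then comes back for the same block ...
    bool served = FakeTryReadFromCacheAsync(offset, n);
    std::cout << (served ? "served from prefetch buffer" : "fell back to Read")
              << "\n";
  }
  return 0;
}
```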