@@ -30,6 +30,7 @@
#include "util/iostats_context_imp.h"
#include "util/rate_limiter.h"
#include "util/sync_point.h"
#include "util/aligned_buffer.h"
#include "util/thread_status_updater.h"
#include "util/thread_status_util.h"
@@ -161,15 +162,6 @@ inline int fsync(HANDLE hFile) {
  return 0;
}
inline size_t TruncateToPageBoundary(size_t page_size, size_t s) {
  s -= (s & (page_size - 1));
  assert((s % page_size) == 0);
  return s;
}
// Roundup x to a multiple of y
inline size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; }
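The behavior of these two helpers is easiest to see with concrete numbers; the values below are purely illustrative, assuming a hypothetical 4096-byte page size:

  // Illustration only: hypothetical 4096-byte pages.
  assert(TruncateToPageBoundary(4096, 10000) == 8192);  // rounds down to a whole page
  assert(TruncateToPageBoundary(4096, 8192) == 8192);   // already aligned, unchanged
  assert(Roundup(10000, 4096) == 12288);                // rounds up to the next page multiple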
// SetFileInformationByHandle() is capable of fast pre-allocation.
// However, this does not change the file end position unless the file is
// truncated and the pre-allocated space is not considered filled with zeros.
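To make the pre-allocation remark concrete, here is a minimal sketch of the Win32 call it refers to; `hFile` and `size` are assumed to be an open handle and the desired reservation, and error handling is reduced to consulting GetLastError():

  // Sketch: reserve `size` bytes for hFile without moving the logical end of file.
  FILE_ALLOCATION_INFO alloc;
  alloc.AllocationSize.QuadPart = static_cast<LONGLONG>(size);
  if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc,
                                  sizeof(alloc))) {
    // GetLastError() describes the failure; either way the visible file
    // length is unchanged, which is the point the comment above makes.
  }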
@@ -492,7 +484,6 @@ class WinMmapFile : public WritableFile {
      size_t n = std::min(left, avail);
      memcpy(dst_, src, n);
      IOSTATS_ADD(bytes_written, n);
      dst_ += n;
      src += n;
      left -= n;
@@ -502,6 +493,12 @@ class WinMmapFile : public WritableFile {
    return Status::OK();
  }
  // Means that Close() will properly take care of truncation
  // and it does not need any additional information
  virtual Status Truncate(uint64_t size) override {
    return Status::OK();
  }
  virtual Status Close() override {
    Status s;
@@ -612,94 +609,6 @@ class WinMmapFile : public WritableFile {
  }
};
// This class is to manage an aligned user
// allocated buffer for unbuffered I/O purposes
// though it does not make a difference if you need a buffer.
class AlignedBuffer {
  const size_t alignment_;
  std::unique_ptr<char[]> buf_;
  size_t capacity_;
  size_t cursize_;
  char* bufstart_;
 public:
  explicit AlignedBuffer(size_t alignment)
      : alignment_(alignment), capacity_(0), cursize_(0), bufstart_(nullptr) {
    assert(alignment > 0);
    assert((alignment & (alignment - 1)) == 0);
  }
  size_t GetAlignment() const { return alignment_; }
  size_t GetCapacity() const { return capacity_; }
  size_t GetCurrentSize() const { return cursize_; }
  const char* GetBufferStart() const { return bufstart_; }
  void Clear() { cursize_ = 0; }
  // Allocates a new buffer and sets bufstart_ to the aligned first byte
  void AllocateNewBuffer(size_t requestedCapacity) {
    size_t size = Roundup(requestedCapacity, alignment_);
    buf_.reset(new char[size + alignment_]);
    char* p = buf_.get();
    bufstart_ = reinterpret_cast<char*>(
        (reinterpret_cast<uintptr_t>(p) + (alignment_ - 1)) &
        ~static_cast<uintptr_t>(alignment_ - 1));
    capacity_ = size;
    cursize_ = 0;
  }
  // Used for write
  // Returns the number of bytes appended
  size_t Append(const char* src, size_t append_size) {
    size_t buffer_remaining = capacity_ - cursize_;
    size_t to_copy = std::min(append_size, buffer_remaining);
    if (to_copy > 0) {
      memcpy(bufstart_ + cursize_, src, to_copy);
      cursize_ += to_copy;
    }
    return to_copy;
  }
  size_t Read(char* dest, size_t offset, size_t read_size) const {
    assert(offset < cursize_);
    size_t to_read = std::min(cursize_ - offset, read_size);
    if (to_read > 0) {
      memcpy(dest, bufstart_ + offset, to_read);
    }
    return to_read;
  }
  // Pad to alignment
  void PadToAlignmentWith(int padding) {
    size_t total_size = Roundup(cursize_, alignment_);
    size_t pad_size = total_size - cursize_;
    if (pad_size > 0) {
      assert((pad_size + cursize_) <= capacity_);
      memset(bufstart_ + cursize_, padding, pad_size);
      cursize_ += pad_size;
    }
  }
  // After a partial flush move the tail to the beginning of the buffer
  void RefitTail(size_t tail_offset, size_t tail_size) {
    if (tail_size > 0) {
      memmove(bufstart_, bufstart_ + tail_offset, tail_size);
    }
    cursize_ = tail_size;
  }
  // Returns place to start writing
  char* GetDestination() { return bufstart_ + cursize_; }
  void SetSize(size_t cursize) { cursize_ = cursize; }
};
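Since the class is being relocated rather than documented here, a short hypothetical usage sketch may help; the 4096-byte alignment, the sizes, and the function name AlignedBufferExample are illustrative assumptions that mirror how WriteUnbuffered() further down drives the buffer:

void AlignedBufferExample(const char* src, size_t src_len) {
  AlignedBuffer buf(4096);               // alignment must be a power of two
  buf.AllocateNewBuffer(10000);          // capacity becomes Roundup(10000, 4096) == 12288
  buf.Append(src, src_len);              // copies at most the free capacity
  size_t whole_pages = TruncateToPageBoundary(4096, buf.GetCurrentSize());
  size_t leftover = buf.GetCurrentSize() - whole_pages;
  buf.PadToAlignmentWith(0);             // zero-fill up to the next page boundary
  // ... issue an aligned write of buf.GetCurrentSize() bytes from
  // buf.GetBufferStart() at a page-aligned file offset ...
  buf.RefitTail(whole_pages, leftover);  // keep the unaligned tail for the next flush
}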
class WinSequentialFile : public SequentialFile {
 private:
  const std::string filename_;
@@ -734,7 +643,7 @@ class WinSequentialFile : public SequentialFile {
    // Windows ReadFile API accepts a DWORD.
    // While it is possible to read in a loop if n is > UINT_MAX
    // it is a highly unlikely case.
    if (n > UINT_MAX) {
    if (n > UINT_MAX) {
      return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER);
    }
@@ -747,8 +656,6 @@ class WinSequentialFile : public SequentialFile {
      return IOErrorFromWindowsError(filename_, GetLastError());
    }
    IOSTATS_ADD(bytes_read, r);
    *result = Slice(scratch, r);
    return s;
@@ -791,12 +698,13 @@ class WinRandomAccessFile : public RandomAccessFile {
      : filename_(fname),
        hFile_(hFile),
        use_os_buffer_(options.use_os_buffer),
        buffer_(alignment),
        buffer_(),
        buffered_start_(0) {
    assert(!options.use_mmap_reads);
    // Unbuffered access, use internal buffer for reads
    if (!use_os_buffer_) {
      buffer_.SetAlignment(alignment);
      // Random read, no need for a big buffer
      // We read things in database blocks which are likely to be similar to
      // the alignment we use.
@@ -854,11 +762,8 @@ class WinRandomAccessFile : public RandomAccessFile {
      }
      SSIZE_T read = 0;
      {
        IOSTATS_TIMER_GUARD(read_nanos);
        read = pread(hFile_, buffer_.GetDestination(), actual_bytes_toread,
                     start_page_start);
      }
      read = pread(hFile_, buffer_.GetDestination(), actual_bytes_toread,
                   start_page_start);
      if (read > 0) {
        buffer_.SetSize(read);
@@ -884,7 +789,6 @@ class WinRandomAccessFile : public RandomAccessFile {
      }
    }
    IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left);
    *result = Slice(scratch, (r < 0) ? 0 : n - left);
    if (r < 0) {
@@ -893,6 +797,10 @@ class WinRandomAccessFile : public RandomAccessFile {
    return s;
  }
  virtual bool ReaderWriterForward() const override {
    return true;
  }
  virtual void Hint(AccessPattern pattern) override {}
  virtual Status InvalidateCache(size_t offset, size_t length) override {
@@ -915,33 +823,23 @@ class WinRandomAccessFile : public RandomAccessFile {
class WinWritableFile : public WritableFile {
 private:
  const std::string filename_;
  HANDLE hFile_;
  AlignedBuffer buffer_;
  uint64_t filesize_;      // How much data is actually written to disk
  uint64_t reservedsize_;  // how far we have reserved space
  bool pending_sync_;
  RateLimiter* rate_limiter_;
  const bool use_os_buffer_;  // Used to indicate unbuffered access, the file
                              // must be opened as unbuffered if false
  HANDLE hFile_;
  const bool use_os_buffer_;  // Used to indicate unbuffered access, the file
  const uint64_t alignment_;
                              // must be opened as unbuffered if false
  uint64_t filesize_;      // How much data is actually written to disk
  uint64_t reservedsize_;  // how far we have reserved space
 public:
  WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment,
                  size_t capacity, const EnvOptions& options)
      : filename_(fname),
        hFile_(hFile),
        buffer_(alignment),
        use_os_buffer_(options.use_os_buffer),
        alignment_(alignment),
        filesize_(0),
        reservedsize_(0),
        pending_sync_(false),
        rate_limiter_(options.rate_limiter),
        use_os_buffer_(options.use_os_buffer) {
        reservedsize_(0) {
    assert(!options.use_mmap_writes);
    buffer_.AllocateNewBuffer(capacity);
  }
  ~WinWritableFile() {
@@ -950,106 +848,84 @@ class WinWritableFile : public WritableFile {
    }
  }
  virtual Status Append(const Slice& data) override {
    const char* src = data.data();
  // Indicates if the class makes use of unbuffered I/O
  virtual bool UseOSBuffer() const override {
    return use_os_buffer_;
  }
    assert(data.size() < INT_MAX);
  virtual size_t GetRequiredBufferAlignment() const override {
    return alignment_;
  }
    size_t left = data.size();
    Status s;
    pending_sync_ = true;
  virtual Status Append(const Slice& data) override {
    // This would call Alloc() if we are out of blocks
    PrepareWrite(GetFileSize(), left);
    // Used for buffered access ONLY
    assert(use_os_buffer_);
    assert(data.size() < std::numeric_limits<int>::max());
    // Flush only when I/O is buffered
    if (use_os_buffer_ &&
        (buffer_.GetCapacity() - buffer_.GetCurrentSize()) < left) {
      if (buffer_.GetCurrentSize() > 0) {
        s = Flush();
        if (!s.ok()) {
          return s;
        }
      }
    Status s;
      if (buffer_.GetCapacity() < c_OneMB) {
        size_t desiredCapacity = buffer_.GetCapacity() * 2;
        desiredCapacity = std::min(desiredCapacity, c_OneMB);
        buffer_.AllocateNewBuffer(desiredCapacity);
      }
    DWORD bytesWritten = 0;
    if (!WriteFile(hFile_, data.data(),
                   data.size(), &bytesWritten, NULL)) {
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError(
          "Failed to WriteFile: " + filename_,
          lastError);
    } else {
      assert(size_t(bytesWritten) == data.size());
      filesize_ += data.size();
    }
    // We always use the internal buffer for the unbuffered I/O
    // or we simply use it for its original purpose to accumulate many small
    // chunks
    if (!use_os_buffer_ || (buffer_.GetCapacity() >= left)) {
      while (left > 0) {
        size_t appended = buffer_.Append(src, left);
        left -= appended;
        src += appended;
    return s;
  }
        if (left > 0) {
          s = Flush();
          if (!s.ok()) {
            break;
          }
  virtual Status Append(const Slice& data, uint64_t offset) override {
    Status s;
          size_t cursize = buffer_.GetCurrentSize();
          size_t capacity = buffer_.GetCapacity();
    SSIZE_T ret = pwrite(hFile_, data.data(),
                         data.size(), offset);
          // We double the buffer here because
          // Flush calls do not keep up with the incoming bytes
          // This is the only place when buffer is changed with unbuffered I/O
          if (cursize == 0 && capacity < c_OneMB) {
            size_t desiredCapacity = capacity * 2;
            desiredCapacity = std::min(desiredCapacity, c_OneMB);
            buffer_.AllocateNewBuffer(desiredCapacity);
          }
        }
      }
    // Error break
    if (ret < 0) {
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError(
          "Failed to pwrite for: " + filename_, lastError);
    } else {
      // Writing directly to file bypassing what is in the buffer
      assert(buffer_.GetCurrentSize() == 0);
      // Use rate limiter for normal I/O very large request if available
      s = WriteBuffered(src, left);
      // With positional write it is not clear at all
      // if this actually extends the filesize
      assert(size_t(ret) == data.size());
      filesize_ += data.size();
    }
    return s;
  }
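The positional Append() above relies on a file-local pwrite() shim that sits outside this hunk. A plausible shape for such a helper, assuming it wraps WriteFile() with an OVERLAPPED offset, is sketched here; the actual helper may differ in detail:

inline SSIZE_T pwrite(HANDLE hFile, const char* src, size_t numBytes,
                      uint64_t offset) {
  // Synchronous positional write: encode the 64-bit offset in OVERLAPPED.
  OVERLAPPED overlapped = {};
  ULARGE_INTEGER pos;
  pos.QuadPart = offset;
  overlapped.Offset = pos.LowPart;
  overlapped.OffsetHigh = pos.HighPart;
  DWORD bytesWritten = 0;
  if (!WriteFile(hFile, src, static_cast<DWORD>(numBytes), &bytesWritten,
                 &overlapped)) {
    return -1;  // callers consult GetLastError()
  }
  return static_cast<SSIZE_T>(bytesWritten);
}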
  // Need to implement this so the file is truncated correctly
  // in both buffered and unbuffered mode
  virtual Status Truncate(uint64_t size) override {
    Status s = ftruncate(filename_, hFile_, size);
    if (s.ok()) {
      filesize_ = size;
    }
    return s;
  }
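Truncate() delegates to a file-local ftruncate() helper that is also not part of this hunk. Assuming it sets the end-of-file marker via SetFileInformationByHandle(), it could look roughly like this sketch; the real helper may report errors differently:

inline Status ftruncate(const std::string& filename, HANDLE hFile,
                        uint64_t toSize) {
  // Sketch only: move the logical end of file to toSize.
  FILE_END_OF_FILE_INFO end_of_file;
  end_of_file.EndOfFile.QuadPart = static_cast<LONGLONG>(toSize);
  if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
                                  sizeof(FILE_END_OF_FILE_INFO))) {
    return IOErrorFromWindowsError("Failed to set EOF for: " + filename,
                                   GetLastError());
  }
  return Status::OK();
}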
  virtual Status Close() override {
    Status s;
    // If there is any data in the cache not written we need to deal with it
    const size_t cursize = buffer_.GetCurrentSize();
    const uint64_t final_size = filesize_ + cursize;
    if (cursize > 0) {
      // If OS buffering is on, we just flush the remainder, otherwise need
      if (!use_os_buffer_) {
        s = WriteUnbuffered();
      } else {
        s = WriteBuffered(buffer_.GetBufferStart(), cursize);
      }
    }
    Status s;
    if (s.ok()) {
      s = ftruncate(filename_, hFile_, final_size);
    }
    assert(INVALID_HANDLE_VALUE != hFile_);
    // Sync data if buffer was flushed
    if (s.ok() && (cursize > 0) && fsync(hFile_) < 0) {
    if (fsync(hFile_) < 0) {
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError("fsync failed at Close() for: " + filename_,
                                  lastError);
                                  lastError);
    }
    if (FALSE == ::CloseHandle(hFile_)) {
      if (s.ok()) {
        auto lastError = GetLastError();
        s = IOErrorFromWindowsError("CloseHandle failed for: " + filename_,
                                    lastError);
      }
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError("CloseHandle failed for: " + filename_,
                                  lastError);
    }
    hFile_ = INVALID_HANDLE_VALUE;
@@ -1057,36 +933,18 @@ class WinWritableFile : public WritableFile {
  }
  // write out the cached data to the OS cache
  // This is now taken care of by the WritableFileWriter
  virtual Status Flush() override {
    Status status;
    if (buffer_.GetCurrentSize() > 0) {
      if (!use_os_buffer_) {
        status = WriteUnbuffered();
      } else {
        status =
            WriteBuffered(buffer_.GetBufferStart(), buffer_.GetCurrentSize());
        if (status.ok()) {
          buffer_.SetSize(0);
        }
      }
    }
    return status;
    return Status::OK();
  }
  virtual Status Sync() override {
    Status s = Flush();
    if (!s.ok()) {
      return s;
    }
    Status s;
    // Calls flush buffers
    if (pending_sync_ && fsync(hFile_) < 0) {
    if (fsync(hFile_) < 0) {
      auto lastError = GetLastError();
      s = IOErrorFromWindowsError("fsync failed at Sync() for: " + filename_,
                                  lastError);
    } else {
      pending_sync_ = false;
    }
    return s;
  }
@@ -1094,7 +952,12 @@
  virtual Status Fsync() override { return Sync(); }
  virtual uint64_t GetFileSize() override {
    return filesize_ + buffer_.GetCurrentSize();
    // Double accounting now here with WritableFileWriter
    // and this size will be wrong when unbuffered access is used
    // but tests implement their own writable files and do not use WritableFileWrapper
    // so we need to squeeze a square peg through
    // a round hole here.
    return filesize_;
  }
  virtual Status Allocate(off_t offset, off_t len) override {
@@ -1104,7 +967,7 @@
    // Make sure that we reserve an aligned amount of space
    // since the reservation block size is driven outside so we want
    // to check if we are ok with reservation here
    size_t spaceToReserve = Roundup(offset + len, buffer_.GetAlignment());
    size_t spaceToReserve = Roundup(offset + len, alignment_);
    // Nothing to do
    if (spaceToReserve <= reservedsize_) {
      return status;
@@ -1117,133 +980,6 @@
    }
    return status;
  }
 private:
  // This method writes to disk the specified data and makes use of the rate
  // limiter
  // if available
  Status WriteBuffered(const char* data, size_t size) {
    Status s;
    assert(use_os_buffer_);
    const char* src = data;
    size_t left = size;
    size_t actually_written = 0;
    while (left > 0) {
      size_t bytes_allowed = RequestToken(left, false);
      DWORD bytesWritten = 0;
      if (!WriteFile(hFile_, src, bytes_allowed, &bytesWritten, NULL)) {
        auto lastError = GetLastError();
        s = IOErrorFromWindowsError(
            "Failed to write buffered via rate_limiter: " + filename_,
            lastError);
        break;
      } else {
        actually_written += bytesWritten;
        src += bytesWritten;
        left -= bytesWritten;
      }
    }
    IOSTATS_ADD(bytes_written, actually_written);
    filesize_ += actually_written;
    return s;
  }
  // This flushes the accumulated data in the buffer. We pad data with zeros if
  // necessary to the whole page.
  // However, during automatic flushes padding would not be necessary.
  // We always use RateLimiter if available. We move (Refit) any buffer bytes
  // that are left over the
  // whole number of pages to be written again on the next flush because we can
  // only write on aligned
  // offsets.
  Status WriteUnbuffered() {
    Status s;
    assert(!use_os_buffer_);
    size_t alignment = buffer_.GetAlignment();
    assert((filesize_ % alignment) == 0);
    // Calculate whole page final file advance if all writes succeed
    size_t file_advance =
        TruncateToPageBoundary(alignment, buffer_.GetCurrentSize());
    // Calculate the leftover tail, we write it here padded with zeros BUT we
    // will write
    // it again in the future either on Close() OR when the current whole page
    // fills out
    size_t leftover_tail = buffer_.GetCurrentSize() - file_advance;
    // Round up and pad
    buffer_.PadToAlignmentWith(0);
    const char* src = buffer_.GetBufferStart();
    size_t left = buffer_.GetCurrentSize();
    uint64_t file_offset = filesize_;
    size_t actually_written = 0;
    while (left > 0) {
      // Request how much is allowed. If this is less than one alignment we may
      // be blocking a lot on every write
      // because we can not write less than one alignment (page) unit thus check
      // the configuration.
      size_t bytes_allowed = RequestToken(left, true);
      SSIZE_T ret = pwrite(hFile_, buffer_.GetBufferStart() + actually_written,
                           bytes_allowed, file_offset);
      // Error break
      if (ret < 0) {
        auto lastError = GetLastError();
        s = IOErrorFromWindowsError(
            "Failed to pwrite for unbuffered: " + filename_, lastError);
        buffer_.SetSize(file_advance + leftover_tail);
        break;
      }
      actually_written += ret;
      file_offset += ret;
      left -= ret;
    }
    IOSTATS_ADD(bytes_written, actually_written);
    if (s.ok()) {
      // Move the tail to the beginning of the buffer
      // This never happens during normal Append but rather during
      // explicit call to Flush()/Sync() or Close()
      buffer_.RefitTail(file_advance, leftover_tail);
      // This is where we start writing next time which may or may not be
      // the actual file size on disk. They match if the buffer size
      // is a multiple of whole pages otherwise filesize_ is leftover_tail
      // behind
      filesize_ += file_advance;
    }
    return s;
  }
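The tail handling above is easier to follow with concrete numbers; the figures below are purely illustrative, assuming a 4 KB alignment:

  // Suppose filesize_ == 8192 and the buffer holds 10000 bytes when
  // WriteUnbuffered() runs:
  //   file_advance  = TruncateToPageBoundary(4096, 10000) == 8192
  //   leftover_tail = 10000 - 8192 == 1808
  // PadToAlignmentWith(0) grows the buffer to 12288 bytes, all of which are
  // written at offset 8192. On success, RefitTail(8192, 1808) moves the last
  // 1808 bytes to the front of the buffer and filesize_ advances only to
  // 16384, so the same tail is written again (page-aligned) by the next
  // flush or by Close().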
  // This truncates the request to a single burst of bytes
  // and then goes through the request to make sure we are
  // satisfied in the order of the I/O priority
  size_t RequestToken(size_t bytes, bool align) const {
    if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) {
      bytes = std::min(
          bytes, static_cast<size_t>(rate_limiter_->GetSingleBurstBytes()));
      if (align) {
        // Here we may actually require more than burst and block
        // but we can not write less than one page at a time on unbuffered
        // thus we may want not to use a rate limiter
        size_t alignment = buffer_.GetAlignment();
        bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes));
      }
      rate_limiter_->Request(bytes, io_priority_);
    }
    return bytes;
  }
};
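For reference, the trimming that RequestToken() performs can be isolated into a small standalone helper; the function name and the numbers in the comments are illustrative assumptions:

// Mirrors RequestToken()'s arithmetic for the unbuffered (align == true) case.
size_t ExampleRequestTrim(size_t bytes, size_t burst_bytes, size_t alignment) {
  bytes = std::min(bytes, burst_bytes);
  // Never ask for less than one page: unbuffered writes cannot be smaller.
  return std::max(alignment, TruncateToPageBoundary(alignment, bytes));
}
// ExampleRequestTrim(3 << 20, 1 << 20, 4096) == 1 << 20   (capped at one burst)
// ExampleRequestTrim(1000,    1 << 20, 4096) == 4096      (clamped up to a page)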
class WinDirectory : public Directory {
@@ -2092,7 +1828,7 @@ class WinEnv : public Env {
    ThreadPool* thread_pool_;
    size_t thread_id_;  // Thread count in the thread.
    explicit BGThreadMetadata(ThreadPool* thread_pool, size_t thread_id)
    BGThreadMetadata(ThreadPool* thread_pool, size_t thread_id)
        : thread_pool_(thread_pool), thread_id_(thread_id) {}
  };