|
|
@ -30,6 +30,7 @@ |
|
|
|
#include "util/iostats_context_imp.h" |
|
|
|
#include "util/iostats_context_imp.h" |
|
|
|
#include "util/rate_limiter.h" |
|
|
|
#include "util/rate_limiter.h" |
|
|
|
#include "util/sync_point.h" |
|
|
|
#include "util/sync_point.h" |
|
|
|
|
|
|
|
#include "util/aligned_buffer.h" |
|
|
|
|
|
|
|
|
|
|
|
#include "util/thread_status_updater.h" |
|
|
|
#include "util/thread_status_updater.h" |
|
|
|
#include "util/thread_status_util.h" |
|
|
|
#include "util/thread_status_util.h" |
|
|
@ -161,15 +162,6 @@ inline int fsync(HANDLE hFile) { |
|
|
|
return 0; |
|
|
|
return 0; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
inline size_t TruncateToPageBoundary(size_t page_size, size_t s) { |
|
|
|
|
|
|
|
s -= (s & (page_size - 1)); |
|
|
|
|
|
|
|
assert((s % page_size) == 0); |
|
|
|
|
|
|
|
return s; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Roundup x to a multiple of y
|
|
|
|
|
|
|
|
inline size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// SetFileInformationByHandle() is capable of fast pre-allocates.
|
|
|
|
// SetFileInformationByHandle() is capable of fast pre-allocates.
|
|
|
|
// However, this does not change the file end position unless the file is
|
|
|
|
// However, this does not change the file end position unless the file is
|
|
|
|
// truncated and the pre-allocated space is not considered filled with zeros.
|
|
|
|
// truncated and the pre-allocated space is not considered filled with zeros.
|
|
|
@ -492,7 +484,6 @@ class WinMmapFile : public WritableFile { |
|
|
|
|
|
|
|
|
|
|
|
size_t n = std::min(left, avail); |
|
|
|
size_t n = std::min(left, avail); |
|
|
|
memcpy(dst_, src, n); |
|
|
|
memcpy(dst_, src, n); |
|
|
|
IOSTATS_ADD(bytes_written, n); |
|
|
|
|
|
|
|
dst_ += n; |
|
|
|
dst_ += n; |
|
|
|
src += n; |
|
|
|
src += n; |
|
|
|
left -= n; |
|
|
|
left -= n; |
|
|
@ -502,6 +493,12 @@ class WinMmapFile : public WritableFile { |
|
|
|
return Status::OK(); |
|
|
|
return Status::OK(); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Means Close() will properly take care of truncate
|
|
|
|
|
|
|
|
// and it does not need any additional information
|
|
|
|
|
|
|
|
virtual Status Truncate(uint64_t size) override { |
|
|
|
|
|
|
|
return Status::OK(); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
virtual Status Close() override { |
|
|
|
virtual Status Close() override { |
|
|
|
Status s; |
|
|
|
Status s; |
|
|
|
|
|
|
|
|
|
|
@ -612,94 +609,6 @@ class WinMmapFile : public WritableFile { |
|
|
|
} |
|
|
|
} |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
// This class is to manage an aligned user
|
|
|
|
|
|
|
|
// allocated buffer for unbuffered I/O purposes
|
|
|
|
|
|
|
|
// though it does not make a difference if you need a buffer.
|
|
|
|
|
|
|
|
class AlignedBuffer { |
|
|
|
|
|
|
|
const size_t alignment_; |
|
|
|
|
|
|
|
std::unique_ptr<char[]> buf_; |
|
|
|
|
|
|
|
size_t capacity_; |
|
|
|
|
|
|
|
size_t cursize_; |
|
|
|
|
|
|
|
char* bufstart_; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public: |
|
|
|
|
|
|
|
explicit AlignedBuffer(size_t alignment) |
|
|
|
|
|
|
|
: alignment_(alignment), capacity_(0), cursize_(0), bufstart_(nullptr) { |
|
|
|
|
|
|
|
assert(alignment > 0); |
|
|
|
|
|
|
|
assert((alignment & (alignment - 1)) == 0); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t GetAlignment() const { return alignment_; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t GetCapacity() const { return capacity_; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t GetCurrentSize() const { return cursize_; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const char* GetBufferStart() const { return bufstart_; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void Clear() { cursize_ = 0; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Allocates a new buffer and sets bufstart_ to the aligned first byte
|
|
|
|
|
|
|
|
void AllocateNewBuffer(size_t requestedCapacity) { |
|
|
|
|
|
|
|
size_t size = Roundup(requestedCapacity, alignment_); |
|
|
|
|
|
|
|
buf_.reset(new char[size + alignment_]); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
char* p = buf_.get(); |
|
|
|
|
|
|
|
bufstart_ = reinterpret_cast<char*>( |
|
|
|
|
|
|
|
(reinterpret_cast<uintptr_t>(p) + (alignment_ - 1)) & |
|
|
|
|
|
|
|
~static_cast<uintptr_t>(alignment_ - 1)); |
|
|
|
|
|
|
|
capacity_ = size; |
|
|
|
|
|
|
|
cursize_ = 0; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Used for write
|
|
|
|
|
|
|
|
// Returns the number of bytes appended
|
|
|
|
|
|
|
|
size_t Append(const char* src, size_t append_size) { |
|
|
|
|
|
|
|
size_t buffer_remaining = capacity_ - cursize_; |
|
|
|
|
|
|
|
size_t to_copy = std::min(append_size, buffer_remaining); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (to_copy > 0) { |
|
|
|
|
|
|
|
memcpy(bufstart_ + cursize_, src, to_copy); |
|
|
|
|
|
|
|
cursize_ += to_copy; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
return to_copy; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t Read(char* dest, size_t offset, size_t read_size) const { |
|
|
|
|
|
|
|
assert(offset < cursize_); |
|
|
|
|
|
|
|
size_t to_read = std::min(cursize_ - offset, read_size); |
|
|
|
|
|
|
|
if (to_read > 0) { |
|
|
|
|
|
|
|
memcpy(dest, bufstart_ + offset, to_read); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
return to_read; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Pad to alignment
|
|
|
|
|
|
|
|
void PadToAlignmentWith(int padding) { |
|
|
|
|
|
|
|
size_t total_size = Roundup(cursize_, alignment_); |
|
|
|
|
|
|
|
size_t pad_size = total_size - cursize_; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (pad_size > 0) { |
|
|
|
|
|
|
|
assert((pad_size + cursize_) <= capacity_); |
|
|
|
|
|
|
|
memset(bufstart_ + cursize_, padding, pad_size); |
|
|
|
|
|
|
|
cursize_ += pad_size; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// After a partial flush move the tail to the beginning of the buffer
|
|
|
|
|
|
|
|
void RefitTail(size_t tail_offset, size_t tail_size) { |
|
|
|
|
|
|
|
if (tail_size > 0) { |
|
|
|
|
|
|
|
memmove(bufstart_, bufstart_ + tail_offset, tail_size); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
cursize_ = tail_size; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Returns place to start writing
|
|
|
|
|
|
|
|
char* GetDestination() { return bufstart_ + cursize_; } |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void SetSize(size_t cursize) { cursize_ = cursize; } |
|
|
|
|
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class WinSequentialFile : public SequentialFile { |
|
|
|
class WinSequentialFile : public SequentialFile { |
|
|
|
private: |
|
|
|
private: |
|
|
|
const std::string filename_; |
|
|
|
const std::string filename_; |
|
|
@ -734,7 +643,7 @@ class WinSequentialFile : public SequentialFile { |
|
|
|
// Windows ReadFile API accepts a DWORD.
|
|
|
|
// Windows ReadFile API accepts a DWORD.
|
|
|
|
// While it is possible to read in a loop if n is > UINT_MAX
|
|
|
|
// While it is possible to read in a loop if n is > UINT_MAX
|
|
|
|
// it is a highly unlikely case.
|
|
|
|
// it is a highly unlikely case.
|
|
|
|
if (n > UINT_MAX) {
|
|
|
|
if (n > UINT_MAX) { |
|
|
|
return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER); |
|
|
|
return IOErrorFromWindowsError(filename_, ERROR_INVALID_PARAMETER); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -747,8 +656,6 @@ class WinSequentialFile : public SequentialFile { |
|
|
|
return IOErrorFromWindowsError(filename_, GetLastError()); |
|
|
|
return IOErrorFromWindowsError(filename_, GetLastError()); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
IOSTATS_ADD(bytes_read, r); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
*result = Slice(scratch, r); |
|
|
|
*result = Slice(scratch, r); |
|
|
|
|
|
|
|
|
|
|
|
return s; |
|
|
|
return s; |
|
|
@ -791,12 +698,13 @@ class WinRandomAccessFile : public RandomAccessFile { |
|
|
|
: filename_(fname), |
|
|
|
: filename_(fname), |
|
|
|
hFile_(hFile), |
|
|
|
hFile_(hFile), |
|
|
|
use_os_buffer_(options.use_os_buffer), |
|
|
|
use_os_buffer_(options.use_os_buffer), |
|
|
|
buffer_(alignment), |
|
|
|
buffer_(), |
|
|
|
buffered_start_(0) { |
|
|
|
buffered_start_(0) { |
|
|
|
assert(!options.use_mmap_reads); |
|
|
|
assert(!options.use_mmap_reads); |
|
|
|
|
|
|
|
|
|
|
|
// Unbuffered access, use internal buffer for reads
|
|
|
|
// Unbuffered access, use internal buffer for reads
|
|
|
|
if (!use_os_buffer_) { |
|
|
|
if (!use_os_buffer_) { |
|
|
|
|
|
|
|
buffer_.Alignment(alignment); |
|
|
|
// Random read, no need in a big buffer
|
|
|
|
// Random read, no need in a big buffer
|
|
|
|
// We read things in database blocks which are likely to be similar to
|
|
|
|
// We read things in database blocks which are likely to be similar to
|
|
|
|
// the alignment we use.
|
|
|
|
// the alignment we use.
|
|
|
@ -826,7 +734,7 @@ class WinRandomAccessFile : public RandomAccessFile { |
|
|
|
// Let's see if at least some of the requested data is already
|
|
|
|
// Let's see if at least some of the requested data is already
|
|
|
|
// in the buffer
|
|
|
|
// in the buffer
|
|
|
|
if (offset >= buffered_start_ && |
|
|
|
if (offset >= buffered_start_ && |
|
|
|
offset < (buffered_start_ + buffer_.GetCurrentSize())) { |
|
|
|
offset < (buffered_start_ + buffer_.CurrentSize())) { |
|
|
|
size_t buffer_offset = offset - buffered_start_; |
|
|
|
size_t buffer_offset = offset - buffered_start_; |
|
|
|
r = buffer_.Read(dest, buffer_offset, left); |
|
|
|
r = buffer_.Read(dest, buffer_offset, left); |
|
|
|
assert(r >= 0); |
|
|
|
assert(r >= 0); |
|
|
@ -839,7 +747,7 @@ class WinRandomAccessFile : public RandomAccessFile { |
|
|
|
// Still some left or none was buffered
|
|
|
|
// Still some left or none was buffered
|
|
|
|
if (left > 0) { |
|
|
|
if (left > 0) { |
|
|
|
// Figure out the start/end offset for reading and amount to read
|
|
|
|
// Figure out the start/end offset for reading and amount to read
|
|
|
|
const size_t alignment = buffer_.GetAlignment(); |
|
|
|
const size_t alignment = buffer_.Alignment(); |
|
|
|
const size_t start_page_start = |
|
|
|
const size_t start_page_start = |
|
|
|
TruncateToPageBoundary(alignment, offset); |
|
|
|
TruncateToPageBoundary(alignment, offset); |
|
|
|
const size_t end_page_start = |
|
|
|
const size_t end_page_start = |
|
|
@ -847,21 +755,18 @@ class WinRandomAccessFile : public RandomAccessFile { |
|
|
|
const size_t actual_bytes_toread = |
|
|
|
const size_t actual_bytes_toread = |
|
|
|
(end_page_start - start_page_start) + alignment; |
|
|
|
(end_page_start - start_page_start) + alignment; |
|
|
|
|
|
|
|
|
|
|
|
if (buffer_.GetCapacity() < actual_bytes_toread) { |
|
|
|
if (buffer_.Capacity() < actual_bytes_toread) { |
|
|
|
buffer_.AllocateNewBuffer(actual_bytes_toread); |
|
|
|
buffer_.AllocateNewBuffer(actual_bytes_toread); |
|
|
|
} else { |
|
|
|
} else { |
|
|
|
buffer_.Clear(); |
|
|
|
buffer_.Clear(); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
SSIZE_T read = 0; |
|
|
|
SSIZE_T read = 0; |
|
|
|
{ |
|
|
|
read = pread(hFile_, buffer_.Destination(), actual_bytes_toread, |
|
|
|
IOSTATS_TIMER_GUARD(read_nanos); |
|
|
|
start_page_start); |
|
|
|
read = pread(hFile_, buffer_.GetDestination(), actual_bytes_toread, |
|
|
|
|
|
|
|
start_page_start); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (read > 0) { |
|
|
|
if (read > 0) { |
|
|
|
buffer_.SetSize(read); |
|
|
|
buffer_.Size(read); |
|
|
|
buffered_start_ = start_page_start; |
|
|
|
buffered_start_ = start_page_start; |
|
|
|
|
|
|
|
|
|
|
|
// Let's figure out how much we read from the users standpoint
|
|
|
|
// Let's figure out how much we read from the users standpoint
|
|
|
@ -884,7 +789,6 @@ class WinRandomAccessFile : public RandomAccessFile { |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left); |
|
|
|
|
|
|
|
*result = Slice(scratch, (r < 0) ? 0 : n - left); |
|
|
|
*result = Slice(scratch, (r < 0) ? 0 : n - left); |
|
|
|
|
|
|
|
|
|
|
|
if (r < 0) { |
|
|
|
if (r < 0) { |
|
|
@ -893,6 +797,10 @@ class WinRandomAccessFile : public RandomAccessFile { |
|
|
|
return s; |
|
|
|
return s; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
virtual bool ShouldForwardRawRequest() const override { |
|
|
|
|
|
|
|
return true; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
virtual void Hint(AccessPattern pattern) override {} |
|
|
|
virtual void Hint(AccessPattern pattern) override {} |
|
|
|
|
|
|
|
|
|
|
|
virtual Status InvalidateCache(size_t offset, size_t length) override { |
|
|
|
virtual Status InvalidateCache(size_t offset, size_t length) override { |
|
|
@ -915,33 +823,23 @@ class WinRandomAccessFile : public RandomAccessFile { |
|
|
|
class WinWritableFile : public WritableFile { |
|
|
|
class WinWritableFile : public WritableFile { |
|
|
|
private: |
|
|
|
private: |
|
|
|
const std::string filename_; |
|
|
|
const std::string filename_; |
|
|
|
HANDLE hFile_; |
|
|
|
HANDLE hFile_; |
|
|
|
AlignedBuffer buffer_; |
|
|
|
const bool use_os_buffer_; // Used to indicate unbuffered access, the file
|
|
|
|
|
|
|
|
const uint64_t alignment_; |
|
|
|
uint64_t filesize_; // How much data is actually written disk
|
|
|
|
// must be opened as unbuffered if false
|
|
|
|
uint64_t reservedsize_; // how far we have reserved space
|
|
|
|
uint64_t filesize_; // How much data is actually written disk
|
|
|
|
|
|
|
|
uint64_t reservedsize_; // how far we have reserved space
|
|
|
|
bool pending_sync_; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
RateLimiter* rate_limiter_; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const bool use_os_buffer_; // Used to indicate unbuffered access, the file
|
|
|
|
|
|
|
|
// must be opened as unbuffered if false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public: |
|
|
|
public: |
|
|
|
WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, |
|
|
|
WinWritableFile(const std::string& fname, HANDLE hFile, size_t alignment, |
|
|
|
size_t capacity, const EnvOptions& options) |
|
|
|
size_t capacity, const EnvOptions& options) |
|
|
|
: filename_(fname), |
|
|
|
: filename_(fname), |
|
|
|
hFile_(hFile), |
|
|
|
hFile_(hFile), |
|
|
|
buffer_(alignment), |
|
|
|
use_os_buffer_(options.use_os_buffer), |
|
|
|
|
|
|
|
alignment_(alignment), |
|
|
|
filesize_(0), |
|
|
|
filesize_(0), |
|
|
|
reservedsize_(0), |
|
|
|
reservedsize_(0) { |
|
|
|
pending_sync_(false), |
|
|
|
|
|
|
|
rate_limiter_(options.rate_limiter), |
|
|
|
|
|
|
|
use_os_buffer_(options.use_os_buffer) { |
|
|
|
|
|
|
|
assert(!options.use_mmap_writes); |
|
|
|
assert(!options.use_mmap_writes); |
|
|
|
|
|
|
|
|
|
|
|
buffer_.AllocateNewBuffer(capacity); |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
~WinWritableFile() { |
|
|
|
~WinWritableFile() { |
|
|
@ -950,106 +848,84 @@ class WinWritableFile : public WritableFile { |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
virtual Status Append(const Slice& data) override { |
|
|
|
// Indicates if the class makes use of unbuffered I/O
|
|
|
|
const char* src = data.data(); |
|
|
|
virtual bool UseOSBuffer() const override { |
|
|
|
|
|
|
|
return use_os_buffer_; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
assert(data.size() < INT_MAX); |
|
|
|
virtual size_t GetRequiredBufferAlignment() const override { |
|
|
|
|
|
|
|
return alignment_; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
size_t left = data.size(); |
|
|
|
virtual Status Append(const Slice& data) override { |
|
|
|
Status s; |
|
|
|
|
|
|
|
pending_sync_ = true; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// This would call Alloc() if we are out of blocks
|
|
|
|
// Used for buffered access ONLY
|
|
|
|
PrepareWrite(GetFileSize(), left); |
|
|
|
assert(use_os_buffer_); |
|
|
|
|
|
|
|
assert(data.size() < std::numeric_limits<int>::max()); |
|
|
|
|
|
|
|
|
|
|
|
// Flush only when I/O is buffered
|
|
|
|
Status s; |
|
|
|
if (use_os_buffer_ && |
|
|
|
|
|
|
|
(buffer_.GetCapacity() - buffer_.GetCurrentSize()) < left) { |
|
|
|
|
|
|
|
if (buffer_.GetCurrentSize() > 0) { |
|
|
|
|
|
|
|
s = Flush(); |
|
|
|
|
|
|
|
if (!s.ok()) { |
|
|
|
|
|
|
|
return s; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (buffer_.GetCapacity() < c_OneMB) { |
|
|
|
DWORD bytesWritten = 0; |
|
|
|
size_t desiredCapacity = buffer_.GetCapacity() * 2; |
|
|
|
if (!WriteFile(hFile_, data.data(), |
|
|
|
desiredCapacity = std::min(desiredCapacity, c_OneMB); |
|
|
|
data.size(), &bytesWritten, NULL)) { |
|
|
|
buffer_.AllocateNewBuffer(desiredCapacity); |
|
|
|
auto lastError = GetLastError(); |
|
|
|
} |
|
|
|
s = IOErrorFromWindowsError( |
|
|
|
|
|
|
|
"Failed to WriteFile: " + filename_, |
|
|
|
|
|
|
|
lastError); |
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
assert(size_t(bytesWritten) == data.size()); |
|
|
|
|
|
|
|
filesize_ += data.size(); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// We always use the internal buffer for the unbuffered I/O
|
|
|
|
return s; |
|
|
|
// or we simply use it for its original purpose to accumulate many small
|
|
|
|
} |
|
|
|
// chunks
|
|
|
|
|
|
|
|
if (!use_os_buffer_ || (buffer_.GetCapacity() >= left)) { |
|
|
|
|
|
|
|
while (left > 0) { |
|
|
|
|
|
|
|
size_t appended = buffer_.Append(src, left); |
|
|
|
|
|
|
|
left -= appended; |
|
|
|
|
|
|
|
src += appended; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (left > 0) { |
|
|
|
virtual Status PositionedAppend(const Slice& data, uint64_t offset) override { |
|
|
|
s = Flush(); |
|
|
|
Status s; |
|
|
|
if (!s.ok()) { |
|
|
|
|
|
|
|
break; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t cursize = buffer_.GetCurrentSize(); |
|
|
|
SSIZE_T ret = pwrite(hFile_, data.data(),
|
|
|
|
size_t capacity = buffer_.GetCapacity(); |
|
|
|
data.size(), offset); |
|
|
|
|
|
|
|
|
|
|
|
// We double the buffer here because
|
|
|
|
// Error break
|
|
|
|
// Flush calls do not keep up with the incoming bytes
|
|
|
|
if (ret < 0) { |
|
|
|
// This is the only place when buffer is changed with unbuffered I/O
|
|
|
|
auto lastError = GetLastError(); |
|
|
|
if (cursize == 0 && capacity < c_OneMB) { |
|
|
|
s = IOErrorFromWindowsError( |
|
|
|
size_t desiredCapacity = capacity * 2; |
|
|
|
"Failed to pwrite for: " + filename_, lastError); |
|
|
|
desiredCapacity = std::min(desiredCapacity, c_OneMB); |
|
|
|
|
|
|
|
buffer_.AllocateNewBuffer(desiredCapacity); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} else { |
|
|
|
} else { |
|
|
|
// Writing directly to file bypassing what is in the buffer
|
|
|
|
// With positional write it is not clear at all
|
|
|
|
assert(buffer_.GetCurrentSize() == 0); |
|
|
|
// if this actually extends the filesize
|
|
|
|
// Use rate limiter for normal I/O very large request if available
|
|
|
|
assert(size_t(ret) == data.size()); |
|
|
|
s = WriteBuffered(src, left); |
|
|
|
filesize_ += data.size(); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
return s; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Need to implement this so the file is truncated correctly
|
|
|
|
|
|
|
|
// when buffered and unbuffered mode
|
|
|
|
|
|
|
|
virtual Status Truncate(uint64_t size) override { |
|
|
|
|
|
|
|
Status s = ftruncate(filename_, hFile_, size); |
|
|
|
|
|
|
|
if (s.ok()) { |
|
|
|
|
|
|
|
filesize_ = size; |
|
|
|
|
|
|
|
} |
|
|
|
return s; |
|
|
|
return s; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
virtual Status Close() override { |
|
|
|
virtual Status Close() override { |
|
|
|
Status s; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// If there is any data in the cache not written we need to deal with it
|
|
|
|
Status s; |
|
|
|
const size_t cursize = buffer_.GetCurrentSize(); |
|
|
|
|
|
|
|
const uint64_t final_size = filesize_ + cursize; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (cursize > 0) { |
|
|
|
|
|
|
|
// If OS buffering is on, we just flush the remainder, otherwise need
|
|
|
|
|
|
|
|
if (!use_os_buffer_) { |
|
|
|
|
|
|
|
s = WriteUnbuffered(); |
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
s = WriteBuffered(buffer_.GetBufferStart(), cursize); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (s.ok()) { |
|
|
|
assert(INVALID_HANDLE_VALUE != hFile_); |
|
|
|
s = ftruncate(filename_, hFile_, final_size); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Sync data if buffer was flushed
|
|
|
|
if (fsync(hFile_) < 0) { |
|
|
|
if (s.ok() && (cursize > 0) && fsync(hFile_) < 0) { |
|
|
|
|
|
|
|
auto lastError = GetLastError(); |
|
|
|
auto lastError = GetLastError(); |
|
|
|
s = IOErrorFromWindowsError("fsync failed at Close() for: " + filename_, |
|
|
|
s = IOErrorFromWindowsError("fsync failed at Close() for: " + filename_, |
|
|
|
lastError); |
|
|
|
lastError); |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
if (FALSE == ::CloseHandle(hFile_)) { |
|
|
|
if (FALSE == ::CloseHandle(hFile_)) { |
|
|
|
if (s.ok()) { |
|
|
|
auto lastError = GetLastError(); |
|
|
|
auto lastError = GetLastError(); |
|
|
|
s = IOErrorFromWindowsError("CloseHandle failed for: " + filename_, |
|
|
|
s = IOErrorFromWindowsError("CloseHandle failed for: " + filename_, |
|
|
|
lastError); |
|
|
|
lastError); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
hFile_ = INVALID_HANDLE_VALUE; |
|
|
|
hFile_ = INVALID_HANDLE_VALUE; |
|
|
@ -1057,36 +933,18 @@ class WinWritableFile : public WritableFile { |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// write out the cached data to the OS cache
|
|
|
|
// write out the cached data to the OS cache
|
|
|
|
|
|
|
|
// This is now taken care of the WritableFileWriter
|
|
|
|
virtual Status Flush() override { |
|
|
|
virtual Status Flush() override { |
|
|
|
Status status; |
|
|
|
return Status::OK(); |
|
|
|
|
|
|
|
|
|
|
|
if (buffer_.GetCurrentSize() > 0) { |
|
|
|
|
|
|
|
if (!use_os_buffer_) { |
|
|
|
|
|
|
|
status = WriteUnbuffered(); |
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
status = |
|
|
|
|
|
|
|
WriteBuffered(buffer_.GetBufferStart(), buffer_.GetCurrentSize()); |
|
|
|
|
|
|
|
if (status.ok()) { |
|
|
|
|
|
|
|
buffer_.SetSize(0); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
return status; |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
virtual Status Sync() override { |
|
|
|
virtual Status Sync() override { |
|
|
|
Status s = Flush(); |
|
|
|
Status s; |
|
|
|
if (!s.ok()) { |
|
|
|
|
|
|
|
return s; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Calls flush buffers
|
|
|
|
// Calls flush buffers
|
|
|
|
if (pending_sync_ && fsync(hFile_) < 0) { |
|
|
|
if (fsync(hFile_) < 0) { |
|
|
|
auto lastError = GetLastError(); |
|
|
|
auto lastError = GetLastError(); |
|
|
|
s = IOErrorFromWindowsError("fsync failed at Sync() for: " + filename_, |
|
|
|
s = IOErrorFromWindowsError("fsync failed at Sync() for: " + filename_, |
|
|
|
lastError); |
|
|
|
lastError); |
|
|
|
} else { |
|
|
|
|
|
|
|
pending_sync_ = false; |
|
|
|
|
|
|
|
} |
|
|
|
} |
|
|
|
return s; |
|
|
|
return s; |
|
|
|
} |
|
|
|
} |
|
|
@ -1094,7 +952,12 @@ class WinWritableFile : public WritableFile { |
|
|
|
virtual Status Fsync() override { return Sync(); } |
|
|
|
virtual Status Fsync() override { return Sync(); } |
|
|
|
|
|
|
|
|
|
|
|
virtual uint64_t GetFileSize() override { |
|
|
|
virtual uint64_t GetFileSize() override { |
|
|
|
return filesize_ + buffer_.GetCurrentSize(); |
|
|
|
// Double accounting now here with WritableFileWriter
|
|
|
|
|
|
|
|
// and this size will be wrong when unbuffered access is used
|
|
|
|
|
|
|
|
// but tests implement their own writable files and do not use WritableFileWrapper
|
|
|
|
|
|
|
|
// so we need to squeeze a square peg through
|
|
|
|
|
|
|
|
// a round hole here.
|
|
|
|
|
|
|
|
return filesize_; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
virtual Status Allocate(off_t offset, off_t len) override { |
|
|
|
virtual Status Allocate(off_t offset, off_t len) override { |
|
|
@ -1104,7 +967,7 @@ class WinWritableFile : public WritableFile { |
|
|
|
// Make sure that we reserve an aligned amount of space
|
|
|
|
// Make sure that we reserve an aligned amount of space
|
|
|
|
// since the reservation block size is driven outside so we want
|
|
|
|
// since the reservation block size is driven outside so we want
|
|
|
|
// to check if we are ok with reservation here
|
|
|
|
// to check if we are ok with reservation here
|
|
|
|
size_t spaceToReserve = Roundup(offset + len, buffer_.GetAlignment()); |
|
|
|
size_t spaceToReserve = Roundup(offset + len, alignment_); |
|
|
|
// Nothing to do
|
|
|
|
// Nothing to do
|
|
|
|
if (spaceToReserve <= reservedsize_) { |
|
|
|
if (spaceToReserve <= reservedsize_) { |
|
|
|
return status; |
|
|
|
return status; |
|
|
@ -1117,133 +980,6 @@ class WinWritableFile : public WritableFile { |
|
|
|
} |
|
|
|
} |
|
|
|
return status; |
|
|
|
return status; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
private: |
|
|
|
|
|
|
|
// This method writes to disk the specified data and makes use of the rate
|
|
|
|
|
|
|
|
// limiter
|
|
|
|
|
|
|
|
// if available
|
|
|
|
|
|
|
|
Status WriteBuffered(const char* data, size_t size) { |
|
|
|
|
|
|
|
Status s; |
|
|
|
|
|
|
|
assert(use_os_buffer_); |
|
|
|
|
|
|
|
const char* src = data; |
|
|
|
|
|
|
|
size_t left = size; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
size_t actually_written = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
while (left > 0) { |
|
|
|
|
|
|
|
size_t bytes_allowed = RequestToken(left, false); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
DWORD bytesWritten = 0; |
|
|
|
|
|
|
|
if (!WriteFile(hFile_, src, bytes_allowed, &bytesWritten, NULL)) { |
|
|
|
|
|
|
|
auto lastError = GetLastError(); |
|
|
|
|
|
|
|
s = IOErrorFromWindowsError( |
|
|
|
|
|
|
|
"Failed to write buffered via rate_limiter: " + filename_, |
|
|
|
|
|
|
|
lastError); |
|
|
|
|
|
|
|
break; |
|
|
|
|
|
|
|
} else { |
|
|
|
|
|
|
|
actually_written += bytesWritten; |
|
|
|
|
|
|
|
src += bytesWritten; |
|
|
|
|
|
|
|
left -= bytesWritten; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
IOSTATS_ADD(bytes_written, actually_written); |
|
|
|
|
|
|
|
filesize_ += actually_written; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return s; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// This flushes the accumulated data in the buffer. We pad data with zeros if
|
|
|
|
|
|
|
|
// necessary to the whole page.
|
|
|
|
|
|
|
|
// However, during automatic flushes padding would not be necessary.
|
|
|
|
|
|
|
|
// We always use RateLimiter if available. We move (Refit) any buffer bytes
|
|
|
|
|
|
|
|
// that are left over the
|
|
|
|
|
|
|
|
// whole number of pages to be written again on the next flush because we can
|
|
|
|
|
|
|
|
// only write on aligned
|
|
|
|
|
|
|
|
// offsets.
|
|
|
|
|
|
|
|
Status WriteUnbuffered() { |
|
|
|
|
|
|
|
Status s; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
assert(!use_os_buffer_); |
|
|
|
|
|
|
|
size_t alignment = buffer_.GetAlignment(); |
|
|
|
|
|
|
|
assert((filesize_ % alignment) == 0); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Calculate whole page final file advance if all writes succeed
|
|
|
|
|
|
|
|
size_t file_advance = |
|
|
|
|
|
|
|
TruncateToPageBoundary(alignment, buffer_.GetCurrentSize()); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Calculate the leftover tail, we write it here padded with zeros BUT we
|
|
|
|
|
|
|
|
// will write
|
|
|
|
|
|
|
|
// it again in the future either on Close() OR when the current whole page
|
|
|
|
|
|
|
|
// fills out
|
|
|
|
|
|
|
|
size_t leftover_tail = buffer_.GetCurrentSize() - file_advance; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Round up and pad
|
|
|
|
|
|
|
|
buffer_.PadToAlignmentWith(0); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
const char* src = buffer_.GetBufferStart(); |
|
|
|
|
|
|
|
size_t left = buffer_.GetCurrentSize(); |
|
|
|
|
|
|
|
uint64_t file_offset = filesize_; |
|
|
|
|
|
|
|
size_t actually_written = 0; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
while (left > 0) { |
|
|
|
|
|
|
|
// Request how much is allowed. If this is less than one alignment we may
|
|
|
|
|
|
|
|
// be blocking a lot on every write
|
|
|
|
|
|
|
|
// because we can not write less than one alignment (page) unit thus check
|
|
|
|
|
|
|
|
// the configuration.
|
|
|
|
|
|
|
|
size_t bytes_allowed = RequestToken(left, true); |
|
|
|
|
|
|
|
SSIZE_T ret = pwrite(hFile_, buffer_.GetBufferStart() + actually_written, |
|
|
|
|
|
|
|
bytes_allowed, file_offset); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// Error break
|
|
|
|
|
|
|
|
if (ret < 0) { |
|
|
|
|
|
|
|
auto lastError = GetLastError(); |
|
|
|
|
|
|
|
s = IOErrorFromWindowsError( |
|
|
|
|
|
|
|
"Failed to pwrite for unbuffered: " + filename_, lastError); |
|
|
|
|
|
|
|
buffer_.SetSize(file_advance + leftover_tail); |
|
|
|
|
|
|
|
break; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
actually_written += ret; |
|
|
|
|
|
|
|
file_offset += ret; |
|
|
|
|
|
|
|
left -= ret; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
IOSTATS_ADD(bytes_written, actually_written); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (s.ok()) { |
|
|
|
|
|
|
|
// Move the tail to the beginning of the buffer
|
|
|
|
|
|
|
|
// This never happens during normal Append but rather during
|
|
|
|
|
|
|
|
// explicit call to Flush()/Sync() or Close()
|
|
|
|
|
|
|
|
buffer_.RefitTail(file_advance, leftover_tail); |
|
|
|
|
|
|
|
// This is where we start writing next time which may or not be
|
|
|
|
|
|
|
|
// the actual file size on disk. They match if the buffer size
|
|
|
|
|
|
|
|
// is a multiple of whole pages otherwise filesize_ is leftover_tail
|
|
|
|
|
|
|
|
// behind
|
|
|
|
|
|
|
|
filesize_ += file_advance; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
return s; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// This truncates the request to a single burst bytes
|
|
|
|
|
|
|
|
// and then goes through the request to make sure we are
|
|
|
|
|
|
|
|
// satisfied in the order of the I/O priority
|
|
|
|
|
|
|
|
size_t RequestToken(size_t bytes, bool align) const { |
|
|
|
|
|
|
|
if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) { |
|
|
|
|
|
|
|
bytes = std::min( |
|
|
|
|
|
|
|
bytes, static_cast<size_t>(rate_limiter_->GetSingleBurstBytes())); |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (align) { |
|
|
|
|
|
|
|
// Here we may actually require more than burst and block
|
|
|
|
|
|
|
|
// but we can not write less than one page at a time on unbuffered
|
|
|
|
|
|
|
|
// thus we may want not to use ratelimiter s
|
|
|
|
|
|
|
|
size_t alignment = buffer_.GetAlignment(); |
|
|
|
|
|
|
|
bytes = std::max(alignment, TruncateToPageBoundary(alignment, bytes)); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
rate_limiter_->Request(bytes, io_priority_); |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
return bytes; |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
class WinDirectory : public Directory { |
|
|
|
class WinDirectory : public Directory { |
|
|
@ -2092,7 +1828,7 @@ class WinEnv : public Env { |
|
|
|
ThreadPool* thread_pool_; |
|
|
|
ThreadPool* thread_pool_; |
|
|
|
size_t thread_id_; // Thread count in the thread.
|
|
|
|
size_t thread_id_; // Thread count in the thread.
|
|
|
|
|
|
|
|
|
|
|
|
explicit BGThreadMetadata(ThreadPool* thread_pool, size_t thread_id) |
|
|
|
BGThreadMetadata(ThreadPool* thread_pool, size_t thread_id) |
|
|
|
: thread_pool_(thread_pool), thread_id_(thread_id) {} |
|
|
|
: thread_pool_(thread_pool), thread_id_(thread_id) {} |
|
|
|
}; |
|
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|