Enable per-request buffer allocation in RandomAccessFile

This change affects only non-buffered I/O on Windows.
Currently, each RandomAccessFile instance maintains a buffer that is
protected by a lock; the buffer is kept because non-buffered I/O
requires an aligned buffer to work.
XPerf traces show that a considerable amount of time is spent waiting
on that lock.
This change allows the random access buffer size to be set to zero,
which indicates per-request allocation. We expect the allocation cost
to be much lower than the I/O cost plus the lock wait time, because the
memory heap tends to reuse page-aligned allocations, especially with
jemalloc.
This change does not affect the buffer's use as a read-ahead buffer for
compaction purposes.
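
A minimal usage sketch (illustration only, not part of this commit): random_access_max_buffer_size is the option extended below; allow_os_buffer is this era's switch for the Windows non-buffered I/O path and the open/cleanup code is ordinary RocksDB API:

#include <rocksdb/db.h>
#include <rocksdb/options.h>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Windows only: route reads through the non-buffered I/O path
  // (option name of this era; later superseded by use_direct_reads).
  options.allow_os_buffer = false;
  // 0 = do not keep a per-RandomAccessFile buffer; allocate an aligned
  // buffer per read request and skip the buffer mutex entirely.
  options.random_access_max_buffer_size = 0;

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "testdb", &db);
  if (!s.ok()) {
    return 1;
  }
  delete db;
  return 0;
}

Any value larger than zero keeps the previous behavior: a single locked buffer per file, grown up to random_access_max_buffer_size, as the Read() hunk below shows.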
Branch: main
Author: Dmitri Smirnov, 9 years ago
Parent: 3b2a1ddd2e
Commit: 36300fbbe3

Changed files:
  include/rocksdb/options.h (3)
  port/win/env_win.cc (115)

include/rocksdb/options.h
@@ -1121,6 +1121,9 @@ struct DBOptions {
   // This option is currently honored only on Windows
   //
   // Default: 1 Mb
+  //
+  // Special value: 0 - means do not maintain per instance buffer. Allocate
+  // per request buffer and avoid locking.
   size_t random_access_max_buffer_size;
 
   // This is the maximum buffer size that is used by WritableFileWriter.
port/win/env_win.cc
@@ -766,6 +766,18 @@ class WinRandomAccessFile : public RandomAccessFile {
     return read;
   }
 
+  void CalculateReadParameters(uint64_t offset, size_t bytes_requested,
+                               size_t& actual_bytes_toread,
+                               uint64_t& first_page_start) const {
+
+    const size_t alignment = buffer_.Alignment();
+
+    first_page_start = TruncateToPageBoundary(alignment, offset);
+    const uint64_t last_page_start =
+        TruncateToPageBoundary(alignment, offset + bytes_requested - 1);
+    actual_bytes_toread = (last_page_start - first_page_start) + alignment;
+  }
+
  public:
   WinRandomAccessFile(const std::string& fname, HANDLE hFile, size_t alignment,
                       const EnvOptions& options)
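
For illustration (not part of this commit), a standalone sketch of the arithmetic CalculateReadParameters performs; the local TruncateToPageBoundary here re-implements the rounding that RocksDB's helper of the same name provides:

#include <cstdint>
#include <cstdio>

// Round an offset down to the nearest multiple of alignment
// (alignment is a power-of-two page/sector size).
static uint64_t TruncateToPageBoundary(size_t alignment, uint64_t offset) {
  return offset - (offset % alignment);
}

int main() {
  const size_t alignment = 4096;        // typical page size
  const uint64_t offset = 5000;         // caller-requested offset
  const size_t bytes_requested = 3000;  // caller-requested length

  // Same math as CalculateReadParameters above.
  const uint64_t first_page_start = TruncateToPageBoundary(alignment, offset);
  const uint64_t last_page_start =
      TruncateToPageBoundary(alignment, offset + bytes_requested - 1);
  const size_t actual_bytes_toread =
      static_cast<size_t>(last_page_start - first_page_start) + alignment;

  // Bytes 5000..7999 all fall in the page starting at 4096, so this prints:
  // first_page_start=4096 actual_bytes_toread=4096
  std::printf("first_page_start=%llu actual_bytes_toread=%zu\n",
              static_cast<unsigned long long>(first_page_start),
              actual_bytes_toread);
  return 0;
}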
@@ -797,66 +809,87 @@ class WinRandomAccessFile : public RandomAccessFile {
   virtual Status Read(uint64_t offset, size_t n, Slice* result,
                       char* scratch) const override {
     Status s;
     SSIZE_T r = -1;
     size_t left = n;
     char* dest = scratch;
 
+    if (n == 0) {
+      *result = Slice(scratch, 0);
+      return s;
+    }
+
     // When in unbuffered mode we need to do the following changes:
     // - use our own aligned buffer
     // - always read at the offset of that is a multiple of alignment
     if (!use_os_buffer_) {
-      std::unique_lock<std::mutex> lock(buffer_mut_);
 
-      // Let's see if at least some of the requested data is already
-      // in the buffer
-      if (offset >= buffered_start_ &&
-          offset < (buffered_start_ + buffer_.CurrentSize())) {
-        size_t buffer_offset = offset - buffered_start_;
-        r = buffer_.Read(dest, buffer_offset, left);
-        assert(r >= 0);
+      uint64_t first_page_start = 0;
+      size_t actual_bytes_toread = 0;
+      size_t bytes_requested = left;
 
-        left -= size_t(r);
-        offset += r;
-        dest += r;
-      }
+      if (!read_ahead_ && random_access_max_buffer_size_ == 0) {
+        CalculateReadParameters(offset, bytes_requested, actual_bytes_toread,
+                                first_page_start);
 
-      // Still some left or none was buffered
-      if (left > 0) {
-        // Figure out the start/end offset for reading and amount to read
-        const size_t alignment = buffer_.Alignment();
-        const size_t first_page_start =
-            TruncateToPageBoundary(alignment, offset);
+        assert(actual_bytes_toread > 0);
 
-        size_t bytes_requested = left;
-        if (read_ahead_ && bytes_requested < compaction_readahead_size_) {
-          bytes_requested = compaction_readahead_size_;
-        }
+        r = ReadIntoOneShotBuffer(offset, first_page_start,
+                                  actual_bytes_toread, left, dest);
+      } else {
 
-        const size_t last_page_start =
-            TruncateToPageBoundary(alignment, offset + bytes_requested - 1);
-        const size_t actual_bytes_toread =
-            (last_page_start - first_page_start) + alignment;
+        std::unique_lock<std::mutex> lock(buffer_mut_);
 
-        if (buffer_.Capacity() < actual_bytes_toread) {
-          // If we are in read-ahead mode or the requested size
-          // exceeds max buffer size then use one-shot
-          // big buffer otherwise reallocate main buffer
-          if (read_ahead_ ||
-              (actual_bytes_toread > random_access_max_buffer_size_)) {
-            // Unlock the mutex since we are not using instance buffer
-            lock.unlock();
-            r = ReadIntoOneShotBuffer(offset, first_page_start,
-                                      actual_bytes_toread, left, dest);
-          } else {
-            buffer_.AllocateNewBuffer(actual_bytes_toread);
-            r = ReadIntoInstanceBuffer(offset, first_page_start,
-                                       actual_bytes_toread, left, dest);
+        // Let's see if at least some of the requested data is already
+        // in the buffer
+        if (offset >= buffered_start_ &&
+            offset < (buffered_start_ + buffer_.CurrentSize())) {
+          size_t buffer_offset = offset - buffered_start_;
+          r = buffer_.Read(dest, buffer_offset, left);
+          assert(r >= 0);
+
+          left -= size_t(r);
+          offset += r;
+          dest += r;
+        }
+
+        // Still some left or none was buffered
+        if (left > 0) {
+          // Figure out the start/end offset for reading and amount to read
+          bytes_requested = left;
+
+          if (read_ahead_ && bytes_requested < compaction_readahead_size_) {
+            bytes_requested = compaction_readahead_size_;
           }
-        } else {
-          buffer_.Clear();
-          r = ReadIntoInstanceBuffer(offset, first_page_start,
-                                     actual_bytes_toread, left, dest);
+
+          CalculateReadParameters(offset, bytes_requested, actual_bytes_toread,
+                                  first_page_start);
+
+          assert(actual_bytes_toread > 0);
+
+          if (buffer_.Capacity() < actual_bytes_toread) {
+            // If we are in read-ahead mode or the requested size
+            // exceeds max buffer size then use one-shot
+            // big buffer otherwise reallocate main buffer
+            if (read_ahead_ ||
+                (actual_bytes_toread > random_access_max_buffer_size_)) {
+              // Unlock the mutex since we are not using instance buffer
+              lock.unlock();
+              r = ReadIntoOneShotBuffer(offset, first_page_start,
+                                        actual_bytes_toread, left, dest);
+            }
+            else {
+              buffer_.AllocateNewBuffer(actual_bytes_toread);
+              r = ReadIntoInstanceBuffer(offset, first_page_start,
+                                         actual_bytes_toread, left, dest);
+            }
+          }
+          else {
+            buffer_.Clear();
+            r = ReadIntoInstanceBuffer(offset, first_page_start,
+                                       actual_bytes_toread, left, dest);
+          }
         }
       }
     } else {
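
ReadIntoOneShotBuffer itself is not shown in this diff. As an illustration only, a hypothetical standalone sketch of the kind of per-request aligned read the zero-buffer path relies on; the function name, signature and error handling here are assumptions, not RocksDB code. Windows handles opened with FILE_FLAG_NO_BUFFERING require sector-aligned offsets, lengths and buffer addresses:

#include <windows.h>
#include <malloc.h>
#include <cstdint>
#include <cstring>

// Hypothetical sketch of a per-request aligned read against a handle opened
// with FILE_FLAG_NO_BUFFERING. No shared buffer, hence no lock to wait on.
static SSIZE_T OneShotAlignedRead(HANDLE hFile, uint64_t offset, size_t n,
                                  char* dest, size_t alignment) {
  if (n == 0) {
    return 0;
  }

  // Round the request out to aligned boundaries, as CalculateReadParameters
  // does in the diff above.
  const uint64_t first_page_start = offset - (offset % alignment);
  const uint64_t last_byte = offset + n - 1;
  const uint64_t last_page_start = last_byte - (last_byte % alignment);
  const size_t to_read =
      static_cast<size_t>(last_page_start - first_page_start) + alignment;

  // Per-request aligned buffer; freed before returning.
  char* buf = static_cast<char*>(_aligned_malloc(to_read, alignment));
  if (buf == nullptr) {
    return -1;
  }

  OVERLAPPED ov = {};
  ov.Offset = static_cast<DWORD>(first_page_start & 0xFFFFFFFFu);
  ov.OffsetHigh = static_cast<DWORD>(first_page_start >> 32);

  DWORD read = 0;
  SSIZE_T result = -1;
  if (ReadFile(hFile, buf, static_cast<DWORD>(to_read), &read, &ov)) {
    // Copy out only the bytes the caller actually asked for.
    const size_t got = static_cast<size_t>(read);
    const size_t skip = static_cast<size_t>(offset - first_page_start);
    const size_t avail = (got > skip) ? (got - skip) : 0;
    const size_t copy = (avail < n) ? avail : n;
    std::memcpy(dest, buf + skip, copy);
    result = static_cast<SSIZE_T>(copy);
  }
  _aligned_free(buf);
  return result;
}

The bet described in the commit message is that this per-request allocation is cheaper than the lock wait it replaces, since heap allocators, jemalloc in particular, tend to reuse page-aligned blocks.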
