Remove double buffering on RandomRead on Windows.

Summary:
Remove double buffering on RandomRead on Windows.
  With more logic appearing in the file reader/writer, Read no longer
  simply forwards calls to the Windows implementation.
  Previously direct_io (unbuffered) was only available on Windows
  but now is supported as generic.
  We remove intermediate buffering on Windows.
  Remove random_access_max_buffer_size option which was windows specific.
  Non-zero values for that option introduced unnecessary lock contention.
  Remove Env::EnableReadAhead(), Env::ShouldForwardRawRequest() that are
  no longer necessary.
  Add aligned buffer reads for cases when requested reads exceed read ahead size.
Closes https://github.com/facebook/rocksdb/pull/2105

Differential Revision: D4847770

Pulled By: siying

fbshipit-source-id: 8ab48f8e854ab498a4fd398a6934859792a2788f
main
Dmitri Smirnov 8 years ago committed by Facebook Github Bot
parent e15382c09c
commit cdad04b051
  1. 2
      HISTORY.md
  2. 1
      db/compaction_iterator.cc
  3. 21
      include/rocksdb/env.h
  4. 6
      options/options_settable_test.cc
  5. 202
      port/win/io_win.cc
  6. 64
      port/win/io_win.h
  7. 6
      util/aligned_buffer.h
  8. 18
      util/file_reader_writer.cc
  9. 10
      utilities/env_librados.cc
  10. 5
      utilities/env_mirror.cc

@ -7,6 +7,8 @@
## 5.4.0 (04/11/2017) ## 5.4.0 (04/11/2017)
### Public API Change ### Public API Change
* random_access_max_buffer_size no longer has any effect
* Removed Env::EnableReadAhead(), Env::ShouldForwardRawRequest()
* Support dynamically change `stats_dump_period_sec` option via SetDBOptions(). * Support dynamically change `stats_dump_period_sec` option via SetDBOptions().
* Added ReadOptions::max_skippable_internal_keys to set a threshold to fail a request as incomplete when too many keys are being skipped when using iterators. * Added ReadOptions::max_skippable_internal_keys to set a threshold to fail a request as incomplete when too many keys are being skipped when using iterators.
* DB::Get in place of std::string accepts PinnableSlice, which avoids the extra memcpy of value to std::string in most of cases. * DB::Get in place of std::string accepts PinnableSlice, which avoids the extra memcpy of value to std::string in most of cases.

@ -6,6 +6,7 @@
// of patent rights can be found in the PATENTS file in the same directory. // of patent rights can be found in the PATENTS file in the same directory.
#include "db/compaction_iterator.h" #include "db/compaction_iterator.h"
#include "rocksdb/listener.h"
#include "table/internal_iterator.h" #include "table/internal_iterator.h"
namespace rocksdb { namespace rocksdb {

@ -510,16 +510,6 @@ class RandomAccessFile {
return Status::OK(); return Status::OK();
} }
// Used by the file_reader_writer to decide if the ReadAhead wrapper
// should simply forward the call and do not enact buffering or locking.
virtual bool ShouldForwardRawRequest() const {
return false;
}
// For cases when read-ahead is implemented in the platform dependent
// layer
virtual void EnableReadAhead() {}
// Tries to get an unique ID for this file that will be the same each time // Tries to get an unique ID for this file that will be the same each time
// the file is opened (and will stay the same while the file is open). // the file is opened (and will stay the same while the file is open).
// Furthermore, it tries to make this ID at most "max_size" bytes. If such an // Furthermore, it tries to make this ID at most "max_size" bytes. If such an
@ -751,17 +741,6 @@ class RandomRWFile {
// aligned buffer for Direct I/O // aligned buffer for Direct I/O
virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; }
// Used by the file_reader_writer to decide if the ReadAhead wrapper
// should simply forward the call and do not enact read_ahead buffering or locking.
// The implementation below takes care of reading ahead
virtual bool ShouldForwardRawRequest() const {
return false;
}
// For cases when read-ahead is implemented in the platform dependent
// layer. This is when ShouldForwardRawRequest() returns true.
virtual void EnableReadAhead() {}
// Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
// Pass aligned buffer when use_direct_io() returns true. // Pass aligned buffer when use_direct_io() returns true.
virtual Status Write(uint64_t offset, const Slice& data) = 0; virtual Status Write(uint64_t offset, const Slice& data) = 0;

@ -45,7 +45,7 @@ namespace rocksdb {
// settable through string on limited platforms as it depends on behavior of // settable through string on limited platforms as it depends on behavior of
// compilers. // compilers.
#ifndef ROCKSDB_LITE #ifndef ROCKSDB_LITE
#ifdef OS_LINUX #if defined OS_LINUX || defined OS_WIN
#ifndef __clang__ #ifndef __clang__
class OptionsSettableTest : public testing::Test { class OptionsSettableTest : public testing::Test {
@ -54,7 +54,7 @@ class OptionsSettableTest : public testing::Test {
}; };
const char kSpecialChar = 'z'; const char kSpecialChar = 'z';
typedef std::vector<std::pair<int, size_t>> OffsetGap; typedef std::vector<std::pair<size_t, size_t>> OffsetGap;
void FillWithSpecialChar(char* start_ptr, size_t total_size, void FillWithSpecialChar(char* start_ptr, size_t total_size,
const OffsetGap& blacklist) { const OffsetGap& blacklist) {
@ -446,7 +446,7 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) {
delete[] new_options_ptr; delete[] new_options_ptr;
} }
#endif // !__clang__ #endif // !__clang__
#endif // OS_LINUX #endif // OS_LINUX || OS_WIN
#endif // !ROCKSDB_LITE #endif // !ROCKSDB_LITE
} // namespace rocksdb } // namespace rocksdb

@ -638,71 +638,6 @@ Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////
/// WinRandomAccessBase /// WinRandomAccessBase
// Helper
void CalculateReadParameters(size_t alignment, uint64_t offset,
size_t bytes_requested,
size_t& actual_bytes_toread,
uint64_t& first_page_start) {
first_page_start = TruncateToPageBoundary(alignment, offset);
const uint64_t last_page_start =
TruncateToPageBoundary(alignment, offset + bytes_requested - 1);
actual_bytes_toread = (last_page_start - first_page_start) + alignment;
}
SSIZE_T WinRandomAccessImpl::ReadIntoBuffer(uint64_t user_offset,
uint64_t first_page_start,
size_t bytes_to_read, size_t& left,
AlignedBuffer& buffer, char* dest) const {
assert(buffer.CurrentSize() == 0);
assert(buffer.Capacity() >= bytes_to_read);
SSIZE_T read =
PositionedReadInternal(buffer.Destination(), bytes_to_read,
first_page_start);
if (read > 0) {
buffer.Size(read);
// Let's figure out how much we read from the users standpoint
if ((first_page_start + buffer.CurrentSize()) > user_offset) {
assert(first_page_start <= user_offset);
size_t buffer_offset = user_offset - first_page_start;
read = buffer.Read(dest, buffer_offset, left);
} else {
read = 0;
}
left -= read;
}
return read;
}
SSIZE_T WinRandomAccessImpl::ReadIntoOneShotBuffer(uint64_t user_offset,
uint64_t first_page_start,
size_t bytes_to_read, size_t& left,
char* dest) const {
AlignedBuffer bigBuffer;
bigBuffer.Alignment(buffer_.Alignment());
bigBuffer.AllocateNewBuffer(bytes_to_read);
return ReadIntoBuffer(user_offset, first_page_start, bytes_to_read, left,
bigBuffer, dest);
}
SSIZE_T WinRandomAccessImpl::ReadIntoInstanceBuffer(uint64_t user_offset,
uint64_t first_page_start,
size_t bytes_to_read, size_t& left,
char* dest) const {
SSIZE_T read = ReadIntoBuffer(user_offset, first_page_start, bytes_to_read,
left, buffer_, dest);
if (read > 0) {
buffered_start_ = first_page_start;
}
return read;
}
SSIZE_T WinRandomAccessImpl::PositionedReadInternal(char* src, SSIZE_T WinRandomAccessImpl::PositionedReadInternal(char* src,
size_t numBytes, size_t numBytes,
uint64_t offset) const { uint64_t offset) const {
@ -714,17 +649,9 @@ WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
size_t alignment, size_t alignment,
const EnvOptions& options) : const EnvOptions& options) :
file_base_(file_base), file_base_(file_base),
read_ahead_(false), alignment_(alignment) {
compaction_readahead_size_(options.compaction_readahead_size),
random_access_max_buffer_size_(options.random_access_max_buffer_size),
buffer_(),
buffered_start_(0) {
assert(!options.use_mmap_reads); assert(!options.use_mmap_reads);
// Do not allocate the buffer either until the first request or
// until there is a call to allocate a read-ahead buffer
buffer_.Alignment(alignment);
} }
inline inline
@ -732,94 +659,26 @@ Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
char* scratch) const { char* scratch) const {
Status s; Status s;
SSIZE_T r = -1;
size_t left = n;
char* dest = scratch;
if (n == 0) { // Check buffer alignment
*result = Slice(scratch, 0);
return s;
}
// When in direct I/O mode we need to do the following changes:
// - use our own aligned buffer
// - always read at the offset of that is a multiple of alignment
if (file_base_->use_direct_io()) { if (file_base_->use_direct_io()) {
uint64_t first_page_start = 0; if (!IsAligned(alignment_, scratch)) {
size_t actual_bytes_toread = 0; return Status::InvalidArgument("WinRandomAccessImpl::ReadImpl: scratch is not properly aligned");
size_t bytes_requested = left;
if (!read_ahead_ && random_access_max_buffer_size_ == 0) {
CalculateReadParameters(buffer_.Alignment(), offset, bytes_requested,
actual_bytes_toread,
first_page_start);
assert(actual_bytes_toread > 0);
r = ReadIntoOneShotBuffer(offset, first_page_start,
actual_bytes_toread, left, dest);
} else {
std::unique_lock<std::mutex> lock(buffer_mut_);
// Let's see if at least some of the requested data is already
// in the buffer
if (offset >= buffered_start_ &&
offset < (buffered_start_ + buffer_.CurrentSize())) {
size_t buffer_offset = offset - buffered_start_;
r = buffer_.Read(dest, buffer_offset, left);
assert(r >= 0);
left -= size_t(r);
offset += r;
dest += r;
} }
// Still some left or none was buffered
if (left > 0) {
// Figure out the start/end offset for reading and amount to read
bytes_requested = left;
if (read_ahead_ && bytes_requested < compaction_readahead_size_) {
bytes_requested = compaction_readahead_size_;
} }
CalculateReadParameters(buffer_.Alignment(), offset, bytes_requested, if (n == 0) {
actual_bytes_toread, *result = Slice(scratch, 0);
first_page_start); return s;
}
assert(actual_bytes_toread > 0); size_t left = n;
char* dest = scratch;
if (buffer_.Capacity() < actual_bytes_toread) { SSIZE_T r = PositionedReadInternal(scratch, left, offset);
// If we are in read-ahead mode or the requested size
// exceeds max buffer size then use one-shot
// big buffer otherwise reallocate main buffer
if (read_ahead_ ||
(actual_bytes_toread > random_access_max_buffer_size_)) {
// Unlock the mutex since we are not using instance buffer
lock.unlock();
r = ReadIntoOneShotBuffer(offset, first_page_start,
actual_bytes_toread, left, dest);
} else {
buffer_.AllocateNewBuffer(actual_bytes_toread);
r = ReadIntoInstanceBuffer(offset, first_page_start,
actual_bytes_toread, left, dest);
}
} else {
buffer_.Clear();
r = ReadIntoInstanceBuffer(offset, first_page_start,
actual_bytes_toread, left, dest);
}
}
}
} else {
r = PositionedReadInternal(scratch, left, offset);
if (r > 0) { if (r > 0) {
left -= r; left -= r;
} } else if (r < 0) {
}
if (r < 0) {
auto lastError = GetLastError(); auto lastError = GetLastError();
// Posix impl wants to treat reads from beyond // Posix impl wants to treat reads from beyond
// of the file as OK. // of the file as OK.
@ -833,23 +692,6 @@ Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
return s; return s;
} }
inline
void WinRandomAccessImpl::HintImpl(RandomAccessFile::AccessPattern pattern) {
if (pattern == RandomAccessFile::SEQUENTIAL && file_base_->use_direct_io() &&
compaction_readahead_size_ > 0) {
std::lock_guard<std::mutex> lg(buffer_mut_);
if (!read_ahead_) {
read_ahead_ = true;
// This would allocate read-ahead size + 2 alignments
// - one for memory alignment which added implicitly by AlignedBuffer
// - We add one more alignment because we will read one alignment more
// from disk
buffer_.AllocateNewBuffer(compaction_readahead_size_ +
buffer_.Alignment());
}
}
}
/////////////////////////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////
/// WinRandomAccessFile /// WinRandomAccessFile
@ -867,18 +709,6 @@ Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
return ReadImpl(offset, n, result, scratch); return ReadImpl(offset, n, result, scratch);
} }
void WinRandomAccessFile::EnableReadAhead() {
HintImpl(SEQUENTIAL);
}
bool WinRandomAccessFile::ShouldForwardRawRequest() const {
return true;
}
void WinRandomAccessFile::Hint(AccessPattern pattern) {
HintImpl(pattern);
}
Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) { Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
return Status::OK(); return Status::OK();
} }
@ -1136,14 +966,6 @@ size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
return GetAlignement(); return GetAlignement();
} }
bool WinRandomRWFile::ShouldForwardRawRequest() const {
return true;
}
void WinRandomRWFile::EnableReadAhead() {
HintImpl(RandomAccessFile::SEQUENTIAL);
}
Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) { Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) {
return PositionedAppendImpl(data, offset); return PositionedAppendImpl(data, offset);
} }

@ -237,54 +237,12 @@ class WinMmapFile : private WinFileData, public WritableFile {
class WinRandomAccessImpl { class WinRandomAccessImpl {
protected: protected:
WinFileData* file_base_; WinFileData* file_base_;
bool read_ahead_; size_t alignment_;
const size_t compaction_readahead_size_;
const size_t random_access_max_buffer_size_;
mutable std::mutex buffer_mut_;
mutable AlignedBuffer buffer_;
mutable uint64_t
buffered_start_; // file offset set that is currently buffered
// Override for behavior change when creating a custom env // Override for behavior change when creating a custom env
virtual SSIZE_T PositionedReadInternal(char* src, size_t numBytes, virtual SSIZE_T PositionedReadInternal(char* src, size_t numBytes,
uint64_t offset) const; uint64_t offset) const;
/*
* The function reads a requested amount of bytes into the specified aligned
* buffer Upon success the function sets the length of the buffer to the
* amount of bytes actually read even though it might be less than actually
* requested. It then copies the amount of bytes requested by the user (left)
* to the user supplied buffer (dest) and reduces left by the amount of bytes
* copied to the user buffer
*
* @user_offset [in] - offset on disk where the read was requested by the user
* @first_page_start [in] - actual page aligned disk offset that we want to
* read from
* @bytes_to_read [in] - total amount of bytes that will be read from disk
* which is generally greater or equal to the amount
* that the user has requested due to the
* either alignment requirements or read_ahead in
* effect.
* @left [in/out] total amount of bytes that needs to be copied to the user
* buffer. It is reduced by the amount of bytes that actually
* copied
* @buffer - buffer to use
* @dest - user supplied buffer
*/
SSIZE_T ReadIntoBuffer(uint64_t user_offset, uint64_t first_page_start,
size_t bytes_to_read, size_t& left,
AlignedBuffer& buffer, char* dest) const;
SSIZE_T ReadIntoOneShotBuffer(uint64_t user_offset, uint64_t first_page_start,
size_t bytes_to_read, size_t& left,
char* dest) const;
SSIZE_T ReadIntoInstanceBuffer(uint64_t user_offset,
uint64_t first_page_start,
size_t bytes_to_read, size_t& left,
char* dest) const;
WinRandomAccessImpl(WinFileData* file_base, size_t alignment, WinRandomAccessImpl(WinFileData* file_base, size_t alignment,
const EnvOptions& options); const EnvOptions& options);
@ -293,9 +251,7 @@ class WinRandomAccessImpl {
Status ReadImpl(uint64_t offset, size_t n, Slice* result, Status ReadImpl(uint64_t offset, size_t n, Slice* result,
char* scratch) const; char* scratch) const;
void HintImpl(RandomAccessFile::AccessPattern pattern); size_t GetAlignment() const { return alignment_; }
size_t GetAlignment() const { return buffer_.Alignment(); }
public: public:
@ -318,14 +274,8 @@ class WinRandomAccessFile
virtual Status Read(uint64_t offset, size_t n, Slice* result, virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override; char* scratch) const override;
virtual bool ShouldForwardRawRequest() const override;
virtual void EnableReadAhead() override;
virtual size_t GetUniqueId(char* id, size_t max_size) const override; virtual size_t GetUniqueId(char* id, size_t max_size) const override;
virtual void Hint(AccessPattern pattern) override;
virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); } virtual bool use_direct_io() const override { return WinFileData::use_direct_io(); }
virtual Status InvalidateCache(size_t offset, size_t length) override; virtual Status InvalidateCache(size_t offset, size_t length) override;
@ -449,16 +399,6 @@ class WinRandomRWFile : private WinFileData,
// buffer for Write() when use_direct_io() returns true // buffer for Write() when use_direct_io() returns true
virtual size_t GetRequiredBufferAlignment() const override; virtual size_t GetRequiredBufferAlignment() const override;
// Used by the file_reader_writer to decide if the ReadAhead wrapper
// should simply forward the call and do not enact read_ahead buffering or
// locking.
// The implementation below takes care of reading ahead
virtual bool ShouldForwardRawRequest() const override;
// For cases when read-ahead is implemented in the platform dependent
// layer. This is when ShouldForwardRawRequest() returns true.
virtual void EnableReadAhead() override;
// Write bytes in `data` at offset `offset`, Returns Status::OK() on success. // Write bytes in `data` at offset `offset`, Returns Status::OK() on success.
// Pass aligned buffer when use_direct_io() returns true. // Pass aligned buffer when use_direct_io() returns true.
virtual Status Write(uint64_t offset, const Slice& data) override; virtual Status Write(uint64_t offset, const Slice& data) override;

@ -125,7 +125,11 @@ public:
size_t Read(char* dest, size_t offset, size_t read_size) const { size_t Read(char* dest, size_t offset, size_t read_size) const {
assert(offset < cursize_); assert(offset < cursize_);
size_t to_read = std::min(cursize_ - offset, read_size);
size_t to_read = 0;
if(offset < cursize_) {
to_read = std::min(cursize_ - offset, read_size);
}
if (to_read > 0) { if (to_read > 0) {
memcpy(dest, bufstart_ + offset, to_read); memcpy(dest, bufstart_ + offset, to_read);
} }

@ -435,16 +435,12 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
: file_(std::move(file)), : file_(std::move(file)),
alignment_(file_->GetRequiredBufferAlignment()), alignment_(file_->GetRequiredBufferAlignment()),
readahead_size_(Roundup(readahead_size, alignment_)), readahead_size_(Roundup(readahead_size, alignment_)),
forward_calls_(file_->ShouldForwardRawRequest()),
buffer_(), buffer_(),
buffer_offset_(0), buffer_offset_(0),
buffer_len_(0) { buffer_len_(0) {
if (!forward_calls_) {
buffer_.Alignment(alignment_); buffer_.Alignment(alignment_);
buffer_.AllocateNewBuffer(readahead_size_); buffer_.AllocateNewBuffer(readahead_size_);
} else if (readahead_size_ > 0) {
file_->EnableReadAhead();
}
} }
ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete; ReadaheadRandomAccessFile(const ReadaheadRandomAccessFile&) = delete;
@ -453,15 +449,8 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
virtual Status Read(uint64_t offset, size_t n, Slice* result, virtual Status Read(uint64_t offset, size_t n, Slice* result,
char* scratch) const override { char* scratch) const override {
if (n + alignment_ >= readahead_size_) {
return file_->Read(offset, n, result, scratch);
}
// On Windows in unbuffered mode this will lead to double buffering if (n + alignment_ >= readahead_size_) {
// and double locking so we avoid that.
// In normal mode Windows caches so much data from disk that we do
// not need readahead.
if (forward_calls_) {
return file_->Read(offset, n, result, scratch); return file_->Read(offset, n, result, scratch);
} }
@ -469,7 +458,7 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
size_t cached_len = 0; size_t cached_len = 0;
// Check if there is a cache hit, means that [offset, offset + n) is either // Check if there is a cache hit, means that [offset, offset + n) is either
// complitely or partially in the buffer // completely or partially in the buffer
// If it's completely cached, including end of file case when offset + n is // If it's completely cached, including end of file case when offset + n is
// greater than EOF, return // greater than EOF, return
if (TryReadFromCache(offset, n, &cached_len, scratch) && if (TryReadFromCache(offset, n, &cached_len, scratch) &&
@ -558,7 +547,6 @@ class ReadaheadRandomAccessFile : public RandomAccessFile {
std::unique_ptr<RandomAccessFile> file_; std::unique_ptr<RandomAccessFile> file_;
const size_t alignment_; const size_t alignment_;
size_t readahead_size_; size_t readahead_size_;
const bool forward_calls_;
mutable std::mutex lock_; mutable std::mutex lock_;
mutable AlignedBuffer buffer_; mutable AlignedBuffer buffer_;

@ -219,16 +219,6 @@ public:
return s; return s;
} }
// Used by the file_reader_writer to decide if the ReadAhead wrapper
// should simply forward the call and do not enact buffering or locking.
bool ShouldForwardRawRequest() const {
return false;
}
// For cases when read-ahead is implemented in the platform dependent
// layer
void EnableReadAhead() {}
/** /**
* @brief [brief description] * @brief [brief description]
* @details Get unique id for each file and guarantee this id is different for each file * @details Get unique id for each file and guarantee this id is different for each file

@ -86,11 +86,6 @@ class RandomAccessFileMirror : public RandomAccessFile {
return as; return as;
} }
bool ShouldForwardRawRequest() const {
// NOTE: not verified
return a_->ShouldForwardRawRequest();
}
size_t GetUniqueId(char* id, size_t max_size) const { size_t GetUniqueId(char* id, size_t max_size) const {
// NOTE: not verified // NOTE: not verified
return a_->GetUniqueId(id, max_size); return a_->GetUniqueId(id, max_size);

Loading…
Cancel
Save