Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env
Summary: We want to keep Env a thin layer for better portability. Less platform-dependent code should be moved out of Env. In this patch, I create wrappers for file readers and writers, and move rate limiting, write buffering, as well as most perf context instrumentation and random kill, out of Env. This will make it easier to maintain multiple Envs in the future. Test Plan: Run all existing unit tests. Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D42321
main
parent
5ec829bc4f
commit
6e9fbeb27c
@ -0,0 +1,225 @@ |
|||||||
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
//
|
||||||
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
|
#include "util/file_reader_writer.h" |
||||||
|
|
||||||
|
#include <algorithm> |
||||||
|
#include "port/port.h" |
||||||
|
#include "util/iostats_context_imp.h" |
||||||
|
#include "util/random.h" |
||||||
|
#include "util/rate_limiter.h" |
||||||
|
#include "util/sync_point.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
// Reads up to `n` bytes through the wrapped SequentialFile and adds the size
// of the returned slice to the bytes_read IO statistic.
//
// The statistic is updated unconditionally, i.e. also when the underlying
// read reports a non-OK status.
Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
  Status read_status = file_->Read(n, result, scratch);
  IOSTATS_ADD(bytes_read, result->size());
  return read_status;
}
||||||
|
|
||||||
|
// Skips `n` bytes by forwarding the request to the wrapped SequentialFile.
Status SequentialFileReader::Skip(uint64_t n) {
  Status skip_status = file_->Skip(n);
  return skip_status;
}
||||||
|
|
||||||
|
// Reads up to `n` bytes starting at `offset` through the wrapped
// RandomAccessFile. The whole call is timed under read_nanos, and the size of
// the returned slice is fed to IOSTATS_ADD_IF_POSITIVE for bytes_read.
Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
                                    char* scratch) const {
  // The guard stays alive until the function returns.
  IOSTATS_TIMER_GUARD(read_nanos);
  Status read_status = file_->Read(offset, n, result, scratch);
  IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
  return read_status;
}
||||||
|
|
||||||
|
// Appends `data` to the file.
//
// Small writes are staged in the in-memory buffer; a write that does not fit
// even into an emptied buffer is sent straight to the underlying file in
// rate-limited chunks. Returns the first non-OK status encountered, otherwise
// OK. filesize_ is advanced by data.size() only when the whole append
// succeeded.
Status WritableFileWriter::Append(const Slice& data) {
  const char* cursor = data.data();
  size_t remaining = data.size();

  // Any append leaves data behind that a later Sync() has to persist.
  pending_sync_ = true;
  pending_fsync_ = true;

  TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2);

  writable_file_->PrepareWrite(static_cast<size_t>(GetFileSize()), remaining);

  // Not enough room left in the buffer: drain it before deciding where the
  // new bytes go.
  if (cursize_ + remaining > capacity_) {
    Status flush_status = Flush();
    if (!flush_status.ok()) {
      return flush_status;
    }
    // Double the buffer while it is still below 1MB. The old contents can be
    // dropped because Flush() has just emptied the buffer.
    if (capacity_ < (1 << 20)) {
      capacity_ *= 2;
      buf_.reset(new char[capacity_]);
    }
    assert(cursize_ == 0);
  }

  const bool fits_in_buffer = (cursize_ + remaining <= capacity_);
  if (fits_in_buffer) {
    // Stage the bytes; they reach the file on a later Flush().
    memcpy(buf_.get() + cursize_, cursor, remaining);
    cursize_ += remaining;
  } else {
    // Larger than the (empty) buffer: bypass it and hand the data to the
    // underlying file chunk by chunk, each chunk sized by RequestToken().
    while (remaining != 0) {
      const size_t chunk = RequestToken(remaining);
      {
        IOSTATS_TIMER_GUARD(write_nanos);
        Status append_status = writable_file_->Append(Slice(cursor, chunk));
        if (!append_status.ok()) {
          return append_status;
        }
      }
      IOSTATS_ADD(bytes_written, chunk);
      TEST_KILL_RANDOM(rocksdb_kill_odds);

      cursor += chunk;
      remaining -= chunk;
    }
  }

  TEST_KILL_RANDOM(rocksdb_kill_odds);
  filesize_ += data.size();
  return Status::OK();
}
||||||
|
|
||||||
|
// Flushes any buffered bytes and then closes the underlying file.
// If the flush fails, its status is returned and the file is not closed.
Status WritableFileWriter::Close() {
  Status flush_status = Flush();  // push buffered data to the OS first
  if (flush_status.ok()) {
    TEST_KILL_RANDOM(rocksdb_kill_odds);
    return writable_file_->Close();
  }
  return flush_status;
}
||||||
|
|
||||||
|
// write out the cached data to the OS cache
|
||||||
|
Status WritableFileWriter::Flush() { |
||||||
|
TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); |
||||||
|
size_t left = cursize_; |
||||||
|
char* src = buf_.get(); |
||||||
|
while (left != 0) { |
||||||
|
size_t size = RequestToken(left); |
||||||
|
{ |
||||||
|
IOSTATS_TIMER_GUARD(write_nanos); |
||||||
|
Status s = writable_file_->Append(Slice(src, size)); |
||||||
|
if (!s.ok()) { |
||||||
|
return s; |
||||||
|
} |
||||||
|
} |
||||||
|
IOSTATS_ADD(bytes_written, size); |
||||||
|
TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); |
||||||
|
left -= size; |
||||||
|
src += size; |
||||||
|
} |
||||||
|
cursize_ = 0; |
||||||
|
|
||||||
|
writable_file_->Flush(); |
||||||
|
|
||||||
|
// sync OS cache to disk for every bytes_per_sync_
|
||||||
|
// TODO: give log file and sst file different options (log
|
||||||
|
// files could be potentially cached in OS for their whole
|
||||||
|
// life time, thus we might not want to flush at all).
|
||||||
|
if (bytes_per_sync_ && filesize_ - last_sync_size_ >= bytes_per_sync_) { |
||||||
|
writable_file_->RangeSync(last_sync_size_, filesize_ - last_sync_size_); |
||||||
|
last_sync_size_ = filesize_; |
||||||
|
} |
||||||
|
|
||||||
|
return Status::OK(); |
||||||
|
} |
||||||
|
|
||||||
|
// Flushes buffered data and then syncs the underlying file.
//
// use_fsync selects Fsync() over Sync() on the underlying file. The sync call
// is skipped when nothing is pending for the requested kind of sync:
// pending_fsync_ gates Fsync(), pending_sync_ gates Sync(). Gating both on
// pending_sync_ would wrongly skip an Fsync() requested after a plain Sync()
// had already cleared pending_sync_, while still marking the fsync as done.
//
// Returns the first non-OK status from Flush() or the sync call, else OK.
Status WritableFileWriter::Sync(bool use_fsync) {
  Status s = Flush();
  if (!s.ok()) {
    return s;
  }
  TEST_KILL_RANDOM(rocksdb_kill_odds);

  const bool sync_needed = use_fsync ? pending_fsync_ : pending_sync_;
  if (sync_needed) {
    if (use_fsync) {
      s = writable_file_->Fsync();
    } else {
      s = writable_file_->Sync();
    }
    if (!s.ok()) {
      return s;
    }
  }
  TEST_KILL_RANDOM(rocksdb_kill_odds);

  // Either kind of sync clears pending_sync_; only an fsync clears
  // pending_fsync_.
  pending_sync_ = false;
  if (use_fsync) {
    pending_fsync_ = false;
  }
  return Status::OK();
}
||||||
|
|
||||||
|
// Forwards a range-sync request for `nbytes` bytes starting at `offset` to
// the underlying file, timing the call under range_sync_nanos.
Status WritableFileWriter::RangeSync(off_t offset, off_t nbytes) {
  IOSTATS_TIMER_GUARD(range_sync_nanos);
  Status range_status = writable_file_->RangeSync(offset, nbytes);
  return range_status;
}
||||||
|
|
||||||
|
size_t WritableFileWriter::RequestToken(size_t bytes) { |
||||||
|
Env::IOPriority io_priority; |
||||||
|
if (rate_limiter_&&(io_priority = writable_file_->GetIOPriority()) < |
||||||
|
Env::IO_TOTAL) { |
||||||
|
bytes = std::min(bytes, |
||||||
|
static_cast<size_t>(rate_limiter_->GetSingleBurstBytes())); |
||||||
|
rate_limiter_->Request(bytes, io_priority); |
||||||
|
} |
||||||
|
return bytes; |
||||||
|
} |
||||||
|
|
||||||
|
// Writes `data` at `offset` through the wrapped RandomRWFile.
//
// Both pending flags are raised before the write is attempted. The write is
// timed under write_nanos, and bytes_written is only increased when the write
// succeeded.
Status RandomRWFileAccessor::Write(uint64_t offset, const Slice& data) {
  pending_sync_ = true;
  pending_fsync_ = true;

  Status write_status;
  {
    IOSTATS_TIMER_GUARD(write_nanos);
    write_status = random_rw_file_->Write(offset, data);
  }

  if (write_status.ok()) {
    IOSTATS_ADD(bytes_written, data.size());
  }
  return write_status;
}
||||||
|
|
||||||
|
// Reads up to `n` bytes at `offset` through the wrapped RandomRWFile.
//
// The read is timed under read_nanos. On success the size of the returned
// slice is fed to IOSTATS_ADD_IF_POSITIVE for bytes_read; on failure the
// statistic is left alone and the error status is returned.
Status RandomRWFileAccessor::Read(uint64_t offset, size_t n, Slice* result,
                                  char* scratch) const {
  Status read_status;
  {
    IOSTATS_TIMER_GUARD(read_nanos);
    read_status = random_rw_file_->Read(offset, n, result, scratch);
  }

  if (read_status.ok()) {
    IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size());
  }
  return read_status;
}
||||||
|
|
||||||
|
// Closes the wrapped RandomRWFile and returns its status.
Status RandomRWFileAccessor::Close() {
  Status close_status = random_rw_file_->Close();
  return close_status;
}
||||||
|
|
||||||
|
// Syncs the wrapped RandomRWFile if a write is still pending for the
// requested kind of sync.
//
// use_fsync selects Fsync() over Sync(). pending_fsync_ gates Fsync() and
// pending_sync_ gates Sync(); gating both on pending_sync_ would skip an
// Fsync() requested after a plain Sync() had already cleared pending_sync_,
// while still marking the fsync as done.
//
// Returns the status of the sync call, or OK when nothing was pending.
Status RandomRWFileAccessor::Sync(bool use_fsync) {
  Status s;
  const bool sync_needed = use_fsync ? pending_fsync_ : pending_sync_;
  if (sync_needed) {
    if (use_fsync) {
      s = random_rw_file_->Fsync();
    } else {
      s = random_rw_file_->Sync();
    }
    if (!s.ok()) {
      // Leave the pending flags set so a later call retries the sync.
      return s;
    }
  }

  // Either kind of sync clears pending_sync_; only an fsync clears
  // pending_fsync_.
  if (use_fsync) {
    pending_fsync_ = false;
  }
  pending_sync_ = false;

  return s;
}
||||||
|
} // namespace rocksdb
|
@ -0,0 +1,109 @@ |
|||||||
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
//
|
||||||
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
#pragma once |
||||||
|
#include "rocksdb/env.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
class SequentialFileReader { |
||||||
|
private: |
||||||
|
std::unique_ptr<SequentialFile> file_; |
||||||
|
|
||||||
|
public: |
||||||
|
explicit SequentialFileReader(std::unique_ptr<SequentialFile>&& _file) |
||||||
|
: file_(std::move(_file)) {} |
||||||
|
Status Read(size_t n, Slice* result, char* scratch); |
||||||
|
|
||||||
|
Status Skip(uint64_t n); |
||||||
|
|
||||||
|
SequentialFile* file() { return file_.get(); } |
||||||
|
}; |
||||||
|
|
||||||
|
class RandomAccessFileReader : public RandomAccessFile { |
||||||
|
private: |
||||||
|
std::unique_ptr<RandomAccessFile> file_; |
||||||
|
|
||||||
|
public: |
||||||
|
explicit RandomAccessFileReader(std::unique_ptr<RandomAccessFile>&& raf) |
||||||
|
: file_(std::move(raf)) {} |
||||||
|
|
||||||
|
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const; |
||||||
|
|
||||||
|
RandomAccessFile* file() { return file_.get(); } |
||||||
|
}; |
||||||
|
|
||||||
|
// Use posix write to write data to a file.
|
||||||
|
class WritableFileWriter { |
||||||
|
private: |
||||||
|
std::unique_ptr<WritableFile> writable_file_; |
||||||
|
size_t cursize_; // current size of cached data in buf_
|
||||||
|
size_t capacity_; // max size of buf_
|
||||||
|
unique_ptr<char[]> buf_; // a buffer to cache writes
|
||||||
|
uint64_t filesize_; |
||||||
|
bool pending_sync_; |
||||||
|
bool pending_fsync_; |
||||||
|
uint64_t last_sync_size_; |
||||||
|
uint64_t bytes_per_sync_; |
||||||
|
RateLimiter* rate_limiter_; |
||||||
|
|
||||||
|
public: |
||||||
|
explicit WritableFileWriter(std::unique_ptr<WritableFile>&& file, |
||||||
|
const EnvOptions& options) |
||||||
|
: writable_file_(std::move(file)), |
||||||
|
cursize_(0), |
||||||
|
capacity_(65536), |
||||||
|
buf_(new char[capacity_]), |
||||||
|
filesize_(0), |
||||||
|
pending_sync_(false), |
||||||
|
pending_fsync_(false), |
||||||
|
last_sync_size_(0), |
||||||
|
bytes_per_sync_(options.bytes_per_sync), |
||||||
|
rate_limiter_(options.rate_limiter) {} |
||||||
|
|
||||||
|
~WritableFileWriter() { Flush(); } |
||||||
|
Status Append(const Slice& data); |
||||||
|
|
||||||
|
Status Flush(); |
||||||
|
|
||||||
|
Status Close(); |
||||||
|
|
||||||
|
Status Sync(bool use_fsync); |
||||||
|
|
||||||
|
uint64_t GetFileSize() { return filesize_; } |
||||||
|
|
||||||
|
Status InvalidateCache(size_t offset, size_t length) { |
||||||
|
return writable_file_->InvalidateCache(offset, length); |
||||||
|
} |
||||||
|
|
||||||
|
WritableFile* writable_file() const { return writable_file_.get(); } |
||||||
|
|
||||||
|
private: |
||||||
|
Status RangeSync(off_t offset, off_t nbytes); |
||||||
|
size_t RequestToken(size_t bytes); |
||||||
|
}; |
||||||
|
|
||||||
|
class RandomRWFileAccessor { |
||||||
|
private: |
||||||
|
std::unique_ptr<RandomRWFile> random_rw_file_; |
||||||
|
bool pending_sync_; |
||||||
|
bool pending_fsync_; |
||||||
|
|
||||||
|
public: |
||||||
|
explicit RandomRWFileAccessor(std::unique_ptr<RandomRWFile>&& f) |
||||||
|
: random_rw_file_(std::move(f)), |
||||||
|
pending_sync_(false), |
||||||
|
pending_fsync_(false) {} |
||||||
|
Status Write(uint64_t offset, const Slice& data); |
||||||
|
|
||||||
|
Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const; |
||||||
|
|
||||||
|
Status Close(); |
||||||
|
|
||||||
|
Status Sync(bool use_fsync); |
||||||
|
}; |
||||||
|
} // namespace rocksdb
|
Loading…
Reference in new issue