RangeSync not to sync last 1MB of the file

Summary:
From other ones' investigation:

"sync_file_range() behavior highly depends on kernel version and filesystem.

xfs does neighbor page flushing outside of the specified ranges. For example, sync_file_range(fd, 8192, 16384) does not only trigger flushing page #3 to #4, but also flushing many more dirty pages (i.e. up to page#16)... Ranges of the sync_file_range() should be far enough from write() offset (at least 1MB)."

Test Plan: make all check

Reviewers: igor, rven, kradhakrishnan, yhchiang, IslamAbdelRahman, anthony

Reviewed By: anthony

Subscribers: yoshinorim, MarkCallaghan, sumeet, domas, dhruba, leveldb, ljin

Differential Revision: https://reviews.facebook.net/D15807
main
Siying Dong 10 years ago committed by sdong
parent 7219088cda
commit 3dbf4ba220
  1. 1
      CMakeLists.txt
  2. 4
      Makefile
  3. 4
      include/rocksdb/options.h
  4. 23
      util/file_reader_writer.cc
  5. 89
      util/file_reader_writer_test.cc

@ -306,6 +306,7 @@ set(TESTS
util/env_test.cc
util/event_logger_test.cc
util/filelock_test.cc
util/file_reader_writer_test.cc
util/histogram_test.cc
util/manual_compaction_test.cc
util/memenv_test.cc

@ -248,6 +248,7 @@ TESTS = \
fault_injection_test \
filelock_test \
filename_test \
file_reader_writer_test \
block_based_filter_block_test \
full_filter_block_test \
histogram_test \
@ -774,6 +775,9 @@ rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS)
filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
file_reader_writer_test: util/file_reader_writer_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)
block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS)
$(AM_LINK)

@ -1014,7 +1014,9 @@ struct DBOptions {
void Dump(Logger* log) const;
// Allows OS to incrementally sync files to disk while they are being
// written, asynchronously, in the background.
// written, asynchronously, in the background. This operation can be used
// to smooth out write I/Os over time. Users shouldn't reply on it for
// persistency guarantee.
// Issue one request for every bytes_per_sync written. 0 turns it off.
// Default: 0
//

@ -122,9 +122,26 @@ Status WritableFileWriter::Flush() {
// TODO: give log file and sst file different options (log
// files could be potentially cached in OS for their whole
// life time, thus we might not want to flush at all).
if (bytes_per_sync_ && filesize_ - last_sync_size_ >= bytes_per_sync_) {
writable_file_->RangeSync(last_sync_size_, filesize_ - last_sync_size_);
last_sync_size_ = filesize_;
// We try to avoid sync to the last 1MB of data. For two reasons:
// (1) avoid rewrite the same page that is modified later.
// (2) for older version of OS, write can block while writing out
// the page.
// Xfs does neighbor page flushing outside of the specified ranges. We
// need to make sure sync range is far from the write offset.
if (bytes_per_sync_) {
uint64_t kBytesNotSyncRange = 1024 * 1024; // recent 1MB is not synced.
uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.
if (filesize_ > kBytesNotSyncRange) {
uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange;
offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
assert(offset_sync_to >= last_sync_size_);
if (offset_sync_to > 0 &&
offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
last_sync_size_ = offset_sync_to;
}
}
}
return Status::OK();

@ -0,0 +1,89 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#include <vector>
#include "util/file_reader_writer.h"
#include "util/random.h"
#include "util/testharness.h"
namespace rocksdb {
class WritableFileWriterTest : public testing::Test {};
const uint32_t kMb = 1 << 20;
TEST_F(WritableFileWriterTest, RangeSync) {
class FakeWF : public WritableFile {
public:
explicit FakeWF() : size_(0), last_synced_(0) {}
~FakeWF() {}
Status Append(const Slice& data) override {
size_ += data.size();
return Status::OK();
}
Status Close() override {
EXPECT_GE(size_, last_synced_ + kMb);
EXPECT_LT(size_, last_synced_ + 2 * kMb);
// Make sure random writes generated enough writes.
EXPECT_GT(size_, 10 * kMb);
return Status::OK();
}
Status Flush() override { return Status::OK(); }
Status Sync() override { return Status::OK(); }
Status Fsync() override { return Status::OK(); }
void SetIOPriority(Env::IOPriority pri) override {}
uint64_t GetFileSize() override { return size_; }
void GetPreallocationStatus(size_t* block_size,
size_t* last_allocated_block) override {}
size_t GetUniqueId(char* id, size_t max_size) const override { return 0; }
Status InvalidateCache(size_t offset, size_t length) override {
return Status::OK();
}
protected:
Status Allocate(off_t offset, off_t len) override { return Status::OK(); }
Status RangeSync(off_t offset, off_t nbytes) override {
EXPECT_EQ(offset % 4096, 0u);
EXPECT_EQ(nbytes % 4096, 0u);
EXPECT_EQ(offset, last_synced_);
last_synced_ = offset + nbytes;
EXPECT_GE(size_, last_synced_ + kMb);
if (size_ > 2 * kMb) {
EXPECT_LT(size_, last_synced_ + 2 * kMb);
}
return Status::OK();
}
uint64_t size_;
uint64_t last_synced_;
};
EnvOptions env_options;
env_options.bytes_per_sync = kMb;
unique_ptr<FakeWF> wf(new FakeWF);
unique_ptr<WritableFileWriter> writer(
new WritableFileWriter(std::move(wf), env_options));
Random r(301);
std::unique_ptr<char[]> large_buf(new char[10 * kMb]);
for (int i = 0; i < 1000; i++) {
int skew_limit = (i < 700) ? 10 : 15;
uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100);
writer->Append(Slice(large_buf.get(), num));
// Flush in a chance of 1/10.
if (r.Uniform(10) == 0) {
writer->Flush();
}
}
writer->Close();
}
} // namespace rocksdb
int main(int argc, char** argv) {
::testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
}
Loading…
Cancel
Save