From 3dbf4ba220b892a15ba96f4f33384383488322df Mon Sep 17 00:00:00 2001 From: Siying Dong Date: Mon, 20 Jul 2015 14:46:15 -0700 Subject: [PATCH] RangeSync not to sync last 1MB of the file Summary: From other ones' investigation: "sync_file_range() behavior highly depends on kernel version and filesystem. xfs does neighbor page flushing outside of the specified ranges. For example, sync_file_range(fd, 8192, 16384) does not only trigger flushing page #3 to #4, but also flushing many more dirty pages (i.e. up to page#16)... Ranges of the sync_file_range() should be far enough from write() offset (at least 1MB)." Test Plan: make all check Reviewers: igor, rven, kradhakrishnan, yhchiang, IslamAbdelRahman, anthony Reviewed By: anthony Subscribers: yoshinorim, MarkCallaghan, sumeet, domas, dhruba, leveldb, ljin Differential Revision: https://reviews.facebook.net/D15807 --- CMakeLists.txt | 1 + Makefile | 4 ++ include/rocksdb/options.h | 4 +- util/file_reader_writer.cc | 23 +++++++-- util/file_reader_writer_test.cc | 89 +++++++++++++++++++++++++++++++++ 5 files changed, 117 insertions(+), 4 deletions(-) create mode 100644 util/file_reader_writer_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index a82b24099..f5fdb2332 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -306,6 +306,7 @@ set(TESTS util/env_test.cc util/event_logger_test.cc util/filelock_test.cc + util/file_reader_writer_test.cc util/histogram_test.cc util/manual_compaction_test.cc util/memenv_test.cc diff --git a/Makefile b/Makefile index d5d2f5a29..b09bdd87f 100644 --- a/Makefile +++ b/Makefile @@ -248,6 +248,7 @@ TESTS = \ fault_injection_test \ filelock_test \ filename_test \ + file_reader_writer_test \ block_based_filter_block_test \ full_filter_block_test \ histogram_test \ @@ -774,6 +775,9 @@ rate_limiter_test: util/rate_limiter_test.o $(LIBOBJECTS) $(TESTHARNESS) filename_test: db/filename_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +file_reader_writer_test: util/file_reader_writer_test.o 
$(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + block_based_filter_block_test: table/block_based_filter_block_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 3cd809aa6..b9e5c758b 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1014,7 +1014,9 @@ struct DBOptions { void Dump(Logger* log) const; // Allows OS to incrementally sync files to disk while they are being - // written, asynchronously, in the background. + // written, asynchronously, in the background. This operation can be used + // to smooth out write I/Os over time. Users shouldn't rely on it for + // persistency guarantee. // Issue one request for every bytes_per_sync written. 0 turns it off. // Default: 0 // diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc index ebd54c7eb..a12a4b930 100644 --- a/util/file_reader_writer.cc +++ b/util/file_reader_writer.cc @@ -122,9 +122,26 @@ Status WritableFileWriter::Flush() { // TODO: give log file and sst file different options (log // files could be potentially cached in OS for their whole // life time, thus we might not want to flush at all). - if (bytes_per_sync_ && filesize_ - last_sync_size_ >= bytes_per_sync_) { - writable_file_->RangeSync(last_sync_size_, filesize_ - last_sync_size_); - last_sync_size_ = filesize_; + + // We try to avoid sync to the last 1MB of data. For two reasons: + // (1) avoid rewriting the same page that is modified later. + // (2) for older version of OS, write can block while writing out + // the page. + // Xfs does neighbor page flushing outside of the specified ranges. We + // need to make sure sync range is far from the write offset. + if (bytes_per_sync_) { + uint64_t kBytesNotSyncRange = 1024 * 1024; // recent 1MB is not synced. + uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB. 
+ if (filesize_ > kBytesNotSyncRange) { + uint64_t offset_sync_to = filesize_ - kBytesNotSyncRange; + offset_sync_to -= offset_sync_to % kBytesAlignWhenSync; + assert(offset_sync_to >= last_sync_size_); + if (offset_sync_to > 0 && + offset_sync_to - last_sync_size_ >= bytes_per_sync_) { + RangeSync(last_sync_size_, offset_sync_to - last_sync_size_); + last_sync_size_ = offset_sync_to; + } + } } return Status::OK(); diff --git a/util/file_reader_writer_test.cc b/util/file_reader_writer_test.cc new file mode 100644 index 000000000..924c171b7 --- /dev/null +++ b/util/file_reader_writer_test.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#include +#include "util/file_reader_writer.h" +#include "util/random.h" +#include "util/testharness.h" + +namespace rocksdb { + +class WritableFileWriterTest : public testing::Test {}; + +const uint32_t kMb = 1 << 20; + +TEST_F(WritableFileWriterTest, RangeSync) { + class FakeWF : public WritableFile { + public: + explicit FakeWF() : size_(0), last_synced_(0) {} + ~FakeWF() {} + + Status Append(const Slice& data) override { + size_ += data.size(); + return Status::OK(); + } + Status Close() override { + EXPECT_GE(size_, last_synced_ + kMb); + EXPECT_LT(size_, last_synced_ + 2 * kMb); + // Make sure random writes generated enough writes. 
+ EXPECT_GT(size_, 10 * kMb); + return Status::OK(); + } + Status Flush() override { return Status::OK(); } + Status Sync() override { return Status::OK(); } + Status Fsync() override { return Status::OK(); } + void SetIOPriority(Env::IOPriority pri) override {} + uint64_t GetFileSize() override { return size_; } + void GetPreallocationStatus(size_t* block_size, + size_t* last_allocated_block) override {} + size_t GetUniqueId(char* id, size_t max_size) const override { return 0; } + Status InvalidateCache(size_t offset, size_t length) override { + return Status::OK(); + } + + protected: + Status Allocate(off_t offset, off_t len) override { return Status::OK(); } + Status RangeSync(off_t offset, off_t nbytes) override { + EXPECT_EQ(offset % 4096, 0u); + EXPECT_EQ(nbytes % 4096, 0u); + + EXPECT_EQ(offset, last_synced_); + last_synced_ = offset + nbytes; + EXPECT_GE(size_, last_synced_ + kMb); + if (size_ > 2 * kMb) { + EXPECT_LT(size_, last_synced_ + 2 * kMb); + } + return Status::OK(); + } + + uint64_t size_; + uint64_t last_synced_; + }; + + EnvOptions env_options; + env_options.bytes_per_sync = kMb; + unique_ptr wf(new FakeWF); + unique_ptr writer( + new WritableFileWriter(std::move(wf), env_options)); + Random r(301); + std::unique_ptr large_buf(new char[10 * kMb]); + for (int i = 0; i < 1000; i++) { + int skew_limit = (i < 700) ? 10 : 15; + uint32_t num = r.Skewed(skew_limit) * 100 + r.Uniform(100); + writer->Append(Slice(large_buf.get(), num)); + + // Flush in a chance of 1/10. + if (r.Uniform(10) == 0) { + writer->Flush(); + } + } + writer->Close(); +} +} // namespace rocksdb + +int main(int argc, char** argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +}