From 778e179046f0996232768bff2862ec1df7e40207 Mon Sep 17 00:00:00 2001
From: Haobo Xu
Date: Tue, 4 Jun 2013 15:51:50 -0700
Subject: [PATCH] [RocksDB] Sync file to disk incrementally

Summary:
During compaction, we sync the output files after they are fully written
out. This causes unnecessary blocking of the compaction thread and
burstiness of the write traffic. This diff simply asks the OS to sync data
incrementally as it is written, in the background. The hope is that, by the
final sync, most of the data is already on disk and we would block less on
the sync call. Thus, each compaction runs faster and we could use fewer
compaction threads to saturate IO. In addition, the write traffic will be
smoothed out, hopefully reducing the IO P99 latency too.

Some quick tests show a 10~20% improvement in per-thread compaction
throughput. Combined with the POSIX fadvise hint on compaction reads, just
5 threads are enough to almost saturate the udb flash bandwidth for an
800-byte write-only benchmark. What's more promising is that, with
saturated IO, iostat shows the average wait time is actually smoother and
much smaller.

For the 800-byte write-only test:
Before the change: await oscillates between 3ms and 10ms
After the change: await ranges from 1ms to 3ms

Will test against a read-modify-write workload too, to see if the high P99
read latency can be resolved. Will introduce a parameter to control the
sync interval in a follow-up diff after cleaning up EnvOptions.

Test Plan: make check; db_bench; db_stress

Reviewers: dhruba

CC: leveldb

Differential Revision: https://reviews.facebook.net/D11115
---
 include/leveldb/env.h | 10 ++++++++++
 util/env_posix.cc     | 20 +++++++++++++++++++-
 2 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/include/leveldb/env.h b/include/leveldb/env.h
index 7d70ef8f5..352ad7dba 100644
--- a/include/leveldb/env.h
+++ b/include/leveldb/env.h
@@ -360,6 +360,16 @@ class WritableFile {
     return Status::OK();
   }
 
+  // Sync a file range with disk.
+  // offset is the starting byte of the file range to be synchronized.
+  // nbytes specifies the length of the range to be synchronized.
+  // This asks the OS to initiate flushing the cached data to disk,
+  // without waiting for completion.
+  // Default implementation does nothing.
+  virtual Status RangeSync(off64_t offset, off64_t nbytes) {
+    return Status::OK();
+  }
+
  private:
   size_t last_preallocated_block_;
   size_t preallocation_block_size_;
diff --git a/util/env_posix.cc b/util/env_posix.cc
index 1388566c5..1745165c6 100644
--- a/util/env_posix.cc
+++ b/util/env_posix.cc
@@ -22,6 +22,7 @@
 #include
 #if defined(OS_LINUX)
 #include
+#include
 #endif
 #if defined(LEVELDB_PLATFORM_ANDROID)
 #include
@@ -503,6 +504,7 @@ class PosixWritableFile : public WritableFile {
   uint64_t filesize_;
   bool pending_sync_;
   bool pending_fsync_;
+  uint64_t last_sync_size_;
 
  public:
   PosixWritableFile(const std::string& fname, int fd, size_t capacity,
@@ -514,7 +516,8 @@ class PosixWritableFile : public WritableFile {
         buf_(new char[capacity]),
         filesize_(0),
         pending_sync_(false),
-        pending_fsync_(false) {
+        pending_fsync_(false),
+        last_sync_size_(0) {
     assert(!options.use_mmap_writes);
   }
@@ -601,6 +604,13 @@ class PosixWritableFile : public WritableFile {
       src += done;
     }
     cursize_ = 0;
+
+    // sync OS cache to disk for every 2MB written
+    if (filesize_ - last_sync_size_ >= 2 * 1024 * 1024) {
+      RangeSync(last_sync_size_, filesize_ - last_sync_size_);
+      last_sync_size_ = filesize_;
+    }
+
     return Status::OK();
   }
@@ -638,6 +648,14 @@ class PosixWritableFile : public WritableFile {
       return IOError(filename_, errno);
     }
   }
+
+  virtual Status RangeSync(off64_t offset, off64_t nbytes) {
+    if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) {
+      return Status::OK();
+    } else {
+      return IOError(filename_, errno);
+    }
+  }
 #endif
 };
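
---

Note (not part of the diff above): the incremental-sync pattern from this
patch can be exercised standalone. The sketch below is a minimal
illustration, assuming Linux, where sync_file_range(2) is declared in
<fcntl.h> under _GNU_SOURCE; the file path, the 4KB write buffer, and the
2MB threshold are illustrative choices mirroring the patch, not RocksDB
code.

#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1  // sync_file_range(2) is a glibc/Linux extension
#endif
#include <fcntl.h>
#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  const uint64_t kBytesPerSync = 2 * 1024 * 1024;  // 2MB, as in the patch
  // Hypothetical scratch file for the demo.
  int fd = open("/tmp/incr_sync_demo", O_CREAT | O_TRUNC | O_WRONLY, 0644);
  if (fd < 0) { perror("open"); return 1; }

  char buf[4096];
  memset(buf, 'x', sizeof(buf));
  uint64_t filesize = 0;        // bytes written so far
  uint64_t last_sync_size = 0;  // end of the last range handed to the kernel

  for (int i = 0; i < 4096; ++i) {  // 16MB total
    ssize_t done = write(fd, buf, sizeof(buf));
    if (done < 0) { perror("write"); return 1; }
    filesize += static_cast<uint64_t>(done);

    // Every 2MB, ask the kernel to start writing back the newly dirtied
    // range. SYNC_FILE_RANGE_WRITE initiates writeback without waiting,
    // so the write loop is not blocked.
    if (filesize - last_sync_size >= kBytesPerSync) {
      if (sync_file_range(fd, static_cast<off64_t>(last_sync_size),
                          static_cast<off64_t>(filesize - last_sync_size),
                          SYNC_FILE_RANGE_WRITE) != 0) {
        perror("sync_file_range");  // advisory only; the final fsync still runs
      }
      last_sync_size = filesize;
    }
  }

  // The final durable sync; with most pages already written back, this
  // should block for less time than syncing the whole file at once.
  if (fsync(fd) != 0) { perror("fsync"); return 1; }
  close(fd);
  return 0;
}

One design point worth noting: SYNC_FILE_RANGE_WRITE only initiates
writeback of dirty pages in the given range and returns immediately. It
does not sync file metadata and is not a durability guarantee, which is why
the patch keeps the full Sync/Fsync when the file is finished.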