Optionally wait on bytes_per_sync to smooth I/O (#5183)

Summary: The existing implementation does not guarantee bytes reach disk every `bytes_per_sync` when writing SST files, or every `wal_bytes_per_sync` when writing WALs. This can cause confusing behavior for users who enable this feature to avoid large syncs during flush and compaction, but then end up hitting them anyways. My understanding of the existing behavior is we used `sync_file_range` with `SYNC_FILE_RANGE_WRITE` to submit ranges for async writeback, such that we could continue processing the next range of bytes while that I/O is happening. I believe we can preserve that benefit while also limiting how far the processing can get ahead of the I/O, which prevents huge syncs from happening when the file finishes. Consider this `sync_file_range` usage: `sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE)`. Expanding the range to start at 0 and adding the `SYNC_FILE_RANGE_WAIT_BEFORE` flag causes any pending writeback (like from a previous call to `sync_file_range`) to finish before it proceeds to submit the latest `nbytes` for writeback. The latest `nbytes` are still written back asynchronously, unless processing exceeds I/O speed, in which case the following `sync_file_range` will need to wait on it. There is a second change in this PR to use `fdatasync` when `sync_file_range` is unavailable (determined statically) or has some known problem with the underlying filesystem (determined dynamically). The above two changes only apply when the user enables a new option, `strict_bytes_per_sync`. Pull Request resolved: https://github.com/facebook/rocksdb/pull/5183 Differential Revision: D14953553 Pulled By: siying fbshipit-source-id: 445c3862e019fb7b470f9c7f314fc231b62706e9
7 years ago · 8272a6de57
parent df38c1ce66
commit 8272a6de57
15 changed files with 189 additions and 46 deletions
--- a/HISTORY.md
+++ b/HISTORY.md
@ -1,4 +1,7 @@
 # Rocksdb Change Log
 ## Unreleased
 ### New Features
 * Add an option `strict_bytes_per_sync` that causes a file-writing thread to block rather than exceed the limit on bytes pending writeback specified by `bytes_per_sync` or `wal_bytes_per_sync`.
 ## Unreleased
 ### New Features
--- a/env/env.cc
+++ b/env/env.cc
@ -368,6 +368,7 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
  env_options->writable_file_max_buffer_size =
      options.writable_file_max_buffer_size;
  env_options->allow_fallocate = options.allow_fallocate;
  env_options->strict_bytes_per_sync = options.strict_bytes_per_sync;
 }
 }
--- a/env/env_hdfs.cc
+++ b/env/env_hdfs.cc
@ -189,8 +189,12 @@ class HdfsWritableFile: public WritableFile {
  hdfsFile hfile_;
 public:
-  HdfsWritableFile(hdfsFS fileSys, const std::string& fname)
+  HdfsWritableFile(hdfsFS fileSys, const std::string& fname,
-      : fileSys_(fileSys), filename_(fname) , hfile_(nullptr) {
+                   const EnvOptions& options)
      : WritableFile(options),
        fileSys_(fileSys),
        filename_(fname),
        hfile_(nullptr) {
    ROCKS_LOG_DEBUG(mylog, "[hdfs] HdfsWritableFile opening %s\n",
                    filename_.c_str());
    hfile_ = hdfsOpenFile(fileSys_, filename_.c_str(), O_WRONLY, 0, 0, 0);
@ -419,7 +423,7 @@ Status HdfsEnv::NewWritableFile(const std::string& fname,
                                const EnvOptions& /*options*/) {
  result->reset();
  Status s;
-  HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
+  HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options);
  if (f == nullptr || !f->isValid()) {
    delete f;
    *result = nullptr;
@ -586,7 +590,7 @@ Status HdfsEnv::UnlockFile(FileLock* /*lock*/) { return Status::OK(); }
 Status HdfsEnv::NewLogger(const std::string& fname,
                          std::shared_ptr<Logger>* result) {
-  HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname);
+  HdfsWritableFile* f = new HdfsWritableFile(fileSys_, fname, options);
  if (f == nullptr || !f->isValid()) {
    delete f;
    *result = nullptr;
--- a/env/io_posix.cc
+++ b/env/io_posix.cc
@ -57,6 +57,7 @@ int Fadvise(int fd, off_t offset, size_t len, int advice) {
 }
 namespace {
 size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
 #ifdef OS_LINUX
  struct stat buf;
@ -125,7 +126,45 @@ size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
 #endif
  return kDefaultPageSize;
 }
-} //  namespace
+
 #ifdef ROCKSDB_RANGESYNC_PRESENT
 #if !defined(ZFS_SUPER_MAGIC)
 // The magic number for ZFS was not exposed until recently. It should be fixed
 // forever so we can just copy the magic number here.
 #define ZFS_SUPER_MAGIC 0x2fc12fc1
 #endif
 bool IsSyncFileRangeSupported(int __attribute__((__unused__)) fd) {
  // `fstatfs` is only available on Linux, but so is `sync_file_range`, so
  // `defined(ROCKSDB_RANGESYNC_PRESENT)` should imply `defined(OS_LINUX)`.
  struct statfs buf;
  int ret = fstatfs(fd, &buf);
  assert(ret == 0);
  if (ret != 0) {
    // We don't know whether the filesystem properly supports `sync_file_range`.
    // Even if it doesn't, we don't know of any safety issue with trying to call
    // it anyways. So, to preserve the same behavior as before this `fstatfs`
    // check was introduced, we assume `sync_file_range` is usable.
    return true;
  }
  if (buf.f_type == ZFS_SUPER_MAGIC) {
    // Testing on ZFS showed the writeback did not happen asynchronously when
    // `sync_file_range` was called, even though it returned success. Avoid it
    // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`,
    // even though this'll incur extra I/O for metadata.
    return false;
  }
  // No known problems with other filesystems' implementations of
  // `sync_file_range`, so allow them to use it.
  return true;
 }
 #undef ZFS_SUPER_MAGIC
 #endif  // ROCKSDB_RANGESYNC_PRESENT
 }  // anonymous namespace
 /*
 * DirectIOHelper
@ -734,7 +773,8 @@ Status PosixMmapFile::Allocate(uint64_t offset, uint64_t len) {
 */
 PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
                                     const EnvOptions& options)
-    : filename_(fname),
+    : WritableFile(options),
      filename_(fname),
      use_direct_io_(options.use_direct_writes),
      fd_(fd),
      filesize_(0),
@ -743,6 +783,9 @@ PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
 #endif
 #ifdef ROCKSDB_RANGESYNC_PRESENT
  sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
 #endif  // ROCKSDB_RANGESYNC_PRESENT
  assert(!options.use_mmap_writes);
 }
@ -945,20 +988,32 @@ Status PosixWritableFile::Allocate(uint64_t offset, uint64_t len) {
 }
 #endif
 #ifdef ROCKSDB_RANGESYNC_PRESENT
 Status PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes) {
 #ifdef ROCKSDB_RANGESYNC_PRESENT
  assert(offset <= std::numeric_limits<off_t>::max());
  assert(nbytes <= std::numeric_limits<off_t>::max());
-  if (sync_file_range(fd_, static_cast<off_t>(offset),
+  if (sync_file_range_supported_) {
-      static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE) == 0) {
+    int ret;
-    return Status::OK();
+    if (strict_bytes_per_sync_) {
      // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
      // that spans all bytes written so far tells `sync_file_range` to wait for
      // any outstanding writeback requests to finish before issuing a new one.
      ret =
          sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes),
                          SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
    } else {
-    return IOError("While sync_file_range offset " + ToString(offset) +
+      ret = sync_file_range(fd_, static_cast<off_t>(offset),
-                       " bytes " + ToString(nbytes),
+                            static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE);
    }
    if (ret != 0) {
      return IOError("While sync_file_range returned " + ToString(ret),
                     filename_, errno);
    }
    return Status::OK();
  }
 #endif  // ROCKSDB_RANGESYNC_PRESENT
  return WritableFile::RangeSync(offset, nbytes);
 }
 #endif
 #ifdef OS_LINUX
 size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
--- a/env/io_posix.h
+++ b/env/io_posix.h
@ -118,6 +118,11 @@ class PosixWritableFile : public WritableFile {
  bool allow_fallocate_;
  bool fallocate_with_keep_size_;
 #endif
 #ifdef ROCKSDB_RANGESYNC_PRESENT
  // Even if the syscall is present, the filesystem may still not properly
  // support it, so we need to do a dynamic check too.
  bool sync_file_range_supported_;
 #endif  // ROCKSDB_RANGESYNC_PRESENT
 public:
  explicit PosixWritableFile(const std::string& fname, int fd,
@ -144,9 +149,7 @@ class PosixWritableFile : public WritableFile {
 #ifdef ROCKSDB_FALLOCATE_PRESENT
  virtual Status Allocate(uint64_t offset, uint64_t len) override;
 #endif
 #ifdef ROCKSDB_RANGESYNC_PRESENT
  virtual Status RangeSync(uint64_t offset, uint64_t nbytes) override;
 #endif
 #ifdef OS_LINUX
  virtual size_t GetUniqueId(char* id, size_t max_size) const override;
 #endif
--- a/include/rocksdb/env.h
+++ b/include/rocksdb/env.h
@ -91,6 +91,23 @@ struct EnvOptions {
  // Default: 0
  uint64_t bytes_per_sync = 0;
  // When true, guarantees the file has at most `bytes_per_sync` bytes submitted
  // for writeback at any given time.
  //
  //  - If `sync_file_range` is supported it achieves this by waiting for any
  //    prior `sync_file_range`s to finish before proceeding. In this way,
  //    processing (compression, etc.) can proceed uninhibited in the gap
  //    between `sync_file_range`s, and we block only when I/O falls behind.
  //  - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
  //    always blocks, thus preventing the interleaving of I/O and processing.
  //
  // Note: Enabling this option does not provide any additional persistence
  // guarantees, as it may use `sync_file_range`, which does not write out
  // metadata.
  //
  // Default: false
  bool strict_bytes_per_sync = false;
  // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which
  // means that file size won't change as part of preallocation.
  // If false, preallocation will also change the file size. This option will
@ -629,7 +646,16 @@ class WritableFile {
      : last_preallocated_block_(0),
        preallocation_block_size_(0),
        io_priority_(Env::IO_TOTAL),
-        write_hint_(Env::WLTH_NOT_SET) {}
+        write_hint_(Env::WLTH_NOT_SET),
        strict_bytes_per_sync_(false) {}
  explicit WritableFile(const EnvOptions& options)
      : last_preallocated_block_(0),
        preallocation_block_size_(0),
        io_priority_(Env::IO_TOTAL),
        write_hint_(Env::WLTH_NOT_SET),
        strict_bytes_per_sync_(options.strict_bytes_per_sync) {}
  virtual ~WritableFile();
  // Append data to the end of the file
@ -744,6 +770,9 @@ class WritableFile {
  // without waiting for completion.
  // Default implementation does nothing.
  virtual Status RangeSync(uint64_t /*offset*/, uint64_t /*nbytes*/) {
    if (strict_bytes_per_sync_) {
      return Sync();
    }
    return Status::OK();
  }
@ -792,6 +821,7 @@ class WritableFile {
 protected:
  Env::IOPriority io_priority_;
  Env::WriteLifeTimeHint write_hint_;
  const bool strict_bytes_per_sync_;
 };
 // A file abstraction for random reading and writing.
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@ -818,6 +818,27 @@ struct DBOptions {
  // Dynamically changeable through SetDBOptions() API.
  uint64_t wal_bytes_per_sync = 0;
  // When true, guarantees WAL files have at most `wal_bytes_per_sync`
  // bytes submitted for writeback at any given time, and SST files have at most
  // `bytes_per_sync` bytes pending writeback at any given time. This can be
  // used to handle cases where processing speed exceeds I/O speed during file
  // generation, which can lead to a huge sync when the file is finished, even
  // with `bytes_per_sync` / `wal_bytes_per_sync` properly configured.
  //
  //  - If `sync_file_range` is supported it achieves this by waiting for any
  //    prior `sync_file_range`s to finish before proceeding. In this way,
  //    processing (compression, etc.) can proceed uninhibited in the gap
  //    between `sync_file_range`s, and we block only when I/O falls behind.
  //  - Otherwise the `WritableFile::Sync` method is used. Note this mechanism
  //    always blocks, thus preventing the interleaving of I/O and processing.
  //
  // Note: Enabling this option does not provide any additional persistence
  // guarantees, as it may use `sync_file_range`, which does not write out
  // metadata.
  //
  // Default: false
  bool strict_bytes_per_sync = false;
  // A vector of EventListeners whose callback functions will be called
  // when specific RocksDB event happens.
  std::vector<std::shared_ptr<EventListener>> listeners;
--- a/options/db_options.cc
+++ b/options/db_options.cc
@ -240,6 +240,7 @@ MutableDBOptions::MutableDBOptions()
      max_open_files(-1),
      bytes_per_sync(0),
      wal_bytes_per_sync(0),
      strict_bytes_per_sync(false),
      compaction_readahead_size(0) {}
 MutableDBOptions::MutableDBOptions(const DBOptions& options)
@ -258,6 +259,7 @@ MutableDBOptions::MutableDBOptions(const DBOptions& options)
      max_open_files(options.max_open_files),
      bytes_per_sync(options.bytes_per_sync),
      wal_bytes_per_sync(options.wal_bytes_per_sync),
      strict_bytes_per_sync(options.strict_bytes_per_sync),
      compaction_readahead_size(options.compaction_readahead_size) {}
 void MutableDBOptions::Dump(Logger* log) const {
@ -293,6 +295,9 @@ void MutableDBOptions::Dump(Logger* log) const {
  ROCKS_LOG_HEADER(log,
                   "                     Options.wal_bytes_per_sync: %" PRIu64,
                   wal_bytes_per_sync);
  ROCKS_LOG_HEADER(log,
                   "                  Options.strict_bytes_per_sync: %d",
                   strict_bytes_per_sync);
  ROCKS_LOG_HEADER(log,
                   "      Options.compaction_readahead_size: %" ROCKSDB_PRIszt,
                   compaction_readahead_size);
--- a/options/db_options.h
+++ b/options/db_options.h
@ -103,6 +103,7 @@ struct MutableDBOptions {
  int max_open_files;
  uint64_t bytes_per_sync;
  uint64_t wal_bytes_per_sync;
  bool strict_bytes_per_sync;
  size_t compaction_readahead_size;
 };
--- a/options/options_helper.cc
+++ b/options/options_helper.cc
@ -59,6 +59,7 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
      mutable_db_options.max_background_compactions;
  options.bytes_per_sync = mutable_db_options.bytes_per_sync;
  options.wal_bytes_per_sync = mutable_db_options.wal_bytes_per_sync;
  options.strict_bytes_per_sync = mutable_db_options.strict_bytes_per_sync;
  options.max_subcompactions = immutable_db_options.max_subcompactions;
  options.max_background_flushes = immutable_db_options.max_background_flushes;
  options.max_log_file_size = immutable_db_options.max_log_file_size;
@ -1551,6 +1552,10 @@ std::unordered_map<std::string, OptionTypeInfo>
         {offsetof(struct DBOptions, wal_bytes_per_sync), OptionType::kUInt64T,
          OptionVerificationType::kNormal, true,
          offsetof(struct MutableDBOptions, wal_bytes_per_sync)}},
        {"strict_bytes_per_sync",
         {offsetof(struct DBOptions, strict_bytes_per_sync), OptionType::kBoolean,
          OptionVerificationType::kNormal, true,
          offsetof(struct MutableDBOptions, strict_bytes_per_sync)}},
        {"stats_dump_period_sec",
         {offsetof(struct DBOptions, stats_dump_period_sec), OptionType::kUInt,
          OptionVerificationType::kNormal, true,
--- a/options/options_settable_test.cc
+++ b/options/options_settable_test.cc
@ -256,6 +256,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
                             "paranoid_checks=true;"
                             "is_fd_close_on_exec=false;"
                             "bytes_per_sync=4295013613;"
                             "strict_bytes_per_sync=true;"
                             "enable_thread_tracking=false;"
                             "recycle_log_file_num=0;"
                             "create_missing_column_families=true;"
--- a/options/options_test.cc
+++ b/options/options_test.cc
@ -141,6 +141,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
      {"writable_file_max_buffer_size", "314159"},
      {"bytes_per_sync", "47"},
      {"wal_bytes_per_sync", "48"},
      {"strict_bytes_per_sync", "true"},
  };
  ColumnFamilyOptions base_cf_opt;
@ -277,6 +278,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
  ASSERT_EQ(new_db_opt.writable_file_max_buffer_size, 314159);
  ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47));
  ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48));
  ASSERT_EQ(new_db_opt.strict_bytes_per_sync, true);
  db_options_map["max_open_files"] = "hello";
  ASSERT_NOK(GetDBOptionsFromMap(base_db_opt, db_options_map, &new_db_opt));
--- a/port/win/io_win.cc
+++ b/port/win/io_win.cc
@ -383,9 +383,11 @@ Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
  return fallocate(filename_, hFile_, spaceToReserve);
 }
-WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile, size_t page_size,
+WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
-  size_t allocation_granularity, const EnvOptions& options)
+                         size_t page_size, size_t allocation_granularity,
                         const EnvOptions& options)
    : WinFileData(fname, hFile, false),
      WritableFile(options),
      hMap_(NULL),
      page_size_(page_size),
      allocation_granularity_(allocation_granularity),
@ -966,7 +968,8 @@ WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
                                 size_t alignment, size_t /* capacity */,
                                 const EnvOptions& options)
    : WinFileData(fname, hFile, options.use_direct_writes),
-      WinWritableImpl(this, alignment) {
+      WinWritableImpl(this, alignment),
      WritableFile(options) {
  assert(!options.use_mmap_writes);
 }
--- a/utilities/env_librados.cc
+++ b/utilities/env_librados.cc
@ -291,11 +291,17 @@ class LibradosWritableFile : public WritableFile {
  }
 public:
-  LibradosWritableFile(librados::IoCtx * io_ctx,
+  LibradosWritableFile(librados::IoCtx* io_ctx, std::string fid,
-                       std::string fid,
+                       std::string hint, const EnvLibrados* const env,
-                       std::string hint,
+                       const EnvOptions& options)
-                       const EnvLibrados * const env)
+      : WritableFile(options),
-    : _io_ctx(io_ctx), _fid(fid), _hint(hint), _env(env), _buffer(), _buffer_size(0), _file_size(0) {
+        _io_ctx(io_ctx),
        _fid(fid),
        _hint(hint),
        _env(env),
        _buffer(),
        _buffer_size(0),
        _file_size(0) {
    int ret = _io_ctx->stat(_fid, &_file_size, nullptr);
    // if file not exist
@ -1001,7 +1007,8 @@ Status EnvLibrados::NewWritableFile(
      _AddFid(fpath, fid);
    }
-    result->reset(new LibradosWritableFile(_GetIoctx(fpath), fid, fpath, this));
+    result->reset(
        new LibradosWritableFile(_GetIoctx(fpath), fid, fpath, this, options));
    s = Status::OK();
  } while (0);
@ -1042,7 +1049,8 @@ Status EnvLibrados::ReuseWritableFile(
      break;
    }
-    result->reset(new LibradosWritableFile(_GetIoctx(dst_fpath), src_fid, dst_fpath, this));
+    result->reset(new LibradosWritableFile(_GetIoctx(dst_fpath), src_fid,
                                           dst_fpath, this, options));
  } while (0);
  LOG_DEBUG("[OUT]%s\n", r.ToString().c_str());
@ -1485,5 +1493,5 @@ EnvLibrados* EnvLibrados::Default() {
                                 default_pool_name);
  return &default_env;
 }
-// @lint-ignore TXT4 T25377293 Grandfathered in
+
-}
+}  // namespace rocksdb
--- a/utilities/env_mirror.cc
+++ b/utilities/env_mirror.cc
@ -98,7 +98,8 @@ class WritableFileMirror : public WritableFile {
 public:
  std::unique_ptr<WritableFile> a_, b_;
  std::string fname;
-  explicit WritableFileMirror(std::string f) : fname(f) {}
+  explicit WritableFileMirror(std::string f, const EnvOptions& options)
      : WritableFile(options), fname(f) {}
  Status Append(const Slice& data) override {
    Status as = a_->Append(data);
@ -229,7 +230,7 @@ Status EnvMirror::NewWritableFile(const std::string& f,
                                  std::unique_ptr<WritableFile>* r,
                                  const EnvOptions& options) {
  if (f.find("/proc/") == 0) return a_->NewWritableFile(f, r, options);
-  WritableFileMirror* mf = new WritableFileMirror(f);
+  WritableFileMirror* mf = new WritableFileMirror(f, options);
  Status as = a_->NewWritableFile(f, &mf->a_, options);
  Status bs = b_->NewWritableFile(f, &mf->b_, options);
  assert(as == bs);
@ -246,7 +247,7 @@ Status EnvMirror::ReuseWritableFile(const std::string& fname,
                                    const EnvOptions& options) {
  if (fname.find("/proc/") == 0)
    return a_->ReuseWritableFile(fname, old_fname, r, options);
-  WritableFileMirror* mf = new WritableFileMirror(fname);
+  WritableFileMirror* mf = new WritableFileMirror(fname, options);
  Status as = a_->ReuseWritableFile(fname, old_fname, &mf->a_, options);
  Status bs = b_->ReuseWritableFile(fname, old_fname, &mf->b_, options);
  assert(as == bs);