From f26cb0f093393b96659d33d9d1cd5f0f58ca31f9 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 17 Mar 2014 21:52:14 -0700 Subject: [PATCH] Optimize fallocation Summary: Based on my recent findings (posted in our internal group), if we use fallocate without the KEEP_SIZE flag, we get superior performance of fdatasync() in append-only workloads. This diff provides an option for the user to not use the KEEP_SIZE flag, thus improving their sync performance by up to 2x-3x. At one point we also just called posix_fallocate instead of fallocate, which isn't very fast: http://code.woboq.org/userspace/glibc/sysdeps/posix/posix_fallocate.c.html (tl;dr it manually writes out zero bytes to allocate storage). This diff also fixes that, by first calling fallocate and then posix_fallocate if fallocate is not supported. Test Plan: make check Reviewers: dhruba, sdong, haobo, ljin Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D16761 --- db/db_impl.cc | 9 ++--- db/repair.cc | 4 +-- db/version_set.cc | 2 +- include/rocksdb/env.h | 22 +++++++++++-- util/env.cc | 10 +++--- util/env_posix.cc | 76 ++++++++++++++++++++++++++++++------------- 6 files changed, 86 insertions(+), 37 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 06e485d50..3354e79c0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -456,8 +456,8 @@ Status DBImpl::NewDB() { const std::string manifest = DescriptorFileName(dbname_, 1); unique_ptr file; - Status s = env_->NewWritableFile(manifest, &file, - storage_options_.AdaptForLogWrite()); + Status s = env_->NewWritableFile( + manifest, &file, env_->OptimizeForManifestWrite(storage_options_)); if (!s.ok()) { return s; } @@ -3626,7 +3626,8 @@ Status DBImpl::MakeRoomForWrite(bool force, { DelayLoggingAndReset(); s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), - &lfile, storage_options_.AdaptForLogWrite()); + &lfile, + env_->OptimizeForLogWrite(storage_options_)); if (s.ok()) { // Our final size should be less 
than write_buffer_size // (compression, etc) but err on the side of caution. @@ -3912,7 +3913,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { EnvOptions soptions(options); s = impl->options_.env->NewWritableFile( LogFileName(impl->options_.wal_dir, new_log_number), &lfile, - soptions.AdaptForLogWrite()); + impl->options_.env->OptimizeForLogWrite(soptions)); if (s.ok()) { lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); VersionEdit edit; diff --git a/db/repair.cc b/db/repair.cc index 235bb8967..f3b95f5e5 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -306,8 +306,8 @@ class Repairer { Status WriteDescriptor() { std::string tmp = TempFileName(dbname_, 1); unique_ptr file; - Status status = - env_->NewWritableFile(tmp, &file, storage_options_.AdaptForLogWrite()); + Status status = env_->NewWritableFile( + tmp, &file, env_->OptimizeForManifestWrite(storage_options_)); if (!status.ok()) { return status; } diff --git a/db/version_set.cc b/db/version_set.cc index 0ce8a7efe..7276cd0b6 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1564,7 +1564,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, unique_ptr descriptor_file; s = env_->NewWritableFile( DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, storage_options_.AdaptForLogWrite()); + &descriptor_file, env_->OptimizeForManifestWrite(storage_options_)); if (s.ok()) { descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); s = WriteSnapshot(descriptor_log_.get()); diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 16eb16440..f1c579981 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -49,8 +49,6 @@ struct EnvOptions { // construct from Options explicit EnvOptions(const Options& options); - EnvOptions AdaptForLogWrite() const; - // If true, then allow caching of data in environment buffers bool use_os_buffer = true; @@ -61,13 +59,21 @@ struct EnvOptions { bool 
use_mmap_writes = true; // If true, set the FD_CLOEXEC on open fd. - bool set_fd_cloexec= true; + bool set_fd_cloexec = true; // Allows OS to incrementally sync files to disk while they are being // written, in the background. Issue one request for every bytes_per_sync // written. 0 turns it off. // Default: 0 uint64_t bytes_per_sync = 0; + + // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which + // means that file size won't change as part of preallocation. + // If false, preallocation will also change the file size. Setting it to + // false improves the performance in workloads where you sync the data on + // every write. PosixEnv currently sets it to true for both MANIFEST and + // WAL writes + bool fallocate_with_keep_size = true; }; class Env { @@ -260,6 +266,16 @@ class Env { // Generates a unique id that can be used to identify a db virtual std::string GenerateUniqueId(); + // OptimizeForLogWrite will create a new EnvOptions object that is a copy of + // the EnvOptions in the parameters, but is optimized for writing log files. + // Default implementation returns the copy of the same object. + virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const; + // OptimizeForManifestWrite will create a new EnvOptions object that is a copy + // of the EnvOptions in the parameters, but is optimized for writing manifest + // files. Default implementation returns the copy of the same object. 
+ virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) + const; + private: // No copying allowed Env(const Env&); diff --git a/util/env.cc b/util/env.cc index 419c8145d..f2ebfcd59 100644 --- a/util/env.cc +++ b/util/env.cc @@ -241,10 +241,12 @@ void AssignEnvOptions(EnvOptions* env_options, const Options& options) { } -EnvOptions EnvOptions::AdaptForLogWrite() const { - EnvOptions adapted = *this; - adapted.use_mmap_writes = false; - return adapted; +EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const { + return env_options; +} + +EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const { + return env_options; } EnvOptions::EnvOptions(const Options& options) { diff --git a/util/env_posix.cc b/util/env_posix.cc index 89d8df68d..c610c1546 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -354,9 +354,9 @@ class PosixMmapFile : public WritableFile { char* dst_; // Where to write next (in range [base_,limit_]) char* last_sync_; // Where have we synced up to uint64_t file_offset_; // Offset of base_ in file - // Have we done an munmap of unsynced data? 
bool pending_sync_; + bool fallocate_with_keep_size_; // Roundup x to a multiple of y static size_t Roundup(size_t x, size_t y) { @@ -399,7 +399,12 @@ class PosixMmapFile : public WritableFile { assert(base_ == nullptr); TEST_KILL_RANDOM(rocksdb_kill_odds); - int alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + // we can't fallocate with FALLOC_FL_KEEP_SIZE here + int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); + if (alloc_status != 0) { + // fallback to posix_fallocate + alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + } if (alloc_status != 0) { return Status::IOError("Error allocating space to file : " + filename_ + "Error : " + strerror(alloc_status)); @@ -436,7 +441,8 @@ class PosixMmapFile : public WritableFile { dst_(nullptr), last_sync_(nullptr), file_offset_(0), - pending_sync_(false) { + pending_sync_(false), + fallocate_with_keep_size_(options.fallocate_with_keep_size) { assert((page_size & (page_size - 1)) == 0); assert(options.use_mmap_writes); } @@ -584,7 +590,9 @@ class PosixMmapFile : public WritableFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) { TEST_KILL_RANDOM(rocksdb_kill_odds); - if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { return Status::OK(); } else { return IOError(filename_, errno); @@ -606,20 +614,22 @@ class PosixWritableFile : public WritableFile { bool pending_fsync_; uint64_t last_sync_size_; uint64_t bytes_per_sync_; + bool fallocate_with_keep_size_; public: PosixWritableFile(const std::string& fname, int fd, size_t capacity, - const EnvOptions& options) : - filename_(fname), - fd_(fd), - cursize_(0), - capacity_(capacity), - buf_(new char[capacity]), - filesize_(0), - pending_sync_(false), - pending_fsync_(false), - last_sync_size_(0), - bytes_per_sync_(options.bytes_per_sync) { + const EnvOptions& options) + : filename_(fname), + fd_(fd), + cursize_(0), + capacity_(capacity), + buf_(new char[capacity]), + filesize_(0), + pending_sync_(false), + pending_fsync_(false), + last_sync_size_(0), + bytes_per_sync_(options.bytes_per_sync), + fallocate_with_keep_size_(options.fallocate_with_keep_size) { assert(!options.use_mmap_writes); } @@ -771,7 +781,9 @@ class PosixWritableFile : public WritableFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) { TEST_KILL_RANDOM(rocksdb_kill_odds); - if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { return Status::OK(); } else { return IOError(filename_, errno); @@ -797,14 +809,15 @@ class PosixRandomRWFile : public RandomRWFile { int fd_; bool pending_sync_; bool pending_fsync_; + bool fallocate_with_keep_size_; public: - PosixRandomRWFile(const std::string& fname, int fd, - const EnvOptions& options) : - filename_(fname), - fd_(fd), - pending_sync_(false), - pending_fsync_(false) { + PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options) + : filename_(fname), + fd_(fd), + pending_sync_(false), + pending_fsync_(false), + fallocate_with_keep_size_(options.fallocate_with_keep_size) { assert(!options.use_mmap_writes && !options.use_mmap_reads); } @@ -874,7 +887,10 @@ class PosixRandomRWFile : public RandomRWFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) { - if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + TEST_KILL_RANDOM(rocksdb_kill_odds); + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { return Status::OK(); } else { return IOError(filename_, errno); @@ -1332,6 +1348,20 @@ class PosixEnv : public Env { return dummy; } + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const { + EnvOptions optimized = env_options; + optimized.use_mmap_writes = false; + optimized.fallocate_with_keep_size = true; + return optimized; + } + + EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const { + EnvOptions optimized = env_options; + optimized.use_mmap_writes = false; + optimized.fallocate_with_keep_size = true; + return optimized; + } + private: bool checkedDiskForMmap_; bool forceMmapOff; // do we override Env options?