|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors
|
|
|
|
|
|
|
|
#include "port/lang.h"
|
|
|
|
#if !defined(OS_WIN)
|
|
|
|
|
|
|
|
#include <dirent.h>
|
|
|
|
#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
|
|
|
|
#include <dlfcn.h>
|
|
|
|
#endif
|
|
|
|
#include <errno.h>
|
|
|
|
#include <fcntl.h>
|
|
|
|
|
|
|
|
#if defined(ROCKSDB_IOURING_PRESENT)
|
|
|
|
#include <liburing.h>
|
|
|
|
#endif
|
|
|
|
#include <pthread.h>
|
|
|
|
#include <signal.h>
|
|
|
|
#include <stdio.h>
|
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
|
|
|
#include <sys/mman.h>
|
|
|
|
#include <sys/stat.h>
|
|
|
|
#if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
|
|
|
|
#include <sys/statfs.h>
|
|
|
|
#endif
|
Auto recovery from out of space errors (#4164)
Summary:
This commit implements automatic recovery from a Status::NoSpace() error
during background operations such as write callback, flush and
compaction. The broad design is as follows -
1. Compaction errors are treated as soft errors and don't put the
database in read-only mode. A compaction is delayed until enough free
disk space is available to accomodate the compaction outputs, which is
estimated based on the input size. This means that users can continue to
write, and we rely on the WriteController to delay or stop writes if the
compaction debt becomes too high due to persistent low disk space
condition
2. Errors during write callback and flush are treated as hard errors,
i.e the database is put in read-only mode and goes back to read-write
only fater certain recovery actions are taken.
3. Both types of recovery rely on the SstFileManagerImpl to poll for
sufficient disk space. We assume that there is a 1-1 mapping between an
SFM and the underlying OS storage container. For cases where multiple
DBs are hosted on a single storage container, the user is expected to
allocate a single SFM instance and use the same one for all the DBs. If
no SFM is specified by the user, DBImpl::Open() will allocate one, but
this will be one per DB and each DB will recover independently. The
recovery implemented by SFM is as follows -
a) On the first occurance of an out of space error during compaction,
subsequent
compactions will be delayed until the disk free space check indicates
enough available space. The required space is computed as the sum of
input sizes.
b) The free space check requirement will be removed once the amount of
free space is greater than the size reserved by in progress
compactions when the first error occured
c) If the out of space error is a hard error, a background thread in
SFM will poll for sufficient headroom before triggering the recovery
of the database and putting it in write-only mode. The headroom is
calculated as the sum of the write_buffer_size of all the DB instances
associated with the SFM
4. EventListener callbacks will be called at the start and completion of
automatic recovery. Users can disable the auto recov ery in the start
callback, and later initiate it manually by calling DB::Resume()
Todo:
1. More extensive testing
2. Add disk full condition to db_stress (follow-on PR)
Pull Request resolved: https://github.com/facebook/rocksdb/pull/4164
Differential Revision: D9846378
Pulled By: anand1976
fbshipit-source-id: 80ea875dbd7f00205e19c82215ff6e37da10da4a
6 years ago
|
|
|
#include <sys/statvfs.h>
|
|
|
|
#include <sys/time.h>
|
|
|
|
#include <sys/types.h>
|
|
|
|
#if defined(ROCKSDB_IOURING_PRESENT)
|
|
|
|
#include <sys/uio.h>
|
|
|
|
#endif
|
|
|
|
#include <time.h>
|
|
|
|
#include <unistd.h>
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
// Get nano time includes
|
|
|
|
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD)
|
|
|
|
#elif defined(__MACH__)
|
|
|
|
#include <Availability.h>
|
|
|
|
#include <mach/clock.h>
|
|
|
|
#include <mach/mach.h>
|
|
|
|
#else
|
|
|
|
#include <chrono>
|
|
|
|
#endif
|
|
|
|
#include <deque>
|
|
|
|
#include <set>
|
|
|
|
#include <vector>
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
#include "env/composite_env_wrapper.h"
|
|
|
|
#include "env/io_posix.h"
|
|
|
|
#include "logging/posix_logger.h"
|
|
|
|
#include "monitoring/iostats_context_imp.h"
|
|
|
|
#include "monitoring/thread_status_updater.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/options.h"
|
|
|
|
#include "rocksdb/slice.h"
|
|
|
|
#include "rocksdb/system_clock.h"
|
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/compression_context_cache.h"
|
|
|
|
#include "util/random.h"
|
|
|
|
#include "util/string_util.h"
|
Ensure the destruction order of PosixEnv and ThreadLocalPtr
Summary:
By default, RocksDB initializes the singletons of ThreadLocalPtr first, then initializes PosixEnv
via static initializer. Destructor terminates objects in reverse order, so terminating PosixEnv
(calling pthread_mutex_lock), then ThreadLocal (calling pthread_mutex_destroy).
However, in certain case, application might initialize PosixEnv first, then ThreadLocalPtr.
This will cause core dump at the end of the program (eg. https://github.com/facebook/mysql-5.6/issues/122)
This patch fix this issue by ensuring the destruction order by moving the global static singletons
to function static singletons. Since function static singletons are initialized when the function is first
called, this property allows us invoke to enforce the construction of the static PosixEnv and the
singletons of ThreadLocalPtr by calling the function where the ThreadLocalPtr singletons belongs
right before we initialize the static PosixEnv.
Test Plan: Verified in the MyRocks.
Reviewers: yoshinorim, IslamAbdelRahman, rven, kradhakrishnan, anthony, sdong, MarkCallaghan
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D51789
9 years ago
|
|
|
#include "util/thread_local.h"
|
|
|
|
#include "util/threadpool_imp.h"
|
|
|
|
|
|
|
|
#if !defined(TMPFS_MAGIC)
|
|
|
|
#define TMPFS_MAGIC 0x01021994
|
|
|
|
#endif
|
|
|
|
#if !defined(XFS_SUPER_MAGIC)
|
|
|
|
#define XFS_SUPER_MAGIC 0x58465342
|
|
|
|
#endif
|
|
|
|
#if !defined(EXT4_SUPER_MAGIC)
|
|
|
|
#define EXT4_SUPER_MAGIC 0xEF53
|
|
|
|
#endif
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
#if defined(OS_WIN)
|
|
|
|
static const std::string kSharedLibExt = ".dll";
|
|
|
|
static const char kPathSeparator = ';';
|
|
|
|
#else
|
|
|
|
static const char kPathSeparator = ':';
|
|
|
|
#if defined(OS_MACOSX)
|
|
|
|
static const std::string kSharedLibExt = ".dylib";
|
|
|
|
#else
|
|
|
|
static const std::string kSharedLibExt = ".so";
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
ThreadStatusUpdater* CreateThreadStatusUpdater() {
|
|
|
|
return new ThreadStatusUpdater();
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
|
|
|
|
class PosixDynamicLibrary : public DynamicLibrary {
|
|
|
|
public:
|
|
|
|
PosixDynamicLibrary(const std::string& name, void* handle)
|
|
|
|
: name_(name), handle_(handle) {}
|
|
|
|
~PosixDynamicLibrary() override { dlclose(handle_); }
|
|
|
|
|
|
|
|
Status LoadSymbol(const std::string& sym_name, void** func) override {
|
|
|
|
assert(nullptr != func);
|
|
|
|
dlerror(); // Clear any old error
|
|
|
|
*func = dlsym(handle_, sym_name.c_str());
|
|
|
|
if (*func != nullptr) {
|
|
|
|
return Status::OK();
|
|
|
|
} else {
|
|
|
|
char* err = dlerror();
|
|
|
|
return Status::NotFound("Error finding symbol: " + sym_name, err);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const char* Name() const override { return name_.c_str(); }
|
|
|
|
|
|
|
|
private:
|
|
|
|
std::string name_;
|
|
|
|
void* handle_;
|
|
|
|
};
|
|
|
|
#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION
|
|
|
|
|
|
|
|
class PosixClock : public SystemClock {
|
|
|
|
public:
|
|
|
|
static const char* kClassName() { return "PosixClock"; }
|
|
|
|
const char* Name() const override { return kDefaultName(); }
|
|
|
|
const char* NickName() const override { return kClassName(); }
|
|
|
|
|
|
|
|
uint64_t NowMicros() override {
|
|
|
|
struct timeval tv;
|
|
|
|
gettimeofday(&tv, nullptr);
|
|
|
|
return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t NowNanos() override {
|
|
|
|
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \
|
|
|
|
defined(OS_AIX)
|
|
|
|
struct timespec ts;
|
|
|
|
clock_gettime(CLOCK_MONOTONIC, &ts);
|
|
|
|
return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
|
|
|
|
#elif defined(OS_SOLARIS)
|
|
|
|
return gethrtime();
|
|
|
|
#elif defined(__MACH__)
|
|
|
|
clock_serv_t cclock;
|
|
|
|
mach_timespec_t ts;
|
|
|
|
host_get_clock_service(mach_host_self(), CALENDAR_CLOCK, &cclock);
|
|
|
|
clock_get_time(cclock, &ts);
|
|
|
|
mach_port_deallocate(mach_task_self(), cclock);
|
|
|
|
return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
|
|
|
|
#else
|
|
|
|
return std::chrono::duration_cast<std::chrono::nanoseconds>(
|
|
|
|
std::chrono::steady_clock::now().time_since_epoch())
|
|
|
|
.count();
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t CPUMicros() override {
|
|
|
|
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \
|
|
|
|
defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12))
|
|
|
|
struct timespec ts;
|
|
|
|
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
|
|
|
|
return (static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec) / 1000;
|
|
|
|
#endif
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t CPUNanos() override {
|
|
|
|
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_GNU_KFREEBSD) || \
|
|
|
|
defined(OS_AIX) || (defined(__MACH__) && defined(__MAC_10_12))
|
|
|
|
struct timespec ts;
|
|
|
|
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
|
|
|
|
return static_cast<uint64_t>(ts.tv_sec) * 1000000000 + ts.tv_nsec;
|
|
|
|
#endif
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
void SleepForMicroseconds(int micros) override { usleep(micros); }
|
|
|
|
|
|
|
|
Status GetCurrentTime(int64_t* unix_time) override {
|
|
|
|
time_t ret = time(nullptr);
|
|
|
|
if (ret == (time_t)-1) {
|
|
|
|
return IOError("GetCurrentTime", "", errno);
|
|
|
|
}
|
|
|
|
*unix_time = (int64_t)ret;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string TimeToString(uint64_t secondsSince1970) override {
|
|
|
|
const time_t seconds = (time_t)secondsSince1970;
|
|
|
|
struct tm t;
|
|
|
|
int maxsize = 64;
|
|
|
|
std::string dummy;
|
|
|
|
dummy.reserve(maxsize);
|
|
|
|
dummy.resize(maxsize);
|
|
|
|
char* p = &dummy[0];
|
|
|
|
localtime_r(&seconds, &t);
|
|
|
|
snprintf(p, maxsize, "%04d/%02d/%02d-%02d:%02d:%02d ", t.tm_year + 1900,
|
|
|
|
t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec);
|
|
|
|
return dummy;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
class PosixEnv : public CompositeEnv {
|
|
|
|
public:
|
|
|
|
static const char* kClassName() { return "PosixEnv"; }
|
|
|
|
const char* Name() const override { return kClassName(); }
|
|
|
|
const char* NickName() const override { return kDefaultName(); }
|
|
|
|
|
|
|
|
~PosixEnv() override {
|
|
|
|
if (this == Env::Default()) {
|
|
|
|
for (const auto tid : threads_to_join_) {
|
|
|
|
pthread_join(tid, nullptr);
|
|
|
|
}
|
|
|
|
for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
|
|
|
|
thread_pools_[pool_id].JoinAllThreads();
|
|
|
|
}
|
|
|
|
// Do not delete the thread_status_updater_ in order to avoid the
|
|
|
|
// free after use when Env::Default() is destructed while some other
|
|
|
|
// child threads are still trying to update thread status. All
|
|
|
|
// PosixEnv instances use the same thread_status_updater_, so never
|
|
|
|
// explicitly delete it.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
|
|
|
|
if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
|
|
|
|
fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
|
|
|
|
// Loads the named library into the result.
|
|
|
|
// If the input name is empty, the current executable is loaded
|
|
|
|
// On *nix systems, a "lib" prefix is added to the name if one is not supplied
|
|
|
|
// Comparably, the appropriate shared library extension is added to the name
|
|
|
|
// if not supplied. If search_path is not specified, the shared library will
|
|
|
|
// be loaded using the default path (LD_LIBRARY_PATH) If search_path is
|
|
|
|
// specified, the shared library will be searched for in the directories
|
|
|
|
// provided by the search path
|
|
|
|
Status LoadLibrary(const std::string& name, const std::string& path,
|
|
|
|
std::shared_ptr<DynamicLibrary>* result) override {
|
|
|
|
assert(result != nullptr);
|
|
|
|
if (name.empty()) {
|
|
|
|
void* hndl = dlopen(NULL, RTLD_NOW);
|
|
|
|
if (hndl != nullptr) {
|
|
|
|
result->reset(new PosixDynamicLibrary(name, hndl));
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
std::string library_name = name;
|
|
|
|
if (library_name.find(kSharedLibExt) == std::string::npos) {
|
|
|
|
library_name = library_name + kSharedLibExt;
|
|
|
|
}
|
|
|
|
#if !defined(OS_WIN)
|
|
|
|
if (library_name.find('/') == std::string::npos &&
|
|
|
|
library_name.compare(0, 3, "lib") != 0) {
|
|
|
|
library_name = "lib" + library_name;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
if (path.empty()) {
|
|
|
|
void* hndl = dlopen(library_name.c_str(), RTLD_NOW);
|
|
|
|
if (hndl != nullptr) {
|
|
|
|
result->reset(new PosixDynamicLibrary(library_name, hndl));
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
std::string local_path;
|
|
|
|
std::stringstream ss(path);
|
|
|
|
while (getline(ss, local_path, kPathSeparator)) {
|
|
|
|
if (!path.empty()) {
|
|
|
|
std::string full_name = local_path + "/" + library_name;
|
|
|
|
void* hndl = dlopen(full_name.c_str(), RTLD_NOW);
|
|
|
|
if (hndl != nullptr) {
|
|
|
|
result->reset(new PosixDynamicLibrary(full_name, hndl));
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::IOError(
|
|
|
|
IOErrorMsg("Failed to open shared library: xs", name), dlerror());
|
|
|
|
}
|
|
|
|
#endif // !ROCKSDB_NO_DYNAMIC_EXTENSION
|
|
|
|
|
|
|
|
void Schedule(void (*function)(void* arg1), void* arg, Priority pri = LOW,
|
|
|
|
void* tag = nullptr,
|
|
|
|
void (*unschedFunction)(void* arg) = nullptr) override;
|
|
|
|
|
|
|
|
int UnSchedule(void* arg, Priority pri) override;
|
|
|
|
|
|
|
|
void StartThread(void (*function)(void* arg), void* arg) override;
|
|
|
|
|
|
|
|
void WaitForJoin() override;
|
|
|
|
|
|
|
|
unsigned int GetThreadPoolQueueLen(Priority pri = LOW) const override;
|
|
|
|
|
|
|
|
Status GetThreadList(std::vector<ThreadStatus>* thread_list) override {
|
|
|
|
assert(thread_status_updater_);
|
|
|
|
return thread_status_updater_->GetThreadList(thread_list);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t GetThreadID() const override {
|
|
|
|
uint64_t thread_id = 0;
|
|
|
|
#if defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
|
|
|
|
#if __GLIBC_PREREQ(2, 30)
|
|
|
|
thread_id = ::gettid();
|
|
|
|
#else // __GLIBC_PREREQ(2, 30)
|
|
|
|
pthread_t tid = pthread_self();
|
|
|
|
memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
|
|
|
|
#endif // __GLIBC_PREREQ(2, 30)
|
|
|
|
#else // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
|
|
|
|
pthread_t tid = pthread_self();
|
|
|
|
memcpy(&thread_id, &tid, std::min(sizeof(thread_id), sizeof(tid)));
|
|
|
|
#endif // defined(_GNU_SOURCE) && defined(__GLIBC_PREREQ)
|
|
|
|
return thread_id;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status GetHostName(char* name, uint64_t len) override {
|
|
|
|
int ret = gethostname(name, static_cast<size_t>(len));
|
|
|
|
if (ret < 0) {
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
if (errno == EFAULT || errno == EINVAL) {
|
|
|
|
return Status::InvalidArgument(errnoStr(errno).c_str());
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
} else {
|
|
|
|
return IOError("GetHostName", name, errno);
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
ThreadStatusUpdater* GetThreadStatusUpdater() const override {
|
|
|
|
return Env::GetThreadStatusUpdater();
|
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
std::string GenerateUniqueId() override { return Env::GenerateUniqueId(); }
|
|
|
|
|
|
|
|
// Allow increasing the number of worker threads.
|
|
|
|
void SetBackgroundThreads(int num, Priority pri) override {
|
|
|
|
assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
|
|
|
|
thread_pools_[pri].SetBackgroundThreads(num);
|
|
|
|
}
|
|
|
|
|
|
|
|
int GetBackgroundThreads(Priority pri) override {
|
|
|
|
assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
|
|
|
|
return thread_pools_[pri].GetBackgroundThreads();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status SetAllowNonOwnerAccess(bool allow_non_owner_access) override {
|
|
|
|
allow_non_owner_access_ = allow_non_owner_access;
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Allow increasing the number of worker threads.
|
|
|
|
void IncBackgroundThreadsIfNeeded(int num, Priority pri) override {
|
|
|
|
assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
|
|
|
|
thread_pools_[pri].IncBackgroundThreadsIfNeeded(num);
|
|
|
|
}
|
|
|
|
|
|
|
|
void LowerThreadPoolIOPriority(Priority pool) override {
|
|
|
|
assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
|
|
|
|
#ifdef OS_LINUX
|
|
|
|
thread_pools_[pool].LowerIOPriority();
|
|
|
|
#else
|
|
|
|
(void)pool;
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
void LowerThreadPoolCPUPriority(Priority pool) override {
|
|
|
|
assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
|
|
|
|
thread_pools_[pool].LowerCPUPriority(CpuPriority::kLow);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status LowerThreadPoolCPUPriority(Priority pool, CpuPriority pri) override {
|
|
|
|
assert(pool >= Priority::BOTTOM && pool <= Priority::HIGH);
|
|
|
|
thread_pools_[pool].LowerCPUPriority(pri);
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
friend Env* Env::Default();
|
|
|
|
// Constructs the default Env, a singleton
|
|
|
|
PosixEnv();
|
|
|
|
|
|
|
|
// The below 4 members are only used by the default PosixEnv instance.
|
|
|
|
// Non-default instances simply maintain references to the backing
|
|
|
|
// members in te default instance
|
|
|
|
std::vector<ThreadPoolImpl> thread_pools_storage_;
|
|
|
|
pthread_mutex_t mu_storage_;
|
|
|
|
std::vector<pthread_t> threads_to_join_storage_;
|
|
|
|
bool allow_non_owner_access_storage_;
|
|
|
|
|
|
|
|
std::vector<ThreadPoolImpl>& thread_pools_;
|
|
|
|
pthread_mutex_t& mu_;
|
|
|
|
std::vector<pthread_t>& threads_to_join_;
|
|
|
|
// If true, allow non owner read access for db files. Otherwise, non-owner
|
|
|
|
// has no access to db files.
|
|
|
|
bool& allow_non_owner_access_;
|
|
|
|
};
|
|
|
|
|
|
|
|
PosixEnv::PosixEnv()
|
|
|
|
: CompositeEnv(FileSystem::Default(), SystemClock::Default()),
|
|
|
|
thread_pools_storage_(Priority::TOTAL),
|
|
|
|
allow_non_owner_access_storage_(true),
|
|
|
|
thread_pools_(thread_pools_storage_),
|
|
|
|
mu_(mu_storage_),
|
|
|
|
threads_to_join_(threads_to_join_storage_),
|
|
|
|
allow_non_owner_access_(allow_non_owner_access_storage_) {
|
|
|
|
ThreadPoolImpl::PthreadCall("mutex_init", pthread_mutex_init(&mu_, nullptr));
|
|
|
|
for (int pool_id = 0; pool_id < Env::Priority::TOTAL; ++pool_id) {
|
|
|
|
thread_pools_[pool_id].SetThreadPriority(
|
|
|
|
static_cast<Env::Priority>(pool_id));
|
|
|
|
// This allows later initializing the thread-local-env of each thread.
|
|
|
|
thread_pools_[pool_id].SetHostEnv(this);
|
|
|
|
}
|
|
|
|
thread_status_updater_ = CreateThreadStatusUpdater();
|
|
|
|
}
|
|
|
|
|
|
|
|
void PosixEnv::Schedule(void (*function)(void* arg1), void* arg, Priority pri,
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
void* tag, void (*unschedFunction)(void* arg)) {
|
|
|
|
assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
|
Running manual compactions in parallel with other automatic or manual compactions in restricted cases
Summary:
This diff provides a framework for doing manual
compactions in parallel with other compactions. We now have a deque of manual compactions. We also pass manual compactions as an argument from RunManualCompactions down to
BackgroundCompactions, so that RunManualCompactions can be reentrant.
Parallelism is controlled by the two routines
ConflictingManualCompaction to allow/disallow new parallel/manual
compactions based on already existing ManualCompactions. In this diff, by default manual compactions still have to run exclusive of other compactions. However, by setting the compaction option, exclusive_manual_compaction to false, it is possible to run other compactions in parallel with a manual compaction. However, we are still restricted to one manual compaction per column family at a time. All of these restrictions will be relaxed in future diffs.
I will be adding more tests later.
Test Plan: Rocksdb regression + new tests + valgrind
Reviewers: igor, anthony, IslamAbdelRahman, kradhakrishnan, yhchiang, sdong
Reviewed By: sdong
Subscribers: yoshinorim, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D47973
9 years ago
|
|
|
thread_pools_[pri].Schedule(function, arg, tag, unschedFunction);
|
|
|
|
}
|
|
|
|
|
|
|
|
int PosixEnv::UnSchedule(void* arg, Priority pri) {
|
|
|
|
return thread_pools_[pri].UnSchedule(arg);
|
|
|
|
}
|
|
|
|
|
|
|
|
unsigned int PosixEnv::GetThreadPoolQueueLen(Priority pri) const {
|
|
|
|
assert(pri >= Priority::BOTTOM && pri <= Priority::HIGH);
|
|
|
|
return thread_pools_[pri].GetQueueLen();
|
|
|
|
}
|
|
|
|
|
|
|
|
struct StartThreadState {
|
|
|
|
void (*user_function)(void*);
|
|
|
|
void* arg;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void* StartThreadWrapper(void* arg) {
|
|
|
|
StartThreadState* state = reinterpret_cast<StartThreadState*>(arg);
|
|
|
|
state->user_function(state->arg);
|
|
|
|
delete state;
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
void PosixEnv::StartThread(void (*function)(void* arg), void* arg) {
|
|
|
|
pthread_t t;
|
|
|
|
StartThreadState* state = new StartThreadState;
|
|
|
|
state->user_function = function;
|
|
|
|
state->arg = arg;
|
|
|
|
ThreadPoolImpl::PthreadCall(
|
|
|
|
"start thread", pthread_create(&t, nullptr, &StartThreadWrapper, state));
|
|
|
|
ThreadPoolImpl::PthreadCall("lock", pthread_mutex_lock(&mu_));
|
|
|
|
threads_to_join_.push_back(t);
|
|
|
|
ThreadPoolImpl::PthreadCall("unlock", pthread_mutex_unlock(&mu_));
|
|
|
|
}
|
|
|
|
|
|
|
|
void PosixEnv::WaitForJoin() {
|
|
|
|
for (const auto tid : threads_to_join_) {
|
|
|
|
pthread_join(tid, nullptr);
|
|
|
|
}
|
|
|
|
threads_to_join_.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
//
|
|
|
|
// Default Posix Env
|
|
|
|
//
|
|
|
|
Env* Env::Default() {
|
Ensure the destruction order of PosixEnv and ThreadLocalPtr
Summary:
By default, RocksDB initializes the singletons of ThreadLocalPtr first, then initializes PosixEnv
via static initializer. Destructor terminates objects in reverse order, so terminating PosixEnv
(calling pthread_mutex_lock), then ThreadLocal (calling pthread_mutex_destroy).
However, in certain case, application might initialize PosixEnv first, then ThreadLocalPtr.
This will cause core dump at the end of the program (eg. https://github.com/facebook/mysql-5.6/issues/122)
This patch fix this issue by ensuring the destruction order by moving the global static singletons
to function static singletons. Since function static singletons are initialized when the function is first
called, this property allows us invoke to enforce the construction of the static PosixEnv and the
singletons of ThreadLocalPtr by calling the function where the ThreadLocalPtr singletons belongs
right before we initialize the static PosixEnv.
Test Plan: Verified in the MyRocks.
Reviewers: yoshinorim, IslamAbdelRahman, rven, kradhakrishnan, anthony, sdong, MarkCallaghan
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D51789
9 years ago
|
|
|
// The following function call initializes the singletons of ThreadLocalPtr
|
|
|
|
// right before the static default_env. This guarantees default_env will
|
|
|
|
// always being destructed before the ThreadLocalPtr singletons get
|
|
|
|
// destructed as C++ guarantees that the destructions of static variables
|
|
|
|
// is in the reverse order of their constructions.
|
|
|
|
//
|
|
|
|
// Since static members are destructed in the reverse order
|
|
|
|
// of their construction, having this call here guarantees that
|
|
|
|
// the destructor of static PosixEnv will go first, then the
|
|
|
|
// the singletons of ThreadLocalPtr.
|
|
|
|
ThreadLocalPtr::InitSingletons();
|
|
|
|
CompressionContextCache::InitSingleton();
|
|
|
|
INIT_SYNC_POINT_SINGLETONS();
|
|
|
|
// ~PosixEnv must be called on exit
|
|
|
|
//**TODO: Can we make this a STATIC_AVOID_DESTRUCTION?
|
|
|
|
static PosixEnv default_env;
|
|
|
|
return &default_env;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// Default Posix SystemClock
|
|
|
|
//
|
|
|
|
const std::shared_ptr<SystemClock>& SystemClock::Default() {
|
|
|
|
STATIC_AVOID_DESTRUCTION(std::shared_ptr<SystemClock>, instance)
|
|
|
|
(std::make_shared<PosixClock>());
|
|
|
|
return instance;
|
|
|
|
}
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|
|
|
|
|
|
|
|
#endif
|