|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <memory>
|
|
|
|
#include <string>
|
|
|
|
#include <unordered_map>
|
|
|
|
#include <vector>
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
#include "rocksdb/file_system.h"
|
|
|
|
#include "rocksdb/statistics.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
class Env;
|
|
|
|
class Logger;
|
|
|
|
|
|
|
|
// SstFileManager is used to track SST and blob files in the DB and control
|
|
|
|
// their deletion rate. All SstFileManager public functions are thread-safe.
|
|
|
|
// SstFileManager is NOT an extensible interface but a public interface for
|
|
|
|
// result of NewSstFileManager. Any derived classes must be RocksDB internal.
|
|
|
|
class SstFileManager {
|
|
|
|
public:
|
|
|
|
virtual ~SstFileManager() {}
|
|
|
|
|
|
|
|
// Update the maximum allowed space that should be used by RocksDB, if
|
|
|
|
// the total size of the SST and blob files exceeds max_allowed_space, writes
|
|
|
|
// to RocksDB will fail.
|
|
|
|
//
|
|
|
|
// Setting max_allowed_space to 0 will disable this feature; maximum allowed
|
|
|
|
// space will be infinite (Default value).
|
|
|
|
//
|
|
|
|
// thread-safe.
|
|
|
|
virtual void SetMaxAllowedSpaceUsage(uint64_t max_allowed_space) = 0;
|
|
|
|
|
|
|
|
// Set the amount of buffer room each compaction should be able to leave.
|
|
|
|
// In other words, at its maximum disk space consumption, the compaction
|
|
|
|
// should still leave compaction_buffer_size available on the disk so that
|
|
|
|
// other background functions may continue, such as logging and flushing.
|
|
|
|
virtual void SetCompactionBufferSize(uint64_t compaction_buffer_size) = 0;
|
|
|
|
|
|
|
|
// Return true if the total size of SST and blob files exceeded the maximum
|
|
|
|
// allowed space usage.
|
|
|
|
//
|
|
|
|
// thread-safe.
|
|
|
|
virtual bool IsMaxAllowedSpaceReached() = 0;
|
|
|
|
|
|
|
|
// Returns true if the total size of SST and blob files as well as estimated
|
|
|
|
// size of ongoing compactions exceeds the maximums allowed space usage.
|
|
|
|
virtual bool IsMaxAllowedSpaceReachedIncludingCompactions() = 0;
|
|
|
|
|
|
|
|
// Return the total size of all tracked files.
|
|
|
|
// thread-safe
|
|
|
|
virtual uint64_t GetTotalSize() = 0;
|
|
|
|
|
|
|
|
// Return a map containing all tracked files and their corresponding sizes.
|
|
|
|
// thread-safe
|
|
|
|
virtual std::unordered_map<std::string, uint64_t> GetTrackedFiles() = 0;
|
|
|
|
|
|
|
|
// Return delete rate limit in bytes per second.
|
|
|
|
// thread-safe
|
|
|
|
virtual int64_t GetDeleteRateBytesPerSecond() = 0;
|
|
|
|
|
|
|
|
// Update the delete rate limit in bytes per second.
|
|
|
|
// zero means disable delete rate limiting and delete files immediately
|
|
|
|
// thread-safe
|
|
|
|
virtual void SetDeleteRateBytesPerSecond(int64_t delete_rate) = 0;
|
|
|
|
|
|
|
|
// Return trash/DB size ratio where new files will be deleted immediately
|
|
|
|
// thread-safe
|
|
|
|
virtual double GetMaxTrashDBRatio() = 0;
|
|
|
|
|
|
|
|
// Update trash/DB size ratio where new files will be deleted immediately
|
|
|
|
// thread-safe
|
|
|
|
virtual void SetMaxTrashDBRatio(double ratio) = 0;
|
|
|
|
|
|
|
|
// Return the total size of trash files
|
|
|
|
// thread-safe
|
|
|
|
virtual uint64_t GetTotalTrashSize() = 0;
|
|
|
|
|
|
|
|
// Set the statistics ptr to dump the stat information
|
|
|
|
virtual void SetStatisticsPtr(const std::shared_ptr<Statistics>& stats) = 0;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Create a new SstFileManager that can be shared among multiple RocksDB
|
|
|
|
// instances to track SST and blob files and control their deletion rate.
|
|
|
|
// Even though SstFileManager doesn't track WAL files, it still controls
|
|
|
|
// their deletion rate.
|
|
|
|
//
|
|
|
|
// @param env: Pointer to Env object, please see "rocksdb/env.h".
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
// @param fs: Pointer to FileSystem object, please see "rocksdb/file_system.h".
|
|
|
|
// @param info_log: If not nullptr, info_log will be used to log errors.
|
|
|
|
//
|
|
|
|
// == Deletion rate limiting specific arguments ==
|
|
|
|
// @param trash_dir: Deprecated, this argument has no effect.
|
|
|
|
// @param rate_bytes_per_sec: How many bytes should be deleted per second. If
|
|
|
|
// this value is set to 1024 (1 Kb / sec) and we deleted a file of size 4 Kb
|
|
|
|
// in 1 second, we will wait for another 3 seconds before we delete other
|
|
|
|
// files. Set to 0 to disable deletion rate limiting.
|
|
|
|
// This option also affects the delete rate of WAL files in the DB.
|
|
|
|
// @param delete_existing_trash: Deprecated, this argument has no effect, but
|
|
|
|
// if the user provides trash_dir we will schedule deletes for files in the dir.
|
|
|
|
// @param status: If not nullptr, status will contain any errors that happened
|
|
|
|
// during creating the missing trash_dir or deleting existing files in trash.
|
|
|
|
// @param max_trash_db_ratio: If the trash size constitutes more than this
|
|
|
|
// fraction of the total DB size we will start deleting new files passed to
|
|
|
|
// DeleteScheduler immediately
|
|
|
|
// @param bytes_max_delete_chunk: if a file to delete is larger than delete
|
|
|
|
// chunk, ftruncate the file by this size each time, rather than dropping the
|
|
|
|
// whole file. 0 means to always delete the whole file. If the file has more
|
|
|
|
// than one linked name, the file will be deleted as a whole. Either way,
|
|
|
|
// `rate_bytes_per_sec` will be respected. NOTE that with this option,
|
|
|
|
// files already renamed as a trash may be partial, so users should not
|
|
|
|
// directly recover them without checking.
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
// Overload that takes the Env and the FileSystem as separate objects. Every
// parameter after `fs` is optional; the parameters are described in the
// comment block preceding this declaration.
extern SstFileManager* NewSstFileManager(
    Env* env,
    std::shared_ptr<FileSystem> fs,
    std::shared_ptr<Logger> info_log = nullptr,
    const std::string& trash_dir = "",
    int64_t rate_bytes_per_sec = 0,
    bool delete_existing_trash = true,
    Status* status = nullptr,
    double max_trash_db_ratio = 0.25,
    uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
|
|
|
|
|
|
|
|
// Same as above, but takes a pointer to a legacy Env object, instead of
|
|
|
|
// Env and FileSystem objects
|
|
|
|
// Overload without a FileSystem argument: only an Env is supplied. The
// remaining parameters keep the same names, order and default values as the
// Env + FileSystem overload declared earlier in this header.
// NOTE: trash_dir is taken by value here (not by const reference).
extern SstFileManager* NewSstFileManager(
    Env* env,
    std::shared_ptr<Logger> info_log = nullptr,
    std::string trash_dir = "",
    int64_t rate_bytes_per_sec = 0,
    bool delete_existing_trash = true,
    Status* status = nullptr,
    double max_trash_db_ratio = 0.25,
    uint64_t bytes_max_delete_chunk = 64 * 1024 * 1024);
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|