|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
|
|
|
|
#include <functional>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "rocksdb/utilities/stackable_db.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
namespace blob_db {
|
|
|
|
|
|
|
|
class TTLExtractor;
|
|
|
|
|
|
|
|
// A wrapped database which puts values of KV pairs in a separate log
|
|
|
|
// and store location to the log in the underlying DB.
|
|
|
|
// It lacks lots of important functionalities, e.g. DB restarts,
|
|
|
|
// garbage collection, iterators, etc.
|
|
|
|
//
|
|
|
|
// The factory needs to be moved to include/rocksdb/utilities to allow
|
|
|
|
// users to use blob DB.
|
|
|
|
|
|
|
|
struct BlobDBOptions {
|
|
|
|
// name of the directory under main db, where blobs will be stored.
|
|
|
|
// default is "blob_dir"
|
|
|
|
std::string blob_dir = "blob_dir";
|
|
|
|
|
|
|
|
// whether the blob_dir path is relative or absolute.
|
|
|
|
bool path_relative = true;
|
|
|
|
|
|
|
|
// is the eviction strategy fifo based
|
|
|
|
bool is_fifo = false;
|
|
|
|
|
|
|
|
// maximum size of the blob dir. Once this gets used, up
|
|
|
|
// evict the blob file which is oldest (is_fifo )
|
|
|
|
// 0 means no limits
|
|
|
|
uint64_t blob_dir_size = 0;
|
|
|
|
|
|
|
|
// a new bucket is opened, for ttl_range. So if ttl_range is 600seconds
|
|
|
|
// (10 minutes), and the first bucket starts at 1471542000
|
|
|
|
// then the blob buckets will be
|
|
|
|
// first bucket is 1471542000 - 1471542600
|
|
|
|
// second bucket is 1471542600 - 1471543200
|
|
|
|
// and so on
|
|
|
|
uint64_t ttl_range_secs = 3600;
|
|
|
|
|
Blob DB: Inline small values in base DB
Summary:
Adding the `min_blob_size` option to allow storing small values in base db (in LSM tree) together with the key. The goal is to improve performance for small values, while taking advantage of blob db's low write amplification for large values.
Also adding expiration timestamp to blob index. It will be useful to evict stale blob indexes in base db by adding a compaction filter. I'll work on the compaction filter in future patches.
See blob_index.h for the new blob index format. There are 4 cases when writing a new key:
* small value w/o TTL: put in base db as normal value (i.e. ValueType::kTypeValue)
* small value w/ TTL: put (type, expiration, value) to base db.
* large value w/o TTL: write value to blob log and put (type, file, offset, size, compression) to base db.
* large value w/TTL: write value to blob log and put (type, expiration, file, offset, size, compression) to base db.
Closes https://github.com/facebook/rocksdb/pull/3066
Differential Revision: D6142115
Pulled By: yiwu-arbug
fbshipit-source-id: 9526e76e19f0839310a3f5f2a43772a4ad182cd0
7 years ago
|
|
|
// The smallest value to store in blob log. Value larger than this threshold
|
|
|
|
// will be inlined in base DB together with the key.
|
|
|
|
uint64_t min_blob_size = 0;
|
|
|
|
|
|
|
|
// Allows OS to incrementally sync blob files to disk for every
|
|
|
|
// bytes_per_sync bytes written. Users shouldn't rely on it for
|
|
|
|
// persistency guarantee.
|
|
|
|
uint64_t bytes_per_sync = 512 * 1024;
|
|
|
|
|
|
|
|
// the target size of each blob file. File will become immutable
|
|
|
|
// after it exceeds that size
|
|
|
|
uint64_t blob_file_size = 256 * 1024 * 1024;
|
|
|
|
|
|
|
|
// Instead of setting TTL explicitly by calling PutWithTTL or PutUntil,
|
|
|
|
// applications can set a TTLExtractor which can extract TTL from key-value
|
|
|
|
// pairs.
|
|
|
|
std::shared_ptr<TTLExtractor> ttl_extractor = nullptr;
|
|
|
|
|
|
|
|
// what compression to use for Blob's
|
|
|
|
CompressionType compression = kNoCompression;
|
|
|
|
|
|
|
|
// If enabled, blob DB periodically cleanup stale data by rewriting remaining
|
|
|
|
// live data in blob files to new files. If garbage collection is not enabled,
|
|
|
|
// blob files will be cleanup based on TTL.
|
|
|
|
bool enable_garbage_collection = false;
|
|
|
|
|
|
|
|
// Time interval to trigger garbage collection, in seconds.
|
|
|
|
uint64_t garbage_collection_interval_secs = 60;
|
|
|
|
|
|
|
|
// If garbage collection is enabled, blob files with deleted size no less
|
|
|
|
// than this ratio will become candidates to be cleanup.
|
|
|
|
double garbage_collection_deletion_size_threshold = 0.75;
|
|
|
|
|
|
|
|
// Disable all background job. Used for test only.
|
|
|
|
bool disable_background_tasks = false;
|
|
|
|
|
|
|
|
void Dump(Logger* log) const;
|
|
|
|
};
|
|
|
|
|
|
|
|
class BlobDB : public StackableDB {
|
|
|
|
public:
|
|
|
|
using rocksdb::StackableDB::Put;
|
|
|
|
virtual Status Put(const WriteOptions& options, const Slice& key,
|
|
|
|
const Slice& value) override = 0;
|
|
|
|
virtual Status Put(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& value) override {
|
|
|
|
if (column_family != DefaultColumnFamily()) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Blob DB doesn't support non-default column family.");
|
|
|
|
}
|
|
|
|
return Put(options, key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
using rocksdb::StackableDB::Delete;
|
|
|
|
virtual Status Delete(const WriteOptions& options,
|
|
|
|
const Slice& key) override = 0;
|
|
|
|
virtual Status Delete(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) override {
|
|
|
|
if (column_family != DefaultColumnFamily()) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Blob DB doesn't support non-default column family.");
|
|
|
|
}
|
|
|
|
return Delete(options, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual Status PutWithTTL(const WriteOptions& options, const Slice& key,
|
|
|
|
const Slice& value, uint64_t ttl) = 0;
|
|
|
|
virtual Status PutWithTTL(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& value, uint64_t ttl) {
|
|
|
|
if (column_family != DefaultColumnFamily()) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Blob DB doesn't support non-default column family.");
|
|
|
|
}
|
|
|
|
return PutWithTTL(options, key, value, ttl);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Put with expiration. Key with expiration time equal to
|
|
|
|
// std::numeric_limits<uint64_t>::max() means the key don't expire.
|
|
|
|
virtual Status PutUntil(const WriteOptions& options, const Slice& key,
|
|
|
|
const Slice& value, uint64_t expiration) = 0;
|
|
|
|
virtual Status PutUntil(const WriteOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
const Slice& value, uint64_t expiration) {
|
|
|
|
if (column_family != DefaultColumnFamily()) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Blob DB doesn't support non-default column family.");
|
|
|
|
}
|
|
|
|
return PutUntil(options, key, value, expiration);
|
|
|
|
}
|
|
|
|
|
|
|
|
using rocksdb::StackableDB::Get;
|
|
|
|
virtual Status Get(const ReadOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
PinnableSlice* value) override = 0;
|
|
|
|
|
|
|
|
using rocksdb::StackableDB::MultiGet;
|
|
|
|
virtual std::vector<Status> MultiGet(
|
|
|
|
const ReadOptions& options,
|
|
|
|
const std::vector<Slice>& keys,
|
|
|
|
std::vector<std::string>* values) override = 0;
|
|
|
|
virtual std::vector<Status> MultiGet(
|
|
|
|
const ReadOptions& options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_families,
|
|
|
|
const std::vector<Slice>& keys,
|
|
|
|
std::vector<std::string>* values) override {
|
|
|
|
for (auto column_family : column_families) {
|
|
|
|
if (column_family != DefaultColumnFamily()) {
|
|
|
|
return std::vector<Status>(
|
|
|
|
column_families.size(),
|
|
|
|
Status::NotSupported(
|
|
|
|
"Blob DB doesn't support non-default column family."));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return MultiGet(options, keys, values);
|
|
|
|
}
|
|
|
|
|
|
|
|
using rocksdb::StackableDB::SingleDelete;
|
|
|
|
virtual Status SingleDelete(const WriteOptions& /*wopts*/,
|
|
|
|
ColumnFamilyHandle* /*column_family*/,
|
|
|
|
const Slice& /*key*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in blob db.");
|
|
|
|
}
|
|
|
|
|
|
|
|
using rocksdb::StackableDB::Merge;
|
|
|
|
virtual Status Merge(const WriteOptions& /*options*/,
|
|
|
|
ColumnFamilyHandle* /*column_family*/,
|
|
|
|
const Slice& /*key*/, const Slice& /*value*/) override {
|
|
|
|
return Status::NotSupported("Not supported operation in blob db.");
|
|
|
|
}
|
|
|
|
|
|
|
|
virtual Status Write(const WriteOptions& opts,
|
|
|
|
WriteBatch* updates) override = 0;
|
|
|
|
|
|
|
|
using rocksdb::StackableDB::NewIterator;
|
|
|
|
virtual Iterator* NewIterator(const ReadOptions& options) override = 0;
|
|
|
|
virtual Iterator* NewIterator(const ReadOptions& options,
|
|
|
|
ColumnFamilyHandle* column_family) override {
|
|
|
|
if (column_family != DefaultColumnFamily()) {
|
|
|
|
// Blob DB doesn't support non-default column family.
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
return NewIterator(options);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Opening blob db.
|
|
|
|
static Status Open(const Options& options, const BlobDBOptions& bdb_options,
|
|
|
|
const std::string& dbname, BlobDB** blob_db);
|
|
|
|
|
|
|
|
static Status Open(const DBOptions& db_options,
|
|
|
|
const BlobDBOptions& bdb_options,
|
|
|
|
const std::string& dbname,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
|
|
std::vector<ColumnFamilyHandle*>* handles,
|
|
|
|
BlobDB** blob_db);
|
|
|
|
|
|
|
|
virtual BlobDBOptions GetBlobDBOptions() const = 0;
|
|
|
|
|
|
|
|
virtual Status SyncBlobFiles() = 0;
|
|
|
|
|
|
|
|
virtual ~BlobDB() {}
|
|
|
|
|
|
|
|
protected:
|
|
|
|
explicit BlobDB();
|
|
|
|
};
|
|
|
|
|
|
|
|
// Destroy the content of the database.
|
|
|
|
Status DestroyBlobDB(const std::string& dbname, const Options& options,
|
|
|
|
const BlobDBOptions& bdb_options);
|
|
|
|
|
|
|
|
// TTLExtractor allow applications to extract TTL from key-value pairs.
|
|
|
|
// This useful for applications using Put or WriteBatch to write keys and
|
|
|
|
// don't intend to migrate to PutWithTTL or PutUntil.
|
|
|
|
//
|
|
|
|
// Applications can implement either ExtractTTL or ExtractExpiration. If both
|
|
|
|
// are implemented, ExtractExpiration will take precedence.
|
|
|
|
class TTLExtractor {
|
|
|
|
public:
|
|
|
|
// Extract TTL from key-value pair.
|
|
|
|
// Return true if the key has TTL, false otherwise. If key has TTL,
|
|
|
|
// TTL is pass back through ttl. The method can optionally modify the value,
|
|
|
|
// pass the result back through new_value, and also set value_changed to true.
|
|
|
|
virtual bool ExtractTTL(const Slice& key, const Slice& value, uint64_t* ttl,
|
|
|
|
std::string* new_value, bool* value_changed);
|
|
|
|
|
|
|
|
// Extract expiration time from key-value pair.
|
|
|
|
// Return true if the key has expiration time, false otherwise. If key has
|
|
|
|
// expiration time, it is pass back through expiration. The method can
|
|
|
|
// optionally modify the value, pass the result back through new_value,
|
|
|
|
// and also set value_changed to true.
|
|
|
|
virtual bool ExtractExpiration(const Slice& key, const Slice& value,
|
|
|
|
uint64_t now, uint64_t* expiration,
|
|
|
|
std::string* new_value, bool* value_changed);
|
|
|
|
|
|
|
|
virtual ~TTLExtractor() = default;
|
|
|
|
};
|
|
|
|
|
|
|
|
} // namespace blob_db
|
|
|
|
} // namespace rocksdb
|
|
|
|
#endif // ROCKSDB_LITE
|