// rocksdb/utilities/blob_db/blob_db.h

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
#pragma once
#include <functional>
#include <limits>
#include <string>
#include <vector>
#include "rocksdb/db.h"
#include "rocksdb/status.h"
#include "rocksdb/utilities/stackable_db.h"
namespace ROCKSDB_NAMESPACE {
namespace blob_db {
// A wrapped database which puts the values of KV pairs in a separate log
// and stores the location within that log in the underlying DB.
//
// The factory needs to be moved to include/rocksdb/utilities to allow
// users to use blob DB.
constexpr uint64_t kNoExpiration =
    (std::numeric_limits<uint64_t>::max)();  // sentinel: entry never expires
// Tunables for a blob DB instance. Every field carries an in-class default.
struct BlobDBOptions {
  // Directory, located under the base DB, that holds the blob files. Pointing
  // it at a directory in which the base DB keeps its SST files is not
  // supported.
  // Default: "blob_dir"
  std::string blob_dir{"blob_dir"};

  // Tells whether the blob_dir path is relative or absolute.
  bool path_relative{true};

  // Once max_db_size is reached, evict blob files to reclaim space rather
  // than returning a NoSpace error on write. Eviction goes from the oldest
  // blob file to the newest, ordered by file creation time.
  bool is_fifo{false};

  // Upper bound on the size of the database, counting both SST files and
  // blob files.
  //
  // Default: 0 (no limits)
  uint64_t max_db_size{0};

  // Length of a TTL bucket, in seconds. A new bucket is opened for each
  // ttl_range. For example, with a ttl_range of 600 seconds (10 minutes) and
  // a first bucket starting at 1471542000, the blob buckets are:
  //   first bucket:  1471542000 - 1471542600
  //   second bucket: 1471542600 - 1471543200
  //   and so on.
  uint64_t ttl_range_secs{3600};

  // Smallest value size that is stored in the blob log. Values below this
  // threshold are inlined in the base DB next to their key.
  uint64_t min_blob_size{0};

  // Lets the OS sync blob files to disk incrementally, once per
  // bytes_per_sync bytes written. Users should not rely on it as a
  // persistency guarantee.
  uint64_t bytes_per_sync{512 * 1024};

  // Target size of a single blob file. A file becomes immutable after it
  // grows past this size.
  uint64_t blob_file_size{256 * 1024 * 1024};

  // Compression applied to blobs.
  CompressionType compression{kNoCompression};

  // When set, BlobDB cleans up stale blobs in non-TTL files during compaction
  // by rewriting the blobs that are still live into new files.
  bool enable_garbage_collection{false};

  // Garbage collection cutoff, expressed in terms of blob file age. Blobs in
  // the oldest N non-TTL blob files are rewritten when they are encountered
  // during compaction, where
  //   N = garbage_collection_cutoff * number_of_non_TTL_files.
  double garbage_collection_cutoff{0.25};

  // Turns off every background job. Intended for tests only.
  bool disable_background_tasks{false};

  // Declared here and defined elsewhere.
  // NOTE(review): presumably writes these option values to `log` - confirm
  // against the definition.
  void Dump(Logger* log) const;
};
class BlobDB : public StackableDB {
public:
using ROCKSDB_NAMESPACE::StackableDB::Put;
virtual Status Put(const WriteOptions& options, const Slice& key,
const Slice& value) override = 0;
virtual Status Put(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value) override {
if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
return Status::NotSupported(
"Blob DB doesn't support non-default column family.");
}
return Put(options, key, value);
}
using ROCKSDB_NAMESPACE::StackableDB::Delete;
virtual Status Delete(const WriteOptions& options,
ColumnFamilyHandle* column_family,
const Slice& key) override {
if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
return Status::NotSupported(
"Blob DB doesn't support non-default column family.");
}
assert(db_ != nullptr);
return db_->Delete(options, column_family, key);
}
virtual Status PutWithTTL(const WriteOptions& options, const Slice& key,
const Slice& value, uint64_t ttl) = 0;
virtual Status PutWithTTL(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value, uint64_t ttl) {
if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
return Status::NotSupported(
"Blob DB doesn't support non-default column family.");
}
return PutWithTTL(options, key, value, ttl);
}
// Put with expiration. Key with expiration time equal to
// std::numeric_limits<uint64_t>::max() means the key don't expire.
virtual Status PutUntil(const WriteOptions& options, const Slice& key,
const Slice& value, uint64_t expiration) = 0;
virtual Status PutUntil(const WriteOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
const Slice& value, uint64_t expiration) {
if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
return Status::NotSupported(
"Blob DB doesn't support non-default column family.");
}
return PutUntil(options, key, value, expiration);
}
using ROCKSDB_NAMESPACE::StackableDB::Get;
virtual Status Get(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value) override = 0;
// Get value and expiration.
virtual Status Get(const ReadOptions& options,
ColumnFamilyHandle* column_family, const Slice& key,
PinnableSlice* value, uint64_t* expiration) = 0;
virtual Status Get(const ReadOptions& options, const Slice& key,
PinnableSlice* value, uint64_t* expiration) {
return Get(options, DefaultColumnFamily(), key, value, expiration);
}
using ROCKSDB_NAMESPACE::StackableDB::MultiGet;
virtual std::vector<Status> MultiGet(
const ReadOptions& options, const std::vector<Slice>& keys,
std::vector<std::string>* values) override = 0;
virtual std::vector<Status> MultiGet(
const ReadOptions& options,
const std::vector<ColumnFamilyHandle*>& column_families,
const std::vector<Slice>& keys,
std::vector<std::string>* values) override {
for (auto column_family : column_families) {
if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
return std::vector<Status>(
column_families.size(),
Status::NotSupported(
"Blob DB doesn't support non-default column family."));
}
}
return MultiGet(options, keys, values);
}
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). 
Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
6 years ago
virtual void MultiGet(const ReadOptions& /*options*/,
ColumnFamilyHandle* /*column_family*/,
const size_t num_keys, const Slice* /*keys*/,
PinnableSlice* /*values*/, Status* statuses,
const bool /*sorted_input*/ = false) override {
for (size_t i = 0; i < num_keys; ++i) {
statuses[i] =
Status::NotSupported("Blob DB doesn't support batched MultiGet");
Introduce a new MultiGet batching implementation (#5011) Summary: This PR introduces a new MultiGet() API, with the underlying implementation grouping keys based on SST file and batching lookups in a file. The reason for the new API is twofold - the definition allows callers to allocate storage for status and values on stack instead of std::vector, as well as return values as PinnableSlices in order to avoid copying, and it keeps the original MultiGet() implementation intact while we experiment with batching. Batching is useful when there is some spatial locality to the keys being queries, as well as larger batch sizes. The main benefits are due to - 1. Fewer function calls, especially to BlockBasedTableReader::MultiGet() and FullFilterBlockReader::KeysMayMatch() 2. Bloom filter cachelines can be prefetched, hiding the cache miss latency The next step is to optimize the binary searches in the level_storage_info, index blocks and data blocks, since we could reduce the number of key comparisons if the keys are relatively close to each other. The batching optimizations also need to be extended to other formats, such as PlainTable and filter formats. This also needs to be added to db_stress. Benchmark results from db_bench for various batch size/locality of reference combinations are given below. Locality was simulated by offsetting the keys in a batch by a stride length. Each SST file is about 8.6MB uncompressed and key/value size is 16/100 uncompressed. To focus on the cpu benefit of batching, the runs were single threaded and bound to the same cpu to eliminate interference from other system events. The results show a 10-25% improvement in micros/op from smaller to larger batch sizes (4 - 32). 
Batch Sizes 1 | 2 | 4 | 8 | 16 | 32 Random pattern (Stride length 0) 4.158 | 4.109 | 4.026 | 4.05 | 4.1 | 4.074 - Get 4.438 | 4.302 | 4.165 | 4.122 | 4.096 | 4.075 - MultiGet (no batching) 4.461 | 4.256 | 4.277 | 4.11 | 4.182 | 4.14 - MultiGet (w/ batching) Good locality (Stride length 16) 4.048 | 3.659 | 3.248 | 2.99 | 2.84 | 2.753 4.429 | 3.728 | 3.406 | 3.053 | 2.911 | 2.781 4.452 | 3.45 | 2.833 | 2.451 | 2.233 | 2.135 Good locality (Stride length 256) 4.066 | 3.786 | 3.581 | 3.447 | 3.415 | 3.232 4.406 | 4.005 | 3.644 | 3.49 | 3.381 | 3.268 4.393 | 3.649 | 3.186 | 2.882 | 2.676 | 2.62 Medium locality (Stride length 4096) 4.012 | 3.922 | 3.768 | 3.61 | 3.582 | 3.555 4.364 | 4.057 | 3.791 | 3.65 | 3.57 | 3.465 4.479 | 3.758 | 3.316 | 3.077 | 2.959 | 2.891 dbbench command used (on a DB with 4 levels, 12 million keys)- TEST_TMPDIR=/dev/shm numactl -C 10 ./db_bench.tmp -use_existing_db=true -benchmarks="readseq,multireadrandom" -write_buffer_size=4194304 -target_file_size_base=4194304 -max_bytes_for_level_base=16777216 -num=12000000 -reads=12000000 -duration=90 -threads=1 -compression_type=none -cache_size=4194304000 -batch_size=32 -disable_auto_compactions=true -bloom_bits=10 -cache_index_and_filter_blocks=true -pin_l0_filter_and_index_blocks_in_cache=true -multiread_batched=true -multiread_stride=4 Pull Request resolved: https://github.com/facebook/rocksdb/pull/5011 Differential Revision: D14348703 Pulled By: anand1976 fbshipit-source-id: 774406dab3776d979c809522a67bedac6c17f84b
6 years ago
}
}
using ROCKSDB_NAMESPACE::StackableDB::SingleDelete;
virtual Status SingleDelete(const WriteOptions& /*wopts*/,
ColumnFamilyHandle* /*column_family*/,
const Slice& /*key*/) override {
return Status::NotSupported("Not supported operation in blob db.");
}
using ROCKSDB_NAMESPACE::StackableDB::Merge;
virtual Status Merge(const WriteOptions& /*options*/,
ColumnFamilyHandle* /*column_family*/,
const Slice& /*key*/, const Slice& /*value*/) override {
return Status::NotSupported("Not supported operation in blob db.");
}
virtual Status Write(const WriteOptions& opts,
WriteBatch* updates) override = 0;
Revise APIs related to user-defined timestamp (#8946) Summary: ajkr reminded me that we have a rule of not including per-kv related data in `WriteOptions`. Namely, `WriteOptions` should not include information about "what-to-write", but should just include information about "how-to-write". According to this rule, `WriteOptions::timestamp` (experimental) is clearly a violation. Therefore, this PR removes `WriteOptions::timestamp` for compliance. After the removal, we need to pass timestamp info via another set of APIs. This PR proposes a set of overloaded functions `Put(write_opts, key, value, ts)`, `Delete(write_opts, key, ts)`, and `SingleDelete(write_opts, key, ts)`. Planned to add `Write(write_opts, batch, ts)`, but its complexity made me reconsider doing it in another PR (maybe). For better checking and returning error early, we also add a new set of APIs to `WriteBatch` that take extra `timestamp` information when writing to `WriteBatch`es. These set of APIs in `WriteBatchWithIndex` are currently not supported, and are on our TODO list. Removed `WriteBatch::AssignTimestamps()` and renamed `WriteBatch::AssignTimestamp()` to `WriteBatch::UpdateTimestamps()` since this method require that all keys have space for timestamps allocated already and multiple timestamps can be updated. The constructor of `WriteBatch` now takes a fourth argument `default_cf_ts_sz` which is the timestamp size of the default column family. This will be used to allocate space when calling APIs that do not specify a column family handle. Also, updated `DB::Get()`, `DB::MultiGet()`, `DB::NewIterator()`, `DB::NewIterators()` methods, replacing some assertions about timestamp to returning Status code. 
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8946 Test Plan: make check ./db_bench -benchmarks=fillseq,fillrandom,readrandom,readseq,deleterandom -user_timestamp_size=8 ./db_stress --user_timestamp_size=8 -nooverwritepercent=0 -test_secondary=0 -secondary_catch_up_one_in=0 -continuous_verification_interval=0 Make sure there is no perf regression by running the following ``` ./db_bench_opt -db=/dev/shm/rocksdb -use_existing_db=0 -level0_stop_writes_trigger=256 -level0_slowdown_writes_trigger=256 -level0_file_num_compaction_trigger=256 -disable_wal=1 -duration=10 -benchmarks=fillrandom ``` Before this PR ``` DB path: [/dev/shm/rocksdb] fillrandom : 1.831 micros/op 546235 ops/sec; 60.4 MB/s ``` After this PR ``` DB path: [/dev/shm/rocksdb] fillrandom : 1.820 micros/op 549404 ops/sec; 60.8 MB/s ``` Reviewed By: ltamasi Differential Revision: D33721359 Pulled By: riversand963 fbshipit-source-id: c131561534272c120ffb80711d42748d21badf09
3 years ago
using ROCKSDB_NAMESPACE::StackableDB::NewIterator;
virtual Iterator* NewIterator(const ReadOptions& options) override = 0;
virtual Iterator* NewIterator(const ReadOptions& options,
ColumnFamilyHandle* column_family) override {
if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
// Blob DB doesn't support non-default column family.
return nullptr;
}
return NewIterator(options);
}
Status CompactFiles(
const CompactionOptions& compact_options,
const std::vector<std::string>& input_file_names, const int output_level,
const int output_path_id = -1,
std::vector<std::string>* const output_file_names = nullptr,
CompactionJobInfo* compaction_job_info = nullptr) override = 0;
Status CompactFiles(
const CompactionOptions& compact_options,
ColumnFamilyHandle* column_family,
const std::vector<std::string>& input_file_names, const int output_level,
const int output_path_id = -1,
std::vector<std::string>* const output_file_names = nullptr,
CompactionJobInfo* compaction_job_info = nullptr) override {
if (column_family->GetID() != DefaultColumnFamily()->GetID()) {
return Status::NotSupported(
"Blob DB doesn't support non-default column family.");
}
return CompactFiles(compact_options, input_file_names, output_level,
output_path_id, output_file_names, compaction_job_info);
}
using ROCKSDB_NAMESPACE::StackableDB::Close;
virtual Status Close() override = 0;
// Opening blob db.
static Status Open(const Options& options, const BlobDBOptions& bdb_options,
const std::string& dbname, BlobDB** blob_db);
static Status Open(const DBOptions& db_options,
const BlobDBOptions& bdb_options,
const std::string& dbname,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles,
BlobDB** blob_db);
virtual BlobDBOptions GetBlobDBOptions() const = 0;
virtual Status SyncBlobFiles() = 0;
virtual ~BlobDB() {}
protected:
explicit BlobDB();
};
// Destroy the content of the database named `dbname`.
// Takes the base DB `options` together with the blob-specific `bdb_options`
// and reports the outcome as a Status. Only the declaration is visible here.
// NOTE(review): presumably removes the blob files as well as the base DB -
// confirm against the definition.
Status DestroyBlobDB(const std::string& dbname, const Options& options,
const BlobDBOptions& bdb_options);
} // namespace blob_db
} // namespace ROCKSDB_NAMESPACE