|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
#include "db/compacted_db_impl.h"
|
|
|
|
#include "db/db_impl/db_impl.h"
|
|
|
|
#include "db/version_set.h"
|
|
|
|
#include "table/get_context.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
extern void MarkKeyMayExist(void* arg);
|
|
|
|
extern bool SaveValue(void* arg, const ParsedInternalKey& parsed_key,
|
|
|
|
const Slice& v, bool hit_and_return);
|
|
|
|
|
|
|
|
// Builds the DBImpl base from `options` and `dbname` and leaves every
// CompactedDBImpl-specific pointer null; Init() assigns them later.
CompactedDBImpl::CompactedDBImpl(const DBOptions& options,
                                 const std::string& dbname)
    : DBImpl(options, dbname),
      cfd_(nullptr),
      version_(nullptr),
      user_comparator_(nullptr) {}
|
|
|
|
|
|
|
|
// Nothing to release here explicitly; member and base-class destructors do
// all the work.
CompactedDBImpl::~CompactedDBImpl() = default;
|
|
|
|
|
|
|
|
size_t CompactedDBImpl::FindFile(const Slice& key) {
|
|
|
|
size_t right = files_.num_files - 1;
|
|
|
|
auto cmp = [&](const FdWithKeyRange& f, const Slice& k) -> bool {
|
|
|
|
return user_comparator_->Compare(ExtractUserKey(f.largest_key), k) < 0;
|
|
|
|
};
|
|
|
|
return static_cast<size_t>(std::lower_bound(files_.files,
|
|
|
|
files_.files + right, key, cmp) - files_.files);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Point lookup of `key` in the file selected by FindFile().
//
// The column family argument is ignored. On a hit the table reader fills
// `value` through the GetContext and Status::OK() is returned.
//
// Returns:
//   - Status::OK() when the GetContext ends in state kFound;
//   - the table reader's status when it reports a failure other than
//     NotFound (previously this status was dropped and such failures were
//     reported to the caller as NotFound);
//   - Status::NotFound() otherwise.
Status CompactedDBImpl::Get(const ReadOptions& options, ColumnFamilyHandle*,
                            const Slice& key, PinnableSlice* value) {
  GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
                         GetContext::kNotFound, key, value, nullptr, nullptr,
                         true, nullptr, nullptr);
  LookupKey lkey(key, kMaxSequenceNumber);
  const Status s = files_.files[FindFile(key)].fd.table_reader->Get(
      options, lkey.internal_key(), &get_context, nullptr);
  // Surface read failures instead of masking them as a missing key.
  if (!s.ok() && !s.IsNotFound()) {
    return s;
  }
  if (get_context.State() == GetContext::kFound) {
    return Status::OK();
  }
  return Status::NotFound();
}
|
|
|
|
|
|
|
|
std::vector<Status> CompactedDBImpl::MultiGet(const ReadOptions& options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>&,
|
|
|
|
const std::vector<Slice>& keys, std::vector<std::string>* values) {
|
|
|
|
autovector<TableReader*, 16> reader_list;
|
|
|
|
for (const auto& key : keys) {
|
|
|
|
const FdWithKeyRange& f = files_.files[FindFile(key)];
|
|
|
|
if (user_comparator_->Compare(key, ExtractUserKey(f.smallest_key)) < 0) {
|
|
|
|
reader_list.push_back(nullptr);
|
|
|
|
} else {
|
|
|
|
LookupKey lkey(key, kMaxSequenceNumber);
|
|
|
|
f.fd.table_reader->Prepare(lkey.internal_key());
|
|
|
|
reader_list.push_back(f.fd.table_reader);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
std::vector<Status> statuses(keys.size(), Status::NotFound());
|
|
|
|
values->resize(keys.size());
|
|
|
|
int idx = 0;
|
|
|
|
for (auto* r : reader_list) {
|
|
|
|
if (r != nullptr) {
|
|
|
|
PinnableSlice pinnable_val;
|
|
|
|
std::string& value = (*values)[idx];
|
|
|
|
GetContext get_context(user_comparator_, nullptr, nullptr, nullptr,
|
|
|
|
GetContext::kNotFound, keys[idx], &pinnable_val,
|
New API to get all merge operands for a Key (#5604)
Summary:
This is a new API added to db.h to allow for fetching all merge operands associated with a Key. The main motivation for this API is to support use cases where doing a full online merge is not necessary as it is performance sensitive. Example use-cases:
1. Update subset of columns and read subset of columns -
Imagine a SQL Table, a row is encoded as a K/V pair (as it is done in MyRocks). If there are many columns and users only updated one of them, we can use merge operator to reduce write amplification. While users only read one or two columns in the read query, this feature can avoid a full merging of the whole row, and save some CPU.
2. Updating very few attributes in a value which is a JSON-like document -
Updating one attribute can be done efficiently using merge operator, while reading back one attribute can be done more efficiently if we don't need to do a full merge.
----------------------------------------------------------------------------------------------------
API :
Status GetMergeOperands(
const ReadOptions& options, ColumnFamilyHandle* column_family,
const Slice& key, PinnableSlice* merge_operands,
GetMergeOperandsOptions* get_merge_operands_options,
int* number_of_operands)
Example usage :
int size = 100;
int number_of_operands = 0;
std::vector<PinnableSlice> values(size);
GetMergeOperandsOptions merge_operands_info;
db_->GetMergeOperands(ReadOptions(), db_->DefaultColumnFamily(), "k1", values.data(), merge_operands_info, &number_of_operands);
Description :
Returns all the merge operands corresponding to the key. If the number of merge operands in DB is greater than merge_operands_options.expected_max_number_of_operands no merge operands are returned and status is Incomplete. Merge operands returned are in the order of insertion.
merge_operands-> Points to an array of at-least merge_operands_options.expected_max_number_of_operands and the caller is responsible for allocating it. If the status returned is Incomplete then number_of_operands will contain the total number of merge operands found in DB for key.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5604
Test Plan:
Added unit test and perf test in db_bench that can be run using the command:
./db_bench -benchmarks=getmergeoperands --merge_operator=sortlist
Differential Revision: D16657366
Pulled By: vjnadimpalli
fbshipit-source-id: 0faadd752351745224ee12d4ae9ef3cb529951bf
5 years ago
|
|
|
nullptr, nullptr, true, nullptr, nullptr);
|
|
|
|
LookupKey lkey(keys[idx], kMaxSequenceNumber);
|
|
|
|
r->Get(options, lkey.internal_key(), &get_context, nullptr);
|
|
|
|
value.assign(pinnable_val.data(), pinnable_val.size());
|
|
|
|
if (get_context.State() == GetContext::kFound) {
|
|
|
|
statuses[idx] = Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
++idx;
|
|
|
|
}
|
|
|
|
return statuses;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Recovers the default column family in read-only mode and selects the one
// level whose files will serve all reads (stored in files_).
//
// Accepted layouts:
//   - exactly one file in L0 and no other non-empty level, or
//   - no file in L0 and files only in the last non-empty level.
// Any other layout yields Status::NotSupported with a message naming the
// violated condition. A failed Recover() is returned unchanged.
Status CompactedDBImpl::Init(const Options& options) {
  SuperVersionContext sv_context(/* create_superversion */ true);

  // Recovery and super-version installation happen while mutex_ is held.
  mutex_.Lock();
  ColumnFamilyDescriptor default_cf(kDefaultColumnFamilyName,
                                    ColumnFamilyOptions(options));
  Status recover_status =
      Recover({default_cf}, true /* read only */, false, true);
  if (recover_status.ok()) {
    auto* handle =
        reinterpret_cast<ColumnFamilyHandleImpl*>(DefaultColumnFamily());
    cfd_ = handle->cfd();
    cfd_->InstallSuperVersion(&sv_context, &mutex_);
  }
  mutex_.Unlock();

  // Clean the super-version context only after the mutex has been released.
  sv_context.Clean();
  if (!recover_status.ok()) {
    return recover_status;
  }

  NewThreadStatusCfInfo(cfd_);
  version_ = cfd_->GetSuperVersion()->current;
  user_comparator_ = cfd_->user_comparator();

  auto* storage = version_->storage_info();
  const int num_levels = storage->num_non_empty_levels();
  if (num_levels == 0) {
    return Status::NotSupported("no file exists");
  }

  // L0 may hold at most one file, and only if it is the sole non-empty level.
  const LevelFilesBrief& level0 = storage->LevelFilesBrief(0);
  if (level0.num_files > 1) {
    return Status::NotSupported("L0 contain more than 1 file");
  }
  if (level0.num_files == 1) {
    if (num_levels > 1) {
      return Status::NotSupported("Both L0 and other level contain files");
    }
    files_ = level0;
    return Status::OK();
  }

  // L0 is empty: every level strictly between L0 and the last non-empty
  // level must be empty as well.
  const int last_level = num_levels - 1;
  for (int level = 1; level < last_level; ++level) {
    if (storage->LevelFilesBrief(level).num_files > 0) {
      return Status::NotSupported("Other levels also contain files");
    }
  }

  const LevelFilesBrief& last_level_files = storage->LevelFilesBrief(last_level);
  if (last_level_files.num_files == 0) {
    return Status::NotSupported("no file exists");
  }
  files_ = last_level_files;
  return Status::OK();
}
|
|
|
|
|
|
|
|
// Opens `dbname` as a CompactedDBImpl and hands ownership to the caller
// through `*dbptr`.
//
// `*dbptr` is set to nullptr up front and is only overwritten on success.
// Returns Status::InvalidArgument when options.max_open_files is not -1 or
// when a merge operator is configured; otherwise returns the result of
// Init(). On an Init() failure the partially constructed DB is destroyed.
Status CompactedDBImpl::Open(const Options& options,
                             const std::string& dbname, DB** dbptr) {
  *dbptr = nullptr;

  if (options.max_open_files != -1) {
    return Status::InvalidArgument("require max_open_files = -1");
  }
  if (options.merge_operator.get() != nullptr) {
    return Status::InvalidArgument("merge operator is not supported");
  }

  DBOptions db_options(options);
  std::unique_ptr<CompactedDBImpl> compacted_db(
      new CompactedDBImpl(db_options, dbname));
  Status init_status = compacted_db->Init(options);
  if (!init_status.ok()) {
    // compacted_db goes out of scope here and deletes the instance.
    return init_status;
  }

  compacted_db->StartTimedTasks();
  ROCKS_LOG_INFO(compacted_db->immutable_db_options_.info_log,
                 "Opened the db as fully compacted mode");
  LogFlush(compacted_db->immutable_db_options_.info_log);
  *dbptr = compacted_db.release();
  return init_status;
}
|
|
|
|
|
|
|
|
} // namespace rocksdb
|
|
|
|
#endif // ROCKSDB_LITE
|