// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
#pragma once
#include <algorithm>
#include <cassert>
#include <memory>
#include <utility>
#include <vector>

#include "table/internal_iterator.h"
namespace ROCKSDB_NAMESPACE {
// PinnedIteratorsManager will be notified whenever we need to pin an Iterator
// and it will be responsible for deleting pinned Iterators when they are
// not needed anymore.
class PinnedIteratorsManager : public Cleanable {
|
|
|
|
public:
|
|
|
|
PinnedIteratorsManager() : pinning_enabled(false) {}
|
|
|
|
~PinnedIteratorsManager() {
|
|
|
|
if (pinning_enabled) {
|
|
|
|
ReleasePinnedData();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Avoid allocations/copies for large `GetMergeOperands()` results (#10458)
Summary:
This PR avoids allocations and copies for the result of `GetMergeOperands()` when the average operand size is at least 256 bytes and the total operands size is at least 32KB. The `GetMergeOperands()` already included `PinnableSlice` but was calling `PinSelf()` (i.e., allocating and copying) for each operand. When this optimization takes effect, we instead call `PinSlice()` to skip that allocation and copy. Resources are pinned in order for the `PinnableSlice` to point to valid memory even after `GetMergeOperands()` returns.
The pinned resources include a referenced `SuperVersion`, a `MergingContext`, and a `PinnedIteratorsManager`. They are bundled into a `GetMergeOperandsState`. We use `SharedCleanablePtr` to share that bundle among all `PinnableSlice`s populated by `GetMergeOperands()`. That way, the last `PinnableSlice` to be `Reset()` will cleanup the bundle, including unreferencing the `SuperVersion`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10458
Test Plan:
- new DB level test
- measured benefit/regression in a number of memtable scenarios
Setup command:
```
$ ./db_bench -benchmarks=mergerandom -merge_operator=StringAppendOperator -num=$num -writes=16384 -key_size=16 -value_size=$value_sz -compression_type=none -write_buffer_size=1048576000
```
Benchmark command:
```
./db_bench -threads=$threads -use_existing_db=true -avoid_flush_during_recovery=true -write_buffer_size=1048576000 -benchmarks=readrandomoperands -merge_operator=StringAppendOperator -num=$num -duration=10
```
Worst regression is when a key has many tiny operands:
- Parameters: num=1 (implying 16384 operands per key), value_sz=8, threads=1
- `GetMergeOperands()` latency increases 682 micros -> 800 micros (+17%)
The regression disappears into the noise (<1% difference) if we remove the `Reset()` loop and the size counting loop. The former is arguably needed regardless of this PR as the convention in `Get()` and `MultiGet()` is to `Reset()` the input `PinnableSlice`s at the start. The latter could be optimized to count the size as we accumulate operands rather than after the fact.
Best improvement is when a key has large operands and high concurrency:
- Parameters: num=4 (implying 4096 operands per key), value_sz=2KB, threads=32
- `GetMergeOperands()` latency decreases 11492 micros -> 437 micros (-96%).
Reviewed By: cbi42
Differential Revision: D38336578
Pulled By: ajkr
fbshipit-source-id: 48146d127e04cb7f2d4d2939a2b9dff3aba18258
2 years ago
|
|
|
// Move constructor and move assignment is allowed.
|
|
|
|
PinnedIteratorsManager(PinnedIteratorsManager&& other) noexcept = default;
|
|
|
|
PinnedIteratorsManager& operator=(PinnedIteratorsManager&& other) noexcept =
|
|
|
|
default;
|
|
|
|
|
|
|
|
// Enable Iterators pinning
|
|
|
|
void StartPinning() {
|
|
|
|
assert(pinning_enabled == false);
|
|
|
|
pinning_enabled = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Is pinning enabled ?
|
|
|
|
bool PinningEnabled() { return pinning_enabled; }
|
|
|
|
|
|
|
|
// Take ownership of iter and delete it when ReleasePinnedData() is called
|
|
|
|
void PinIterator(InternalIterator* iter, bool arena = false) {
|
|
|
|
if (arena) {
|
|
|
|
PinPtr(iter, &PinnedIteratorsManager::ReleaseArenaInternalIterator);
|
|
|
|
} else {
|
|
|
|
PinPtr(iter, &PinnedIteratorsManager::ReleaseInternalIterator);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
using ReleaseFunction = void (*)(void* arg1);
|
|
|
|
void PinPtr(void* ptr, ReleaseFunction release_func) {
|
|
|
|
assert(pinning_enabled);
|
|
|
|
if (ptr == nullptr) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
pinned_ptrs_.emplace_back(ptr, release_func);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Release pinned Iterators
|
|
|
|
inline void ReleasePinnedData() {
|
|
|
|
assert(pinning_enabled == true);
|
|
|
|
pinning_enabled = false;
|
|
|
|
|
|
|
|
// Remove duplicate pointers
|
|
|
|
std::sort(pinned_ptrs_.begin(), pinned_ptrs_.end());
|
|
|
|
auto unique_end = std::unique(pinned_ptrs_.begin(), pinned_ptrs_.end());
|
|
|
|
|
|
|
|
for (auto i = pinned_ptrs_.begin(); i != unique_end; ++i) {
|
|
|
|
void* ptr = i->first;
|
|
|
|
ReleaseFunction release_func = i->second;
|
|
|
|
release_func(ptr);
|
|
|
|
}
|
|
|
|
pinned_ptrs_.clear();
|
|
|
|
// Also do cleanups from the base Cleanable
|
|
|
|
Cleanable::Reset();
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
static void ReleaseInternalIterator(void* ptr) {
|
|
|
|
delete reinterpret_cast<InternalIterator*>(ptr);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void ReleaseArenaInternalIterator(void* ptr) {
|
|
|
|
reinterpret_cast<InternalIterator*>(ptr)->~InternalIterator();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool pinning_enabled;
|
|
|
|
std::vector<std::pair<void*, ReleaseFunction>> pinned_ptrs_;
|
|
|
|
};
} // namespace ROCKSDB_NAMESPACE