Make CompactRange() more aware of SstPartitionerFactory (#11032)

Summary:
Some users are at least considering using SstPartitioner to support efficient physical migration of specific key ranges between RocksDB instances. One might expect manual `CompactRange()` over a narrow key range across some partition to enforce partitioning of any SST files crossing that partition boundary, but that currently only works if there are keys within that range.

This change makes the overlap logic in CompactRange more aware of the partitioner to automatically select relevant files crossing a partition boundary, even when they otherwise would not be selected due to the compaction range falling in a gap between entries.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11032

Test Plan: unit test included

Reviewed By: hx235

Differential Revision: D41981380

Pulled By: pdillinger

fbshipit-source-id: 2fe445bdddc73c00276c20f295cc1fa33d15b05a
main
Peter Dillinger 2 years ago committed by Facebook GitHub Bot
parent f8969ad7d4
commit e6b6e74154
  1. 3
      HISTORY.md
  2. 64
      db/db_compaction_test.cc
  3. 58
      db/db_impl/db_impl_compaction_flush.cc

@ -15,6 +15,9 @@
* Fixed a bug in LockWAL() leading to re-locking mutex (#11020). * Fixed a bug in LockWAL() leading to re-locking mutex (#11020).
* Fixed a heap use after free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache. * Fixed a heap use after free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache.
### New Features
* When an SstPartitionerFactory is configured, CompactRange() now automatically selects for compaction any files overlapping a partition boundary that is in the compaction range, even if no actual entries are in the requested compaction range. With this feature, manual compaction can be used to (re-)establish SST partition points when SstPartitioner changes, without a full compaction.
## 7.9.0 (11/21/2022) ## 7.9.0 (11/21/2022)
### Performance Improvements ### Performance Improvements
* Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877). * Fixed an iterator performance regression for delete range users when scanning through a consecutive sequence of range tombstones (#10877).

@ -1026,6 +1026,70 @@ TEST_F(DBCompactionTest, CompactionSstPartitioner) {
ASSERT_EQ("B", Get("bbbb1")); ASSERT_EQ("B", Get("bbbb1"));
} }
TEST_F(DBCompactionTest, CompactionSstPartitionWithManualCompaction) {
Options options = CurrentOptions();
options.compaction_style = kCompactionStyleLevel;
options.level0_file_num_compaction_trigger = 3;
DestroyAndReopen(options);
// create first file and flush to l0
ASSERT_OK(Put("000015", "A"));
ASSERT_OK(Put("000025", "B"));
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
// create second file and flush to l0
ASSERT_OK(Put("000015", "A2"));
ASSERT_OK(Put("000025", "B2"));
ASSERT_OK(Flush());
ASSERT_OK(dbfull()->TEST_WaitForFlushMemTable());
// CONTROL 1: compact without partitioner
CompactRangeOptions compact_options;
compact_options.bottommost_level_compaction =
BottommostLevelCompaction::kForceOptimized;
ASSERT_OK(dbfull()->CompactRange(CompactRangeOptions(), nullptr, nullptr));
// Check (compacted but no partitioning yet)
std::vector<LiveFileMetaData> files;
dbfull()->GetLiveFilesMetaData(&files);
ASSERT_EQ(1, files.size());
// Install partitioner
std::shared_ptr<SstPartitionerFactory> factory(
NewSstPartitionerFixedPrefixFactory(5));
options.sst_partitioner_factory = factory;
Reopen(options);
// CONTROL 2: request compaction on range with no partition boundary and no
// overlap with actual entries
Slice from("000017");
Slice to("000019");
ASSERT_OK(dbfull()->CompactRange(compact_options, &from, &to));
// Check (no partitioning yet)
files.clear();
dbfull()->GetLiveFilesMetaData(&files);
ASSERT_EQ(1, files.size());
ASSERT_EQ("A2", Get("000015"));
ASSERT_EQ("B2", Get("000025"));
// TEST: request compaction overlapping with partition boundary but no
// actual entries
// NOTE: `to` is INCLUSIVE
from = Slice("000019");
to = Slice("000020");
ASSERT_OK(dbfull()->CompactRange(compact_options, &from, &to));
// Check (must be partitioned)
files.clear();
dbfull()->GetLiveFilesMetaData(&files);
ASSERT_EQ(2, files.size());
ASSERT_EQ("A2", Get("000015"));
ASSERT_EQ("B2", Get("000025"));
}
TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) { TEST_F(DBCompactionTest, CompactionSstPartitionerNonTrivial) {
Options options = CurrentOptions(); Options options = CurrentOptions();
options.compaction_style = kCompactionStyleLevel; options.compaction_style = kCompactionStyleLevel;

@ -1087,6 +1087,22 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
{ {
SuperVersion* super_version = cfd->GetReferencedSuperVersion(this); SuperVersion* super_version = cfd->GetReferencedSuperVersion(this);
Version* current_version = super_version->current; Version* current_version = super_version->current;
// Might need to query the partitioner
SstPartitionerFactory* partitioner_factory =
current_version->cfd()->ioptions()->sst_partitioner_factory.get();
std::unique_ptr<SstPartitioner> partitioner;
if (partitioner_factory && begin != nullptr && end != nullptr) {
SstPartitioner::Context context;
context.is_full_compaction = false;
context.is_manual_compaction = true;
context.output_level = /*unknown*/ -1;
// Small lies about compaction range
context.smallest_user_key = *begin;
context.largest_user_key = *end;
partitioner = partitioner_factory->CreatePartitioner(context);
}
ReadOptions ro; ReadOptions ro;
ro.total_order_seek = true; ro.total_order_seek = true;
bool overlap; bool overlap;
@ -1094,14 +1110,50 @@ Status DBImpl::CompactRangeInternal(const CompactRangeOptions& options,
level < current_version->storage_info()->num_non_empty_levels(); level < current_version->storage_info()->num_non_empty_levels();
level++) { level++) {
overlap = true; overlap = true;
// Whether to look at specific keys within files for overlap with
// compaction range, other than largest and smallest keys of the file
// known in Version metadata.
bool check_overlap_within_file = false;
if (begin != nullptr && end != nullptr) { if (begin != nullptr && end != nullptr) {
// Typically checking overlap within files in this case
check_overlap_within_file = true;
// WART: Not known why we don't check within file in one-sided bound
// cases
if (partitioner) {
// Especially if the partitioner is new, the manual compaction
// might be used to enforce the partitioning. Checking overlap
// within files might miss cases where compaction is needed to
// partition the files, as in this example:
// * File has two keys "001" and "111"
// * Compaction range is ["011", "101")
// * Partition boundary at "100"
// In cases like this, file-level overlap with the compaction
// range is sufficient to force any partitioning that is needed
// within the compaction range.
//
// But if there's no partitioning boundary within the compaction
// range, we can be sure there's no need to fix partitioning
// within that range, thus safe to check overlap within file.
//
// Use a hypothetical trivial move query to check for partition
// boundary in range. (NOTE: in defiance of all conventions,
// `begin` and `end` here are both INCLUSIVE bounds, which makes
// this analogy to CanDoTrivialMove() accurate even when `end` is
// the first key in a partition.)
if (!partitioner->CanDoTrivialMove(*begin, *end)) {
check_overlap_within_file = false;
}
}
}
if (check_overlap_within_file) {
Status status = current_version->OverlapWithLevelIterator( Status status = current_version->OverlapWithLevelIterator(
ro, file_options_, *begin, *end, level, &overlap); ro, file_options_, *begin, *end, level, &overlap);
if (!status.ok()) { if (!status.ok()) {
overlap = current_version->storage_info()->OverlapInLevel( check_overlap_within_file = false;
level, begin, end);
} }
} else { }
if (!check_overlap_within_file) {
overlap = current_version->storage_info()->OverlapInLevel(level, overlap = current_version->storage_info()->OverlapInLevel(level,
begin, end); begin, end);
} }

Loading…
Cancel
Save