Consider TTL compaction file cutting earlier to prevent small output file (#11075)

Summary:
In `CompactionOutputs::ShouldStopBefore()`, the TTL-related states `cur_files_to_cut_for_ttl_` and `next_files_to_cut_for_ttl_` are not updated if the function returns early. This can cause unnecessary cuts of compaction output files, producing smaller output files that may hurt write amplification. See the example in the new unit test for how this unnecessary file cutting can happen. This PR fixes the issue by moving the code that updates the TTL states earlier in `CompactionOutputs::ShouldStopBefore()`, so that the states are updated for every key.
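To make the control-flow issue concrete, here is a minimal sketch of the bug and the fix. The helper names are hypothetical stand-ins, not the actual RocksDB code:

#include <string_view>

// Hypothetical stand-ins for the real stop conditions; illustrative only.
static bool StopForOtherReason(std::string_view /*key*/) { return false; }
static bool UpdateTtlStatesAndCheck(std::string_view /*key*/) { return false; }

// Before the fix: the TTL-state update sat below earlier `return true`
// paths, so stopping for another reason skipped the update and left the
// TTL cutting states stale for subsequent keys.
bool ShouldStopBeforeBuggy(std::string_view key) {
  if (StopForOtherReason(key)) {
    return true;  // TTL states are NOT updated for this key
  }
  return UpdateTtlStatesAndCheck(key);
}

// After the fix: TTL states are updated for every key up front; the
// cached result is then consulted alongside the other stop conditions.
bool ShouldStopBeforeFixed(std::string_view key) {
  const bool stop_for_ttl = UpdateTtlStatesAndCheck(key);  // always runs
  if (StopForOtherReason(key)) {
    return true;
  }
  return stop_for_ttl;
}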

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11075

Test Plan: - Added new unit test.

Reviewed By: hx235

Differential Revision: D42398739

Pulled By: cbi42

fbshipit-source-id: 09fab66679c1a734abcfc31bcea33dd9aeb9dbc7
Changyu Bi 2 years ago committed by Facebook GitHub Bot
parent 6a82b68788
commit 4d0f9a995c
Changed files:
- HISTORY.md (1)
- db/compaction/compaction_outputs.cc (100)
- db/compaction/compaction_outputs.h (7)
- db/db_compaction_test.cc (72)

@@ -15,6 +15,7 @@
* Fixed a heap use after free bug in async scan prefetching when the scan thread and another thread try to read and load the same seek block into cache.
* Fixed a heap use after free in async scan prefetching if dictionary compression is enabled, in which case a sync read of the compression dictionary gets mixed with async prefetching.
* Fixed a data race bug where `CompactRange()` under `change_level=true` acts on an overlapping range with an ongoing file ingestion for level compaction. This will either result in overlapping file range corruption at a certain level, caught by `force_consistency_checks=true`, or potentially two identical keys both with seqno 0 in two different levels (i.e., new data ends up in the lower/older level). The latter will be caught by an assertion in debug builds but go silently and result in reads returning wrong results in release builds. This fix is general, so it also replaced previous fixes to a similar problem for `CompactFiles()` (#4665), general `CompactRange()` and auto compaction (commit 5c64fb6 and 87dfc1d).
* Fixed a bug in compaction output cutting where small output files were produced because TTL file cutting states were not updated (#11075).
### New Features
* When an SstPartitionerFactory is configured, CompactRange() now automatically selects for compaction any files overlapping a partition boundary that is in the compaction range, even if no actual entries are in the requested compaction range. With this feature, manual compaction can be used to (re-)establish SST partition points when SstPartitioner changes, without a full compaction.

@@ -76,6 +76,46 @@ IOStatus CompactionOutputs::WriterSyncClose(const Status& input_status,
  return io_s;
}

bool CompactionOutputs::UpdateFilesToCutForTTLStates(
    const Slice& internal_key) {
  if (!files_to_cut_for_ttl_.empty()) {
    const InternalKeyComparator* icmp =
        &compaction_->column_family_data()->internal_comparator();
    if (cur_files_to_cut_for_ttl_ != -1) {
      // Previous key is inside the range of a file
      if (icmp->Compare(internal_key,
                        files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
                            ->largest.Encode()) > 0) {
        next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
        cur_files_to_cut_for_ttl_ = -1;
        return true;
      }
    } else {
      // Look for the key position
      while (next_files_to_cut_for_ttl_ <
             static_cast<int>(files_to_cut_for_ttl_.size())) {
        if (icmp->Compare(internal_key,
                          files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
                              ->smallest.Encode()) >= 0) {
          if (icmp->Compare(internal_key,
                            files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
                                ->largest.Encode()) <= 0) {
            // Within the current file
            cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
            return true;
          }
          // Beyond the current file
          next_files_to_cut_for_ttl_++;
        } else {
          // Still fall into the gap
          break;
        }
      }
    }
  }
  return false;
}

size_t CompactionOutputs::UpdateGrandparentBoundaryInfo(
    const Slice& internal_key) {
  size_t curr_key_boundary_switched_num = 0;
@@ -185,18 +225,30 @@ uint64_t CompactionOutputs::GetCurrentKeyGrandparentOverlappedBytes(
bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
  assert(c_iter.Valid());

  // always update grandparent information like overlapped file number, size
  // etc.
  const Slice& internal_key = c_iter.key();
  const uint64_t previous_overlapped_bytes = grandparent_overlapped_bytes_;
  size_t num_grandparent_boundaries_crossed =
      UpdateGrandparentBoundaryInfo(internal_key);
  const InternalKeyComparator* icmp =
      &compaction_->column_family_data()->internal_comparator();
  size_t num_grandparent_boundaries_crossed = 0;
  bool should_stop_for_ttl = false;
  // Always update grandparent information like overlapped file number, size
  // etc., and TTL states.
  // If compaction_->output_level() == 0, there is no need to update grandparent
  // info, as `grandparent` should be empty.
  if (compaction_->output_level() > 0) {
    num_grandparent_boundaries_crossed =
        UpdateGrandparentBoundaryInfo(internal_key);
    should_stop_for_ttl = UpdateFilesToCutForTTLStates(internal_key);
  }

  if (!HasBuilder()) {
    return false;
  }

  if (should_stop_for_ttl) {
    return true;
  }

  // If there's user defined partitioner, check that first
  if (partitioner_ && partitioner_->ShouldPartition(PartitionerRequest(
                          last_key_for_partitioner_, c_iter.user_key(),
@@ -214,9 +266,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
    return true;
  }

  const InternalKeyComparator* icmp =
      &compaction_->column_family_data()->internal_comparator();

  // Check if it needs to split for RoundRobin
  // Invalid local_output_split_key indicates that we do not need to split
  if (local_output_split_key_ != nullptr && !is_split_) {
@@ -290,41 +339,6 @@ bool CompactionOutputs::ShouldStopBefore(const CompactionIterator& c_iter) {
    }
  }

  // check ttl file boundaries if there's any
  if (!files_to_cut_for_ttl_.empty()) {
    if (cur_files_to_cut_for_ttl_ != -1) {
      // Previous key is inside the range of a file
      if (icmp->Compare(internal_key,
                        files_to_cut_for_ttl_[cur_files_to_cut_for_ttl_]
                            ->largest.Encode()) > 0) {
        next_files_to_cut_for_ttl_ = cur_files_to_cut_for_ttl_ + 1;
        cur_files_to_cut_for_ttl_ = -1;
        return true;
      }
    } else {
      // Look for the key position
      while (next_files_to_cut_for_ttl_ <
             static_cast<int>(files_to_cut_for_ttl_.size())) {
        if (icmp->Compare(internal_key,
                          files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
                              ->smallest.Encode()) >= 0) {
          if (icmp->Compare(internal_key,
                            files_to_cut_for_ttl_[next_files_to_cut_for_ttl_]
                                ->largest.Encode()) <= 0) {
            // Within the current file
            cur_files_to_cut_for_ttl_ = next_files_to_cut_for_ttl_;
            return true;
          }
          // Beyond the current file
          next_files_to_cut_for_ttl_++;
        } else {
          // Still fall into the gap
          break;
        }
      }
    }
  }
  return false;
}
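
For readers tracing the cursor logic above, here is a small self-contained model of the same two-state walk, using plain ints instead of internal keys. `FileRange` and `TtlCutCursor` are illustrative names for this sketch, not RocksDB types:

#include <cstdio>
#include <initializer_list>
#include <vector>

struct FileRange {
  int smallest;
  int largest;
};

struct TtlCutCursor {
  std::vector<FileRange> files;  // sorted, non-overlapping TTL-eligible files
  int cur = -1;   // index of the file containing the previous key, or -1
  int next = 0;   // next candidate file to enter

  // Returns true when the output should be cut before `key`: either the
  // previous key's TTL file was just exited, or a TTL file was just entered.
  bool Update(int key) {
    if (files.empty()) return false;
    if (cur != -1) {
      // Previous key was inside files[cur]; cut once we move past it.
      if (key > files[cur].largest) {
        next = cur + 1;
        cur = -1;
        return true;
      }
    } else {
      // Previous key was in a gap; advance until key's position is found.
      while (next < static_cast<int>(files.size())) {
        if (key >= files[next].smallest) {
          if (key <= files[next].largest) {
            cur = next;  // entered a TTL file: cut here
            return true;
          }
          ++next;  // key is beyond files[next]; keep scanning
        } else {
          break;  // key is still in the gap before files[next]
        }
      }
    }
    return false;
  }
};

int main() {
  TtlCutCursor cursor;
  cursor.files = {{10, 20}, {40, 50}};
  for (int key : {5, 12, 18, 25, 45, 60}) {
    std::printf("key %d -> cut=%d\n", key, static_cast<int>(cursor.Update(key)));
  }
  // Cuts before 12 (entering [10,20]), 25 (leaving it), 45, and 60.
  return 0;
}

Cutting on both file entry and file exit is what isolates an expired file's key range into its own output file, so a later TTL compaction can pick it up without rewriting its neighbors.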

@@ -221,6 +221,13 @@
    }
  }

  // Updates states related to file cutting for TTL.
  // Returns a boolean value indicating whether the current
  // compaction output file should be cut before `internal_key`.
  //
  // @param internal_key the current key to be added to output.
  bool UpdateFilesToCutForTTLStates(const Slice& internal_key);

  // update tracked grandparents information like grandparent index, if it's
  // in the gap between 2 grandparent files, accumulated grandparent files size
  // etc.

@@ -4219,6 +4219,78 @@ TEST_F(DBCompactionTest, LevelCompactExpiredTtlFiles) {
  ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing();
}

TEST_F(DBCompactionTest, LevelTtlCompactionOutputCuttingIteractingWithOther) {
  // This test is for a bug fix in CompactionOutputs::ShouldStopBefore() where
  // TTL states were not updated for keys on which ShouldStopBefore() returned
  // true for reasons other than TTL.
  Options options = CurrentOptions();
  options.compression = kNoCompression;
  options.ttl = 24 * 60 * 60;  // 24 hours
  options.max_open_files = -1;
  options.compaction_pri = kMinOverlappingRatio;
  env_->SetMockSleep();
  options.env = env_;
  options.target_file_size_base = 4 << 10;
  options.disable_auto_compactions = true;
  options.level_compaction_dynamic_file_size = false;
  DestroyAndReopen(options);

  Random rnd(301);
  // This makes sure the manual compaction below
  // is not a bottommost compaction as TTL is only
  // for non-bottommost compactions.
  ASSERT_OK(Put(Key(3), rnd.RandomString(1 << 10)));
  ASSERT_OK(Put(Key(0), rnd.RandomString(1 << 10)));
  ASSERT_OK(Flush());
  MoveFilesToLevel(6);

  // L2:
  ASSERT_OK(Put(Key(2), rnd.RandomString(4 << 10)));
  ASSERT_OK(Put(Key(3), rnd.RandomString(4 << 10)));
  ASSERT_OK(Flush());
  MoveFilesToLevel(2);

  // L1, overlaps in range with the file in L2 so
  // that they compact together.
  ASSERT_OK(Put(Key(0), rnd.RandomString(4 << 10)));
  ASSERT_OK(Put(Key(1), rnd.RandomString(4 << 10)));
  ASSERT_OK(Put(Key(3), rnd.RandomString(4 << 10)));
  ASSERT_OK(Flush());
  MoveFilesToLevel(1);
  ASSERT_EQ("0,1,1,0,0,0,1", FilesPerLevel());

  // 36 hours so that the file in L2 is eligible for TTL
  env_->MockSleepForSeconds(36 * 60 * 60);

  CompactRangeOptions compact_range_opts;
  ASSERT_OK(dbfull()->RunManualCompaction(
      static_cast_with_check<ColumnFamilyHandleImpl>(db_->DefaultColumnFamily())
          ->cfd(),
      1 /* input_level */, 2 /* output_level */, compact_range_opts,
      nullptr /* begin */, nullptr /* end */, true /* exclusive */,
      true /* disallow_trivial_move */,
      std::numeric_limits<uint64_t>::max() /* max_file_num_to_ignore */,
      "" /* trim_ts */));

  // L2 should have 2 files:
  // file 1: Key(0), Key(1)
  //   ShouldStopBefore(Key(2)) returns true due to TTL or output file size.
  // file 2: Key(2), Key(3)
  //
  // Before the fix in this PR, L2 would have 3 files:
  // file 1: Key(0), Key(1)
  //   CompactionOutputs::ShouldStopBefore(Key(2)) returns true due to output
  //   file size.
  // file 2: Key(2)
  //   CompactionOutputs::ShouldStopBefore(Key(3)) returns true due to TTL
  //   cutting, because TTL states were not updated for Key(2).
  // file 3: Key(3)
  ASSERT_EQ("0,0,2,0,0,0,1", FilesPerLevel());
}

TEST_F(DBCompactionTest, LevelTtlCascadingCompactions) {
  env_->SetMockSleep();
  const int kValueSize = 100;
