|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/compaction/compaction.h"
|
|
|
|
|
|
|
|
#include <cinttypes>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/column_family.h"
|
|
|
|
#include "logging/logging.h"
|
|
|
|
#include "rocksdb/compaction_filter.h"
|
|
|
|
#include "rocksdb/sst_partitioner.h"
|
|
|
|
#include "test_util/sync_point.h"
|
|
|
|
#include "util/string_util.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
// Key footer obtained by packing kMaxSequenceNumber together with
// kTypeRangeDeletion. sstableKeyCompare() compares key footers against this
// value to recognise range tombstone sentinel keys.
const uint64_t kRangeTombstoneSentinel(
    PackSequenceAndType(kMaxSequenceNumber, kTypeRangeDeletion));
|
|
|
|
|
|
|
|
int sstableKeyCompare(const Comparator* uc, const Slice& a, const Slice& b) {
|
|
|
|
auto c = uc->CompareWithoutTimestamp(ExtractUserKey(a), ExtractUserKey(b));
|
|
|
|
if (c != 0) {
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
auto a_footer = ExtractInternalKeyFooter(a);
|
|
|
|
auto b_footer = ExtractInternalKeyFooter(b);
|
|
|
|
if (a_footer == kRangeTombstoneSentinel) {
|
|
|
|
if (b_footer != kRangeTombstoneSentinel) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
} else if (b_footer == kRangeTombstoneSentinel) {
|
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
|
|
|
|
const InternalKey& b) {
|
|
|
|
if (a == nullptr) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return sstableKeyCompare(user_cmp, *a, b);
|
|
|
|
}
|
|
|
|
|
|
|
|
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
|
|
|
|
const InternalKey* b) {
|
|
|
|
if (b == nullptr) {
|
|
|
|
return -1;
|
|
|
|
}
|
|
|
|
return sstableKeyCompare(user_cmp, a, *b);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Adds up fd.GetFileSize() over `files`, in order.
// Summation stops at the first null entry, so files stored after a null
// pointer are not counted. An empty vector yields 0.
uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  uint64_t total_bytes = 0;
  for (const FileMetaData* file : files) {
    if (file == nullptr) {
      // A null entry terminates the scan.
      break;
    }
    total_bytes += file->fd.GetFileSize();
  }
  return total_bytes;
}
|
|
|
|
|
|
|
|
void Compaction::SetInputVersion(Version* _input_version) {
|
|
|
|
input_version_ = _input_version;
|
|
|
|
cfd_ = input_version_->cfd();
|
|
|
|
|
|
|
|
cfd_->Ref();
|
|
|
|
input_version_->Ref();
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
edit_.SetColumnFamily(cfd_->GetID());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Computes the smallest and largest user keys covered by `inputs`.
//
// Input levels that have no files, or whose level equals `exclude_level`, are
// ignored. Keys are ordered with the user comparator obtained from `vstorage`.
// The results are written through `smallest_user_key` / `largest_user_key`;
// when every input level is skipped, neither output is modified.
void Compaction::GetBoundaryKeys(
    VersionStorageInfo* vstorage,
    const std::vector<CompactionInputFiles>& inputs, Slice* smallest_user_key,
    Slice* largest_user_key, int exclude_level) {
  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
  bool have_range = false;

  // Grows [*smallest_user_key, *largest_user_key] so that it also covers
  // [lo, hi]. The very first call seeds both ends unconditionally.
  auto extend_range = [&](const Slice& lo, const Slice& hi) {
    if (!have_range || ucmp->Compare(lo, *smallest_user_key) < 0) {
      *smallest_user_key = lo;
    }
    if (!have_range || ucmp->Compare(hi, *largest_user_key) > 0) {
      *largest_user_key = hi;
    }
    have_range = true;
  };

  for (const CompactionInputFiles& level_inputs : inputs) {
    if (level_inputs.files.empty() || level_inputs.level == exclude_level) {
      continue;
    }
    if (level_inputs.level == 0) {
      // Every level-0 file has to be examined individually.
      for (const auto* f : level_inputs.files) {
        extend_range(f->smallest.user_key(), f->largest.user_key());
      }
    } else {
      // On other levels only the first file's smallest key and the last
      // file's largest key need to be looked at.
      extend_range(level_inputs.files.front()->smallest.user_key(),
                   level_inputs.files.back()->largest.user_key());
    }
  }
}
|
|
|
|
|
|
|
|
// Fills in `atomic_compaction_unit_boundaries` for each input level and
// returns the (by-value) `inputs` with those boundaries attached.
//
// Level-0 inputs and inputs without files are left untouched. For every other
// level, consecutive files are grouped into one unit for as long as
// sstableKeyCompare() reports the running unit's largest key equal to the next
// file's smallest key. Each file then receives one boundary entry -- the
// smallest/largest keys of the unit it belongs to -- so the boundaries vector
// ends up with exactly one element per file. The stored boundaries point at
// the `smallest` / `largest` members of the files themselves.
std::vector<CompactionInputFiles> Compaction::PopulateWithAtomicBoundaries(
    VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs) {
  const Comparator* ucmp = vstorage->InternalComparator()->user_comparator();
  for (CompactionInputFiles& level_inputs : inputs) {
    if (level_inputs.level == 0 || level_inputs.files.empty()) {
      continue;
    }
    const auto& files = level_inputs.files;
    auto& boundaries = level_inputs.atomic_compaction_unit_boundaries;
    boundaries.reserve(files.size());

    AtomicCompactionUnitBoundary unit;
    size_t unit_begin = 0;
    // Emits `unit` once per file in [unit_begin, unit_end) and makes
    // `unit_end` the start of the next unit. A call with an empty range only
    // updates `unit_begin`.
    auto flush_unit = [&](size_t unit_end) {
      for (size_t k = unit_begin; k < unit_end; ++k) {
        boundaries.push_back(unit);
      }
      unit_begin = unit_end;
    };

    for (size_t j = 0; j < files.size(); ++j) {
      const auto* f = files[j];
      const bool continues_unit =
          j > 0 && sstableKeyCompare(ucmp, *unit.largest, f->smallest) == 0;
      if (continues_unit) {
        // SSTs overlap but the end key of the previous file was not
        // artificially extended by a range tombstone: the current unit simply
        // grows to include this file.
        unit.largest = &f->largest;
        continue;
      }
      // Either the first file of the level (nothing to emit yet) or the
      // previous atomic compaction unit has ended: close it and start a new
      // unit at this file.
      flush_unit(j);
      unit.smallest = &f->smallest;
      unit.largest = &f->largest;
    }
    flush_unit(files.size());
    assert(files.size() == boundaries.size());
  }
  return inputs;
}
|
|
|
|
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
// helper function to determine if compaction is creating files at the
|
|
|
|
// bottommost level
|
|
|
|
bool Compaction::IsBottommostLevel(
|
|
|
|
int output_level, VersionStorageInfo* vstorage,
|
|
|
|
const std::vector<CompactionInputFiles>& inputs) {
|
|
|
|
int output_l0_idx;
|
|
|
|
if (output_level == 0) {
|
|
|
|
output_l0_idx = 0;
|
|
|
|
for (const auto* file : vstorage->LevelFiles(0)) {
|
|
|
|
if (inputs[0].files.back() == file) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
++output_l0_idx;
|
|
|
|
}
|
|
|
|
assert(static_cast<size_t>(output_l0_idx) < vstorage->LevelFiles(0).size());
|
|
|
|
} else {
|
|
|
|
output_l0_idx = -1;
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
}
|
|
|
|
Slice smallest_key, largest_key;
|
|
|
|
GetBoundaryKeys(vstorage, inputs, &smallest_key, &largest_key);
|
|
|
|
return !vstorage->RangeMightExistAfterSortedRun(smallest_key, largest_key,
|
|
|
|
output_level, output_l0_idx);
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
}
|
|
|
|
|
|
|
|
// test function to validate the functionality of IsBottommostLevel()
|
|
|
|
// function -- determines if compaction with inputs and storage is bottommost
|
|
|
|
bool Compaction::TEST_IsBottommostLevel(
|
|
|
|
int output_level, VersionStorageInfo* vstorage,
|
|
|
|
const std::vector<CompactionInputFiles>& inputs) {
|
|
|
|
return IsBottommostLevel(output_level, vstorage, inputs);
|
|
|
|
}
|
|
|
|
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
bool Compaction::IsFullCompaction(
|
|
|
|
VersionStorageInfo* vstorage,
|
|
|
|
const std::vector<CompactionInputFiles>& inputs) {
|
|
|
|
size_t num_files_in_compaction = 0;
|
|
|
|
size_t total_num_files = 0;
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
for (int l = 0; l < vstorage->num_levels(); l++) {
|
|
|
|
total_num_files += vstorage->NumLevelFiles(l);
|
|
|
|
}
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
for (size_t i = 0; i < inputs.size(); i++) {
|
|
|
|
num_files_in_compaction += inputs[i].size();
|
|
|
|
}
|
|
|
|
return num_files_in_compaction == total_num_files;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the table properties of all input files, keyed by file name.
//
// The collection is built on the first call and cached; later calls return
// the cached collection without touching the files again. For each input file
// the properties are requested from the input version using compaction
// ReadOptions. A file whose properties cannot be loaded is logged as an error
// and left out of the collection -- the failure is not propagated to the
// caller.
const TablePropertiesCollection& Compaction::GetTableProperties() {
  if (input_table_properties_initialized_) {
    return table_properties_;
  }

  const ReadOptions read_options(Env::IOActivity::kCompaction);
  for (size_t level_idx = 0; level_idx < num_input_levels(); ++level_idx) {
    for (const FileMetaData* fmd : *(this->inputs(level_idx))) {
      std::string file_name =
          TableFileName(immutable_options_.cf_paths, fmd->fd.GetNumber(),
                        fmd->fd.GetPathId());
      std::shared_ptr<const TableProperties> tp;
      Status s = input_version_->GetTableProperties(read_options, &tp, fmd,
                                                    &file_name);
      if (!s.ok()) {
        ROCKS_LOG_ERROR(immutable_options_.info_log,
                        "Unable to load table properties for file %" PRIu64
                        " --- %s\n",
                        fmd->fd.GetNumber(), s.ToString().c_str());
        continue;
      }
      table_properties_[file_name] = tp;
    }
  }

  // Mark the cache as populated even if some files failed to load, so the
  // work is not repeated on the next call.
  input_table_properties_initialized_ = true;
  return table_properties_;
}
|
|
|
|
|
|
|
|
Compaction::Compaction(
|
|
|
|
VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
|
|
|
|
const MutableCFOptions& _mutable_cf_options,
|
|
|
|
const MutableDBOptions& _mutable_db_options,
|
|
|
|
std::vector<CompactionInputFiles> _inputs, int _output_level,
|
|
|
|
uint64_t _target_file_size, uint64_t _max_compaction_bytes,
|
|
|
|
uint32_t _output_path_id, CompressionType _compression,
|
|
|
|
CompressionOptions _compression_opts, Temperature _output_temperature,
|
|
|
|
uint32_t _max_subcompactions, std::vector<FileMetaData*> _grandparents,
|
|
|
|
bool _manual_compaction, const std::string& _trim_ts, double _score,
|
|
|
|
bool _deletion_compaction, bool l0_files_might_overlap,
|
|
|
|
CompactionReason _compaction_reason,
|
|
|
|
BlobGarbageCollectionPolicy _blob_garbage_collection_policy,
|
|
|
|
double _blob_garbage_collection_age_cutoff)
|
|
|
|
: input_vstorage_(vstorage),
|
|
|
|
start_level_(_inputs[0].level),
|
|
|
|
output_level_(_output_level),
|
Align compaction output file boundaries to the next level ones (#10655)
Summary:
Try to align the compaction output file boundaries to the next level ones
(grandparent level), to reduce the level compaction write-amplification.
In level compaction, there are "wasted" data at the beginning and end of the
output level files. Align the file boundary can avoid such "wasted" compaction.
With this PR, it tries to align the non-bottommost level file boundaries to its
next level ones. It may cut file when the file size is large enough (at least
50% of target_file_size) and not too large (2x target_file_size).
db_bench shows about 12.56% compaction reduction:
```
TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
# baseline:
Flush(GB): cumulative 25.882, interval 7.216
Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
# with this change:
Flush(GB): cumulative 25.882, interval 7.753
Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
```
The compaction simulator shows a similar result (14% with 100G random data).
As a side effect, with this PR, the SST file size can exceed the
target_file_size, but is capped at 2x target_file_size. And there will be
smaller files. Here are file size statistics when loading 100GB with the target
file size 32MB:
```
baseline this_PR
count 1.656000e+03 1.705000e+03
mean 3.116062e+07 3.028076e+07
std 7.145242e+06 8.046139e+06
```
The feature is enabled by default, to revert to the old behavior disable it
with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false`
Also includes https://github.com/facebook/rocksdb/issues/1963 to cut the file before a skippable grandparent file. This is for
use cases like a user adding 2 or more non-overlapping data ranges at the same
time; it can reduce the overlap of the 2 datasets in the lower levels.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655
Reviewed By: cbi42
Differential Revision: D39552321
Pulled By: jay-zhuang
fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2 years ago
|
|
|
target_output_file_size_(_target_file_size),
|
|
|
|
max_compaction_bytes_(_max_compaction_bytes),
|
|
|
|
max_subcompactions_(_max_subcompactions),
|
|
|
|
immutable_options_(_immutable_options),
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are a couple of things demonstrating that the Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
mutable_cf_options_(_mutable_cf_options),
|
|
|
|
input_version_(nullptr),
|
|
|
|
number_levels_(vstorage->num_levels()),
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This API allows users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happen.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
cfd_(nullptr),
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are a couple of things demonstrating that the Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
output_path_id_(_output_path_id),
|
|
|
|
output_compression_(_compression),
|
|
|
|
output_compression_opts_(_compression_opts),
|
|
|
|
output_temperature_(_output_temperature),
|
|
|
|
deletion_compaction_(_deletion_compaction),
|
|
|
|
l0_files_might_overlap_(l0_files_might_overlap),
|
|
|
|
inputs_(PopulateWithAtomicBoundaries(vstorage, std::move(_inputs))),
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are a couple of things demonstrating that the Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
grandparents_(std::move(_grandparents)),
|
|
|
|
score_(_score),
|
Add missing range conflict check between file ingestion and RefitLevel() (#10988)
Summary:
**Context:**
File ingestion never checks whether the key range it acts on overlaps with an ongoing RefitLevel() (used in `CompactRange()` with `change_level=true`). That's because RefitLevel() doesn't register and make its key range known to file ingestion. Though it checks overlapping with other compactions by https://github.com/facebook/rocksdb/blob/7.8.fb/db/external_sst_file_ingestion_job.cc#L998.
RefitLevel() (used in `CompactRange()` with `change_level=true`) doesn't check whether the key range it acts on overlaps with an ongoing file ingestion. That's because file ingestion does not register and make its key range known to other compactions.
- Note that non-refitlevel-compaction (e.g, manual compaction w/o RefitLevel() or general compaction) also does not check key range overlap with ongoing file ingestion for the same reason.
- But it's fine. Credited to cbi42's discovery, `WaitForIngestFile` was called by background and foreground compactions. They were introduced in https://github.com/facebook/rocksdb/commit/0f88160f67d36ea30e3aca3a3cef924c3a009be6, https://github.com/facebook/rocksdb/commit/5c64fb67d2fc198f1a73ff3ae543749a6a41f513 and https://github.com/facebook/rocksdb/commit/87dfc1d23e0e16ff73e15f63c6fa0fb3b3fc8c8c.
- Regardless, registering file ingestion like a compaction, as this PR does, is a general approach that will also add a range conflict check between file ingestion and non-refitlevel-compaction, though that was not the issue that motivated this PR.
Above are bugs resulting in two bad consequences:
- If file ingestion and RefitLevel() creates files in the same level, then range-overlapped files will be created at that level and caught as corruption by `force_consistency_checks=true`
- If file ingestion and RefitLevel() create files in different levels, then with one further compaction on the ingested file, it can result in two identical keys both with seqno 0 in two different levels. Then the iterator's [optimization](https://github.com/facebook/rocksdb/blame/c62f3221698fd273b673d4f7e54eabb8329a4369/db/db_iter.cc#L342-L343), which assumes no two identical keys both have seqno 0, will either break this assertion in a debug build or, even worse, return the value of this key for the key after it, which is the wrong value to return, in a release build.
Therefore we decide to introduce range conflict check for file ingestion and RefitLevel() inspired from the existing range conflict check among compactions.
**Summary:**
- Treat file ingestion job and RefitLevel() as `Compaction` of new compaction reasons: `CompactionReason::kExternalSstIngestion` and `CompactionReason::kRefitLevel` and register/unregister them. File ingestion is treated as compaction from L0 to different levels and RefitLevel() as compaction from source level to target level.
- Check for `RangeOverlapWithCompaction` with other ongoing compactions, `RegisterCompaction()` on this "compaction" before changing the LSM state in `VersionStorageInfo`, and `UnregisterCompaction()` after changing.
- Replace the scattered fixes (https://github.com/facebook/rocksdb/commit/0f88160f67d36ea30e3aca3a3cef924c3a009be6, https://github.com/facebook/rocksdb/commit/5c64fb67d2fc198f1a73ff3ae543749a6a41f513 and https://github.com/facebook/rocksdb/commit/87dfc1d23e0e16ff73e15f63c6fa0fb3b3fc8c8c) that prevent overlap between file ingestion and non-refit-level compaction with this fix, because those practices are easy to overlook.
- Misc: logic cleanup, see PR comments
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10988
Test Plan:
- New unit test `DBCompactionTestWithOngoingFileIngestionParam*` that failed pre-fix and passed afterwards.
- Made compatible with existing tests, see PR comments
- make check
- [Ongoing] Stress test rehearsal with normal value and aggressive CI value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: cbi42
Differential Revision: D41535685
Pulled By: hx235
fbshipit-source-id: 549833a577ba1496d20a870583d4caa737da1258
2 years ago
|
|
|
bottommost_level_(
|
|
|
|
// For simplicity, we don't support the concept of "bottommost level"
|
|
|
|
// with
|
|
|
|
// `CompactionReason::kExternalSstIngestion` and
|
|
|
|
// `CompactionReason::kRefitLevel`
|
|
|
|
(_compaction_reason == CompactionReason::kExternalSstIngestion ||
|
|
|
|
_compaction_reason == CompactionReason::kRefitLevel)
|
|
|
|
? false
|
|
|
|
: IsBottommostLevel(output_level_, vstorage, inputs_)),
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are a couple of things demonstrating that the Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
is_full_compaction_(IsFullCompaction(vstorage, inputs_)),
|
|
|
|
is_manual_compaction_(_manual_compaction),
|
|
|
|
trim_ts_(_trim_ts),
|
|
|
|
is_trivial_move_(false),
|
|
|
|
compaction_reason_(_compaction_reason),
|
|
|
|
notify_on_compaction_completion_(false),
|
|
|
|
enable_blob_garbage_collection_(
|
|
|
|
_blob_garbage_collection_policy == BlobGarbageCollectionPolicy::kForce
|
|
|
|
? true
|
|
|
|
: (_blob_garbage_collection_policy ==
|
|
|
|
BlobGarbageCollectionPolicy::kDisable
|
|
|
|
? false
|
|
|
|
: mutable_cf_options()->enable_blob_garbage_collection)),
|
|
|
|
blob_garbage_collection_age_cutoff_(
|
|
|
|
_blob_garbage_collection_age_cutoff < 0 ||
|
|
|
|
_blob_garbage_collection_age_cutoff > 1
|
|
|
|
? mutable_cf_options()->blob_garbage_collection_age_cutoff
|
|
|
|
: _blob_garbage_collection_age_cutoff),
|
Add missing range conflict check between file ingestion and RefitLevel() (#10988)
Summary:
**Context:**
File ingestion never checks whether the key range it acts on overlaps with an ongoing RefitLevel() (used in `CompactRange()` with `change_level=true`). That's because RefitLevel() doesn't register and make its key range known to file ingestion. Though it checks overlapping with other compactions by https://github.com/facebook/rocksdb/blob/7.8.fb/db/external_sst_file_ingestion_job.cc#L998.
RefitLevel() (used in `CompactRange()` with `change_level=true`) doesn't check whether the key range it acts on overlaps with an ongoing file ingestion. That's because file ingestion does not register and make its key range known to other compactions.
- Note that non-refitlevel-compaction (e.g, manual compaction w/o RefitLevel() or general compaction) also does not check key range overlap with ongoing file ingestion for the same reason.
- But it's fine. Credited to cbi42's discovery, `WaitForIngestFile` was called by background and foreground compactions. They were introduced in https://github.com/facebook/rocksdb/commit/0f88160f67d36ea30e3aca3a3cef924c3a009be6, https://github.com/facebook/rocksdb/commit/5c64fb67d2fc198f1a73ff3ae543749a6a41f513 and https://github.com/facebook/rocksdb/commit/87dfc1d23e0e16ff73e15f63c6fa0fb3b3fc8c8c.
- Regardless, registering file ingestion like a compaction, as this PR does, is a general approach that will also add a range conflict check between file ingestion and non-refitlevel-compaction, though that was not the issue that motivated this PR.
Above are bugs resulting in two bad consequences:
- If file ingestion and RefitLevel() creates files in the same level, then range-overlapped files will be created at that level and caught as corruption by `force_consistency_checks=true`
- If file ingestion and RefitLevel() create files in different levels, then with one further compaction on the ingested file, it can result in two identical keys both with seqno 0 in two different levels. Then the iterator's [optimization](https://github.com/facebook/rocksdb/blame/c62f3221698fd273b673d4f7e54eabb8329a4369/db/db_iter.cc#L342-L343), which assumes no two identical keys both have seqno 0, will either break this assertion in a debug build or, even worse, return the value of this key for the key after it, which is the wrong value to return, in a release build.
Therefore we decide to introduce range conflict check for file ingestion and RefitLevel() inspired from the existing range conflict check among compactions.
**Summary:**
- Treat file ingestion job and RefitLevel() as `Compaction` of new compaction reasons: `CompactionReason::kExternalSstIngestion` and `CompactionReason::kRefitLevel` and register/unregister them. File ingestion is treated as compaction from L0 to different levels and RefitLevel() as compaction from source level to target level.
- Check for `RangeOverlapWithCompaction` with other ongoing compactions, `RegisterCompaction()` on this "compaction" before changing the LSM state in `VersionStorageInfo`, and `UnregisterCompaction()` after changing.
- Replace the scattered fixes (https://github.com/facebook/rocksdb/commit/0f88160f67d36ea30e3aca3a3cef924c3a009be6, https://github.com/facebook/rocksdb/commit/5c64fb67d2fc198f1a73ff3ae543749a6a41f513 and https://github.com/facebook/rocksdb/commit/87dfc1d23e0e16ff73e15f63c6fa0fb3b3fc8c8c) that prevent overlap between file ingestion and non-refit-level compaction with this fix, because those practices are easy to overlook.
- Misc: logic cleanup, see PR comments
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10988
Test Plan:
- New unit test `DBCompactionTestWithOngoingFileIngestionParam*` that failed pre-fix and passed afterwards.
- Made compatible with existing tests, see PR comments
- make check
- [Ongoing] Stress test rehearsal with normal value and aggressive CI value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: cbi42
Differential Revision: D41535685
Pulled By: hx235
fbshipit-source-id: 549833a577ba1496d20a870583d4caa737da1258
2 years ago
|
|
|
penultimate_level_(
|
|
|
|
// For simplicity, we don't support the concept of "penultimate level"
|
|
|
|
// with `CompactionReason::kExternalSstIngestion` and
|
|
|
|
// `CompactionReason::kRefitLevel`
|
|
|
|
_compaction_reason == CompactionReason::kExternalSstIngestion ||
|
|
|
|
_compaction_reason == CompactionReason::kRefitLevel
|
|
|
|
? Compaction::kInvalidLevel
|
|
|
|
: EvaluatePenultimateLevel(vstorage, immutable_options_,
|
|
|
|
start_level_, output_level_)) {
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are a couple of things demonstrating that the Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
MarkFilesBeingCompacted(true);
|
|
|
|
if (is_manual_compaction_) {
|
|
|
|
compaction_reason_ = CompactionReason::kManualCompaction;
|
|
|
|
}
|
|
|
|
if (max_subcompactions_ == 0) {
|
|
|
|
max_subcompactions_ = _mutable_db_options.max_subcompactions;
|
|
|
|
}
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are a couple of things demonstrating that the Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
|
Align compaction output file boundaries to the next level ones (#10655)
Summary:
Try to align the compaction output file boundaries to the next level ones
(grandparent level), to reduce the level compaction write-amplification.
In level compaction, there is "wasted" data at the beginning and end of the
output level files. Aligning the file boundaries can avoid such "wasted" compaction.
With this PR, it tries to align the non-bottommost level file boundaries to its
next level ones. It may cut file when the file size is large enough (at least
50% of target_file_size) and not too large (2x target_file_size).
db_bench shows about 12.56% compaction reduction:
```
TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
# baseline:
Flush(GB): cumulative 25.882, interval 7.216
Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
# with this change:
Flush(GB): cumulative 25.882, interval 7.753
Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
```
The compaction simulator shows a similar result (14% with 100G random data).
As a side effect, with this PR, the SST file size can exceed the
target_file_size, but is capped at 2x target_file_size. And there will be
smaller files. Here are file size statistics when loading 100GB with the target
file size 32MB:
```
baseline this_PR
count 1.656000e+03 1.705000e+03
mean 3.116062e+07 3.028076e+07
std 7.145242e+06 8.046139e+06
```
The feature is enabled by default, to revert to the old behavior disable it
with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false`
Also includes https://github.com/facebook/rocksdb/issues/1963 to cut the file before a skippable grandparent file. This is for
use cases like a user adding 2 or more non-overlapping data ranges at the same
time; it can reduce the overlap of the 2 datasets in the lower levels.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655
Reviewed By: cbi42
Differential Revision: D39552321
Pulled By: jay-zhuang
fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2 years ago
|
|
|
// for the non-bottommost levels, it tries to build files that match the target
|
|
|
|
// file size, but not guaranteed. It could be 2x the size of the target size.
|
|
|
|
max_output_file_size_ =
|
|
|
|
bottommost_level_ || grandparents_.empty() ||
|
|
|
|
!_immutable_options.level_compaction_dynamic_file_size
|
|
|
|
? target_output_file_size_
|
|
|
|
: 2 * target_output_file_size_;
|
|
|
|
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are a couple of things demonstrating that the Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
#ifndef NDEBUG
|
|
|
|
for (size_t i = 1; i < inputs_.size(); ++i) {
|
|
|
|
assert(inputs_[i].level > inputs_[i - 1].level);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
// setup input_levels_
|
|
|
|
{
|
|
|
|
input_levels_.resize(num_input_levels());
|
|
|
|
for (size_t which = 0; which < num_input_levels(); which++) {
|
|
|
|
DoGenerateLevelFilesBrief(&input_levels_[which], inputs_[which].files,
|
|
|
|
&arena_);
|
|
|
|
}
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* These APIs allow users to obtain the current state of one column family of a RocksDB instance.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happen.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
}
|
|
|
|
|
|
|
|
GetBoundaryKeys(vstorage, inputs_, &smallest_user_key_, &largest_user_key_);
|
|
|
|
|
|
|
|
// Every compaction regardless of any compaction reason may respect the
|
|
|
|
// existing compact cursor in the output level to split output files
|
|
|
|
output_split_key_ = nullptr;
|
|
|
|
if (immutable_options_.compaction_style == kCompactionStyleLevel &&
|
|
|
|
immutable_options_.compaction_pri == kRoundRobin) {
|
|
|
|
const InternalKey* cursor =
|
|
|
|
&input_vstorage_->GetCompactCursors()[output_level_];
|
|
|
|
if (cursor->size() != 0) {
|
|
|
|
const Slice& cursor_user_key = ExtractUserKey(cursor->Encode());
|
|
|
|
auto ucmp = vstorage->InternalComparator()->user_comparator();
|
|
|
|
// May split output files according to the cursor if it in the user-key
|
|
|
|
// range
|
|
|
|
if (ucmp->CompareWithoutTimestamp(cursor_user_key, smallest_user_key_) >
|
|
|
|
0 &&
|
|
|
|
ucmp->CompareWithoutTimestamp(cursor_user_key, largest_user_key_) <=
|
|
|
|
0) {
|
|
|
|
output_split_key_ = cursor;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
PopulatePenultimateLevelOutputRange();
|
|
|
|
}
|
|
|
|
|
|
|
|
void Compaction::PopulatePenultimateLevelOutputRange() {
|
|
|
|
if (!SupportsPerKeyPlacement()) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// exclude the last level, the range of all input levels is the safe range
|
|
|
|
// of keys that can be moved up.
|
|
|
|
int exclude_level = number_levels_ - 1;
|
|
|
|
penultimate_output_range_type_ = PenultimateOutputRangeType::kNonLastRange;
|
|
|
|
|
|
|
|
// For universal compaction, the penultimate_output_range could be extended if
|
|
|
|
// all penultimate level files are included in the compaction (which includes
|
|
|
|
// the case that the penultimate level is empty).
|
|
|
|
if (immutable_options_.compaction_style == kCompactionStyleUniversal) {
|
|
|
|
exclude_level = kInvalidLevel;
|
|
|
|
penultimate_output_range_type_ = PenultimateOutputRangeType::kFullRange;
|
|
|
|
std::set<uint64_t> penultimate_inputs;
|
|
|
|
for (const auto& input_lvl : inputs_) {
|
|
|
|
if (input_lvl.level == penultimate_level_) {
|
|
|
|
for (const auto& file : input_lvl.files) {
|
|
|
|
penultimate_inputs.emplace(file->fd.GetNumber());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
auto penultimate_files = input_vstorage_->LevelFiles(penultimate_level_);
|
|
|
|
for (const auto& file : penultimate_files) {
|
|
|
|
if (penultimate_inputs.find(file->fd.GetNumber()) ==
|
|
|
|
penultimate_inputs.end()) {
|
|
|
|
exclude_level = number_levels_ - 1;
|
|
|
|
penultimate_output_range_type_ =
|
|
|
|
PenultimateOutputRangeType::kNonLastRange;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
GetBoundaryKeys(input_vstorage_, inputs_,
|
|
|
|
&penultimate_level_smallest_user_key_,
|
|
|
|
&penultimate_level_largest_user_key_, exclude_level);
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
}
|
|
|
|
|
|
|
|
// Drops the references this compaction holds: first the one on the input
// version (if any), then the one on the column family data (if any).
Compaction::~Compaction() {
  if (input_version_) {
    input_version_->Unref();
  }
  if (cfd_) {
    cfd_->UnrefAndTryDelete();
  }
}
|
|
|
|
|
|
|
|
bool Compaction::SupportsPerKeyPlacement() const {
|
|
|
|
return penultimate_level_ != kInvalidLevel;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the stored penultimate level; this is kInvalidLevel when per-key
// placement is not supported (see SupportsPerKeyPlacement()).
int Compaction::GetPenultimateLevel() const {
  return penultimate_level_;
}
|
|
|
|
|
|
|
|
// smallest_key and largest_key include timestamps if user-defined timestamp is
|
|
|
|
// enabled.
|
|
|
|
bool Compaction::OverlapPenultimateLevelOutputRange(
|
|
|
|
const Slice& smallest_key, const Slice& largest_key) const {
|
|
|
|
if (!SupportsPerKeyPlacement()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
const Comparator* ucmp =
|
|
|
|
input_vstorage_->InternalComparator()->user_comparator();
|
|
|
|
|
|
|
|
return ucmp->CompareWithoutTimestamp(
|
|
|
|
smallest_key, penultimate_level_largest_user_key_) <= 0 &&
|
|
|
|
ucmp->CompareWithoutTimestamp(
|
|
|
|
largest_key, penultimate_level_smallest_user_key_) >= 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// key includes timestamp if user-defined timestamp is enabled.
|
|
|
|
bool Compaction::WithinPenultimateLevelOutputRange(const Slice& key) const {
|
|
|
|
if (!SupportsPerKeyPlacement()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (penultimate_level_smallest_user_key_.empty() ||
|
|
|
|
penultimate_level_largest_user_key_.empty()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
const Comparator* ucmp =
|
|
|
|
input_vstorage_->InternalComparator()->user_comparator();
|
|
|
|
|
|
|
|
return ucmp->CompareWithoutTimestamp(
|
|
|
|
key, penultimate_level_smallest_user_key_) >= 0 &&
|
|
|
|
ucmp->CompareWithoutTimestamp(
|
|
|
|
key, penultimate_level_largest_user_key_) <= 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Compaction::InputCompressionMatchesOutput() const {
|
|
|
|
int base_level = input_vstorage_->base_level();
|
|
|
|
bool matches =
|
|
|
|
(GetCompressionType(input_vstorage_, mutable_cf_options_, start_level_,
|
|
|
|
base_level) == output_compression_);
|
|
|
|
if (matches) {
|
|
|
|
TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:Matches");
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
TEST_SYNC_POINT("Compaction::InputCompressionMatchesOutput:DidntMatch");
|
|
|
|
return matches;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Compaction::IsTrivialMove() const {
|
|
|
|
// Avoid a move if there is lots of overlapping grandparent data.
|
|
|
|
// Otherwise, the move could create a parent file that will require
|
|
|
|
// a very expensive merge later on.
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
// If start_level_== output_level_, the purpose is to force compaction
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
10 years ago
|
|
|
// filter to be applied to that level, and thus cannot be a trivial move.
|
|
|
|
|
|
|
|
// Check if start level have files with overlapping ranges
|
|
|
|
if (start_level_ == 0 && input_vstorage_->level0_non_overlapping() == false &&
|
|
|
|
l0_files_might_overlap_) {
|
|
|
|
// We cannot move files from L0 to L1 if the L0 files in the LSM-tree are
|
|
|
|
// overlapping, unless we are sure that files picked in L0 don't overlap.
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
10 years ago
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_manual_compaction_ &&
|
|
|
|
(immutable_options_.compaction_filter != nullptr ||
|
|
|
|
immutable_options_.compaction_filter_factory != nullptr)) {
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
10 years ago
|
|
|
// This is a manual compaction and we have a compaction filter that should
|
|
|
|
// be executed, we cannot do a trivial move
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (start_level_ == output_level_) {
|
|
|
|
// It doesn't make sense if compaction picker picks files just to trivial
|
|
|
|
// move to the same level.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (compaction_reason_ == CompactionReason::kChangeTemperature) {
|
|
|
|
// Changing temperature usually requires rewriting the file.
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Used in universal compaction, where trivial move can be done if the
|
|
|
|
// input files are non overlapping
|
|
|
|
if ((mutable_cf_options_.compaction_options_universal.allow_trivial_move) &&
|
|
|
|
(output_level_ != 0) &&
|
|
|
|
(cfd_->ioptions()->compaction_style == kCompactionStyleUniversal)) {
|
|
|
|
return is_trivial_move_;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!(start_level_ != output_level_ && num_input_levels() == 1 &&
|
|
|
|
input(0, 0)->fd.GetPathId() == output_path_id() &&
|
|
|
|
InputCompressionMatchesOutput())) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// assert inputs_.size() == 1
|
|
|
|
|
|
|
|
if (output_level_ + 1 < number_levels_) {
|
|
|
|
std::unique_ptr<SstPartitioner> partitioner = CreateSstPartitioner();
|
|
|
|
for (const auto& file : inputs_.front().files) {
|
|
|
|
std::vector<FileMetaData*> file_grand_parents;
|
|
|
|
input_vstorage_->GetOverlappingInputs(output_level_ + 1, &file->smallest,
|
|
|
|
&file->largest,
|
|
|
|
&file_grand_parents);
|
|
|
|
const auto compaction_size =
|
|
|
|
file->fd.GetFileSize() + TotalFileSize(file_grand_parents);
|
|
|
|
if (compaction_size > max_compaction_bytes_) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (partitioner.get() != nullptr) {
|
|
|
|
if (!partitioner->CanDoTrivialMove(file->smallest.user_key(),
|
|
|
|
file->largest.user_key())) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// PerKeyPlacement compaction should never be trivial move.
|
|
|
|
if (SupportsPerKeyPlacement()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Records in `out_edit` a DeleteFile entry for every input file of this
// compaction, using the level of the input level the file belongs to.
void Compaction::AddInputDeletions(VersionEdit* out_edit) {
  const size_t level_count = num_input_levels();
  for (size_t lvl_idx = 0; lvl_idx < level_count; ++lvl_idx) {
    const size_t file_count = inputs_[lvl_idx].size();
    for (size_t file_idx = 0; file_idx < file_count; ++file_idx) {
      out_edit->DeleteFile(level(lvl_idx),
                           inputs_[lvl_idx][file_idx]->fd.GetNumber());
    }
  }
}
|
|
|
|
|
|
|
|
bool Compaction::KeyNotExistsBeyondOutputLevel(
|
|
|
|
const Slice& user_key, std::vector<size_t>* level_ptrs) const {
|
|
|
|
assert(input_version_ != nullptr);
|
|
|
|
assert(level_ptrs != nullptr);
|
|
|
|
assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
|
|
|
|
if (bottommost_level_) {
|
|
|
|
return true;
|
|
|
|
} else if (output_level_ != 0 &&
|
|
|
|
cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
|
|
|
|
// Maybe use binary search to find right entry instead of linear search?
|
|
|
|
const Comparator* user_cmp = cfd_->user_comparator();
|
|
|
|
for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
|
|
|
|
const std::vector<FileMetaData*>& files =
|
|
|
|
input_vstorage_->LevelFiles(lvl);
|
|
|
|
for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
|
|
|
|
auto* f = files[level_ptrs->at(lvl)];
|
|
|
|
if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
|
|
|
|
// We've advanced far enough
|
Allow compaction iterator to perform garbage collection (#7556)
Summary:
Add a threshold timestamp, full_history_ts_low_ of type `std::string*` to
`CompactionIterator`, so that RocksDB can also perform garbage collection during
compaction.
* If full_history_ts_low_ is nullptr, then compaction iterator does not perform
GC, preserving all timestamp history for all keys. Compaction iterator will
treat user key with different timestamps as different user keys.
* If full_history_ts_low_ is not nullptr, then compaction iterator performs
GC. GC will look at keys older than `*full_history_ts_low_` and determine their
eligibility based on factors including snapshots.
Current rules of GC:
* If an internal key is in the same snapshot as a previous counterpart
with the same user key, and this key is eligible for GC, and the key is
not single-delete or merge operand, then this key can be dropped. Note
that the previous internal key cannot be a merge operand either.
* If a tombstone is the most recent one in the earliest snapshot and it
is eligible for GC, and keyNotExistsBeyondLevel() is true, then this
tombstone can be dropped.
* If a tombstone is the most recent one in a snapshot and it is eligible
for GC, and the compaction is at bottommost level, then all other older
internal keys of the same user key must also be eligible for GC, thus
can be dropped
* Single-delete, delete-range and merge are not currently supported.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7556
Test Plan: make check
Reviewed By: ltamasi
Differential Revision: D24507728
Pulled By: riversand963
fbshipit-source-id: 3c09c7301f41eed76dfcf4d1527e68cf6e0a8bb3
4 years ago
|
|
|
// In the presence of user-defined timestamp, we may need to handle
|
|
|
|
// the case in which f->smallest.user_key() (including ts) has the
|
|
|
|
// same user key, but the ts part is smaller. If so,
|
|
|
|
// Compare(user_key, f->smallest.user_key()) returns -1.
|
|
|
|
// That's why we need CompareWithoutTimestamp().
|
|
|
|
if (user_cmp->CompareWithoutTimestamp(user_key,
|
|
|
|
f->smallest.user_key()) >= 0) {
|
|
|
|
// Key falls in this file's range, so it may
|
|
|
|
// exist beyond output level
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Compaction::KeyRangeNotExistsBeyondOutputLevel(
|
|
|
|
const Slice& begin_key, const Slice& end_key,
|
|
|
|
std::vector<size_t>* level_ptrs) const {
|
|
|
|
assert(input_version_ != nullptr);
|
|
|
|
assert(level_ptrs != nullptr);
|
|
|
|
assert(level_ptrs->size() == static_cast<size_t>(number_levels_));
|
|
|
|
assert(cfd_->user_comparator()->CompareWithoutTimestamp(begin_key, end_key) <
|
|
|
|
0);
|
|
|
|
if (bottommost_level_) {
|
|
|
|
return true /* does not overlap */;
|
|
|
|
} else if (output_level_ != 0 &&
|
|
|
|
cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
|
|
|
|
const Comparator* user_cmp = cfd_->user_comparator();
|
|
|
|
for (int lvl = output_level_ + 1; lvl < number_levels_; lvl++) {
|
|
|
|
const std::vector<FileMetaData*>& files =
|
|
|
|
input_vstorage_->LevelFiles(lvl);
|
|
|
|
for (; level_ptrs->at(lvl) < files.size(); level_ptrs->at(lvl)++) {
|
|
|
|
auto* f = files[level_ptrs->at(lvl)];
|
|
|
|
// Advance until the first file with begin_key <= f->largest.user_key()
|
|
|
|
if (user_cmp->CompareWithoutTimestamp(begin_key,
|
|
|
|
f->largest.user_key()) > 0) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// We know that the previous file prev_f, if exists, has
|
|
|
|
// prev_f->largest.user_key() < begin_key.
|
|
|
|
if (user_cmp->CompareWithoutTimestamp(end_key,
|
|
|
|
f->smallest.user_key()) <= 0) {
|
|
|
|
// not overlapping with this level
|
|
|
|
break;
|
|
|
|
} else {
|
|
|
|
// We have:
|
|
|
|
// - begin_key < end_key,
|
|
|
|
// - begin_key <= f->largest.user_key(), and
|
|
|
|
// - end_key > f->smallest.user_key()
|
|
|
|
return false /* overlap */;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true /* does not overlap */;
|
|
|
|
}
|
|
|
|
return false /* overlaps */;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Mark (or clear) each file that is being compacted
|
|
|
|
void Compaction::MarkFilesBeingCompacted(bool mark_as_compacted) {
|
|
|
|
for (size_t i = 0; i < num_input_levels(); i++) {
|
|
|
|
for (size_t j = 0; j < inputs_[i].size(); j++) {
|
|
|
|
assert(mark_as_compacted ? !inputs_[i][j]->being_compacted
|
|
|
|
: inputs_[i][j]->being_compacted);
|
|
|
|
inputs_[i][j]->being_compacted = mark_as_compacted;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce the option options.level_compaction_dynamic_level_bytes. When it is turned on, RocksDB is free to pick a level base in the range (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that the real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
10 years ago
|
|
|
// Sample output:
|
|
|
|
// If compacting 3 L0 files, 2 L3 files and 1 L4 file, and outputting to L5,
|
|
|
|
// print: "3@0 + 2@3 + 1@4 files to L5"
|
|
|
|
const char* Compaction::InputLevelSummary(
|
|
|
|
InputLevelSummaryBuffer* scratch) const {
|
|
|
|
int len = 0;
|
|
|
|
bool is_first = true;
|
|
|
|
for (auto& input_level : inputs_) {
|
|
|
|
if (input_level.empty()) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (!is_first) {
|
|
|
|
len +=
|
|
|
|
snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, " + ");
|
|
|
|
len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
10 years ago
|
|
|
} else {
|
|
|
|
is_first = false;
|
|
|
|
}
|
|
|
|
len += snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
|
|
|
|
"%" ROCKSDB_PRIszt "@%d", input_level.size(),
|
|
|
|
input_level.level);
|
|
|
|
len = std::min(len, static_cast<int>(sizeof(scratch->buffer)));
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
10 years ago
|
|
|
}
|
|
|
|
snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len,
|
|
|
|
" files to L%d", output_level());
|
|
|
|
|
|
|
|
return scratch->buffer;
|
|
|
|
}
|
|
|
|
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
10 years ago
|
|
|
uint64_t Compaction::CalculateTotalInputSize() const {
|
|
|
|
uint64_t size = 0;
|
|
|
|
for (auto& input_level : inputs_) {
|
|
|
|
for (auto f : input_level.files) {
|
|
|
|
size += f->fd.GetFileSize();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return size;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Compaction::ReleaseCompactionFiles(Status status) {
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
MarkFilesBeingCompacted(false);
|
|
|
|
cfd_->compaction_picker()->ReleaseCompactionFiles(this, status);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Asks the input version's storage info to reset its next-compaction index
// for start_level_. Requires that this compaction has an input version.
void Compaction::ResetNextCompactionIndex() {
  assert(input_version_);
  input_vstorage_->ResetNextCompactionIndex(start_level_);
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
int InputSummary(const std::vector<FileMetaData*>& files, char* output,
|
|
|
|
int len) {
|
|
|
|
*output = '\0';
|
|
|
|
int write = 0;
|
|
|
|
for (size_t i = 0; i < files.size(); i++) {
|
|
|
|
int sz = len - write;
|
|
|
|
int ret;
|
|
|
|
char sztxt[16];
|
|
|
|
AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16);
|
|
|
|
ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ",
|
|
|
|
files.at(i)->fd.GetNumber(), sztxt);
|
|
|
|
if (ret < 0 || ret >= sz) break;
|
|
|
|
write += ret;
|
|
|
|
}
|
|
|
|
// if files.size() is non-zero, overwrite the last space
|
|
|
|
return write - !!files.size();
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
void Compaction::Summary(char* output, int len) {
|
|
|
|
int write =
|
|
|
|
snprintf(output, len, "Base version %" PRIu64 " Base level %d, inputs: [",
|
|
|
|
input_version_->GetVersionNumber(), start_level_);
|
|
|
|
if (write < 0 || write >= len) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t level_iter = 0; level_iter < num_input_levels(); ++level_iter) {
|
|
|
|
if (level_iter > 0) {
|
|
|
|
write += snprintf(output + write, len - write, "], [");
|
|
|
|
if (write < 0 || write >= len) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
write +=
|
|
|
|
InputSummary(inputs_[level_iter].files, output + write, len - write);
|
|
|
|
if (write < 0 || write >= len) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
snprintf(output + write, len - write, "]");
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t Compaction::OutputFilePreallocationSize() const {
|
|
|
|
uint64_t preallocation_size = 0;
|
|
|
|
|
|
|
|
for (const auto& level_files : inputs_) {
|
|
|
|
for (const auto& file : level_files.files) {
|
|
|
|
preallocation_size += file->fd.GetFileSize();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (max_output_file_size_ != std::numeric_limits<uint64_t>::max() &&
|
|
|
|
(immutable_options_.compaction_style == kCompactionStyleLevel ||
|
|
|
|
output_level() > 0)) {
|
|
|
|
preallocation_size = std::min(max_output_file_size_, preallocation_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Over-estimate slightly so we don't end up just barely crossing
|
|
|
|
// the threshold
|
|
|
|
// No point to preallocate more than 1GB.
|
|
|
|
return std::min(uint64_t{1073741824},
|
|
|
|
preallocation_size + (preallocation_size / 10));
|
|
|
|
}
|
|
|
|
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expects compaction to fail when we force the environment to report running out of space; of course this is not valid in the trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
10 years ago
|
|
|
std::unique_ptr<CompactionFilter> Compaction::CreateCompactionFilter() const {
|
|
|
|
if (!cfd_->ioptions()->compaction_filter_factory) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!cfd_->ioptions()
|
|
|
|
->compaction_filter_factory->ShouldFilterTableFileCreation(
|
|
|
|
TableFileCreationReason::kCompaction)) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
10 years ago
|
|
|
CompactionFilter::Context context;
|
|
|
|
context.is_full_compaction = is_full_compaction_;
|
|
|
|
context.is_manual_compaction = is_manual_compaction_;
|
|
|
|
context.column_family_id = cfd_->GetID();
|
|
|
|
context.reason = TableFileCreationReason::kCompaction;
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
10 years ago
|
|
|
return cfd_->ioptions()->compaction_filter_factory->CreateCompactionFilter(
|
|
|
|
context);
|
|
|
|
}
|
|
|
|
|
|
|
|
std::unique_ptr<SstPartitioner> Compaction::CreateSstPartitioner() const {
|
|
|
|
if (!immutable_options_.sst_partitioner_factory) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
SstPartitioner::Context context;
|
|
|
|
context.is_full_compaction = is_full_compaction_;
|
|
|
|
context.is_manual_compaction = is_manual_compaction_;
|
|
|
|
context.output_level = output_level_;
|
|
|
|
context.smallest_user_key = smallest_user_key_;
|
|
|
|
context.largest_user_key = largest_user_key_;
|
|
|
|
return immutable_options_.sst_partitioner_factory->CreatePartitioner(context);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Compaction::IsOutputLevelEmpty() const {
|
|
|
|
return inputs_.back().level != output_level_ || inputs_.back().empty();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Compaction::ShouldFormSubcompactions() const {
|
Support subcmpct using reserved resources for round-robin priority (#10341)
Summary:
Earlier implementation of round-robin priority can only pick one file at a time and disallows parallel compactions within the same level. In this PR, round-robin compaction policy will expand towards more input files with respecting some additional constraints, which are summarized as follows:
* Constraint 1: We can only pick consecutive files
- Constraint 1a: When a file is being compacted (or some input files are being compacted after expanding), we cannot choose it and have to stop choosing more files
- Constraint 1b: When we reach the last file (with the largest keys), we cannot choose more files (the next file will be the first one with small keys)
* Constraint 2: We should ensure the total compaction bytes (including the overlapped files from the next level) is no more than `mutable_cf_options_.max_compaction_bytes`
* Constraint 3: We try our best to pick as many files as possible so that the post-compaction level size can be just less than `MaxBytesForLevel(start_level_)`
* Constraint 4: If trivial move is allowed, we reuse the logic of `TryNonL0TrivialMove()` instead of expanding files with Constraint 3
More details can be found in `LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion()`.
The above optimization accelerates the process of moving the compaction cursor, in which the write-amp can be further reduced. While a large compaction may lead to high write stall, we break this large compaction into several subcompactions **regardless of** the `max_subcompactions` limit. The number of subcompactions for round-robin compaction priority is determined through the following steps:
* Step 1: Initialized against `max_output_file_limit`, the number of input files in the start level, and also the range size limit `ranges.size()`
* Step 2: Call `AcquireSubcompactionResources()`when max subcompactions is not sufficient, but we may or may not obtain desired resources, additional number of resources is stored in `extra_num_subcompaction_threads_reserved_`). Subcompaction limit is changed and update `num_planned_subcompactions` with `GetSubcompactionLimit()`
* Step 3: Call `ShrinkSubcompactionResources()` to ensure extra resources can be released (extra resources may exist for round-robin compaction when the number of actual number of subcompactions is less than the number of planned subcompactions)
More details can be found in `CompactionJob::AcquireSubcompactionResources()`,`CompactionJob::ShrinkSubcompactionResources()`, and `CompactionJob::ReleaseSubcompactionResources()`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10341
Test Plan: Add `CompactionPriMultipleFilesRoundRobin[1-3]` unit test in `compaction_picker_test.cc` and `RoundRobinSubcompactionsAgainstResources.SubcompactionsUsingResources/[0-4]`, `RoundRobinSubcompactionsAgainstPressureToken.PressureTokenTest/[0-1]` in `db_compaction_test.cc`
Reviewed By: ajkr, hx235
Differential Revision: D37792644
Pulled By: littlepig2013
fbshipit-source-id: 7fecb7c4ffd97b34bbf6e3b760b2c35a772a0657
2 years ago
|
|
|
if (cfd_ == nullptr) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
Support subcmpct using reserved resources for round-robin priority (#10341)
Summary:
Earlier implementation of round-robin priority can only pick one file at a time and disallows parallel compactions within the same level. In this PR, round-robin compaction policy will expand towards more input files with respecting some additional constraints, which are summarized as follows:
* Constraint 1: We can only pick consecutive files
- Constraint 1a: When a file is being compacted (or some input files are being compacted after expanding), we cannot choose it and have to stop choosing more files
- Constraint 1b: When we reach the last file (with the largest keys), we cannot choose more files (the next file will be the first one with small keys)
* Constraint 2: We should ensure the total compaction bytes (including the overlapped files from the next level) is no more than `mutable_cf_options_.max_compaction_bytes`
* Constraint 3: We try our best to pick as many files as possible so that the post-compaction level size can be just less than `MaxBytesForLevel(start_level_)`
* Constraint 4: If trivial move is allowed, we reuse the logic of `TryNonL0TrivialMove()` instead of expanding files with Constraint 3
More details can be found in `LevelCompactionBuilder::SetupOtherFilesWithRoundRobinExpansion()`.
The above optimization accelerates the process of moving the compaction cursor, in which the write-amp can be further reduced. While a large compaction may lead to high write stall, we break this large compaction into several subcompactions **regardless of** the `max_subcompactions` limit. The number of subcompactions for round-robin compaction priority is determined through the following steps:
* Step 1: Initialized against `max_output_file_limit`, the number of input files in the start level, and also the range size limit `ranges.size()`
* Step 2: Call `AcquireSubcompactionResources()`when max subcompactions is not sufficient, but we may or may not obtain desired resources, additional number of resources is stored in `extra_num_subcompaction_threads_reserved_`). Subcompaction limit is changed and update `num_planned_subcompactions` with `GetSubcompactionLimit()`
* Step 3: Call `ShrinkSubcompactionResources()` to ensure extra resources can be released (extra resources may exist for round-robin compaction when the number of actual number of subcompactions is less than the number of planned subcompactions)
More details can be found in `CompactionJob::AcquireSubcompactionResources()`,`CompactionJob::ShrinkSubcompactionResources()`, and `CompactionJob::ReleaseSubcompactionResources()`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10341
Test Plan: Add `CompactionPriMultipleFilesRoundRobin[1-3]` unit test in `compaction_picker_test.cc` and `RoundRobinSubcompactionsAgainstResources.SubcompactionsUsingResources/[0-4]`, `RoundRobinSubcompactionsAgainstPressureToken.PressureTokenTest/[0-1]` in `db_compaction_test.cc`
Reviewed By: ajkr, hx235
Differential Revision: D37792644
Pulled By: littlepig2013
fbshipit-source-id: 7fecb7c4ffd97b34bbf6e3b760b2c35a772a0657
2 years ago
|
|
|
// Round-Robin pri under leveled compaction allows subcompactions by default
|
|
|
|
// and the number of subcompactions can be larger than max_subcompactions_
|
|
|
|
if (cfd_->ioptions()->compaction_pri == kRoundRobin &&
|
|
|
|
cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
|
|
|
|
return output_level_ > 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (max_subcompactions_ <= 1) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cfd_->ioptions()->compaction_style == kCompactionStyleLevel) {
|
|
|
|
return (start_level_ == 0 || is_manual_compaction_) && output_level_ > 0;
|
|
|
|
} else if (cfd_->ioptions()->compaction_style == kCompactionStyleUniversal) {
|
|
|
|
return number_levels_ > 1 && output_level_ > 0;
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Compaction::DoesInputReferenceBlobFiles() const {
|
|
|
|
assert(input_version_);
|
|
|
|
|
|
|
|
const VersionStorageInfo* storage_info = input_version_->storage_info();
|
|
|
|
assert(storage_info);
|
|
|
|
|
|
|
|
if (storage_info->GetBlobFiles().empty()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < inputs_.size(); ++i) {
|
|
|
|
for (const FileMetaData* meta : inputs_[i].files) {
|
|
|
|
assert(meta);
|
|
|
|
|
|
|
|
if (meta->oldest_blob_file_number != kInvalidBlobFileNumber) {
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
Try to start TTL earlier when kMinOverlappingRatio is used (#8749)
Summary:
Right now, when options.ttl is set, compactions are triggered around the time when TTL is reached. This might cause extra compactions which are often bursty. This commit tries to mitigate it by picking those files earlier in normal compaction picking process. This is only implemented using kMinOverlappingRatio with Leveled compaction as it is the default value and it is more complicated to change other styles.
When a file is aged more than ttl/2, RocksDB starts to boost the compaction priority of files in normal compaction picking process, and hope by the time TTL is reached, very few extra compaction is needed.
In order for this to work, another change is made: during a compaction, if an output level file is older than ttl/2, cut output files based on original boundary (if it is not in the last level). This is to make sure that after an old file is moved to the next level, and new data is merged from the upper level, the new data falling into this range isn't reset with old timestamp. Without this change, in many cases, most files from one level will keep having old timestamp, even if they have newer data and we stuck in it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8749
Test Plan: Add a unit test to test the boosting logic. Will add a unit test to test it end-to-end.
Reviewed By: jay-zhuang
Differential Revision: D30735261
fbshipit-source-id: 503c2d89250b22911eb99e72b379be154de3428e
3 years ago
|
|
|
uint64_t Compaction::MinInputFileOldestAncesterTime(
|
|
|
|
const InternalKey* start, const InternalKey* end) const {
|
|
|
|
uint64_t min_oldest_ancester_time = std::numeric_limits<uint64_t>::max();
|
Try to start TTL earlier with kMinOverlappingRatio is used (#8749)
Summary:
Right now, when options.ttl is set, compactions are triggered around the time when TTL is reached. This might cause extra compactions which are often bursty. This commit tries to mitigate it by picking those files earlier in normal compaction picking process. This is only implemented using kMinOverlappingRatio with Leveled compaction as it is the default value and it is more complicated to change other styles.
When a file is aged more than ttl/2, RocksDB starts to boost the compaction priority of files in normal compaction picking process, and hope by the time TTL is reached, very few extra compaction is needed.
In order for this to work, another change is made: during a compaction, if an output level file is older than ttl/2, cut output files based on original boundary (if it is not in the last level). This is to make sure that after an old file is moved to the next level, and new data is merged from the upper level, the new data falling into this range isn't reset with old timestamp. Without this change, in many cases, most files from one level will keep having old timestamp, even if they have newer data and we stuck in it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8749
Test Plan: Add a unit test to test the boosting logic. Will add a unit test to test it end-to-end.
Reviewed By: jay-zhuang
Differential Revision: D30735261
fbshipit-source-id: 503c2d89250b22911eb99e72b379be154de3428e
3 years ago
|
|
|
const InternalKeyComparator& icmp =
|
|
|
|
column_family_data()->internal_comparator();
|
|
|
|
for (const auto& level_files : inputs_) {
|
|
|
|
for (const auto& file : level_files.files) {
|
Try to start TTL earlier with kMinOverlappingRatio is used (#8749)
Summary:
Right now, when options.ttl is set, compactions are triggered around the time when TTL is reached. This might cause extra compactions which are often bursty. This commit tries to mitigate it by picking those files earlier in normal compaction picking process. This is only implemented using kMinOverlappingRatio with Leveled compaction as it is the default value and it is more complicated to change other styles.
When a file is aged more than ttl/2, RocksDB starts to boost the compaction priority of files in normal compaction picking process, and hope by the time TTL is reached, very few extra compaction is needed.
In order for this to work, another change is made: during a compaction, if an output level file is older than ttl/2, cut output files based on original boundary (if it is not in the last level). This is to make sure that after an old file is moved to the next level, and new data is merged from the upper level, the new data falling into this range isn't reset with old timestamp. Without this change, in many cases, most files from one level will keep having old timestamp, even if they have newer data and we stuck in it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8749
Test Plan: Add a unit test to test the boosting logic. Will add a unit test to test it end-to-end.
Reviewed By: jay-zhuang
Differential Revision: D30735261
fbshipit-source-id: 503c2d89250b22911eb99e72b379be154de3428e
3 years ago
|
|
|
if (start != nullptr && icmp.Compare(file->largest, *start) < 0) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
if (end != nullptr && icmp.Compare(file->smallest, *end) > 0) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
uint64_t oldest_ancester_time = file->TryGetOldestAncesterTime();
|
|
|
|
if (oldest_ancester_time != 0) {
|
|
|
|
min_oldest_ancester_time =
|
|
|
|
std::min(min_oldest_ancester_time, oldest_ancester_time);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return min_oldest_ancester_time;
|
|
|
|
}
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconveniences:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 have file reordering corruption that might cause returning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
uint64_t Compaction::MinInputFileEpochNumber() const {
|
|
|
|
uint64_t min_epoch_number = std::numeric_limits<uint64_t>::max();
|
|
|
|
for (const auto& inputs_per_level : inputs_) {
|
|
|
|
for (const auto& file : inputs_per_level.files) {
|
|
|
|
min_epoch_number = std::min(min_epoch_number, file->epoch_number);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return min_epoch_number;
|
|
|
|
}
|
|
|
|
|
|
|
|
int Compaction::EvaluatePenultimateLevel(
|
|
|
|
const VersionStorageInfo* vstorage,
|
|
|
|
const ImmutableOptions& immutable_options, const int start_level,
|
|
|
|
const int output_level) {
|
|
|
|
// TODO: currently per_key_placement feature only support level and universal
|
|
|
|
// compaction
|
|
|
|
if (immutable_options.compaction_style != kCompactionStyleLevel &&
|
|
|
|
immutable_options.compaction_style != kCompactionStyleUniversal) {
|
|
|
|
return kInvalidLevel;
|
|
|
|
}
|
|
|
|
if (output_level != immutable_options.num_levels - 1) {
|
|
|
|
return kInvalidLevel;
|
|
|
|
}
|
|
|
|
|
|
|
|
int penultimate_level = output_level - 1;
|
|
|
|
assert(penultimate_level < immutable_options.num_levels);
|
|
|
|
if (penultimate_level <= 0) {
|
|
|
|
return kInvalidLevel;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the penultimate level is not within input level -> output level range
|
|
|
|
// check if the penultimate output level is empty, if it's empty, it could
|
|
|
|
// also be locked for the penultimate output.
|
|
|
|
// TODO: ideally, it only needs to check if there's a file within the
|
|
|
|
// compaction output key range. For simplicity, it just check if there's any
|
|
|
|
// file on the penultimate level.
|
|
|
|
if (start_level == immutable_options.num_levels - 1 &&
|
|
|
|
(immutable_options.compaction_style != kCompactionStyleUniversal ||
|
|
|
|
!vstorage->LevelFiles(penultimate_level).empty())) {
|
|
|
|
return kInvalidLevel;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool supports_per_key_placement =
|
|
|
|
immutable_options.preclude_last_level_data_seconds > 0;
|
|
|
|
|
|
|
|
// it could be overridden by unittest
|
|
|
|
TEST_SYNC_POINT_CALLBACK("Compaction::SupportsPerKeyPlacement:Enabled",
|
|
|
|
&supports_per_key_placement);
|
|
|
|
if (!supports_per_key_placement) {
|
|
|
|
return kInvalidLevel;
|
|
|
|
}
|
|
|
|
|
|
|
|
return penultimate_level;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|