|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#pragma once
|
|
|
|
#include "db/version_set.h"
|
|
|
|
#include "memory/arena.h"
|
|
|
|
#include "options/cf_options.h"
|
|
|
|
#include "rocksdb/sst_partitioner.h"
|
|
|
|
#include "util/autovector.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
// The file contains class Compaction, as well as some helper functions
|
|
|
|
// and data structures used by the class.
|
|
|
|
|
|
|
|
// Utility for comparing sstable boundary keys. Returns -1 if either a or b is
|
|
|
|
// null which provides the property that a==null indicates a key that is less
|
|
|
|
// than any key and b==null indicates a key that is greater than any key. Note
|
|
|
|
// that the comparison is performed primarily on the user-key portion of the
|
|
|
|
// key. If the user-keys compare equal, an additional test is made to sort
|
|
|
|
// range tombstone sentinel keys before other keys with the same user-key. The
|
|
|
|
// result is that 2 user-keys will compare equal if they differ purely on
|
|
|
|
// their sequence number and value, but the range tombstone sentinel for that
|
|
|
|
// user-key will compare not equal. This is necessary because the range
|
|
|
|
// tombstone sentinel key is set as the largest key for an sstable even though
|
|
|
|
// that key never appears in the database. We don't want adjacent sstables to
|
|
|
|
// be considered overlapping if they are separated by the range tombstone
|
|
|
|
// sentinel.
|
|
|
|
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
|
|
|
|
const InternalKey& b);
|
|
|
|
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey* a,
|
|
|
|
const InternalKey& b);
|
|
|
|
int sstableKeyCompare(const Comparator* user_cmp, const InternalKey& a,
|
|
|
|
const InternalKey* b);
|
|
|
|
|
|
|
|
// An AtomicCompactionUnitBoundary represents a range of keys [smallest,
|
|
|
|
// largest] that exactly spans one ore more neighbouring SSTs on the same
|
|
|
|
// level. Every pair of SSTs in this range "overlap" (i.e., the largest
|
|
|
|
// user key of one file is the smallest user key of the next file). These
|
|
|
|
// boundaries are propagated down to RangeDelAggregator during compaction
|
|
|
|
// to provide safe truncation boundaries for range tombstones.
|
|
|
|
struct AtomicCompactionUnitBoundary {
|
|
|
|
const InternalKey* smallest = nullptr;
|
|
|
|
const InternalKey* largest = nullptr;
|
|
|
|
};
|
|
|
|
|
|
|
|
// The structure that manages compaction input files associated
|
|
|
|
// with the same physical level.
|
|
|
|
struct CompactionInputFiles {
|
|
|
|
int level;
|
|
|
|
std::vector<FileMetaData*> files;
|
|
|
|
std::vector<AtomicCompactionUnitBoundary> atomic_compaction_unit_boundaries;
|
|
|
|
inline bool empty() const { return files.empty(); }
|
|
|
|
inline size_t size() const { return files.size(); }
|
|
|
|
inline void clear() { files.clear(); }
|
|
|
|
inline FileMetaData* operator[](size_t i) const { return files[i]; }
|
|
|
|
};
|
|
|
|
|
|
|
|
class Version;
|
|
|
|
class ColumnFamilyData;
|
|
|
|
class VersionStorageInfo;
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
10 years ago
|
|
|
class CompactionFilter;
|
|
|
|
|
|
|
|
// A Compaction encapsulates metadata about a compaction.
|
|
|
|
class Compaction {
|
|
|
|
public:
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
Compaction(VersionStorageInfo* input_version,
|
|
|
|
const ImmutableOptions& immutable_options,
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
const MutableCFOptions& mutable_cf_options,
|
|
|
|
const MutableDBOptions& mutable_db_options,
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
std::vector<CompactionInputFiles> inputs, int output_level,
|
|
|
|
uint64_t target_file_size, uint64_t max_compaction_bytes,
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
uint32_t output_path_id, CompressionType compression,
|
|
|
|
CompressionOptions compression_opts,
|
|
|
|
Temperature output_temperature, uint32_t max_subcompactions,
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
std::vector<FileMetaData*> grandparents,
|
|
|
|
bool manual_compaction = false, const std::string& trim_ts = "",
|
|
|
|
double score = -1, bool deletion_compaction = false,
|
|
|
|
bool l0_files_might_overlap = true,
|
|
|
|
CompactionReason compaction_reason = CompactionReason::kUnknown,
|
|
|
|
BlobGarbageCollectionPolicy blob_garbage_collection_policy =
|
|
|
|
BlobGarbageCollectionPolicy::kUseDefault,
|
|
|
|
double blob_garbage_collection_age_cutoff = -1);
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
|
|
|
|
// The type of the penultimate level output range
|
|
|
|
enum class PenultimateOutputRangeType : int {
|
|
|
|
kNotSupported, // it cannot output to the penultimate level
|
|
|
|
kFullRange, // any data could be output to the penultimate level
|
|
|
|
kNonLastRange, // only the keys within non_last_level compaction inputs can
|
|
|
|
// be outputted to the penultimate level
|
|
|
|
kDisabled, // no data can be outputted to the penultimate level
|
|
|
|
};
|
|
|
|
|
|
|
|
// No copying allowed
|
|
|
|
Compaction(const Compaction&) = delete;
|
|
|
|
void operator=(const Compaction&) = delete;
|
|
|
|
|
|
|
|
~Compaction();
|
|
|
|
|
|
|
|
// Returns the level associated to the specified compaction input level.
|
|
|
|
// If compaction_input_level is not specified, then input_level is set to 0.
|
|
|
|
int level(size_t compaction_input_level = 0) const {
|
|
|
|
return inputs_[compaction_input_level].level;
|
|
|
|
}
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
10 years ago
|
|
|
int start_level() const { return start_level_; }
|
|
|
|
|
|
|
|
// Outputs will go to this level
|
|
|
|
int output_level() const { return output_level_; }
|
|
|
|
|
|
|
|
// Returns the number of input levels in this compaction.
|
|
|
|
size_t num_input_levels() const { return inputs_.size(); }
|
|
|
|
|
|
|
|
// Return the object that holds the edits to the descriptor done
|
|
|
|
// by this compaction.
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
VersionEdit* edit() { return &edit_; }
|
|
|
|
|
|
|
|
// Returns the number of input files associated to the specified
|
|
|
|
// compaction input level.
|
|
|
|
// The function will return 0 if when "compaction_input_level" < 0
|
|
|
|
// or "compaction_input_level" >= "num_input_levels()".
|
|
|
|
size_t num_input_files(size_t compaction_input_level) const {
|
|
|
|
if (compaction_input_level < inputs_.size()) {
|
|
|
|
return inputs_[compaction_input_level].size();
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns input version of the compaction
|
|
|
|
Version* input_version() const { return input_version_; }
|
|
|
|
|
|
|
|
// Returns the ColumnFamilyData associated with the compaction.
|
|
|
|
ColumnFamilyData* column_family_data() const { return cfd_; }
|
|
|
|
|
|
|
|
// Returns the file meta data of the 'i'th input file at the
|
|
|
|
// specified compaction input level.
|
|
|
|
// REQUIREMENT: "compaction_input_level" must be >= 0 and
|
|
|
|
// < "input_levels()"
|
|
|
|
FileMetaData* input(size_t compaction_input_level, size_t i) const {
|
|
|
|
assert(compaction_input_level < inputs_.size());
|
|
|
|
return inputs_[compaction_input_level][i];
|
|
|
|
}
|
|
|
|
|
|
|
|
const std::vector<AtomicCompactionUnitBoundary>* boundaries(
|
|
|
|
size_t compaction_input_level) const {
|
|
|
|
assert(compaction_input_level < inputs_.size());
|
|
|
|
return &inputs_[compaction_input_level].atomic_compaction_unit_boundaries;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the list of file meta data of the specified compaction
|
|
|
|
// input level.
|
|
|
|
// REQUIREMENT: "compaction_input_level" must be >= 0 and
|
|
|
|
// < "input_levels()"
|
|
|
|
const std::vector<FileMetaData*>* inputs(
|
|
|
|
size_t compaction_input_level) const {
|
|
|
|
assert(compaction_input_level < inputs_.size());
|
|
|
|
return &inputs_[compaction_input_level].files;
|
|
|
|
}
|
|
|
|
|
|
|
|
const std::vector<CompactionInputFiles>* inputs() { return &inputs_; }
|
|
|
|
|
|
|
|
// Returns the LevelFilesBrief of the specified compaction input level.
|
|
|
|
const LevelFilesBrief* input_levels(size_t compaction_input_level) const {
|
|
|
|
return &input_levels_[compaction_input_level];
|
|
|
|
}
|
|
|
|
|
|
|
|
// Maximum size of files to build during this compaction.
|
|
|
|
uint64_t max_output_file_size() const { return max_output_file_size_; }
|
|
|
|
|
Align compaction output file boundaries to the next level ones (#10655)
Summary:
Try to align the compaction output file boundaries to the next level ones
(grandparent level), to reduce the level compaction write-amplification.
In level compaction, there are "wasted" data at the beginning and end of the
output level files. Align the file boundary can avoid such "wasted" compaction.
With this PR, it tries to align the non-bottommost level file boundaries to its
next level ones. It may cut file when the file size is large enough (at least
50% of target_file_size) and not too large (2x target_file_size).
db_bench shows about 12.56% compaction reduction:
```
TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
# baseline:
Flush(GB): cumulative 25.882, interval 7.216
Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
# with this change:
Flush(GB): cumulative 25.882, interval 7.753
Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
```
The compaction simulator shows a similar result (14% with 100G random data).
As a side effect, with this PR, the SST file size can exceed the
target_file_size, but is capped at 2x target_file_size. And there will be
smaller files. Here are file size statistics when loading 100GB with the target
file size 32MB:
```
baseline this_PR
count 1.656000e+03 1.705000e+03
mean 3.116062e+07 3.028076e+07
std 7.145242e+06 8.046139e+06
```
The feature is enabled by default, to revert to the old behavior disable it
with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false`
Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for
use case like user adding 2 or more non-overlapping data range at the same
time, it can reduce the overlapping of 2 datasets in the lower levels.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655
Reviewed By: cbi42
Differential Revision: D39552321
Pulled By: jay-zhuang
fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2 years ago
|
|
|
// Target output file size for this compaction
|
|
|
|
uint64_t target_output_file_size() const { return target_output_file_size_; }
|
|
|
|
|
|
|
|
// What compression for output
|
|
|
|
CompressionType output_compression() const { return output_compression_; }
|
|
|
|
|
|
|
|
// What compression options for output
|
|
|
|
const CompressionOptions& output_compression_opts() const {
|
|
|
|
return output_compression_opts_;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Whether need to write output file to second DB path.
|
|
|
|
uint32_t output_path_id() const { return output_path_id_; }
|
|
|
|
|
|
|
|
// Is this a trivial compaction that can be implemented by just
|
|
|
|
// moving a single input file to the next level (no merging or splitting)
|
|
|
|
bool IsTrivialMove() const;
|
|
|
|
|
|
|
|
// The split user key in the output level if this compaction is required to
|
|
|
|
// split the output files according to the existing cursor in the output
|
|
|
|
// level under round-robin compaction policy. Empty indicates no required
|
|
|
|
// splitting key
|
|
|
|
const InternalKey* GetOutputSplitKey() const { return output_split_key_; }
|
|
|
|
|
|
|
|
// If true, then the compaction can be done by simply deleting input files.
|
|
|
|
bool deletion_compaction() const { return deletion_compaction_; }
|
|
|
|
|
|
|
|
// Add all inputs to this compaction as delete operations to *edit.
|
|
|
|
void AddInputDeletions(VersionEdit* edit);
|
|
|
|
|
|
|
|
// Returns true if the available information we have guarantees that
|
|
|
|
// the input "user_key" does not exist in any level beyond "output_level()".
|
|
|
|
bool KeyNotExistsBeyondOutputLevel(const Slice& user_key,
|
|
|
|
std::vector<size_t>* level_ptrs) const;
|
|
|
|
|
|
|
|
// Clear all files to indicate that they are not being compacted
|
|
|
|
// Delete this compaction from the list of running compactions.
|
|
|
|
//
|
|
|
|
// Requirement: DB mutex held
|
|
|
|
void ReleaseCompactionFiles(Status status);
|
|
|
|
|
|
|
|
// Returns the summary of the compaction in "output" with maximum "len"
|
|
|
|
// in bytes. The caller is responsible for the memory management of
|
|
|
|
// "output".
|
|
|
|
void Summary(char* output, int len);
|
|
|
|
|
|
|
|
// Return the score that was used to pick this compaction run.
|
|
|
|
double score() const { return score_; }
|
|
|
|
|
|
|
|
// Is this compaction creating a file in the bottom most level?
|
|
|
|
bool bottommost_level() const { return bottommost_level_; }
|
|
|
|
|
|
|
|
// Is the compaction compact to the last level
|
|
|
|
bool is_last_level() const {
|
|
|
|
return output_level_ == immutable_options_.num_levels - 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Does this compaction include all sst files?
|
|
|
|
bool is_full_compaction() const { return is_full_compaction_; }
|
|
|
|
|
|
|
|
// Was this compaction triggered manually by the client?
|
|
|
|
bool is_manual_compaction() const { return is_manual_compaction_; }
|
|
|
|
|
|
|
|
std::string trim_ts() const { return trim_ts_; }
|
|
|
|
|
|
|
|
// Used when allow_trivial_move option is set in
|
|
|
|
// Universal compaction. If all the input files are
|
|
|
|
// non overlapping, then is_trivial_move_ variable
|
|
|
|
// will be set true, else false
|
|
|
|
void set_is_trivial_move(bool trivial_move) {
|
|
|
|
is_trivial_move_ = trivial_move;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Used when allow_trivial_move option is set in
|
|
|
|
// Universal compaction. Returns true, if the input files
|
|
|
|
// are non-overlapping and can be trivially moved.
|
|
|
|
bool is_trivial_move() const { return is_trivial_move_; }
|
|
|
|
|
|
|
|
// How many total levels are there?
|
|
|
|
int number_levels() const { return number_levels_; }
|
|
|
|
|
|
|
|
// Return the ImmutableOptions that should be used throughout the compaction
|
|
|
|
// procedure
|
|
|
|
const ImmutableOptions* immutable_options() const {
|
|
|
|
return &immutable_options_;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return the MutableCFOptions that should be used throughout the compaction
|
|
|
|
// procedure
|
|
|
|
const MutableCFOptions* mutable_cf_options() const {
|
|
|
|
return &mutable_cf_options_;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the size in bytes that the output file should be preallocated to.
|
|
|
|
// In level compaction, that is max_file_size_. In universal compaction, that
|
|
|
|
// is the sum of all input file sizes.
|
|
|
|
uint64_t OutputFilePreallocationSize() const;
|
|
|
|
|
|
|
|
void SetInputVersion(Version* input_version);
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
10 years ago
|
|
|
struct InputLevelSummaryBuffer {
|
|
|
|
char buffer[128];
|
|
|
|
};
|
|
|
|
|
|
|
|
const char* InputLevelSummary(InputLevelSummaryBuffer* scratch) const;
|
|
|
|
|
Include bunch of more events into EventLogger
Summary:
Added these events:
* Recovery start, finish and also when recovery creates a file
* Trivial move
* Compaction start, finish and when compaction creates a file
* Flush start, finish
Also includes small fix to EventLogger
Also added option ROCKSDB_PRINT_EVENTS_TO_STDOUT which is useful when we debug things. I've spent far too much time chasing LOG files.
Still didn't get sst table properties in JSON. They are written very deeply into the stack. I'll address in separate diff.
TODO:
* Write specification. Let's first use this for a while and figure out what's good data to put here, too. After that we'll write spec
* Write tools that parse and analyze LOGs. This can be in python or go. Good intern task.
Test Plan: Ran db_bench with ROCKSDB_PRINT_EVENTS_TO_STDOUT. Here's the output: https://phabricator.fb.com/P19811976
Reviewers: sdong, yhchiang, rven, MarkCallaghan, kradhakrishnan, anthony
Reviewed By: anthony
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37521
10 years ago
|
|
|
uint64_t CalculateTotalInputSize() const;
|
|
|
|
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
// In case of compaction error, reset the nextIndex that is used
|
|
|
|
// to pick up the next file to be compacted from files_by_size_
|
|
|
|
void ResetNextCompactionIndex();
|
|
|
|
|
Allowing L0 -> L1 trivial move on sorted data
Summary:
This diff updates the logic of how we do trivial move, now trivial move can run on any number of files in input level as long as they are not overlapping
The conditions for trivial move have been updated
Introduced conditions:
- Trivial move cannot happen if we have a compaction filter (except if the compaction is not manual)
- Input level files cannot be overlapping
Removed conditions:
- Trivial move only run when the compaction is not manual
- Input level should can contain only 1 file
More context on what tests failed because of Trivial move
```
DBTest.CompactionsGenerateMultipleFiles
This test is expecting compaction on a file in L0 to generate multiple files in L1, this test will fail with trivial move because we end up with one file in L1
```
```
DBTest.NoSpaceCompactRange
This test expect compaction to fail when we force environment to report running out of space, of course this is not valid in trivial move situation
because trivial move does not need any extra space, and did not check for that
```
```
DBTest.DropWrites
Similar to DBTest.NoSpaceCompactRange
```
```
DBTest.DeleteObsoleteFilesPendingOutputs
This test expect that a file in L2 is deleted after it's moved to L3, this is not valid with trivial move because although the file was moved it is now used by L3
```
```
CuckooTableDBTest.CompactionIntoMultipleFiles
Same as DBTest.CompactionsGenerateMultipleFiles
```
This diff is based on a work by @sdong https://reviews.facebook.net/D34149
Test Plan: make -j64 check
Reviewers: rven, sdong, igor
Reviewed By: igor
Subscribers: yhchiang, ott, march, dhruba, sdong
Differential Revision: https://reviews.facebook.net/D34797
10 years ago
|
|
|
// Create a CompactionFilter from compaction_filter_factory
|
|
|
|
std::unique_ptr<CompactionFilter> CreateCompactionFilter() const;
|
|
|
|
|
|
|
|
// Create a SstPartitioner from sst_partitioner_factory
|
|
|
|
std::unique_ptr<SstPartitioner> CreateSstPartitioner() const;
|
|
|
|
|
|
|
|
// Is the input level corresponding to output_level_ empty?
|
|
|
|
bool IsOutputLevelEmpty() const;
|
Parallelize L0-L1 Compaction: Restructure Compaction Job
Summary:
As of now compactions involving files from Level 0 and Level 1 are single
threaded because the files in L0, although sorted, are not range partitioned like
the other levels. This means that during L0-L1 compaction each file from L1
needs to be merged with potentially all the files from L0.
This attempt to parallelize the L0-L1 compaction assigns a thread and a
corresponding iterator to each L1 file that then considers only the key range
found in that L1 file and only the L0 files that have those keys (and only the
specific portion of those L0 files in which those keys are found). In this way
the overlap is minimized and potentially eliminated between different iterators
focusing on the same files.
The first step is to restructure the compaction logic to break L0-L1 compactions
into multiple, smaller, sequential compactions. Eventually each of these smaller
jobs will be run simultaneously. Areas to pay extra attention to are
# Correct aggregation of compaction job statistics across multiple threads
# Proper opening/closing of output files (make sure each thread's is unique)
# Keys that span multiple L1 files
# Skewed distributions of keys within L0 files
Test Plan: Make and run db_test (newer version has separate compaction tests) and compaction_job_stats_test
Reviewers: igor, noetzli, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D42699
10 years ago
|
|
|
|
|
|
|
// Should this compaction be broken up into smaller ones run in parallel?
|
|
|
|
bool ShouldFormSubcompactions() const;
|
Parallelize L0-L1 Compaction: Restructure Compaction Job
Summary:
As of now compactions involving files from Level 0 and Level 1 are single
threaded because the files in L0, although sorted, are not range partitioned like
the other levels. This means that during L0-L1 compaction each file from L1
needs to be merged with potentially all the files from L0.
This attempt to parallelize the L0-L1 compaction assigns a thread and a
corresponding iterator to each L1 file that then considers only the key range
found in that L1 file and only the L0 files that have those keys (and only the
specific portion of those L0 files in which those keys are found). In this way
the overlap is minimized and potentially eliminated between different iterators
focusing on the same files.
The first step is to restructure the compaction logic to break L0-L1 compactions
into multiple, smaller, sequential compactions. Eventually each of these smaller
jobs will be run simultaneously. Areas to pay extra attention to are
# Correct aggregation of compaction job statistics across multiple threads
# Proper opening/closing of output files (make sure each thread's is unique)
# Keys that span multiple L1 files
# Skewed distributions of keys within L0 files
Test Plan: Make and run db_test (newer version has separate compaction tests) and compaction_job_stats_test
Reviewers: igor, noetzli, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D42699
10 years ago
|
|
|
|
|
|
|
// Returns true iff at least one input file references a blob file.
|
|
|
|
//
|
|
|
|
// PRE: input version has been set.
|
|
|
|
bool DoesInputReferenceBlobFiles() const;
|
|
|
|
|
|
|
|
// test function to validate the functionality of IsBottommostLevel()
|
|
|
|
// function -- determines if compaction with inputs and storage is bottommost
|
|
|
|
static bool TEST_IsBottommostLevel(
|
|
|
|
int output_level, VersionStorageInfo* vstorage,
|
|
|
|
const std::vector<CompactionInputFiles>& inputs);
|
|
|
|
|
|
|
|
TablePropertiesCollection GetOutputTableProperties() const {
|
|
|
|
return output_table_properties_;
|
|
|
|
}
|
|
|
|
|
|
|
|
void SetOutputTableProperties(TablePropertiesCollection tp) {
|
|
|
|
output_table_properties_ = std::move(tp);
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice GetSmallestUserKey() const { return smallest_user_key_; }
|
|
|
|
|
|
|
|
Slice GetLargestUserKey() const { return largest_user_key_; }
|
|
|
|
|
|
|
|
Slice GetPenultimateLevelSmallestUserKey() const {
|
|
|
|
return penultimate_level_smallest_user_key_;
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice GetPenultimateLevelLargestUserKey() const {
|
|
|
|
return penultimate_level_largest_user_key_;
|
|
|
|
}
|
|
|
|
|
|
|
|
PenultimateOutputRangeType GetPenultimateOutputRangeType() const {
|
|
|
|
return penultimate_output_range_type_;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return true if the compaction supports per_key_placement
|
|
|
|
bool SupportsPerKeyPlacement() const;
|
|
|
|
|
|
|
|
// Get per_key_placement penultimate output level, which is `last_level - 1`
|
|
|
|
// if per_key_placement feature is supported. Otherwise, return -1.
|
|
|
|
int GetPenultimateLevel() const;
|
|
|
|
|
|
|
|
// Return true if the given range is overlap with penultimate level output
|
|
|
|
// range.
|
|
|
|
// Both smallest_key and largest_key include timestamps if user-defined
|
|
|
|
// timestamp is enabled.
|
|
|
|
bool OverlapPenultimateLevelOutputRange(const Slice& smallest_key,
|
|
|
|
const Slice& largest_key) const;
|
|
|
|
|
|
|
|
// Return true if the key is within penultimate level output range for
|
|
|
|
// per_key_placement feature, which is safe to place the key to the
|
|
|
|
// penultimate level. different compaction strategy has different rules.
|
|
|
|
// If per_key_placement is not supported, always return false.
|
|
|
|
// TODO: currently it doesn't support moving data from the last level to the
|
|
|
|
// penultimate level
|
|
|
|
// key includes timestamp if user-defined timestamp is enabled.
|
|
|
|
bool WithinPenultimateLevelOutputRange(const Slice& key) const;
|
|
|
|
|
|
|
|
CompactionReason compaction_reason() const { return compaction_reason_; }
|
|
|
|
|
|
|
|
const std::vector<FileMetaData*>& grandparents() const {
|
|
|
|
return grandparents_;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t max_compaction_bytes() const { return max_compaction_bytes_; }
|
|
|
|
|
|
|
|
Temperature output_temperature() const { return output_temperature_; }
|
|
|
|
|
|
|
|
uint32_t max_subcompactions() const { return max_subcompactions_; }
|
|
|
|
|
|
|
|
bool enable_blob_garbage_collection() const {
|
|
|
|
return enable_blob_garbage_collection_;
|
|
|
|
}
|
|
|
|
|
|
|
|
double blob_garbage_collection_age_cutoff() const {
|
|
|
|
return blob_garbage_collection_age_cutoff_;
|
|
|
|
}
|
|
|
|
|
Try to start TTL earlier with kMinOverlappingRatio is used (#8749)
Summary:
Right now, when options.ttl is set, compactions are triggered around the time when TTL is reached. This might cause extra compactions which are often bursty. This commit tries to mitigate it by picking those files earlier in normal compaction picking process. This is only implemented using kMinOverlappingRatio with Leveled compaction as it is the default value and it is more complicated to change other styles.
When a file is aged more than ttl/2, RocksDB starts to boost the compaction priority of files in normal compaction picking process, and hope by the time TTL is reached, very few extra compaction is needed.
In order for this to work, another change is made: during a compaction, if an output level file is older than ttl/2, cut output files based on original boundary (if it is not in the last level). This is to make sure that after an old file is moved to the next level, and new data is merged from the upper level, the new data falling into this range isn't reset with old timestamp. Without this change, in many cases, most files from one level will keep having old timestamp, even if they have newer data and we stuck in it.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8749
Test Plan: Add a unit test to test the boosting logic. Will add a unit test to test it end-to-end.
Reviewed By: jay-zhuang
Differential Revision: D30735261
fbshipit-source-id: 503c2d89250b22911eb99e72b379be154de3428e
3 years ago
|
|
|
// start and end are sub compact range. Null if no boundary.
|
|
|
|
// This is used to filter out some input files' ancester's time range.
|
|
|
|
uint64_t MinInputFileOldestAncesterTime(const InternalKey* start,
|
|
|
|
const InternalKey* end) const;
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
// Return the minimum epoch number among
|
|
|
|
// input files' associated with this compaction
|
|
|
|
uint64_t MinInputFileEpochNumber() const;
|
|
|
|
|
|
|
|
// Called by DBImpl::NotifyOnCompactionCompleted to make sure number of
|
|
|
|
// compaction begin and compaction completion callbacks match.
|
|
|
|
void SetNotifyOnCompactionCompleted() {
|
|
|
|
notify_on_compaction_completion_ = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ShouldNotifyOnCompactionCompleted() const {
|
|
|
|
return notify_on_compaction_completion_;
|
|
|
|
}
|
|
|
|
|
|
|
|
static constexpr int kInvalidLevel = -1;
|
|
|
|
|
|
|
|
// Evaluate penultimate output level. If the compaction supports
|
|
|
|
// per_key_placement feature, it returns the penultimate level number.
|
|
|
|
// Otherwise, it's set to kInvalidLevel (-1), which means
|
|
|
|
// output_to_penultimate_level is not supported.
|
|
|
|
// Note: even the penultimate level output is supported (PenultimateLevel !=
|
|
|
|
// kInvalidLevel), some key range maybe unsafe to be outputted to the
|
|
|
|
// penultimate level. The safe key range is populated by
|
|
|
|
// `PopulatePenultimateLevelOutputRange()`.
|
|
|
|
// Which could potentially disable all penultimate level output.
|
|
|
|
static int EvaluatePenultimateLevel(const VersionStorageInfo* vstorage,
|
|
|
|
const ImmutableOptions& immutable_options,
|
|
|
|
const int start_level,
|
|
|
|
const int output_level);
|
|
|
|
|
|
|
|
private:
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
// mark (or clear) all files that are being compacted
|
|
|
|
void MarkFilesBeingCompacted(bool mark_as_compacted);
|
|
|
|
|
|
|
|
// get the smallest and largest key present in files to be compacted
|
|
|
|
static void GetBoundaryKeys(VersionStorageInfo* vstorage,
|
|
|
|
const std::vector<CompactionInputFiles>& inputs,
|
|
|
|
Slice* smallest_key, Slice* largest_key,
|
|
|
|
int exclude_level = -1);
|
|
|
|
|
|
|
|
// populate penultimate level output range, which will be used to determine if
|
|
|
|
// a key is safe to output to the penultimate level (details see
|
|
|
|
// `Compaction::WithinPenultimateLevelOutputRange()`.
|
|
|
|
void PopulatePenultimateLevelOutputRange();
|
|
|
|
|
|
|
|
// Get the atomic file boundaries for all files in the compaction. Necessary
|
|
|
|
// in order to avoid the scenario described in
|
|
|
|
// https://github.com/facebook/rocksdb/pull/4432#discussion_r221072219 and
|
|
|
|
// plumb down appropriate key boundaries to RangeDelAggregator during
|
|
|
|
// compaction.
|
|
|
|
static std::vector<CompactionInputFiles> PopulateWithAtomicBoundaries(
|
|
|
|
VersionStorageInfo* vstorage, std::vector<CompactionInputFiles> inputs);
|
|
|
|
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
// helper function to determine if compaction with inputs and storage is
|
|
|
|
// bottommost
|
|
|
|
static bool IsBottommostLevel(
|
|
|
|
int output_level, VersionStorageInfo* vstorage,
|
|
|
|
const std::vector<CompactionInputFiles>& inputs);
|
|
|
|
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
static bool IsFullCompaction(VersionStorageInfo* vstorage,
|
|
|
|
const std::vector<CompactionInputFiles>& inputs);
|
|
|
|
|
|
|
|
VersionStorageInfo* input_vstorage_;
|
|
|
|
|
|
|
|
const int start_level_; // the lowest level to be compacted
|
|
|
|
const int output_level_; // levels to which output files are stored
|
Align compaction output file boundaries to the next level ones (#10655)
Summary:
Try to align the compaction output file boundaries to the next level ones
(grandparent level), to reduce the level compaction write-amplification.
In level compaction, there are "wasted" data at the beginning and end of the
output level files. Align the file boundary can avoid such "wasted" compaction.
With this PR, it tries to align the non-bottommost level file boundaries to its
next level ones. It may cut file when the file size is large enough (at least
50% of target_file_size) and not too large (2x target_file_size).
db_bench shows about 12.56% compaction reduction:
```
TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
# baseline:
Flush(GB): cumulative 25.882, interval 7.216
Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
# with this change:
Flush(GB): cumulative 25.882, interval 7.753
Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
```
The compaction simulator shows a similar result (14% with 100G random data).
As a side effect, with this PR, the SST file size can exceed the
target_file_size, but is capped at 2x target_file_size. And there will be
smaller files. Here are file size statistics when loading 100GB with the target
file size 32MB:
```
baseline this_PR
count 1.656000e+03 1.705000e+03
mean 3.116062e+07 3.028076e+07
std 7.145242e+06 8.046139e+06
```
The feature is enabled by default, to revert to the old behavior disable it
with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false`
Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for
use case like user adding 2 or more non-overlapping data range at the same
time, it can reduce the overlapping of 2 datasets in the lower levels.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655
Reviewed By: cbi42
Differential Revision: D39552321
Pulled By: jay-zhuang
fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2 years ago
|
|
|
uint64_t target_output_file_size_;
|
|
|
|
uint64_t max_output_file_size_;
|
|
|
|
uint64_t max_compaction_bytes_;
|
|
|
|
uint32_t max_subcompactions_;
|
|
|
|
const ImmutableOptions immutable_options_;
|
|
|
|
const MutableCFOptions mutable_cf_options_;
|
|
|
|
Version* input_version_;
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
VersionEdit edit_;
|
|
|
|
const int number_levels_;
|
|
|
|
ColumnFamilyData* cfd_;
|
|
|
|
Arena arena_; // Arena used to allocate space for file_levels_
|
|
|
|
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
const uint32_t output_path_id_;
|
|
|
|
CompressionType output_compression_;
|
|
|
|
CompressionOptions output_compression_opts_;
|
|
|
|
Temperature output_temperature_;
|
|
|
|
// If true, then the compaction can be done by simply deleting input files.
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
const bool deletion_compaction_;
|
|
|
|
// should it split the output file using the compact cursor?
|
|
|
|
const InternalKey* output_split_key_;
|
|
|
|
|
|
|
|
// L0 files in LSM-tree might be overlapping. But the compaction picking
|
|
|
|
// logic might pick a subset of the files that aren't overlapping. if
|
|
|
|
// that is the case, set the value to false. Otherwise, set it true.
|
|
|
|
bool l0_files_might_overlap_;
|
|
|
|
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
// Compaction input files organized by level. Constant after construction
|
|
|
|
const std::vector<CompactionInputFiles> inputs_;
|
|
|
|
|
|
|
|
// A copy of inputs_, organized more closely in memory
|
|
|
|
autovector<LevelFilesBrief, 2> input_levels_;
|
|
|
|
|
|
|
|
// State used to check for number of overlapping grandparent files
|
|
|
|
// (grandparent == "output_level_ + 1")
|
|
|
|
std::vector<FileMetaData*> grandparents_;
|
|
|
|
const double score_; // score that was used to pick this compaction.
|
|
|
|
|
|
|
|
// Is this compaction creating a file in the bottom most level?
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
const bool bottommost_level_;
|
|
|
|
// Does this compaction include all sst files?
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
const bool is_full_compaction_;
|
|
|
|
|
|
|
|
// Is this compaction requested by the client?
|
Make Compaction class easier to use
Summary:
The goal of this diff is to make Compaction class easier to use. This should also make new compaction algorithms easier to write (like CompactFiles from @yhchiang and dynamic leveled and multi-leveled universal from @sdong).
Here are couple of things demonstrating that Compaction class is hard to use:
1. we have two constructors of Compaction class
2. there's this thing called grandparents_, but it appears to only be setup for leveled compaction and not compactfiles
3. it's easy to introduce a subtle and dangerous bug like this: D36225
4. SetupBottomMostLevel() is hard to understand and it shouldn't be. See this comment: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction.cc#L236-L241. It also made it harder for @yhchiang to write CompactFiles, as evidenced by this: https://github.com/facebook/rocksdb/blob/afbafeaeaebfd27a0f3e992fee8e0c57d07658fa/db/compaction_picker.cc#L204-L210
The problem is that we create Compaction object, which holds a lot of state, and then pass it around to some functions. After those functions are done mutating, then we call couple of functions on Compaction object, like SetupBottommostLevel() and MarkFilesBeingCompacted(). It is very hard to see what's happening with all that Compaction's state while it's travelling across different functions. If you're writing a new PickCompaction() function you need to try really hard to understand what are all the functions you need to run on Compaction object and what state you need to setup.
My proposed solution is to make important parts of Compaction immutable after construction. PickCompaction() should calculate compaction inputs and then pass them onto Compaction object once they are finalized. That makes it easy to create a new compaction -- just provide all the parameters to the constructor and you're done. No need to call confusing functions after you created your object.
This diff doesn't fully achieve that goal, but it comes pretty close. Here are some of the changes:
* have one Compaction constructor instead of two.
* inputs_ is constant after construction
* MarkFilesBeingCompacted() is now private to Compaction class and automatically called on construction/destruction.
* SetupBottommostLevel() is gone. Compaction figures it out on its own based on the input.
* CompactionPicker's functions are not passing around Compaction object anymore. They are only passing around the state that they need.
Test Plan:
make check
make asan_check
make valgrind_check
Reviewers: rven, anthony, sdong, yhchiang
Reviewed By: yhchiang
Subscribers: sdong, yhchiang, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D36687
10 years ago
|
|
|
const bool is_manual_compaction_;
|
|
|
|
|
|
|
|
// The data with timestamp > trim_ts_ will be removed
|
|
|
|
const std::string trim_ts_;
|
|
|
|
|
|
|
|
// True if we can do trivial move in Universal multi level
|
|
|
|
// compaction
|
|
|
|
bool is_trivial_move_;
|
|
|
|
|
|
|
|
// Does input compression match the output compression?
|
|
|
|
bool InputCompressionMatchesOutput() const;
|
|
|
|
|
|
|
|
// table properties of output files
|
|
|
|
TablePropertiesCollection output_table_properties_;
|
|
|
|
|
|
|
|
// smallest user keys in compaction
|
|
|
|
// includes timestamp if user-defined timestamp is enabled.
|
|
|
|
Slice smallest_user_key_;
|
|
|
|
|
|
|
|
// largest user keys in compaction
|
|
|
|
// includes timestamp if user-defined timestamp is enabled.
|
|
|
|
Slice largest_user_key_;
|
|
|
|
|
|
|
|
// Reason for compaction
|
|
|
|
CompactionReason compaction_reason_;
|
|
|
|
|
|
|
|
// Notify on compaction completion only if listener was notified on compaction
|
|
|
|
// begin.
|
|
|
|
bool notify_on_compaction_completion_;
|
|
|
|
|
|
|
|
// Enable/disable GC collection for blobs during compaction.
|
|
|
|
bool enable_blob_garbage_collection_;
|
|
|
|
|
|
|
|
// Blob garbage collection age cutoff.
|
|
|
|
double blob_garbage_collection_age_cutoff_;
|
|
|
|
|
|
|
|
// only set when per_key_placement feature is enabled, -1 (kInvalidLevel)
|
|
|
|
// means not supported.
|
|
|
|
const int penultimate_level_;
|
|
|
|
|
|
|
|
// Key range for penultimate level output
|
|
|
|
// includes timestamp if user-defined timestamp is enabled.
|
|
|
|
// penultimate_output_range_type_ shows the range type
|
|
|
|
Slice penultimate_level_smallest_user_key_;
|
|
|
|
Slice penultimate_level_largest_user_key_;
|
|
|
|
PenultimateOutputRangeType penultimate_output_range_type_ =
|
|
|
|
PenultimateOutputRangeType::kNotSupported;
|
|
|
|
};
|
|
|
|
|
|
|
|
#ifndef NDEBUG
|
|
|
|
// Helper struct only for tests, which contains the data to decide if a key
|
|
|
|
// should be output to the penultimate level.
|
|
|
|
// TODO: remove this when the public feature knob is available
|
|
|
|
struct PerKeyPlacementContext {
|
|
|
|
const int level;
|
|
|
|
const Slice key;
|
|
|
|
const Slice value;
|
|
|
|
const SequenceNumber seq_num;
|
|
|
|
|
|
|
|
bool output_to_penultimate_level;
|
|
|
|
|
|
|
|
PerKeyPlacementContext(int _level, Slice _key, Slice _value,
|
|
|
|
SequenceNumber _seq_num)
|
|
|
|
: level(_level), key(_key), value(_value), seq_num(_seq_num) {
|
|
|
|
output_to_penultimate_level = false;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
#endif /* !NDEBUG */
|
|
|
|
|
|
|
|
// Return sum of sizes of all files in `files`.
|
|
|
|
extern uint64_t TotalFileSize(const std::vector<FileMetaData*>& files);
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|