|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
|
|
|
|
#include "table/mock_table.h"
|
|
|
|
|
|
|
|
#include "db/dbformat.h"
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations and OS-related operations. Most of the APIs return a Status, which does not carry enough metadata about an error, such as whether it is retryable or the scope (i.e., fault domain) of the error, that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, and hints about placement and redundancy.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
#include "env/composite_env_wrapper.h"
|
|
|
|
#include "file/random_access_file_reader.h"
|
|
|
|
#include "port/port.h"
|
|
|
|
#include "rocksdb/table_properties.h"
|
|
|
|
#include "table/get_context.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
namespace mock {
|
|
|
|
|
|
|
|
// Builds a mock file (a KVVector) holding the key/value pairs listed in `l`,
// in the order given.
KVVector MakeMockFile(std::initializer_list<KVPair> l) {
  KVVector mock_file(l);
  return mock_file;
}
|
|
|
|
|
|
|
|
// Sorts `kv_vector` in place by key, ordering keys with an
// InternalKeyComparator built on top of the user comparator `ucmp`.
void SortKVVector(KVVector* kv_vector, const Comparator* ucmp) {
  const InternalKeyComparator icmp(ucmp);
  // Take the pairs by const reference: passing them by value would copy two
  // key/value string pairs for every single comparison made by std::sort.
  std::sort(kv_vector->begin(), kv_vector->end(),
            [&icmp](const KVPair& lhs, const KVPair& rhs) -> bool {
              return icmp.Compare(lhs.first, rhs.first) < 0;
            });
}
|
|
|
|
|
|
|
|
class MockTableReader : public TableReader {
|
|
|
|
public:
|
|
|
|
explicit MockTableReader(const KVVector& table) : table_(table) {}
|
|
|
|
|
|
|
|
InternalIterator* NewIterator(const ReadOptions&,
|
|
|
|
const SliceTransform* prefix_extractor,
|
|
|
|
Arena* arena, bool skip_filters,
|
|
|
|
TableReaderCaller caller,
|
|
|
|
size_t compaction_readahead_size = 0,
|
|
|
|
bool allow_unprepared_value = false) override;
|
|
|
|
|
|
|
|
Status Get(const ReadOptions& readOptions, const Slice& key,
|
|
|
|
GetContext* get_context, const SliceTransform* prefix_extractor,
|
|
|
|
bool skip_filters = false) override;
|
|
|
|
|
|
|
|
uint64_t ApproximateOffsetOf(const ReadOptions& /*read_options*/,
|
|
|
|
const Slice& /*key*/,
|
|
|
|
TableReaderCaller /*caller*/) override {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t ApproximateSize(const ReadOptions& /*read_options*/,
|
|
|
|
const Slice& /*start*/, const Slice& /*end*/,
|
|
|
|
TableReaderCaller /*caller*/) override {
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t ApproximateMemoryUsage() const override { return 0; }
|
|
|
|
|
|
|
|
void SetupForCompaction() override {}
|
|
|
|
|
|
|
|
std::shared_ptr<const TableProperties> GetTableProperties() const override;
|
|
|
|
|
|
|
|
~MockTableReader() {}
|
|
|
|
|
|
|
|
private:
|
|
|
|
const KVVector& table_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class MockTableIterator : public InternalIterator {
|
|
|
|
public:
|
|
|
|
explicit MockTableIterator(const KVVector& table) : table_(table) {
|
|
|
|
itr_ = table_.end();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Valid() const override { return itr_ != table_.end(); }
|
|
|
|
|
|
|
|
void SeekToFirst() override { itr_ = table_.begin(); }
|
|
|
|
|
|
|
|
void SeekToLast() override {
|
|
|
|
itr_ = table_.end();
|
|
|
|
--itr_;
|
|
|
|
}
|
|
|
|
|
|
|
|
void Seek(const Slice& target) override {
|
|
|
|
KVPair target_pair(target.ToString(), "");
|
|
|
|
InternalKeyComparator icmp(BytewiseComparator());
|
|
|
|
itr_ = std::lower_bound(table_.begin(), table_.end(), target_pair,
|
|
|
|
[icmp](KVPair a, KVPair b) -> bool {
|
|
|
|
return icmp.Compare(a.first, b.first) < 0;
|
|
|
|
});
|
|
|
|
}
|
|
|
|
|
|
|
|
void SeekForPrev(const Slice& target) override {
|
|
|
|
KVPair target_pair(target.ToString(), "");
|
|
|
|
InternalKeyComparator icmp(BytewiseComparator());
|
|
|
|
itr_ = std::upper_bound(table_.begin(), table_.end(), target_pair,
|
|
|
|
[icmp](KVPair a, KVPair b) -> bool {
|
|
|
|
return icmp.Compare(a.first, b.first) < 0;
|
|
|
|
});
|
|
|
|
Prev();
|
|
|
|
}
|
|
|
|
|
|
|
|
void Next() override { ++itr_; }
|
|
|
|
|
|
|
|
void Prev() override {
|
|
|
|
if (itr_ == table_.begin()) {
|
|
|
|
itr_ = table_.end();
|
|
|
|
} else {
|
|
|
|
--itr_;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Slice key() const override { return Slice(itr_->first); }
|
|
|
|
|
|
|
|
Slice value() const override { return Slice(itr_->second); }
|
|
|
|
|
|
|
|
Status status() const override { return Status::OK(); }
|
|
|
|
|
|
|
|
private:
|
|
|
|
const KVVector& table_;
|
|
|
|
KVVector::const_iterator itr_;
|
|
|
|
};
|
|
|
|
|
|
|
|
class MockTableBuilder : public TableBuilder {
|
|
|
|
public:
|
|
|
|
MockTableBuilder(uint32_t id, MockTableFileSystem* file_system,
|
|
|
|
MockTableFactory::MockCorruptionMode corrupt_mode =
|
Align compaction output file boundaries to the next level ones (#10655)
Summary:
Try to align the compaction output file boundaries to the next level ones
(grandparent level), to reduce the level compaction write-amplification.
In level compaction, there are "wasted" data at the beginning and end of the
output level files. Align the file boundary can avoid such "wasted" compaction.
With this PR, it tries to align the non-bottommost level file boundaries to its
next level ones. It may cut file when the file size is large enough (at least
50% of target_file_size) and not too large (2x target_file_size).
db_bench shows about 12.56% compaction reduction:
```
TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
# baseline:
Flush(GB): cumulative 25.882, interval 7.216
Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
# with this change:
Flush(GB): cumulative 25.882, interval 7.753
Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
```
The compaction simulator shows a similar result (14% with 100G random data).
As a side effect, with this PR, the SST file size can exceed the
target_file_size, but is capped at 2x target_file_size. And there will be
smaller files. Here are file size statistics when loading 100GB with the target
file size 32MB:
```
baseline this_PR
count 1.656000e+03 1.705000e+03
mean 3.116062e+07 3.028076e+07
std 7.145242e+06 8.046139e+06
```
The feature is enabled by default, to revert to the old behavior disable it
with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false`
Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for
use case like user adding 2 or more non-overlapping data range at the same
time, it can reduce the overlapping of 2 datasets in the lower levels.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655
Reviewed By: cbi42
Differential Revision: D39552321
Pulled By: jay-zhuang
fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2 years ago
|
|
|
MockTableFactory::kCorruptNone,
|
|
|
|
size_t key_value_size = 1)
|
|
|
|
: id_(id),
|
|
|
|
file_system_(file_system),
|
|
|
|
corrupt_mode_(corrupt_mode),
|
|
|
|
key_value_size_(key_value_size) {
|
|
|
|
table_ = MakeMockFile({});
|
|
|
|
}
|
|
|
|
|
|
|
|
// REQUIRES: Either Finish() or Abandon() has been called.
|
|
|
|
~MockTableBuilder() {}
|
|
|
|
|
|
|
|
// Add key,value to the table being constructed.
|
|
|
|
// REQUIRES: key is after any previously added key according to comparator.
|
|
|
|
// REQUIRES: Finish(), Abandon() have not been called
|
|
|
|
void Add(const Slice& key, const Slice& value) override {
|
|
|
|
if (corrupt_mode_ == MockTableFactory::kCorruptValue) {
|
|
|
|
// Corrupt the value
|
|
|
|
table_.push_back({key.ToString(), value.ToString() + " "});
|
|
|
|
corrupt_mode_ = MockTableFactory::kCorruptNone;
|
|
|
|
} else if (corrupt_mode_ == MockTableFactory::kCorruptKey) {
|
|
|
|
table_.push_back({key.ToString() + " ", value.ToString()});
|
|
|
|
corrupt_mode_ = MockTableFactory::kCorruptNone;
|
|
|
|
} else if (corrupt_mode_ == MockTableFactory::kCorruptReorderKey) {
|
|
|
|
if (prev_key_.empty()) {
|
|
|
|
prev_key_ = key.ToString();
|
|
|
|
prev_value_ = value.ToString();
|
|
|
|
} else {
|
|
|
|
table_.push_back({key.ToString(), value.ToString()});
|
|
|
|
table_.push_back({prev_key_, prev_value_});
|
|
|
|
corrupt_mode_ = MockTableFactory::kCorruptNone;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
table_.push_back({key.ToString(), value.ToString()});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return non-ok iff some error has been detected.
|
|
|
|
Status status() const override { return Status::OK(); }
|
|
|
|
|
|
|
|
// Return non-ok iff some error happens during IO.
|
|
|
|
IOStatus io_status() const override { return IOStatus::OK(); }
|
|
|
|
|
|
|
|
Status Finish() override {
|
|
|
|
MutexLock lock_guard(&file_system_->mutex);
|
|
|
|
file_system_->files.insert({id_, table_});
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
void Abandon() override {}
|
|
|
|
|
|
|
|
uint64_t NumEntries() const override { return table_.size(); }
|
|
|
|
|
Align compaction output file boundaries to the next level ones (#10655)
Summary:
Try to align the compaction output file boundaries to the next level ones
(grandparent level), to reduce the level compaction write-amplification.
In level compaction, there are "wasted" data at the beginning and end of the
output level files. Align the file boundary can avoid such "wasted" compaction.
With this PR, it tries to align the non-bottommost level file boundaries to its
next level ones. It may cut file when the file size is large enough (at least
50% of target_file_size) and not too large (2x target_file_size).
db_bench shows about 12.56% compaction reduction:
```
TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
# baseline:
Flush(GB): cumulative 25.882, interval 7.216
Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
# with this change:
Flush(GB): cumulative 25.882, interval 7.753
Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
```
The compaction simulator shows a similar result (14% with 100G random data).
As a side effect, with this PR, the SST file size can exceed the
target_file_size, but is capped at 2x target_file_size. And there will be
smaller files. Here are file size statistics when loading 100GB with the target
file size 32MB:
```
baseline this_PR
count 1.656000e+03 1.705000e+03
mean 3.116062e+07 3.028076e+07
std 7.145242e+06 8.046139e+06
```
The feature is enabled by default, to revert to the old behavior disable it
with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false`
Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for
use case like user adding 2 or more non-overlapping data range at the same
time, it can reduce the overlapping of 2 datasets in the lower levels.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655
Reviewed By: cbi42
Differential Revision: D39552321
Pulled By: jay-zhuang
fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2 years ago
|
|
|
uint64_t FileSize() const override { return table_.size() * key_value_size_; }
|
|
|
|
|
|
|
|
TableProperties GetTableProperties() const override {
|
|
|
|
return TableProperties();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get file checksum
|
|
|
|
std::string GetFileChecksum() const override { return kUnknownFileChecksum; }
|
|
|
|
// Get file checksum function name
|
|
|
|
const char* GetFileChecksumFuncName() const override {
|
|
|
|
return kUnknownFileChecksumFuncName;
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
uint32_t id_;
|
|
|
|
std::string prev_key_;
|
|
|
|
std::string prev_value_;
|
|
|
|
MockTableFileSystem* file_system_;
|
|
|
|
int corrupt_mode_;
|
|
|
|
KVVector table_;
|
Align compaction output file boundaries to the next level ones (#10655)
Summary:
Try to align the compaction output file boundaries to the next level ones
(grandparent level), to reduce the level compaction write-amplification.
In level compaction, there are "wasted" data at the beginning and end of the
output level files. Align the file boundary can avoid such "wasted" compaction.
With this PR, it tries to align the non-bottommost level file boundaries to its
next level ones. It may cut file when the file size is large enough (at least
50% of target_file_size) and not too large (2x target_file_size).
db_bench shows about 12.56% compaction reduction:
```
TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
# baseline:
Flush(GB): cumulative 25.882, interval 7.216
Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
# with this change:
Flush(GB): cumulative 25.882, interval 7.753
Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
```
The compaction simulator shows a similar result (14% with 100G random data).
As a side effect, with this PR, the SST file size can exceed the
target_file_size, but is capped at 2x target_file_size. And there will be
smaller files. Here are file size statistics when loading 100GB with the target
file size 32MB:
```
baseline this_PR
count 1.656000e+03 1.705000e+03
mean 3.116062e+07 3.028076e+07
std 7.145242e+06 8.046139e+06
```
The feature is enabled by default, to revert to the old behavior disable it
with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false`
Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for
use case like user adding 2 or more non-overlapping data range at the same
time, it can reduce the overlapping of 2 datasets in the lower levels.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655
Reviewed By: cbi42
Differential Revision: D39552321
Pulled By: jay-zhuang
fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2 years ago
|
|
|
size_t key_value_size_;
|
|
|
|
};
|
|
|
|
|
|
|
|
InternalIterator* MockTableReader::NewIterator(
|
|
|
|
const ReadOptions&, const SliceTransform* /* prefix_extractor */,
|
|
|
|
Arena* /*arena*/, bool /*skip_filters*/, TableReaderCaller /*caller*/,
|
Properly report IO errors when IndexType::kBinarySearchWithFirstKey is used (#6621)
Summary:
Context: Index type `kBinarySearchWithFirstKey` added the ability for sst file iterator to sometimes report a key from index without reading the corresponding data block. This is useful when sst blocks are cut at some meaningful boundaries (e.g. one block per key prefix), and many seeks land between blocks (e.g. for each prefix, the ranges of keys in different sst files are nearly disjoint, so a typical seek needs to read a data block from only one file even if all files have the prefix). But this added a new error condition, which rocksdb code was really not equipped to deal with: `InternalIterator::value()` may fail with an IO error or Status::Incomplete, but it's just a method returning a Slice, with no way to report error instead. Before this PR, this type of error wasn't handled at all (an empty slice was returned), and kBinarySearchWithFirstKey implementation was considered a prototype.
Now that we (LogDevice) have experimented with kBinarySearchWithFirstKey for a while and confirmed that it's really useful, this PR is adding the missing error handling.
It's a pretty inconvenient situation implementation-wise. The error needs to be reported from InternalIterator when trying to access value. But there are ~700 call sites of `InternalIterator::value()`, most of which either can't hit the error condition (because the iterator is reading from memtable or from index or something) or wouldn't benefit from the deferred loading of the value (e.g. compaction iterator that reads all values anyway). Adding error handling to all these call sites would needlessly bloat the code. So instead I made the deferred value loading optional: only the call sites that may use deferred loading have to call the new method `PrepareValue()` before calling `value()`. The feature is enabled with a new bool argument `allow_unprepared_value` to a bunch of methods that create iterators (it wouldn't make sense to put it in ReadOptions because it's completely internal to iterators, with virtually no user-visible effect). Lmk if you have better ideas.
Note that the deferred value loading only happens for *internal* iterators. The user-visible iterator (DBIter) always prepares the value before returning from Seek/Next/etc. We could go further and add an API to defer that value loading too, but that's most likely not useful for LogDevice, so it doesn't seem worth the complexity for now.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/6621
Test Plan: make -j5 check . Will also deploy to some logdevice test clusters and look at stats.
Reviewed By: siying
Differential Revision: D20786930
Pulled By: al13n321
fbshipit-source-id: 6da77d918bad3780522e918f17f4d5513d3e99ee
5 years ago
|
|
|
size_t /*compaction_readahead_size*/, bool /* allow_unprepared_value */) {
|
|
|
|
return new MockTableIterator(table_);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Seeks to `key` and hands each following entry to `get_context` until
// SaveValue() returns false or the table is exhausted.
//
// Returns the parse error if an entry's key is not a valid internal key,
// otherwise OK.
Status MockTableReader::Get(const ReadOptions&, const Slice& key,
                            GetContext* get_context,
                            const SliceTransform* /*prefix_extractor*/,
                            bool /*skip_filters*/) {
  MockTableIterator scan(table_);
  scan.Seek(key);
  while (scan.Valid()) {
    ParsedInternalKey parsed_key;
    Status pik_status =
        ParseInternalKey(scan.key(), &parsed_key, true /* log_err_key */);
    if (!pik_status.ok()) {
      return pik_status;
    }

    // SaveValue() needs somewhere to write its "matched" flag; the result is
    // not used here.
    bool dont_care __attribute__((__unused__));
    const bool keep_going =
        get_context->SaveValue(parsed_key, scan.value(), &dont_care);
    if (!keep_going) {
      break;
    }
    scan.Next();
  }
  return Status::OK();
}
|
|
|
|
|
|
|
|
std::shared_ptr<const TableProperties> MockTableReader::GetTableProperties()
|
|
|
|
const {
|
|
|
|
TableProperties* tp = new TableProperties();
|
|
|
|
tp->num_entries = table_.size();
|
|
|
|
tp->num_range_deletions = 0;
|
|
|
|
tp->raw_key_size = 1;
|
|
|
|
tp->raw_value_size = 1;
|
|
|
|
|
|
|
|
return std::shared_ptr<const TableProperties>(tp);
|
|
|
|
}
|
|
|
|
|
|
|
|
// File ids start at 1, and no corruption is injected by default.
MockTableFactory::MockTableFactory()
    : next_id_(1), corrupt_mode_(kCorruptNone) {}
|
|
|
|
|
|
|
|
// Opens a reader for the mock table whose id is stored in `file`.
//
// Returns the read error if the id cannot be read, IOError if no mock file
// with that id is registered, and OK otherwise (with `*table_reader` set).
// The reader refers to the registered contents rather than copying them.
Status MockTableFactory::NewTableReader(
    const ReadOptions& /*ro*/,
    const TableReaderOptions& /*table_reader_options*/,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t /*file_size*/,
    std::unique_ptr<TableReader>* table_reader,
    bool /*prefetch_index_and_filter_in_cache*/) const {
  uint32_t file_id;
  Status id_status = GetIDFromFile(file.get(), &file_id);
  if (!id_status.ok()) {
    return id_status;
  }

  MutexLock lock_guard(&file_system_.mutex);

  const auto found = file_system_.files.find(file_id);
  if (found == file_system_.files.end()) {
    return Status::IOError("Mock file not found");
  }

  table_reader->reset(new MockTableReader(found->second));
  return Status::OK();
}
|
|
|
|
|
|
|
|
// Allocates the next file id, writes it to `file`, and returns a new builder
// that will publish its entries under that id. The caller owns the returned
// builder. A failed id write is only caught by the assert.
TableBuilder* MockTableFactory::NewTableBuilder(
    const TableBuilderOptions& /*table_builder_options*/,
    WritableFileWriter* file) const {
  uint32_t file_id;
  Status write_status = GetAndWriteNextID(file, &file_id);
  assert(write_status.ok());

  TableBuilder* builder = new MockTableBuilder(file_id, &file_system_,
                                               corrupt_mode_, key_value_size_);
  return builder;
}
|
|
|
|
|
|
|
|
// Creates the file `fname` in `env`'s file system, writes a fresh mock file
// id into it, and registers `file_contents` under that id.
//
// Returns the first error from creating the file or writing the id; the
// contents are registered only when both steps succeed.
Status MockTableFactory::CreateMockTable(Env* env, const std::string& fname,
                                         KVVector file_contents) {
  std::unique_ptr<WritableFileWriter> file_writer;
  Status s = WritableFileWriter::Create(env->GetFileSystem(), fname,
                                        FileOptions(), &file_writer, nullptr);
  if (!s.ok()) {
    return s;
  }
  uint32_t id;
  s = GetAndWriteNextID(file_writer.get(), &id);
  if (s.ok()) {
    // Guard the shared file map the same way MockTableBuilder::Finish() and
    // NewTableReader() do, so concurrent builders/readers cannot race with
    // this insertion.
    MutexLock lock_guard(&file_system_.mutex);
    file_system_.files.insert({id, std::move(file_contents)});
  }
  return s;
}
|
|
|
|
|
|
|
|
// Reserves the next file id, stores it in `*next_id`, and appends it to
// `file` as a 4-byte fixed-width value. `*next_id` is set even when the
// append fails; the append's status is returned.
Status MockTableFactory::GetAndWriteNextID(WritableFileWriter* file,
                                           uint32_t* next_id) const {
  const uint32_t reserved_id = next_id_.fetch_add(1);
  *next_id = reserved_id;

  char encoded[4];
  EncodeFixed32(encoded, reserved_id);
  return file->Append(Slice(encoded, 4));
}
|
|
|
|
|
|
|
|
// Reads the 4-byte mock file id stored at offset 0 of `file` into `*id`.
//
// Returns the read error if the read fails, and Corruption if fewer than
// 4 bytes are available. `*id` is written only on success.
Status MockTableFactory::GetIDFromFile(RandomAccessFileReader* file,
                                       uint32_t* id) const {
  char buf[4];
  Slice result;
  Status s = file->Read(IOOptions(), 0, 4, &result, buf, nullptr,
                        Env::IO_TOTAL /* rate_limiter_priority */);
  if (!s.ok()) {
    // Do not decode: nothing usable was read and `buf` may be uninitialized.
    return s;
  }
  if (result.size() != sizeof(buf)) {
    return Status::Corruption("Mock file is too short to contain an ID");
  }
  // Decode from `result` rather than `buf`: the reader reports where the
  // bytes are through `result`, which is not guaranteed to alias the scratch
  // buffer.
  *id = DecodeFixed32(result.data());
  return s;
}
|
|
|
|
|
|
|
|
// Test helper: asserts that exactly one mock file is registered and that its
// contents equal `file_contents`.
void MockTableFactory::AssertSingleFile(const KVVector& file_contents) {
  // There must be one file, and only one.
  ASSERT_EQ(file_system_.files.size(), 1U);
  // That file must hold the expected entries, in the expected order.
  ASSERT_EQ(file_contents, file_system_.files.begin()->second);
}
|
|
|
|
|
Align compaction output file boundaries to the next level ones (#10655)
Summary:
Try to align the compaction output file boundaries to the next level ones
(grandparent level), to reduce the level compaction write-amplification.
In level compaction, there are "wasted" data at the beginning and end of the
output level files. Aligning the file boundaries can avoid such "wasted" compaction.
With this PR, it tries to align the non-bottommost level file boundaries to its
next level ones. It may cut file when the file size is large enough (at least
50% of target_file_size) and not too large (2x target_file_size).
db_bench shows about 12.56% compaction reduction:
```
TEST_TMPDIR=/data/dbbench2 ./db_bench --benchmarks=fillrandom,readrandom -max_background_jobs=12 -num=400000000 -target_file_size_base=33554432
# baseline:
Flush(GB): cumulative 25.882, interval 7.216
Cumulative compaction: 285.90 GB write, 162.36 MB/s write, 269.68 GB read, 153.15 MB/s read, 2926.7 seconds
# with this change:
Flush(GB): cumulative 25.882, interval 7.753
Cumulative compaction: 249.97 GB write, 141.96 MB/s write, 233.74 GB read, 132.74 MB/s read, 2534.9 seconds
```
The compaction simulator shows a similar result (14% with 100G random data).
As a side effect, with this PR, the SST file size can exceed the
target_file_size, but is capped at 2x target_file_size. And there will be
smaller files. Here are file size statistics when loading 100GB with the target
file size 32MB:
```
baseline this_PR
count 1.656000e+03 1.705000e+03
mean 3.116062e+07 3.028076e+07
std 7.145242e+06 8.046139e+06
```
The feature is enabled by default, to revert to the old behavior disable it
with `AdvancedColumnFamilyOptions.level_compaction_dynamic_file_size = false`
Also includes https://github.com/facebook/rocksdb/issues/1963 to cut file before skippable grandparent file. Which is for
use case like user adding 2 or more non-overlapping data range at the same
time, it can reduce the overlapping of 2 datasets in the lower levels.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10655
Reviewed By: cbi42
Differential Revision: D39552321
Pulled By: jay-zhuang
fbshipit-source-id: 640d15f159ab0cd973f2426cfc3af266fc8bdde2
2 years ago
|
|
|
void MockTableFactory::AssertLatestFiles(
|
|
|
|
const std::vector<KVVector>& files_contents) {
|
|
|
|
ASSERT_GE(file_system_.files.size(), files_contents.size());
|
|
|
|
auto it = file_system_.files.rbegin();
|
|
|
|
for (auto expect = files_contents.rbegin(); expect != files_contents.rend();
|
|
|
|
expect++, it++) {
|
|
|
|
ASSERT_TRUE(it != file_system_.files.rend());
|
|
|
|
if (*expect != it->second) {
|
|
|
|
std::cout << "Wrong content! Content of file, expect:" << std::endl;
|
|
|
|
for (const auto& kv : *expect) {
|
|
|
|
ParsedInternalKey ikey;
|
|
|
|
std::string key, value;
|
|
|
|
std::tie(key, value) = kv;
|
|
|
|
ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */));
|
|
|
|
std::cout << ikey.DebugString(true, false) << " -> " << value
|
|
|
|
<< std::endl;
|
|
|
|
}
|
|
|
|
std::cout << "actual:" << std::endl;
|
|
|
|
for (const auto& kv : it->second) {
|
|
|
|
ParsedInternalKey ikey;
|
|
|
|
std::string key, value;
|
|
|
|
std::tie(key, value) = kv;
|
|
|
|
ASSERT_OK(ParseInternalKey(Slice(key), &ikey, true /* log_err_key */));
|
|
|
|
std::cout << ikey.DebugString(true, false) << " -> " << value
|
|
|
|
<< std::endl;
|
|
|
|
}
|
|
|
|
FAIL();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace mock
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|