|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under both the GPLv2 (found in the
|
|
|
|
// COPYING file in the root directory) and Apache 2.0 License
|
|
|
|
// (found in the LICENSE.Apache file in the root directory).
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/column_family.h"
|
|
|
|
|
|
|
|
#include <algorithm>
|
|
|
|
#include <cinttypes>
|
|
|
|
#include <limits>
|
|
|
|
#include <sstream>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/blob/blob_file_cache.h"
|
|
|
|
#include "db/blob/blob_source.h"
|
|
|
|
#include "db/compaction/compaction_picker.h"
|
|
|
|
#include "db/compaction/compaction_picker_fifo.h"
|
|
|
|
#include "db/compaction/compaction_picker_level.h"
|
|
|
|
#include "db/compaction/compaction_picker_universal.h"
|
|
|
|
#include "db/db_impl/db_impl.h"
|
|
|
|
#include "db/internal_stats.h"
|
|
|
|
#include "db/job_context.h"
|
|
|
|
#include "db/range_del_aggregator.h"
|
|
|
|
#include "db/table_properties_collector.h"
|
|
|
|
#include "db/version_set.h"
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
#include "db/write_controller.h"
|
|
|
|
#include "file/sst_file_manager_impl.h"
|
|
|
|
#include "logging/logging.h"
|
|
|
|
#include "monitoring/thread_status_util.h"
|
|
|
|
#include "options/options_helper.h"
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
#include "port/port.h"
|
|
|
|
#include "rocksdb/convenience.h"
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include "table/merging_iterator.h"
|
|
|
|
#include "util/autovector.h"
|
|
|
|
#include "util/cast_util.h"
|
|
|
|
#include "util/compression.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
|
|
|
|
ColumnFamilyHandleImpl::ColumnFamilyHandleImpl(
|
|
|
|
ColumnFamilyData* column_family_data, DBImpl* db, InstrumentedMutex* mutex)
|
|
|
|
: cfd_(column_family_data), db_(db), mutex_(mutex) {
|
|
|
|
if (cfd_ != nullptr) {
|
|
|
|
cfd_->Ref();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyHandleImpl::~ColumnFamilyHandleImpl() {
|
|
|
|
if (cfd_ != nullptr) {
|
|
|
|
for (auto& listener : cfd_->ioptions()->listeners) {
|
|
|
|
listener->OnColumnFamilyHandleDeletionStarted(this);
|
|
|
|
}
|
|
|
|
// Job id == 0 means that this is not our background process, but rather
|
|
|
|
// user thread
|
|
|
|
// Need to hold some shared pointers owned by the initial_cf_options
|
|
|
|
// before final cleaning up finishes.
|
|
|
|
ColumnFamilyOptions initial_cf_options_copy = cfd_->initial_cf_options();
|
|
|
|
JobContext job_context(0);
|
|
|
|
mutex_->Lock();
|
|
|
|
bool dropped = cfd_->IsDropped();
|
|
|
|
if (cfd_->UnrefAndTryDelete()) {
|
|
|
|
if (dropped) {
|
|
|
|
db_->FindObsoleteFiles(&job_context, false, true);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_->Unlock();
|
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
bool defer_purge =
|
|
|
|
db_->immutable_db_options().avoid_unnecessary_blocking_io;
|
|
|
|
db_->PurgeObsoleteFiles(job_context, defer_purge);
|
|
|
|
}
|
|
|
|
job_context.Clean();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t ColumnFamilyHandleImpl::GetID() const { return cfd()->GetID(); }
|
|
|
|
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
const std::string& ColumnFamilyHandleImpl::GetName() const {
|
|
|
|
return cfd()->GetName();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status ColumnFamilyHandleImpl::GetDescriptor(ColumnFamilyDescriptor* desc) {
|
|
|
|
// accessing mutable cf-options requires db mutex.
|
|
|
|
InstrumentedMutexLock l(mutex_);
|
|
|
|
*desc = ColumnFamilyDescriptor(cfd()->GetName(), cfd()->GetLatestCFOptions());
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
const Comparator* ColumnFamilyHandleImpl::GetComparator() const {
|
|
|
|
return cfd()->user_comparator();
|
|
|
|
}
|
|
|
|
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
10 years ago
|
|
|
void GetIntTblPropCollectorFactory(
|
|
|
|
const ImmutableCFOptions& ioptions,
|
|
|
|
IntTblPropCollectorFactories* int_tbl_prop_collector_factories) {
|
|
|
|
assert(int_tbl_prop_collector_factories);
|
|
|
|
|
|
|
|
auto& collector_factories = ioptions.table_properties_collector_factories;
|
|
|
|
for (size_t i = 0; i < ioptions.table_properties_collector_factories.size();
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
10 years ago
|
|
|
++i) {
|
|
|
|
assert(collector_factories[i]);
|
|
|
|
int_tbl_prop_collector_factories->emplace_back(
|
|
|
|
new UserKeyTablePropertiesCollectorFactory(collector_factories[i]));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status CheckCompressionSupported(const ColumnFamilyOptions& cf_options) {
|
|
|
|
if (!cf_options.compression_per_level.empty()) {
|
|
|
|
for (size_t level = 0; level < cf_options.compression_per_level.size();
|
|
|
|
++level) {
|
|
|
|
if (!CompressionTypeSupported(cf_options.compression_per_level[level])) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Compression type " +
|
|
|
|
CompressionTypeToString(cf_options.compression_per_level[level]) +
|
|
|
|
" is not linked with the binary.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (!CompressionTypeSupported(cf_options.compression)) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Compression type " +
|
|
|
|
CompressionTypeToString(cf_options.compression) +
|
|
|
|
" is not linked with the binary.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (cf_options.compression_opts.zstd_max_train_bytes > 0) {
|
Support using ZDICT_finalizeDictionary to generate zstd dictionary (#9857)
Summary:
An untrained dictionary is currently simply the concatenation of several samples. The ZSTD API, ZDICT_finalizeDictionary(), can improve such a dictionary's effectiveness at low cost. This PR changes how dictionary is created by calling the ZSTD ZDICT_finalizeDictionary() API instead of creating raw content dictionary (when max_dict_buffer_bytes > 0), and pass in all buffered uncompressed data blocks as samples.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9857
Test Plan:
#### db_bench test for cpu/memory of compression+decompression and space saving on synthetic data:
Set up: change the parameter [here](https://github.com/facebook/rocksdb/blob/fb9a167a55e0970b1ef6f67c1600c8d9c4c6114f/tools/db_bench_tool.cc#L1766) to 16384 to make synthetic data more compressible.
```
# linked local ZSTD with version 1.5.2
# DEBUG_LEVEL=0 ROCKSDB_NO_FBCODE=1 ROCKSDB_DISABLE_ZSTD=1 EXTRA_CXXFLAGS="-DZSTD_STATIC_LINKING_ONLY -DZSTD -I/data/users/changyubi/install/include/" EXTRA_LDFLAGS="-L/data/users/changyubi/install/lib/ -l:libzstd.a" make -j32 db_bench
dict_bytes=16384
train_bytes=1048576
echo "========== No Dictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== Raw Content Dictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench_main -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench_main -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== FinalizeDictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== TrainDictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
# Result: TrainDictionary is much better on space saving, but FinalizeDictionary seems to use less memory.
# before compression data size: 1.2GB
dict_bytes=16384
max_dict_buffer_bytes = 1048576
space cpu/memory
No Dictionary 468M 14.93user 1.00system 0:15.92elapsed 100%CPU (0avgtext+0avgdata 23904maxresident)k
Raw Dictionary 251M 15.81user 0.80system 0:16.56elapsed 100%CPU (0avgtext+0avgdata 156808maxresident)k
FinalizeDictionary 236M 11.93user 0.64system 0:12.56elapsed 100%CPU (0avgtext+0avgdata 89548maxresident)k
TrainDictionary 84M 7.29user 0.45system 0:07.75elapsed 100%CPU (0avgtext+0avgdata 97288maxresident)k
```
#### Benchmark on 10 sample SST files for spacing saving and CPU time on compression:
FinalizeDictionary is comparable to TrainDictionary in terms of space saving, and takes less time in compression.
```
dict_bytes=16384
train_bytes=1048576
for sst_file in `ls ../temp/myrock-sst/`
do
echo "********** $sst_file **********"
echo "========== No Dictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD
echo "========== Raw Content Dictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes
echo "========== FinalizeDictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes --compression_use_zstd_finalize_dict
echo "========== TrainDictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes
done
010240.sst (Size/Time) 011029.sst 013184.sst 021552.sst 185054.sst 185137.sst 191666.sst 7560381.sst 7604174.sst 7635312.sst
No Dictionary 28165569 / 2614419 32899411 / 2976832 32977848 / 3055542 31966329 / 2004590 33614351 / 1755877 33429029 / 1717042 33611933 / 1776936 33634045 / 2771417 33789721 / 2205414 33592194 / 388254
Raw Content Dictionary 28019950 / 2697961 33748665 / 3572422 33896373 / 3534701 26418431 / 2259658 28560825 / 1839168 28455030 / 1846039 28494319 / 1861349 32391599 / 3095649 33772142 / 2407843 33592230 / 474523
FinalizeDictionary 27896012 / 2650029 33763886 / 3719427 33904283 / 3552793 26008225 / 2198033 28111872 / 1869530 28014374 / 1789771 28047706 / 1848300 32296254 / 3204027 33698698 / 2381468 33592344 / 517433
TrainDictionary 28046089 / 2740037 33706480 / 3679019 33885741 / 3629351 25087123 / 2204558 27194353 / 1970207 27234229 / 1896811 27166710 / 1903119 32011041 / 3322315 32730692 / 2406146 33608631 / 570593
```
#### Decompression/Read test:
With FinalizeDictionary/TrainDictionary, some data structure used for decompression are in stored in dictionary, so they are expected to be faster in terms of decompression/reads.
```
dict_bytes=16384
train_bytes=1048576
echo "No Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=0 > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=0 2>&1 | grep MB/s
echo "Raw Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes 2>&1 | grep MB/s
echo "FinalizeDict"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false 2>&1 | grep MB/s
echo "Train Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes 2>&1 | grep MB/s
No Dictionary
readrandom : 12.183 micros/op 82082 ops/sec 12.183 seconds 1000000 operations; 9.1 MB/s (1000000 of 1000000 found)
Raw Dictionary
readrandom : 12.314 micros/op 81205 ops/sec 12.314 seconds 1000000 operations; 9.0 MB/s (1000000 of 1000000 found)
FinalizeDict
readrandom : 9.787 micros/op 102180 ops/sec 9.787 seconds 1000000 operations; 11.3 MB/s (1000000 of 1000000 found)
Train Dictionary
readrandom : 9.698 micros/op 103108 ops/sec 9.699 seconds 1000000 operations; 11.4 MB/s (1000000 of 1000000 found)
```
Reviewed By: ajkr
Differential Revision: D35720026
Pulled By: cbi42
fbshipit-source-id: 24d230fdff0fd28a1bb650658798f00dfcfb2a1f
3 years ago
|
|
|
if (cf_options.compression_opts.use_zstd_dict_trainer) {
|
|
|
|
if (!ZSTD_TrainDictionarySupported()) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"zstd dictionary trainer cannot be used because ZSTD 1.1.3+ "
|
|
|
|
"is not linked with the binary.");
|
|
|
|
}
|
|
|
|
} else if (!ZSTD_FinalizeDictionarySupported()) {
|
|
|
|
return Status::InvalidArgument(
|
Support using ZDICT_finalizeDictionary to generate zstd dictionary (#9857)
Summary:
An untrained dictionary is currently simply the concatenation of several samples. The ZSTD API, ZDICT_finalizeDictionary(), can improve such a dictionary's effectiveness at low cost. This PR changes how dictionary is created by calling the ZSTD ZDICT_finalizeDictionary() API instead of creating raw content dictionary (when max_dict_buffer_bytes > 0), and pass in all buffered uncompressed data blocks as samples.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9857
Test Plan:
#### db_bench test for cpu/memory of compression+decompression and space saving on synthetic data:
Set up: change the parameter [here](https://github.com/facebook/rocksdb/blob/fb9a167a55e0970b1ef6f67c1600c8d9c4c6114f/tools/db_bench_tool.cc#L1766) to 16384 to make synthetic data more compressible.
```
# linked local ZSTD with version 1.5.2
# DEBUG_LEVEL=0 ROCKSDB_NO_FBCODE=1 ROCKSDB_DISABLE_ZSTD=1 EXTRA_CXXFLAGS="-DZSTD_STATIC_LINKING_ONLY -DZSTD -I/data/users/changyubi/install/include/" EXTRA_LDFLAGS="-L/data/users/changyubi/install/lib/ -l:libzstd.a" make -j32 db_bench
dict_bytes=16384
train_bytes=1048576
echo "========== No Dictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=0 -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== Raw Content Dictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench_main -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench_main -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== FinalizeDictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
echo "========== TrainDictionary =========="
TEST_TMPDIR=/dev/shm ./db_bench -benchmarks=filluniquerandom,compact -num=10000000 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 -max_background_jobs=24 -memtablerep=vector -allow_concurrent_memtable_write=false -disable_wal=true -max_write_buffer_number=8 >/dev/null 2>&1
TEST_TMPDIR=/dev/shm /usr/bin/time ./db_bench -use_existing_db=true -benchmarks=compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -block_size=4096 2>&1 | grep elapsed
du -hc /dev/shm/dbbench/*sst | grep total
# Result: TrainDictionary is much better on space saving, but FinalizeDictionary seems to use less memory.
# before compression data size: 1.2GB
dict_bytes=16384
max_dict_buffer_bytes = 1048576
space cpu/memory
No Dictionary 468M 14.93user 1.00system 0:15.92elapsed 100%CPU (0avgtext+0avgdata 23904maxresident)k
Raw Dictionary 251M 15.81user 0.80system 0:16.56elapsed 100%CPU (0avgtext+0avgdata 156808maxresident)k
FinalizeDictionary 236M 11.93user 0.64system 0:12.56elapsed 100%CPU (0avgtext+0avgdata 89548maxresident)k
TrainDictionary 84M 7.29user 0.45system 0:07.75elapsed 100%CPU (0avgtext+0avgdata 97288maxresident)k
```
#### Benchmark on 10 sample SST files for spacing saving and CPU time on compression:
FinalizeDictionary is comparable to TrainDictionary in terms of space saving, and takes less time in compression.
```
dict_bytes=16384
train_bytes=1048576
for sst_file in `ls ../temp/myrock-sst/`
do
echo "********** $sst_file **********"
echo "========== No Dictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD
echo "========== Raw Content Dictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes
echo "========== FinalizeDictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes --compression_use_zstd_finalize_dict
echo "========== TrainDictionary =========="
./sst_dump --file="../temp/myrock-sst/$sst_file" --command=recompress --compression_level_from=6 --compression_level_to=6 --compression_types=kZSTD --compression_max_dict_bytes=$dict_bytes --compression_zstd_max_train_bytes=$train_bytes
done
010240.sst (Size/Time) 011029.sst 013184.sst 021552.sst 185054.sst 185137.sst 191666.sst 7560381.sst 7604174.sst 7635312.sst
No Dictionary 28165569 / 2614419 32899411 / 2976832 32977848 / 3055542 31966329 / 2004590 33614351 / 1755877 33429029 / 1717042 33611933 / 1776936 33634045 / 2771417 33789721 / 2205414 33592194 / 388254
Raw Content Dictionary 28019950 / 2697961 33748665 / 3572422 33896373 / 3534701 26418431 / 2259658 28560825 / 1839168 28455030 / 1846039 28494319 / 1861349 32391599 / 3095649 33772142 / 2407843 33592230 / 474523
FinalizeDictionary 27896012 / 2650029 33763886 / 3719427 33904283 / 3552793 26008225 / 2198033 28111872 / 1869530 28014374 / 1789771 28047706 / 1848300 32296254 / 3204027 33698698 / 2381468 33592344 / 517433
TrainDictionary 28046089 / 2740037 33706480 / 3679019 33885741 / 3629351 25087123 / 2204558 27194353 / 1970207 27234229 / 1896811 27166710 / 1903119 32011041 / 3322315 32730692 / 2406146 33608631 / 570593
```
#### Decompression/Read test:
With FinalizeDictionary/TrainDictionary, some data structure used for decompression are in stored in dictionary, so they are expected to be faster in terms of decompression/reads.
```
dict_bytes=16384
train_bytes=1048576
echo "No Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=0 > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=0 2>&1 | grep MB/s
echo "Raw Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes 2>&1 | grep MB/s
echo "FinalizeDict"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes -compression_use_zstd_dict_trainer=false 2>&1 | grep MB/s
echo "Train Dictionary"
TEST_TMPDIR=/dev/shm/ ./db_bench -benchmarks=filluniquerandom,compact -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes > /dev/null 2>&1
TEST_TMPDIR=/dev/shm/ ./db_bench -use_existing_db=true -benchmarks=readrandom -cache_size=0 -compression_type=zstd -compression_max_dict_bytes=$dict_bytes -compression_zstd_max_train_bytes=$train_bytes 2>&1 | grep MB/s
No Dictionary
readrandom : 12.183 micros/op 82082 ops/sec 12.183 seconds 1000000 operations; 9.1 MB/s (1000000 of 1000000 found)
Raw Dictionary
readrandom : 12.314 micros/op 81205 ops/sec 12.314 seconds 1000000 operations; 9.0 MB/s (1000000 of 1000000 found)
FinalizeDict
readrandom : 9.787 micros/op 102180 ops/sec 9.787 seconds 1000000 operations; 11.3 MB/s (1000000 of 1000000 found)
Train Dictionary
readrandom : 9.698 micros/op 103108 ops/sec 9.699 seconds 1000000 operations; 11.4 MB/s (1000000 of 1000000 found)
```
Reviewed By: ajkr
Differential Revision: D35720026
Pulled By: cbi42
fbshipit-source-id: 24d230fdff0fd28a1bb650658798f00dfcfb2a1f
3 years ago
|
|
|
"zstd finalizeDictionary cannot be used because ZSTD 1.4.5+ "
|
|
|
|
"is not linked with the binary.");
|
|
|
|
}
|
|
|
|
if (cf_options.compression_opts.max_dict_bytes == 0) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"The dictionary size limit (`CompressionOptions::max_dict_bytes`) "
|
|
|
|
"should be nonzero if we're using zstd's dictionary generator.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!CompressionTypeSupported(cf_options.blob_compression_type)) {
|
|
|
|
std::ostringstream oss;
|
|
|
|
oss << "The specified blob compression type "
|
|
|
|
<< CompressionTypeToString(cf_options.blob_compression_type)
|
|
|
|
<< " is not available.";
|
|
|
|
|
|
|
|
return Status::InvalidArgument(oss.str());
|
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
9 years ago
|
|
|
Status CheckConcurrentWritesSupported(const ColumnFamilyOptions& cf_options) {
|
|
|
|
if (cf_options.inplace_update_support) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"In-place memtable updates (inplace_update_support) is not compatible "
|
|
|
|
"with concurrent writes (allow_concurrent_memtable_write)");
|
|
|
|
}
|
|
|
|
if (!cf_options.memtable_factory->IsInsertConcurrentlySupported()) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Memtable doesn't concurrent writes (allow_concurrent_memtable_write)");
|
|
|
|
}
|
support for concurrent adds to memtable
Summary:
This diff adds support for concurrent adds to the skiplist memtable
implementations. Memory allocation is made thread-safe by the addition of
a spinlock, with small per-core buffers to avoid contention. Concurrent
memtable writes are made via an additional method and don't impose a
performance overhead on the non-concurrent case, so parallelism can be
selected on a per-batch basis.
Write thread synchronization is an increasing bottleneck for higher levels
of concurrency, so this diff adds --enable_write_thread_adaptive_yield
(default off). This feature causes threads joining a write batch
group to spin for a short time (default 100 usec) using sched_yield,
rather than going to sleep on a mutex. If the timing of the yield calls
indicates that another thread has actually run during the yield then
spinning is avoided. This option improves performance for concurrent
situations even without parallel adds, although it has the potential to
increase CPU usage (and the heuristic adaptation is not yet mature).
Parallel writes are not currently compatible with
inplace updates, update callbacks, or delete filtering.
Enable it with --allow_concurrent_memtable_write (and
--enable_write_thread_adaptive_yield). Parallel memtable writes
are performance neutral when there is no actual parallelism, and in
my experiments (SSD server-class Linux and varying contention and key
sizes for fillrandom) they are always a performance win when there is
more than one thread.
Statistics are updated earlier in the write path, dropping the number
of DB mutex acquisitions from 2 to 1 for almost all cases.
This diff was motivated and inspired by Yahoo's cLSM work. It is more
conservative than cLSM: RocksDB's write batch group leader role is
preserved (along with all of the existing flush and write throttling
logic) and concurrent writers are blocked until all memtable insertions
have completed and the sequence number has been advanced, to preserve
linearizability.
My test config is "db_bench -benchmarks=fillrandom -threads=$T
-batch_size=1 -memtablerep=skip_list -value_size=100 --num=1000000/$T
-level0_slowdown_writes_trigger=9999 -level0_stop_writes_trigger=9999
-disable_auto_compactions --max_write_buffer_number=8
-max_background_flushes=8 --disable_wal --write_buffer_size=160000000
--block_size=16384 --allow_concurrent_memtable_write" on a two-socket
Xeon E5-2660 @ 2.2Ghz with lots of memory and an SSD hard drive. With 1
thread I get ~440Kops/sec. Peak performance for 1 socket (numactl
-N1) is slightly more than 1Mops/sec, at 16 threads. Peak performance
across both sockets happens at 30 threads, and is ~900Kops/sec, although
with fewer threads there is less performance loss when the system has
background work.
Test Plan:
1. concurrent stress tests for InlineSkipList and DynamicBloom
2. make clean; make check
3. make clean; DISABLE_JEMALLOC=1 make valgrind_check; valgrind db_bench
4. make clean; COMPILE_WITH_TSAN=1 make all check; db_bench
5. make clean; COMPILE_WITH_ASAN=1 make all check; db_bench
6. make clean; OPT=-DROCKSDB_LITE make check
7. verify no perf regressions when disabled
Reviewers: igor, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, IslamAbdelRahman, anthony, yhchiang, rven, sdong, guyg8, kradhakrishnan, dhruba
Differential Revision: https://reviews.facebook.net/D50589
9 years ago
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status CheckCFPathsSupported(const DBOptions& db_options,
|
|
|
|
const ColumnFamilyOptions& cf_options) {
|
|
|
|
// More than one cf_paths are supported only in universal
|
|
|
|
// and level compaction styles. This function also checks the case
|
|
|
|
// in which cf_paths is not specified, which results in db_paths
|
|
|
|
// being used.
|
|
|
|
if ((cf_options.compaction_style != kCompactionStyleUniversal) &&
|
|
|
|
(cf_options.compaction_style != kCompactionStyleLevel)) {
|
|
|
|
if (cf_options.cf_paths.size() > 1) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"More than one CF paths are only supported in "
|
|
|
|
"universal and level compaction styles. ");
|
|
|
|
} else if (cf_options.cf_paths.empty() && db_options.db_paths.size() > 1) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"More than one DB paths are only supported in "
|
|
|
|
"universal and level compaction styles. ");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
const uint64_t kDefaultTtl = 0xfffffffffffffffe;
|
|
|
|
const uint64_t kDefaultPeriodicCompSecs = 0xfffffffffffffffe;
|
|
|
|
} // anonymous namespace
|
|
|
|
|
|
|
|
ColumnFamilyOptions SanitizeOptions(const ImmutableDBOptions& db_options,
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
10 years ago
|
|
|
const ColumnFamilyOptions& src) {
|
|
|
|
ColumnFamilyOptions result = src;
|
ColumnFamilyOptions SanitizeOptions is buggy on 32-bit platforms.
Summary:
The pre-existing code is trying to clamp between 65,536 and 0,
resulting in clamping to 65,536, resulting in very small buffers,
resulting in ShouldFlushNow() being true quite easily,
resulting in assertion failing and database performance
being "not what it should be".
https://github.com/facebook/rocksdb/issues/1018
Test Plan: make check
Reviewers: sdong, andrewkr, IslamAbdelRahman, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb, andrewkr, dhruba
Differential Revision: https://reviews.facebook.net/D55455
9 years ago
|
|
|
size_t clamp_max = std::conditional<
|
|
|
|
sizeof(size_t) == 4, std::integral_constant<size_t, 0xffffffff>,
|
|
|
|
std::integral_constant<uint64_t, 64ull << 30>>::type::value;
|
|
|
|
ClipToRange(&result.write_buffer_size, (static_cast<size_t>(64)) << 10,
|
|
|
|
clamp_max);
|
|
|
|
// if user sets arena_block_size, we trust user to use this value. Otherwise,
|
|
|
|
// calculate a proper value from writer_buffer_size;
|
|
|
|
if (result.arena_block_size <= 0) {
|
|
|
|
result.arena_block_size =
|
|
|
|
std::min(size_t{1024 * 1024}, result.write_buffer_size / 8);
|
|
|
|
|
|
|
|
// Align up to 4k
|
|
|
|
const size_t align = 4 * 1024;
|
|
|
|
result.arena_block_size =
|
|
|
|
((result.arena_block_size + align - 1) / align) * align;
|
|
|
|
}
|
|
|
|
result.min_write_buffer_number_to_merge =
|
|
|
|
std::min(result.min_write_buffer_number_to_merge,
|
|
|
|
result.max_write_buffer_number - 1);
|
|
|
|
if (result.min_write_buffer_number_to_merge < 1) {
|
|
|
|
result.min_write_buffer_number_to_merge = 1;
|
|
|
|
}
|
|
|
|
|
Sanitize min_write_buffer_number_to_merge to 1 with atomic_flush (#10773)
Summary:
With current implementation, within the same RocksDB instance, all column families with non-empty memtables will be scheduled for flush if RocksDB determines that any column family needs to be flushed, e.g. memtable full, write buffer manager, etc., if atomic flush is enabled. Not doing so can lead to data loss and inconsistency when WAL is disabled, which is a common setting when atomic flush is enabled. Therefore, setting a per-column-family knob, min_write_buffer_number_to_merge to a value greater than 1 is not compatible with atomic flush, and should be sanitized during column family creation and db open.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10773
Test Plan:
Reproduce: D39993203 has detailed steps.
Run the test with and without the fix.
Reviewed By: cbi42
Differential Revision: D40077955
Pulled By: cbi42
fbshipit-source-id: 451a9179eb531ac42eaccf40b451b9dec4085240
2 years ago
|
|
|
if (db_options.atomic_flush && result.min_write_buffer_number_to_merge > 1) {
|
|
|
|
ROCKS_LOG_WARN(
|
|
|
|
db_options.logger,
|
|
|
|
"Currently, if atomic_flush is true, then triggering flush for any "
|
|
|
|
"column family internally (non-manual flush) will trigger flushing "
|
|
|
|
"all column families even if the number of memtables is smaller "
|
|
|
|
"min_write_buffer_number_to_merge. Therefore, configuring "
|
|
|
|
"min_write_buffer_number_to_merge > 1 is not compatible and should "
|
|
|
|
"be satinized to 1. Not doing so will lead to data loss and "
|
|
|
|
"inconsistent state across multiple column families when WAL is "
|
|
|
|
"disabled, which is a common setting for atomic flush");
|
|
|
|
|
|
|
|
result.min_write_buffer_number_to_merge = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.num_levels < 1) {
|
|
|
|
result.num_levels = 1;
|
|
|
|
}
|
|
|
|
if (result.compaction_style == kCompactionStyleLevel &&
|
|
|
|
result.num_levels < 2) {
|
|
|
|
result.num_levels = 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.compaction_style == kCompactionStyleUniversal &&
|
|
|
|
db_options.allow_ingest_behind && result.num_levels < 3) {
|
|
|
|
result.num_levels = 3;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.max_write_buffer_number < 2) {
|
|
|
|
result.max_write_buffer_number = 2;
|
|
|
|
}
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
// fall back max_write_buffer_number_to_maintain if
|
|
|
|
// max_write_buffer_size_to_maintain is not set
|
|
|
|
if (result.max_write_buffer_size_to_maintain < 0) {
|
|
|
|
result.max_write_buffer_size_to_maintain =
|
|
|
|
result.max_write_buffer_number *
|
|
|
|
static_cast<int64_t>(result.write_buffer_size);
|
|
|
|
} else if (result.max_write_buffer_size_to_maintain == 0 &&
|
|
|
|
result.max_write_buffer_number_to_maintain < 0) {
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
10 years ago
|
|
|
result.max_write_buffer_number_to_maintain = result.max_write_buffer_number;
|
|
|
|
}
|
|
|
|
// bloom filter size shouldn't exceed 1/4 of memtable size.
|
|
|
|
if (result.memtable_prefix_bloom_size_ratio > 0.25) {
|
|
|
|
result.memtable_prefix_bloom_size_ratio = 0.25;
|
|
|
|
} else if (result.memtable_prefix_bloom_size_ratio < 0) {
|
|
|
|
result.memtable_prefix_bloom_size_ratio = 0;
|
|
|
|
}
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
10 years ago
|
|
|
|
|
|
|
if (!result.prefix_extractor) {
|
|
|
|
assert(result.memtable_factory);
|
|
|
|
Slice name = result.memtable_factory->Name();
|
|
|
|
if (name.compare("HashSkipListRepFactory") == 0 ||
|
|
|
|
name.compare("HashLinkListRepFactory") == 0) {
|
|
|
|
result.memtable_factory = std::make_shared<SkipListFactory>();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.compaction_style == kCompactionStyleFIFO) {
|
|
|
|
// since we delete level0 files in FIFO compaction when there are too many
|
|
|
|
// of them, these options don't really mean anything
|
|
|
|
result.level0_slowdown_writes_trigger = std::numeric_limits<int>::max();
|
|
|
|
result.level0_stop_writes_trigger = std::numeric_limits<int>::max();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.max_bytes_for_level_multiplier <= 0) {
|
|
|
|
result.max_bytes_for_level_multiplier = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.level0_file_num_compaction_trigger == 0) {
|
|
|
|
ROCKS_LOG_WARN(db_options.logger,
|
|
|
|
"level0_file_num_compaction_trigger cannot be 0");
|
|
|
|
result.level0_file_num_compaction_trigger = 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.level0_stop_writes_trigger <
|
|
|
|
result.level0_slowdown_writes_trigger ||
|
|
|
|
result.level0_slowdown_writes_trigger <
|
|
|
|
result.level0_file_num_compaction_trigger) {
|
|
|
|
ROCKS_LOG_WARN(db_options.logger,
|
|
|
|
"This condition must be satisfied: "
|
|
|
|
"level0_stop_writes_trigger(%d) >= "
|
|
|
|
"level0_slowdown_writes_trigger(%d) >= "
|
|
|
|
"level0_file_num_compaction_trigger(%d)",
|
|
|
|
result.level0_stop_writes_trigger,
|
|
|
|
result.level0_slowdown_writes_trigger,
|
|
|
|
result.level0_file_num_compaction_trigger);
|
|
|
|
if (result.level0_slowdown_writes_trigger <
|
|
|
|
result.level0_file_num_compaction_trigger) {
|
|
|
|
result.level0_slowdown_writes_trigger =
|
|
|
|
result.level0_file_num_compaction_trigger;
|
|
|
|
}
|
|
|
|
if (result.level0_stop_writes_trigger <
|
|
|
|
result.level0_slowdown_writes_trigger) {
|
|
|
|
result.level0_stop_writes_trigger = result.level0_slowdown_writes_trigger;
|
|
|
|
}
|
|
|
|
ROCKS_LOG_WARN(db_options.logger,
|
|
|
|
"Adjust the value to "
|
|
|
|
"level0_stop_writes_trigger(%d)"
|
|
|
|
"level0_slowdown_writes_trigger(%d)"
|
|
|
|
"level0_file_num_compaction_trigger(%d)",
|
|
|
|
result.level0_stop_writes_trigger,
|
|
|
|
result.level0_slowdown_writes_trigger,
|
|
|
|
result.level0_file_num_compaction_trigger);
|
|
|
|
}
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
|
|
|
|
if (result.soft_pending_compaction_bytes_limit == 0) {
|
|
|
|
result.soft_pending_compaction_bytes_limit =
|
|
|
|
result.hard_pending_compaction_bytes_limit;
|
|
|
|
} else if (result.hard_pending_compaction_bytes_limit > 0 &&
|
|
|
|
result.soft_pending_compaction_bytes_limit >
|
|
|
|
result.hard_pending_compaction_bytes_limit) {
|
|
|
|
result.soft_pending_compaction_bytes_limit =
|
|
|
|
result.hard_pending_compaction_bytes_limit;
|
|
|
|
}
|
|
|
|
|
|
|
|
// When the DB is stopped, it's possible that there are some .trash files that
|
|
|
|
// were not deleted yet, when we open the DB we will find these .trash files
|
|
|
|
// and schedule them to be deleted (or delete immediately if SstFileManager
|
|
|
|
// was not used)
|
|
|
|
auto sfm =
|
|
|
|
static_cast<SstFileManagerImpl*>(db_options.sst_file_manager.get());
|
|
|
|
for (size_t i = 0; i < result.cf_paths.size(); i++) {
|
|
|
|
DeleteScheduler::CleanupDirectory(db_options.env, sfm,
|
|
|
|
result.cf_paths[i].path)
|
|
|
|
.PermitUncheckedError();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.cf_paths.empty()) {
|
|
|
|
result.cf_paths = db_options.db_paths;
|
|
|
|
}
|
|
|
|
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
10 years ago
|
|
|
if (result.level_compaction_dynamic_level_bytes) {
|
|
|
|
if (result.compaction_style != kCompactionStyleLevel) {
|
|
|
|
ROCKS_LOG_WARN(db_options.info_log.get(),
|
|
|
|
"level_compaction_dynamic_level_bytes only makes sense"
|
|
|
|
"for level-based compaction");
|
|
|
|
result.level_compaction_dynamic_level_bytes = false;
|
|
|
|
} else if (result.cf_paths.size() > 1U) {
|
|
|
|
// we don't yet know how to make both of this feature and multiple
|
|
|
|
// DB path work.
|
|
|
|
ROCKS_LOG_WARN(db_options.info_log.get(),
|
|
|
|
"multiple cf_paths/db_paths and"
|
|
|
|
"level_compaction_dynamic_level_bytes"
|
|
|
|
"can't be used together");
|
options.level_compaction_dynamic_level_bytes to allow RocksDB to pick size bases of levels dynamically.
Summary:
When having fixed max_bytes_for_level_base, the ratio of size of largest level and the second one can range from 0 to the multiplier. This makes LSM tree frequently irregular and unpredictable. It can also cause poor space amplification in some cases.
In this improvement (proposed by Igor Kabiljo), we introduce a parameter option.level_compaction_use_dynamic_max_bytes. When turning it on, RocksDB is free to pick a level base in the range of (options.max_bytes_for_level_base/options.max_bytes_for_level_multiplier, options.max_bytes_for_level_base] so that real level ratios are close to options.max_bytes_for_level_multiplier.
Test Plan: New unit tests and pass tests suites including valgrind.
Reviewers: MarkCallaghan, rven, yhchiang, igor, ikabiljo
Reviewed By: ikabiljo
Subscribers: yoshinorim, ikabiljo, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D31437
10 years ago
|
|
|
result.level_compaction_dynamic_level_bytes = false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.max_compaction_bytes == 0) {
|
|
|
|
result.max_compaction_bytes = result.target_file_size_base * 25;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool is_block_based_table = (result.table_factory->IsInstanceOf(
|
|
|
|
TableFactory::kBlockBasedTableName()));
|
|
|
|
|
|
|
|
const uint64_t kAdjustedTtl = 30 * 24 * 60 * 60;
|
|
|
|
if (result.ttl == kDefaultTtl) {
|
|
|
|
if (is_block_based_table &&
|
|
|
|
result.compaction_style != kCompactionStyleFIFO) {
|
|
|
|
result.ttl = kAdjustedTtl;
|
|
|
|
} else {
|
|
|
|
result.ttl = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const uint64_t kAdjustedPeriodicCompSecs = 30 * 24 * 60 * 60;
|
|
|
|
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
// Turn on periodic compactions and set them to occur once every 30 days if
|
|
|
|
// compaction filters are used and periodic_compaction_seconds is set to the
|
|
|
|
// default value.
|
|
|
|
if (result.compaction_style != kCompactionStyleFIFO) {
|
|
|
|
if ((result.compaction_filter != nullptr ||
|
|
|
|
result.compaction_filter_factory != nullptr) &&
|
|
|
|
result.periodic_compaction_seconds == kDefaultPeriodicCompSecs &&
|
|
|
|
is_block_based_table) {
|
|
|
|
result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
// result.compaction_style == kCompactionStyleFIFO
|
|
|
|
if (result.ttl == 0) {
|
|
|
|
if (is_block_based_table) {
|
|
|
|
if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
|
|
|
|
result.periodic_compaction_seconds = kAdjustedPeriodicCompSecs;
|
|
|
|
}
|
|
|
|
result.ttl = result.periodic_compaction_seconds;
|
|
|
|
}
|
|
|
|
} else if (result.periodic_compaction_seconds != 0) {
|
|
|
|
result.ttl = std::min(result.ttl, result.periodic_compaction_seconds);
|
|
|
|
}
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
}
|
|
|
|
|
|
|
|
// TTL compactions would work similar to Periodic Compactions in Universal in
|
|
|
|
// most of the cases. So, if ttl is set, execute the periodic compaction
|
|
|
|
// codepath.
|
|
|
|
if (result.compaction_style == kCompactionStyleUniversal && result.ttl != 0) {
|
|
|
|
if (result.periodic_compaction_seconds != 0) {
|
|
|
|
result.periodic_compaction_seconds =
|
|
|
|
std::min(result.ttl, result.periodic_compaction_seconds);
|
|
|
|
} else {
|
|
|
|
result.periodic_compaction_seconds = result.ttl;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.periodic_compaction_seconds == kDefaultPeriodicCompSecs) {
|
|
|
|
result.periodic_compaction_seconds = 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
int SuperVersion::dummy = 0;
|
|
|
|
void* const SuperVersion::kSVInUse = &SuperVersion::dummy;
|
|
|
|
void* const SuperVersion::kSVObsolete = nullptr;
|
|
|
|
|
|
|
|
SuperVersion::~SuperVersion() {
|
|
|
|
for (auto td : to_delete) {
|
|
|
|
delete td;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
SuperVersion* SuperVersion::Ref() {
|
|
|
|
refs.fetch_add(1, std::memory_order_relaxed);
|
|
|
|
return this;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool SuperVersion::Unref() {
|
|
|
|
// fetch_sub returns the previous value of ref
|
|
|
|
uint32_t previous_refs = refs.fetch_sub(1);
|
|
|
|
assert(previous_refs > 0);
|
|
|
|
return previous_refs == 1;
|
|
|
|
}
|
|
|
|
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling on the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failling, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process ressembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
4 years ago
|
|
|
void SuperVersion::Cleanup() {
|
|
|
|
assert(refs.load(std::memory_order_relaxed) == 0);
|
Memtable "MemPurge" prototype (#8454)
Summary:
Implement an experimental feature called "MemPurge", which consists in purging "garbage" bytes out of a memtable and reuse the memtable struct instead of making it immutable and eventually flushing its content to storage.
The prototype is by default deactivated and is not intended for use. It is intended for correctness and validation testing. At the moment, the "MemPurge" feature can be switched on by using the `options.experimental_allow_mempurge` flag. For this early stage, when the allow_mempurge flag is set to `true`, all the flush operations will be rerouted to perform a MemPurge. This is a temporary design decision that will give us the time to explore meaningful heuristics to use MemPurge at the right time for relevant workloads . Moreover, the current MemPurge operation only supports `Puts`, `Deletes`, `DeleteRange` operations, and handles `Iterators` as well as `CompactionFilter`s that are invoked at flush time .
Three unit tests are added to `db_flush_test.cc` to test if MemPurge works correctly (and checks that the previously mentioned operations are fully supported thoroughly tested).
One noticeable design decision is the timing of the MemPurge operation in the memtable workflow: for this prototype, the mempurge happens when the memtable is switched (and usually made immutable). This is an inefficient process because it implies that the entirety of the MemPurge operation happens while holding the db_mutex. Future commits will make the MemPurge operation a background task (akin to the regular flush operation) and aim at drastically enhancing the performance of this operation. The MemPurge is also not fully "WAL-compatible" yet, but when the WAL is full, or when the regular MemPurge operation fails (or when the purged memtable still needs to be flushed), a regular flush operation takes place. Later commits will also correct these behaviors.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8454
Reviewed By: anand1976
Differential Revision: D29433971
Pulled By: bjlemaire
fbshipit-source-id: 6af48213554e35048a7e03816955100a80a26dc5
4 years ago
|
|
|
// Since this SuperVersion object is being deleted,
|
|
|
|
// decrement reference to the immutable MemtableList
|
|
|
|
// this SV object was pointing to.
|
|
|
|
imm->Unref(&to_delete);
|
|
|
|
MemTable* m = mem->Unref();
|
|
|
|
if (m != nullptr) {
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling on the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failling, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process ressembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
4 years ago
|
|
|
auto* memory_usage = current->cfd()->imm()->current_memory_usage();
|
|
|
|
assert(*memory_usage >= m->ApproximateMemoryUsage());
|
|
|
|
*memory_usage -= m->ApproximateMemoryUsage();
|
|
|
|
to_delete.push_back(m);
|
|
|
|
}
|
|
|
|
current->Unref();
|
Fix a race in ColumnFamilyData::UnrefAndTryDelete (#8605)
Summary:
The `ColumnFamilyData::UnrefAndTryDelete` code currently on the trunk
unlocks the DB mutex before destroying the `ThreadLocalPtr` holding
the per-thread `SuperVersion` pointers when the only remaining reference
is the back reference from `super_version_`. The idea behind this was to
break the circular dependency between `ColumnFamilyData` and `SuperVersion`:
when the penultimate reference goes away, `ColumnFamilyData` can clean up
the `SuperVersion`, which can in turn clean up `ColumnFamilyData`. (Assuming there
is a `SuperVersion` and it is not referenced by anything else.) However,
unlocking the mutex throws a wrench in this plan by making it possible for another thread
to jump in and take another reference to the `ColumnFamilyData`, keeping the
object alive in a zombie `ThreadLocalPtr`-less state. This can cause issues like
https://github.com/facebook/rocksdb/issues/8440 ,
https://github.com/facebook/rocksdb/issues/8382 ,
and might also explain the `was_last_ref` assertion failures from the `ColumnFamilySet`
destructor we sometimes observe during close in our stress tests.
Digging through the archives, this unlocking goes way back to 2014 (or earlier). The original
rationale was that `SuperVersionUnrefHandle` used to lock the mutex so it can call
`SuperVersion::Cleanup`; however, this logic turned out to be deadlock-prone.
https://github.com/facebook/rocksdb/pull/3510 fixed the deadlock but left the
unlocking in place. https://github.com/facebook/rocksdb/pull/6147 then introduced
the circular dependency and associated cleanup logic described above (in order
to enable iterators to keep the `ColumnFamilyData` for dropped column families alive),
and moved the unlocking-relocking snippet to its present location in `UnrefAndTryDelete`.
Finally, https://github.com/facebook/rocksdb/pull/7749 fixed a memory leak but
apparently exacerbated the race by (otherwise correctly) switching to `UnrefAndTryDelete`
in `SuperVersion::Cleanup`.
The patch simply eliminates the unlocking and relocking, which has been unnecessary
ever since https://github.com/facebook/rocksdb/issues/3510 made `SuperVersionUnrefHandle` lock-free.
This closes the window during which another thread could increase the reference count,
and hopefully fixes the issues above.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8605
Test Plan: Ran `make check` and stress tests locally.
Reviewed By: pdillinger
Differential Revision: D30051035
Pulled By: ltamasi
fbshipit-source-id: 8fe559e4b4ad69fc142579f8bc393ef525918528
3 years ago
|
|
|
cfd->UnrefAndTryDelete();
|
|
|
|
}
|
|
|
|
|
|
|
|
void SuperVersion::Init(ColumnFamilyData* new_cfd, MemTable* new_mem,
|
|
|
|
MemTableListVersion* new_imm, Version* new_current) {
|
|
|
|
cfd = new_cfd;
|
|
|
|
mem = new_mem;
|
|
|
|
imm = new_imm;
|
|
|
|
current = new_current;
|
|
|
|
cfd->Ref();
|
|
|
|
mem->Ref();
|
|
|
|
imm->Ref();
|
|
|
|
current->Ref();
|
|
|
|
refs.store(1, std::memory_order_relaxed);
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
void SuperVersionUnrefHandle(void* ptr) {
|
Fix a race in ColumnFamilyData::UnrefAndTryDelete (#8605)
Summary:
The `ColumnFamilyData::UnrefAndTryDelete` code currently on the trunk
unlocks the DB mutex before destroying the `ThreadLocalPtr` holding
the per-thread `SuperVersion` pointers when the only remaining reference
is the back reference from `super_version_`. The idea behind this was to
break the circular dependency between `ColumnFamilyData` and `SuperVersion`:
when the penultimate reference goes away, `ColumnFamilyData` can clean up
the `SuperVersion`, which can in turn clean up `ColumnFamilyData`. (Assuming there
is a `SuperVersion` and it is not referenced by anything else.) However,
unlocking the mutex throws a wrench in this plan by making it possible for another thread
to jump in and take another reference to the `ColumnFamilyData`, keeping the
object alive in a zombie `ThreadLocalPtr`-less state. This can cause issues like
https://github.com/facebook/rocksdb/issues/8440 ,
https://github.com/facebook/rocksdb/issues/8382 ,
and might also explain the `was_last_ref` assertion failures from the `ColumnFamilySet`
destructor we sometimes observe during close in our stress tests.
Digging through the archives, this unlocking goes way back to 2014 (or earlier). The original
rationale was that `SuperVersionUnrefHandle` used to lock the mutex so it can call
`SuperVersion::Cleanup`; however, this logic turned out to be deadlock-prone.
https://github.com/facebook/rocksdb/pull/3510 fixed the deadlock but left the
unlocking in place. https://github.com/facebook/rocksdb/pull/6147 then introduced
the circular dependency and associated cleanup logic described above (in order
to enable iterators to keep the `ColumnFamilyData` for dropped column families alive),
and moved the unlocking-relocking snippet to its present location in `UnrefAndTryDelete`.
Finally, https://github.com/facebook/rocksdb/pull/7749 fixed a memory leak but
apparently exacerbated the race by (otherwise correctly) switching to `UnrefAndTryDelete`
in `SuperVersion::Cleanup`.
The patch simply eliminates the unlocking and relocking, which has been unnecessary
ever since https://github.com/facebook/rocksdb/issues/3510 made `SuperVersionUnrefHandle` lock-free.
This closes the window during which another thread could increase the reference count,
and hopefully fixes the issues above.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8605
Test Plan: Ran `make check` and stress tests locally.
Reviewed By: pdillinger
Differential Revision: D30051035
Pulled By: ltamasi
fbshipit-source-id: 8fe559e4b4ad69fc142579f8bc393ef525918528
3 years ago
|
|
|
// UnrefHandle is called when a thread exits or a ThreadLocalPtr gets
|
|
|
|
// destroyed. When the former happens, the thread shouldn't see kSVInUse.
|
|
|
|
// When the latter happens, only super_version_ holds a reference
|
|
|
|
// to ColumnFamilyData, so no further queries are possible.
|
|
|
|
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
|
Fix deadlock in ColumnFamilyData::InstallSuperVersion()
Summary:
Deadlock: a memtable flush holds DB::mutex_ and calls ThreadLocalPtr::Scrape(), which locks ThreadLocalPtr mutex; meanwhile, a thread exit handler locks ThreadLocalPtr mutex and calls SuperVersionUnrefHandle, which tries to lock DB::mutex_.
This deadlock is hit all the time on our workload. It blocks our release.
In general, the problem is that ThreadLocalPtr takes an arbitrary callback and calls it while holding a lock on a global mutex. The same global mutex is (at least in some cases) locked by almost all ThreadLocalPtr methods, on any instance of ThreadLocalPtr. So, there'll be a deadlock if the callback tries to do anything to any instance of ThreadLocalPtr, or waits for another thread to do so.
So, probably the only safe way to use ThreadLocalPtr callbacks is to do only do simple and lock-free things in them.
This PR fixes the deadlock by making sure that local_sv_ never holds the last reference to a SuperVersion, and therefore SuperVersionUnrefHandle never has to do any nontrivial cleanup.
I also searched for other uses of ThreadLocalPtr to see if they may have similar bugs. There's only one other use, in transaction_lock_mgr.cc, and it looks fine.
Closes https://github.com/facebook/rocksdb/pull/3510
Reviewed By: sagar0
Differential Revision: D7005346
Pulled By: al13n321
fbshipit-source-id: 37575591b84f07a891d6659e87e784660fde815f
7 years ago
|
|
|
bool was_last_ref __attribute__((__unused__));
|
|
|
|
was_last_ref = sv->Unref();
|
|
|
|
// Thread-local SuperVersions can't outlive ColumnFamilyData::super_version_.
|
|
|
|
// This is important because we can't do SuperVersion cleanup here.
|
|
|
|
// That would require locking DB mutex, which would deadlock because
|
|
|
|
// SuperVersionUnrefHandle is called with locked ThreadLocalPtr mutex.
|
|
|
|
assert(!was_last_ref);
|
|
|
|
}
|
|
|
|
} // anonymous namespace
|
|
|
|
|
|
|
|
std::vector<std::string> ColumnFamilyData::GetDbPaths() const {
|
|
|
|
std::vector<std::string> paths;
|
|
|
|
paths.reserve(ioptions_.cf_paths.size());
|
|
|
|
for (const DbPath& db_path : ioptions_.cf_paths) {
|
|
|
|
paths.emplace_back(db_path.path);
|
|
|
|
}
|
|
|
|
return paths;
|
|
|
|
}
|
|
|
|
|
|
|
|
const uint32_t ColumnFamilyData::kDummyColumnFamilyDataId =
|
|
|
|
std::numeric_limits<uint32_t>::max();
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
10 years ago
|
|
|
ColumnFamilyData::ColumnFamilyData(
|
|
|
|
uint32_t id, const std::string& name, Version* _dummy_versions,
|
|
|
|
Cache* _table_cache, WriteBufferManager* write_buffer_manager,
|
|
|
|
const ColumnFamilyOptions& cf_options, const ImmutableDBOptions& db_options,
|
Fix use-after-free on implicit temporary FileOptions (#8571)
Summary:
FileOptions has an implicit conversion from EnvOptions and some
internal APIs take `const FileOptions&` and save the reference, which is
counter to Google C++ guidelines,
> Avoid defining functions that require a const reference parameter to outlive the call, because const reference parameters bind to temporaries. Instead, find a way to eliminate the lifetime requirement (for example, by copying the parameter), or pass it by const pointer and document the lifetime and non-null requirements.
This is at least a problem for repair.cc, which passes an EnvOptions to
TableCache(), which would save a reference to the temporary copy as
FileOptions. This was unfortunately only caught as a side effect of
changes in https://github.com/facebook/rocksdb/issues/8544.
This change fixes the repair.cc case and updates the involved internal
APIs that save a reference to use `const FileOptions*` instead.
Unfortunately, I don't know how to get any of our sanitizers to reliably
report bugs like this, so I can't rule out more existing in our
codebase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8571
Test Plan:
Test that issues seen with https://github.com/facebook/rocksdb/issues/8544 are fixed (can reproduce on
AWS EC2)
Reviewed By: ajkr
Differential Revision: D29943890
Pulled By: pdillinger
fbshipit-source-id: 95f9c5251548777b4dc994c1a083dd2add5799c9
3 years ago
|
|
|
const FileOptions* file_options, ColumnFamilySet* column_family_set,
|
|
|
|
BlockCacheTracer* const block_cache_tracer,
|
|
|
|
const std::shared_ptr<IOTracer>& io_tracer, const std::string& db_id,
|
|
|
|
const std::string& db_session_id)
|
|
|
|
: id_(id),
|
|
|
|
name_(name),
|
|
|
|
dummy_versions_(_dummy_versions),
|
|
|
|
current_(nullptr),
|
|
|
|
refs_(0),
|
|
|
|
initialized_(false),
|
|
|
|
dropped_(false),
|
|
|
|
internal_comparator_(cf_options.comparator),
|
|
|
|
initial_cf_options_(SanitizeOptions(db_options, cf_options)),
|
|
|
|
ioptions_(db_options, initial_cf_options_),
|
|
|
|
mutable_cf_options_(initial_cf_options_),
|
|
|
|
is_delete_range_supported_(
|
|
|
|
cf_options.table_factory->IsDeleteRangeSupported()),
|
|
|
|
write_buffer_manager_(write_buffer_manager),
|
|
|
|
mem_(nullptr),
|
|
|
|
imm_(ioptions_.min_write_buffer_number_to_merge,
|
Refactor trimming logic for immutable memtables (#5022)
Summary:
MyRocks currently sets `max_write_buffer_number_to_maintain` in order to maintain enough history for transaction conflict checking. The effectiveness of this approach depends on the size of memtables. When memtables are small, it may not keep enough history; when memtables are large, this may consume too much memory.
We are proposing a new way to configure memtable list history: by limiting the memory usage of immutable memtables. The new option is `max_write_buffer_size_to_maintain` and it will take precedence over the old `max_write_buffer_number_to_maintain` if they are both set to non-zero values. The new option accounts for the total memory usage of flushed immutable memtables and mutable memtable. When the total usage exceeds the limit, RocksDB may start dropping immutable memtables (which is also called trimming history), starting from the oldest one.
The semantics of the old option actually works both as an upper bound and lower bound. History trimming will start if number of immutable memtables exceeds the limit, but it will never go below (limit-1) due to history trimming.
In order the mimic the behavior with the new option, history trimming will stop if dropping the next immutable memtable causes the total memory usage go below the size limit. For example, assuming the size limit is set to 64MB, and there are 3 immutable memtables with sizes of 20, 30, 30. Although the total memory usage is 80MB > 64MB, dropping the oldest memtable will reduce the memory usage to 60MB < 64MB, so in this case no memtable will be dropped.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5022
Differential Revision: D14394062
Pulled By: miasantreble
fbshipit-source-id: 60457a509c6af89d0993f988c9b5c2aa9e45f5c5
5 years ago
|
|
|
ioptions_.max_write_buffer_number_to_maintain,
|
|
|
|
ioptions_.max_write_buffer_size_to_maintain),
|
|
|
|
super_version_(nullptr),
|
|
|
|
super_version_number_(0),
|
|
|
|
local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
|
|
|
|
next_(nullptr),
|
|
|
|
prev_(nullptr),
|
|
|
|
log_number_(0),
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
10 years ago
|
|
|
column_family_set_(column_family_set),
|
|
|
|
queued_for_flush_(false),
|
|
|
|
queued_for_compaction_(false),
|
|
|
|
prev_compaction_needed_bytes_(0),
|
|
|
|
allow_2pc_(db_options.allow_2pc),
|
|
|
|
last_memtable_id_(0),
|
|
|
|
db_paths_registered_(false),
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
mempurge_used_(false),
|
|
|
|
next_epoch_number_(1) {
|
|
|
|
if (id_ != kDummyColumnFamilyDataId) {
|
|
|
|
// TODO(cc): RegisterDbPaths can be expensive, considering moving it
|
|
|
|
// outside of this constructor which might be called with db mutex held.
|
|
|
|
// TODO(cc): considering using ioptions_.fs, currently some tests rely on
|
|
|
|
// EnvWrapper, that's the main reason why we use env here.
|
|
|
|
Status s = ioptions_.env->RegisterDbPaths(GetDbPaths());
|
|
|
|
if (s.ok()) {
|
|
|
|
db_paths_registered_ = true;
|
|
|
|
} else {
|
|
|
|
ROCKS_LOG_ERROR(
|
|
|
|
ioptions_.logger,
|
|
|
|
"Failed to register data paths of column family (id: %d, name: %s)",
|
|
|
|
id_, name_.c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Ref();
|
|
|
|
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
10 years ago
|
|
|
// Convert user defined table properties collector factories to internal ones.
|
|
|
|
GetIntTblPropCollectorFactory(ioptions_, &int_tbl_prop_collector_factories_);
|
A new call back to TablePropertiesCollector to allow users know the entry is add, delete or merge
Summary:
Currently users have no idea a key is add, delete or merge from TablePropertiesCollector call back. Add a new function to add it.
Also refactor the codes so that
(1) make table property collector and internal table property collector two separate data structures with the later one now exposed
(2) table builders only receive internal table properties
Test Plan: Add cases in table_properties_collector_test to cover both of old and new ways of using TablePropertiesCollector.
Reviewers: yhchiang, igor.sugak, rven, igor
Reviewed By: rven, igor
Subscribers: meyering, yoshinorim, maykov, leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D35373
10 years ago
|
|
|
|
|
|
|
// if _dummy_versions is nullptr, then this is a dummy column family.
|
|
|
|
if (_dummy_versions != nullptr) {
|
|
|
|
internal_stats_.reset(
|
|
|
|
new InternalStats(ioptions_.num_levels, ioptions_.clock, this));
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
table_cache_.reset(new TableCache(ioptions_, file_options, _table_cache,
|
|
|
|
block_cache_tracer, io_tracer,
|
|
|
|
db_session_id));
|
|
|
|
blob_file_cache_.reset(
|
|
|
|
new BlobFileCache(_table_cache, ioptions(), soptions(), id_,
|
|
|
|
internal_stats_->GetBlobFileReadHist(), io_tracer));
|
|
|
|
blob_source_.reset(new BlobSource(ioptions(), db_id, db_session_id,
|
|
|
|
blob_file_cache_.get()));
|
|
|
|
|
|
|
|
if (ioptions_.compaction_style == kCompactionStyleLevel) {
|
|
|
|
compaction_picker_.reset(
|
|
|
|
new LevelCompactionPicker(ioptions_, &internal_comparator_));
|
|
|
|
} else if (ioptions_.compaction_style == kCompactionStyleUniversal) {
|
|
|
|
compaction_picker_.reset(
|
|
|
|
new UniversalCompactionPicker(ioptions_, &internal_comparator_));
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
} else if (ioptions_.compaction_style == kCompactionStyleFIFO) {
|
|
|
|
compaction_picker_.reset(
|
|
|
|
new FIFOCompactionPicker(ioptions_, &internal_comparator_));
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
} else if (ioptions_.compaction_style == kCompactionStyleNone) {
|
|
|
|
compaction_picker_.reset(
|
|
|
|
new NullCompactionPicker(ioptions_, &internal_comparator_));
|
|
|
|
ROCKS_LOG_WARN(ioptions_.logger,
|
|
|
|
"Column family %s does not use any background compaction. "
|
|
|
|
"Compactions can only be done via CompactFiles\n",
|
|
|
|
GetName().c_str());
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
} else {
|
|
|
|
ROCKS_LOG_ERROR(ioptions_.logger,
|
|
|
|
"Unable to recognize the specified compaction style %d. "
|
|
|
|
"Column family %s will use kCompactionStyleLevel.\n",
|
|
|
|
ioptions_.compaction_style, GetName().c_str());
|
CompactFiles, EventListener and GetDatabaseMetaData
Summary:
This diff adds three sets of APIs to RocksDB.
= GetColumnFamilyMetaData =
* This APIs allow users to obtain the current state of a RocksDB instance on one column family.
* See GetColumnFamilyMetaData in include/rocksdb/db.h
= EventListener =
* A virtual class that allows users to implement a set of
call-back functions which will be called when specific
events of a RocksDB instance happens.
* To register EventListener, simply insert an EventListener to ColumnFamilyOptions::listeners
= CompactFiles =
* CompactFiles API inputs a set of file numbers and an output level, and RocksDB
will try to compact those files into the specified level.
= Example =
* Example code can be found in example/compact_files_example.cc, which implements
a simple external compactor using EventListener, GetColumnFamilyMetaData, and
CompactFiles API.
Test Plan:
listener_test
compactor_test
example/compact_files_example
export ROCKSDB_TESTS=CompactFiles
db_test
export ROCKSDB_TESTS=MetaData
db_test
Reviewers: ljin, igor, rven, sdong
Reviewed By: sdong
Subscribers: MarkCallaghan, dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D24705
10 years ago
|
|
|
compaction_picker_.reset(
|
|
|
|
new LevelCompactionPicker(ioptions_, &internal_comparator_));
|
|
|
|
}
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
10 years ago
|
|
|
if (column_family_set_->NumberOfColumnFamilies() < 10) {
|
|
|
|
ROCKS_LOG_INFO(ioptions_.logger,
|
|
|
|
"--------------- Options for column family [%s]:\n",
|
|
|
|
name.c_str());
|
|
|
|
initial_cf_options_.Dump(ioptions_.logger);
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
10 years ago
|
|
|
} else {
|
|
|
|
ROCKS_LOG_INFO(ioptions_.logger, "\t(skipping printing options)\n");
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
10 years ago
|
|
|
}
|
|
|
|
}
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
11 years ago
|
|
|
|
|
|
|
RecalculateWriteStallConditions(mutable_cf_options_);
|
Account memory of FileMetaData in global memory limit (#9924)
Summary:
**Context/Summary:**
As revealed by heap profiling, allocation of `FileMetaData` for [newly created file added to a Version](https://github.com/facebook/rocksdb/pull/9924/files#diff-a6aa385940793f95a2c5b39cc670bd440c4547fa54fd44622f756382d5e47e43R774) can consume significant heap memory. This PR is to account that toward our global memory limit based on block cache capacity.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/9924
Test Plan:
- Previous `make check` verified there are only 2 places where the memory of the allocated `FileMetaData` can be released
- New unit test `TEST_P(ChargeFileMetadataTestWithParam, Basic)`
- db bench (CPU cost of `charge_file_metadata` in write and compact)
- **write micros/op: -0.24%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 (remove this option for pre-PR) -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 | egrep 'fillseq'`
- **compact micros/op -0.87%** : `TEST_TMPDIR=/dev/shm/testdb ./db_bench -benchmarks=fillseq -db=$TEST_TMPDIR -charge_file_metadata=1 -disable_auto_compactions=1 -write_buffer_size=100000 -num=4000000 -numdistinct=1000 && ./db_bench -benchmarks=compact -db=$TEST_TMPDIR -use_existing_db=1 -charge_file_metadata=1 -disable_auto_compactions=1 | egrep 'compact'`
table 1 - write
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 3.9711 | 0.264408 | 3.9914 | 0.254563 | 0.5111933721
20 | 3.83905 | 0.0664488 | 3.8251 | 0.0695456 | -0.3633711465
40 | 3.86625 | 0.136669 | 3.8867 | 0.143765 | 0.5289363078
80 | 3.87828 | 0.119007 | 3.86791 | 0.115674 | **-0.2673865734**
160 | 3.87677 | 0.162231 | 3.86739 | 0.16663 | **-0.2419539978**
table 2 - compact
#-run | (pre-PR) avg micros/op | std micros/op | (post-PR) micros/op | std micros/op | change (%)
-- | -- | -- | -- | -- | --
10 | 2,399,650.00 | 96,375.80 | 2,359,537.00 | 53,243.60 | -1.67
20 | 2,410,480.00 | 89,988.00 | 2,433,580.00 | 91,121.20 | 0.96
40 | 2.41E+06 | 121811 | 2.39E+06 | 131525 | **-0.96**
80 | 2.40E+06 | 134503 | 2.39E+06 | 108799 | **-0.78**
- stress test: `python3 tools/db_crashtest.py blackbox --charge_file_metadata=1 --cache_size=1` killed as normal
Reviewed By: ajkr
Differential Revision: D36055583
Pulled By: hx235
fbshipit-source-id: b60eab94707103cb1322cf815f05810ef0232625
3 years ago
|
|
|
|
|
|
|
if (cf_options.table_factory->IsInstanceOf(
|
|
|
|
TableFactory::kBlockBasedTableName()) &&
|
|
|
|
cf_options.table_factory->GetOptions<BlockBasedTableOptions>()) {
|
|
|
|
const BlockBasedTableOptions* bbto =
|
|
|
|
cf_options.table_factory->GetOptions<BlockBasedTableOptions>();
|
|
|
|
const auto& options_overrides = bbto->cache_usage_options.options_overrides;
|
|
|
|
const auto file_metadata_charged =
|
|
|
|
options_overrides.at(CacheEntryRole::kFileMetadata).charged;
|
|
|
|
if (bbto->block_cache &&
|
|
|
|
file_metadata_charged == CacheEntryRoleOptions::Decision::kEnabled) {
|
|
|
|
// TODO(hx235): Add a `ConcurrentCacheReservationManager` at DB scope
|
|
|
|
// responsible for reservation of `ObsoleteFileInfo` so that we can keep
|
|
|
|
// this `file_metadata_cache_res_mgr_` nonconcurrent
|
|
|
|
file_metadata_cache_res_mgr_.reset(new ConcurrentCacheReservationManager(
|
|
|
|
std::make_shared<
|
|
|
|
CacheReservationManagerImpl<CacheEntryRole::kFileMetadata>>(
|
|
|
|
bbto->block_cache)));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// DB mutex held
|
|
|
|
ColumnFamilyData::~ColumnFamilyData() {
|
|
|
|
assert(refs_.load(std::memory_order_relaxed) == 0);
|
|
|
|
// remove from linked list
|
|
|
|
auto prev = prev_;
|
|
|
|
auto next = next_;
|
|
|
|
prev->next_ = next;
|
|
|
|
next->prev_ = prev;
|
|
|
|
|
|
|
|
if (!dropped_ && column_family_set_ != nullptr) {
|
|
|
|
// If it's dropped, it's already removed from column family set
|
|
|
|
// If column_family_set_ == nullptr, this is dummy CFD and not in
|
|
|
|
// ColumnFamilySet
|
|
|
|
column_family_set_->RemoveColumnFamily(this);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (current_ != nullptr) {
|
|
|
|
current_->Unref();
|
|
|
|
}
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
10 years ago
|
|
|
// It would be wrong if this ColumnFamilyData is in flush_queue_ or
|
|
|
|
// compaction_queue_ and we destroyed it
|
|
|
|
assert(!queued_for_flush_);
|
|
|
|
assert(!queued_for_compaction_);
|
|
|
|
assert(super_version_ == nullptr);
|
|
|
|
|
|
|
|
if (dummy_versions_ != nullptr) {
|
|
|
|
// List must be empty
|
|
|
|
assert(dummy_versions_->Next() == dummy_versions_);
|
|
|
|
bool deleted __attribute__((__unused__));
|
|
|
|
deleted = dummy_versions_->Unref();
|
|
|
|
assert(deleted);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (mem_ != nullptr) {
|
|
|
|
delete mem_->Unref();
|
|
|
|
}
|
|
|
|
autovector<MemTable*> to_delete;
|
|
|
|
imm_.current()->Unref(&to_delete);
|
|
|
|
for (MemTable* m : to_delete) {
|
|
|
|
delete m;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (db_paths_registered_) {
|
|
|
|
// TODO(cc): considering using ioptions_.fs, currently some tests rely on
|
|
|
|
// EnvWrapper, that's the main reason why we use env here.
|
|
|
|
Status s = ioptions_.env->UnregisterDbPaths(GetDbPaths());
|
|
|
|
if (!s.ok()) {
|
|
|
|
ROCKS_LOG_ERROR(
|
|
|
|
ioptions_.logger,
|
|
|
|
"Failed to unregister data paths of column family (id: %d, name: %s)",
|
|
|
|
id_, name_.c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Fix a race in ColumnFamilyData::UnrefAndTryDelete (#8605)
Summary:
The `ColumnFamilyData::UnrefAndTryDelete` code currently on the trunk
unlocks the DB mutex before destroying the `ThreadLocalPtr` holding
the per-thread `SuperVersion` pointers when the only remaining reference
is the back reference from `super_version_`. The idea behind this was to
break the circular dependency between `ColumnFamilyData` and `SuperVersion`:
when the penultimate reference goes away, `ColumnFamilyData` can clean up
the `SuperVersion`, which can in turn clean up `ColumnFamilyData`. (Assuming there
is a `SuperVersion` and it is not referenced by anything else.) However,
unlocking the mutex throws a wrench in this plan by making it possible for another thread
to jump in and take another reference to the `ColumnFamilyData`, keeping the
object alive in a zombie `ThreadLocalPtr`-less state. This can cause issues like
https://github.com/facebook/rocksdb/issues/8440 ,
https://github.com/facebook/rocksdb/issues/8382 ,
and might also explain the `was_last_ref` assertion failures from the `ColumnFamilySet`
destructor we sometimes observe during close in our stress tests.
Digging through the archives, this unlocking goes way back to 2014 (or earlier). The original
rationale was that `SuperVersionUnrefHandle` used to lock the mutex so it can call
`SuperVersion::Cleanup`; however, this logic turned out to be deadlock-prone.
https://github.com/facebook/rocksdb/pull/3510 fixed the deadlock but left the
unlocking in place. https://github.com/facebook/rocksdb/pull/6147 then introduced
the circular dependency and associated cleanup logic described above (in order
to enable iterators to keep the `ColumnFamilyData` for dropped column families alive),
and moved the unlocking-relocking snippet to its present location in `UnrefAndTryDelete`.
Finally, https://github.com/facebook/rocksdb/pull/7749 fixed a memory leak but
apparently exacerbated the race by (otherwise correctly) switching to `UnrefAndTryDelete`
in `SuperVersion::Cleanup`.
The patch simply eliminates the unlocking and relocking, which has been unnecessary
ever since https://github.com/facebook/rocksdb/issues/3510 made `SuperVersionUnrefHandle` lock-free.
This closes the window during which another thread could increase the reference count,
and hopefully fixes the issues above.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8605
Test Plan: Ran `make check` and stress tests locally.
Reviewed By: pdillinger
Differential Revision: D30051035
Pulled By: ltamasi
fbshipit-source-id: 8fe559e4b4ad69fc142579f8bc393ef525918528
3 years ago
|
|
|
bool ColumnFamilyData::UnrefAndTryDelete() {
|
|
|
|
int old_refs = refs_.fetch_sub(1);
|
|
|
|
assert(old_refs > 0);
|
|
|
|
|
|
|
|
if (old_refs == 1) {
|
|
|
|
assert(super_version_ == nullptr);
|
|
|
|
delete this;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
Fix a race in ColumnFamilyData::UnrefAndTryDelete (#8605)
Summary:
The `ColumnFamilyData::UnrefAndTryDelete` code currently on the trunk
unlocks the DB mutex before destroying the `ThreadLocalPtr` holding
the per-thread `SuperVersion` pointers when the only remaining reference
is the back reference from `super_version_`. The idea behind this was to
break the circular dependency between `ColumnFamilyData` and `SuperVersion`:
when the penultimate reference goes away, `ColumnFamilyData` can clean up
the `SuperVersion`, which can in turn clean up `ColumnFamilyData`. (Assuming there
is a `SuperVersion` and it is not referenced by anything else.) However,
unlocking the mutex throws a wrench in this plan by making it possible for another thread
to jump in and take another reference to the `ColumnFamilyData`, keeping the
object alive in a zombie `ThreadLocalPtr`-less state. This can cause issues like
https://github.com/facebook/rocksdb/issues/8440 ,
https://github.com/facebook/rocksdb/issues/8382 ,
and might also explain the `was_last_ref` assertion failures from the `ColumnFamilySet`
destructor we sometimes observe during close in our stress tests.
Digging through the archives, this unlocking goes way back to 2014 (or earlier). The original
rationale was that `SuperVersionUnrefHandle` used to lock the mutex so it can call
`SuperVersion::Cleanup`; however, this logic turned out to be deadlock-prone.
https://github.com/facebook/rocksdb/pull/3510 fixed the deadlock but left the
unlocking in place. https://github.com/facebook/rocksdb/pull/6147 then introduced
the circular dependency and associated cleanup logic described above (in order
to enable iterators to keep the `ColumnFamilyData` for dropped column families alive),
and moved the unlocking-relocking snippet to its present location in `UnrefAndTryDelete`.
Finally, https://github.com/facebook/rocksdb/pull/7749 fixed a memory leak but
apparently exacerbated the race by (otherwise correctly) switching to `UnrefAndTryDelete`
in `SuperVersion::Cleanup`.
The patch simply eliminates the unlocking and relocking, which has been unnecessary
ever since https://github.com/facebook/rocksdb/issues/3510 made `SuperVersionUnrefHandle` lock-free.
This closes the window during which another thread could increase the reference count,
and hopefully fixes the issues above.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8605
Test Plan: Ran `make check` and stress tests locally.
Reviewed By: pdillinger
Differential Revision: D30051035
Pulled By: ltamasi
fbshipit-source-id: 8fe559e4b4ad69fc142579f8bc393ef525918528
3 years ago
|
|
|
if (old_refs == 2 && super_version_ != nullptr) {
|
|
|
|
// Only the super_version_ holds me
|
|
|
|
SuperVersion* sv = super_version_;
|
|
|
|
super_version_ = nullptr;
|
Fix a race in ColumnFamilyData::UnrefAndTryDelete (#8605)
Summary:
The `ColumnFamilyData::UnrefAndTryDelete` code currently on the trunk
unlocks the DB mutex before destroying the `ThreadLocalPtr` holding
the per-thread `SuperVersion` pointers when the only remaining reference
is the back reference from `super_version_`. The idea behind this was to
break the circular dependency between `ColumnFamilyData` and `SuperVersion`:
when the penultimate reference goes away, `ColumnFamilyData` can clean up
the `SuperVersion`, which can in turn clean up `ColumnFamilyData`. (Assuming there
is a `SuperVersion` and it is not referenced by anything else.) However,
unlocking the mutex throws a wrench in this plan by making it possible for another thread
to jump in and take another reference to the `ColumnFamilyData`, keeping the
object alive in a zombie `ThreadLocalPtr`-less state. This can cause issues like
https://github.com/facebook/rocksdb/issues/8440 ,
https://github.com/facebook/rocksdb/issues/8382 ,
and might also explain the `was_last_ref` assertion failures from the `ColumnFamilySet`
destructor we sometimes observe during close in our stress tests.
Digging through the archives, this unlocking goes way back to 2014 (or earlier). The original
rationale was that `SuperVersionUnrefHandle` used to lock the mutex so it can call
`SuperVersion::Cleanup`; however, this logic turned out to be deadlock-prone.
https://github.com/facebook/rocksdb/pull/3510 fixed the deadlock but left the
unlocking in place. https://github.com/facebook/rocksdb/pull/6147 then introduced
the circular dependency and associated cleanup logic described above (in order
to enable iterators to keep the `ColumnFamilyData` for dropped column families alive),
and moved the unlocking-relocking snippet to its present location in `UnrefAndTryDelete`.
Finally, https://github.com/facebook/rocksdb/pull/7749 fixed a memory leak but
apparently exacerbated the race by (otherwise correctly) switching to `UnrefAndTryDelete`
in `SuperVersion::Cleanup`.
The patch simply eliminates the unlocking and relocking, which has been unnecessary
ever since https://github.com/facebook/rocksdb/issues/3510 made `SuperVersionUnrefHandle` lock-free.
This closes the window during which another thread could increase the reference count,
and hopefully fixes the issues above.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8605
Test Plan: Ran `make check` and stress tests locally.
Reviewed By: pdillinger
Differential Revision: D30051035
Pulled By: ltamasi
fbshipit-source-id: 8fe559e4b4ad69fc142579f8bc393ef525918528
3 years ago
|
|
|
|
|
|
|
// Release SuperVersion references kept in ThreadLocalPtr.
|
|
|
|
local_sv_.reset();
|
|
|
|
|
|
|
|
if (sv->Unref()) {
|
Fix a race in ColumnFamilyData::UnrefAndTryDelete (#8605)
Summary:
The `ColumnFamilyData::UnrefAndTryDelete` code currently on the trunk
unlocks the DB mutex before destroying the `ThreadLocalPtr` holding
the per-thread `SuperVersion` pointers when the only remaining reference
is the back reference from `super_version_`. The idea behind this was to
break the circular dependency between `ColumnFamilyData` and `SuperVersion`:
when the penultimate reference goes away, `ColumnFamilyData` can clean up
the `SuperVersion`, which can in turn clean up `ColumnFamilyData`. (Assuming there
is a `SuperVersion` and it is not referenced by anything else.) However,
unlocking the mutex throws a wrench in this plan by making it possible for another thread
to jump in and take another reference to the `ColumnFamilyData`, keeping the
object alive in a zombie `ThreadLocalPtr`-less state. This can cause issues like
https://github.com/facebook/rocksdb/issues/8440 ,
https://github.com/facebook/rocksdb/issues/8382 ,
and might also explain the `was_last_ref` assertion failures from the `ColumnFamilySet`
destructor we sometimes observe during close in our stress tests.
Digging through the archives, this unlocking goes way back to 2014 (or earlier). The original
rationale was that `SuperVersionUnrefHandle` used to lock the mutex so it can call
`SuperVersion::Cleanup`; however, this logic turned out to be deadlock-prone.
https://github.com/facebook/rocksdb/pull/3510 fixed the deadlock but left the
unlocking in place. https://github.com/facebook/rocksdb/pull/6147 then introduced
the circular dependency and associated cleanup logic described above (in order
to enable iterators to keep the `ColumnFamilyData` for dropped column families alive),
and moved the unlocking-relocking snippet to its present location in `UnrefAndTryDelete`.
Finally, https://github.com/facebook/rocksdb/pull/7749 fixed a memory leak but
apparently exacerbated the race by (otherwise correctly) switching to `UnrefAndTryDelete`
in `SuperVersion::Cleanup`.
The patch simply eliminates the unlocking and relocking, which has been unnecessary
ever since https://github.com/facebook/rocksdb/issues/3510 made `SuperVersionUnrefHandle` lock-free.
This closes the window during which another thread could increase the reference count,
and hopefully fixes the issues above.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8605
Test Plan: Ran `make check` and stress tests locally.
Reviewed By: pdillinger
Differential Revision: D30051035
Pulled By: ltamasi
fbshipit-source-id: 8fe559e4b4ad69fc142579f8bc393ef525918528
3 years ago
|
|
|
// Note: sv will delete this ColumnFamilyData during Cleanup()
|
|
|
|
assert(sv->cfd == this);
|
|
|
|
sv->Cleanup();
|
|
|
|
delete sv;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ColumnFamilyData::SetDropped() {
|
|
|
|
// can't drop default CF
|
|
|
|
assert(id_ != 0);
|
|
|
|
dropped_ = true;
|
|
|
|
write_controller_token_.reset();
|
|
|
|
|
|
|
|
// remove from column_family_set
|
|
|
|
column_family_set_->RemoveColumnFamily(this);
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyOptions ColumnFamilyData::GetLatestCFOptions() const {
|
|
|
|
return BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t ColumnFamilyData::OldestLogToKeep() {
|
|
|
|
auto current_log = GetLogNumber();
|
|
|
|
|
|
|
|
if (allow_2pc_) {
|
|
|
|
auto imm_prep_log = imm()->PrecomputeMinLogContainingPrepSection();
|
|
|
|
auto mem_prep_log = mem()->GetMinLogContainingPrepSection();
|
|
|
|
|
|
|
|
if (imm_prep_log > 0 && imm_prep_log < current_log) {
|
|
|
|
current_log = imm_prep_log;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (mem_prep_log > 0 && mem_prep_log < current_log) {
|
|
|
|
current_log = mem_prep_log;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return current_log;
|
|
|
|
}
|
|
|
|
|
|
|
|
const double kIncSlowdownRatio = 0.8;
|
|
|
|
const double kDecSlowdownRatio = 1 / kIncSlowdownRatio;
|
|
|
|
const double kNearStopSlowdownRatio = 0.6;
|
|
|
|
const double kDelayRecoverSlowdownRatio = 1.4;
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
|
|
|
|
namespace {
|
|
|
|
// If penalize_stop is true, we further reduce slowdown rate.
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
std::unique_ptr<WriteControllerToken> SetupDelay(
|
|
|
|
WriteController* write_controller, uint64_t compaction_needed_bytes,
|
|
|
|
uint64_t prev_compaction_need_bytes, bool penalize_stop,
|
|
|
|
bool auto_compactions_disabled) {
|
|
|
|
const uint64_t kMinWriteRate = 16 * 1024u; // Minimum write rate 16KB/s.
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
|
|
|
|
uint64_t max_write_rate = write_controller->max_delayed_write_rate();
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
uint64_t write_rate = write_controller->delayed_write_rate();
|
|
|
|
|
|
|
|
if (auto_compactions_disabled) {
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
// When auto compaction is disabled, always use the value user gave.
|
|
|
|
write_rate = max_write_rate;
|
|
|
|
} else if (write_controller->NeedsDelay() && max_write_rate > kMinWriteRate) {
|
|
|
|
// If user gives rate less than kMinWriteRate, don't adjust it.
|
|
|
|
//
|
|
|
|
// If already delayed, need to adjust based on previous compaction debt.
|
|
|
|
// When there are two or more column families require delay, we always
|
|
|
|
// increase or reduce write rate based on information for one single
|
|
|
|
// column family. It is likely to be OK but we can improve if there is a
|
|
|
|
// problem.
|
|
|
|
// Ignore compaction_needed_bytes = 0 case because compaction_needed_bytes
|
|
|
|
// is only available in level-based compaction
|
|
|
|
//
|
|
|
|
// If the compaction debt stays the same as previously, we also further slow
|
|
|
|
// down. It usually means a mem table is full. It's mainly for the case
|
|
|
|
// where both of flush and compaction are much slower than the speed we
|
|
|
|
// insert to mem tables, so we need to actively slow down before we get
|
|
|
|
// feedback signal from compaction and flushes to avoid the full stop
|
|
|
|
// because of hitting the max write buffer number.
|
|
|
|
//
|
|
|
|
// If DB just falled into the stop condition, we need to further reduce
|
|
|
|
// the write rate to avoid the stop condition.
|
|
|
|
if (penalize_stop) {
|
|
|
|
// Penalize the near stop or stop condition by more aggressive slowdown.
|
|
|
|
// This is to provide the long term slowdown increase signal.
|
|
|
|
// The penalty is more than the reward of recovering to the normal
|
|
|
|
// condition.
|
|
|
|
write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
|
|
|
|
kNearStopSlowdownRatio);
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
if (write_rate < kMinWriteRate) {
|
|
|
|
write_rate = kMinWriteRate;
|
|
|
|
}
|
|
|
|
} else if (prev_compaction_need_bytes > 0 &&
|
|
|
|
prev_compaction_need_bytes <= compaction_needed_bytes) {
|
|
|
|
write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
|
|
|
|
kIncSlowdownRatio);
|
|
|
|
if (write_rate < kMinWriteRate) {
|
|
|
|
write_rate = kMinWriteRate;
|
|
|
|
}
|
|
|
|
} else if (prev_compaction_need_bytes > compaction_needed_bytes) {
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
// We are speeding up by ratio of kSlowdownRatio when we have paid
|
|
|
|
// compaction debt. But we'll never speed up to faster than the write rate
|
|
|
|
// given by users.
|
|
|
|
write_rate = static_cast<uint64_t>(static_cast<double>(write_rate) *
|
|
|
|
kDecSlowdownRatio);
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
if (write_rate > max_write_rate) {
|
|
|
|
write_rate = max_write_rate;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return write_controller->GetDelayToken(write_rate);
|
|
|
|
}
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
|
|
|
|
int GetL0ThresholdSpeedupCompaction(int level0_file_num_compaction_trigger,
|
|
|
|
int level0_slowdown_writes_trigger) {
|
|
|
|
// SanitizeOptions() ensures it.
|
|
|
|
assert(level0_file_num_compaction_trigger <= level0_slowdown_writes_trigger);
|
|
|
|
|
|
|
|
if (level0_file_num_compaction_trigger < 0) {
|
|
|
|
return std::numeric_limits<int>::max();
|
|
|
|
}
|
|
|
|
|
|
|
|
const int64_t twice_level0_trigger =
|
|
|
|
static_cast<int64_t>(level0_file_num_compaction_trigger) * 2;
|
|
|
|
|
|
|
|
const int64_t one_fourth_trigger_slowdown =
|
|
|
|
static_cast<int64_t>(level0_file_num_compaction_trigger) +
|
|
|
|
((level0_slowdown_writes_trigger - level0_file_num_compaction_trigger) /
|
|
|
|
4);
|
|
|
|
|
|
|
|
assert(twice_level0_trigger >= 0);
|
|
|
|
assert(one_fourth_trigger_slowdown >= 0);
|
|
|
|
|
|
|
|
// 1/4 of the way between L0 compaction trigger threshold and slowdown
|
|
|
|
// condition.
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
// Or twice as compaction trigger, if it is smaller.
|
|
|
|
int64_t res = std::min(twice_level0_trigger, one_fourth_trigger_slowdown);
|
|
|
|
if (res >= std::numeric_limits<int32_t>::max()) {
|
|
|
|
return std::numeric_limits<int32_t>::max();
|
|
|
|
} else {
|
|
|
|
// res fits in int
|
|
|
|
return static_cast<int>(res);
|
|
|
|
}
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
}
|
|
|
|
} // anonymous namespace
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
|
|
|
|
std::pair<WriteStallCondition, WriteStallCause>
|
|
|
|
ColumnFamilyData::GetWriteStallConditionAndCause(
|
|
|
|
int num_unflushed_memtables, int num_l0_files,
|
|
|
|
uint64_t num_compaction_needed_bytes,
|
|
|
|
const MutableCFOptions& mutable_cf_options,
|
|
|
|
const ImmutableCFOptions& immutable_cf_options) {
|
|
|
|
if (num_unflushed_memtables >= mutable_cf_options.max_write_buffer_number) {
|
|
|
|
return {WriteStallCondition::kStopped, WriteStallCause::kMemtableLimit};
|
|
|
|
} else if (!mutable_cf_options.disable_auto_compactions &&
|
|
|
|
num_l0_files >= mutable_cf_options.level0_stop_writes_trigger) {
|
|
|
|
return {WriteStallCondition::kStopped, WriteStallCause::kL0FileCountLimit};
|
|
|
|
} else if (!mutable_cf_options.disable_auto_compactions &&
|
|
|
|
mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
|
|
|
|
num_compaction_needed_bytes >=
|
|
|
|
mutable_cf_options.hard_pending_compaction_bytes_limit) {
|
|
|
|
return {WriteStallCondition::kStopped,
|
|
|
|
WriteStallCause::kPendingCompactionBytes};
|
|
|
|
} else if (mutable_cf_options.max_write_buffer_number > 3 &&
|
|
|
|
num_unflushed_memtables >=
|
|
|
|
mutable_cf_options.max_write_buffer_number - 1 &&
|
|
|
|
num_unflushed_memtables - 1 >=
|
|
|
|
immutable_cf_options.min_write_buffer_number_to_merge) {
|
|
|
|
return {WriteStallCondition::kDelayed, WriteStallCause::kMemtableLimit};
|
|
|
|
} else if (!mutable_cf_options.disable_auto_compactions &&
|
|
|
|
mutable_cf_options.level0_slowdown_writes_trigger >= 0 &&
|
|
|
|
num_l0_files >=
|
|
|
|
mutable_cf_options.level0_slowdown_writes_trigger) {
|
|
|
|
return {WriteStallCondition::kDelayed, WriteStallCause::kL0FileCountLimit};
|
|
|
|
} else if (!mutable_cf_options.disable_auto_compactions &&
|
|
|
|
mutable_cf_options.soft_pending_compaction_bytes_limit > 0 &&
|
|
|
|
num_compaction_needed_bytes >=
|
|
|
|
mutable_cf_options.soft_pending_compaction_bytes_limit) {
|
|
|
|
return {WriteStallCondition::kDelayed,
|
|
|
|
WriteStallCause::kPendingCompactionBytes};
|
|
|
|
}
|
|
|
|
return {WriteStallCondition::kNormal, WriteStallCause::kNone};
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteStallCondition ColumnFamilyData::RecalculateWriteStallConditions(
|
|
|
|
const MutableCFOptions& mutable_cf_options) {
|
|
|
|
auto write_stall_condition = WriteStallCondition::kNormal;
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
11 years ago
|
|
|
if (current_ != nullptr) {
|
|
|
|
auto* vstorage = current_->storage_info();
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
auto write_controller = column_family_set_->write_controller_;
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
uint64_t compaction_needed_bytes =
|
|
|
|
vstorage->estimated_compaction_needed_bytes();
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
|
|
|
|
auto write_stall_condition_and_cause = GetWriteStallConditionAndCause(
|
|
|
|
imm()->NumNotFlushed(), vstorage->l0_delay_trigger_count(),
|
|
|
|
vstorage->estimated_compaction_needed_bytes(), mutable_cf_options,
|
|
|
|
*ioptions());
|
|
|
|
write_stall_condition = write_stall_condition_and_cause.first;
|
|
|
|
auto write_stall_cause = write_stall_condition_and_cause.second;
|
|
|
|
|
|
|
|
bool was_stopped = write_controller->IsStopped();
|
|
|
|
bool needed_delay = write_controller->NeedsDelay();
|
|
|
|
|
|
|
|
if (write_stall_condition == WriteStallCondition::kStopped &&
|
|
|
|
write_stall_cause == WriteStallCause::kMemtableLimit) {
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
write_controller_token_ = write_controller->GetStopToken();
|
|
|
|
internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_STOPS, 1);
|
|
|
|
ROCKS_LOG_WARN(
|
|
|
|
ioptions_.logger,
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
"[%s] Stopping writes because we have %d immutable memtables "
|
|
|
|
"(waiting for flush), max_write_buffer_number is set to %d",
|
Support saving history in memtable_list
Summary:
For transactions, we are using the memtables to validate that there are no write conflicts. But after flushing, we don't have any memtables, and transactions could fail to commit. So we want to someone keep around some extra history to use for conflict checking. In addition, we want to provide a way to increase the size of this history if too many transactions fail to commit.
After chatting with people, it seems like everyone prefers just using Memtables to store this history (instead of a separate history structure). It seems like the best place for this is abstracted inside the memtable_list. I decide to create a separate list in MemtableListVersion as using the same list complicated the flush/installalflushresults logic too much.
This diff adds a new parameter to control how much memtable history to keep around after flushing. However, it sounds like people aren't too fond of adding new parameters. So I am making the default size of flushed+not-flushed memtables be set to max_write_buffers. This should not change the maximum amount of memory used, but make it more likely we're using closer the the limit. (We are now postponing deleting flushed memtables until the max_write_buffer limit is reached). So while we might use more memory on average, we are still obeying the limit set (and you could argue it's better to go ahead and use up memory now instead of waiting for a write stall to happen to test this limit).
However, if people are opposed to this default behavior, we can easily set it to 0 and require this parameter be set in order to use transactions.
Test Plan: Added a xfunc test to play around with setting different values of this parameter in all tests. Added testing in memtablelist_test and planning on adding more testing here.
Reviewers: sdong, rven, igor
Reviewed By: igor
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D37443
10 years ago
|
|
|
name_.c_str(), imm()->NumNotFlushed(),
|
|
|
|
mutable_cf_options.max_write_buffer_number);
|
|
|
|
} else if (write_stall_condition == WriteStallCondition::kStopped &&
|
|
|
|
write_stall_cause == WriteStallCause::kL0FileCountLimit) {
|
|
|
|
write_controller_token_ = write_controller->GetStopToken();
|
|
|
|
internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_STOPS, 1);
|
|
|
|
if (compaction_picker_->IsLevel0CompactionInProgress()) {
|
|
|
|
internal_stats_->AddCFStats(
|
|
|
|
InternalStats::L0_FILE_COUNT_LIMIT_STOPS_WITH_ONGOING_COMPACTION,
|
|
|
|
1);
|
|
|
|
}
|
|
|
|
ROCKS_LOG_WARN(ioptions_.logger,
|
|
|
|
"[%s] Stopping writes because we have %d level-0 files",
|
|
|
|
name_.c_str(), vstorage->l0_delay_trigger_count());
|
|
|
|
} else if (write_stall_condition == WriteStallCondition::kStopped &&
|
|
|
|
write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
|
|
|
|
write_controller_token_ = write_controller->GetStopToken();
|
|
|
|
internal_stats_->AddCFStats(
|
|
|
|
InternalStats::PENDING_COMPACTION_BYTES_LIMIT_STOPS, 1);
|
|
|
|
ROCKS_LOG_WARN(
|
|
|
|
ioptions_.logger,
|
|
|
|
"[%s] Stopping writes because of estimated pending compaction "
|
|
|
|
"bytes %" PRIu64,
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
name_.c_str(), compaction_needed_bytes);
|
|
|
|
} else if (write_stall_condition == WriteStallCondition::kDelayed &&
|
|
|
|
write_stall_cause == WriteStallCause::kMemtableLimit) {
|
|
|
|
write_controller_token_ =
|
|
|
|
SetupDelay(write_controller, compaction_needed_bytes,
|
|
|
|
prev_compaction_needed_bytes_, was_stopped,
|
|
|
|
mutable_cf_options.disable_auto_compactions);
|
|
|
|
internal_stats_->AddCFStats(InternalStats::MEMTABLE_LIMIT_DELAYS, 1);
|
|
|
|
ROCKS_LOG_WARN(
|
|
|
|
ioptions_.logger,
|
|
|
|
"[%s] Stalling writes because we have %d immutable memtables "
|
|
|
|
"(waiting for flush), max_write_buffer_number is set to %d "
|
|
|
|
"rate %" PRIu64,
|
|
|
|
name_.c_str(), imm()->NumNotFlushed(),
|
|
|
|
mutable_cf_options.max_write_buffer_number,
|
|
|
|
write_controller->delayed_write_rate());
|
|
|
|
} else if (write_stall_condition == WriteStallCondition::kDelayed &&
|
|
|
|
write_stall_cause == WriteStallCause::kL0FileCountLimit) {
|
|
|
|
// L0 is the last two files from stopping.
|
|
|
|
bool near_stop = vstorage->l0_delay_trigger_count() >=
|
|
|
|
mutable_cf_options.level0_stop_writes_trigger - 2;
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
write_controller_token_ =
|
|
|
|
SetupDelay(write_controller, compaction_needed_bytes,
|
|
|
|
prev_compaction_needed_bytes_, was_stopped || near_stop,
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
mutable_cf_options.disable_auto_compactions);
|
|
|
|
internal_stats_->AddCFStats(InternalStats::L0_FILE_COUNT_LIMIT_DELAYS, 1);
|
|
|
|
if (compaction_picker_->IsLevel0CompactionInProgress()) {
|
|
|
|
internal_stats_->AddCFStats(
|
|
|
|
InternalStats::L0_FILE_COUNT_LIMIT_DELAYS_WITH_ONGOING_COMPACTION,
|
|
|
|
1);
|
|
|
|
}
|
|
|
|
ROCKS_LOG_WARN(ioptions_.logger,
|
|
|
|
"[%s] Stalling writes because we have %d level-0 files "
|
|
|
|
"rate %" PRIu64,
|
|
|
|
name_.c_str(), vstorage->l0_delay_trigger_count(),
|
|
|
|
write_controller->delayed_write_rate());
|
|
|
|
} else if (write_stall_condition == WriteStallCondition::kDelayed &&
|
|
|
|
write_stall_cause == WriteStallCause::kPendingCompactionBytes) {
|
|
|
|
// If the distance to hard limit is less than 1/4 of the gap between soft
|
|
|
|
// and
|
|
|
|
// hard bytes limit, we think it is near stop and speed up the slowdown.
|
|
|
|
bool near_stop =
|
|
|
|
mutable_cf_options.hard_pending_compaction_bytes_limit > 0 &&
|
|
|
|
(compaction_needed_bytes -
|
|
|
|
mutable_cf_options.soft_pending_compaction_bytes_limit) >
|
|
|
|
3 *
|
|
|
|
(mutable_cf_options.hard_pending_compaction_bytes_limit -
|
|
|
|
mutable_cf_options.soft_pending_compaction_bytes_limit) /
|
|
|
|
4;
|
|
|
|
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
write_controller_token_ =
|
|
|
|
SetupDelay(write_controller, compaction_needed_bytes,
|
|
|
|
prev_compaction_needed_bytes_, was_stopped || near_stop,
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
mutable_cf_options.disable_auto_compactions);
|
|
|
|
internal_stats_->AddCFStats(
|
|
|
|
InternalStats::PENDING_COMPACTION_BYTES_LIMIT_DELAYS, 1);
|
|
|
|
ROCKS_LOG_WARN(
|
|
|
|
ioptions_.logger,
|
|
|
|
"[%s] Stalling writes because of estimated pending compaction "
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
"bytes %" PRIu64 " rate %" PRIu64,
|
|
|
|
name_.c_str(), vstorage->estimated_compaction_needed_bytes(),
|
|
|
|
write_controller->delayed_write_rate());
|
|
|
|
} else {
|
|
|
|
assert(write_stall_condition == WriteStallCondition::kNormal);
|
|
|
|
if (vstorage->l0_delay_trigger_count() >=
|
|
|
|
GetL0ThresholdSpeedupCompaction(
|
|
|
|
mutable_cf_options.level0_file_num_compaction_trigger,
|
|
|
|
mutable_cf_options.level0_slowdown_writes_trigger)) {
|
|
|
|
write_controller_token_ =
|
|
|
|
write_controller->GetCompactionPressureToken();
|
|
|
|
ROCKS_LOG_INFO(
|
|
|
|
ioptions_.logger,
|
|
|
|
"[%s] Increasing compaction threads because we have %d level-0 "
|
|
|
|
"files ",
|
|
|
|
name_.c_str(), vstorage->l0_delay_trigger_count());
|
|
|
|
} else if (vstorage->estimated_compaction_needed_bytes() >=
|
|
|
|
mutable_cf_options.soft_pending_compaction_bytes_limit / 4) {
|
|
|
|
// Increase compaction threads if bytes needed for compaction exceeds
|
|
|
|
// 1/4 of threshold for slowing down.
|
|
|
|
// If soft pending compaction byte limit is not set, always speed up
|
|
|
|
// compaction.
|
|
|
|
write_controller_token_ =
|
|
|
|
write_controller->GetCompactionPressureToken();
|
|
|
|
if (mutable_cf_options.soft_pending_compaction_bytes_limit > 0) {
|
|
|
|
ROCKS_LOG_INFO(
|
|
|
|
ioptions_.logger,
|
|
|
|
"[%s] Increasing compaction threads because of estimated pending "
|
|
|
|
"compaction "
|
|
|
|
"bytes %" PRIu64,
|
|
|
|
name_.c_str(), vstorage->estimated_compaction_needed_bytes());
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
write_controller_token_.reset();
|
|
|
|
}
|
|
|
|
// If the DB recovers from delay conditions, we reward with reducing
|
|
|
|
// double the slowdown ratio. This is to balance the long term slowdown
|
|
|
|
// increase signal.
|
|
|
|
if (needed_delay) {
|
|
|
|
uint64_t write_rate = write_controller->delayed_write_rate();
|
|
|
|
write_controller->set_delayed_write_rate(static_cast<uint64_t>(
|
|
|
|
static_cast<double>(write_rate) * kDelayRecoverSlowdownRatio));
|
|
|
|
// Set the low pri limit to be 1/4 the delayed write rate.
|
|
|
|
// Note we don't reset this value even after delay condition is relased.
|
|
|
|
// Low-pri rate will continue to apply if there is a compaction
|
|
|
|
// pressure.
|
|
|
|
write_controller->low_pri_rate_limiter()->SetBytesPerSecond(write_rate /
|
|
|
|
4);
|
Add options.base_background_compactions as a number of compaction threads for low compaction debt
Summary:
If options.base_background_compactions is given, we try to schedule number of compactions not existing this number, only when L0 files increase to certain number, or pending compaction bytes more than certain threshold, we schedule compactions based on options.max_background_compactions.
The watermarks are calculated based on slowdown thresholds.
Test Plan:
Add new test cases in column_family_test.
Adding more unit tests.
Reviewers: IslamAbdelRahman, yhchiang, kradhakrishnan, rven, anthony
Reviewed By: anthony
Subscribers: leveldb, dhruba, yoshinorim
Differential Revision: https://reviews.facebook.net/D53409
9 years ago
|
|
|
}
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
}
|
When slowdown is triggered, reduce the write rate
Summary: It's usually hard for users to set a value of options.delayed_write_rate. With this diff, after slowdown condition triggers, we greedily reduce write rate if estimated pending compaction bytes increase. If estimated compaction pending bytes drop, we increase the write rate.
Test Plan:
Add a unit test
Test with db_bench setting:
TEST_TMPDIR=/dev/shm/ ./db_bench --benchmarks=fillrandom -num=10000000 --soft_pending_compaction_bytes_limit=1000000000 --hard_pending_compaction_bytes_limit=3000000000 --delayed_write_rate=100000000
and make sure without the commit, write stop will happen, but with the commit, it will not happen.
Reviewers: igor, anthony, rven, yhchiang, kradhakrishnan, IslamAbdelRahman
Reviewed By: IslamAbdelRahman
Subscribers: leveldb, dhruba
Differential Revision: https://reviews.facebook.net/D52131
9 years ago
|
|
|
prev_compaction_needed_bytes_ = compaction_needed_bytes;
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
11 years ago
|
|
|
}
|
|
|
|
return write_stall_condition;
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
11 years ago
|
|
|
}
|
|
|
|
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
const FileOptions* ColumnFamilyData::soptions() const {
|
|
|
|
return &(column_family_set_->file_options_);
|
|
|
|
}
|
|
|
|
|
|
|
|
void ColumnFamilyData::SetCurrent(Version* current_version) {
|
|
|
|
current_ = current_version;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t ColumnFamilyData::GetNumLiveVersions() const {
|
|
|
|
return VersionSet::GetNumLiveVersions(dummy_versions_);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t ColumnFamilyData::GetTotalSstFilesSize() const {
|
|
|
|
return VersionSet::GetTotalSstFilesSize(dummy_versions_);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t ColumnFamilyData::GetTotalBlobFileSize() const {
|
|
|
|
return VersionSet::GetTotalBlobFileSize(dummy_versions_);
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t ColumnFamilyData::GetLiveSstFilesSize() const {
|
|
|
|
return current_->GetSstFilesSize();
|
|
|
|
}
|
|
|
|
|
|
|
|
MemTable* ColumnFamilyData::ConstructNewMemtable(
|
|
|
|
const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
|
|
|
|
return new MemTable(internal_comparator_, ioptions_, mutable_cf_options,
|
|
|
|
write_buffer_manager_, earliest_seq, id_);
|
|
|
|
}
|
|
|
|
|
|
|
|
void ColumnFamilyData::CreateNewMemtable(
|
|
|
|
const MutableCFOptions& mutable_cf_options, SequenceNumber earliest_seq) {
|
|
|
|
if (mem_ != nullptr) {
|
|
|
|
delete mem_->Unref();
|
|
|
|
}
|
|
|
|
SetMemtable(ConstructNewMemtable(mutable_cf_options, earliest_seq));
|
|
|
|
mem_->Ref();
|
|
|
|
}
|
|
|
|
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
10 years ago
|
|
|
bool ColumnFamilyData::NeedsCompaction() const {
|
|
|
|
return !mutable_cf_options_.disable_auto_compactions &&
|
|
|
|
compaction_picker_->NeedsCompaction(current_->storage_info());
|
Rewritten system for scheduling background work
Summary:
When scaling to higher number of column families, the worst bottleneck was MaybeScheduleFlushOrCompaction(), which did a for loop over all column families while holding a mutex. This patch addresses the issue.
The approach is similar to our earlier efforts: instead of a pull-model, where we do something for every column family, we can do a push-based model -- when we detect that column family is ready to be flushed/compacted, we add it to the flush_queue_/compaction_queue_. That way we don't need to loop over every column family in MaybeScheduleFlushOrCompaction.
Here are the performance results:
Command:
./db_bench --write_buffer_size=268435456 --db_write_buffer_size=268435456 --db=/fast-rocksdb-tmp/rocks_lots_of_cf --use_existing_db=0 --open_files=55000 --statistics=1 --histogram=1 --disable_data_sync=1 --max_write_buffer_number=2 --sync=0 --benchmarks=fillrandom --threads=16 --num_column_families=5000 --disable_wal=1 --max_background_flushes=16 --max_background_compactions=16 --level0_file_num_compaction_trigger=2 --level0_slowdown_writes_trigger=2 --level0_stop_writes_trigger=3 --hard_rate_limit=1 --num=33333333 --writes=33333333
Before the patch:
fillrandom : 26.950 micros/op 37105 ops/sec; 4.1 MB/s
After the patch:
fillrandom : 17.404 micros/op 57456 ops/sec; 6.4 MB/s
Next bottleneck is VersionSet::AddLiveFiles, which is painfully slow when we have a lot of files. This is coming in the next patch, but when I removed that code, here's what I got:
fillrandom : 7.590 micros/op 131758 ops/sec; 14.6 MB/s
Test Plan:
make check
two stress tests:
Big number of compactions and flushes:
./db_stress --threads=30 --ops_per_thread=20000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=15 --max_background_compactions=10 --max_background_flushes=10 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
max_background_flushes=0, to verify that this case also works correctly
./db_stress --threads=30 --ops_per_thread=2000000 --max_key=10000 --column_families=20 --clear_column_family_one_in=10000000 --verify_before_write=0 --reopen=3 --max_background_compactions=3 --max_background_flushes=0 --db=/fast-rocksdb-tmp/db_stress --prefixpercent=0 --iterpercent=0 --writepercent=75 --db_write_buffer_size=2000000
Reviewers: ljin, rven, yhchiang, sdong
Reviewed By: sdong
Subscribers: dhruba, leveldb
Differential Revision: https://reviews.facebook.net/D30123
10 years ago
|
|
|
}
|
|
|
|
|
|
|
|
Compaction* ColumnFamilyData::PickCompaction(
|
|
|
|
const MutableCFOptions& mutable_options,
|
|
|
|
const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) {
|
|
|
|
auto* result = compaction_picker_->PickCompaction(
|
|
|
|
GetName(), mutable_options, mutable_db_options, current_->storage_info(),
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
log_buffer);
|
|
|
|
if (result != nullptr) {
|
|
|
|
result->SetInputVersion(current_);
|
|
|
|
}
|
Cache some conditions for DBImpl::MakeRoomForWrite
Summary:
Task 4580155. Some conditions in DBImpl::MakeRoomForWrite can be cached in
ColumnFamilyData, because theirs value can be changed only during compaction,
adding new memtable and/or add recalculation of compaction score.
These conditions are:
cfd->imm()->size() == cfd->options()->max_write_buffer_number - 1
cfd->current()->NumLevelFiles(0) >= cfd->options()->level0_stop_writes_trigger
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->soft_rate_limit
cfd->options()->hard_rate_limit > 1.0 &&
(score = cfd->current()->MaxCompactionScore()) > cfd->options()->hard_rate_limit
P.S.
As it's my first diff, Siying suggested to add everybody as a reviewers
for this diff. Sorry, if I forgot someone or add someone by mistake.
Test Plan: make all check
Reviewers: haobo, xjin, dhruba, yhchiang, zagfox, ljin, sdong
Reviewed By: sdong
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19311
11 years ago
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ColumnFamilyData::RangeOverlapWithCompaction(
|
|
|
|
const Slice& smallest_user_key, const Slice& largest_user_key,
|
|
|
|
int level) const {
|
|
|
|
return compaction_picker_->RangeOverlapWithCompaction(
|
|
|
|
smallest_user_key, largest_user_key, level);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status ColumnFamilyData::RangesOverlapWithMemtables(
|
|
|
|
const autovector<Range>& ranges, SuperVersion* super_version,
|
|
|
|
bool allow_data_in_errors, bool* overlap) {
|
|
|
|
assert(overlap != nullptr);
|
|
|
|
*overlap = false;
|
|
|
|
// Create an InternalIterator over all unflushed memtables
|
|
|
|
Arena arena;
|
|
|
|
// TODO: plumb Env::IOActivity
|
|
|
|
ReadOptions read_opts;
|
|
|
|
read_opts.total_order_seek = true;
|
|
|
|
MergeIteratorBuilder merge_iter_builder(&internal_comparator_, &arena);
|
|
|
|
merge_iter_builder.AddIterator(
|
|
|
|
super_version->mem->NewIterator(read_opts, &arena));
|
|
|
|
super_version->imm->AddIterators(read_opts, &merge_iter_builder,
|
|
|
|
false /* add_range_tombstone_iter */);
|
|
|
|
ScopedArenaIterator memtable_iter(merge_iter_builder.Finish());
|
|
|
|
|
|
|
|
auto read_seq = super_version->current->version_set()->LastSequence();
|
|
|
|
ReadRangeDelAggregator range_del_agg(&internal_comparator_, read_seq);
|
|
|
|
auto* active_range_del_iter = super_version->mem->NewRangeTombstoneIterator(
|
|
|
|
read_opts, read_seq, false /* immutable_memtable */);
|
|
|
|
range_del_agg.AddTombstones(
|
|
|
|
std::unique_ptr<FragmentedRangeTombstoneIterator>(active_range_del_iter));
|
|
|
|
Status status;
|
|
|
|
status = super_version->imm->AddRangeTombstoneIterators(
|
|
|
|
read_opts, nullptr /* arena */, &range_del_agg);
|
|
|
|
// AddRangeTombstoneIterators always return Status::OK.
|
|
|
|
assert(status.ok());
|
|
|
|
|
|
|
|
for (size_t i = 0; i < ranges.size() && status.ok() && !*overlap; ++i) {
|
|
|
|
auto* vstorage = super_version->current->storage_info();
|
|
|
|
auto* ucmp = vstorage->InternalComparator()->user_comparator();
|
|
|
|
InternalKey range_start(ranges[i].start, kMaxSequenceNumber,
|
|
|
|
kValueTypeForSeek);
|
|
|
|
memtable_iter->Seek(range_start.Encode());
|
|
|
|
status = memtable_iter->status();
|
|
|
|
ParsedInternalKey seek_result;
|
|
|
|
|
|
|
|
if (status.ok() && memtable_iter->Valid()) {
|
|
|
|
status = ParseInternalKey(memtable_iter->key(), &seek_result,
|
|
|
|
allow_data_in_errors);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
if (memtable_iter->Valid() &&
|
|
|
|
ucmp->Compare(seek_result.user_key, ranges[i].limit) <= 0) {
|
|
|
|
*overlap = true;
|
|
|
|
} else if (range_del_agg.IsRangeOverlapped(ranges[i].start,
|
|
|
|
ranges[i].limit)) {
|
|
|
|
*overlap = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
const int ColumnFamilyData::kCompactAllLevels = -1;
|
|
|
|
const int ColumnFamilyData::kCompactToBaseLevel = -2;
|
|
|
|
|
|
|
|
Compaction* ColumnFamilyData::CompactRange(
|
|
|
|
const MutableCFOptions& mutable_cf_options,
|
|
|
|
const MutableDBOptions& mutable_db_options, int input_level,
|
|
|
|
int output_level, const CompactRangeOptions& compact_range_options,
|
|
|
|
const InternalKey* begin, const InternalKey* end,
|
|
|
|
InternalKey** compaction_end, bool* conflict,
|
|
|
|
uint64_t max_file_num_to_ignore, const std::string& trim_ts) {
|
|
|
|
auto* result = compaction_picker_->CompactRange(
|
|
|
|
GetName(), mutable_cf_options, mutable_db_options,
|
|
|
|
current_->storage_info(), input_level, output_level,
|
|
|
|
compact_range_options, begin, end, compaction_end, conflict,
|
|
|
|
max_file_num_to_ignore, trim_ts);
|
|
|
|
if (result != nullptr) {
|
|
|
|
result->SetInputVersion(current_);
|
|
|
|
}
|
Add missing range conflict check between file ingestion and RefitLevel() (#10988)
Summary:
**Context:**
File ingestion never checks whether the key range it acts on overlaps with an ongoing RefitLevel() (used in `CompactRange()` with `change_level=true`). That's because RefitLevel() doesn't register and make its key range known to file ingestion. Though it checks overlapping with other compactions by https://github.com/facebook/rocksdb/blob/7.8.fb/db/external_sst_file_ingestion_job.cc#L998.
RefitLevel() (used in `CompactRange()` with `change_level=true`) doesn't check whether the key range it acts on overlaps with an ongoing file ingestion. That's because file ingestion does not register and make its key range known to other compactions.
- Note that non-refitlevel-compaction (e.g, manual compaction w/o RefitLevel() or general compaction) also does not check key range overlap with ongoing file ingestion for the same reason.
- But it's fine. Credited to cbi42's discovery, `WaitForIngestFile` was called by background and foreground compactions. They were introduced in https://github.com/facebook/rocksdb/commit/0f88160f67d36ea30e3aca3a3cef924c3a009be6, https://github.com/facebook/rocksdb/commit/5c64fb67d2fc198f1a73ff3ae543749a6a41f513 and https://github.com/facebook/rocksdb/commit/87dfc1d23e0e16ff73e15f63c6fa0fb3b3fc8c8c.
- Regardless, this PR registers file ingestion like a compaction is a general approach that will also add range conflict check between file ingestion and non-refitlevel-compaction, though it has not been the issue motivated this PR.
Above are bugs resulting in two bad consequences:
- If file ingestion and RefitLevel() creates files in the same level, then range-overlapped files will be created at that level and caught as corruption by `force_consistency_checks=true`
- If file ingestion and RefitLevel() creates file in different levels, then with one further compaction on the ingested file, it can result in two same keys both with seqno 0 in two different levels. Then with iterator's [optimization](https://github.com/facebook/rocksdb/blame/c62f3221698fd273b673d4f7e54eabb8329a4369/db/db_iter.cc#L342-L343) that assumes no two same keys both with seqno 0, it will either break this assertion in debug build or, even worst, return value of this same key for the key after it, which is the wrong value to return, in release build.
Therefore we decide to introduce range conflict check for file ingestion and RefitLevel() inspired from the existing range conflict check among compactions.
**Summary:**
- Treat file ingestion job and RefitLevel() as `Compaction` of new compaction reasons: `CompactionReason::kExternalSstIngestion` and `CompactionReason::kRefitLevel` and register/unregister them. File ingestion is treated as compaction from L0 to different levels and RefitLevel() as compaction from source level to target level.
- Check for `RangeOverlapWithCompaction` with other ongoing compactions, `RegisterCompaction()` on this "compaction" before changing the LSM state in `VersionStorageInfo`, and `UnregisterCompaction()` after changing.
- Replace scattered fixes (https://github.com/facebook/rocksdb/commit/0f88160f67d36ea30e3aca3a3cef924c3a009be6, https://github.com/facebook/rocksdb/commit/5c64fb67d2fc198f1a73ff3ae543749a6a41f513 and https://github.com/facebook/rocksdb/commit/87dfc1d23e0e16ff73e15f63c6fa0fb3b3fc8c8c.) that prevents overlapping between file ingestion and non-refit-level compaction with this fix cuz those practices are easy to overlook.
- Misc: logic cleanup, see PR comments
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10988
Test Plan:
- New unit test `DBCompactionTestWithOngoingFileIngestionParam*` that failed pre-fix and passed afterwards.
- Made compatible with existing tests, see PR comments
- make check
- [Ongoing] Stress test rehearsal with normal value and aggressive CI value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: cbi42
Differential Revision: D41535685
Pulled By: hx235
fbshipit-source-id: 549833a577ba1496d20a870583d4caa737da1258
2 years ago
|
|
|
TEST_SYNC_POINT("ColumnFamilyData::CompactRange:Return");
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
SuperVersion* ColumnFamilyData::GetReferencedSuperVersion(DBImpl* db) {
|
|
|
|
SuperVersion* sv = GetThreadLocalSuperVersion(db);
|
|
|
|
sv->Ref();
|
|
|
|
if (!ReturnThreadLocalSuperVersion(sv)) {
|
|
|
|
// This Unref() corresponds to the Ref() in GetThreadLocalSuperVersion()
|
|
|
|
// when the thread-local pointer was populated. So, the Ref() earlier in
|
|
|
|
// this function still prevents the returned SuperVersion* from being
|
|
|
|
// deleted out from under the caller.
|
|
|
|
sv->Unref();
|
|
|
|
}
|
|
|
|
return sv;
|
|
|
|
}
|
|
|
|
|
|
|
|
SuperVersion* ColumnFamilyData::GetThreadLocalSuperVersion(DBImpl* db) {
|
|
|
|
// The SuperVersion is cached in thread local storage to avoid acquiring
|
|
|
|
// mutex when SuperVersion does not change since the last use. When a new
|
|
|
|
// SuperVersion is installed, the compaction or flush thread cleans up
|
|
|
|
// cached SuperVersion in all existing thread local storage. To avoid
|
|
|
|
// acquiring mutex for this operation, we use atomic Swap() on the thread
|
|
|
|
// local pointer to guarantee exclusive access. If the thread local pointer
|
|
|
|
// is being used while a new SuperVersion is installed, the cached
|
|
|
|
// SuperVersion can become stale. In that case, the background thread would
|
|
|
|
// have swapped in kSVObsolete. We re-check the value at when returning
|
|
|
|
// SuperVersion back to thread local, with an atomic compare and swap.
|
|
|
|
// The superversion will need to be released if detected to be stale.
|
|
|
|
void* ptr = local_sv_->Swap(SuperVersion::kSVInUse);
|
|
|
|
// Invariant:
|
|
|
|
// (1) Scrape (always) installs kSVObsolete in ThreadLocal storage
|
|
|
|
// (2) the Swap above (always) installs kSVInUse, ThreadLocal storage
|
|
|
|
// should only keep kSVInUse before ReturnThreadLocalSuperVersion call
|
|
|
|
// (if no Scrape happens).
|
|
|
|
assert(ptr != SuperVersion::kSVInUse);
|
|
|
|
SuperVersion* sv = static_cast<SuperVersion*>(ptr);
|
|
|
|
if (sv == SuperVersion::kSVObsolete) {
|
|
|
|
RecordTick(ioptions_.stats, NUMBER_SUPERVERSION_ACQUIRES);
|
|
|
|
db->mutex()->Lock();
|
|
|
|
sv = super_version_->Ref();
|
|
|
|
db->mutex()->Unlock();
|
|
|
|
}
|
|
|
|
assert(sv != nullptr);
|
|
|
|
return sv;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool ColumnFamilyData::ReturnThreadLocalSuperVersion(SuperVersion* sv) {
|
|
|
|
assert(sv != nullptr);
|
|
|
|
// Put the SuperVersion back
|
|
|
|
void* expected = SuperVersion::kSVInUse;
|
|
|
|
if (local_sv_->CompareAndSwap(static_cast<void*>(sv), expected)) {
|
|
|
|
// When we see kSVInUse in the ThreadLocal, we are sure ThreadLocal
|
|
|
|
// storage has not been altered and no Scrape has happened. The
|
|
|
|
// SuperVersion is still current.
|
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
// ThreadLocal scrape happened in the process of this GetImpl call (after
|
|
|
|
// thread local Swap() at the beginning and before CompareAndSwap()).
|
|
|
|
// This means the SuperVersion it holds is obsolete.
|
|
|
|
assert(expected == SuperVersion::kSVObsolete);
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
void ColumnFamilyData::InstallSuperVersion(SuperVersionContext* sv_context,
|
|
|
|
InstrumentedMutex* db_mutex) {
|
|
|
|
db_mutex->AssertHeld();
|
Fix a race in ColumnFamilyData::UnrefAndTryDelete (#8605)
Summary:
The `ColumnFamilyData::UnrefAndTryDelete` code currently on the trunk
unlocks the DB mutex before destroying the `ThreadLocalPtr` holding
the per-thread `SuperVersion` pointers when the only remaining reference
is the back reference from `super_version_`. The idea behind this was to
break the circular dependency between `ColumnFamilyData` and `SuperVersion`:
when the penultimate reference goes away, `ColumnFamilyData` can clean up
the `SuperVersion`, which can in turn clean up `ColumnFamilyData`. (Assuming there
is a `SuperVersion` and it is not referenced by anything else.) However,
unlocking the mutex throws a wrench in this plan by making it possible for another thread
to jump in and take another reference to the `ColumnFamilyData`, keeping the
object alive in a zombie `ThreadLocalPtr`-less state. This can cause issues like
https://github.com/facebook/rocksdb/issues/8440 ,
https://github.com/facebook/rocksdb/issues/8382 ,
and might also explain the `was_last_ref` assertion failures from the `ColumnFamilySet`
destructor we sometimes observe during close in our stress tests.
Digging through the archives, this unlocking goes way back to 2014 (or earlier). The original
rationale was that `SuperVersionUnrefHandle` used to lock the mutex so it can call
`SuperVersion::Cleanup`; however, this logic turned out to be deadlock-prone.
https://github.com/facebook/rocksdb/pull/3510 fixed the deadlock but left the
unlocking in place. https://github.com/facebook/rocksdb/pull/6147 then introduced
the circular dependency and associated cleanup logic described above (in order
to enable iterators to keep the `ColumnFamilyData` for dropped column families alive),
and moved the unlocking-relocking snippet to its present location in `UnrefAndTryDelete`.
Finally, https://github.com/facebook/rocksdb/pull/7749 fixed a memory leak but
apparently exacerbated the race by (otherwise correctly) switching to `UnrefAndTryDelete`
in `SuperVersion::Cleanup`.
The patch simply eliminates the unlocking and relocking, which has been unnecessary
ever since https://github.com/facebook/rocksdb/issues/3510 made `SuperVersionUnrefHandle` lock-free.
This closes the window during which another thread could increase the reference count,
and hopefully fixes the issues above.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8605
Test Plan: Ran `make check` and stress tests locally.
Reviewed By: pdillinger
Differential Revision: D30051035
Pulled By: ltamasi
fbshipit-source-id: 8fe559e4b4ad69fc142579f8bc393ef525918528
3 years ago
|
|
|
return InstallSuperVersion(sv_context, mutable_cf_options_);
|
|
|
|
}
|
|
|
|
|
|
|
|
void ColumnFamilyData::InstallSuperVersion(
|
Fix a race in ColumnFamilyData::UnrefAndTryDelete (#8605)
Summary:
The `ColumnFamilyData::UnrefAndTryDelete` code currently on the trunk
unlocks the DB mutex before destroying the `ThreadLocalPtr` holding
the per-thread `SuperVersion` pointers when the only remaining reference
is the back reference from `super_version_`. The idea behind this was to
break the circular dependency between `ColumnFamilyData` and `SuperVersion`:
when the penultimate reference goes away, `ColumnFamilyData` can clean up
the `SuperVersion`, which can in turn clean up `ColumnFamilyData`. (Assuming there
is a `SuperVersion` and it is not referenced by anything else.) However,
unlocking the mutex throws a wrench in this plan by making it possible for another thread
to jump in and take another reference to the `ColumnFamilyData`, keeping the
object alive in a zombie `ThreadLocalPtr`-less state. This can cause issues like
https://github.com/facebook/rocksdb/issues/8440 ,
https://github.com/facebook/rocksdb/issues/8382 ,
and might also explain the `was_last_ref` assertion failures from the `ColumnFamilySet`
destructor we sometimes observe during close in our stress tests.
Digging through the archives, this unlocking goes way back to 2014 (or earlier). The original
rationale was that `SuperVersionUnrefHandle` used to lock the mutex so it can call
`SuperVersion::Cleanup`; however, this logic turned out to be deadlock-prone.
https://github.com/facebook/rocksdb/pull/3510 fixed the deadlock but left the
unlocking in place. https://github.com/facebook/rocksdb/pull/6147 then introduced
the circular dependency and associated cleanup logic described above (in order
to enable iterators to keep the `ColumnFamilyData` for dropped column families alive),
and moved the unlocking-relocking snippet to its present location in `UnrefAndTryDelete`.
Finally, https://github.com/facebook/rocksdb/pull/7749 fixed a memory leak but
apparently exacerbated the race by (otherwise correctly) switching to `UnrefAndTryDelete`
in `SuperVersion::Cleanup`.
The patch simply eliminates the unlocking and relocking, which has been unnecessary
ever since https://github.com/facebook/rocksdb/issues/3510 made `SuperVersionUnrefHandle` lock-free.
This closes the window during which another thread could increase the reference count,
and hopefully fixes the issues above.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8605
Test Plan: Ran `make check` and stress tests locally.
Reviewed By: pdillinger
Differential Revision: D30051035
Pulled By: ltamasi
fbshipit-source-id: 8fe559e4b4ad69fc142579f8bc393ef525918528
3 years ago
|
|
|
SuperVersionContext* sv_context,
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling on the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failling, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process ressembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
4 years ago
|
|
|
const MutableCFOptions& mutable_cf_options) {
|
|
|
|
SuperVersion* new_superversion = sv_context->new_superversion.release();
|
|
|
|
new_superversion->mutable_cf_options = mutable_cf_options;
|
|
|
|
new_superversion->Init(this, mem_, imm_.current(), current_);
|
|
|
|
SuperVersion* old_superversion = super_version_;
|
|
|
|
super_version_ = new_superversion;
|
|
|
|
++super_version_number_;
|
|
|
|
super_version_->version_number = super_version_number_;
|
Fix A Bug Where Concurrent Compactions Cause Further Slowing Down (#10270)
Summary:
Currently, when installing a new super version, when stalling condition triggers, we compare estimated compaction bytes to previously, and if the new value is larger or equal to the previous one, we reduce the slowdown write rate. However, if concurrent compactions happen, the same value might be used. The result is that, although some compactions reduce estimated compaction bytes, we treat them as a signal for further slowing down. In some cases, it causes slowdown rate drops all the way to the minimum, far lower than needed.
Fix the bug by not triggering a re-calculation if a new super version doesn't have Version or a memtable change. With this fix, number of compaction finishes are still undercounted in this algorithm, but it is still better than the current bug where they are negatively counted.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10270
Test Plan: Run a benchmark where the slowdown rate is dropped to minimal unnessarily and see it is back to a normal value.
Reviewed By: ajkr
Differential Revision: D37497327
fbshipit-source-id: 9bca961cc38fed965c3af0fa6c9ca0efaa7637c4
3 years ago
|
|
|
if (old_superversion == nullptr || old_superversion->current != current() ||
|
|
|
|
old_superversion->mem != mem_ ||
|
|
|
|
old_superversion->imm != imm_.current()) {
|
|
|
|
// Should not recalculate slow down condition if nothing has changed, since
|
|
|
|
// currently RecalculateWriteStallConditions() treats it as further slowing
|
|
|
|
// down is needed.
|
|
|
|
super_version_->write_stall_condition =
|
|
|
|
RecalculateWriteStallConditions(mutable_cf_options);
|
|
|
|
} else {
|
|
|
|
super_version_->write_stall_condition =
|
|
|
|
old_superversion->write_stall_condition;
|
|
|
|
}
|
|
|
|
if (old_superversion != nullptr) {
|
Fix deadlock in ColumnFamilyData::InstallSuperVersion()
Summary:
Deadlock: a memtable flush holds DB::mutex_ and calls ThreadLocalPtr::Scrape(), which locks ThreadLocalPtr mutex; meanwhile, a thread exit handler locks ThreadLocalPtr mutex and calls SuperVersionUnrefHandle, which tries to lock DB::mutex_.
This deadlock is hit all the time on our workload. It blocks our release.
In general, the problem is that ThreadLocalPtr takes an arbitrary callback and calls it while holding a lock on a global mutex. The same global mutex is (at least in some cases) locked by almost all ThreadLocalPtr methods, on any instance of ThreadLocalPtr. So, there'll be a deadlock if the callback tries to do anything to any instance of ThreadLocalPtr, or waits for another thread to do so.
So, probably the only safe way to use ThreadLocalPtr callbacks is to do only do simple and lock-free things in them.
This PR fixes the deadlock by making sure that local_sv_ never holds the last reference to a SuperVersion, and therefore SuperVersionUnrefHandle never has to do any nontrivial cleanup.
I also searched for other uses of ThreadLocalPtr to see if they may have similar bugs. There's only one other use, in transaction_lock_mgr.cc, and it looks fine.
Closes https://github.com/facebook/rocksdb/pull/3510
Reviewed By: sagar0
Differential Revision: D7005346
Pulled By: al13n321
fbshipit-source-id: 37575591b84f07a891d6659e87e784660fde815f
7 years ago
|
|
|
// Reset SuperVersions cached in thread local storage.
|
|
|
|
// This should be done before old_superversion->Unref(). That's to ensure
|
|
|
|
// that local_sv_ never holds the last reference to SuperVersion, since
|
|
|
|
// it has no means to safely do SuperVersion cleanup.
|
|
|
|
ResetThreadLocalSuperVersions();
|
|
|
|
|
|
|
|
if (old_superversion->mutable_cf_options.write_buffer_size !=
|
|
|
|
mutable_cf_options.write_buffer_size) {
|
|
|
|
mem_->UpdateWriteBufferSize(mutable_cf_options.write_buffer_size);
|
|
|
|
}
|
|
|
|
if (old_superversion->write_stall_condition !=
|
|
|
|
new_superversion->write_stall_condition) {
|
|
|
|
sv_context->PushWriteStallNotification(
|
|
|
|
old_superversion->write_stall_condition,
|
|
|
|
new_superversion->write_stall_condition, GetName(), ioptions());
|
|
|
|
}
|
|
|
|
if (old_superversion->Unref()) {
|
Make mempurge a background process (equivalent to in-memory compaction). (#8505)
Summary:
In https://github.com/facebook/rocksdb/issues/8454, I introduced a new process baptized `MemPurge` (memtable garbage collection). This new PR is built upon this past mempurge prototype.
In this PR, I made the `mempurge` process a background task, which provides superior performance since the mempurge process does not cling on the db_mutex anymore, and addresses severe restrictions from the past iteration (including a scenario where the past mempurge was failling, when a memtable was mempurged but was still referred to by an iterator/snapshot/...).
Now the mempurge process ressembles an in-memory compaction process: the stack of immutable memtables is filtered out, and the useful payload is used to populate an output memtable. If the output memtable is filled at more than 60% capacity (arbitrary heuristic) the mempurge process is aborted and a regular flush process takes place, else the output memtable is kept in the immutable memtable stack. Note that adding this output memtable to the `imm()` memtable stack does not trigger another flush process, so that the flush thread can go to sleep at the end of a successful mempurge.
MemPurge is activated by making the `experimental_allow_mempurge` flag `true`. When activated, the `MemPurge` process will always happen when the flush reason is `kWriteBufferFull`.
The 3 unit tests confirm that this process supports `Put`, `Get`, `Delete`, `DeleteRange` operators and is compatible with `Iterators` and `CompactionFilters`.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8505
Reviewed By: pdillinger
Differential Revision: D29619283
Pulled By: bjlemaire
fbshipit-source-id: 8a99bee76b63a8211bff1a00e0ae32360aaece95
4 years ago
|
|
|
old_superversion->Cleanup();
|
|
|
|
sv_context->superversions_to_free.push_back(old_superversion);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void ColumnFamilyData::ResetThreadLocalSuperVersions() {
|
|
|
|
autovector<void*> sv_ptrs;
|
|
|
|
local_sv_->Scrape(&sv_ptrs, SuperVersion::kSVObsolete);
|
|
|
|
for (auto ptr : sv_ptrs) {
|
|
|
|
assert(ptr);
|
|
|
|
if (ptr == SuperVersion::kSVInUse) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
auto sv = static_cast<SuperVersion*>(ptr);
|
Fix deadlock in ColumnFamilyData::InstallSuperVersion()
Summary:
Deadlock: a memtable flush holds DB::mutex_ and calls ThreadLocalPtr::Scrape(), which locks ThreadLocalPtr mutex; meanwhile, a thread exit handler locks ThreadLocalPtr mutex and calls SuperVersionUnrefHandle, which tries to lock DB::mutex_.
This deadlock is hit all the time on our workload. It blocks our release.
In general, the problem is that ThreadLocalPtr takes an arbitrary callback and calls it while holding a lock on a global mutex. The same global mutex is (at least in some cases) locked by almost all ThreadLocalPtr methods, on any instance of ThreadLocalPtr. So, there'll be a deadlock if the callback tries to do anything to any instance of ThreadLocalPtr, or waits for another thread to do so.
So, probably the only safe way to use ThreadLocalPtr callbacks is to do only do simple and lock-free things in them.
This PR fixes the deadlock by making sure that local_sv_ never holds the last reference to a SuperVersion, and therefore SuperVersionUnrefHandle never has to do any nontrivial cleanup.
I also searched for other uses of ThreadLocalPtr to see if they may have similar bugs. There's only one other use, in transaction_lock_mgr.cc, and it looks fine.
Closes https://github.com/facebook/rocksdb/pull/3510
Reviewed By: sagar0
Differential Revision: D7005346
Pulled By: al13n321
fbshipit-source-id: 37575591b84f07a891d6659e87e784660fde815f
7 years ago
|
|
|
bool was_last_ref __attribute__((__unused__));
|
|
|
|
was_last_ref = sv->Unref();
|
|
|
|
// sv couldn't have been the last reference because
|
|
|
|
// ResetThreadLocalSuperVersions() is called before
|
|
|
|
// unref'ing super_version_.
|
|
|
|
assert(!was_last_ref);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status ColumnFamilyData::ValidateOptions(
|
|
|
|
const DBOptions& db_options, const ColumnFamilyOptions& cf_options) {
|
|
|
|
Status s;
|
|
|
|
s = CheckCompressionSupported(cf_options);
|
|
|
|
if (s.ok() && db_options.allow_concurrent_memtable_write) {
|
|
|
|
s = CheckConcurrentWritesSupported(cf_options);
|
|
|
|
}
|
|
|
|
if (s.ok() && db_options.unordered_write &&
|
|
|
|
cf_options.max_successive_merges != 0) {
|
|
|
|
s = Status::InvalidArgument(
|
|
|
|
"max_successive_merges > 0 is incompatible with unordered_write");
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
s = CheckCFPathsSupported(db_options, cf_options);
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (cf_options.ttl > 0 && cf_options.ttl != kDefaultTtl) {
|
|
|
|
if (!cf_options.table_factory->IsInstanceOf(
|
|
|
|
TableFactory::kBlockBasedTableName())) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"TTL is only supported in Block-Based Table format. ");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Auto enable Periodic Compactions if a Compaction Filter is used (#5865)
Summary:
- Periodic compactions are auto-enabled if a compaction filter or a compaction filter factory is set, in Level Compaction.
- The default value of `periodic_compaction_seconds` is changed to UINT64_MAX, which lets RocksDB auto-tune periodic compactions as needed. An explicit value of 0 will still work as before ie. to disable periodic compactions completely. For now, on seeing a compaction filter along with a UINT64_MAX value for `periodic_compaction_seconds`, RocksDB will make SST files older than 30 days to go through periodic copmactions.
Some RocksDB users make use of compaction filters to control when their data can be deleted, usually with a custom TTL logic. But it is occasionally possible that the compactions get delayed by considerable time due to factors like low writes to a key range, data reaching bottom level, etc before the TTL expiry. Periodic Compactions feature was originally built to help such cases. Now periodic compactions are auto enabled by default when compaction filters or compaction filter factories are used, as it is generally helpful to all cases to collect garbage.
`periodic_compaction_seconds` is set to a large value, 30 days, in `SanitizeOptions` when RocksDB sees that a `compaction_filter` or `compaction_filter_factory` is used.
This is done only for Level Compaction style.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5865
Test Plan:
- Added a new test `DBCompactionTest.LevelPeriodicCompactionWithCompactionFilters` to make sure that `periodic_compaction_seconds` is set if either `compaction_filter` or `compaction_filter_factory` options are set.
- `COMPILE_WITH_ASAN=1 make check`
Differential Revision: D17659180
Pulled By: sagar0
fbshipit-source-id: 4887b9cf2e53cf2dc93a7b658c6b15e1181217ee
5 years ago
|
|
|
if (cf_options.periodic_compaction_seconds > 0 &&
|
|
|
|
cf_options.periodic_compaction_seconds != kDefaultPeriodicCompSecs) {
|
|
|
|
if (!cf_options.table_factory->IsInstanceOf(
|
|
|
|
TableFactory::kBlockBasedTableName())) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Periodic Compaction is only supported in "
|
|
|
|
"Block-Based Table format. ");
|
|
|
|
}
|
|
|
|
}
|
Integrated blob garbage collection: relocate blobs (#7694)
Summary:
The patch adds basic garbage collection support to the integrated BlobDB
implementation. Valid blobs residing in the oldest blob files are relocated
as they are encountered during compaction. The threshold that determines
which blob files qualify is computed based on the configuration option
`blob_garbage_collection_age_cutoff`, which was introduced in https://github.com/facebook/rocksdb/issues/7661 .
Once a blob is retrieved for the purposes of relocation, it passes through the
same logic that extracts large values to blob files in general. This means that
if, for instance, the size threshold for key-value separation (`min_blob_size`)
got changed or writing blob files got disabled altogether, it is possible for the
value to be moved back into the LSM tree. In particular, one way to re-inline
all blob values if needed would be to perform a full manual compaction with
`enable_blob_files` set to `false`, `enable_blob_garbage_collection` set to
`true`, and `blob_file_garbage_collection_age_cutoff` set to `1.0`.
Some TODOs that I plan to address in separate PRs:
1) We'll have to measure the amount of new garbage in each blob file and log
`BlobFileGarbage` entries as part of the compaction job's `VersionEdit`.
(For the time being, blob files are cleaned up solely based on the
`oldest_blob_file_number` relationships.)
2) When compression is used for blobs, the compression type hasn't changed,
and the blob still qualifies for being written to a blob file, we can simply copy
the compressed blob to the new file instead of going through decompression
and compression.
3) We need to update the formula for computing write amplification to account
for the amount of data read from blob files as part of GC.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7694
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D25069663
Pulled By: ltamasi
fbshipit-source-id: bdfa8feb09afcf5bca3b4eba2ba72ce2f15cd06a
4 years ago
|
|
|
|
Make it possible to force the garbage collection of the oldest blob files (#8994)
Summary:
The current BlobDB garbage collection logic works by relocating the valid
blobs from the oldest blob files as they are encountered during compaction,
and cleaning up blob files once they contain nothing but garbage. However,
with sufficiently skewed workloads, it is theoretically possible to end up in a
situation when few or no compactions get scheduled for the SST files that contain
references to the oldest blob files, which can lead to increased space amp due
to the lack of GC.
In order to efficiently handle such workloads, the patch adds a new BlobDB
configuration option called `blob_garbage_collection_force_threshold`,
which signals to BlobDB to schedule targeted compactions for the SST files
that keep alive the oldest batch of blob files if the overall ratio of garbage in
the given blob files meets the threshold *and* all the given blob files are
eligible for GC based on `blob_garbage_collection_age_cutoff`. (For example,
if the new option is set to 0.9, targeted compactions will get scheduled if the
sum of garbage bytes meets or exceeds 90% of the sum of total bytes in the
oldest blob files, assuming all affected blob files are below the age-based cutoff.)
The net result of these targeted compactions is that the valid blobs in the oldest
blob files are relocated and the oldest blob files themselves cleaned up (since
*all* SST files that rely on them get compacted away).
These targeted compactions are similar to periodic compactions in the sense
that they force certain SST files that otherwise would not get picked up to undergo
compaction and also in the sense that instead of merging files from multiple levels,
they target a single file. (Note: such compactions might still include neighboring files
from the same level due to the need of having a "clean cut" boundary but they never
include any files from any other level.)
This functionality is currently only supported with the leveled compaction style
and is inactive by default (since the default value is set to 1.0, i.e. 100%).
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8994
Test Plan: Ran `make check` and tested using `db_bench` and the stress/crash tests.
Reviewed By: riversand963
Differential Revision: D31489850
Pulled By: ltamasi
fbshipit-source-id: 44057d511726a0e2a03c5d9313d7511b3f0c4eab
3 years ago
|
|
|
if (cf_options.enable_blob_garbage_collection) {
|
|
|
|
if (cf_options.blob_garbage_collection_age_cutoff < 0.0 ||
|
|
|
|
cf_options.blob_garbage_collection_age_cutoff > 1.0) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"The age cutoff for blob garbage collection should be in the range "
|
|
|
|
"[0.0, 1.0].");
|
|
|
|
}
|
|
|
|
if (cf_options.blob_garbage_collection_force_threshold < 0.0 ||
|
|
|
|
cf_options.blob_garbage_collection_force_threshold > 1.0) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"The garbage ratio threshold for forcing blob garbage collection "
|
|
|
|
"should be in the range [0.0, 1.0].");
|
|
|
|
}
|
Integrated blob garbage collection: relocate blobs (#7694)
Summary:
The patch adds basic garbage collection support to the integrated BlobDB
implementation. Valid blobs residing in the oldest blob files are relocated
as they are encountered during compaction. The threshold that determines
which blob files qualify is computed based on the configuration option
`blob_garbage_collection_age_cutoff`, which was introduced in https://github.com/facebook/rocksdb/issues/7661 .
Once a blob is retrieved for the purposes of relocation, it passes through the
same logic that extracts large values to blob files in general. This means that
if, for instance, the size threshold for key-value separation (`min_blob_size`)
got changed or writing blob files got disabled altogether, it is possible for the
value to be moved back into the LSM tree. In particular, one way to re-inline
all blob values if needed would be to perform a full manual compaction with
`enable_blob_files` set to `false`, `enable_blob_garbage_collection` set to
`true`, and `blob_file_garbage_collection_age_cutoff` set to `1.0`.
Some TODOs that I plan to address in separate PRs:
1) We'll have to measure the amount of new garbage in each blob file and log
`BlobFileGarbage` entries as part of the compaction job's `VersionEdit`.
(For the time being, blob files are cleaned up solely based on the
`oldest_blob_file_number` relationships.)
2) When compression is used for blobs, the compression type hasn't changed,
and the blob still qualifies for being written to a blob file, we can simply copy
the compressed blob to the new file instead of going through decompression
and compression.
3) We need to update the formula for computing write amplification to account
for the amount of data read from blob files as part of GC.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/7694
Test Plan: `make check`
Reviewed By: riversand963
Differential Revision: D25069663
Pulled By: ltamasi
fbshipit-source-id: bdfa8feb09afcf5bca3b4eba2ba72ce2f15cd06a
4 years ago
|
|
|
}
|
|
|
|
|
|
|
|
if (cf_options.compaction_style == kCompactionStyleFIFO &&
|
|
|
|
db_options.max_open_files != -1 && cf_options.ttl > 0) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"FIFO compaction only supported with max_open_files = -1.");
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<uint32_t> supported{0, 1, 2, 4, 8};
|
|
|
|
if (std::find(supported.begin(), supported.end(),
|
|
|
|
cf_options.memtable_protection_bytes_per_key) ==
|
|
|
|
supported.end()) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Memtable per key-value checksum protection only supports 0, 1, 2, 4 "
|
|
|
|
"or 8 bytes per key.");
|
|
|
|
}
|
|
|
|
if (std::find(supported.begin(), supported.end(),
|
|
|
|
cf_options.block_protection_bytes_per_key) == supported.end()) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Block per key-value checksum protection only supports 0, 1, 2, 4 "
|
|
|
|
"or 8 bytes per key.");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!cf_options.compaction_options_fifo.file_temperature_age_thresholds
|
|
|
|
.empty()) {
|
|
|
|
if (cf_options.compaction_style != kCompactionStyleFIFO) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Option file_temperature_age_thresholds only supports FIFO "
|
|
|
|
"compaction.");
|
|
|
|
} else if (cf_options.num_levels > 1) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Option file_temperature_age_thresholds is only supported when "
|
|
|
|
"num_levels = 1.");
|
|
|
|
} else {
|
|
|
|
const auto& ages =
|
|
|
|
cf_options.compaction_options_fifo.file_temperature_age_thresholds;
|
|
|
|
assert(ages.size() >= 1);
|
|
|
|
// check that age is sorted
|
|
|
|
for (size_t i = 0; i < ages.size() - 1; ++i) {
|
|
|
|
if (ages[i].age >= ages[i + 1].age) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"Option file_temperature_age_thresholds requires elements to be "
|
|
|
|
"sorted in increasing order with respect to `age` field.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status ColumnFamilyData::SetOptions(
|
|
|
|
const DBOptions& db_opts,
|
|
|
|
const std::unordered_map<std::string, std::string>& options_map) {
|
|
|
|
ColumnFamilyOptions cf_opts =
|
|
|
|
BuildColumnFamilyOptions(initial_cf_options_, mutable_cf_options_);
|
|
|
|
ConfigOptions config_opts;
|
|
|
|
config_opts.mutable_options_only = true;
|
|
|
|
Status s = GetColumnFamilyOptionsFromMap(config_opts, cf_opts, options_map,
|
|
|
|
&cf_opts);
|
|
|
|
if (s.ok()) {
|
|
|
|
s = ValidateOptions(db_opts, cf_opts);
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
mutable_cf_options_ = MutableCFOptions(cf_opts);
|
|
|
|
mutable_cf_options_.RefreshDerivedOptions(ioptions_);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// REQUIRES: DB mutex held
|
|
|
|
Env::WriteLifeTimeHint ColumnFamilyData::CalculateSSTWriteHint(int level) {
|
|
|
|
if (initial_cf_options_.compaction_style != kCompactionStyleLevel) {
|
|
|
|
return Env::WLTH_NOT_SET;
|
|
|
|
}
|
|
|
|
if (level == 0) {
|
|
|
|
return Env::WLTH_MEDIUM;
|
|
|
|
}
|
|
|
|
int base_level = current_->storage_info()->base_level();
|
|
|
|
|
|
|
|
// L1: medium, L2: long, ...
|
|
|
|
if (level - base_level >= 2) {
|
|
|
|
return Env::WLTH_EXTREME;
|
|
|
|
} else if (level < base_level) {
|
|
|
|
// There is no restriction which prevents level passed in to be smaller
|
|
|
|
// than base_level.
|
|
|
|
return Env::WLTH_MEDIUM;
|
|
|
|
}
|
|
|
|
return static_cast<Env::WriteLifeTimeHint>(
|
|
|
|
level - base_level + static_cast<int>(Env::WLTH_MEDIUM));
|
|
|
|
}
|
|
|
|
|
|
|
|
Status ColumnFamilyData::AddDirectories(
|
|
|
|
std::map<std::string, std::shared_ptr<FSDirectory>>* created_dirs) {
|
|
|
|
Status s;
|
|
|
|
assert(created_dirs != nullptr);
|
|
|
|
assert(data_dirs_.empty());
|
|
|
|
for (auto& p : ioptions_.cf_paths) {
|
|
|
|
auto existing_dir = created_dirs->find(p.path);
|
|
|
|
|
|
|
|
if (existing_dir == created_dirs->end()) {
|
|
|
|
std::unique_ptr<FSDirectory> path_directory;
|
|
|
|
s = DBImpl::CreateAndNewDirectory(ioptions_.fs.get(), p.path,
|
|
|
|
&path_directory);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
assert(path_directory != nullptr);
|
|
|
|
data_dirs_.emplace_back(path_directory.release());
|
|
|
|
(*created_dirs)[p.path] = data_dirs_.back();
|
|
|
|
} else {
|
|
|
|
data_dirs_.emplace_back(existing_dir->second);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
assert(data_dirs_.size() == ioptions_.cf_paths.size());
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const {
|
|
|
|
if (data_dirs_.empty()) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(path_id < data_dirs_.size());
|
|
|
|
return data_dirs_[path_id].get();
|
|
|
|
}
|
|
|
|
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
void ColumnFamilyData::RecoverEpochNumbers() {
|
|
|
|
assert(current_);
|
|
|
|
auto* vstorage = current_->storage_info();
|
|
|
|
assert(vstorage);
|
|
|
|
vstorage->RecoverEpochNumbers(this);
|
|
|
|
}
|
|
|
|
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are use not as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underyling cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
11 years ago
|
|
|
ColumnFamilySet::ColumnFamilySet(const std::string& dbname,
|
|
|
|
const ImmutableDBOptions* db_options,
|
Introduce a new storage specific Env API (#5761)
Summary:
The current Env API encompasses both storage/file operations, as well as OS related operations. Most of the APIs return a Status, which does not have enough metadata about an error, such as whether its retry-able or not, scope (i.e fault domain) of the error etc., that may be required in order to properly handle a storage error. The file APIs also do not provide enough control over the IO SLA, such as timeout, prioritization, hinting about placement and redundancy etc.
This PR separates out the file/storage APIs from Env into a new FileSystem class. The APIs are updated to return an IOStatus with metadata about the error, as well as to take an IOOptions structure as input in order to allow more control over the IO.
The user can set both ```options.env``` and ```options.file_system``` to specify that RocksDB should use the former for OS related operations and the latter for storage operations. Internally, a ```CompositeEnvWrapper``` has been introduced that inherits from ```Env``` and redirects individual methods to either an ```Env``` implementation or the ```FileSystem``` as appropriate. When options are sanitized during ```DB::Open```, ```options.env``` is replaced with a newly allocated ```CompositeEnvWrapper``` instance if both env and file_system have been specified. This way, the rest of the RocksDB code can continue to function as before.
This PR also ports PosixEnv to the new API by splitting it into two - PosixEnv and PosixFileSystem. PosixEnv is defined as a sub-class of CompositeEnvWrapper, and threading/time functions are overridden with Posix specific implementations in order to avoid an extra level of indirection.
The ```CompositeEnvWrapper``` translates ```IOStatus``` return code to ```Status```, and sets the severity to ```kSoftError``` if the io_status is retryable. The error handling code in RocksDB can then recover the DB automatically.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5761
Differential Revision: D18868376
Pulled By: anand1976
fbshipit-source-id: 39efe18a162ea746fabac6360ff529baba48486f
5 years ago
|
|
|
const FileOptions& file_options,
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
Cache* table_cache,
|
|
|
|
WriteBufferManager* _write_buffer_manager,
|
|
|
|
WriteController* _write_controller,
|
|
|
|
BlockCacheTracer* const block_cache_tracer,
|
|
|
|
const std::shared_ptr<IOTracer>& io_tracer,
|
|
|
|
const std::string& db_id,
|
|
|
|
const std::string& db_session_id)
|
|
|
|
: max_column_family_(0),
|
Fix use-after-free on implicit temporary FileOptions (#8571)
Summary:
FileOptions has an implicit conversion from EnvOptions and some
internal APIs take `const FileOptions&` and save the reference, which is
counter to Google C++ guidelines,
> Avoid defining functions that require a const reference parameter to outlive the call, because const reference parameters bind to temporaries. Instead, find a way to eliminate the lifetime requirement (for example, by copying the parameter), or pass it by const pointer and document the lifetime and non-null requirements.
This is at least a problem for repair.cc, which passes an EnvOptions to
TableCache(), which would save a reference to the temporary copy as
FileOptions. This was unfortunately only caught as a side effect of
changes in https://github.com/facebook/rocksdb/issues/8544.
This change fixes the repair.cc case and updates the involved internal
APIs that save a reference to use `const FileOptions*` instead.
Unfortunately, I don't know how to get any of our sanitizers to reliably
report bugs like this, so I can't rule out more existing in our
codebase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8571
Test Plan:
Test that issues seen with https://github.com/facebook/rocksdb/issues/8544 are fixed (can reproduce on
AWS EC2)
Reviewed By: ajkr
Differential Revision: D29943890
Pulled By: pdillinger
fbshipit-source-id: 95f9c5251548777b4dc994c1a083dd2add5799c9
3 years ago
|
|
|
file_options_(file_options),
|
|
|
|
dummy_cfd_(new ColumnFamilyData(
|
|
|
|
ColumnFamilyData::kDummyColumnFamilyDataId, "", nullptr, nullptr,
|
Fix use-after-free on implicit temporary FileOptions (#8571)
Summary:
FileOptions has an implicit conversion from EnvOptions and some
internal APIs take `const FileOptions&` and save the reference, which is
counter to Google C++ guidelines,
> Avoid defining functions that require a const reference parameter to outlive the call, because const reference parameters bind to temporaries. Instead, find a way to eliminate the lifetime requirement (for example, by copying the parameter), or pass it by const pointer and document the lifetime and non-null requirements.
This is at least a problem for repair.cc, which passes an EnvOptions to
TableCache(), which would save a reference to the temporary copy as
FileOptions. This was unfortunately only caught as a side effect of
changes in https://github.com/facebook/rocksdb/issues/8544.
This change fixes the repair.cc case and updates the involved internal
APIs that save a reference to use `const FileOptions*` instead.
Unfortunately, I don't know how to get any of our sanitizers to reliably
report bugs like this, so I can't rule out more existing in our
codebase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8571
Test Plan:
Test that issues seen with https://github.com/facebook/rocksdb/issues/8544 are fixed (can reproduce on
AWS EC2)
Reviewed By: ajkr
Differential Revision: D29943890
Pulled By: pdillinger
fbshipit-source-id: 95f9c5251548777b4dc994c1a083dd2add5799c9
3 years ago
|
|
|
nullptr, ColumnFamilyOptions(), *db_options, &file_options_, nullptr,
|
|
|
|
block_cache_tracer, io_tracer, db_id, db_session_id)),
|
|
|
|
default_cfd_cache_(nullptr),
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are use not as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underyling cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
11 years ago
|
|
|
db_name_(dbname),
|
|
|
|
db_options_(db_options),
|
|
|
|
table_cache_(table_cache),
|
|
|
|
write_buffer_manager_(_write_buffer_manager),
|
|
|
|
write_controller_(_write_controller),
|
|
|
|
block_cache_tracer_(block_cache_tracer),
|
|
|
|
io_tracer_(io_tracer),
|
|
|
|
db_id_(db_id),
|
|
|
|
db_session_id_(db_session_id) {
|
|
|
|
// initialize linked list
|
|
|
|
dummy_cfd_->prev_ = dummy_cfd_;
|
|
|
|
dummy_cfd_->next_ = dummy_cfd_;
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilySet::~ColumnFamilySet() {
|
|
|
|
while (column_family_data_.size() > 0) {
|
|
|
|
// cfd destructor will delete itself from column_family_data_
|
|
|
|
auto cfd = column_family_data_.begin()->second;
|
|
|
|
bool last_ref __attribute__((__unused__));
|
|
|
|
last_ref = cfd->UnrefAndTryDelete();
|
|
|
|
assert(last_ref);
|
|
|
|
}
|
|
|
|
bool dummy_last_ref __attribute__((__unused__));
|
|
|
|
dummy_last_ref = dummy_cfd_->UnrefAndTryDelete();
|
|
|
|
assert(dummy_last_ref);
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyData* ColumnFamilySet::GetDefault() const {
|
|
|
|
assert(default_cfd_cache_ != nullptr);
|
|
|
|
return default_cfd_cache_;
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(uint32_t id) const {
|
|
|
|
auto cfd_iter = column_family_data_.find(id);
|
|
|
|
if (cfd_iter != column_family_data_.end()) {
|
|
|
|
return cfd_iter->second;
|
|
|
|
} else {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyData* ColumnFamilySet::GetColumnFamily(
|
|
|
|
const std::string& name) const {
|
|
|
|
auto cfd_iter = column_families_.find(name);
|
|
|
|
if (cfd_iter != column_families_.end()) {
|
|
|
|
auto cfd = GetColumnFamily(cfd_iter->second);
|
|
|
|
assert(cfd != nullptr);
|
|
|
|
return cfd;
|
|
|
|
} else {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t ColumnFamilySet::GetNextColumnFamilyID() {
|
|
|
|
return ++max_column_family_;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t ColumnFamilySet::GetMaxColumnFamily() { return max_column_family_; }
|
|
|
|
|
|
|
|
void ColumnFamilySet::UpdateMaxColumnFamily(uint32_t new_max_column_family) {
|
|
|
|
max_column_family_ = std::max(new_max_column_family, max_column_family_);
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t ColumnFamilySet::NumberOfColumnFamilies() const {
|
|
|
|
return column_families_.size();
|
|
|
|
}
|
|
|
|
|
|
|
|
// under a DB mutex AND write thread
|
|
|
|
ColumnFamilyData* ColumnFamilySet::CreateColumnFamily(
|
|
|
|
const std::string& name, uint32_t id, Version* dummy_versions,
|
|
|
|
const ColumnFamilyOptions& options) {
|
|
|
|
assert(column_families_.find(name) == column_families_.end());
|
|
|
|
ColumnFamilyData* new_cfd = new ColumnFamilyData(
|
|
|
|
id, name, dummy_versions, table_cache_, write_buffer_manager_, options,
|
Fix use-after-free on implicit temporary FileOptions (#8571)
Summary:
FileOptions has an implicit conversion from EnvOptions and some
internal APIs take `const FileOptions&` and save the reference, which is
counter to Google C++ guidelines,
> Avoid defining functions that require a const reference parameter to outlive the call, because const reference parameters bind to temporaries. Instead, find a way to eliminate the lifetime requirement (for example, by copying the parameter), or pass it by const pointer and document the lifetime and non-null requirements.
This is at least a problem for repair.cc, which passes an EnvOptions to
TableCache(), which would save a reference to the temporary copy as
FileOptions. This was unfortunately only caught as a side effect of
changes in https://github.com/facebook/rocksdb/issues/8544.
This change fixes the repair.cc case and updates the involved internal
APIs that save a reference to use `const FileOptions*` instead.
Unfortunately, I don't know how to get any of our sanitizers to reliably
report bugs like this, so I can't rule out more existing in our
codebase.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/8571
Test Plan:
Test that issues seen with https://github.com/facebook/rocksdb/issues/8544 are fixed (can reproduce on
AWS EC2)
Reviewed By: ajkr
Differential Revision: D29943890
Pulled By: pdillinger
fbshipit-source-id: 95f9c5251548777b4dc994c1a083dd2add5799c9
3 years ago
|
|
|
*db_options_, &file_options_, this, block_cache_tracer_, io_tracer_,
|
|
|
|
db_id_, db_session_id_);
|
|
|
|
column_families_.insert({name, id});
|
|
|
|
column_family_data_.insert({id, new_cfd});
|
|
|
|
auto ucmp = new_cfd->user_comparator();
|
|
|
|
assert(ucmp);
|
|
|
|
size_t ts_sz = ucmp->timestamp_size();
|
|
|
|
running_ts_sz_.insert({id, ts_sz});
|
|
|
|
if (ts_sz > 0) {
|
|
|
|
ts_sz_for_record_.insert({id, ts_sz});
|
|
|
|
}
|
|
|
|
max_column_family_ = std::max(max_column_family_, id);
|
|
|
|
// add to linked list
|
|
|
|
new_cfd->next_ = dummy_cfd_;
|
|
|
|
auto prev = dummy_cfd_->prev_;
|
|
|
|
new_cfd->prev_ = prev;
|
|
|
|
prev->next_ = new_cfd;
|
|
|
|
dummy_cfd_->prev_ = new_cfd;
|
|
|
|
if (id == 0) {
|
|
|
|
default_cfd_cache_ = new_cfd;
|
|
|
|
}
|
|
|
|
return new_cfd;
|
|
|
|
}
|
|
|
|
|
|
|
|
// under a DB mutex AND from a write thread
|
|
|
|
void ColumnFamilySet::RemoveColumnFamily(ColumnFamilyData* cfd) {
|
|
|
|
uint32_t cf_id = cfd->GetID();
|
|
|
|
auto cfd_iter = column_family_data_.find(cf_id);
|
|
|
|
assert(cfd_iter != column_family_data_.end());
|
|
|
|
column_family_data_.erase(cfd_iter);
|
|
|
|
column_families_.erase(cfd->GetName());
|
|
|
|
running_ts_sz_.erase(cf_id);
|
|
|
|
ts_sz_for_record_.erase(cf_id);
|
|
|
|
}
|
|
|
|
|
|
|
|
// under a DB mutex OR from a write thread
|
|
|
|
bool ColumnFamilyMemTablesImpl::Seek(uint32_t column_family_id) {
|
|
|
|
if (column_family_id == 0) {
|
|
|
|
// optimization for common case
|
|
|
|
current_ = column_family_set_->GetDefault();
|
|
|
|
} else {
|
|
|
|
current_ = column_family_set_->GetColumnFamily(column_family_id);
|
|
|
|
}
|
|
|
|
handle_.SetCFD(current_);
|
|
|
|
return current_ != nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t ColumnFamilyMemTablesImpl::GetLogNumber() const {
|
|
|
|
assert(current_ != nullptr);
|
|
|
|
return current_->GetLogNumber();
|
|
|
|
}
|
|
|
|
|
|
|
|
MemTable* ColumnFamilyMemTablesImpl::GetMemTable() const {
|
|
|
|
assert(current_ != nullptr);
|
|
|
|
return current_->mem();
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyHandle* ColumnFamilyMemTablesImpl::GetColumnFamilyHandle() {
|
|
|
|
assert(current_ != nullptr);
|
|
|
|
return &handle_;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint32_t GetColumnFamilyID(ColumnFamilyHandle* column_family) {
|
|
|
|
uint32_t column_family_id = 0;
|
|
|
|
if (column_family != nullptr) {
|
|
|
|
auto cfh = static_cast_with_check<ColumnFamilyHandleImpl>(column_family);
|
|
|
|
column_family_id = cfh->GetID();
|
|
|
|
}
|
|
|
|
return column_family_id;
|
|
|
|
}
|
|
|
|
|
|
|
|
const Comparator* GetColumnFamilyUserComparator(
|
|
|
|
ColumnFamilyHandle* column_family) {
|
|
|
|
if (column_family != nullptr) {
|
|
|
|
return column_family->GetComparator();
|
|
|
|
}
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|