You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
rocksdb/db/compaction/compaction_picker_fifo.cc

247 lines
9.1 KiB

// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction/compaction_picker_fifo.h"
#ifndef ROCKSDB_LITE
#include <cinttypes>
#include <string>
#include <vector>
#include "db/column_family.h"
#include "logging/log_buffer.h"
#include "util/string_util.h"
namespace ROCKSDB_NAMESPACE {
namespace {
uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
uint64_t total_size = 0;
for (const auto& f : files) {
total_size += f->fd.file_size;
}
return total_size;
}
} // anonymous namespace
bool FIFOCompactionPicker::NeedsCompaction(
const VersionStorageInfo* vstorage) const {
const int kLevel0 = 0;
return vstorage->CompactionScore(kLevel0) >= 1;
}
Compaction* FIFOCompactionPicker::PickTTLCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
assert(mutable_cf_options.ttl > 0);
const int kLevel0 = 0;
const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
uint64_t total_size = GetTotalFilesSize(level_files);
int64_t _current_time;
auto status = ioptions_.env->GetCurrentTime(&_current_time);
if (!status.ok()) {
ROCKS_LOG_BUFFER(log_buffer,
"[%s] FIFO compaction: Couldn't get current time: %s. "
"Not doing compactions based on TTL. ",
cf_name.c_str(), status.ToString().c_str());
return nullptr;
}
const uint64_t current_time = static_cast<uint64_t>(_current_time);
Fix assertion failure in FIFO compaction with TTL (#5754) Summary: Before this PR, the following sequence of events can cause assertion failure as shown below. Stack trace (partial): ``` (gdb) bt 2 0x00007f59b350ad15 in __assert_fail_base (fmt=<optimized out>, assertion=assertion@entry=0x9f8390 "mark_as_compacted ? !inputs_[i][j]->being_compacted : inputs_[i][j]->being_compacted", file=file@entry=0x9e347c "db/compaction/compaction.cc", line=line@entry=395, function=function@entry=0xa21ec0 <rocksdb::Compaction::MarkFilesBeingCompacted(bool)::__PRETTY_FUNCTION__> "void rocksdb::Compaction::MarkFilesBeingCompacted(bool)") at assert.c:92 3 0x00007f59b350adc3 in __GI___assert_fail (assertion=assertion@entry=0x9f8390 "mark_as_compacted ? !inputs_[i][j]->being_compacted : inputs_[i][j]->being_compacted", file=file@entry=0x9e347c "db/compaction/compaction.cc", line=line@entry=395, function=function@entry=0xa21ec0 <rocksdb::Compaction::MarkFilesBeingCompacted(bool)::__PRETTY_FUNCTION__> "void rocksdb::Compaction::MarkFilesBeingCompacted(bool)") at assert.c:101 4 0x0000000000492ccd in rocksdb::Compaction::MarkFilesBeingCompacted (this=<optimized out>, mark_as_compacted=<optimized out>) at db/compaction/compaction.cc:394 5 0x000000000049467a in rocksdb::Compaction::Compaction (this=0x7f59af013000, vstorage=0x7f581af53030, _immutable_cf_options=..., _mutable_cf_options=..., _inputs=..., _output_level=<optimized out>, _target_file_size=0, _max_compaction_bytes=0, _output_path_id=0, _compression=<incomplete type>, _compression_opts=..., _max_subcompactions=0, _grandparents=..., _manual_compaction=false, _score=4, _deletion_compaction=true, _compaction_reason=rocksdb::CompactionReason::kFIFOTtl) at db/compaction/compaction.cc:241 6 0x00000000004af9bc in rocksdb::FIFOCompactionPicker::PickTTLCompaction (this=0x7f59b31a6900, cf_name=..., mutable_cf_options=..., vstorage=0x7f581af53030, log_buffer=log_buffer@entry=0x7f59b1bfa930) at db/compaction/compaction_picker_fifo.cc:101 7 0x00000000004b0771 in rocksdb::FIFOCompactionPicker::PickCompaction (this=0x7f59b31a6900, cf_name=..., mutable_cf_options=..., vstorage=0x7f581af53030, log_buffer=0x7f59b1bfa930) at db/compaction/compaction_picker_fifo.cc:201 8 0x00000000004838cc in rocksdb::ColumnFamilyData::PickCompaction (this=this@entry=0x7f59b31b3700, mutable_options=..., log_buffer=log_buffer@entry=0x7f59b1bfa930) at db/column_family.cc:933 9 0x00000000004f3645 in rocksdb::DBImpl::BackgroundCompaction (this=this@entry=0x7f59b3176000, made_progress=made_progress@entry=0x7f59b1bfa6bf, job_context=job_context@entry=0x7f59b1bfa760, log_buffer=log_buffer@entry=0x7f59b1bfa930, prepicked_compaction=prepicked_compaction@entry=0x0, thread_pri=rocksdb::Env::LOW) at db/db_impl/db_impl_compaction_flush.cc:2541 10 0x00000000004f5e2a in rocksdb::DBImpl::BackgroundCallCompaction (this=this@entry=0x7f59b3176000, prepicked_compaction=prepicked_compaction@entry=0x0, bg_thread_pri=bg_thread_pri@entry=rocksdb::Env::LOW) at db/db_impl/db_impl_compaction_flush.cc:2312 11 0x00000000004f648e in rocksdb::DBImpl::BGWorkCompaction (arg=<optimized out>) at db/db_impl/db_impl_compaction_flush.cc:2087 ``` This can be caused by the following sequence of events. ``` Time | thr bg_compact_thr1 bg_compact_thr2 | write | flush | mark all l0 as being compacted | write | flush | add cf to queue again | mark all l0 as being | compacted, fail the | assertion V ``` Test plan (on devserver) Since bg_compact_thr1 and bg_compact_thr2 are two threads executing the same code, it is difficult to use sync point dependency to coordinate their execution. Therefore, I choose to use db_stress. ``` $TEST_TMPDIR=/dev/shm/rocksdb ./db_stress --periodic_compaction_seconds=1 --max_background_compactions=20 --format_version=2 --memtablerep=skip_list --max_write_buffer_number=3 --cache_index_and_filter_blocks=1 --reopen=20 --recycle_log_file_num=0 --acquire_snapshot_one_in=10000 --delpercent=4 --log2_keys_per_lock=22 --compaction_ttl=1 --block_size=16384 --use_multiget=1 --compact_files_one_in=1000000 --target_file_size_multiplier=2 --clear_column_family_one_in=0 --max_bytes_for_level_base=10485760 --use_full_merge_v1=1 --target_file_size_base=2097152 --checkpoint_one_in=1000000 --mmap_read=0 --compression_type=zstd --writepercent=35 --readpercent=45 --subcompactions=4 --use_merge=0 --write_buffer_size=4194304 --test_batches_snapshots=0 --db=/dev/shm/rocksdb/rocksdb_crashtest_whitebox --use_direct_reads=0 --compact_range_one_in=1000000 --open_files=-1 --destroy_db_initially=0 --progress_reports=0 --compression_zstd_max_train_bytes=0 --snapshot_hold_ops=100000 --enable_pipelined_write=0 --nooverwritepercent=1 --compression_max_dict_bytes=0 --max_key=1000000 --prefixpercent=5 --flush_one_in=1000000 --ops_per_thread=40000 --index_block_restart_interval=7 --cache_size=1048576 --compaction_style=2 --verify_checksum=1 --delrangepercent=1 --use_direct_io_for_flush_and_compaction=0 ``` This should see no assertion failure. Last but not least, ``` $COMPILE_WITH_ASAN=1 make -j32 all $make check ``` Pull Request resolved: https://github.com/facebook/rocksdb/pull/5754 Differential Revision: D17109791 Pulled By: riversand963 fbshipit-source-id: 25fc46101235add158554e096540b72c324be078
5 years ago
if (!level0_compactions_in_progress_.empty()) {
ROCKS_LOG_BUFFER(
log_buffer,
"[%s] FIFO compaction: Already executing compaction. No need "
"to run parallel compactions since compactions are very fast",
cf_name.c_str());
return nullptr;
}
std::vector<CompactionInputFiles> inputs;
inputs.emplace_back();
inputs[0].level = 0;
// avoid underflow
if (current_time > mutable_cf_options.ttl) {
for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
FileMetaData* f = *ritr;
if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
uint64_t creation_time =
f->fd.table_reader->GetTableProperties()->creation_time;
if (creation_time == 0 ||
creation_time >= (current_time - mutable_cf_options.ttl)) {
break;
}
}
total_size -= f->compensated_file_size;
inputs[0].files.push_back(f);
}
}
// Return a nullptr and proceed to size-based FIFO compaction if:
// 1. there are no files older than ttl OR
// 2. there are a few files older than ttl, but deleting them will not bring
// the total size to be less than max_table_files_size threshold.
if (inputs[0].files.empty() ||
total_size >
mutable_cf_options.compaction_options_fifo.max_table_files_size) {
return nullptr;
}
for (const auto& f : inputs[0].files) {
uint64_t creation_time = 0;
if (f && f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
}
ROCKS_LOG_BUFFER(log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64
" with creation time %" PRIu64 " for deletion",
cf_name.c_str(), f->fd.GetNumber(), creation_time);
}
Compaction* c = new Compaction(
vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
kNoCompression, mutable_cf_options.compression_opts,
/* max_subcompactions */ 0, {}, /* is manual */ false,
vstorage->CompactionScore(0),
/* is deletion compaction */ true, CompactionReason::kFIFOTtl);
return c;
}
Compaction* FIFOCompactionPicker::PickSizeCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage, LogBuffer* log_buffer) {
const int kLevel0 = 0;
const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
uint64_t total_size = GetTotalFilesSize(level_files);
if (total_size <=
mutable_cf_options.compaction_options_fifo.max_table_files_size ||
level_files.size() == 0) {
// total size not exceeded
if (mutable_cf_options.compaction_options_fifo.allow_compaction &&
level_files.size() > 0) {
CompactionInputFiles comp_inputs;
// try to prevent same files from being compacted multiple times, which
// could produce large files that may never TTL-expire. Achieve this by
// disallowing compactions with files larger than memtable (inflate its
// size by 10% to account for uncompressed L0 files that may have size
// slightly greater than memtable size limit).
size_t max_compact_bytes_per_del_file =
static_cast<size_t>(MultiplyCheckOverflow(
static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
1.1));
if (FindIntraL0Compaction(
level_files,
mutable_cf_options
.level0_file_num_compaction_trigger /* min_files_to_compact */
,
max_compact_bytes_per_del_file,
mutable_cf_options.max_compaction_bytes, &comp_inputs)) {
Compaction* c = new Compaction(
vstorage, ioptions_, mutable_cf_options, {comp_inputs}, 0,
16 * 1024 * 1024 /* output file size limit */,
0 /* max compaction bytes, not applicable */,
0 /* output path ID */, mutable_cf_options.compression,
mutable_cf_options.compression_opts, 0 /* max_subcompactions */, {},
/* is manual */ false, vstorage->CompactionScore(0),
/* is deletion compaction */ false,
CompactionReason::kFIFOReduceNumFiles);
return c;
}
}
ROCKS_LOG_BUFFER(
log_buffer,
"[%s] FIFO compaction: nothing to do. Total size %" PRIu64
", max size %" PRIu64 "\n",
cf_name.c_str(), total_size,
mutable_cf_options.compaction_options_fifo.max_table_files_size);
return nullptr;
}
if (!level0_compactions_in_progress_.empty()) {
ROCKS_LOG_BUFFER(
log_buffer,
"[%s] FIFO compaction: Already executing compaction. No need "
"to run parallel compactions since compactions are very fast",
cf_name.c_str());
return nullptr;
}
std::vector<CompactionInputFiles> inputs;
inputs.emplace_back();
inputs[0].level = 0;
for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
auto f = *ritr;
total_size -= f->compensated_file_size;
inputs[0].files.push_back(f);
char tmp_fsize[16];
AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize));
ROCKS_LOG_BUFFER(log_buffer,
"[%s] FIFO compaction: picking file %" PRIu64
" with size %s for deletion",
cf_name.c_str(), f->fd.GetNumber(), tmp_fsize);
if (total_size <=
mutable_cf_options.compaction_options_fifo.max_table_files_size) {
break;
}
}
Compaction* c = new Compaction(
vstorage, ioptions_, mutable_cf_options, std::move(inputs), 0, 0, 0, 0,
kNoCompression, mutable_cf_options.compression_opts,
/* max_subcompactions */ 0, {}, /* is manual */ false,
vstorage->CompactionScore(0),
/* is deletion compaction */ true, CompactionReason::kFIFOMaxSize);
return c;
}
Compaction* FIFOCompactionPicker::PickCompaction(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
Fix corruption with intra-L0 on ingested files (#5958) Summary: ## Problem Description Our process was abort when it call `CheckConsistency`. And the information in `stderr` show that "`L0 files seqno 3001491972 3004797440 vs. 3002875611 3004524421` ". Here are the causes of the accident I investigated. * RocksDB will call `CheckConsistency` whenever `MANIFEST` file is update. It will check sequence number interval of every file, except files which were ingested. * When one file is ingested into RocksDB, it will be assigned the value of global sequence number, and the minimum and maximum seqno of this file are equal, which are both equal to global sequence number. * `CheckConsistency` determines whether the file is ingested by whether the smallest and largest seqno of an sstable file are equal. * If IntraL0Compaction picks one sst which was ingested just now and compacted it into another sst, the `smallest_seqno` of this new file will be smaller than his `largest_seqno`. * If more than one ingested file was ingested before memtable schedule flush, and they all compact into one new sstable file by `IntraL0Compaction`. The sequence interval of this new file will be included in the interval of the memtable. So `CheckConsistency` will return a `Corruption`. * If a sstable was ingested after the memtable was schedule to flush, which would assign a larger seqno to it than memtable. Then the file was compacted with other files (these files were all flushed before the memtable) in L0 into one file. This compaction start before the flush job of memtable start, but completed after the flush job finish. So this new file produced by the compaction (we call it s1) would have a larger interval of sequence number than the file produced by flush (we call it s2). **But there was still some data in s1 written into RocksDB before the s2, so it's possible that some data in s2 was cover by old data in s1.** Of course, it would also make a `Corruption` because of overlap of seqno. There is the relationship of the files: > s1.smallest_seqno < s2.smallest_seqno < s2.largest_seqno < s1.largest_seqno So I skip pick sst file which was ingested in function `FindIntraL0Compaction ` ## Reason Here is my bug report: https://github.com/facebook/rocksdb/issues/5913 There are two situations that can cause the check to fail. ### First situation: - First we ingest five external sst into Rocksdb, and they happened to be ingested in L0. and there had been some data in memtable, which make the smallest sequence number of memtable is less than which of sst that we ingest. - If there had been one compaction job which compacted sst from L0 to L1, `LevelCompactionPicker` would trigger a `IntraL0Compaction` which would compact this five sst from L0 to L0. We call this sst A, which was merged from five ingested sst. - Then some data was put into memtable, and memtable was flushed to L0. We called this sst B. - RocksDB check consistency , and find the `smallest_seqno` of B is less than that of A and crash. Because A was merged from five sst, the smallest sequence number of it was less than the biggest sequece number of itself, so RocksDB could not tell if A was produce by ingested. ### Secondary situaion - First we have flushed many sst in L0, we call them [s1, s2, s3]. - There is an immutable memtable request to be flushed, but because flush thread is busy, so it has not been picked. we call it m1. And at the moment, one sst is ingested into L0. We call it s4. Because s4 is ingested after m1 became immutable memtable, so it has a larger log sequence number than m1. - m1 is flushed in L0. because it is small, this flush job finish quickly. we call it s5. - [s1, s2, s3, s4] are compacted into one sst to L0, by IntraL0Compaction. We call it s6. - compacted 4@0 files to L0 - When s6 is added into manifest, the corruption happened. because the largest sequence number of s6 is equal to s4, and they are both larger than that of s5. But because s1 is older than m1, so the smallest sequence number of s6 is smaller than that of s5. - s6.smallest_seqno < s5.smallest_seqno < s5.largest_seqno < s6.largest_seqno Pull Request resolved: https://github.com/facebook/rocksdb/pull/5958 Differential Revision: D18601316 fbshipit-source-id: 5fe54b3c9af52a2e1400728f565e895cde1c7267
5 years ago
VersionStorageInfo* vstorage, LogBuffer* log_buffer,
SequenceNumber /*earliest_memtable_seqno*/) {
assert(vstorage->num_levels() == 1);
Compaction* c = nullptr;
if (mutable_cf_options.ttl > 0) {
c = PickTTLCompaction(cf_name, mutable_cf_options, vstorage, log_buffer);
}
if (c == nullptr) {
c = PickSizeCompaction(cf_name, mutable_cf_options, vstorage, log_buffer);
}
RegisterCompaction(c);
return c;
}
Compaction* FIFOCompactionPicker::CompactRange(
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
VersionStorageInfo* vstorage, int input_level, int output_level,
const CompactRangeOptions& /*compact_range_options*/,
const InternalKey* /*begin*/, const InternalKey* /*end*/,
InternalKey** compaction_end, bool* /*manual_conflict*/,
uint64_t /*max_file_num_to_ignore*/) {
#ifdef NDEBUG
(void)input_level;
(void)output_level;
#endif
assert(input_level == 0);
assert(output_level == 0);
*compaction_end = nullptr;
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, ioptions_.info_log);
Compaction* c =
PickCompaction(cf_name, mutable_cf_options, vstorage, &log_buffer);
log_buffer.FlushBufferToLog();
return c;
}
} // namespace ROCKSDB_NAMESPACE
#endif // !ROCKSDB_LITE