|
|
|
// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/compaction/compaction_picker_fifo.h"
|
|
|
|
|
|
|
|
#include <cinttypes>
|
|
|
|
#include <string>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/column_family.h"
|
|
|
|
#include "logging/log_buffer.h"
|
|
|
|
#include "logging/logging.h"
|
|
|
|
#include "util/string_util.h"
|
|
|
|
|
|
|
|
namespace ROCKSDB_NAMESPACE {
|
|
|
|
namespace {
|
|
|
|
// Returns the sum of `fd.file_size` over all entries of `files`.
// An empty vector yields 0. Every pointer in `files` is dereferenced, so none
// may be null.
uint64_t GetTotalFilesSize(const std::vector<FileMetaData*>& files) {
  uint64_t bytes = 0;
  for (auto it = files.begin(); it != files.end(); ++it) {
    const FileMetaData* meta = *it;
    bytes += meta->fd.file_size;
  }
  return bytes;
}
|
|
|
|
} // anonymous namespace
|
|
|
|
|
|
|
|
bool FIFOCompactionPicker::NeedsCompaction(
|
|
|
|
const VersionStorageInfo* vstorage) const {
|
|
|
|
const int kLevel0 = 0;
|
|
|
|
return vstorage->CompactionScore(kLevel0) >= 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
Compaction* FIFOCompactionPicker::PickTTLCompaction(
|
|
|
|
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
|
|
|
|
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
|
|
|
|
LogBuffer* log_buffer) {
|
|
|
|
assert(mutable_cf_options.ttl > 0);
|
|
|
|
|
|
|
|
const int kLevel0 = 0;
|
|
|
|
const std::vector<FileMetaData*>& level_files = vstorage->LevelFiles(kLevel0);
|
|
|
|
uint64_t total_size = GetTotalFilesSize(level_files);
|
|
|
|
|
|
|
|
int64_t _current_time;
|
|
|
|
auto status = ioptions_.clock->GetCurrentTime(&_current_time);
|
|
|
|
if (!status.ok()) {
|
|
|
|
ROCKS_LOG_BUFFER(log_buffer,
|
|
|
|
"[%s] FIFO compaction: Couldn't get current time: %s. "
|
|
|
|
"Not doing compactions based on TTL. ",
|
|
|
|
cf_name.c_str(), status.ToString().c_str());
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
const uint64_t current_time = static_cast<uint64_t>(_current_time);
|
|
|
|
|
Fix assertion failure in FIFO compaction with TTL (#5754)
Summary:
Before this PR, the following sequence of events can cause assertion failure as shown below.
Stack trace (partial):
```
(gdb) bt
2 0x00007f59b350ad15 in __assert_fail_base (fmt=<optimized out>, assertion=assertion@entry=0x9f8390 "mark_as_compacted ? !inputs_[i][j]->being_compacted : inputs_[i][j]->being_compacted", file=file@entry=0x9e347c "db/compaction/compaction.cc", line=line@entry=395, function=function@entry=0xa21ec0 <rocksdb::Compaction::MarkFilesBeingCompacted(bool)::__PRETTY_FUNCTION__> "void rocksdb::Compaction::MarkFilesBeingCompacted(bool)") at assert.c:92
3 0x00007f59b350adc3 in __GI___assert_fail (assertion=assertion@entry=0x9f8390 "mark_as_compacted ? !inputs_[i][j]->being_compacted : inputs_[i][j]->being_compacted", file=file@entry=0x9e347c "db/compaction/compaction.cc", line=line@entry=395, function=function@entry=0xa21ec0 <rocksdb::Compaction::MarkFilesBeingCompacted(bool)::__PRETTY_FUNCTION__> "void rocksdb::Compaction::MarkFilesBeingCompacted(bool)") at assert.c:101
4 0x0000000000492ccd in rocksdb::Compaction::MarkFilesBeingCompacted (this=<optimized out>, mark_as_compacted=<optimized out>) at db/compaction/compaction.cc:394
5 0x000000000049467a in rocksdb::Compaction::Compaction (this=0x7f59af013000, vstorage=0x7f581af53030, _immutable_cf_options=..., _mutable_cf_options=..., _inputs=..., _output_level=<optimized out>, _target_file_size=0, _max_compaction_bytes=0, _output_path_id=0, _compression=<incomplete type>, _compression_opts=..., _max_subcompactions=0, _grandparents=..., _manual_compaction=false, _score=4, _deletion_compaction=true, _compaction_reason=rocksdb::CompactionReason::kFIFOTtl) at db/compaction/compaction.cc:241
6 0x00000000004af9bc in rocksdb::FIFOCompactionPicker::PickTTLCompaction (this=0x7f59b31a6900, cf_name=..., mutable_cf_options=..., vstorage=0x7f581af53030, log_buffer=log_buffer@entry=0x7f59b1bfa930) at db/compaction/compaction_picker_fifo.cc:101
7 0x00000000004b0771 in rocksdb::FIFOCompactionPicker::PickCompaction (this=0x7f59b31a6900, cf_name=..., mutable_cf_options=..., vstorage=0x7f581af53030, log_buffer=0x7f59b1bfa930) at db/compaction/compaction_picker_fifo.cc:201
8 0x00000000004838cc in rocksdb::ColumnFamilyData::PickCompaction (this=this@entry=0x7f59b31b3700, mutable_options=..., log_buffer=log_buffer@entry=0x7f59b1bfa930) at db/column_family.cc:933
9 0x00000000004f3645 in rocksdb::DBImpl::BackgroundCompaction (this=this@entry=0x7f59b3176000, made_progress=made_progress@entry=0x7f59b1bfa6bf, job_context=job_context@entry=0x7f59b1bfa760, log_buffer=log_buffer@entry=0x7f59b1bfa930, prepicked_compaction=prepicked_compaction@entry=0x0, thread_pri=rocksdb::Env::LOW) at db/db_impl/db_impl_compaction_flush.cc:2541
10 0x00000000004f5e2a in rocksdb::DBImpl::BackgroundCallCompaction (this=this@entry=0x7f59b3176000, prepicked_compaction=prepicked_compaction@entry=0x0, bg_thread_pri=bg_thread_pri@entry=rocksdb::Env::LOW) at db/db_impl/db_impl_compaction_flush.cc:2312
11 0x00000000004f648e in rocksdb::DBImpl::BGWorkCompaction (arg=<optimized out>) at db/db_impl/db_impl_compaction_flush.cc:2087
```
This can be caused by the following sequence of events.
```
Time
| thr bg_compact_thr1 bg_compact_thr2
| write
| flush
| mark all l0 as being compacted
| write
| flush
| add cf to queue again
| mark all l0 as being
| compacted, fail the
| assertion
V
```
Test plan (on devserver)
Since bg_compact_thr1 and bg_compact_thr2 are two threads executing the same
code, it is difficult to use sync point dependency to
coordinate their execution. Therefore, I choose to use db_stress.
```
$TEST_TMPDIR=/dev/shm/rocksdb ./db_stress --periodic_compaction_seconds=1 --max_background_compactions=20 --format_version=2 --memtablerep=skip_list --max_write_buffer_number=3 --cache_index_and_filter_blocks=1 --reopen=20 --recycle_log_file_num=0 --acquire_snapshot_one_in=10000 --delpercent=4 --log2_keys_per_lock=22 --compaction_ttl=1 --block_size=16384 --use_multiget=1 --compact_files_one_in=1000000 --target_file_size_multiplier=2 --clear_column_family_one_in=0 --max_bytes_for_level_base=10485760 --use_full_merge_v1=1 --target_file_size_base=2097152 --checkpoint_one_in=1000000 --mmap_read=0 --compression_type=zstd --writepercent=35 --readpercent=45 --subcompactions=4 --use_merge=0 --write_buffer_size=4194304 --test_batches_snapshots=0 --db=/dev/shm/rocksdb/rocksdb_crashtest_whitebox --use_direct_reads=0 --compact_range_one_in=1000000 --open_files=-1 --destroy_db_initially=0 --progress_reports=0 --compression_zstd_max_train_bytes=0 --snapshot_hold_ops=100000 --enable_pipelined_write=0 --nooverwritepercent=1 --compression_max_dict_bytes=0 --max_key=1000000 --prefixpercent=5 --flush_one_in=1000000 --ops_per_thread=40000 --index_block_restart_interval=7 --cache_size=1048576 --compaction_style=2 --verify_checksum=1 --delrangepercent=1 --use_direct_io_for_flush_and_compaction=0
```
This should see no assertion failure.
Last but not least,
```
$COMPILE_WITH_ASAN=1 make -j32 all
$make check
```
Pull Request resolved: https://github.com/facebook/rocksdb/pull/5754
Differential Revision: D17109791
Pulled By: riversand963
fbshipit-source-id: 25fc46101235add158554e096540b72c324be078
5 years ago
|
|
|
if (!level0_compactions_in_progress_.empty()) {
|
|
|
|
ROCKS_LOG_BUFFER(
|
|
|
|
log_buffer,
|
|
|
|
"[%s] FIFO compaction: Already executing compaction. No need "
|
|
|
|
"to run parallel compactions since compactions are very fast",
|
|
|
|
cf_name.c_str());
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<CompactionInputFiles> inputs;
|
|
|
|
inputs.emplace_back();
|
|
|
|
inputs[0].level = 0;
|
|
|
|
|
|
|
|
// avoid underflow
|
|
|
|
if (current_time > mutable_cf_options.ttl) {
|
|
|
|
for (auto ritr = level_files.rbegin(); ritr != level_files.rend(); ++ritr) {
|
|
|
|
FileMetaData* f = *ritr;
|
|
|
|
assert(f);
|
|
|
|
if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
|
|
|
|
uint64_t creation_time =
|
|
|
|
f->fd.table_reader->GetTableProperties()->creation_time;
|
|
|
|
if (creation_time == 0 ||
|
|
|
|
creation_time >= (current_time - mutable_cf_options.ttl)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
total_size -= f->fd.file_size;
|
|
|
|
inputs[0].files.push_back(f);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Return a nullptr and proceed to size-based FIFO compaction if:
|
|
|
|
// 1. there are no files older than ttl OR
|
|
|
|
// 2. there are a few files older than ttl, but deleting them will not bring
|
|
|
|
// the total size to be less than max_table_files_size threshold.
|
|
|
|
if (inputs[0].files.empty() ||
|
|
|
|
total_size >
|
|
|
|
mutable_cf_options.compaction_options_fifo.max_table_files_size) {
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (const auto& f : inputs[0].files) {
|
|
|
|
uint64_t creation_time = 0;
|
|
|
|
assert(f);
|
|
|
|
if (f->fd.table_reader && f->fd.table_reader->GetTableProperties()) {
|
|
|
|
creation_time = f->fd.table_reader->GetTableProperties()->creation_time;
|
|
|
|
}
|
|
|
|
ROCKS_LOG_BUFFER(log_buffer,
|
|
|
|
"[%s] FIFO compaction: picking file %" PRIu64
|
|
|
|
" with creation time %" PRIu64 " for deletion",
|
|
|
|
cf_name.c_str(), f->fd.GetNumber(), creation_time);
|
|
|
|
}
|
|
|
|
|
|
|
|
Compaction* c = new Compaction(
|
|
|
|
vstorage, ioptions_, mutable_cf_options, mutable_db_options,
|
|
|
|
std::move(inputs), 0, 0, 0, 0, kNoCompression,
|
|
|
|
mutable_cf_options.compression_opts, Temperature::kUnknown,
|
|
|
|
/* max_subcompactions */ 0, {}, /* is manual */ false,
|
|
|
|
/* trim_ts */ "", vstorage->CompactionScore(0),
|
|
|
|
/* is deletion compaction */ true, /* l0_files_might_overlap */ true,
|
|
|
|
CompactionReason::kFIFOTtl);
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
// The size-based compaction picker for FIFO.
//
// When the entire column family size exceeds max_table_files_size, FIFO will
// try to delete the oldest sst file(s) until the resulting column family size
// is smaller than max_table_files_size.
//
// This function also takes care of the case where a DB is migrating from
// level / universal compaction to FIFO compaction. During the migration, the
// column family will also have non-L0 files while FIFO can only create L0
// files. In this case, this function will first purge the sst files in the
// bottom-most non-empty level, and the DB will eventually converge to the
// regular FIFO case where there're only L0 files. Note that during the
// migration case, the purge order will only be an approximation of "FIFO"
// as entries inside lower-level files might sometimes be newer than some
// entries inside upper-level files.
|
|
|
|
Compaction* FIFOCompactionPicker::PickSizeCompaction(
    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
    LogBuffer* log_buffer) {
  const auto& fifo_opts = mutable_cf_options.compaction_options_fifo;

  // Add up the size of every level and remember the deepest level that
  // actually holds data.
  int bottom_level = 0;
  uint64_t remaining_bytes = 0;
  for (int lvl = 0; lvl < vstorage->num_levels(); ++lvl) {
    const uint64_t lvl_bytes = GetTotalFilesSize(vstorage->LevelFiles(lvl));
    remaining_bytes += lvl_bytes;
    if (lvl_bytes > 0) {
      bottom_level = lvl;
    }
  }
  const bool l0_only = (bottom_level == 0);
  const bool over_limit = remaining_bytes > fifo_opts.max_table_files_size;

  if (l0_only && !over_limit) {
    // Within the size budget: nothing has to be deleted, but an intra-L0
    // compaction may still be worthwhile when it is enabled.
    const std::vector<FileMetaData*>& l0_files = vstorage->LevelFiles(0);
    if (fifo_opts.allow_compaction && l0_files.size() > 0) {
      CompactionInputFiles comp_inputs;
      // Avoid compacting the same files over and over, which could build
      // large files that never TTL-expire: refuse inputs larger than a
      // memtable. The limit is padded by 10% because uncompressed L0 files
      // may be slightly larger than the memtable size limit.
      size_t max_compact_bytes_per_del_file =
          static_cast<size_t>(MultiplyCheckOverflow(
              static_cast<uint64_t>(mutable_cf_options.write_buffer_size),
              1.1));
      const bool found_intra_l0 = FindIntraL0Compaction(
          l0_files,
          /* min_files_to_compact */
          mutable_cf_options.level0_file_num_compaction_trigger,
          max_compact_bytes_per_del_file,
          mutable_cf_options.max_compaction_bytes, &comp_inputs);
      if (found_intra_l0) {
        return new Compaction(
            vstorage, ioptions_, mutable_cf_options, mutable_db_options,
            {comp_inputs}, 0, 16 * 1024 * 1024 /* output file size limit */,
            0 /* max compaction bytes, not applicable */,
            0 /* output path ID */, mutable_cf_options.compression,
            mutable_cf_options.compression_opts, Temperature::kUnknown,
            0 /* max_subcompactions */, {}, /* is manual */ false,
            /* trim_ts */ "", vstorage->CompactionScore(0),
            /* is deletion compaction */ false,
            /* l0_files_might_overlap */ true,
            CompactionReason::kFIFOReduceNumFiles);
      }
    }

    ROCKS_LOG_BUFFER(
        log_buffer,
        "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
        ", max size %" PRIu64 "\n",
        cf_name.c_str(), remaining_bytes, fifo_opts.max_table_files_size);
    return nullptr;
  }

  // Deletion compactions are not run in parallel with another L0 compaction.
  if (!level0_compactions_in_progress_.empty()) {
    ROCKS_LOG_BUFFER(
        log_buffer,
        "[%s] FIFO compaction: Already executing compaction. No need "
        "to run parallel compactions since compactions are very fast",
        cf_name.c_str());
    return nullptr;
  }

  // Non-L0 data exists but the column family is within budget.
  if (!l0_only && !over_limit) {
    ROCKS_LOG_BUFFER(
        log_buffer,
        "[%s] FIFO compaction: nothing to do. Total size %" PRIu64
        ", max size %" PRIu64 "\n",
        cf_name.c_str(), remaining_bytes, fifo_opts.max_table_files_size);
    return nullptr;
  }

  std::vector<CompactionInputFiles> inputs(1);
  inputs[0].level = bottom_level;
  auto& victims = inputs[0].files;
  const std::vector<FileMetaData*>& bottom_files =
      vstorage->LevelFiles(bottom_level);

  if (l0_only) {
    // In L0 the right-most files are the oldest, so walk from the back and
    // stop as soon as the remaining size fits into the budget.
    for (size_t idx = bottom_files.size(); idx > 0; --idx) {
      FileMetaData* oldest = bottom_files[idx - 1];
      remaining_bytes -= oldest->fd.file_size;
      victims.push_back(oldest);
      char size_text[16];
      AppendHumanBytes(oldest->fd.GetFileSize(), size_text, sizeof(size_text));
      ROCKS_LOG_BUFFER(log_buffer,
                       "[%s] FIFO compaction: picking file %" PRIu64
                       " with size %s for deletion",
                       cf_name.c_str(), oldest->fd.GetNumber(), size_text);
      if (remaining_bytes <= fifo_opts.max_table_files_size) {
        break;
      }
    }
  } else {
    // For a non-L0 bottom level the logically oldest file is unknown: the
    // file creation time only tells when the file was compacted into this
    // level, not when its entries were first written.
    //
    // Files are therefore deleted from the left, i.e. the sst file with the
    // smallest key goes first. This serves a major class of FIFO use cases
    // in which smaller keys belong to older data.
    for (FileMetaData* leftmost : bottom_files) {
      remaining_bytes -= leftmost->fd.file_size;
      victims.push_back(leftmost);
      char size_text[16];
      AppendHumanBytes(leftmost->fd.GetFileSize(), size_text,
                       sizeof(size_text));
      ROCKS_LOG_BUFFER(
          log_buffer,
          "[%s] FIFO compaction: picking file %" PRIu64
          " with size %s for deletion under total size %" PRIu64
          " vs max table files size %" PRIu64,
          cf_name.c_str(), leftmost->fd.GetNumber(), size_text,
          remaining_bytes, fifo_opts.max_table_files_size);

      if (remaining_bytes <= fifo_opts.max_table_files_size) {
        break;
      }
    }
  }

  return new Compaction(
      vstorage, ioptions_, mutable_cf_options, mutable_db_options,
      std::move(inputs), bottom_level,
      /* target_file_size */ 0,
      /* max_compaction_bytes */ 0,
      /* output_path_id */ 0, kNoCompression,
      mutable_cf_options.compression_opts, Temperature::kUnknown,
      /* max_subcompactions */ 0, {}, /* is manual */ false,
      /* trim_ts */ "", vstorage->CompactionScore(0),
      /* is deletion compaction */ true,
      /* l0_files_might_overlap */ true, CompactionReason::kFIFOMaxSize);
}
|
|
|
|
|
|
|
|
// Picks L0 files that are old enough to be rewritten with
// Temperature::kWarm (compaction reason kChangeTemperature).
//
// Returns nullptr when `compaction_options_fifo.age_for_warm` is 0, when any
// level other than L0 holds data, when the current time is unavailable, when
// an L0 compaction is already in progress, when a scanned file is being
// compacted, or when no file qualifies.
Compaction* FIFOCompactionPicker::PickCompactionToWarm(
    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
    LogBuffer* log_buffer) {
  const auto age_for_warm =
      mutable_cf_options.compaction_options_fifo.age_for_warm;
  if (age_for_warm == 0) {
    return nullptr;
  }

  // PickCompactionToWarm is only triggered if there are no non-L0 files.
  for (int lvl = 1; lvl < vstorage->num_levels(); ++lvl) {
    if (GetTotalFilesSize(vstorage->LevelFiles(lvl)) > 0) {
      return nullptr;
    }
  }

  const std::vector<FileMetaData*>& l0_files = vstorage->LevelFiles(0);

  int64_t now_signed;
  auto clock_status = ioptions_.clock->GetCurrentTime(&now_signed);
  if (!clock_status.ok()) {
    ROCKS_LOG_BUFFER(log_buffer,
                     "[%s] FIFO compaction: Couldn't get current time: %s. "
                     "Not doing compactions based on warm threshold. ",
                     cf_name.c_str(), clock_status.ToString().c_str());
    return nullptr;
  }
  const uint64_t now = static_cast<uint64_t>(now_signed);

  if (!level0_compactions_in_progress_.empty()) {
    ROCKS_LOG_BUFFER(
        log_buffer,
        "[%s] FIFO compaction: Already executing compaction. Parallel "
        "compactions are not supported",
        cf_name.c_str());
    return nullptr;
  }

  // Guard against underflow of `now - age_for_warm`. Without a valid
  // threshold no file can be selected.
  if (!(now > age_for_warm)) {
    return nullptr;
  }
  const uint64_t create_time_threshold = now - age_for_warm;

  std::vector<CompactionInputFiles> inputs(1);
  inputs[0].level = 0;
  auto& picked = inputs[0].files;

  // Ideally a file would qualify for the warm tier based on the timestamp of
  // its youngest entry. That information is not available, so it is inferred
  // from the oldest entry of the next (slightly younger) file. `candidate` is
  // therefore the file seen in the previous iteration, and it is only
  // committed once its successor has been examined.
  uint64_t picked_bytes = 0;
  FileMetaData* candidate = nullptr;
  for (auto rit = l0_files.rbegin(); rit != l0_files.rend(); ++rit) {
    FileMetaData* cur = *rit;
    assert(cur);
    if (cur->being_compacted) {
      // Probably unreachable for now because two compactions are never
      // scheduled in parallel; if it does happen, schedule nothing at all.
      return nullptr;
    }

    const uint64_t oldest_ancester_time = cur->TryGetOldestAncesterTime();
    // Stop when the time is unknown (older files may lack the information;
    // deriving it from newer files is not worth the extra logic) or when the
    // previous, slightly older file does not qualify for the warm tier.
    if (oldest_ancester_time == kUnknownOldestAncesterTime ||
        oldest_ancester_time > create_time_threshold) {
      break;
    }

    if (candidate != nullptr) {
      picked_bytes += candidate->fd.GetFileSize();
      if (picked_bytes > mutable_cf_options.max_compaction_bytes) {
        break;
      }
      picked.push_back(candidate);
      ROCKS_LOG_BUFFER(log_buffer,
                       "[%s] FIFO compaction: picking file %" PRIu64
                       " with next file's oldest time %" PRIu64 " for warm",
                       cf_name.c_str(), candidate->fd.GetNumber(),
                       oldest_ancester_time);
    }

    const bool not_yet_warm = cur->temperature == Temperature::kUnknown ||
                              cur->temperature == Temperature::kHot;
    if (not_yet_warm) {
      candidate = cur;
    } else if (!picked.empty()) {
      // A warm file newer than files picked.
      break;
    } else {
      assert(candidate == nullptr);
    }
  }

  if (picked.empty()) {
    return nullptr;
  }

  return new Compaction(
      vstorage, ioptions_, mutable_cf_options, mutable_db_options,
      std::move(inputs), 0, 0 /* output file size limit */,
      0 /* max compaction bytes, not applicable */, 0 /* output path ID */,
      mutable_cf_options.compression, mutable_cf_options.compression_opts,
      Temperature::kWarm,
      /* max_subcompactions */ 0, {}, /* is manual */ false, /* trim_ts */ "",
      vstorage->CompactionScore(0),
      /* is deletion compaction */ false, /* l0_files_might_overlap */ true,
      CompactionReason::kChangeTemperature);
}
|
|
|
|
|
|
|
|
Compaction* FIFOCompactionPicker::PickCompaction(
|
|
|
|
const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
|
|
|
|
const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
|
Sort L0 files by newly introduced epoch_num (#10922)
Summary:
**Context:**
Sorting L0 files by `largest_seqno` has at least two inconvenience:
- File ingestion and compaction involving ingested files can create files of overlapping seqno range with the existing files. `force_consistency_check=true` will catch such overlap seqno range even those harmless overlap.
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n")
- insert k1@1 to memtable m1
- ingest file s1 with k2@2, ingest file s2 with k3@3
- insert k4@4 to m1
- compact files s1, s2 and result in new file s3 of seqno range [2, 3]
- flush m1 and result in new file s4 of seqno range [1, 4]. And `force_consistency_check=true` will think s4 and s3 has file reordering corruption that might cause retuning an old value of k1
- However such caught corruption is a false positive since s1, s2 will not have overlapped keys with k1 or whatever inserted into m1 before ingest file s1 by the requirement of file ingestion (otherwise the m1 will be flushed first before any of the file ingestion completes). Therefore there in fact isn't any file reordering corruption.
- Single delete can decrease a file's largest seqno and ordering by `largest_seqno` can introduce a wrong ordering hence file reordering corruption
- For example, consider the following sequence of events ("key@n" indicates key at seqno "n", Credit to ajkr for this example)
- an existing SST s1 contains only k1@1
- insert k1@2 to memtable m1
- ingest file s2 with k3@3, ingest file s3 with k4@4
- insert single delete k5@5 in m1
- flush m1 and result in new file s4 of seqno range [2, 5]
- compact s1, s2, s3 and result in new file s5 of seqno range [1, 4]
- compact s4 and result in new file s6 of seqno range [2] due to single delete
- By the last step, we have file ordering by largest seqno (">" means "newer") : s5 > s6 while s6 contains a newer version of the k1's value (i.e, k1@2) than s5, which is a real reordering corruption. While this can be caught by `force_consistency_check=true`, there isn't a good way to prevent this from happening if ordering by `largest_seqno`
Therefore, we are redesigning the sorting criteria of L0 files and avoid above inconvenience. Credit to ajkr , we now introduce `epoch_num` which describes the order of a file being flushed or ingested/imported (compaction output file will has the minimum `epoch_num` among input files'). This will avoid the above inconvenience in the following ways:
- In the first case above, there will no longer be overlap seqno range check in `force_consistency_check=true` but `epoch_number` ordering check. This will result in file ordering s1 < s2 < s4 (pre-compaction) and s3 < s4 (post-compaction) which won't trigger false positive corruption. See test class `DBCompactionTestL0FilesMisorderCorruption*` for more.
- In the second case above, this will result in file ordering s1 < s2 < s3 < s4 (pre-compacting s1, s2, s3), s5 < s4 (post-compacting s1, s2, s3), s5 < s6 (post-compacting s4), which are correct file ordering without causing any corruption.
**Summary:**
- Introduce `epoch_number` stored per `ColumnFamilyData` and sort CF's L0 files by their assigned `epoch_number` instead of `largest_seqno`.
- `epoch_number` is increased and assigned upon `VersionEdit::AddFile()` for flush (or similarly for WriteLevel0TableForRecovery) and file ingestion (except for allow_behind_true, which will always get assigned as the `kReservedEpochNumberForFileIngestedBehind`)
- Compaction output file is assigned with the minimum `epoch_number` among input files'
- Refit level: reuse refitted file's epoch_number
- Other paths needing `epoch_number` treatment:
- Import column families: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`
- Repair: reuse file's epoch_number if exists. If not, assign one based on `NewestFirstBySeqNo`.
- Assigning new epoch_number to a file and adding this file to LSM tree should be atomic. This is guaranteed by us assigning epoch_number right upon `VersionEdit::AddFile()` where this version edit will be apply to LSM tree shape right after by holding the db mutex (e.g, flush, file ingestion, import column family) or by there is only 1 ongoing edit per CF (e.g, WriteLevel0TableForRecovery, Repair).
- Assigning the minimum input epoch number to compaction output file won't misorder L0 files (even through later `Refit(target_level=0)`). It's due to for every key "k" in the input range, a legit compaction will cover a continuous epoch number range of that key. As long as we assign the key "k" the minimum input epoch number, it won't become newer or older than the versions of this key that aren't included in this compaction hence no misorder.
- Persist `epoch_number` of each file in manifest and recover `epoch_number` on db recovery
- Backward compatibility with old db without `epoch_number` support is guaranteed by assigning `epoch_number` to recovered files by `NewestFirstBySeqno` order. See `VersionStorageInfo::RecoverEpochNumbers()` for more
- Forward compatibility with manifest is guaranteed by flexibility of `NewFileCustomTag`
- Replace `force_consistent_check` on L0 with `epoch_number` and remove false positive check like case 1 with `largest_seqno` above
- Due to backward compatibility issue, we might encounter files with missing epoch number at the beginning of db recovery. We will still use old L0 sorting mechanism (`NewestFirstBySeqno`) to check/sort them till we infer their epoch number. See usages of `EpochNumberRequirement`.
- Remove fix https://github.com/facebook/rocksdb/pull/5958#issue-511150930 and their outdated tests to file reordering corruption because such fix can be replaced by this PR.
- Misc:
- update existing tests with `epoch_number` so make check will pass
- update https://github.com/facebook/rocksdb/pull/5958#issue-511150930 tests to verify corruption is fixed using `epoch_number` and cover universal/fifo compaction/CompactRange/CompactFile cases
- assert db_mutex is held for a few places before calling ColumnFamilyData::NewEpochNumber()
Pull Request resolved: https://github.com/facebook/rocksdb/pull/10922
Test Plan:
- `make check`
- New unit tests under `db/db_compaction_test.cc`, `db/db_test2.cc`, `db/version_builder_test.cc`, `db/repair_test.cc`
- Updated tests (i.e, `DBCompactionTestL0FilesMisorderCorruption*`) under https://github.com/facebook/rocksdb/pull/5958#issue-511150930
- [Ongoing] Compatibility test: manually run https://github.com/ajkr/rocksdb/commit/36a5686ec012f35a4371e409aa85c404ca1c210d (with file ingestion off for running the `.orig` binary to prevent this bug affecting upgrade/downgrade formality checking) for 1 hour on `simple black/white box`, `cf_consistency/txn/enable_ts with whitebox + test_best_efforts_recovery with blackbox`
- [Ongoing] normal db stress test
- [Ongoing] db stress test with aggressive value https://github.com/facebook/rocksdb/pull/10761
Reviewed By: ajkr
Differential Revision: D41063187
Pulled By: hx235
fbshipit-source-id: 826cb23455de7beaabe2d16c57682a82733a32a9
2 years ago
|
|
|
LogBuffer* log_buffer) {
|
|
|
|
Compaction* c = nullptr;
|
|
|
|
if (mutable_cf_options.ttl > 0) {
|
|
|
|
c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options,
|
|
|
|
vstorage, log_buffer);
|
|
|
|
}
|
|
|
|
if (c == nullptr) {
|
|
|
|
c = PickSizeCompaction(cf_name, mutable_cf_options, mutable_db_options,
|
|
|
|
vstorage, log_buffer);
|
|
|
|
}
|
|
|
|
if (c == nullptr) {
|
|
|
|
c = PickCompactionToWarm(cf_name, mutable_cf_options, mutable_db_options,
|
|
|
|
vstorage, log_buffer);
|
|
|
|
}
|
|
|
|
RegisterCompaction(c);
|
|
|
|
return c;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Manual-compaction entry point for FIFO. The requested key range and the
// remaining options are ignored and the call is forwarded to PickCompaction().
// `input_level` and `output_level` must both be 0 (asserted).
// `*compaction_end` is always set to nullptr. Messages produced while picking
// are gathered in a local LogBuffer and flushed before returning.
Compaction* FIFOCompactionPicker::CompactRange(
    const std::string& cf_name, const MutableCFOptions& mutable_cf_options,
    const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage,
    int input_level, int output_level,
    const CompactRangeOptions& /*compact_range_options*/,
    const InternalKey* /*begin*/, const InternalKey* /*end*/,
    InternalKey** compaction_end, bool* /*manual_conflict*/,
    uint64_t /*max_file_num_to_ignore*/, const std::string& /*trim_ts*/) {
  // The levels are only read by the asserts below; the casts keep release
  // (NDEBUG) builds free of unused-parameter warnings.
  (void)input_level;
  (void)output_level;
  assert(input_level == 0);
  assert(output_level == 0);

  *compaction_end = nullptr;

  LogBuffer pick_log(InfoLogLevel::INFO_LEVEL, ioptions_.logger);
  Compaction* const picked = PickCompaction(
      cf_name, mutable_cf_options, mutable_db_options, vstorage, &pick_log);
  pick_log.FlushBufferToLog();
  return picked;
}
|
|
|
|
|
|
|
|
} // namespace ROCKSDB_NAMESPACE
|