Compare the number of input keys and processed keys for compactions (#11571)

Summary:
... to improve data integrity validation during compaction.

A new option `compaction_verify_record_count` is introduced for this verification and is enabled by default. One exception when the verification is not done is when a compaction filter returns kRemoveAndSkipUntil which can cause CompactionIterator to seek until some key and hence not able to keep track of the number of keys processed.

For expected number of input keys, we sum over the number of total keys - number of range tombstones across compaction input files (`CompactionJob::UpdateCompactionStats()`). Table properties are consulted if `FileMetaData` is not initialized for some input file. Since table properties for all input files were also constructed during `DBImpl::NotifyOnCompactionBegin()`, `Compaction::GetTableProperties()` is introduced to reduce duplicated code.

For actual number of keys processed, each subcompaction will record its number of keys processed to `sub_compact->compaction_job_stats.num_input_records` and aggregated when all subcompactions finish (`CompactionJob::AggregateCompactionStats()`). In the case when some subcompaction encountered kRemoveAndSkipUntil from compaction filter and does not have accurate count, it propagates this information through `sub_compact->compaction_job_stats.has_num_input_records`.

Pull Request resolved: https://github.com/facebook/rocksdb/pull/11571

Test Plan:
* Add a new unit test `DBCompactionTest.VerifyRecordCount` for the corruption case.
* All other unit tests for non-corrupted case.
* Ran crash test for a few hours: `python3 ./tools/db_crashtest.py whitebox --simple`

Reviewed By: ajkr

Differential Revision: D47131965

Pulled By: cbi42

fbshipit-source-id: cc8e94565dd526c4347e9d3843ecf32f6727af92
oxigraph-main
Changyu Bi 1 year ago committed by Facebook GitHub Bot
parent 5dd8c114bb
commit 6a0f637633
  1. 4
      db/builder.cc
  2. 29
      db/compaction/compaction.cc
  3. 19
      db/compaction/compaction.h
  4. 13
      db/compaction/compaction_iterator.cc
  5. 23
      db/compaction/compaction_iterator.h
  6. 4
      db/compaction/compaction_iterator_test.cc
  7. 121
      db/compaction/compaction_job.cc
  8. 19
      db/compaction/compaction_job.h
  9. 3
      db/compaction/compaction_job_test.cc
  10. 37
      db/db_compaction_test.cc
  11. 2
      db/db_impl/db_impl.h
  12. 25
      db/db_impl/db_impl_compaction_flush.cc
  13. 1
      db/flush_job.cc
  14. 3
      db/version_edit.h
  15. 3
      include/rocksdb/compaction_job_stats.h
  16. 20
      include/rocksdb/options.h
  17. 7
      options/db_options.cc
  18. 1
      options/db_options.h
  19. 2
      options/options_helper.cc
  20. 1
      options/options_settable_test.cc
  21. 8
      table/mock_table.cc
  22. 1
      unreleased_history/new_features/compaction_verify_input_count.md
  23. 2
      util/compaction_job_stats_impl.cc

@ -203,6 +203,7 @@ Status BuildTable(
blob_file_builder.get(), ioptions.allow_data_in_errors, blob_file_builder.get(), ioptions.allow_data_in_errors,
ioptions.enforce_single_del_contracts, ioptions.enforce_single_del_contracts,
/*manual_compaction_canceled=*/kManualCompactionCanceledFalse, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
true /* must_count_input_entries */,
/*compaction=*/nullptr, compaction_filter.get(), /*compaction=*/nullptr, compaction_filter.get(),
/*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low); /*shutting_down=*/nullptr, db_options.info_log, full_history_ts_low);
@ -286,8 +287,9 @@ Status BuildTable(
TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable"); TEST_SYNC_POINT("BuildTable:BeforeFinishBuildTable");
const bool empty = builder->IsEmpty(); const bool empty = builder->IsEmpty();
if (num_input_entries != nullptr) { if (num_input_entries != nullptr) {
assert(c_iter.HasNumInputEntryScanned());
*num_input_entries = *num_input_entries =
c_iter.num_input_entry_scanned() + num_unfragmented_tombstones; c_iter.NumInputEntryScanned() + num_unfragmented_tombstones;
} }
if (!s.ok() || empty) { if (!s.ok() || empty) {
builder->Abandon(); builder->Abandon();

@ -13,6 +13,7 @@
#include <vector> #include <vector>
#include "db/column_family.h" #include "db/column_family.h"
#include "logging/logging.h"
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/sst_partitioner.h" #include "rocksdb/sst_partitioner.h"
#include "test_util/sync_point.h" #include "test_util/sync_point.h"
@ -203,6 +204,34 @@ bool Compaction::IsFullCompaction(
return num_files_in_compaction == total_num_files; return num_files_in_compaction == total_num_files;
} }
const TablePropertiesCollection& Compaction::GetTableProperties() {
if (!input_table_properties_initialized_) {
const ReadOptions read_options(Env::IOActivity::kCompaction);
for (size_t i = 0; i < num_input_levels(); ++i) {
for (const FileMetaData* fmd : *(this->inputs(i))) {
std::shared_ptr<const TableProperties> tp;
std::string file_name =
TableFileName(immutable_options_.cf_paths, fmd->fd.GetNumber(),
fmd->fd.GetPathId());
Status s = input_version_->GetTableProperties(read_options, &tp, fmd,
&file_name);
if (s.ok()) {
table_properties_[file_name] = tp;
} else {
ROCKS_LOG_ERROR(immutable_options_.info_log,
"Unable to load table properties for file %" PRIu64
" --- %s\n",
fmd->fd.GetNumber(), s.ToString().c_str());
}
}
}
input_table_properties_initialized_ = true;
};
return table_properties_;
}
Compaction::Compaction( Compaction::Compaction(
VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options, VersionStorageInfo* vstorage, const ImmutableOptions& _immutable_options,
const MutableCFOptions& _mutable_cf_options, const MutableCFOptions& _mutable_cf_options,

@ -326,12 +326,16 @@ class Compaction {
int output_level, VersionStorageInfo* vstorage, int output_level, VersionStorageInfo* vstorage,
const std::vector<CompactionInputFiles>& inputs); const std::vector<CompactionInputFiles>& inputs);
TablePropertiesCollection GetOutputTableProperties() const { // If called before a compaction finishes, will return
return output_table_properties_; // table properties of all compaction input files.
} // If called after a compaction finished, will return
// table properties of all compaction input and output files.
void SetOutputTableProperties(TablePropertiesCollection tp) { const TablePropertiesCollection& GetTableProperties();
output_table_properties_ = std::move(tp);
void SetOutputTableProperties(
const std::string& file_name,
const std::shared_ptr<const TableProperties>& tp) {
table_properties_[file_name] = tp;
} }
Slice GetSmallestUserKey() const { return smallest_user_key_; } Slice GetSmallestUserKey() const { return smallest_user_key_; }
@ -518,8 +522,9 @@ class Compaction {
// Does input compression match the output compression? // Does input compression match the output compression?
bool InputCompressionMatchesOutput() const; bool InputCompressionMatchesOutput() const;
bool input_table_properties_initialized_ = false;
// table properties of output files // table properties of output files
TablePropertiesCollection output_table_properties_; TablePropertiesCollection table_properties_;
// smallest user keys in compaction // smallest user keys in compaction
// includes timestamp if user-defined timestamp is enabled. // includes timestamp if user-defined timestamp is enabled.

@ -31,7 +31,8 @@ CompactionIterator::CompactionIterator(
BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
bool enforce_single_del_contracts, bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled, const std::atomic<bool>& manual_compaction_canceled,
const Compaction* compaction, const CompactionFilter* compaction_filter, bool must_count_input_entries, const Compaction* compaction,
const CompactionFilter* compaction_filter,
const std::atomic<bool>* shutting_down, const std::atomic<bool>* shutting_down,
const std::shared_ptr<Logger> info_log, const std::shared_ptr<Logger> info_log,
const std::string* full_history_ts_low, const std::string* full_history_ts_low,
@ -45,8 +46,9 @@ CompactionIterator::CompactionIterator(
manual_compaction_canceled, manual_compaction_canceled,
std::unique_ptr<CompactionProxy>( std::unique_ptr<CompactionProxy>(
compaction ? new RealCompaction(compaction) : nullptr), compaction ? new RealCompaction(compaction) : nullptr),
compaction_filter, shutting_down, info_log, full_history_ts_low, must_count_input_entries, compaction_filter, shutting_down, info_log,
preserve_time_min_seqno, preclude_last_level_min_seqno) {} full_history_ts_low, preserve_time_min_seqno,
preclude_last_level_min_seqno) {}
CompactionIterator::CompactionIterator( CompactionIterator::CompactionIterator(
InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
@ -58,15 +60,14 @@ CompactionIterator::CompactionIterator(
BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
bool enforce_single_del_contracts, bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled, const std::atomic<bool>& manual_compaction_canceled,
std::unique_ptr<CompactionProxy> compaction, std::unique_ptr<CompactionProxy> compaction, bool must_count_input_entries,
const CompactionFilter* compaction_filter, const CompactionFilter* compaction_filter,
const std::atomic<bool>* shutting_down, const std::atomic<bool>* shutting_down,
const std::shared_ptr<Logger> info_log, const std::shared_ptr<Logger> info_log,
const std::string* full_history_ts_low, const std::string* full_history_ts_low,
const SequenceNumber preserve_time_min_seqno, const SequenceNumber preserve_time_min_seqno,
const SequenceNumber preclude_last_level_min_seqno) const SequenceNumber preclude_last_level_min_seqno)
: input_(input, cmp, : input_(input, cmp, must_count_input_entries),
!compaction || compaction->DoesInputReferenceBlobFiles()),
cmp_(cmp), cmp_(cmp),
merge_helper_(merge_helper), merge_helper_(merge_helper),
snapshots_(snapshots), snapshots_(snapshots),

@ -38,15 +38,18 @@ class SequenceIterWrapper : public InternalIterator {
bool Valid() const override { return inner_iter_->Valid(); } bool Valid() const override { return inner_iter_->Valid(); }
Status status() const override { return inner_iter_->status(); } Status status() const override { return inner_iter_->status(); }
void Next() override { void Next() override {
num_itered_++; if (!inner_iter_->IsDeleteRangeSentinelKey()) {
num_itered_++;
}
inner_iter_->Next(); inner_iter_->Next();
} }
void Seek(const Slice& target) override { void Seek(const Slice& target) override {
if (!need_count_entries_) { if (!need_count_entries_) {
has_num_itered_ = false;
inner_iter_->Seek(target); inner_iter_->Seek(target);
} else { } else {
// For flush cases, we need to count total number of entries, so we // Need to count total number of entries,
// do Next() rather than Seek(). // so we do Next() rather than Seek().
while (inner_iter_->Valid() && while (inner_iter_->Valid() &&
icmp_.Compare(inner_iter_->key(), target) < 0) { icmp_.Compare(inner_iter_->key(), target) < 0) {
Next(); Next();
@ -62,7 +65,8 @@ class SequenceIterWrapper : public InternalIterator {
void SeekForPrev(const Slice& /* target */) override { assert(false); } void SeekForPrev(const Slice& /* target */) override { assert(false); }
void SeekToLast() override { assert(false); } void SeekToLast() override { assert(false); }
uint64_t num_itered() const { return num_itered_; } uint64_t NumItered() const { return num_itered_; }
bool HasNumItered() const { return has_num_itered_; }
bool IsDeleteRangeSentinelKey() const override { bool IsDeleteRangeSentinelKey() const override {
assert(Valid()); assert(Valid());
return inner_iter_->IsDeleteRangeSentinelKey(); return inner_iter_->IsDeleteRangeSentinelKey();
@ -73,6 +77,7 @@ class SequenceIterWrapper : public InternalIterator {
InternalIterator* inner_iter_; // not owned InternalIterator* inner_iter_; // not owned
uint64_t num_itered_ = 0; uint64_t num_itered_ = 0;
bool need_count_entries_; bool need_count_entries_;
bool has_num_itered_ = true;
}; };
class CompactionIterator { class CompactionIterator {
@ -189,6 +194,10 @@ class CompactionIterator {
const Compaction* compaction_; const Compaction* compaction_;
}; };
// @param must_count_input_entries if true, `NumInputEntryScanned()` will
// return the number of input keys scanned. If false, `NumInputEntryScanned()`
// will return this number if no Seek was called on `input`. User should call
// `HasNumInputEntryScanned()` first in this case.
CompactionIterator( CompactionIterator(
InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper, InternalIterator* input, const Comparator* cmp, MergeHelper* merge_helper,
SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots, SequenceNumber last_sequence, std::vector<SequenceNumber>* snapshots,
@ -199,7 +208,7 @@ class CompactionIterator {
BlobFileBuilder* blob_file_builder, bool allow_data_in_errors, BlobFileBuilder* blob_file_builder, bool allow_data_in_errors,
bool enforce_single_del_contracts, bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled, const std::atomic<bool>& manual_compaction_canceled,
const Compaction* compaction = nullptr, bool must_count_input_entries, const Compaction* compaction = nullptr,
const CompactionFilter* compaction_filter = nullptr, const CompactionFilter* compaction_filter = nullptr,
const std::atomic<bool>* shutting_down = nullptr, const std::atomic<bool>* shutting_down = nullptr,
const std::shared_ptr<Logger> info_log = nullptr, const std::shared_ptr<Logger> info_log = nullptr,
@ -219,6 +228,7 @@ class CompactionIterator {
bool enforce_single_del_contracts, bool enforce_single_del_contracts,
const std::atomic<bool>& manual_compaction_canceled, const std::atomic<bool>& manual_compaction_canceled,
std::unique_ptr<CompactionProxy> compaction, std::unique_ptr<CompactionProxy> compaction,
bool must_count_input_entries,
const CompactionFilter* compaction_filter = nullptr, const CompactionFilter* compaction_filter = nullptr,
const std::atomic<bool>* shutting_down = nullptr, const std::atomic<bool>* shutting_down = nullptr,
const std::shared_ptr<Logger> info_log = nullptr, const std::shared_ptr<Logger> info_log = nullptr,
@ -253,7 +263,8 @@ class CompactionIterator {
return current_user_key_; return current_user_key_;
} }
const CompactionIterationStats& iter_stats() const { return iter_stats_; } const CompactionIterationStats& iter_stats() const { return iter_stats_; }
uint64_t num_input_entry_scanned() const { return input_.num_itered(); } bool HasNumInputEntryScanned() const { return input_.HasNumItered(); }
uint64_t NumInputEntryScanned() const { return input_.NumItered(); }
// If the current key should be placed on penultimate level, only valid if // If the current key should be placed on penultimate level, only valid if
// per_key_placement is supported // per_key_placement is supported
bool output_to_penultimate_level() const { bool output_to_penultimate_level() const {

@ -293,8 +293,8 @@ class CompactionIteratorTest : public testing::TestWithParam<bool> {
nullptr /* blob_file_builder */, true /*allow_data_in_errors*/, nullptr /* blob_file_builder */, true /*allow_data_in_errors*/,
true /*enforce_single_del_contracts*/, true /*enforce_single_del_contracts*/,
/*manual_compaction_canceled=*/kManualCompactionCanceledFalse_, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse_,
std::move(compaction), filter, &shutting_down_, /*info_log=*/nullptr, std::move(compaction), /*must_count_input_entries=*/false, filter,
full_history_ts_low)); &shutting_down_, /*info_log=*/nullptr, full_history_ts_low));
} }
void AddSnapshot(SequenceNumber snapshot, void AddSnapshot(SequenceNumber snapshot,

@ -796,15 +796,46 @@ Status CompactionJob::Run() {
auto fn = auto fn =
TableFileName(state.compaction->immutable_options()->cf_paths, TableFileName(state.compaction->immutable_options()->cf_paths,
output.meta.fd.GetNumber(), output.meta.fd.GetPathId()); output.meta.fd.GetNumber(), output.meta.fd.GetPathId());
tp[fn] = output.table_properties; compact_->compaction->SetOutputTableProperties(fn,
output.table_properties);
} }
} }
compact_->compaction->SetOutputTableProperties(std::move(tp));
// Finish up all book-keeping to unify the subcompaction results // Finish up all bookkeeping to unify the subcompaction results.
compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_); compact_->AggregateCompactionStats(compaction_stats_, *compaction_job_stats_);
UpdateCompactionStats(); uint64_t num_input_range_del = 0;
bool ok = UpdateCompactionStats(&num_input_range_del);
// (Sub)compactions returned ok, do sanity check on the number of input keys.
if (status.ok() && ok && compaction_job_stats_->has_num_input_records) {
size_t ts_sz = compact_->compaction->column_family_data()
->user_comparator()
->timestamp_size();
// When trim_ts_ is non-empty, CompactionIterator takes
// HistoryTrimmingIterator as input iterator and sees a trimmed view of
// input keys. So the number of keys it processed is not suitable for
// verification here.
// TODO: support verification when trim_ts_ is non-empty.
if (!(ts_sz > 0 && !trim_ts_.empty()) &&
db_options_.compaction_verify_record_count) {
assert(compaction_stats_.stats.num_input_records > 0);
// TODO: verify the number of range deletion entries.
uint64_t expected =
compaction_stats_.stats.num_input_records - num_input_range_del;
uint64_t actual = compaction_job_stats_->num_input_records;
if (expected != actual) {
std::string msg =
"Total number of input records: " + std::to_string(expected) +
", but processed " + std::to_string(actual) + " records.";
ROCKS_LOG_WARN(
db_options_.info_log, "[%s] [JOB %d] Compaction %s",
compact_->compaction->column_family_data()->GetName().c_str(),
job_context_->job_id, msg.c_str());
status = Status::Corruption(
"Compaction number of input keys does not match number of keys "
"processed.");
}
}
}
RecordCompactionIOStats(); RecordCompactionIOStats();
LogFlush(db_options_.info_log); LogFlush(db_options_.info_log);
TEST_SYNC_POINT("CompactionJob::Run():End"); TEST_SYNC_POINT("CompactionJob::Run():End");
@ -1252,6 +1283,8 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
/*expect_valid_internal_key=*/true, range_del_agg.get(), /*expect_valid_internal_key=*/true, range_del_agg.get(),
blob_file_builder.get(), db_options_.allow_data_in_errors, blob_file_builder.get(), db_options_.allow_data_in_errors,
db_options_.enforce_single_del_contracts, manual_compaction_canceled_, db_options_.enforce_single_del_contracts, manual_compaction_canceled_,
sub_compact->compaction
->DoesInputReferenceBlobFiles() /* must_count_input_entries */,
sub_compact->compaction, compaction_filter, shutting_down_, sub_compact->compaction, compaction_filter, shutting_down_,
db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_, db_options_.info_log, full_history_ts_low, preserve_time_min_seqno_,
preclude_last_level_min_seqno_); preclude_last_level_min_seqno_);
@ -1316,8 +1349,25 @@ void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
if (c_iter->status().IsManualCompactionPaused()) { if (c_iter->status().IsManualCompactionPaused()) {
break; break;
} }
#ifndef NDEBUG
bool stop = false;
TEST_SYNC_POINT_CALLBACK("CompactionJob::ProcessKeyValueCompaction()::stop",
static_cast<void*>(&stop));
if (stop) {
break;
}
#endif // NDEBUG
} }
// This number may not be accurate when CompactionIterator was created
// with `must_count_input_entries=false`.
assert(!sub_compact->compaction->DoesInputReferenceBlobFiles() ||
c_iter->HasNumInputEntryScanned());
sub_compact->compaction_job_stats.has_num_input_records =
c_iter->HasNumInputEntryScanned();
sub_compact->compaction_job_stats.num_input_records =
c_iter->NumInputEntryScanned();
sub_compact->compaction_job_stats.num_blobs_read = sub_compact->compaction_job_stats.num_blobs_read =
c_iter_stats.num_blobs_read; c_iter_stats.num_blobs_read;
sub_compact->compaction_job_stats.total_blob_bytes_read = sub_compact->compaction_job_stats.total_blob_bytes_read =
@ -1903,24 +1953,53 @@ void CopyPrefix(const Slice& src, size_t prefix_length, std::string* dst) {
} }
} // namespace } // namespace
bool CompactionJob::UpdateCompactionStats(uint64_t* num_input_range_del) {
void CompactionJob::UpdateCompactionStats() {
assert(compact_); assert(compact_);
Compaction* compaction = compact_->compaction; Compaction* compaction = compact_->compaction;
compaction_stats_.stats.num_input_files_in_non_output_levels = 0; compaction_stats_.stats.num_input_files_in_non_output_levels = 0;
compaction_stats_.stats.num_input_files_in_output_level = 0; compaction_stats_.stats.num_input_files_in_output_level = 0;
bool has_error = false;
const ReadOptions read_options(Env::IOActivity::kCompaction);
const auto& input_table_properties = compaction->GetTableProperties();
for (int input_level = 0; for (int input_level = 0;
input_level < static_cast<int>(compaction->num_input_levels()); input_level < static_cast<int>(compaction->num_input_levels());
++input_level) { ++input_level) {
size_t num_input_files = compaction->num_input_files(input_level);
uint64_t* bytes_read;
if (compaction->level(input_level) != compaction->output_level()) { if (compaction->level(input_level) != compaction->output_level()) {
UpdateCompactionInputStatsHelper( compaction_stats_.stats.num_input_files_in_non_output_levels +=
&compaction_stats_.stats.num_input_files_in_non_output_levels, static_cast<int>(num_input_files);
&compaction_stats_.stats.bytes_read_non_output_levels, input_level); bytes_read = &compaction_stats_.stats.bytes_read_non_output_levels;
} else { } else {
UpdateCompactionInputStatsHelper( compaction_stats_.stats.num_input_files_in_output_level +=
&compaction_stats_.stats.num_input_files_in_output_level, static_cast<int>(num_input_files);
&compaction_stats_.stats.bytes_read_output_level, input_level); bytes_read = &compaction_stats_.stats.bytes_read_output_level;
}
for (size_t i = 0; i < num_input_files; ++i) {
const FileMetaData* file_meta = compaction->input(input_level, i);
*bytes_read += file_meta->fd.GetFileSize();
uint64_t file_input_entries = file_meta->num_entries;
uint64_t file_num_range_del = file_meta->num_range_deletions;
if (file_input_entries == 0) {
uint64_t file_number = file_meta->fd.GetNumber();
// Try getting info from table property
std::string fn =
TableFileName(compaction->immutable_options()->cf_paths,
file_number, file_meta->fd.GetPathId());
const auto& tp = input_table_properties.find(fn);
if (tp != input_table_properties.end()) {
file_input_entries = tp->second->num_entries;
file_num_range_del = tp->second->num_range_deletions;
} else {
has_error = true;
}
}
compaction_stats_.stats.num_input_records += file_input_entries;
if (num_input_range_del) {
*num_input_range_del += file_num_range_del;
}
} }
} }
@ -1930,21 +2009,7 @@ void CompactionJob::UpdateCompactionStats() {
compaction_stats_.stats.num_dropped_records = compaction_stats_.stats.num_dropped_records =
compaction_stats_.DroppedRecords(); compaction_stats_.DroppedRecords();
} return !has_error;
void CompactionJob::UpdateCompactionInputStatsHelper(int* num_files,
uint64_t* bytes_read,
int input_level) {
const Compaction* compaction = compact_->compaction;
auto num_input_files = compaction->num_input_files(input_level);
*num_files += static_cast<int>(num_input_files);
for (size_t i = 0; i < num_input_files; ++i) {
const auto* file_meta = compaction->input(input_level, i);
*bytes_read += file_meta->fd.GetFileSize();
compaction_stats_.stats.num_input_records +=
static_cast<uint64_t>(file_meta->num_entries);
}
} }
void CompactionJob::UpdateCompactionJobStats( void CompactionJob::UpdateCompactionJobStats(

@ -192,7 +192,21 @@ class CompactionJob {
IOStatus io_status() const { return io_status_; } IOStatus io_status() const { return io_status_; }
protected: protected:
void UpdateCompactionStats(); // Update the following stats in compaction_stats_.stats
// - num_input_files_in_non_output_levels
// - num_input_files_in_output_level
// - bytes_read_non_output_levels
// - bytes_read_output_level
// - num_input_records
// - bytes_read_blob
// - num_dropped_records
//
// @param num_input_range_del if non-null, will be set to the number of range
// deletion entries in this compaction input.
//
// Returns true iff compaction_stats_.stats.num_input_records and
// num_input_range_del are calculated successfully.
bool UpdateCompactionStats(uint64_t* num_input_range_del = nullptr);
void LogCompaction(); void LogCompaction();
virtual void RecordCompactionIOStats(); virtual void RecordCompactionIOStats();
void CleanupCompaction(); void CleanupCompaction();
@ -267,9 +281,6 @@ class CompactionJob {
void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats, void RecordDroppedKeys(const CompactionIterationStats& c_iter_stats,
CompactionJobStats* compaction_job_stats = nullptr); CompactionJobStats* compaction_job_stats = nullptr);
void UpdateCompactionInputStatsHelper(int* num_files, uint64_t* bytes_read,
int input_level);
void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact); void NotifyOnSubcompactionBegin(SubcompactionState* sub_compact);
void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact); void NotifyOnSubcompactionCompleted(SubcompactionState* sub_compact);

@ -655,11 +655,12 @@ class CompactionJobTestBase : public testing::Test {
ASSERT_TRUE(full_history_ts_low_.empty() || ASSERT_TRUE(full_history_ts_low_.empty() ||
ucmp_->timestamp_size() == full_history_ts_low_.size()); ucmp_->timestamp_size() == full_history_ts_low_.size());
const std::atomic<bool> kManualCompactionCanceledFalse{false}; const std::atomic<bool> kManualCompactionCanceledFalse{false};
JobContext job_context(1, false /* create_superversion */);
CompactionJob compaction_job( CompactionJob compaction_job(
0, &compaction, db_options_, mutable_db_options_, env_options_, 0, &compaction, db_options_, mutable_db_options_, env_options_,
versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr, versions_.get(), &shutting_down_, &log_buffer, nullptr, nullptr,
nullptr, nullptr, &mutex_, &error_handler_, snapshots, nullptr, nullptr, &mutex_, &error_handler_, snapshots,
earliest_write_conflict_snapshot, snapshot_checker, nullptr, earliest_write_conflict_snapshot, snapshot_checker, &job_context,
table_cache_, &event_logger, false, false, dbname_, table_cache_, &event_logger, false, false, dbname_,
&compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */, &compaction_job_stats_, Env::Priority::USER, nullptr /* IOTracer */,
/*manual_compaction_canceled=*/kManualCompactionCanceledFalse, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,

@ -9807,6 +9807,43 @@ TEST_F(DBCompactionTest, NumberOfSubcompactions) {
} }
} }
TEST_F(DBCompactionTest, VerifyRecordCount) {
Options options = CurrentOptions();
options.compaction_style = kCompactionStyleLevel;
options.level0_file_num_compaction_trigger = 3;
options.compaction_verify_record_count = true;
DestroyAndReopen(options);
Random rnd(301);
// Create 2 overlapping L0 files
for (int i = 1; i < 20; i += 2) {
ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
}
ASSERT_OK(Flush());
for (int i = 0; i < 20; i += 2) {
ASSERT_OK(Put(Key(i), rnd.RandomString(100)));
}
ASSERT_OK(Flush());
// Only iterator through 10 keys and force compaction to finish.
int num_iter = 0;
ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack(
"CompactionJob::ProcessKeyValueCompaction()::stop", [&](void* stop_ptr) {
num_iter++;
if (num_iter == 10) {
*(bool*)stop_ptr = true;
}
});
SyncPoint::GetInstance()->EnableProcessing();
Status s = db_->CompactRange(CompactRangeOptions(), nullptr, nullptr);
ASSERT_TRUE(s.IsCorruption());
const char* expect =
"Compaction number of input keys does not match number of keys "
"processed.";
ASSERT_TRUE(std::strstr(s.getState(), expect));
}
} // namespace ROCKSDB_NAMESPACE } // namespace ROCKSDB_NAMESPACE
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -2200,7 +2200,7 @@ class DBImpl : public DB {
void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c, void BuildCompactionJobInfo(const ColumnFamilyData* cfd, Compaction* c,
const Status& st, const Status& st,
const CompactionJobStats& compaction_job_stats, const CompactionJobStats& compaction_job_stats,
const int job_id, const Version* current, const int job_id,
CompactionJobInfo* compaction_job_info) const; CompactionJobInfo* compaction_job_info) const;
// Reserve the next 'num' file numbers for to-be-ingested external SST files, // Reserve the next 'num' file numbers for to-be-ingested external SST files,
// and return the current file_number in 'next_file_number'. // and return the current file_number in 'next_file_number'.

@ -1546,7 +1546,7 @@ Status DBImpl::CompactFilesImpl(
if (compaction_job_info != nullptr) { if (compaction_job_info != nullptr) {
BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats, BuildCompactionJobInfo(cfd, c.get(), s, compaction_job_stats,
job_context->job_id, version, compaction_job_info); job_context->job_id, compaction_job_info);
} }
if (status.ok()) { if (status.ok()) {
@ -1643,21 +1643,18 @@ void DBImpl::NotifyOnCompactionBegin(ColumnFamilyData* cfd, Compaction* c,
} }
c->SetNotifyOnCompactionCompleted(); c->SetNotifyOnCompactionCompleted();
Version* current = cfd->current();
current->Ref();
// release lock while notifying events // release lock while notifying events
mutex_.Unlock(); mutex_.Unlock();
TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex"); TEST_SYNC_POINT("DBImpl::NotifyOnCompactionBegin::UnlockMutex");
{ {
CompactionJobInfo info{}; CompactionJobInfo info{};
BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, current, &info); BuildCompactionJobInfo(cfd, c, st, job_stats, job_id, &info);
for (auto listener : immutable_db_options_.listeners) { for (auto listener : immutable_db_options_.listeners) {
listener->OnCompactionBegin(this, info); listener->OnCompactionBegin(this, info);
} }
info.status.PermitUncheckedError(); info.status.PermitUncheckedError();
} }
mutex_.Lock(); mutex_.Lock();
current->Unref();
} }
void DBImpl::NotifyOnCompactionCompleted( void DBImpl::NotifyOnCompactionCompleted(
@ -1675,21 +1672,17 @@ void DBImpl::NotifyOnCompactionCompleted(
return; return;
} }
Version* current = cfd->current();
current->Ref();
// release lock while notifying events // release lock while notifying events
mutex_.Unlock(); mutex_.Unlock();
TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex"); TEST_SYNC_POINT("DBImpl::NotifyOnCompactionCompleted::UnlockMutex");
{ {
CompactionJobInfo info{}; CompactionJobInfo info{};
BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, current, BuildCompactionJobInfo(cfd, c, st, compaction_job_stats, job_id, &info);
&info);
for (auto listener : immutable_db_options_.listeners) { for (auto listener : immutable_db_options_.listeners) {
listener->OnCompactionCompleted(this, info); listener->OnCompactionCompleted(this, info);
} }
} }
mutex_.Lock(); mutex_.Lock();
current->Unref();
// no need to signal bg_cv_ as it will be signaled at the end of the // no need to signal bg_cv_ as it will be signaled at the end of the
// flush process. // flush process.
} }
@ -3923,7 +3916,7 @@ bool DBImpl::MCOverlap(ManualCompactionState* m, ManualCompactionState* m1) {
void DBImpl::BuildCompactionJobInfo( void DBImpl::BuildCompactionJobInfo(
const ColumnFamilyData* cfd, Compaction* c, const Status& st, const ColumnFamilyData* cfd, Compaction* c, const Status& st,
const CompactionJobStats& compaction_job_stats, const int job_id, const CompactionJobStats& compaction_job_stats, const int job_id,
const Version* current, CompactionJobInfo* compaction_job_info) const { CompactionJobInfo* compaction_job_info) const {
assert(compaction_job_info != nullptr); assert(compaction_job_info != nullptr);
compaction_job_info->cf_id = cfd->GetID(); compaction_job_info->cf_id = cfd->GetID();
compaction_job_info->cf_name = cfd->GetName(); compaction_job_info->cf_name = cfd->GetName();
@ -3933,7 +3926,7 @@ void DBImpl::BuildCompactionJobInfo(
compaction_job_info->base_input_level = c->start_level(); compaction_job_info->base_input_level = c->start_level();
compaction_job_info->output_level = c->output_level(); compaction_job_info->output_level = c->output_level();
compaction_job_info->stats = compaction_job_stats; compaction_job_info->stats = compaction_job_stats;
compaction_job_info->table_properties = c->GetOutputTableProperties(); compaction_job_info->table_properties = c->GetTableProperties();
compaction_job_info->compaction_reason = c->compaction_reason(); compaction_job_info->compaction_reason = c->compaction_reason();
compaction_job_info->compression = c->output_compression(); compaction_job_info->compression = c->output_compression();
@ -3947,15 +3940,9 @@ void DBImpl::BuildCompactionJobInfo(
compaction_job_info->input_files.push_back(fn); compaction_job_info->input_files.push_back(fn);
compaction_job_info->input_file_infos.push_back(CompactionFileInfo{ compaction_job_info->input_file_infos.push_back(CompactionFileInfo{
static_cast<int>(i), file_number, fmd->oldest_blob_file_number}); static_cast<int>(i), file_number, fmd->oldest_blob_file_number});
if (compaction_job_info->table_properties.count(fn) == 0) {
std::shared_ptr<const TableProperties> tp;
auto s = current->GetTableProperties(read_options, &tp, fmd, &fn);
if (s.ok()) {
compaction_job_info->table_properties[fn] = tp;
}
}
} }
} }
for (const auto& newf : c->edit()->GetNewFiles()) { for (const auto& newf : c->edit()->GetNewFiles()) {
const FileMetaData& meta = newf.second; const FileMetaData& meta = newf.second;
const FileDescriptor& desc = meta.fd; const FileDescriptor& desc = meta.fd;

@ -490,6 +490,7 @@ Status FlushJob::MemPurge() {
nullptr, ioptions->allow_data_in_errors, nullptr, ioptions->allow_data_in_errors,
ioptions->enforce_single_del_contracts, ioptions->enforce_single_del_contracts,
/*manual_compaction_canceled=*/kManualCompactionCanceledFalse, /*manual_compaction_canceled=*/kManualCompactionCanceledFalse,
false /* must_count_input_entries */,
/*compaction=*/nullptr, compaction_filter.get(), /*compaction=*/nullptr, compaction_filter.get(),
/*shutting_down=*/nullptr, ioptions->info_log, full_history_ts_low); /*shutting_down=*/nullptr, ioptions->info_log, full_history_ts_low);

@ -193,7 +193,8 @@ struct FileMetaData {
uint64_t compensated_file_size = 0; uint64_t compensated_file_size = 0;
// These values can mutate, but they can only be read or written from // These values can mutate, but they can only be read or written from
// single-threaded LogAndApply thread // single-threaded LogAndApply thread
uint64_t num_entries = 0; // the number of entries. uint64_t num_entries =
0; // The number of entries, including deletions and range deletions.
// The number of deletion entries, including range deletions. // The number of deletion entries, including range deletions.
uint64_t num_deletions = 0; uint64_t num_deletions = 0;
uint64_t raw_key_size = 0; // total uncompressed key size. uint64_t raw_key_size = 0; // total uncompressed key size.

@ -24,6 +24,9 @@ struct CompactionJobStats {
// the elapsed CPU time of this compaction in microseconds. // the elapsed CPU time of this compaction in microseconds.
uint64_t cpu_micros; uint64_t cpu_micros;
// Used internally indicating whether a subcompaction's
// `num_input_records` is accurate.
bool has_num_input_records;
// the number of compaction input records. // the number of compaction input records.
uint64_t num_input_records; uint64_t num_input_records;
// the number of blobs read from blob files // the number of blobs read from blob files

@ -490,11 +490,29 @@ struct DBOptions {
// If true, during memtable flush, RocksDB will validate total entries // If true, during memtable flush, RocksDB will validate total entries
// read in flush, and compare with counter inserted into it. // read in flush, and compare with counter inserted into it.
//
// The option is here to turn the feature off in case this new validation // The option is here to turn the feature off in case this new validation
// feature has a bug. // feature has a bug. The option may be removed in the future once the
// feature is stable.
//
// Default: true // Default: true
bool flush_verify_memtable_count = true; bool flush_verify_memtable_count = true;
// If true, during compaction, RocksDB will count the number of entries
// read and compare it against the number of entries in the compaction
// input files. This is intended to add protection against corruption
// during compaction. Note that
// - this verification is not done for compactions during which a compaction
// filter returns kRemoveAndSkipUntil, and
// - the number of range deletions is not verified.
//
// The option is here to turn the feature off in case this new validation
// feature has a bug. The option may be removed in the future once the
// feature is stable.
//
// Default: true
bool compaction_verify_record_count = true;
// If true, the log numbers and sizes of the synced WALs are tracked // If true, the log numbers and sizes of the synced WALs are tracked
// in MANIFEST. During DB recovery, if a synced WAL is missing // in MANIFEST. During DB recovery, if a synced WAL is missing
// from disk, or the WAL's size does not match the recorded size in // from disk, or the WAL's size does not match the recorded size in

@ -222,6 +222,10 @@ static std::unordered_map<std::string, OptionTypeInfo>
{offsetof(struct ImmutableDBOptions, flush_verify_memtable_count), {offsetof(struct ImmutableDBOptions, flush_verify_memtable_count),
OptionType::kBoolean, OptionVerificationType::kNormal, OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}}, OptionTypeFlags::kNone}},
{"compaction_verify_record_count",
{offsetof(struct ImmutableDBOptions, compaction_verify_record_count),
OptionType::kBoolean, OptionVerificationType::kNormal,
OptionTypeFlags::kNone}},
{"track_and_verify_wals_in_manifest", {"track_and_verify_wals_in_manifest",
{offsetof(struct ImmutableDBOptions, {offsetof(struct ImmutableDBOptions,
track_and_verify_wals_in_manifest), track_and_verify_wals_in_manifest),
@ -679,6 +683,7 @@ ImmutableDBOptions::ImmutableDBOptions(const DBOptions& options)
error_if_exists(options.error_if_exists), error_if_exists(options.error_if_exists),
paranoid_checks(options.paranoid_checks), paranoid_checks(options.paranoid_checks),
flush_verify_memtable_count(options.flush_verify_memtable_count), flush_verify_memtable_count(options.flush_verify_memtable_count),
compaction_verify_record_count(options.compaction_verify_record_count),
track_and_verify_wals_in_manifest( track_and_verify_wals_in_manifest(
options.track_and_verify_wals_in_manifest), options.track_and_verify_wals_in_manifest),
verify_sst_unique_id_in_manifest( verify_sst_unique_id_in_manifest(
@ -771,6 +776,8 @@ void ImmutableDBOptions::Dump(Logger* log) const {
paranoid_checks); paranoid_checks);
ROCKS_LOG_HEADER(log, " Options.flush_verify_memtable_count: %d", ROCKS_LOG_HEADER(log, " Options.flush_verify_memtable_count: %d",
flush_verify_memtable_count); flush_verify_memtable_count);
ROCKS_LOG_HEADER(log, " Options.compaction_verify_record_count: %d",
compaction_verify_record_count);
ROCKS_LOG_HEADER(log, ROCKS_LOG_HEADER(log,
" " " "
"Options.track_and_verify_wals_in_manifest: %d", "Options.track_and_verify_wals_in_manifest: %d",

@ -25,6 +25,7 @@ struct ImmutableDBOptions {
bool error_if_exists; bool error_if_exists;
bool paranoid_checks; bool paranoid_checks;
bool flush_verify_memtable_count; bool flush_verify_memtable_count;
bool compaction_verify_record_count;
bool track_and_verify_wals_in_manifest; bool track_and_verify_wals_in_manifest;
bool verify_sst_unique_id_in_manifest; bool verify_sst_unique_id_in_manifest;
Env* env; Env* env;

@ -60,6 +60,8 @@ DBOptions BuildDBOptions(const ImmutableDBOptions& immutable_db_options,
options.paranoid_checks = immutable_db_options.paranoid_checks; options.paranoid_checks = immutable_db_options.paranoid_checks;
options.flush_verify_memtable_count = options.flush_verify_memtable_count =
immutable_db_options.flush_verify_memtable_count; immutable_db_options.flush_verify_memtable_count;
options.compaction_verify_record_count =
immutable_db_options.compaction_verify_record_count;
options.track_and_verify_wals_in_manifest = options.track_and_verify_wals_in_manifest =
immutable_db_options.track_and_verify_wals_in_manifest; immutable_db_options.track_and_verify_wals_in_manifest;
options.verify_sst_unique_id_in_manifest = options.verify_sst_unique_id_in_manifest =

@ -308,6 +308,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) {
"writable_file_max_buffer_size=1048576;" "writable_file_max_buffer_size=1048576;"
"paranoid_checks=true;" "paranoid_checks=true;"
"flush_verify_memtable_count=true;" "flush_verify_memtable_count=true;"
"compaction_verify_record_count=true;"
"track_and_verify_wals_in_manifest=true;" "track_and_verify_wals_in_manifest=true;"
"verify_sst_unique_id_in_manifest=true;" "verify_sst_unique_id_in_manifest=true;"
"is_fd_close_on_exec=false;" "is_fd_close_on_exec=false;"

@ -230,7 +230,13 @@ Status MockTableReader::Get(const ReadOptions&, const Slice& key,
std::shared_ptr<const TableProperties> MockTableReader::GetTableProperties() std::shared_ptr<const TableProperties> MockTableReader::GetTableProperties()
const { const {
return std::shared_ptr<const TableProperties>(new TableProperties()); TableProperties* tp = new TableProperties();
tp->num_entries = table_.size();
tp->num_range_deletions = 0;
tp->raw_key_size = 1;
tp->raw_value_size = 1;
return std::shared_ptr<const TableProperties>(tp);
} }
MockTableFactory::MockTableFactory() MockTableFactory::MockTableFactory()

@ -0,0 +1 @@
* RocksDB will compare the number of input keys to the number of keys processed after each compaction. Compaction will fail and report Corruption status if the verification fails. Option `compaction_verify_record_count` is introduced for this purpose and is enabled by default.

@ -12,6 +12,7 @@ void CompactionJobStats::Reset() {
elapsed_micros = 0; elapsed_micros = 0;
cpu_micros = 0; cpu_micros = 0;
has_num_input_records = true;
num_input_records = 0; num_input_records = 0;
num_blobs_read = 0; num_blobs_read = 0;
num_input_files = 0; num_input_files = 0;
@ -55,6 +56,7 @@ void CompactionJobStats::Add(const CompactionJobStats& stats) {
elapsed_micros += stats.elapsed_micros; elapsed_micros += stats.elapsed_micros;
cpu_micros += stats.cpu_micros; cpu_micros += stats.cpu_micros;
has_num_input_records &= stats.has_num_input_records;
num_input_records += stats.num_input_records; num_input_records += stats.num_input_records;
num_blobs_read += stats.num_blobs_read; num_blobs_read += stats.num_blobs_read;
num_input_files += stats.num_input_files; num_input_files += stats.num_input_files;

Loading…
Cancel
Save