diff --git a/HISTORY.md b/HISTORY.md index b762388aa..314666501 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,6 +2,7 @@ ## Unreleased ### Behavior changes * Make best-efforts recovery verify SST unique ID before Version construction (#10962) +* Introduce `epoch_number` and sort L0 files by `epoch_number` instead of `largest_seqno`. `epoch_number` represents the order in which a file was flushed or ingested/imported. Compaction output files are assigned the minimum `epoch_number` among their input files. For L0, a larger `epoch_number` indicates a newer L0 file. ### Bug Fixes * Fixed a regression in iterator where range tombstones after `iterate_upper_bound` are processed. @@ -9,6 +10,7 @@ * Fixed a bug that multi-level FIFO compaction deletes one file in non-L0 even when `CompactionOptionsFIFO::max_table_files_size` is not exceeded since #10348 or 7.8.0. * Fixed a bug caused by `DB::SyncWAL()` affecting `track_and_verify_wals_in_manifest`. Without the fix, the application may see "open error: Corruption: Missing WAL with log number" while trying to open the db. The corruption is a false alarm but prevents DB open (#10892). * Fixed a BackupEngine bug in which RestoreDBFromLatestBackup would fail if the latest backup was deleted and there is another valid backup available. +* Fix L0 file misorder corruption caused by ingesting files whose seqnos overlap with memtable entries', by introducing `epoch_number`. Before the fix, `force_consistency_checks=true` may catch the corruption before it is exposed to readers, in which case writes returning `Status::Corruption` would be expected. This also replaces the previous incomplete fix (#5958) for the same corruption with a more complete one. ## 7.9.0 (11/21/2022) ### Performance Improvements diff --git a/db/column_family.cc b/db/column_family.cc index 268060ddf..d9875336c 100644 --- a/db/column_family.cc +++ b/db/column_family.cc @@ -565,7 +565,8 @@ ColumnFamilyData::ColumnFamilyData( allow_2pc_(db_options.allow_2pc), last_memtable_id_(0), db_paths_registered_(false), - mempurge_used_(false) { + mempurge_used_(false), + next_epoch_number_(1) { if (id_ != kDummyColumnFamilyDataId) { // TODO(cc): RegisterDbPaths can be expensive, considering moving it // outside of this constructor which might be called with db mutex held.
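The ordering rule described in the Behavior-changes entry above can be summarized in a few lines. The following is a minimal sketch for illustration only; `FileMeta`, `next_epoch`, and the helper names are hypothetical stand-ins, not RocksDB's types:

// Minimal model of the epoch_number bookkeeping (illustrative only).
#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

struct FileMeta {
  uint64_t epoch_number;
  uint64_t largest_seqno;
};

uint64_t next_epoch = 1;

// Each flushed or ingested/imported file consumes the next epoch number.
FileMeta OnFlushOrIngest(uint64_t largest_seqno) {
  return FileMeta{next_epoch++, largest_seqno};
}

// A compaction output inherits the minimum epoch number among its inputs.
uint64_t OutputEpoch(const std::vector<FileMeta>& inputs) {
  uint64_t min_epoch = std::numeric_limits<uint64_t>::max();
  for (const FileMeta& f : inputs) {
    min_epoch = std::min(min_epoch, f.epoch_number);
  }
  return min_epoch;
}

// L0 is ordered newest-first by epoch_number rather than largest_seqno.
void SortL0NewestFirst(std::vector<FileMeta>& l0) {
  std::sort(l0.begin(), l0.end(), [](const FileMeta& a, const FileMeta& b) {
    return a.epoch_number > b.epoch_number;
  });
}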
@@ -1128,12 +1129,9 @@ bool ColumnFamilyData::NeedsCompaction() const { Compaction* ColumnFamilyData::PickCompaction( const MutableCFOptions& mutable_options, const MutableDBOptions& mutable_db_options, LogBuffer* log_buffer) { - SequenceNumber earliest_mem_seqno = - std::min(mem_->GetEarliestSequenceNumber(), - imm_.current()->GetEarliestSequenceNumber(false)); auto* result = compaction_picker_->PickCompaction( GetName(), mutable_options, mutable_db_options, current_->storage_info(), - log_buffer, earliest_mem_seqno); + log_buffer); if (result != nullptr) { result->SetInputVersion(current_); } @@ -1520,6 +1518,13 @@ FSDirectory* ColumnFamilyData::GetDataDir(size_t path_id) const { return data_dirs_[path_id].get(); } +void ColumnFamilyData::RecoverEpochNumbers() { + assert(current_); + auto* vstorage = current_->storage_info(); + assert(vstorage); + vstorage->RecoverEpochNumbers(this); +} + ColumnFamilySet::ColumnFamilySet(const std::string& dbname, const ImmutableDBOptions* db_options, const FileOptions& file_options, diff --git a/db/column_family.h b/db/column_family.h index 3e6d01d22..ff4eca514 100644 --- a/db/column_family.h +++ b/db/column_family.h @@ -533,6 +533,24 @@ class ColumnFamilyData { void SetMempurgeUsed() { mempurge_used_ = true; } bool GetMempurgeUsed() { return mempurge_used_; } + // Allocate and return a new epoch number + uint64_t NewEpochNumber() { return next_epoch_number_.fetch_add(1); } + + // Get the next epoch number to be assigned + uint64_t GetNextEpochNumber() const { return next_epoch_number_.load(); } + + // Set the next epoch number to be assigned + void SetNextEpochNumber(uint64_t next_epoch_number) { + next_epoch_number_.store(next_epoch_number); + } + + // Reset the next epoch number to be assigned + void ResetNextEpochNumber() { next_epoch_number_.store(1); } + + // Recover the next epoch number of this CF and epoch number + // of its files (if missing) + void RecoverEpochNumbers(); + private: friend class ColumnFamilySet; ColumnFamilyData(uint32_t id, const std::string& name, @@ -634,6 +652,8 @@ // a Version associated with this CFD std::shared_ptr<CacheReservationManager> file_metadata_cache_res_mgr_; bool mempurge_used_; + + std::atomic<uint64_t> next_epoch_number_; }; // ColumnFamilySet has interesting thread-safety requirements diff --git a/db/compaction/compaction.cc b/db/compaction/compaction.cc index 01b5a570b..47ca8d1a9 100644 --- a/db/compaction/compaction.cc +++ b/db/compaction/compaction.cc @@ -780,6 +780,16 @@ uint64_t Compaction::MinInputFileOldestAncesterTime( return min_oldest_ancester_time; } +uint64_t Compaction::MinInputFileEpochNumber() const { + uint64_t min_epoch_number = std::numeric_limits<uint64_t>::max(); + for (const auto& inputs_per_level : inputs_) { + for (const auto& file : inputs_per_level.files) { + min_epoch_number = std::min(min_epoch_number, file->epoch_number); + } + } + return min_epoch_number; +} + int Compaction::EvaluatePenultimateLevel( const VersionStorageInfo* vstorage, const ImmutableOptions& immutable_options, const int start_level,
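Taken together, the hunks above define the epoch lifecycle end to end: `ColumnFamilyData` hands out epoch numbers when new L0 files are created, and `Compaction::MinInputFileEpochNumber()` propagates them through compactions. A sketch of the intended call pattern; these helper functions are hypothetical stand-ins for the real flush/ingestion/compaction plumbing:

// Hypothetical call sites illustrating how the new APIs fit together.
void OnNewL0File(ColumnFamilyData* cfd, FileMetaData* meta) {
  // Flushed and ingested files each consume one epoch number, so a later
  // flush always sorts above an earlier ingestion in L0.
  meta->epoch_number = cfd->NewEpochNumber();
}

void OnCompactionOutput(const Compaction* compaction, FileMetaData* meta) {
  // Outputs inherit the smallest epoch among their inputs, keeping them
  // ordered below any newer flush that races with the compaction.
  meta->epoch_number = compaction->MinInputFileEpochNumber();
}

`RecoverEpochNumbers()` is the backfill path on DB open for files written before this change; the test fixtures later in this diff that construct `VersionStorageInfo` with `EpochNumberRequirement::kMustPresent` therefore pass explicit epoch numbers instead.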
diff --git a/db/compaction/compaction.h b/db/compaction/compaction.h index 21d1190ac..ee8639601 100644 --- a/db/compaction/compaction.h +++ b/db/compaction/compaction.h @@ -378,6 +378,9 @@ class Compaction { // This is used to filter out some input files' ancester's time range. uint64_t MinInputFileOldestAncesterTime(const InternalKey* start, const InternalKey* end) const; + // Return the minimum epoch number among the + // input files associated with this compaction + uint64_t MinInputFileEpochNumber() const; // Called by DBImpl::NotifyOnCompactionCompleted to make sure number of // compaction begin and compaction completion callbacks match. diff --git a/db/compaction/compaction_job.cc b/db/compaction/compaction_job.cc index 1da1bcda8..24b05a8de 100644 --- a/db/compaction/compaction_job.cc +++ b/db/compaction/compaction_job.cc @@ -1834,12 +1834,14 @@ Status CompactionJob::OpenCompactionOutputFile(SubcompactionState* sub_compact, } // Initialize a SubcompactionState::Output and add it to sub_compact->outputs + uint64_t epoch_number = sub_compact->compaction->MinInputFileEpochNumber(); { FileMetaData meta; meta.fd = FileDescriptor(file_number, sub_compact->compaction->output_path_id(), 0); meta.oldest_ancester_time = oldest_ancester_time; meta.file_creation_time = current_time; + meta.epoch_number = epoch_number; meta.temperature = temperature; assert(!db_id_.empty()); assert(!db_session_id_.empty()); diff --git a/db/compaction/compaction_job.h b/db/compaction/compaction_job.h index bfbce1011..2f8cb08da 100644 --- a/db/compaction/compaction_job.h +++ b/db/compaction/compaction_job.h @@ -402,6 +402,7 @@ struct CompactionServiceOutputFile { std::string largest_internal_key; uint64_t oldest_ancester_time; uint64_t file_creation_time; + uint64_t epoch_number; uint64_t paranoid_hash; bool marked_for_compaction; UniqueId64x2 unique_id; @@ -411,8 +412,8 @@ const std::string& name, SequenceNumber smallest, SequenceNumber largest, std::string _smallest_internal_key, std::string _largest_internal_key, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, - uint64_t _paranoid_hash, bool _marked_for_compaction, - UniqueId64x2 _unique_id) + uint64_t _epoch_number, uint64_t _paranoid_hash, + bool _marked_for_compaction, UniqueId64x2 _unique_id) : file_name(name), smallest_seqno(smallest), largest_seqno(largest), @@ -420,6 +421,7 @@ largest_internal_key(std::move(_largest_internal_key)), oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), + epoch_number(_epoch_number), paranoid_hash(_paranoid_hash), marked_for_compaction(_marked_for_compaction), unique_id(std::move(_unique_id)) {} diff --git a/db/compaction/compaction_job_test.cc b/db/compaction/compaction_job_test.cc index c87871100..8b312ea78 100644 --- a/db/compaction/compaction_job_test.cc +++ b/db/compaction/compaction_job_test.cc @@ -380,11 +380,13 @@ class CompactionJobTestBase : public testing::Test { } VersionEdit edit; - edit.AddFile(level, file_number, 0, file_size, smallest_key, largest_key, - smallest_seqno, largest_seqno, false, Temperature::kUnknown, - oldest_blob_file_number, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + edit.AddFile( + level, file_number, 0, file_size, smallest_key, largest_key, + smallest_seqno, largest_seqno, false, Temperature::kUnknown, + oldest_blob_file_number, kUnknownOldestAncesterTime, + kUnknownFileCreationTime, + versions_->GetColumnFamilySet()->GetDefault()->NewEpochNumber(), + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); mutex_.Lock(); EXPECT_OK( @@ -1655,7 +1657,7 @@ TEST_F(CompactionJobTest, ResultSerialization) {
rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), rnd.RandomBinaryString(rnd.Uniform(kStrMaxLen)), rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), - rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id); + rnd64.Uniform(UINT64_MAX), rnd64.Uniform(UINT64_MAX), rnd.OneIn(2), id); } result.output_level = rnd.Uniform(10); result.output_path = rnd.RandomString(rnd.Uniform(kStrMaxLen)); diff --git a/db/compaction/compaction_picker.cc b/db/compaction/compaction_picker.cc index abdecca9f..de2570eee 100644 --- a/db/compaction/compaction_picker.cc +++ b/db/compaction/compaction_picker.cc @@ -31,27 +31,15 @@ bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact, uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes, - CompactionInputFiles* comp_inputs, - SequenceNumber earliest_mem_seqno) { - // Do not pick ingested file when there is at least one memtable not flushed - // which of seqno is overlap with the sst. + CompactionInputFiles* comp_inputs) { TEST_SYNC_POINT("FindIntraL0Compaction"); + size_t start = 0; - for (; start < level_files.size(); start++) { - if (level_files[start]->being_compacted) { - return false; - } - // If there is no data in memtable, the earliest sequence number would the - // largest sequence number in last memtable. - // Because all files are sorted in descending order by largest_seqno, so we - // only need to check the first one. - if (level_files[start]->fd.largest_seqno <= earliest_mem_seqno) { - break; - } - } - if (start >= level_files.size()) { + + if (level_files.size() == 0 || level_files[start]->being_compacted) { return false; } + size_t compact_bytes = static_cast<size_t>(level_files[start]->fd.file_size); size_t compact_bytes_per_del_file = std::numeric_limits<size_t>::max(); // Compaction range will be [start, limit). @@ -995,6 +983,7 @@ Status CompactionPicker::SanitizeCompactionInputFilesForAllLevels( current_files[f].name + " is currently being compacted."); } + input_files->insert(TableFileNameToNumber(current_files[f].name)); }
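For context on why the `earliest_mem_seqno` guard above could be deleted: under `largest_seqno` ordering, an intra-L0 compaction that includes an ingested file can produce an output that ranks above a later flush holding newer values. A self-contained numeric illustration of that failure mode; the file names and numbers mirror the test scenarios later in this diff and are illustrative only:

#include <cassert>
#include <cstdint>

struct File {
  uint64_t epoch_number;
  uint64_t largest_seqno;
};

int main() {
  // s7: intra-L0 output of six older files, one of them an ingested file
  // whose global seqno (13) exceeds the seqnos (7..12) still in the memtable.
  File s7{/*epoch_number=*/1, /*largest_seqno=*/13};
  // s8: the subsequent flush of those memtable entries, with a fresh epoch.
  File s8{/*epoch_number=*/7, /*largest_seqno=*/12};
  // largest_seqno ordering ranks stale s7 above newer s8: the misorder.
  assert(s7.largest_seqno > s8.largest_seqno);
  // epoch_number ordering ranks the newer flush first, as intended.
  assert(s8.epoch_number > s7.epoch_number);
  return 0;
}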
diff --git a/db/compaction/compaction_picker.h b/db/compaction/compaction_picker.h index 7739dd96b..d98af851b 100644 --- a/db/compaction/compaction_picker.h +++ b/db/compaction/compaction_picker.h @@ -51,14 +51,15 @@ class CompactionPicker { virtual ~CompactionPicker(); // Pick level and inputs for a new compaction. + // // Returns nullptr if there is no compaction to be done. // Otherwise returns a pointer to a heap-allocated object that // describes the compaction. Caller should delete the result. - virtual Compaction* PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, - SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) = 0; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) = 0; // Return a compaction object for compacting the range [begin,end] in // the specified level. Returns nullptr if there is nothing in that @@ -91,6 +92,7 @@ // files. If it's not possible to convert an invalid input_files // into a valid one by adding more files, the function will return a // non-ok status with specific reason.
+// #ifndef ROCKSDB_LITE Status SanitizeCompactionInputFiles(std::unordered_set<uint64_t>* input_files, const ColumnFamilyMetaData& cf_meta, @@ -255,12 +257,11 @@ class NullCompactionPicker : public CompactionPicker { virtual ~NullCompactionPicker() {} // Always return "nullptr" - Compaction* PickCompaction( - const std::string& /*cf_name*/, - const MutableCFOptions& /*mutable_cf_options*/, - const MutableDBOptions& /*mutable_db_options*/, - VersionStorageInfo* /*vstorage*/, LogBuffer* /* log_buffer */, - SequenceNumber /* earliest_memtable_seqno */) override { + Compaction* PickCompaction(const std::string& /*cf_name*/, + const MutableCFOptions& /*mutable_cf_options*/, + const MutableDBOptions& /*mutable_db_options*/, + VersionStorageInfo* /*vstorage*/, + LogBuffer* /* log_buffer */) override { return nullptr; } @@ -304,11 +305,11 @@ class NullCompactionPicker : public CompactionPicker { // files. Cannot be nullptr. // // @return true iff compaction was found. -bool FindIntraL0Compaction( - const std::vector<FileMetaData*>& level_files, size_t min_files_to_compact, - uint64_t max_compact_bytes_per_del_file, uint64_t max_compaction_bytes, - CompactionInputFiles* comp_inputs, - SequenceNumber earliest_mem_seqno = kMaxSequenceNumber); +bool FindIntraL0Compaction(const std::vector<FileMetaData*>& level_files, + size_t min_files_to_compact, + uint64_t max_compact_bytes_per_del_file, + uint64_t max_compaction_bytes, + CompactionInputFiles* comp_inputs); CompressionType GetCompressionType(const VersionStorageInfo* vstorage, const MutableCFOptions& mutable_cf_options, diff --git a/db/compaction/compaction_picker_fifo.cc b/db/compaction/compaction_picker_fifo.cc index b370dd1da..362e64e16 100644 --- a/db/compaction/compaction_picker_fifo.cc +++ b/db/compaction/compaction_picker_fifo.cc @@ -402,7 +402,7 @@ Compaction* FIFOCompactionPicker::PickCompactionToWarm( Compaction* FIFOCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, SequenceNumber /*earliest_memtable_seqno*/) { + LogBuffer* log_buffer) { Compaction* c = nullptr; if (mutable_cf_options.ttl > 0) { c = PickTTLCompaction(cf_name, mutable_cf_options, mutable_db_options, diff --git a/db/compaction/compaction_picker_fifo.h b/db/compaction/compaction_picker_fifo.h index 544259f38..1db760185 100644 --- a/db/compaction/compaction_picker_fifo.h +++ b/db/compaction/compaction_picker_fifo.h @@ -19,11 +19,11 @@ class FIFOCompactionPicker : public CompactionPicker { const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, VersionStorageInfo* version, - LogBuffer* log_buffer, - SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* version, + LogBuffer* log_buffer) override; virtual Compaction* CompactRange( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, diff --git a/db/compaction/compaction_picker_level.cc b/db/compaction/compaction_picker_level.cc index b689b6add..31987fc52 100644 --- a/db/compaction/compaction_picker_level.cc +++ b/db/compaction/compaction_picker_level.cc @@ -50,7 +50,6 @@ class LevelCompactionBuilder { public:
LevelCompactionBuilder(const std::string& cf_name, VersionStorageInfo* vstorage, - SequenceNumber earliest_mem_seqno, CompactionPicker* compaction_picker, LogBuffer* log_buffer, const MutableCFOptions& mutable_cf_options, @@ -58,7 +57,6 @@ class LevelCompactionBuilder { const MutableDBOptions& mutable_db_options) : cf_name_(cf_name), vstorage_(vstorage), - earliest_mem_seqno_(earliest_mem_seqno), compaction_picker_(compaction_picker), log_buffer_(log_buffer), mutable_cf_options_(mutable_cf_options), @@ -122,7 +120,6 @@ class LevelCompactionBuilder { const std::string& cf_name_; VersionStorageInfo* vstorage_; - SequenceNumber earliest_mem_seqno_; CompactionPicker* compaction_picker_; LogBuffer* log_buffer_; int start_level_ = -1; @@ -196,7 +193,10 @@ void LevelCompactionBuilder::SetupInitialFiles() { } output_level_ = (start_level_ == 0) ? vstorage_->base_level() : start_level_ + 1; - if (PickFileToCompact()) { + bool picked_file_to_compact = PickFileToCompact(); + TEST_SYNC_POINT_CALLBACK("PostPickFileToCompact", + &picked_file_to_compact); + if (picked_file_to_compact) { // found the compaction! if (start_level_ == 0) { // L0 score = `num L0 files` / `level0_file_num_compaction_trigger` @@ -825,16 +825,16 @@ bool LevelCompactionBuilder::PickIntraL0Compaction() { return FindIntraL0Compaction(level_files, kMinFilesForIntraL0Compaction, std::numeric_limits<uint64_t>::max(), mutable_cf_options_.max_compaction_bytes, - &start_level_inputs_, earliest_mem_seqno_); + &start_level_inputs_); } } // namespace Compaction* LevelCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, SequenceNumber earliest_mem_seqno) { - LevelCompactionBuilder builder(cf_name, vstorage, earliest_mem_seqno, this, - log_buffer, mutable_cf_options, ioptions_, + LogBuffer* log_buffer) { + LevelCompactionBuilder builder(cf_name, vstorage, this, log_buffer, + mutable_cf_options, ioptions_, mutable_db_options); return builder.PickCompaction(); } diff --git a/db/compaction/compaction_picker_level.h b/db/compaction/compaction_picker_level.h index 42a9b60a6..6eb0f586f 100644 --- a/db/compaction/compaction_picker_level.h +++ b/db/compaction/compaction_picker_level.h @@ -20,11 +20,11 @@ class LevelCompactionPicker : public CompactionPicker { LevelCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, - SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; virtual bool NeedsCompaction( const VersionStorageInfo* vstorage) const override; diff --git a/db/compaction/compaction_picker_test.cc b/db/compaction/compaction_picker_test.cc index 2e2e566c0..dfc508fc5 100644 --- a/db/compaction/compaction_picker_test.cc +++ b/db/compaction/compaction_picker_test.cc @@ -77,8 +77,9 @@ class CompactionPickerTestBase : public testing::Test { void NewVersionStorage(int num_levels, CompactionStyle style) { DeleteVersionStorage(); options_.num_levels = num_levels; - vstorage_.reset(new
VersionStorageInfo(&icmp_, ucmp_, options_.num_levels, - style, nullptr, false)); + vstorage_.reset(new VersionStorageInfo( + &icmp_, ucmp_, options_.num_levels, style, nullptr, false, + EpochNumberRequirement::kMustPresent)); vstorage_->PrepareForVersionAppend(ioptions_, mutable_cf_options_); } @@ -87,7 +88,7 @@ class CompactionPickerTestBase : public testing::Test { void AddVersionStorage() { temp_vstorage_.reset(new VersionStorageInfo( &icmp_, ucmp_, options_.num_levels, ioptions_.compaction_style, - vstorage_.get(), false)); + vstorage_.get(), false, EpochNumberRequirement::kMustPresent)); } void DeleteVersionStorage() { @@ -105,7 +106,8 @@ class CompactionPickerTestBase : public testing::Test { size_t compensated_file_size = 0, bool marked_for_compact = false, Temperature temperature = Temperature::kUnknown, uint64_t oldest_ancestor_time = kUnknownOldestAncesterTime, - Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice()) { + Slice ts_of_smallest = Slice(), Slice ts_of_largest = Slice(), + uint64_t epoch_number = kUnknownEpochNumber) { assert(ts_of_smallest.size() == ucmp_->timestamp_size()); assert(ts_of_largest.size() == ucmp_->timestamp_size()); @@ -145,7 +147,7 @@ class CompactionPickerTestBase : public testing::Test { file_number, path_id, file_size, smallest_ikey, largest_ikey, smallest_seq, largest_seq, marked_for_compact, temperature, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); f->compensated_file_size = (compensated_file_size != 0) ? compensated_file_size : file_size; @@ -2871,38 +2873,6 @@ TEST_F(CompactionPickerTest, IntraL0MaxCompactionBytesHit) { ASSERT_EQ(0, compaction->output_level()); } -TEST_F(CompactionPickerTest, IntraL0ForEarliestSeqno) { - // Intra L0 compaction triggers only if there are at least - // level0_file_num_compaction_trigger + 2 L0 files. - mutable_cf_options_.level0_file_num_compaction_trigger = 3; - mutable_cf_options_.max_compaction_bytes = 999999u; - NewVersionStorage(6, kCompactionStyleLevel); - - // 4 out of 6 L0 files will be picked for intra L0 compaction due to - // being_compact limit. And the latest one L0 will be skipped due to earliest - // seqno. The one L1 file spans entire L0 key range and is marked as being - // compacted to avoid L0->L1 compaction. 
- Add(1, 1U, "100", "350", 200000U, 0, 110, 111); - Add(0, 2U, "301", "350", 1U, 0, 108, 109); - Add(0, 3U, "251", "300", 1U, 0, 106, 107); - Add(0, 4U, "201", "250", 1U, 0, 104, 105); - Add(0, 5U, "151", "200", 1U, 0, 102, 103); - Add(0, 6U, "100", "150", 1U, 0, 100, 101); - Add(0, 7U, "100", "100", 1U, 0, 99, 100); - vstorage_->LevelFiles(0)[5]->being_compacted = true; - vstorage_->LevelFiles(1)[0]->being_compacted = true; - UpdateVersionStorageInfo(); - - std::unique_ptr<Compaction> compaction(level_compaction_picker.PickCompaction( - cf_name_, mutable_cf_options_, mutable_db_options_, vstorage_.get(), - &log_buffer_, 107)); - ASSERT_TRUE(compaction.get() != nullptr); - ASSERT_EQ(1U, compaction->num_input_levels()); - ASSERT_EQ(4U, compaction->num_input_files(0)); - ASSERT_EQ(CompactionReason::kLevelL0FilesNum, - compaction->compaction_reason()); - ASSERT_EQ(0, compaction->output_level()); -} #ifndef ROCKSDB_LITE TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { @@ -2916,9 +2886,23 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { // should fail NewVersionStorage(5, kCompactionStyleUniversal); - Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); - Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); - Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550, /*compensated_file_size*/ 0, + /*marked_for_compact*/ false, /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 3); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 2); + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 1); Add(3, 5U, "010", "080", 8 * kFileSize, 0, 200, 251); Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); @@ -2940,7 +2924,11 @@ TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap) { AddVersionStorage(); // Simulate a flush and mark the file for compaction - Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true); + Add(0, 7U, "150", "200", kFileSize, 0, 551, 600, 0, true, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 4); UpdateVersionStorageInfo(); std::unique_ptr<Compaction> compaction2( @@ -2962,7 +2950,11 @@ NewVersionStorage(5, kCompactionStyleUniversal); // Mark file number 4 for compaction - Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true); + Add(0, 4U, "260", "300", 4 * kFileSize, 0, 260, 300, 0, true, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 1); Add(3, 5U, "240", "290", 8 * kFileSize, 0, 201, 250); Add(4, 3U, "301", "350", 8 * kFileSize, 0, 101, 150); Add(4, 6U, "501", "750", 8 * kFileSize, 0, 101, 150); @@ -2983,8 +2975,17 @@
TEST_F(CompactionPickerTest, UniversalMarkedCompactionFullOverlap2) { ASSERT_EQ(1U, compaction->num_input_files(1)); AddVersionStorage(); - Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); - Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550, /*compensated_file_size*/ 0, + /*marked_for_compact*/ false, /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 3); + Add(0, 2U, "201", "250", 2 * kFileSize, 0, 401, 450, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 2); UpdateVersionStorageInfo(); std::unique_ptr<Compaction> compaction2( @@ -3150,10 +3151,29 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { NewVersionStorage(1, kCompactionStyleUniversal); // Mark file number 5 for compaction - Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300); - Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true); - Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150); - Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100); + Add(0, 4U, "260", "300", 1 * kFileSize, 0, 260, 300, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 4); + Add(0, 5U, "240", "290", 2 * kFileSize, 0, 201, 250, 0, true, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 3); + Add(0, 3U, "301", "350", 4 * kFileSize, 0, 101, 150, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 2); + Add(0, 6U, "501", "750", 8 * kFileSize, 0, 50, 100, + /*compensated_file_size*/ 0, /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 1); UpdateVersionStorageInfo(); std::unique_ptr<Compaction> compaction( @@ -3173,8 +3193,18 @@ TEST_F(CompactionPickerTest, UniversalMarkedL0Overlap2) { ASSERT_TRUE(file_map_[6].first->being_compacted); AddVersionStorage(); - Add(0, 1U, "150", "200", kFileSize, 0, 500, 550); - Add(0, 2U, "201", "250", kFileSize, 0, 401, 450); + Add(0, 1U, "150", "200", kFileSize, 0, 500, 550, /*compensated_file_size*/ 0, + /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 6); + Add(0, 2U, "201", "250", kFileSize, 0, 401, 450, /*compensated_file_size*/ 0, + /*marked_for_compact*/ false, + /* temperature*/ Temperature::kUnknown, + /*oldest_ancestor_time*/ kUnknownOldestAncesterTime, + /*ts_of_smallest*/ Slice(), /*ts_of_largest*/ Slice(), + /*epoch_number*/ 5); UpdateVersionStorageInfo(); std::unique_ptr<Compaction> compaction2(
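The explicit `/*epoch_number*/` arguments in the updated tests encode the invariant that, listed newest-first, L0 files carry strictly decreasing epoch numbers. A small checker capturing that invariant, with an illustrative `L0File` struct rather than RocksDB's `FileMetaData`:

#include <cstddef>
#include <cstdint>
#include <vector>

struct L0File {
  uint64_t epoch_number;
};

// True iff L0 files listed newest-first have strictly decreasing epochs.
bool L0EpochsConsistent(const std::vector<L0File>& l0_newest_first) {
  for (size_t i = 0; i + 1 < l0_newest_first.size(); ++i) {
    if (l0_newest_first[i].epoch_number <=
        l0_newest_first[i + 1].epoch_number) {
      return false;
    }
  }
  return true;
}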
diff --git a/db/compaction/compaction_picker_universal.cc b/db/compaction/compaction_picker_universal.cc index 376e4f60f..3ef4e70b3 100644 --- a/db/compaction/compaction_picker_universal.cc +++ b/db/compaction/compaction_picker_universal.cc @@ -293,7 +293,7 @@ bool UniversalCompactionPicker::NeedsCompaction( Compaction* UniversalCompactionPicker::PickCompaction( const std::string& cf_name, const MutableCFOptions& mutable_cf_options, const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, SequenceNumber /* earliest_memtable_seqno */) { + LogBuffer* log_buffer) { UniversalCompactionBuilder builder(ioptions_, icmp_, cf_name, mutable_cf_options, mutable_db_options, vstorage, this, log_buffer); @@ -400,6 +400,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { if (!vstorage_->FilesMarkedForPeriodicCompaction().empty()) { // Always need to do a full compaction for periodic compaction. c = PickPeriodicCompaction(); + TEST_SYNC_POINT_CALLBACK("PostPickPeriodicCompaction", c); } // Check for size amplification. @@ -408,6 +409,7 @@ static_cast<size_t>( mutable_cf_options_.level0_file_num_compaction_trigger)) { if ((c = PickCompactionToReduceSizeAmp()) != nullptr) { + TEST_SYNC_POINT("PickCompactionToReduceSizeAmpReturnNonnullptr"); ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size amp\n", cf_name_.c_str()); } else { @@ -417,6 +419,7 @@ mutable_cf_options_.compaction_options_universal.size_ratio; if ((c = PickCompactionToReduceSortedRuns(ratio, UINT_MAX)) != nullptr) { + TEST_SYNC_POINT("PickCompactionToReduceSortedRunsReturnNonnullptr"); ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: compacting for size ratio\n", cf_name_.c_str()); @@ -457,6 +460,7 @@ Compaction* UniversalCompactionBuilder::PickCompaction() { if (c == nullptr) { if ((c = PickDeleteTriggeredCompaction()) != nullptr) { + TEST_SYNC_POINT("PickDeleteTriggeredCompactionReturnNonnullptr"); ROCKS_LOG_BUFFER(log_buffer_, "[%s] Universal: delete triggered compaction\n", cf_name_.c_str()); diff --git a/db/compaction/compaction_picker_universal.h b/db/compaction/compaction_picker_universal.h index 5f897cc9b..558733195 100644 --- a/db/compaction/compaction_picker_universal.h +++ b/db/compaction/compaction_picker_universal.h @@ -18,11 +18,11 @@ class UniversalCompactionPicker : public CompactionPicker { UniversalCompactionPicker(const ImmutableOptions& ioptions, const InternalKeyComparator* icmp) : CompactionPicker(ioptions, icmp) {} - virtual Compaction* PickCompaction( - const std::string& cf_name, const MutableCFOptions& mutable_cf_options, - const MutableDBOptions& mutable_db_options, VersionStorageInfo* vstorage, - LogBuffer* log_buffer, - SequenceNumber earliest_memtable_seqno = kMaxSequenceNumber) override; + virtual Compaction* PickCompaction(const std::string& cf_name, + const MutableCFOptions& mutable_cf_options, + const MutableDBOptions& mutable_db_options, + VersionStorageInfo* vstorage, + LogBuffer* log_buffer) override; virtual int MaxOutputLevel() const override { return NumberLevels() - 1; } virtual bool NeedsCompaction( diff --git a/db/compaction/compaction_service_job.cc b/db/compaction/compaction_service_job.cc index 1d2e99d99..1f6c0b710 100644 --- a/db/compaction/compaction_service_job.cc +++ b/db/compaction/compaction_service_job.cc @@ -190,6 +190,7 @@ CompactionJob::ProcessKeyValueCompactionWithCompactionService( meta.largest.DecodeFrom(file.largest_internal_key); meta.oldest_ancester_time = file.oldest_ancester_time; meta.file_creation_time = file.file_creation_time; + meta.epoch_number = file.epoch_number; meta.marked_for_compaction
= file.marked_for_compaction; meta.unique_id = file.unique_id; @@ -333,8 +334,9 @@ Status CompactionServiceCompactionJob::Run() { MakeTableFileName(meta.fd.GetNumber()), meta.fd.smallest_seqno, meta.fd.largest_seqno, meta.smallest.Encode().ToString(), meta.largest.Encode().ToString(), meta.oldest_ancester_time, - meta.file_creation_time, output_file.validator.GetHash(), - meta.marked_for_compaction, meta.unique_id); + meta.file_creation_time, meta.epoch_number, + output_file.validator.GetHash(), meta.marked_for_compaction, + meta.unique_id); } InternalStats::CompactionStatsFull compaction_stats; sub_compact->AggregateCompactionStats(compaction_stats); @@ -489,6 +491,10 @@ static std::unordered_map<std::string, OptionTypeInfo> {offsetof(struct CompactionServiceOutputFile, file_creation_time), OptionType::kUInt64T, OptionVerificationType::kNormal, OptionTypeFlags::kNone}}, + {"epoch_number", + {offsetof(struct CompactionServiceOutputFile, epoch_number), + OptionType::kUInt64T, OptionVerificationType::kNormal, + OptionTypeFlags::kNone}}, {"paranoid_hash", {offsetof(struct CompactionServiceOutputFile, paranoid_hash), OptionType::kUInt64T, OptionVerificationType::kNormal, diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index ba9c50b9a..ed9a5a7ae 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -9,8 +9,10 @@ #include <tuple> +#include "compaction/compaction_picker_universal.h" #include "db/blob/blob_index.h" #include "db/db_test_util.h" +#include "db/dbformat.h" #include "env/mock_env.h" #include "port/port.h" #include "port/stack_trace.h" @@ -6271,170 +6273,653 @@ void IngestOneKeyValue(DBImpl* db, const std::string& key, ASSERT_OK(db->IngestExternalFile({info.file_path}, ingest_opt)); } -TEST_P(DBCompactionTestWithParam, - FlushAfterIntraL0CompactionCheckConsistencyFail) { - Options options = CurrentOptions(); - options.force_consistency_checks = true; - options.compression = kNoCompression; - options.level0_file_num_compaction_trigger = 5; - options.max_background_compactions = 2; - options.max_subcompactions = max_subcompactions_; - DestroyAndReopen(options); +class DBCompactionTestL0FilesMisorderCorruption : public DBCompactionTest { + public: + DBCompactionTestL0FilesMisorderCorruption() : DBCompactionTest() {} + void SetupOptions(const CompactionStyle compaction_style, + const std::string& compaction_path_to_test = "") { + options_ = CurrentOptions(); + options_.create_if_missing = true; + options_.compression = kNoCompression; + + options_.force_consistency_checks = true; + options_.compaction_style = compaction_style; + + if (compaction_style == CompactionStyle::kCompactionStyleLevel) { + options_.num_levels = 7; + // Level compaction's PickIntraL0Compaction() impl detail requires + // `options.level0_file_num_compaction_trigger` to be + // at least 2 files less than the actual number of level 0 files + // (i.e., 7 by design in this test) + options_.level0_file_num_compaction_trigger = 5; + options_.max_background_compactions = 2; + options_.write_buffer_size = 2 << 20; + options_.max_write_buffer_number = 6; + } else if (compaction_style == CompactionStyle::kCompactionStyleUniversal) { + // TODO: expand test coverage to num_levels > 1 for universal compaction, + // which requires careful unit test design to compact to level 0 despite + // num_levels > 1 + options_.num_levels = 1; + options_.level0_file_num_compaction_trigger = 5; + + CompactionOptionsUniversal universal_options; + if (compaction_path_to_test == "PickCompactionToReduceSizeAmp") {
universal_options.max_size_amplification_percent = 50; + } else if (compaction_path_to_test == + "PickCompactionToReduceSortedRuns") { + universal_options.max_size_amplification_percent = 400; + } else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") { + universal_options.max_size_amplification_percent = 400; + universal_options.min_merge_width = 6; + } + options_.compaction_options_universal = universal_options; + } else if (compaction_style == CompactionStyle::kCompactionStyleFIFO) { + options_.max_open_files = -1; + options_.num_levels = 1; + options_.level0_file_num_compaction_trigger = 3; + + CompactionOptionsFIFO fifo_options; + if (compaction_path_to_test == "FindIntraL0Compaction" || + compaction_path_to_test == "CompactRange") { + fifo_options.allow_compaction = true; + fifo_options.age_for_warm = 0; + } else if (compaction_path_to_test == "CompactFile") { + fifo_options.allow_compaction = false; + fifo_options.age_for_warm = 0; + } + options_.compaction_options_fifo = fifo_options; + } - const size_t kValueSize = 1 << 20; - Random rnd(301); - std::atomic<int> pick_intra_l0_count(0); - std::string value(rnd.RandomString(kValueSize)); + if (compaction_path_to_test == "CompactFile" || + compaction_path_to_test == "CompactRange") { + options_.disable_auto_compactions = true; + } else { + options_.disable_auto_compactions = false; + } + } - // The L0->L1 must be picked before we begin ingesting files to trigger - // intra-L0 compaction, and must not finish until after an intra-L0 - // compaction has been picked. - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"LevelCompactionPicker::PickCompaction:Return", - "DBCompactionTestWithParam::" - "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"}, - {"LevelCompactionPicker::PickCompactionBySize:0", - "CompactionJob::Run():Start"}}); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "FindIntraL0Compaction", - [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); }); + void Destroy(const Options& options) { + if (snapshot_) { + assert(db_); + db_->ReleaseSnapshot(snapshot_); + snapshot_ = nullptr; + } + DBTestBase::Destroy(options); + } - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + void Reopen(const Options& options) { + DBTestBase::Reopen(options); + if (options.compaction_style != CompactionStyle::kCompactionStyleLevel) { + // To force assigning the global seqno to ingested files + // for our test purpose. + assert(snapshot_ == nullptr); + snapshot_ = db_->GetSnapshot(); + } + } - // prevents trivial move - for (int i = 0; i < 10; ++i) { - ASSERT_OK(Put(Key(i), "")); // prevents trivial move + void DestroyAndReopen(Options& options) { + Destroy(options); + Reopen(options); } - ASSERT_OK(Flush()); - Compact("", Key(99)); - ASSERT_EQ(0, NumTableFilesAtLevel(0)); - // Flush 5 L0 sst. - for (int i = 0; i < 5; ++i) { - ASSERT_OK(Put(Key(i + 1), value)); - ASSERT_OK(Flush()); + void PauseCompactionThread() { + sleeping_task_.reset(new test::SleepingBackgroundTask()); + env_->SetBackgroundThreads(1, Env::LOW); + env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, + sleeping_task_.get(), Env::Priority::LOW); + sleeping_task_->WaitUntilSleeping(); + } - ASSERT_EQ(5, NumTableFilesAtLevel(0)); - // Put one key, to make smallest log sequence number in this memtable is less - // than sst which would be ingested in next step.
- ASSERT_OK(Put(Key(0), "a")); + void ResumeCompactionThread() { + if (sleeping_task_) { + sleeping_task_->WakeUp(); + sleeping_task_->WaitUntilDone(); + } + } - ASSERT_EQ(5, NumTableFilesAtLevel(0)); - TEST_SYNC_POINT( - "DBCompactionTestWithParam::" - "FlushAfterIntraL0CompactionCheckConsistencyFail:L0ToL1Ready"); + void AddFilesMarkedForPeriodicCompaction(const size_t num_files) { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + Version* const current = cfd->current(); + assert(current); - // Ingest 5 L0 sst. And this files would trigger PickIntraL0Compaction. - for (int i = 5; i < 10; i++) { - ASSERT_EQ(i, NumTableFilesAtLevel(0)); - IngestOneKeyValue(dbfull(), Key(i), value, options); + VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + + const std::vector<FileMetaData*> level0_files = storage_info->LevelFiles(0); + assert(level0_files.size() == num_files); + + for (FileMetaData* f : level0_files) { + storage_info->TEST_AddFileMarkedForPeriodicCompaction(0, f); + } } - // Put one key, to make biggest log sequence number in this memtable is bigger - // than sst which would be ingested in next step. - ASSERT_OK(Put(Key(2), "b")); - ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - std::vector<std::vector<FileMetaData>> level_to_files; - dbfull()->TEST_GetFilesMetaData(dbfull()->DefaultColumnFamily(), - &level_to_files); - ASSERT_GT(level_to_files[0].size(), 0); - ASSERT_GT(pick_intra_l0_count.load(), 0); + void AddFilesMarkedForCompaction(const size_t num_files) { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + Version* const current = cfd->current(); + assert(current); - ASSERT_OK(Flush()); -} + VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); -TEST_P(DBCompactionTestWithParam, - IntraL0CompactionAfterFlushCheckConsistencyFail) { - Options options = CurrentOptions(); - options.force_consistency_checks = true; - options.compression = kNoCompression; - options.level0_file_num_compaction_trigger = 5; - options.max_background_compactions = 2; - options.max_subcompactions = max_subcompactions_; - options.write_buffer_size = 2 << 20; - options.max_write_buffer_number = 6; - DestroyAndReopen(options); + const std::vector<FileMetaData*> level0_files = storage_info->LevelFiles(0); - const size_t kValueSize = 1 << 20; - Random rnd(301); - std::string value(rnd.RandomString(kValueSize)); - std::string value2(rnd.RandomString(kValueSize)); - std::string bigvalue = value + value; + for (FileMetaData* f : level0_files) { + storage_info->TEST_AddFileMarkedForCompaction(0, f); + } + } - // prevents trivial move + void SetupSyncPoints(const std::string& compaction_path_to_test) { + compaction_path_sync_point_called_.store(false); + if (compaction_path_to_test == "FindIntraL0Compaction" && + options_.compaction_style == CompactionStyle::kCompactionStyleLevel) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PostPickFileToCompact", [&](void* arg) { + bool* picked_file_to_compact = (bool*)arg; + // To trigger intra-L0 compaction specifically, +
we mock PickFileToCompact()'s result to be false + *picked_file_to_compact = false; + }); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FindIntraL0Compaction", [&](void* /*arg*/) { + compaction_path_sync_point_called_.store(true); + }); + + } else if (compaction_path_to_test == "PickPeriodicCompaction") { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PostPickPeriodicCompaction", [&](void* compaction_arg) { + Compaction* compaction = (Compaction*)compaction_arg; + if (compaction != nullptr) { + compaction_path_sync_point_called_.store(true); + } + }); + } else if (compaction_path_to_test == "PickCompactionToReduceSizeAmp") { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PickCompactionToReduceSizeAmpReturnNonnullptr", [&](void* /*arg*/) { + compaction_path_sync_point_called_.store(true); + }); + } else if (compaction_path_to_test == "PickCompactionToReduceSortedRuns") { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PickCompactionToReduceSortedRunsReturnNonnullptr", + [&](void* /*arg*/) { + compaction_path_sync_point_called_.store(true); + }); + } else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") { + assert(options_.compaction_style == + CompactionStyle::kCompactionStyleUniversal); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "PickDeleteTriggeredCompactionReturnNonnullptr", [&](void* /*arg*/) { + compaction_path_sync_point_called_.store(true); + }); + } else if ((compaction_path_to_test == "FindIntraL0Compaction" || + compaction_path_to_test == "CompactRange") && + options_.compaction_style == + CompactionStyle::kCompactionStyleFIFO) { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( + "FindIntraL0Compaction", [&](void* /*arg*/) { + compaction_path_sync_point_called_.store(true); + }); + } + + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); + } + + bool SyncPointsCalled() { return compaction_path_sync_point_called_.load(); } + + void DisableSyncPoints() { + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->ClearAllCallBacks(); + ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); + } + + // Return the largest seqno of the latest L0 file based on file number + SequenceNumber GetLatestL0FileLargestSeqnoHelper() { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + ColumnFamilyData* const cfd = versions->GetColumnFamilySet()->GetDefault(); + assert(cfd); + Version* const current = cfd->current(); + assert(current); + VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + const std::vector<FileMetaData*> level0_files = storage_info->LevelFiles(0); + assert(level0_files.size() >= 1); + + uint64_t latest_file_num = 0; + uint64_t latest_file_largest_seqno = 0; + for (FileMetaData* f : level0_files) { + if (f->fd.GetNumber() > latest_file_num) { + latest_file_num = f->fd.GetNumber(); + latest_file_largest_seqno = f->fd.largest_seqno; + } + } + + return latest_file_largest_seqno; + } + + protected: + Options options_; + + private: + const Snapshot* snapshot_ = nullptr; + std::atomic<bool> compaction_path_sync_point_called_; + std::shared_ptr<test::SleepingBackgroundTask> sleeping_task_; +}; +TEST_F(DBCompactionTestL0FilesMisorderCorruption, + FlushAfterIntraL0LevelCompactionWithIngestedFile) {
SetupOptions(CompactionStyle::kCompactionStyleLevel, ""); + DestroyAndReopen(options_); + // Prevents trivial move for (int i = 0; i < 10; ++i) { - ASSERT_OK(Put(Key(i), "")); // prevents trivial move + ASSERT_OK(Put(Key(i), "")); // Prevents trivial move } ASSERT_OK(Flush()); Compact("", Key(99)); ASSERT_EQ(0, NumTableFilesAtLevel(0)); - std::atomic pick_intra_l0_count(0); - // The L0->L1 must be picked before we begin ingesting files to trigger - // intra-L0 compaction, and must not finish until after an intra-L0 - // compaction has been picked. - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->LoadDependency( - {{"LevelCompactionPicker::PickCompaction:Return", - "DBCompactionTestWithParam::" - "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"}, - {"LevelCompactionPicker::PickCompactionBySize:0", - "CompactionJob::Run():Start"}}); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->SetCallBack( - "FindIntraL0Compaction", - [&](void* /*arg*/) { pick_intra_l0_count.fetch_add(1); }); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->EnableProcessing(); - // Make 6 L0 sst. + // To get accurate NumTableFilesAtLevel(0) when the number reaches + // options_.level0_file_num_compaction_trigger + PauseCompactionThread(); + + // To create below LSM tree + // (key:value@n indicates key-value pair has seqno "n", L0 is sorted): + // + // memtable: m1[ 5:new@12 .. 1:new@8, 0:new@7] + // L0: s6[6:new@13], s5[5:old@6] ... s1[1:old@2],s0[0:old@1] + // + // (1) Make 6 L0 sst (i.e, s0 - s5) for (int i = 0; i < 6; ++i) { if (i % 2 == 0) { - IngestOneKeyValue(dbfull(), Key(i), value, options); + IngestOneKeyValue(dbfull(), Key(i), "old", options_); } else { - ASSERT_OK(Put(Key(i), value)); + ASSERT_OK(Put(Key(i), "old")); ASSERT_OK(Flush()); } } - ASSERT_EQ(6, NumTableFilesAtLevel(0)); - // Stop run flush job - env_->SetBackgroundThreads(1, Env::HIGH); - test::SleepingBackgroundTask sleeping_tasks; - env_->Schedule(&test::SleepingBackgroundTask::DoSleepTask, &sleeping_tasks, - Env::Priority::HIGH); - sleeping_tasks.WaitUntilSleeping(); - - // Put many keys to make memtable request to flush + // (2) Create m1 for (int i = 0; i < 6; ++i) { - ASSERT_OK(Put(Key(i), bigvalue)); + ASSERT_OK(Put(Key(i), "new")); } - ASSERT_EQ(6, NumTableFilesAtLevel(0)); - TEST_SYNC_POINT( - "DBCompactionTestWithParam::" - "IntraL0CompactionAfterFlushCheckConsistencyFail:L0ToL1Ready"); - // ingest file to trigger IntraL0Compaction - for (int i = 6; i < 10; ++i) { + + // (3) Ingest file (i.e, s6) to trigger IntraL0Compaction() + for (int i = 6; i < 7; ++i) { ASSERT_EQ(i, NumTableFilesAtLevel(0)); - IngestOneKeyValue(dbfull(), Key(i), value2, options); + IngestOneKeyValue(dbfull(), Key(i), "new", options_); } - // Wake up flush job - sleeping_tasks.WakeUp(); - sleeping_tasks.WaitUntilDone(); + SetupSyncPoints("FindIntraL0Compaction"); + ResumeCompactionThread(); + ASSERT_OK(dbfull()->TEST_WaitForCompact()); - ROCKSDB_NAMESPACE::SyncPoint::GetInstance()->DisableProcessing(); - uint64_t error_count = 0; - db_->GetIntProperty("rocksdb.background-errors", &error_count); - ASSERT_EQ(error_count, 0); - ASSERT_GT(pick_intra_l0_count.load(), 0); + ASSERT_TRUE(SyncPointsCalled()); + DisableSyncPoints(); + + // After compaction, we have LSM tree: + // + // memtable: m1[ 5:new@12 .. 1:new@8, 0:new@7] + // L0: s7[6:new@13, 5:old@6 .. 
0:old@1] + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + SequenceNumber compact_output_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + ASSERT_OK(Flush()); + // After flush, we have LSM tree: + // + // L0: s8[5:new@12 .. 0:new@7],s7[6:new@13, 5:old@5 .. 0:old@1] + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + SequenceNumber flushed_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + // To verify there isn't any file misorder leading to returning a old value + // of Key(0) - Key(5) , which is caused by flushed table s8 has a + // smaller largest seqno than the compaction output file s7's largest seqno + // while the flushed table has the newer version of the values than the + // compaction output file's. + ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno); for (int i = 0; i < 6; ++i) { - ASSERT_EQ(bigvalue, Get(Key(i))); + ASSERT_EQ("new", Get(Key(i))); + } + for (int i = 6; i < 7; ++i) { + ASSERT_EQ("new", Get(Key(i))); + } +} + +TEST_F(DBCompactionTestL0FilesMisorderCorruption, + FlushAfterIntraL0UniversalCompactionWithIngestedFile) { + for (const std::string compaction_path_to_test : + {"PickPeriodicCompaction", "PickCompactionToReduceSizeAmp", + "PickCompactionToReduceSortedRuns", "PickDeleteTriggeredCompaction"}) { + SetupOptions(CompactionStyle::kCompactionStyleUniversal, + compaction_path_to_test); + DestroyAndReopen(options_); + + // To get accurate NumTableFilesAtLevel(0) when the number reaches + // options_.level0_file_num_compaction_trigger + PauseCompactionThread(); + + // To create below LSM tree + // (key:value@n indicates key-value pair has seqno "n", L0 is sorted): + // + // memtable: m1 [ k2:new@8, k1:new@7] + // L0: s4[k9:dummy@10], s3[k8:dummy@9], + // s2[k7:old@6, k6:old@5].. s0[k3:old@2, k1:old@1] + // + // (1) Create 3 existing SST file (i.e, s0 - s2) + ASSERT_OK(Put("k1", "old")); + ASSERT_OK(Put("k3", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + ASSERT_OK(Put("k4", "old")); + ASSERT_OK(Put("k5", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(2, NumTableFilesAtLevel(0)); + ASSERT_OK(Put("k6", "old")); + ASSERT_OK(Put("k7", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(3, NumTableFilesAtLevel(0)); + + // (2) Create m1. Noted that it contains a overlaped key with s0 + ASSERT_OK(Put("k1", "new")); // overlapped key + ASSERT_OK(Put("k2", "new")); + + // (3) Ingest two SST files s3, s4 + IngestOneKeyValue(dbfull(), "k8", "dummy", options_); + IngestOneKeyValue(dbfull(), "k9", "dummy", options_); + // Up to now, L0 contains s0 - s4 + ASSERT_EQ(5, NumTableFilesAtLevel(0)); + + if (compaction_path_to_test == "PickPeriodicCompaction") { + AddFilesMarkedForPeriodicCompaction(5); + } else if (compaction_path_to_test == "PickDeleteTriggeredCompaction") { + AddFilesMarkedForCompaction(5); + } + + SetupSyncPoints(compaction_path_to_test); + ResumeCompactionThread(); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_TRUE(SyncPointsCalled()) + << "failed for compaction path to test: " << compaction_path_to_test; + DisableSyncPoints(); + + // After compaction, we have LSM tree: + // + // memtable: m1[ k2:new@8, k1:new@7] + // L0: s5[k9:dummy@10, k8@dummy@9, k7:old@6 .. 
k3:old@2, k1:old@1] + ASSERT_EQ(1, NumTableFilesAtLevel(0)) + << "failed for compaction path to test: " << compaction_path_to_test; + SequenceNumber compact_output_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + ASSERT_OK(Flush()) << "failed for compaction path to test: " + << compaction_path_to_test; + // After flush, we have LSM tree: + // + // L0: s6[k2:new@8, k1:new@7], + // s5[k9:dummy@10, k8@dummy@9, k7:old@6 .. k3:old@2, k1:old@1] + ASSERT_EQ(2, NumTableFilesAtLevel(0)) + << "failed for compaction path to test: " << compaction_path_to_test; + SequenceNumber flushed_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + // To verify there isn't any file misorder leading to returning a old + // value of "k1" , which is caused by flushed table s6 has a + // smaller largest seqno than the compaction output file s5's largest seqno + // while the flushed table has the newer version of the value + // than the compaction output file's. + ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno) + << "failed for compaction path to test: " << compaction_path_to_test; + EXPECT_EQ(Get("k1"), "new") + << "failed for compaction path to test: " << compaction_path_to_test; + } + + Destroy(options_); +} + +TEST_F(DBCompactionTestL0FilesMisorderCorruption, + FlushAfterIntraL0FIFOCompactionWithIngestedFile) { + for (const std::string compaction_path_to_test : {"FindIntraL0Compaction"}) { + SetupOptions(CompactionStyle::kCompactionStyleFIFO, + compaction_path_to_test); + DestroyAndReopen(options_); + + // To create below LSM tree + // (key:value@n indicates key-value pair has seqno "n", L0 is sorted): + // + // memtable: m1 [ k2:new@4, k1:new@3] + // L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1] + // + // (1) Create an existing SST file s0 + ASSERT_OK(Put("k1", "old")); + ASSERT_OK(Put("k3", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // (2) Create memtable m1. 
Noted that it contains a overlaped key with s0 + ASSERT_OK(Put("k1", "new")); // overlapped key + ASSERT_OK(Put("k2", "new")); + + // To get accurate NumTableFilesAtLevel(0) when the number reaches + // options_.level0_file_num_compaction_trigger + PauseCompactionThread(); + + // (3) Ingest two SST files s1, s2 + IngestOneKeyValue(dbfull(), "k4", "dummy", options_); + IngestOneKeyValue(dbfull(), "k5", "dummy", options_); + // Up to now, L0 contains s0, s1, s2 + ASSERT_EQ(3, NumTableFilesAtLevel(0)); + + SetupSyncPoints(compaction_path_to_test); + ResumeCompactionThread(); + + ASSERT_OK(dbfull()->TEST_WaitForCompact()); + + ASSERT_TRUE(SyncPointsCalled()) + << "failed for compaction path to test: " << compaction_path_to_test; + DisableSyncPoints(); + // After compaction, we have LSM tree: + // + // memtable: m1 [ k2:new@4, k1:new@3] + // L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1] + ASSERT_EQ(1, NumTableFilesAtLevel(0)) + << "failed for compaction path to test: " << compaction_path_to_test; + SequenceNumber compact_output_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + ASSERT_OK(Flush()) << "failed for compaction path to test: " + << compaction_path_to_test; + // After flush, we have LSM tree: + // + // L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2, + // k1:old@1] + ASSERT_EQ(2, NumTableFilesAtLevel(0)) + << "failed for compaction path to test: " << compaction_path_to_test; + SequenceNumber flushed_file_largest_seqno = + GetLatestL0FileLargestSeqnoHelper(); + + // To verify there isn't any file misorder leading to returning a old + // value of "k1" , which is caused by flushed table s4 has a + // smaller largest seqno than the compaction output file s3's largest seqno + // while the flushed table has the newer version of the value + // than the compaction output file's. + ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno) + << "failed for compaction path to test: " << compaction_path_to_test; + EXPECT_EQ(Get("k1"), "new") + << "failed for compaction path to test: " << compaction_path_to_test; + } + + Destroy(options_); +} + +class DBCompactionTestL0FilesMisorderCorruptionWithParam + : public DBCompactionTestL0FilesMisorderCorruption, + public testing::WithParamInterface { + public: + DBCompactionTestL0FilesMisorderCorruptionWithParam() + : DBCompactionTestL0FilesMisorderCorruption() {} +}; + +// TODO: add `CompactionStyle::kCompactionStyleLevel` to testing parameter, +// which requires careful unit test +// design for ingesting file to L0 and CompactRange()/CompactFile() to L0 +INSTANTIATE_TEST_CASE_P( + DBCompactionTestL0FilesMisorderCorruptionWithParam, + DBCompactionTestL0FilesMisorderCorruptionWithParam, + ::testing::Values(CompactionStyle::kCompactionStyleUniversal, + CompactionStyle::kCompactionStyleFIFO)); + +TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam, + FlushAfterIntraL0CompactFileWithIngestedFile) { + SetupOptions(GetParam(), "CompactFile"); + DestroyAndReopen(options_); + + // To create below LSM tree + // (key:value@n indicates key-value pair has seqno "n", L0 is sorted): + // + // memtable: m1 [ k2:new@4, k1:new@3] + // L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1] + // + // (1) Create an existing SST file s0 + ASSERT_OK(Put("k1", "old")); + ASSERT_OK(Put("k3", "old")); + ASSERT_OK(Flush()); + ASSERT_EQ(1, NumTableFilesAtLevel(0)); + + // (2) Create memtable m1. 
+class DBCompactionTestL0FilesMisorderCorruptionWithParam
+    : public DBCompactionTestL0FilesMisorderCorruption,
+      public testing::WithParamInterface<CompactionStyle> {
+ public:
+  DBCompactionTestL0FilesMisorderCorruptionWithParam()
+      : DBCompactionTestL0FilesMisorderCorruption() {}
+};
+
+// TODO: add `CompactionStyle::kCompactionStyleLevel` to the testing
+// parameters, which requires careful unit test design for ingesting files
+// to L0 and CompactRange()/CompactFile() to L0
+INSTANTIATE_TEST_CASE_P(
+    DBCompactionTestL0FilesMisorderCorruptionWithParam,
+    DBCompactionTestL0FilesMisorderCorruptionWithParam,
+    ::testing::Values(CompactionStyle::kCompactionStyleUniversal,
+                      CompactionStyle::kCompactionStyleFIFO));
+
+TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam,
+       FlushAfterIntraL0CompactFileWithIngestedFile) {
+  SetupOptions(GetParam(), "CompactFile");
+  DestroyAndReopen(options_);
+
+  // To create the below LSM tree
+  // (key:value@n indicates the key-value pair has seqno "n", L0 is sorted):
+  //
+  // memtable: m1 [ k2:new@4, k1:new@3]
+  // L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1]
+  //
+  // (1) Create an existing SST file s0
+  ASSERT_OK(Put("k1", "old"));
+  ASSERT_OK(Put("k3", "old"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // (2) Create memtable m1. Note that it contains an overlapped key with s0
+  ASSERT_OK(Put("k1", "new"));  // overlapped key
+  ASSERT_OK(Put("k2", "new"));
+
+  // (3) Ingest two SST files s1, s2
+  IngestOneKeyValue(dbfull(), "k4", "dummy", options_);
+  IngestOneKeyValue(dbfull(), "k5", "dummy", options_);
+  // Up to now, L0 contains s0, s1, s2
+  ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+  ColumnFamilyMetaData cf_meta_data;
+  db_->GetColumnFamilyMetaData(&cf_meta_data);
+  ASSERT_EQ(cf_meta_data.levels[0].files.size(), 3);
+  std::vector<std::string> input_files;
+  for (const auto& file : cf_meta_data.levels[0].files) {
+    input_files.push_back(file.name);
   }
-  for (int i = 6; i < 10; ++i) {
-    ASSERT_EQ(value2, Get(Key(i)));
+  ASSERT_EQ(input_files.size(), 3);
+
+  Status s = db_->CompactFiles(CompactionOptions(), input_files, 0);
+  // After compaction, we have LSM tree:
+  //
+  // memtable: m1 [ k2:new@4, k1:new@3]
+  // L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1]
+  ASSERT_OK(s);
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  SequenceNumber compact_output_file_largest_seqno =
+      GetLatestL0FileLargestSeqnoHelper();
+
+  ASSERT_OK(Flush());
+  // After flush, we have LSM tree:
+  //
+  // L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2,
+  //                                k1:old@1]
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  SequenceNumber flushed_file_largest_seqno =
+      GetLatestL0FileLargestSeqnoHelper();
+
+  // Verify there is no file misorder that would return an old value of
+  // "k1". Such misorder would be caused by the flushed table s4 having a
+  // smaller largest seqno than the compaction output file s3's largest
+  // seqno while holding the newer version of the value.
+  ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno);
+  EXPECT_EQ(Get("k1"), "new");
+
+  Destroy(options_);
+}
+
+TEST_P(DBCompactionTestL0FilesMisorderCorruptionWithParam,
+       FlushAfterIntraL0CompactRangeWithIngestedFile) {
+  SetupOptions(GetParam(), "CompactRange");
+  DestroyAndReopen(options_);
+
+  // To create the below LSM tree
+  // (key:value@n indicates the key-value pair has seqno "n", L0 is sorted):
+  //
+  // memtable: m1 [ k2:new@4, k1:new@3]
+  // L0: s2[k5:dummy@6], s1[k4:dummy@5], s0[k3:old@2, k1:old@1]
+  //
+  // (1) Create an existing SST file s0
+  ASSERT_OK(Put("k1", "old"));
+  ASSERT_OK(Put("k3", "old"));
+  ASSERT_OK(Flush());
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+
+  // (2) Create memtable m1. Note that it contains an overlapped key with s0
+  ASSERT_OK(Put("k1", "new"));  // overlapped key
+  ASSERT_OK(Put("k2", "new"));
+
+  // (3) Ingest two SST files s1, s2
+  IngestOneKeyValue(dbfull(), "k4", "dummy", options_);
+  IngestOneKeyValue(dbfull(), "k5", "dummy", options_);
+  // Up to now, L0 contains s0, s1, s2
+  ASSERT_EQ(3, NumTableFilesAtLevel(0));
+
+  if (options_.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    SetupSyncPoints("CompactRange");
+  }
+  // `start` and `end` are carefully chosen so that the compacted range:
+  // (1) doesn't overlap with the memtable, so the memtable won't be flushed
+  // (2) targets compacting s0 with s1 and s2
+  Slice start("k3"), end("k5");
+  ASSERT_OK(db_->CompactRange(CompactRangeOptions(), &start, &end));
+  // After compaction, we have LSM tree:
+  //
+  // memtable: m1 [ k2:new@4, k1:new@3]
+  // L0: s3[k5:dummy@6, k4:dummy@5, k3:old@2, k1:old@1]
+  if (options_.compaction_style == CompactionStyle::kCompactionStyleFIFO) {
+    ASSERT_TRUE(SyncPointsCalled());
+    DisableSyncPoints();
   }
+  ASSERT_EQ(1, NumTableFilesAtLevel(0));
+  SequenceNumber compact_output_file_largest_seqno =
+      GetLatestL0FileLargestSeqnoHelper();
+
+  ASSERT_OK(Flush());
+  // After flush, we have LSM tree:
+  //
+  // L0: s4[k2:new@4, k1:new@3], s3[k5:dummy@6, k4:dummy@5, k3:old@2,
+  //                                k1:old@1]
+  ASSERT_EQ(2, NumTableFilesAtLevel(0));
+  SequenceNumber flushed_file_largest_seqno =
+      GetLatestL0FileLargestSeqnoHelper();
+
+  // Verify there is no file misorder that would return an old value of
+  // "k1". Such misorder would be caused by the flushed table s4 having a
+  // smaller largest seqno than the compaction output file s3's largest
+  // seqno while holding the newer version of the value.
+  ASSERT_TRUE(flushed_file_largest_seqno < compact_output_file_largest_seqno);
+  EXPECT_EQ(Get("k1"), "new");
+
+  Destroy(options_);
 }
 
 TEST_P(DBCompactionTestWithBottommostParam, SequenceKeysManualCompaction) {
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index a431111d4..657d2870f 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -5318,6 +5318,7 @@ Status DBImpl::IngestExternalFiles(
   // Run ingestion jobs.
if (status.ok()) { for (size_t i = 0; i != num_cfs; ++i) { + mutex_.AssertHeld(); status = ingestion_jobs[i].Run(); if (!status.ok()) { break; @@ -5522,6 +5523,7 @@ Status DBImpl::CreateColumnFamilyWithImport( num_running_ingest_file_++; assert(!cfd->IsDropped()); + mutex_.AssertHeld(); status = import_job.Run(); // Install job edit [Mutex will be unlocked here] diff --git a/db/db_impl/db_impl_compaction_flush.cc b/db/db_impl/db_impl_compaction_flush.cc index a605fac87..31e2d07ce 100644 --- a/db/db_impl/db_impl_compaction_flush.cc +++ b/db/db_impl/db_impl_compaction_flush.cc @@ -1687,14 +1687,15 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); + for (const auto& f : vstorage->LevelFiles(level)) { edit.DeleteFile(level, f->fd.GetNumber()); edit.AddFile( to_level, f->fd.GetNumber(), f->fd.GetPathId(), f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, - f->oldest_ancester_time, f->file_creation_time, f->file_checksum, - f->file_checksum_func_name, f->unique_id); + f->oldest_ancester_time, f->file_creation_time, f->epoch_number, + f->file_checksum, f->file_checksum_func_name, f->unique_id); } ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), @@ -3334,8 +3335,8 @@ Status DBImpl::BackgroundCompaction(bool* made_progress, f->fd.GetFileSize(), f->smallest, f->largest, f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, - f->file_creation_time, f->file_checksum, f->file_checksum_func_name, - f->unique_id); + f->file_creation_time, f->epoch_number, f->file_checksum, + f->file_checksum_func_name, f->unique_id); ROCKS_LOG_BUFFER( log_buffer, diff --git a/db/db_impl/db_impl_experimental.cc b/db/db_impl/db_impl_experimental.cc index c1b1e4137..035fdbd41 100644 --- a/db/db_impl/db_impl_experimental.cc +++ b/db/db_impl/db_impl_experimental.cc @@ -136,7 +136,7 @@ Status DBImpl::PromoteL0(ColumnFamilyHandle* column_family, int target_level) { f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, - f->file_creation_time, f->file_checksum, + f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id); } diff --git a/db/db_impl/db_impl_open.cc b/db/db_impl/db_impl_open.cc index 40ffa2e85..f626df9ed 100644 --- a/db/db_impl/db_impl_open.cc +++ b/db/db_impl/db_impl_open.cc @@ -1515,7 +1515,7 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, .PermitUncheckedError(); // ignore error const uint64_t current_time = static_cast(_current_time); meta.oldest_ancester_time = current_time; - + meta.epoch_number = cfd->NewEpochNumber(); { auto write_hint = cfd->CalculateSSTWriteHint(0); mutex_.Unlock(); @@ -1583,13 +1583,13 @@ Status DBImpl::WriteLevel0TableForRecovery(int job_id, ColumnFamilyData* cfd, constexpr int level = 0; if (s.ok() && has_output) { - edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(), - meta.fd.GetFileSize(), meta.smallest, meta.largest, - meta.fd.smallest_seqno, meta.fd.largest_seqno, - meta.marked_for_compaction, meta.temperature, - meta.oldest_blob_file_number, meta.oldest_ancester_time, - meta.file_creation_time, meta.file_checksum, - meta.file_checksum_func_name, meta.unique_id); + edit->AddFile( + 
level, meta.fd.GetNumber(), meta.fd.GetPathId(), meta.fd.GetFileSize(), + meta.smallest, meta.largest, meta.fd.smallest_seqno, + meta.fd.largest_seqno, meta.marked_for_compaction, meta.temperature, + meta.oldest_blob_file_number, meta.oldest_ancester_time, + meta.file_creation_time, meta.epoch_number, meta.file_checksum, + meta.file_checksum_func_name, meta.unique_id); for (const auto& blob : blob_file_additions) { edit->AddBlobFile(blob); diff --git a/db/db_test.cc b/db/db_test.cc index 9575248b4..1688745c1 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -1203,6 +1203,8 @@ void CheckColumnFamilyMeta( file_meta_from_files.file_creation_time); ASSERT_GE(file_meta_from_cf.file_creation_time, start_time); ASSERT_LE(file_meta_from_cf.file_creation_time, end_time); + ASSERT_EQ(file_meta_from_cf.epoch_number, + file_meta_from_files.epoch_number); ASSERT_GE(file_meta_from_cf.oldest_ancester_time, start_time); ASSERT_LE(file_meta_from_cf.oldest_ancester_time, end_time); // More from FileStorageInfo @@ -1253,6 +1255,7 @@ void CheckLiveFilesMeta( ASSERT_EQ(meta.largestkey, expected_meta.largest.user_key().ToString()); ASSERT_EQ(meta.oldest_blob_file_number, expected_meta.oldest_blob_file_number); + ASSERT_EQ(meta.epoch_number, expected_meta.epoch_number); // More from FileStorageInfo ASSERT_EQ(meta.file_type, kTableFile); diff --git a/db/db_test2.cc b/db/db_test2.cc index 779b8bf13..b4f1664f4 100644 --- a/db/db_test2.cc +++ b/db/db_test2.cc @@ -14,6 +14,7 @@ #include "db/db_test_util.h" #include "db/read_callback.h" +#include "db/version_edit.h" #include "options/options_helper.h" #include "port/port.h" #include "port/stack_trace.h" @@ -33,6 +34,18 @@ namespace ROCKSDB_NAMESPACE { class DBTest2 : public DBTestBase { public: DBTest2() : DBTestBase("db_test2", /*env_do_fsync=*/true) {} + std::vector GetLevelFileMetadatas(int level, int cf = 0) { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + ColumnFamilyData* const cfd = + versions->GetColumnFamilySet()->GetColumnFamily(cf); + assert(cfd); + Version* const current = cfd->current(); + assert(current); + VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + return storage_info->LevelFiles(level); + } }; #ifndef ROCKSDB_LITE @@ -7325,6 +7338,218 @@ TEST_F(DBTest2, PointInTimeRecoveryWithSyncFailureInCFCreation) { ReopenWithColumnFamilies({"default", "test1", "test2"}, options); } +#ifndef ROCKSDB_LITE +TEST_F(DBTest2, SortL0FilesByEpochNumber) { + Options options = CurrentOptions(); + options.num_levels = 1; + options.compaction_style = kCompactionStyleUniversal; + DestroyAndReopen(options); + + // Set up L0 files to be sorted by their epoch_number + ASSERT_OK(Put("key1", "seq1")); + + SstFileWriter sst_file_writer{EnvOptions(), options}; + std::string external_file1 = dbname_ + "/test_files1.sst"; + std::string external_file2 = dbname_ + "/test_files2.sst"; + ASSERT_OK(sst_file_writer.Open(external_file1)); + ASSERT_OK(sst_file_writer.Put("key2", "seq0")); + ASSERT_OK(sst_file_writer.Finish()); + ASSERT_OK(sst_file_writer.Open(external_file2)); + ASSERT_OK(sst_file_writer.Put("key3", "seq0")); + ASSERT_OK(sst_file_writer.Finish()); + + ASSERT_OK(Put("key4", "seq2")); + ASSERT_OK(Flush()); + + auto* handle = db_->DefaultColumnFamily(); + ASSERT_OK(db_->IngestExternalFile(handle, {external_file1, external_file2}, + IngestExternalFileOptions())); + + // To verify L0 files are sorted by epoch_number in descending order + // instead of largest_seqno + std::vector level0_files = 
GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 3); + + EXPECT_EQ(level0_files[0]->epoch_number, 3); + EXPECT_EQ(level0_files[0]->fd.largest_seqno, 0); + ASSERT_EQ(level0_files[0]->num_entries, 1); + ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key3")); + + EXPECT_EQ(level0_files[1]->epoch_number, 2); + EXPECT_EQ(level0_files[1]->fd.largest_seqno, 0); + ASSERT_EQ(level0_files[1]->num_entries, 1); + ASSERT_TRUE(level0_files[1]->largest.user_key() == Slice("key2")); + + EXPECT_EQ(level0_files[2]->epoch_number, 1); + EXPECT_EQ(level0_files[2]->fd.largest_seqno, 2); + ASSERT_EQ(level0_files[2]->num_entries, 2); + ASSERT_TRUE(level0_files[2]->largest.user_key() == Slice("key4")); + ASSERT_TRUE(level0_files[2]->smallest.user_key() == Slice("key1")); + + // To verify compacted file is assigned with the minimum epoch_number + // among input files' + ASSERT_OK(db_->CompactRange(CompactRangeOptions(), nullptr, nullptr)); + + level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 1); + EXPECT_EQ(level0_files[0]->epoch_number, 1); + ASSERT_EQ(level0_files[0]->num_entries, 4); + ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key4")); + ASSERT_TRUE(level0_files[0]->smallest.user_key() == Slice("key1")); +} + +TEST_F(DBTest2, SameEpochNumberAfterCompactRangeChangeLevel) { + Options options = CurrentOptions(); + options.num_levels = 7; + options.compaction_style = CompactionStyle::kCompactionStyleLevel; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + + // Set up the file in L1 to be moved to L0 in later step of CompactRange() + ASSERT_OK(Put("key1", "seq1")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1, 0); + std::vector level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 0); + std::vector level1_files = GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level1_files.size(), 1); + std::vector level2_files = GetLevelFileMetadatas(2 /* level*/); + ASSERT_EQ(level2_files.size(), 0); + + ASSERT_EQ(level1_files[0]->epoch_number, 1); + + // To verify CompactRange() moving file to L0 still keeps the file's + // epoch_number + CompactRangeOptions croptions; + croptions.change_level = true; + croptions.target_level = 0; + ASSERT_OK(db_->CompactRange(croptions, nullptr, nullptr)); + level0_files = GetLevelFileMetadatas(0 /* level*/); + level1_files = GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level0_files.size(), 1); + ASSERT_EQ(level1_files.size(), 0); + + EXPECT_EQ(level0_files[0]->epoch_number, 1); + + ASSERT_EQ(level0_files[0]->num_entries, 1); + ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key1")); +} + +TEST_F(DBTest2, RecoverEpochNumber) { + for (bool allow_ingest_behind : {true, false}) { + Options options = CurrentOptions(); + options.allow_ingest_behind = allow_ingest_behind; + options.num_levels = 7; + options.compaction_style = kCompactionStyleLevel; + options.disable_auto_compactions = true; + DestroyAndReopen(options); + CreateAndReopenWithCF({"cf1"}, options); + VersionSet* versions = dbfull()->GetVersionSet(); + assert(versions); + const ColumnFamilyData* default_cf = + versions->GetColumnFamilySet()->GetDefault(); + const ColumnFamilyData* cf1 = + versions->GetColumnFamilySet()->GetColumnFamily("cf1"); + + // Set up files in default CF to recover in later step + ASSERT_OK(Put("key1", "epoch1")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1 /* level*/, 0 /* cf*/); + ASSERT_OK(Put("key2", "epoch2")); + ASSERT_OK(Flush()); + + std::vector level0_files = + 
GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 1); + ASSERT_EQ(level0_files[0]->epoch_number, + allow_ingest_behind + ? 2 + kReservedEpochNumberForFileIngestedBehind + : 2); + ASSERT_EQ(level0_files[0]->num_entries, 1); + ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key2")); + + std::vector level1_files = + GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level1_files.size(), 1); + ASSERT_EQ(level1_files[0]->epoch_number, + allow_ingest_behind + ? 1 + kReservedEpochNumberForFileIngestedBehind + : 1); + ASSERT_EQ(level1_files[0]->num_entries, 1); + ASSERT_TRUE(level1_files[0]->largest.user_key() == Slice("key1")); + + // Set up files in cf1 to recover in later step + ASSERT_OK(Put(1 /* cf */, "cf1_key1", "epoch1")); + ASSERT_OK(Flush(1 /* cf */)); + + std::vector level0_files_cf1 = + GetLevelFileMetadatas(0 /* level*/, 1 /* cf*/); + ASSERT_EQ(level0_files_cf1.size(), 1); + ASSERT_EQ(level0_files_cf1[0]->epoch_number, + allow_ingest_behind + ? 1 + kReservedEpochNumberForFileIngestedBehind + : 1); + ASSERT_EQ(level0_files_cf1[0]->num_entries, 1); + ASSERT_TRUE(level0_files_cf1[0]->largest.user_key() == Slice("cf1_key1")); + + ASSERT_EQ(default_cf->GetNextEpochNumber(), + allow_ingest_behind + ? 3 + kReservedEpochNumberForFileIngestedBehind + : 3); + ASSERT_EQ(cf1->GetNextEpochNumber(), + allow_ingest_behind + ? 2 + kReservedEpochNumberForFileIngestedBehind + : 2); + + // To verify epoch_number of files of different levels/CFs are + // persisted and recovered correctly + ReopenWithColumnFamilies({"default", "cf1"}, options); + versions = dbfull()->GetVersionSet(); + assert(versions); + default_cf = versions->GetColumnFamilySet()->GetDefault(); + cf1 = versions->GetColumnFamilySet()->GetColumnFamily("cf1"); + + level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 1); + EXPECT_EQ(level0_files[0]->epoch_number, + allow_ingest_behind + ? 2 + kReservedEpochNumberForFileIngestedBehind + : 2); + ASSERT_EQ(level0_files[0]->num_entries, 1); + ASSERT_TRUE(level0_files[0]->largest.user_key() == Slice("key2")); + + level1_files = GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level1_files.size(), 1); + EXPECT_EQ(level1_files[0]->epoch_number, + allow_ingest_behind + ? 1 + kReservedEpochNumberForFileIngestedBehind + : 1); + ASSERT_EQ(level1_files[0]->num_entries, 1); + ASSERT_TRUE(level1_files[0]->largest.user_key() == Slice("key1")); + + level0_files_cf1 = GetLevelFileMetadatas(0 /* level*/, 1 /* cf*/); + ASSERT_EQ(level0_files_cf1.size(), 1); + EXPECT_EQ(level0_files_cf1[0]->epoch_number, + allow_ingest_behind + ? 1 + kReservedEpochNumberForFileIngestedBehind + : 1); + ASSERT_EQ(level0_files_cf1[0]->num_entries, 1); + ASSERT_TRUE(level0_files_cf1[0]->largest.user_key() == Slice("cf1_key1")); + + // To verify next epoch number is recovered correctly + EXPECT_EQ(default_cf->GetNextEpochNumber(), + allow_ingest_behind + ? 3 + kReservedEpochNumberForFileIngestedBehind + : 3); + EXPECT_EQ(cf1->GetNextEpochNumber(), + allow_ingest_behind + ? 
2 + kReservedEpochNumberForFileIngestedBehind + : 2); + } +} + +#endif // ROCKSDB_LITE + TEST_F(DBTest2, RenameDirectory) { Options options = CurrentOptions(); DestroyAndReopen(options); diff --git a/db/experimental.cc b/db/experimental.cc index d838ebde5..cb6286b1f 100644 --- a/db/experimental.cc +++ b/db/experimental.cc @@ -112,7 +112,7 @@ Status UpdateManifestForFilesState( lf->smallest, lf->largest, lf->fd.smallest_seqno, lf->fd.largest_seqno, lf->marked_for_compaction, temp, lf->oldest_blob_file_number, lf->oldest_ancester_time, - lf->file_creation_time, lf->file_checksum, + lf->file_creation_time, lf->epoch_number, lf->file_checksum, lf->file_checksum_func_name, lf->unique_id); } } diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc index ba1277eab..dfb967268 100644 --- a/db/external_sst_file_ingestion_job.cc +++ b/db/external_sst_file_ingestion_job.cc @@ -469,8 +469,11 @@ Status ExternalSstFileIngestionJob::Run() { f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, f.assigned_seqno, f.assigned_seqno, false, f.file_temperature, kInvalidBlobFileNumber, - oldest_ancester_time, current_time, f.file_checksum, - f.file_checksum_func_name, f.unique_id); + oldest_ancester_time, current_time, + ingestion_options_.ingest_behind + ? kReservedEpochNumberForFileIngestedBehind + : cfd_->NewEpochNumber(), + f.file_checksum, f.file_checksum_func_name, f.unique_id); f_metadata.temperature = f.file_temperature; edit_.AddFile(f.picked_level, f_metadata); } diff --git a/db/flush_job.cc b/db/flush_job.cc index 645e42f44..c63ccec3e 100644 --- a/db/flush_job.cc +++ b/db/flush_job.cc @@ -23,6 +23,7 @@ #include "db/memtable_list.h" #include "db/merge_context.h" #include "db/range_tombstone_fragmenter.h" +#include "db/version_edit.h" #include "db/version_set.h" #include "file/file_util.h" #include "file/filename.h" @@ -199,6 +200,7 @@ void FlushJob::PickMemTable() { // path 0 for level 0 file. meta_.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0); + meta_.epoch_number = cfd_->NewEpochNumber(); base_ = cfd_->current(); base_->Ref(); // it is likely that we do not need this reference @@ -999,8 +1001,9 @@ Status FlushJob::WriteLevel0Table() { meta_.fd.smallest_seqno, meta_.fd.largest_seqno, meta_.marked_for_compaction, meta_.temperature, meta_.oldest_blob_file_number, meta_.oldest_ancester_time, - meta_.file_creation_time, meta_.file_checksum, - meta_.file_checksum_func_name, meta_.unique_id); + meta_.file_creation_time, meta_.epoch_number, + meta_.file_checksum, meta_.file_checksum_func_name, + meta_.unique_id); edit_->SetBlobFileAdditions(std::move(blob_file_additions)); } diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc index ae342cc0b..c59ef11ab 100644 --- a/db/import_column_family_job.cc +++ b/db/import_column_family_job.cc @@ -4,6 +4,7 @@ // COPYING file in the root directory) and Apache 2.0 License // (found in the LICENSE.Apache file in the root directory). 
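The import path below stages incoming files in a dummy `VersionBuilder`/`VersionStorageInfo` created with `EpochNumberRequirement::kMightMissing`, recovers any missing epoch numbers, and only then records the files in the real `VersionEdit`. A simplified sketch of the L0 backfill invariant (assumed from the `SortRepairedDBL0ByEpochNumber` test later in this diff: the existing newest-first seqno order is preserved, with the oldest file receiving the smallest epoch number):

#include <cstdint>
#include <vector>

constexpr uint64_t kUnknownEpoch = 0;

struct FileModel {
  uint64_t largest_seqno;
  uint64_t epoch_number;
};

// Backfill missing epoch numbers over an L0 list that is already sorted
// newest-first by seqno: walk from oldest to newest so relative order is
// kept once the level is re-sorted by epoch_number.
void RecoverEpochNumbers(std::vector<FileModel>& newest_first_l0,
                         uint64_t& next_epoch_number) {
  for (auto it = newest_first_l0.rbegin(); it != newest_first_l0.rend(); ++it) {
    if (it->epoch_number == kUnknownEpoch) {
      it->epoch_number = next_epoch_number++;
    }
  }
}

int main() {
  std::vector<FileModel> l0 = {{/*largest_seqno=*/9, kUnknownEpoch},
                               {/*largest_seqno=*/5, kUnknownEpoch},
                               {/*largest_seqno=*/2, kUnknownEpoch}};
  uint64_t next_epoch = 1;
  RecoverEpochNumbers(l0, next_epoch);
  // Newest-first order preserved: epochs are now 3, 2, 1; next_epoch == 4.
  return l0[0].epoch_number == 3 && next_epoch == 4 ? 0 : 1;
}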
+#include "db/version_builder.h" #ifndef ROCKSDB_LITE #include "db/import_column_family_job.h" @@ -119,38 +120,46 @@ Status ImportColumnFamilyJob::Run() { static_cast(temp_current_time); } - VersionBuilder version_builder( + // Recover files' epoch number using dummy VersionStorageInfo + VersionBuilder dummy_version_builder( cfd_->current()->version_set()->file_options(), cfd_->ioptions(), cfd_->table_cache(), cfd_->current()->storage_info(), cfd_->current()->version_set(), cfd_->GetFileMetadataCacheReservationManager()); - VersionStorageInfo vstorage( + VersionStorageInfo dummy_vstorage( &cfd_->internal_comparator(), cfd_->user_comparator(), cfd_->NumberLevels(), cfd_->ioptions()->compaction_style, - nullptr /* src_vstorage */, cfd_->ioptions()->force_consistency_checks); + nullptr /* src_vstorage */, cfd_->ioptions()->force_consistency_checks, + EpochNumberRequirement::kMightMissing); Status s; for (size_t i = 0; s.ok() && i < files_to_import_.size(); ++i) { const auto& f = files_to_import_[i]; const auto& file_metadata = metadata_[i]; - VersionEdit version_edit; - version_edit.AddFile( + VersionEdit dummy_version_edit; + dummy_version_edit.AddFile( file_metadata.level, f.fd.GetNumber(), f.fd.GetPathId(), f.fd.GetFileSize(), f.smallest_internal_key, f.largest_internal_key, file_metadata.smallest_seqno, file_metadata.largest_seqno, false, file_metadata.temperature, kInvalidBlobFileNumber, oldest_ancester_time, - current_time, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - f.unique_id); - s = version_builder.Apply(&version_edit); + current_time, file_metadata.epoch_number, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, f.unique_id); + s = dummy_version_builder.Apply(&dummy_version_edit); } if (s.ok()) { - s = version_builder.SaveTo(&vstorage); + s = dummy_version_builder.SaveTo(&dummy_vstorage); } + if (s.ok()) { + dummy_vstorage.RecoverEpochNumbers(cfd_); + } + + // Record changes from this CF import in VersionEdit, including files with + // recovered epoch numbers if (s.ok()) { edit_.SetColumnFamily(cfd_->GetID()); - for (int level = 0; level < vstorage.num_levels(); level++) { - for (FileMetaData* file_meta : vstorage.LevelFiles(level)) { + for (int level = 0; level < dummy_vstorage.num_levels(); level++) { + for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) { edit_.AddFile(level, *file_meta); // If incoming sequence number is higher, update local sequence number. if (file_meta->fd.largest_seqno > versions_->LastSequence()) { @@ -161,8 +170,10 @@ Status ImportColumnFamilyJob::Run() { } } } - for (int level = 0; level < vstorage.num_levels(); level++) { - for (FileMetaData* file_meta : vstorage.LevelFiles(level)) { + + // Release resources occupied by the dummy VersionStorageInfo + for (int level = 0; level < dummy_vstorage.num_levels(); level++) { + for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) { file_meta->refs--; if (file_meta->refs <= 0) { delete file_meta; @@ -299,7 +310,6 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo( return status; } - } // namespace ROCKSDB_NAMESPACE #endif // !ROCKSDB_LITE diff --git a/db/repair.cc b/db/repair.cc index 1829a79f2..ae26f9c6f 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -59,6 +59,7 @@ // Store per-table metadata (smallest, largest, largest-seq#, ...) // in the table's meta section to speed up ScanTable. 
+#include "db/version_builder.h" #ifndef ROCKSDB_LITE #include @@ -640,38 +641,79 @@ class Repairer { for (const auto& cf_id_and_tables : cf_id_to_tables) { auto* cfd = vset_.GetColumnFamilySet()->GetColumnFamily(cf_id_and_tables.first); - VersionEdit edit; - edit.SetComparatorName(cfd->user_comparator()->Name()); - edit.SetLogNumber(0); - edit.SetNextFile(next_file_number_); - edit.SetColumnFamily(cfd->GetID()); - // TODO(opt): separate out into multiple levels + // Recover files' epoch number using dummy VersionStorageInfo + VersionBuilder dummy_version_builder( + cfd->current()->version_set()->file_options(), cfd->ioptions(), + cfd->table_cache(), cfd->current()->storage_info(), + cfd->current()->version_set(), + cfd->GetFileMetadataCacheReservationManager()); + VersionStorageInfo dummy_vstorage( + &cfd->internal_comparator(), cfd->user_comparator(), + cfd->NumberLevels(), cfd->ioptions()->compaction_style, + nullptr /* src_vstorage */, cfd->ioptions()->force_consistency_checks, + EpochNumberRequirement::kMightMissing); + Status s; + VersionEdit dummy_edit; for (const auto* table : cf_id_and_tables.second) { - edit.AddFile( + // TODO(opt): separate out into multiple levels + dummy_edit.AddFile( 0, table->meta.fd.GetNumber(), table->meta.fd.GetPathId(), table->meta.fd.GetFileSize(), table->meta.smallest, table->meta.largest, table->meta.fd.smallest_seqno, table->meta.fd.largest_seqno, table->meta.marked_for_compaction, table->meta.temperature, table->meta.oldest_blob_file_number, table->meta.oldest_ancester_time, table->meta.file_creation_time, - table->meta.file_checksum, table->meta.file_checksum_func_name, - table->meta.unique_id); + table->meta.epoch_number, table->meta.file_checksum, + table->meta.file_checksum_func_name, table->meta.unique_id); } - assert(next_file_number_ > 0); - vset_.MarkFileNumberUsed(next_file_number_ - 1); - mutex_.Lock(); - std::unique_ptr db_dir; - Status status = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), - &db_dir, nullptr); - if (status.ok()) { - status = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), - &edit, &mutex_, db_dir.get(), - false /* new_descriptor_log */); + s = dummy_version_builder.Apply(&dummy_edit); + if (s.ok()) { + s = dummy_version_builder.SaveTo(&dummy_vstorage); } - mutex_.Unlock(); - if (!status.ok()) { - return status; + if (s.ok()) { + dummy_vstorage.RecoverEpochNumbers(cfd); + } + if (s.ok()) { + // Record changes from this repair in VersionEdit, including files with + // recovered epoch numbers + VersionEdit edit; + edit.SetComparatorName(cfd->user_comparator()->Name()); + edit.SetLogNumber(0); + edit.SetNextFile(next_file_number_); + edit.SetColumnFamily(cfd->GetID()); + for (int level = 0; level < dummy_vstorage.num_levels(); ++level) { + for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) { + edit.AddFile(level, *file_meta); + } + } + + // Release resources occupied by the dummy VersionStorageInfo + for (int level = 0; level < dummy_vstorage.num_levels(); ++level) { + for (FileMetaData* file_meta : dummy_vstorage.LevelFiles(level)) { + file_meta->refs--; + if (file_meta->refs <= 0) { + delete file_meta; + } + } + } + + // Persist record of changes + assert(next_file_number_ > 0); + vset_.MarkFileNumberUsed(next_file_number_ - 1); + mutex_.Lock(); + std::unique_ptr db_dir; + s = env_->GetFileSystem()->NewDirectory(dbname_, IOOptions(), &db_dir, + nullptr); + if (s.ok()) { + s = vset_.LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(), &edit, + &mutex_, db_dir.get(), + false /* 
new_descriptor_log */); + } + mutex_.Unlock(); + } + if (!s.ok()) { + return s; } } return Status::OK(); diff --git a/db/repair_test.cc b/db/repair_test.cc index 644a9270d..f80f2b722 100644 --- a/db/repair_test.cc +++ b/db/repair_test.cc @@ -62,8 +62,62 @@ class RepairTest : public DBTestBase { ASSERT_GT(verify_passed, 0); SyncPoint::GetInstance()->DisableProcessing(); } + + std::vector GetLevelFileMetadatas(int level, int cf = 0) { + VersionSet* const versions = dbfull()->GetVersionSet(); + assert(versions); + ColumnFamilyData* const cfd = + versions->GetColumnFamilySet()->GetColumnFamily(cf); + assert(cfd); + Version* const current = cfd->current(); + assert(current); + VersionStorageInfo* const storage_info = current->storage_info(); + assert(storage_info); + return storage_info->LevelFiles(level); + } }; +TEST_F(RepairTest, SortRepairedDBL0ByEpochNumber) { + Options options = CurrentOptions(); + DestroyAndReopen(options); + + ASSERT_OK(Put("k1", "oldest")); + ASSERT_OK(Put("k1", "older")); + ASSERT_OK(Flush()); + MoveFilesToLevel(1); + + ASSERT_OK(Put("k1", "old")); + ASSERT_OK(Flush()); + + ASSERT_OK(Put("k1", "new")); + + std::vector level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 1); + ASSERT_EQ(level0_files[0]->epoch_number, 2); + std::vector level1_files = GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level1_files.size(), 1); + ASSERT_EQ(level1_files[0]->epoch_number, 1); + + std::string manifest_path = + DescriptorFileName(dbname_, dbfull()->TEST_Current_Manifest_FileNo()); + Close(); + ASSERT_OK(env_->FileExists(manifest_path)); + ASSERT_OK(env_->DeleteFile(manifest_path)); + + ASSERT_OK(RepairDB(dbname_, CurrentOptions())); + ReopenWithSstIdVerify(); + + EXPECT_EQ(Get("k1"), "new"); + + level0_files = GetLevelFileMetadatas(0 /* level*/); + ASSERT_EQ(level0_files.size(), 3); + EXPECT_EQ(level0_files[0]->epoch_number, 3); + EXPECT_EQ(level0_files[1]->epoch_number, 2); + EXPECT_EQ(level0_files[2]->epoch_number, 1); + level1_files = GetLevelFileMetadatas(1 /* level*/); + ASSERT_EQ(level1_files.size(), 0); +} + TEST_F(RepairTest, LostManifest) { // Add a couple SST files, delete the manifest, and verify RepairDB() saves // the day. 
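Before the `version_builder.cc` hunks below, it may help to see the new L0 ordering rule in isolation: sort newest-first by `epoch_number`, and fall back to the legacy seqno comparison only to break ties. A self-contained restatement over a simplified file struct (not RocksDB's actual classes):

#include <algorithm>
#include <cstdint>
#include <vector>

struct FileModel {
  uint64_t file_number;
  uint64_t smallest_seqno;
  uint64_t largest_seqno;
  uint64_t epoch_number;
};

struct NewestFirstBySeqNoModel {
  bool operator()(const FileModel& lhs, const FileModel& rhs) const {
    if (lhs.largest_seqno != rhs.largest_seqno) {
      return lhs.largest_seqno > rhs.largest_seqno;
    }
    if (lhs.smallest_seqno != rhs.smallest_seqno) {
      return lhs.smallest_seqno > rhs.smallest_seqno;
    }
    return lhs.file_number > rhs.file_number;  // final tiebreak
  }
};

struct NewestFirstByEpochNumberModel {
  bool operator()(const FileModel& lhs, const FileModel& rhs) const {
    if (lhs.epoch_number != rhs.epoch_number) {
      return lhs.epoch_number > rhs.epoch_number;
    }
    return NewestFirstBySeqNoModel()(lhs, rhs);  // seqno only breaks ties
  }
};

int main() {
  // The flushed file (epoch 4) sorts above the compaction output (epoch 1)
  // even though the latter carries a larger largest_seqno from ingestion.
  std::vector<FileModel> l0 = {
      {/*file_number=*/5, /*smallest=*/1, /*largest=*/10, /*epoch=*/1},
      {/*file_number=*/6, /*smallest=*/7, /*largest=*/8, /*epoch=*/4}};
  std::sort(l0.begin(), l0.end(), NewestFirstByEpochNumberModel());
  return l0.front().epoch_number == 4 ? 0 : 1;
}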
diff --git a/db/version_builder.cc b/db/version_builder.cc index 2c65dcf71..bff90b242 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -28,6 +28,7 @@ #include "db/dbformat.h" #include "db/internal_stats.h" #include "db/table_cache.h" +#include "db/version_edit.h" #include "db/version_set.h" #include "port/port.h" #include "table/table_reader.h" @@ -36,25 +37,22 @@ namespace ROCKSDB_NAMESPACE { class VersionBuilder::Rep { - class NewestFirstBySeqNo { + class NewestFirstByEpochNumber { + private: + inline static const NewestFirstBySeqNo seqno_cmp; + public: bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { assert(lhs); assert(rhs); - if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) { - return lhs->fd.largest_seqno > rhs->fd.largest_seqno; - } - - if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) { - return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno; + if (lhs->epoch_number != rhs->epoch_number) { + return lhs->epoch_number > rhs->epoch_number; + } else { + return seqno_cmp(lhs, rhs); } - - // Break ties by file number - return lhs->fd.GetNumber() > rhs->fd.GetNumber(); } }; - class BySmallestKey { public: explicit BySmallestKey(const InternalKeyComparator* cmp) : cmp_(cmp) {} @@ -251,7 +249,8 @@ class VersionBuilder::Rep { std::unordered_map table_file_levels_; // Current compact cursors that should be changed after the last compaction std::unordered_map updated_compact_cursors_; - NewestFirstBySeqNo level_zero_cmp_; + NewestFirstByEpochNumber level_zero_cmp_by_epochno_; + NewestFirstBySeqNo level_zero_cmp_by_seqno_; BySmallestKey level_nonzero_cmp_; // Mutable metadata objects for all blob files affected by the series of @@ -382,43 +381,60 @@ class VersionBuilder::Rep { ExpectedLinkedSsts expected_linked_ssts; if (num_levels_ > 0) { + const InternalKeyComparator* const icmp = vstorage->InternalComparator(); + EpochNumberRequirement epoch_number_requirement = + vstorage->GetEpochNumberRequirement(); + assert(icmp); // Check L0 { - auto l0_checker = [this](const FileMetaData* lhs, - const FileMetaData* rhs) { + auto l0_checker = [this, epoch_number_requirement, icmp]( + const FileMetaData* lhs, + const FileMetaData* rhs) { assert(lhs); assert(rhs); - if (!level_zero_cmp_(lhs, rhs)) { - std::ostringstream oss; - oss << "L0 files are not sorted properly: files #" - << lhs->fd.GetNumber() << ", #" << rhs->fd.GetNumber(); - - return Status::Corruption("VersionBuilder", oss.str()); - } - - if (rhs->fd.smallest_seqno == rhs->fd.largest_seqno) { - // This is an external file that we ingested - const SequenceNumber external_file_seqno = rhs->fd.smallest_seqno; - - if (!(external_file_seqno < lhs->fd.largest_seqno || - external_file_seqno == 0)) { + if (epoch_number_requirement == + EpochNumberRequirement::kMightMissing) { + if (!level_zero_cmp_by_seqno_(lhs, rhs)) { std::ostringstream oss; - oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno " - << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno - << " vs. 
file #" << rhs->fd.GetNumber() - << " with global_seqno " << external_file_seqno; - + oss << "L0 files are not sorted properly: files #" + << lhs->fd.GetNumber() << " with seqnos (largest, smallest) " + << lhs->fd.largest_seqno << " , " << lhs->fd.smallest_seqno + << ", #" << rhs->fd.GetNumber() + << " with seqnos (largest, smallest) " + << rhs->fd.largest_seqno << " , " << rhs->fd.smallest_seqno; return Status::Corruption("VersionBuilder", oss.str()); } - } else if (lhs->fd.smallest_seqno <= rhs->fd.smallest_seqno) { - std::ostringstream oss; - oss << "L0 file #" << lhs->fd.GetNumber() << " with seqno " - << lhs->fd.smallest_seqno << ' ' << lhs->fd.largest_seqno - << " vs. file #" << rhs->fd.GetNumber() << " with seqno " - << rhs->fd.smallest_seqno << ' ' << rhs->fd.largest_seqno; + } else if (epoch_number_requirement == + EpochNumberRequirement::kMustPresent) { + if (lhs->epoch_number == rhs->epoch_number) { + bool range_overlapped = + icmp->Compare(lhs->smallest, rhs->largest) <= 0 && + icmp->Compare(lhs->largest, rhs->smallest) >= 0; + + if (range_overlapped) { + std::ostringstream oss; + oss << "L0 files of same epoch number but overlapping range #" + << lhs->fd.GetNumber() + << " , smallest key: " << lhs->smallest.DebugString(true) + << " , largest key: " << lhs->largest.DebugString(true) + << " , epoch number: " << lhs->epoch_number << " vs. file #" + << rhs->fd.GetNumber() + << " , smallest key: " << rhs->smallest.DebugString(true) + << " , largest key: " << rhs->largest.DebugString(true) + << " , epoch number: " << rhs->epoch_number; + return Status::Corruption("VersionBuilder", oss.str()); + } + } - return Status::Corruption("VersionBuilder", oss.str()); + if (!level_zero_cmp_by_epochno_(lhs, rhs)) { + std::ostringstream oss; + oss << "L0 files are not sorted properly: files #" + << lhs->fd.GetNumber() << " with epoch number " + << lhs->epoch_number << ", #" << rhs->fd.GetNumber() + << " with epoch number " << rhs->epoch_number; + return Status::Corruption("VersionBuilder", oss.str()); + } } return Status::OK(); @@ -433,8 +449,6 @@ class VersionBuilder::Rep { } // Check L1 and up - const InternalKeyComparator* const icmp = vstorage->InternalComparator(); - assert(icmp); for (int level = 1; level < num_levels_; ++level) { auto checker = [this, level, icmp](const FileMetaData* lhs, @@ -1156,6 +1170,25 @@ class VersionBuilder::Rep { } } + bool PromoteEpochNumberRequirementIfNeeded( + VersionStorageInfo* vstorage) const { + if (vstorage->HasMissingEpochNumber()) { + return false; + } + + for (int level = 0; level < num_levels_; ++level) { + for (const auto& pair : levels_[level].added_files) { + const FileMetaData* f = pair.second; + if (f->epoch_number == kUnknownEpochNumber) { + return false; + } + } + } + + vstorage->SetEpochNumberRequirement(EpochNumberRequirement::kMustPresent); + return true; + } + void SaveSSTFilesTo(VersionStorageInfo* vstorage) const { assert(vstorage); @@ -1163,7 +1196,21 @@ class VersionBuilder::Rep { return; } - SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_); + EpochNumberRequirement epoch_number_requirement = + vstorage->GetEpochNumberRequirement(); + + if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) { + bool promoted = PromoteEpochNumberRequirementIfNeeded(vstorage); + if (promoted) { + epoch_number_requirement = vstorage->GetEpochNumberRequirement(); + } + } + + if (epoch_number_requirement == EpochNumberRequirement::kMightMissing) { + SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_seqno_); + } else { + 
SaveSSTFilesTo(vstorage, /* level */ 0, level_zero_cmp_by_epochno_); + } for (int level = 1; level < num_levels_; ++level) { SaveSSTFilesTo(vstorage, level, level_nonzero_cmp_); diff --git a/db/version_builder.h b/db/version_builder.h index 1c022832a..682d60524 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ -11,7 +11,9 @@ #include +#include "db/version_edit.h" #include "rocksdb/file_system.h" +#include "rocksdb/metadata.h" #include "rocksdb/slice_transform.h" namespace ROCKSDB_NAMESPACE { @@ -69,4 +71,22 @@ class BaseReferencedVersionBuilder { Version* version_; }; +class NewestFirstBySeqNo { + public: + bool operator()(const FileMetaData* lhs, const FileMetaData* rhs) const { + assert(lhs); + assert(rhs); + + if (lhs->fd.largest_seqno != rhs->fd.largest_seqno) { + return lhs->fd.largest_seqno > rhs->fd.largest_seqno; + } + + if (lhs->fd.smallest_seqno != rhs->fd.smallest_seqno) { + return lhs->fd.smallest_seqno > rhs->fd.smallest_seqno; + } + + // Break ties by file number + return lhs->fd.GetNumber() > rhs->fd.GetNumber(); + } +}; } // namespace ROCKSDB_NAMESPACE diff --git a/db/version_builder_test.cc b/db/version_builder_test.cc index ee5c3f2e3..ed276c65f 100644 --- a/db/version_builder_test.cc +++ b/db/version_builder_test.cc @@ -64,14 +64,15 @@ class VersionBuilderTest : public testing::Test { uint64_t num_entries = 0, uint64_t num_deletions = 0, bool sampled = false, SequenceNumber smallest_seqno = 0, SequenceNumber largest_seqno = 0, - uint64_t oldest_blob_file_number = kInvalidBlobFileNumber) { + uint64_t oldest_blob_file_number = kInvalidBlobFileNumber, + uint64_t epoch_number = kUnknownEpochNumber) { assert(level < vstorage_.num_levels()); FileMetaData* f = new FileMetaData( file_number, path_id, file_size, GetInternalKey(smallest, smallest_seq), GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, /* marked_for_compact */ false, Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); f->compensated_file_size = file_size; f->num_entries = num_entries; @@ -98,7 +99,8 @@ class VersionBuilderTest : public testing::Test { vstorage_.AddBlobFile(std::move(meta)); } - void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number) { + void AddDummyFile(uint64_t table_file_number, uint64_t blob_file_number, + uint64_t epoch_number) { constexpr int level = 0; constexpr char smallest[] = "bar"; constexpr char largest[] = "foo"; @@ -112,11 +114,11 @@ class VersionBuilderTest : public testing::Test { Add(level, table_file_number, smallest, largest, file_size, path_id, smallest_seq, largest_seq, num_entries, num_deletions, sampled, - smallest_seq, largest_seq, blob_file_number); + smallest_seq, largest_seq, blob_file_number, epoch_number); } void AddDummyFileToEdit(VersionEdit* edit, uint64_t table_file_number, - uint64_t blob_file_number) { + uint64_t blob_file_number, uint64_t epoch_number) { assert(edit); constexpr int level = 0; @@ -132,7 +134,7 @@ class VersionBuilderTest : public testing::Test { level, table_file_number, path_id, file_size, GetInternalKey(smallest), GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, blob_file_number, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, 
kNullUniqueId64x2); } @@ -157,7 +159,13 @@ void UnrefFilesInVersion(VersionStorageInfo* new_vstorage) { } TEST_F(VersionBuilderTest, ApplyAndSaveTo) { - Add(0, 1U, "150", "200", 100U); + Add(0, 1U, "150", "200", 100U, /*path_id*/ 0, + /*smallest_seq*/ 100, /*largest_seq*/ 100, + /*num_entries*/ 0, /*num_deletions*/ 0, + /*sampled*/ false, /*smallest_seqno*/ 0, + /*largest_seqno*/ 0, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 1); Add(1, 66U, "150", "200", 100U); Add(1, 88U, "201", "300", 100U); @@ -177,7 +185,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { version_edit.AddFile( 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.DeleteFile(3, 27U); @@ -204,8 +212,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveTo) { TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { ioptions_.level_compaction_dynamic_level_bytes = true; - Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U); - Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U); + Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 2); + Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 1); Add(4, 6U, "150", "179", 100U); Add(4, 7U, "180", "220", 100U); @@ -220,7 +232,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { version_edit.AddFile( 3, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -250,8 +262,12 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic) { TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { ioptions_.level_compaction_dynamic_level_bytes = true; - Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U); - Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U); + Add(0, 1U, "150", "200", 100U, 0, 200U, 200U, 0, 0, false, 200U, 200U, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 2); + Add(0, 88U, "201", "300", 100U, 0, 100U, 100U, 0, 0, false, 100U, 100U, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 1); Add(4, 6U, "150", "179", 100U); Add(4, 7U, "180", "220", 100U); @@ -266,7 +282,7 @@ TEST_F(VersionBuilderTest, ApplyAndSaveToDynamic2) { version_edit.AddFile( 4, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.DeleteFile(0, 1U); version_edit.DeleteFile(0, 88U); @@ -302,27 +318,27 @@ TEST_F(VersionBuilderTest, ApplyMultipleAndSaveTo) { version_edit.AddFile( 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, 
kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.AddFile( 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.AddFile( 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.AddFile( 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.AddFile( 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); EnvOptions env_options; @@ -361,27 +377,27 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { version_edit.AddFile( 2, 666, 0, 100U, GetInternalKey("301"), GetInternalKey("350"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.AddFile( 2, 676, 0, 100U, GetInternalKey("401"), GetInternalKey("450"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.AddFile( 2, 636, 0, 100U, GetInternalKey("601"), GetInternalKey("650"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.AddFile( 2, 616, 0, 100U, GetInternalKey("501"), GetInternalKey("550"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit.AddFile( 2, 606, 0, 100U, GetInternalKey("701"), GetInternalKey("750"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); 
ASSERT_OK(version_builder.Apply(&version_edit)); @@ -389,14 +405,14 @@ TEST_F(VersionBuilderTest, ApplyDeleteAndSaveTo) { version_edit.AddFile( 2, 808, 0, 100U, GetInternalKey("901"), GetInternalKey("950"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); version_edit2.DeleteFile(2, 616); version_edit2.DeleteFile(2, 636); version_edit.AddFile( 2, 806, 0, 100U, GetInternalKey("801"), GetInternalKey("850"), 200, 200, false, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); ASSERT_OK(version_builder.Apply(&version_edit2)); @@ -502,13 +518,13 @@ TEST_F(VersionBuilderTest, ApplyFileDeletionAndAddition) { constexpr bool marked_for_compaction = false; - addition.AddFile(level, file_number, path_id, file_size, - GetInternalKey(smallest, smallest_seq), - GetInternalKey(largest, largest_seq), smallest_seqno, - largest_seqno, marked_for_compaction, Temperature::kUnknown, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + addition.AddFile( + level, file_number, path_id, file_size, + GetInternalKey(smallest, smallest_seq), + GetInternalKey(largest, largest_seq), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); ASSERT_OK(builder.Apply(&addition)); @@ -556,7 +572,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyInBase) { new_level, file_number, path_id, file_size, GetInternalKey(smallest), GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); const Status s = builder.Apply(&edit); @@ -588,12 +604,12 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { constexpr SequenceNumber largest_seqno = 1000; constexpr bool marked_for_compaction = false; - edit.AddFile(level, file_number, path_id, file_size, GetInternalKey(smallest), - GetInternalKey(largest), smallest_seqno, largest_seqno, - marked_for_compaction, Temperature::kUnknown, - kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + edit.AddFile( + level, file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); ASSERT_OK(builder.Apply(&edit)); @@ -605,7 +621,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAlreadyApplied) { new_level, file_number, path_id, file_size, GetInternalKey(smallest), GetInternalKey(largest), smallest_seqno, largest_seqno, 
marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); const Status s = builder.Apply(&other_edit); @@ -641,7 +657,7 @@ TEST_F(VersionBuilderTest, ApplyFileAdditionAndDeletion) { level, file_number, path_id, file_size, GetInternalKey(smallest), GetInternalKey(largest), smallest_seqno, largest_seqno, marked_for_compaction, Temperature::kUnknown, kInvalidBlobFileNumber, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); ASSERT_OK(builder.Apply(&addition)); @@ -691,7 +707,8 @@ TEST_F(VersionBuilderTest, ApplyBlobFileAddition) { // Add dummy table file to ensure the blob file is referenced. constexpr uint64_t table_file_number = 1; - AddDummyFileToEdit(&edit, table_file_number, blob_file_number); + AddDummyFileToEdit(&edit, table_file_number, blob_file_number, + 1 /*epoch_number*/); ASSERT_OK(builder.Apply(&edit)); @@ -813,7 +830,7 @@ TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileInBase) { ASSERT_NE(meta, nullptr); // Add dummy table file to ensure the blob file is referenced. - AddDummyFile(table_file_number, blob_file_number); + AddDummyFile(table_file_number, blob_file_number, 1 /*epoch_number*/); UpdateVersionStorageInfo(); @@ -892,7 +909,8 @@ TEST_F(VersionBuilderTest, ApplyBlobFileGarbageFileAdditionApplied) { // Add dummy table file to ensure the blob file is referenced. constexpr uint64_t table_file_number = 1; - AddDummyFileToEdit(&addition, table_file_number, blob_file_number); + AddDummyFileToEdit(&addition, table_file_number, blob_file_number, + 1 /*epoch_number*/); ASSERT_OK(builder.Apply(&addition)); @@ -989,7 +1007,8 @@ TEST_F(VersionBuilderTest, BlobFileGarbageOverflow) { // Add dummy table file to ensure the blob file is referenced. 
constexpr uint64_t table_file_number = 1; - AddDummyFileToEdit(&addition, table_file_number, blob_file_number); + AddDummyFileToEdit(&addition, table_file_number, blob_file_number, + 1 /*epoch_number*/); ASSERT_OK(builder.Apply(&addition)); @@ -1050,7 +1069,7 @@ TEST_F(VersionBuilderTest, SaveBlobFilesTo) { const uint64_t table_file_number = 2 * i; const uint64_t blob_file_number = 2 * i + 1; - AddDummyFile(table_file_number, blob_file_number); + AddDummyFile(table_file_number, blob_file_number, i /*epoch_number*/); } UpdateVersionStorageInfo(); @@ -1171,7 +1190,8 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { constexpr uint64_t garbage_blob_count = 0; constexpr uint64_t garbage_blob_bytes = 0; - AddDummyFile(base_table_file_number, base_blob_file_number); + AddDummyFile(base_table_file_number, base_blob_file_number, + 1 /*epoch_number*/); AddBlob(base_blob_file_number, base_total_blob_count, base_total_blob_bytes, checksum_method, checksum_value, BlobFileMetaData::LinkedSsts{base_table_file_number}, @@ -1206,12 +1226,12 @@ TEST_F(VersionBuilderTest, SaveBlobFilesToConcurrentJobs) { constexpr uint64_t total_blob_count = 234; constexpr uint64_t total_blob_bytes = 1 << 22; - edit.AddFile(level, table_file_number, path_id, file_size, - GetInternalKey(smallest), GetInternalKey(largest), - smallest_seqno, largest_seqno, marked_for_compaction, - Temperature::kUnknown, blob_file_number, - kUnknownOldestAncesterTime, kUnknownFileCreationTime, - checksum_value, checksum_method, kNullUniqueId64x2); + edit.AddFile( + level, table_file_number, path_id, file_size, GetInternalKey(smallest), + GetInternalKey(largest), smallest_seqno, largest_seqno, + marked_for_compaction, Temperature::kUnknown, blob_file_number, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, 2 /*epoch_number*/, + checksum_value, checksum_method, kNullUniqueId64x2); edit.AddBlobFile(blob_file_number, total_blob_count, total_blob_bytes, checksum_method, checksum_value); @@ -1297,8 +1317,9 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* largest_seqno */ 200, /* marked_for_compaction */ false, Temperature::kUnknown, /* oldest_blob_file_number */ 16, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); edit.AddFile(/* level */ 1, /* file_number */ 700, /* path_id */ 0, /* file_size */ 100, /* smallest */ GetInternalKey("801"), @@ -1306,8 +1327,9 @@ TEST_F(VersionBuilderTest, CheckConsistencyForBlobFiles) { /* largest_seqno */ 200, /* marked_for_compaction */ false, Temperature::kUnknown, /* oldest_blob_file_number */ 1000, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); edit.AddBlobFile(/* blob_file_number */ 1000, /* total_blob_count */ 2000, /* total_blob_bytes */ 200000, /* checksum_method */ std::string(), @@ -1527,7 +1549,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 2100, /* marked_for_compaction */ false, Temperature::kUnknown, /* oldest_blob_file_number */ 1, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, 
kNullUniqueId64x2); // Add an SST that does not reference any blob files. @@ -1537,7 +1559,7 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest */ GetInternalKey("22", 2200), /* smallest_seqno */ 2200, /* largest_seqno */ 2200, /* marked_for_compaction */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); // Delete a file that references a blob file. @@ -1559,8 +1581,9 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 300, /* marked_for_compaction */ false, Temperature::kUnknown, /* oldest_blob_file_number */ 3, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); // Trivially move a file that does not reference any blob files. edit.DeleteFile(/* level */ 1, /* file_number */ 13); @@ -1571,8 +1594,8 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 1300, /* marked_for_compaction */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + kUnknownEpochNumber, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); // Add one more SST file that references a blob file, then promptly // delete it in a second version edit before the new version gets saved. @@ -1584,8 +1607,9 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { /* largest_seqno */ 2300, /* marked_for_compaction */ false, Temperature::kUnknown, /* oldest_blob_file_number */ 5, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, - kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownFileCreationTime, kUnknownEpochNumber, + kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); VersionEdit edit2; @@ -1634,7 +1658,13 @@ TEST_F(VersionBuilderTest, MaintainLinkedSstsForBlobFiles) { } TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) { - Add(0, 1U, "150", "200", 100U); + Add(0, 1U, "150", "200", 100, /*path_id*/ 0, + /*smallest_seq*/ 100, /*largest_seq*/ 100, + /*num_entries*/ 0, /*num_deletions*/ 0, + /*sampled*/ false, /*smallest_seqno*/ 0, + /*largest_seqno*/ 0, + /*oldest_blob_file_number*/ kInvalidBlobFileNumber, + /*epoch_number*/ 1); UpdateVersionStorageInfo(); @@ -1666,6 +1696,99 @@ TEST_F(VersionBuilderTest, CheckConsistencyForFileDeletedTwice) { UnrefFilesInVersion(&new_vstorage2); } +TEST_F(VersionBuilderTest, CheckConsistencyForL0FilesSortedByEpochNumber) { + Status s; + // To verify files of same epoch number of overlapping ranges are caught as + // corrupted + VersionEdit version_edit_1; + version_edit_1.AddFile( + /* level */ 0, /* file_number */ 1U, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("a", 1), + /* largest */ GetInternalKey("c", 3), /* smallest_seqno */ 1, + /* largest_seqno */ 3, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); + version_edit_1.AddFile( + /* 
level */ 0, /* file_number */ 2U, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), + /* largest */ GetInternalKey("d", 4), /* smallest_seqno */ 2, + /* largest_seqno */ 4, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); + + VersionBuilder version_builder_1(EnvOptions(), &ioptions_, + nullptr /* table_cache */, &vstorage_, + nullptr /* file_metadata_cache_res_mgr */); + VersionStorageInfo new_vstorage_1( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, + nullptr /* src_vstorage */, true /* force_consistency_checks */); + + ASSERT_OK(version_builder_1.Apply(&version_edit_1)); + s = version_builder_1.SaveTo(&new_vstorage_1); + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE(std::strstr( + s.getState(), "L0 files of same epoch number but overlapping range")); + UnrefFilesInVersion(&new_vstorage_1); + + // To verify that L0 files not sorted by epoch_number are caught as corrupted + VersionEdit version_edit_2; + version_edit_2.AddFile( + /* level */ 0, /* file_number */ 1U, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("a", 1), + /* largest */ GetInternalKey("a", 1), /* smallest_seqno */ 1, + /* largest_seqno */ 1, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + 1 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); + version_edit_2.AddFile( + /* level */ 0, /* file_number */ 2U, /* path_id */ 0, + /* file_size */ 100, /* smallest */ GetInternalKey("b", 2), + /* largest */ GetInternalKey("b", 2), /* smallest_seqno */ 2, + /* largest_seqno */ 2, /* marked_for_compaction */ false, + Temperature::kUnknown, + /* oldest_blob_file_number */ kInvalidBlobFileNumber, + kUnknownOldestAncesterTime, kUnknownFileCreationTime, + 2 /* epoch_number */, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); + + VersionBuilder version_builder_2(EnvOptions(), &ioptions_, + nullptr /* table_cache */, &vstorage_, + nullptr /* file_metadata_cache_res_mgr */); + VersionStorageInfo new_vstorage_2( + &icmp_, ucmp_, options_.num_levels, kCompactionStyleLevel, + nullptr /* src_vstorage */, true /* force_consistency_checks */); + + ASSERT_OK(version_builder_2.Apply(&version_edit_2)); + s = version_builder_2.SaveTo(&new_vstorage_2); + ASSERT_TRUE(s.ok()); + + const std::vector<FileMetaData*>& l0_files = new_vstorage_2.LevelFiles(0); + ASSERT_EQ(l0_files.size(), 2); + // Manually corrupt the L0 files' epoch_number + l0_files[0]->epoch_number = 1; + l0_files[1]->epoch_number = 2; + + // Surface the corruption error by applying a dummy version edit + VersionEdit dummy_version_edit; + VersionBuilder dummy_version_builder( + EnvOptions(), &ioptions_, nullptr /* table_cache */, &vstorage_, + nullptr /* file_metadata_cache_res_mgr */); + ASSERT_OK(dummy_version_builder.Apply(&dummy_version_edit)); + s = dummy_version_builder.SaveTo(&new_vstorage_2); + EXPECT_TRUE(s.IsCorruption()); + EXPECT_TRUE(std::strstr(s.getState(), "L0 files are not sorted properly")); + + UnrefFilesInVersion(&new_vstorage_2); +} + TEST_F(VersionBuilderTest, EstimatedActiveKeys) { const uint32_t kTotalSamples = 20; const uint32_t kNumLevels = 5; diff --git a/db/version_edit.cc b/db/version_edit.cc index
c763d98e8..df5226077 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -135,7 +135,8 @@ bool VersionEdit::EncodeTo(std::string* dst) const { bool min_log_num_written = false; for (size_t i = 0; i < new_files_.size(); i++) { const FileMetaData& f = new_files_[i].second; - if (!f.smallest.Valid() || !f.largest.Valid()) { + if (!f.smallest.Valid() || !f.largest.Valid() || + f.epoch_number == kUnknownEpochNumber) { return false; } PutVarint32(dst, kNewFile4); @@ -184,6 +185,11 @@ bool VersionEdit::EncodeTo(std::string* dst) const { &varint_file_creation_time); PutLengthPrefixedSlice(dst, Slice(varint_file_creation_time)); + PutVarint32(dst, NewFileCustomTag::kEpochNumber); + std::string varint_epoch_number; + PutVarint64(&varint_epoch_number, f.epoch_number); + PutLengthPrefixedSlice(dst, Slice(varint_epoch_number)); + PutVarint32(dst, NewFileCustomTag::kFileChecksum); PutLengthPrefixedSlice(dst, Slice(f.file_checksum)); @@ -352,6 +358,11 @@ const char* VersionEdit::DecodeNewFile4From(Slice* input) { return "invalid file creation time"; } break; + case kEpochNumber: + if (!GetVarint64(&field, &f.epoch_number)) { + return "invalid epoch number"; + } + break; case kFileChecksum: f.file_checksum = field.ToString(); break; @@ -808,6 +819,8 @@ std::string VersionEdit::DebugString(bool hex_key) const { AppendNumberTo(&r, f.oldest_ancester_time); r.append(" file_creation_time:"); AppendNumberTo(&r, f.file_creation_time); + r.append(" epoch_number:"); + AppendNumberTo(&r, f.epoch_number); r.append(" file_checksum:"); r.append(Slice(f.file_checksum).ToString(true)); r.append(" file_checksum_func_name: "); @@ -927,6 +940,7 @@ std::string VersionEdit::DebugJSON(int edit_num, bool hex_key) const { jw << "LargestIKey" << f.largest.DebugString(hex_key); jw << "OldestAncesterTime" << f.oldest_ancester_time; jw << "FileCreationTime" << f.file_creation_time; + jw << "EpochNumber" << f.epoch_number; jw << "FileChecksum" << Slice(f.file_checksum).ToString(true); jw << "FileChecksumFuncName" << f.file_checksum_func_name; if (f.temperature != Temperature::kUnknown) { diff --git a/db/version_edit.h b/db/version_edit.h index c9800a3c0..cfc5f14e5 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -88,6 +88,7 @@ enum NewFileCustomTag : uint32_t { kMinTimestamp = 10, kMaxTimestamp = 11, kUniqueId = 12, + kEpochNumber = 13, // If this bit for the custom tag is set, opening DB should fail if // we don't know this field. @@ -102,6 +103,10 @@ class VersionSet; constexpr uint64_t kFileNumberMask = 0x3FFFFFFFFFFFFFFF; constexpr uint64_t kUnknownOldestAncesterTime = 0; constexpr uint64_t kUnknownFileCreationTime = 0; +constexpr uint64_t kUnknownEpochNumber = 0; +// If `Options::allow_ingest_behind` is true, this epoch number +// will be dedicated to files ingested behind. +constexpr uint64_t kReservedEpochNumberForFileIngestedBehind = 1; extern uint64_t PackFileNumberAndPathId(uint64_t number, uint64_t path_id); @@ -210,6 +215,12 @@ struct FileMetaData { // Unix time when the SST file is created. uint64_t file_creation_time = kUnknownFileCreationTime; + // The order of a file being flushed or ingested/imported. + // Compaction output file will be assigned with the minimum `epoch_number` + // among input files'. + // For L0, larger `epoch_number` indicates newer L0 file. 
+ uint64_t epoch_number = kUnknownEpochNumber; + // File checksum std::string file_checksum = kUnknownFileChecksum; @@ -227,7 +238,7 @@ struct FileMetaData { const SequenceNumber& largest_seq, bool marked_for_compact, Temperature _temperature, uint64_t oldest_blob_file, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, - const std::string& _file_checksum, + uint64_t _epoch_number, const std::string& _file_checksum, const std::string& _file_checksum_func_name, UniqueId64x2 _unique_id) : fd(file, file_path_id, file_size, smallest_seq, largest_seq), @@ -238,6 +249,7 @@ struct FileMetaData { oldest_blob_file_number(oldest_blob_file), oldest_ancester_time(_oldest_ancester_time), file_creation_time(_file_creation_time), + epoch_number(_epoch_number), file_checksum(_file_checksum), file_checksum_func_name(_file_checksum_func_name), unique_id(std::move(_unique_id)) { @@ -420,7 +432,7 @@ class VersionEdit { const SequenceNumber& largest_seqno, bool marked_for_compaction, Temperature temperature, uint64_t oldest_blob_file_number, uint64_t oldest_ancester_time, uint64_t file_creation_time, - const std::string& file_checksum, + uint64_t epoch_number, const std::string& file_checksum, const std::string& file_checksum_func_name, const UniqueId64x2& unique_id) { assert(smallest_seqno <= largest_seqno); @@ -429,8 +441,8 @@ class VersionEdit { FileMetaData(file, file_path_id, file_size, smallest, largest, smallest_seqno, largest_seqno, marked_for_compaction, temperature, oldest_blob_file_number, oldest_ancester_time, - file_creation_time, file_checksum, file_checksum_func_name, - unique_id)); + file_creation_time, epoch_number, file_checksum, + file_checksum_func_name, unique_id)); if (!HasLastSequence() || largest_seqno > GetLastSequence()) { SetLastSequence(largest_seqno); } diff --git a/db/version_edit_handler.cc b/db/version_edit_handler.cc index 145e78789..df537d68a 100644 --- a/db/version_edit_handler.cc +++ b/db/version_edit_handler.cc @@ -14,6 +14,7 @@ #include "db/blob/blob_file_reader.h" #include "db/blob/blob_source.h" +#include "db/version_edit.h" #include "logging/logging.h" #include "monitoring/persistent_stats_history.h" @@ -154,7 +155,7 @@ VersionEditHandler::VersionEditHandler( bool read_only, std::vector column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, const std::shared_ptr& io_tracer, - bool skip_load_table_files) + bool skip_load_table_files, EpochNumberRequirement epoch_number_requirement) : VersionEditHandlerBase(), read_only_(read_only), column_families_(std::move(column_families)), @@ -163,7 +164,8 @@ VersionEditHandler::VersionEditHandler( no_error_if_files_missing_(no_error_if_files_missing), io_tracer_(io_tracer), skip_load_table_files_(skip_load_table_files), - initialized_(false) { + initialized_(false), + epoch_number_requirement_(epoch_number_requirement) { assert(version_set_ != nullptr); } @@ -431,6 +433,7 @@ void VersionEditHandler::CheckIterationResult(const log::Reader& reader, } } } + if (s->ok()) { for (auto* cfd : *(version_set_->column_family_set_)) { if (cfd->IsDropped()) { @@ -528,7 +531,8 @@ Status VersionEditHandler::MaybeCreateVersion(const VersionEdit& /*edit*/, auto* builder = builder_iter->second->version_builder(); auto* v = new Version(cfd, version_set_, version_set_->file_options_, *cfd->GetLatestMutableCFOptions(), io_tracer_, - version_set_->current_version_number_++); + version_set_->current_version_number_++, + epoch_number_requirement_); s = builder->SaveTo(v->storage_info()); if (s.ok()) 
{ // Install new version @@ -642,10 +646,12 @@ Status VersionEditHandler::ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, VersionEditHandlerPointInTime::VersionEditHandlerPointInTime( bool read_only, std::vector column_families, - VersionSet* version_set, const std::shared_ptr& io_tracer) + VersionSet* version_set, const std::shared_ptr& io_tracer, + EpochNumberRequirement epoch_number_requirement) : VersionEditHandler(read_only, column_families, version_set, /*track_missing_files=*/true, - /*no_error_if_files_missing=*/true, io_tracer) {} + /*no_error_if_files_missing=*/true, io_tracer, + epoch_number_requirement) {} VersionEditHandlerPointInTime::~VersionEditHandlerPointInTime() { for (const auto& elem : versions_) { @@ -804,7 +810,8 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion( auto* version = new Version(cfd, version_set_, version_set_->file_options_, *cfd->GetLatestMutableCFOptions(), io_tracer_, - version_set_->current_version_number_++); + version_set_->current_version_number_++, + epoch_number_requirement_); s = builder->LoadTableHandlers( cfd->internal_stats(), version_set_->db_options_->max_file_opening_threads, false, true, diff --git a/db/version_edit_handler.h b/db/version_edit_handler.h index fd2379b07..fc3fe7c6b 100644 --- a/db/version_edit_handler.h +++ b/db/version_edit_handler.h @@ -110,10 +110,13 @@ class VersionEditHandler : public VersionEditHandlerBase { const std::vector& column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent) : VersionEditHandler(read_only, column_families, version_set, track_missing_files, no_error_if_files_missing, - io_tracer, /*skip_load_table_files=*/false) {} + io_tracer, /*skip_load_table_files=*/false, + epoch_number_requirement) {} ~VersionEditHandler() override {} @@ -134,7 +137,9 @@ class VersionEditHandler : public VersionEditHandlerBase { bool read_only, std::vector column_families, VersionSet* version_set, bool track_missing_files, bool no_error_if_files_missing, - const std::shared_ptr& io_tracer, bool skip_load_table_files); + const std::shared_ptr& io_tracer, bool skip_load_table_files, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent); Status ApplyVersionEdit(VersionEdit& edit, ColumnFamilyData** cfd) override; @@ -189,6 +194,7 @@ class VersionEditHandler : public VersionEditHandlerBase { bool skip_load_table_files_; bool initialized_; std::unique_ptr> cf_to_cmp_names_; + EpochNumberRequirement epoch_number_requirement_; private: Status ExtractInfoFromVersionEdit(ColumnFamilyData* cfd, @@ -205,7 +211,9 @@ class VersionEditHandlerPointInTime : public VersionEditHandler { public: VersionEditHandlerPointInTime( bool read_only, std::vector column_families, - VersionSet* version_set, const std::shared_ptr& io_tracer); + VersionSet* version_set, const std::shared_ptr& io_tracer, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent); ~VersionEditHandlerPointInTime() override; protected: @@ -229,9 +237,12 @@ class ManifestTailer : public VersionEditHandlerPointInTime { public: explicit ManifestTailer(std::vector column_families, VersionSet* version_set, - const std::shared_ptr& io_tracer) + const std::shared_ptr& io_tracer, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent) : 
VersionEditHandlerPointInTime(/*read_only=*/false, column_families, - version_set, io_tracer), + version_set, io_tracer, + epoch_number_requirement), mode_(Mode::kRecovery) {} void PrepareToReadNewManifest() { diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index c7f271d83..7571291e2 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -43,7 +43,8 @@ TEST_F(VersionEditTest, EncodeDecode) { InternalKey("foo", kBig + 500 + i, kTypeValue), InternalKey("zoo", kBig + 600 + i, kTypeDeletion), kBig + 500 + i, kBig + 600 + i, false, Temperature::kUnknown, - kInvalidBlobFileNumber, 888, 678, "234", "crc32c", + kInvalidBlobFileNumber, 888, 678, + kBig + 300 + i /* epoch_number */, "234", "crc32c", kNullUniqueId64x2); edit.DeleteFile(4, kBig + 700 + i); } @@ -63,25 +64,25 @@ TEST_F(VersionEditTest, EncodeDecodeNewFile4) { InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 300 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 301 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); edit.AddFile(5, 302, 0, 100, InternalKey("foo", kBig + 502, kTypeValue), InternalKey("zoo", kBig + 602, kTypeDeletion), kBig + 502, kBig + 602, true, Temperature::kUnknown, kInvalidBlobFileNumber, - 666, 888, kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 666, 888, 302 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); edit.AddFile(5, 303, 0, 100, InternalKey("foo", kBig + 503, kTypeBlobIndex), InternalKey("zoo", kBig + 603, kTypeBlobIndex), kBig + 503, kBig + 603, true, Temperature::kUnknown, 1001, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 303 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); edit.DeleteFile(4, 700); @@ -121,12 +122,13 @@ TEST_F(VersionEditTest, ForwardCompatibleNewFile4) { InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 300 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); edit.AddFile(4, 301, 3, 100, InternalKey("foo", kBig + 501, kTypeValue), InternalKey("zoo", kBig + 601, kTypeDeletion), kBig + 501, kBig + 601, false, Temperature::kUnknown, kInvalidBlobFileNumber, - 686, 868, "234", "crc32c", kNullUniqueId64x2); + 686, 868, 301 /* epoch_number */, "234", "crc32c", + kNullUniqueId64x2); edit.DeleteFile(4, 700); edit.SetComparatorName("foo"); @@ -174,8 +176,8 @@ TEST_F(VersionEditTest, NewFile4NotSupportedField) { InternalKey("zoo", kBig + 600, kTypeDeletion), kBig + 500, kBig + 600, true, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, 
kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 300 /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); edit.SetComparatorName("foo"); edit.SetLogNumber(kBig + 100); @@ -205,8 +207,8 @@ TEST_F(VersionEditTest, EncodeEmptyFile) { edit.AddFile(0, 0, 0, 0, InternalKey(), InternalKey(), 0, 0, false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, - kNullUniqueId64x2); + 1 /*epoch_number*/, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); std::string buffer; ASSERT_TRUE(!edit.EncodeTo(&buffer)); } diff --git a/db/version_set.cc b/db/version_set.cc index becdd6790..1030e5e28 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1771,8 +1771,8 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) { file->stats.num_reads_sampled.load(std::memory_order_relaxed), file->being_compacted, file->temperature, file->oldest_blob_file_number, file->TryGetOldestAncesterTime(), - file->TryGetFileCreationTime(), file->file_checksum, - file->file_checksum_func_name); + file->TryGetFileCreationTime(), file->epoch_number, + file->file_checksum, file->file_checksum_func_name); files.back().num_entries = file->num_entries; files.back().num_deletions = file->num_deletions; level_size += file->fd.GetFileSize(); @@ -2036,7 +2036,8 @@ VersionStorageInfo::VersionStorageInfo( const InternalKeyComparator* internal_comparator, const Comparator* user_comparator, int levels, CompactionStyle compaction_style, VersionStorageInfo* ref_vstorage, - bool _force_consistency_checks) + bool _force_consistency_checks, + EpochNumberRequirement epoch_number_requirement) : internal_comparator_(internal_comparator), user_comparator_(user_comparator), // cfd is nullptr if Version is dummy @@ -2064,7 +2065,8 @@ VersionStorageInfo::VersionStorageInfo( current_num_samples_(0), estimated_compaction_needed_bytes_(0), finalized_(false), - force_consistency_checks_(_force_consistency_checks) { + force_consistency_checks_(_force_consistency_checks), + epoch_number_requirement_(epoch_number_requirement) { if (ref_vstorage != nullptr) { accumulated_file_size_ = ref_vstorage->accumulated_file_size_; accumulated_raw_key_size_ = ref_vstorage->accumulated_raw_key_size_; @@ -2085,7 +2087,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, const FileOptions& file_opt, const MutableCFOptions mutable_cf_options, const std::shared_ptr& io_tracer, - uint64_t version_number) + uint64_t version_number, + EpochNumberRequirement epoch_number_requirement) : env_(vset->env_), clock_(vset->clock_), cfd_(column_family_data), @@ -2104,7 +2107,8 @@ Version::Version(ColumnFamilyData* column_family_data, VersionSet* vset, (cfd_ == nullptr || cfd_->current() == nullptr) ? nullptr : cfd_->current()->storage_info(), - cfd_ == nullptr ? false : cfd_->ioptions()->force_consistency_checks), + cfd_ == nullptr ? 
false : cfd_->ioptions()->force_consistency_checks, + epoch_number_requirement), vset_(vset), next_(this), prev_(this), @@ -4280,6 +4284,74 @@ const char* VersionStorageInfo::LevelFileSummary(FileSummaryStorage* scratch, return scratch->buffer; } +bool VersionStorageInfo::HasMissingEpochNumber() const { + for (int level = 0; level < num_levels_; ++level) { + for (const FileMetaData* f : files_[level]) { + if (f->epoch_number == kUnknownEpochNumber) { + return true; + } + } + } + return false; +} + +uint64_t VersionStorageInfo::GetMaxEpochNumberOfFiles() const { + uint64_t max_epoch_number = kUnknownEpochNumber; + for (int level = 0; level < num_levels_; ++level) { + for (const FileMetaData* f : files_[level]) { + max_epoch_number = std::max(max_epoch_number, f->epoch_number); + } + } + return max_epoch_number; +} + +void VersionStorageInfo::RecoverEpochNumbers(ColumnFamilyData* cfd) { + cfd->ResetNextEpochNumber(); + + bool reserve_epoch_num_for_file_ingested_behind = + cfd->ioptions()->allow_ingest_behind; + if (reserve_epoch_num_for_file_ingested_behind) { + uint64_t reserved_epoch_number = cfd->NewEpochNumber(); + assert(reserved_epoch_number == kReservedEpochNumberForFileIngestedBehind); + ROCKS_LOG_INFO(cfd->ioptions()->info_log.get(), + "[%s]CF has reserved epoch number %" PRIu64 + " for files ingested " + "behind since `Options::allow_ingest_behind` is true", + cfd->GetName().c_str(), reserved_epoch_number); + } + + if (HasMissingEpochNumber()) { + assert(epoch_number_requirement_ == EpochNumberRequirement::kMightMissing); + assert(num_levels_ >= 1); + + for (int level = num_levels_ - 1; level >= 1; --level) { + auto& files_at_level = files_[level]; + if (files_at_level.empty()) { + continue; + } + uint64_t next_epoch_number = cfd->NewEpochNumber(); + for (FileMetaData* f : files_at_level) { + f->epoch_number = next_epoch_number; + } + } + + for (auto file_meta_iter = files_[0].rbegin(); + file_meta_iter != files_[0].rend(); file_meta_iter++) { + FileMetaData* f = *file_meta_iter; + f->epoch_number = cfd->NewEpochNumber(); + } + + ROCKS_LOG_WARN(cfd->ioptions()->info_log.get(), + "[%s]CF's epoch numbers are inferred based on seqno", + cfd->GetName().c_str()); + epoch_number_requirement_ = EpochNumberRequirement::kMustPresent; + } else { + assert(epoch_number_requirement_ == EpochNumberRequirement::kMustPresent); + cfd->SetNextEpochNumber( + std::max(GetMaxEpochNumberOfFiles() + 1, cfd->GetNextEpochNumber())); + } +} + uint64_t VersionStorageInfo::MaxNextLevelOverlappingBytes() { uint64_t result = 0; std::vector overlaps; @@ -5557,7 +5629,8 @@ Status VersionSet::Recover( true /* checksum */, 0 /* log_number */); VersionEditHandler handler( read_only, column_families, const_cast(this), - /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_); + /*track_missing_files=*/false, no_error_if_files_missing, io_tracer_, + EpochNumberRequirement::kMightMissing); handler.Iterate(reader, &log_read_status); s = handler.status(); if (s.ok()) { @@ -5566,6 +5639,9 @@ Status VersionSet::Recover( assert(current_manifest_file_size != 0); handler.GetDbId(db_id); } + if (s.ok()) { + RecoverEpochNumbers(); + } } if (s.ok()) { @@ -5725,7 +5801,8 @@ Status VersionSet::TryRecoverFromOneManifest( log::Reader reader(nullptr, std::move(manifest_file_reader), &reporter, /*checksum=*/true, /*log_num=*/0); VersionEditHandlerPointInTime handler_pit( - read_only, column_families, const_cast(this), io_tracer_); + read_only, column_families, const_cast(this), io_tracer_, + 
EpochNumberRequirement::kMightMissing); handler_pit.Iterate(reader, &s); @@ -5734,7 +5811,21 @@ Status VersionSet::TryRecoverFromOneManifest( assert(nullptr != has_missing_table_file); *has_missing_table_file = handler_pit.HasMissingFiles(); - return handler_pit.status(); + s = handler_pit.status(); + if (s.ok()) { + RecoverEpochNumbers(); + } + return s; +} + +void VersionSet::RecoverEpochNumbers() { + for (auto cfd : *column_family_set_) { + if (cfd->IsDropped()) { + continue; + } + assert(cfd->initialized()); + cfd->RecoverEpochNumbers(); + } } Status VersionSet::ListColumnFamilies(std::vector* column_families, @@ -6121,7 +6212,7 @@ Status VersionSet::WriteCurrentStateToManifest( f->fd.smallest_seqno, f->fd.largest_seqno, f->marked_for_compaction, f->temperature, f->oldest_blob_file_number, f->oldest_ancester_time, - f->file_creation_time, f->file_checksum, + f->file_creation_time, f->epoch_number, f->file_checksum, f->file_checksum_func_name, f->unique_id); } } @@ -6612,6 +6703,7 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { filemetadata.temperature = file->temperature; filemetadata.oldest_ancester_time = file->TryGetOldestAncesterTime(); filemetadata.file_creation_time = file->TryGetFileCreationTime(); + filemetadata.epoch_number = file->epoch_number; metadata->push_back(filemetadata); } } @@ -6814,12 +6906,17 @@ Status ReactiveVersionSet::Recover( log::Reader* reader = manifest_reader->get(); assert(reader); - manifest_tailer_.reset(new ManifestTailer( - column_families, const_cast(this), io_tracer_)); + manifest_tailer_.reset( + new ManifestTailer(column_families, const_cast(this), + io_tracer_, EpochNumberRequirement::kMightMissing)); manifest_tailer_->Iterate(*reader, manifest_reader_status->get()); - return manifest_tailer_->status(); + s = manifest_tailer_->status(); + if (s.ok()) { + RecoverEpochNumbers(); + } + return s; } Status ReactiveVersionSet::ReadAndApply( diff --git a/db/version_set.h b/db/version_set.h index 03176a8b5..b92546ed6 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -116,6 +116,10 @@ extern bool SomeFileOverlapsRange(const InternalKeyComparator& icmp, extern void DoGenerateLevelFilesBrief(LevelFilesBrief* file_level, const std::vector& files, Arena* arena); +enum EpochNumberRequirement { + kMightMissing, + kMustPresent, +}; // Information of the storage associated with each Version, including number of // levels of LSM tree, files information at each level, files marked for @@ -126,7 +130,9 @@ class VersionStorageInfo { const Comparator* user_comparator, int num_levels, CompactionStyle compaction_style, VersionStorageInfo* src_vstorage, - bool _force_consistency_checks); + bool _force_consistency_checks, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent); // No copying allowed VersionStorageInfo(const VersionStorageInfo&) = delete; void operator=(const VersionStorageInfo&) = delete; @@ -319,6 +325,17 @@ class VersionStorageInfo { return files_[level]; } + bool HasMissingEpochNumber() const; + uint64_t GetMaxEpochNumberOfFiles() const; + EpochNumberRequirement GetEpochNumberRequirement() const { + return epoch_number_requirement_; + } + void SetEpochNumberRequirement( + EpochNumberRequirement epoch_number_requirement) { + epoch_number_requirement_ = epoch_number_requirement; + } + void RecoverEpochNumbers(ColumnFamilyData* cfd); + class FileLocation { public: FileLocation() = default; @@ -440,6 +457,11 @@ class VersionStorageInfo { return files_marked_for_compaction_; } + void 
TEST_AddFileMarkedForCompaction(int level, FileMetaData* f) { + f->marked_for_compaction = true; + files_marked_for_compaction_.emplace_back(level, f); + } + // REQUIRES: ComputeCompactionScore has been called // REQUIRES: DB mutex held during access const autovector<std::pair<int, FileMetaData*>>& ExpiredTtlFiles() const { @@ -723,6 +745,8 @@ class VersionStorageInfo { // is compiled in release mode bool force_consistency_checks_; + EpochNumberRequirement epoch_number_requirement_; + friend class Version; friend class VersionSet; }; @@ -1047,7 +1071,9 @@ class Version { Version(ColumnFamilyData* cfd, VersionSet* vset, const FileOptions& file_opt, MutableCFOptions mutable_cf_options, const std::shared_ptr<IOTracer>& io_tracer, - uint64_t version_number = 0); + uint64_t version_number = 0, + EpochNumberRequirement epoch_number_requirement = + EpochNumberRequirement::kMustPresent); ~Version(); @@ -1188,6 +1214,10 @@ class VersionSet { const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only, std::string* db_id, bool* has_missing_table_file); + // Recover the next epoch number of each CF and the epoch numbers + // of its files (if missing) + void RecoverEpochNumbers(); + // Reads a manifest file and returns a list of column families in // column_families. static Status ListColumnFamilies(std::vector<std::string>* column_families, diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 7d17406c1..c179f7a6a 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -14,6 +14,7 @@ #include "db/db_impl/db_impl.h" #include "db/db_test_util.h" #include "db/log_writer.h" +#include "db/version_edit.h" #include "rocksdb/advanced_options.h" #include "rocksdb/convenience.h" #include "rocksdb/file_system.h" @@ -49,7 +50,7 @@ class GenerateLevelFilesBriefTest : public testing::Test { InternalKey(largest, largest_seq, kTypeValue), smallest_seq, largest_seq, /* marked_for_compact */ false, Temperature::kUnknown, kInvalidBlobFileNumber, kUnknownOldestAncesterTime, - kUnknownFileCreationTime, kUnknownFileChecksum, + kUnknownFileCreationTime, kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); files_.push_back(f); } @@ -158,7 +159,8 @@ class VersionStorageInfoTestBase : public testing::Test { /* largest_seq */ 0, /* marked_for_compact */ false, Temperature::kUnknown, oldest_blob_file_number, kUnknownOldestAncesterTime, kUnknownFileCreationTime, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + kUnknownEpochNumber, kUnknownFileChecksum, kUnknownFileChecksumFuncName, + kNullUniqueId64x2); f->compensated_file_size = file_size; vstorage_.AddFile(level, f); } @@ -3191,15 +3193,19 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, std::string column_family; std::string key; // the only key int level = 0; + uint64_t epoch_number; SstInfo(uint64_t file_num, const std::string& cf_name, - const std::string& _key) - : SstInfo(file_num, cf_name, _key, 0) {} + const std::string& _key, + uint64_t _epoch_number = kUnknownEpochNumber) + : SstInfo(file_num, cf_name, _key, 0, _epoch_number) {} SstInfo(uint64_t file_num, const std::string& cf_name, - const std::string& _key, int lvl) + const std::string& _key, int lvl, + uint64_t _epoch_number = kUnknownEpochNumber) : file_number(file_num), column_family(cf_name), key(_key), - level(lvl) {} + level(lvl), + epoch_number(_epoch_number) {} }; // Create dummy sst, return their metadata.
Note that only file name and size @@ -3235,7 +3241,7 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, ASSERT_NE(0, file_size); file_metas->emplace_back(file_num, /*file_path_id=*/0, file_size, ikey, ikey, 0, 0, false, Temperature::kUnknown, 0, 0, - 0, kUnknownFileChecksum, + 0, info.epoch_number, kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); } } @@ -3273,11 +3279,11 @@ class VersionSetTestMissingFiles : public VersionSetTestBase, TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { std::vector existing_files = { - SstInfo(100, kDefaultColumnFamilyName, "a"), - SstInfo(102, kDefaultColumnFamilyName, "b"), - SstInfo(103, kDefaultColumnFamilyName, "c"), - SstInfo(107, kDefaultColumnFamilyName, "d"), - SstInfo(110, kDefaultColumnFamilyName, "e")}; + SstInfo(100, kDefaultColumnFamilyName, "a", 100 /* epoch_number */), + SstInfo(102, kDefaultColumnFamilyName, "b", 102 /* epoch_number */), + SstInfo(103, kDefaultColumnFamilyName, "c", 103 /* epoch_number */), + SstInfo(107, kDefaultColumnFamilyName, "d", 107 /* epoch_number */), + SstInfo(110, kDefaultColumnFamilyName, "e", 110 /* epoch_number */)}; std::vector file_metas; CreateDummyTableFiles(existing_files, &file_metas); @@ -3288,10 +3294,12 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { std::string largest_ukey = "b"; InternalKey smallest_ikey(smallest_ukey, 1, ValueType::kTypeValue); InternalKey largest_ikey(largest_ukey, 1, ValueType::kTypeValue); + FileMetaData meta = FileMetaData( file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + file_num /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3321,11 +3329,16 @@ TEST_F(VersionSetTestMissingFiles, ManifestFarBehindSst) { TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { std::vector existing_files = { - SstInfo(100, kDefaultColumnFamilyName, "a"), - SstInfo(102, kDefaultColumnFamilyName, "b"), - SstInfo(103, kDefaultColumnFamilyName, "c"), - SstInfo(107, kDefaultColumnFamilyName, "d"), - SstInfo(110, kDefaultColumnFamilyName, "e")}; + SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */, + 100 /* epoch_number */), + SstInfo(102, kDefaultColumnFamilyName, "b", 0 /* level */, + 102 /* epoch_number */), + SstInfo(103, kDefaultColumnFamilyName, "c", 0 /* level */, + 103 /* epoch_number */), + SstInfo(107, kDefaultColumnFamilyName, "d", 0 /* level */, + 107 /* epoch_number */), + SstInfo(110, kDefaultColumnFamilyName, "e", 0 /* level */, + 110 /* epoch_number */)}; std::vector file_metas; CreateDummyTableFiles(existing_files, &file_metas); @@ -3346,7 +3359,8 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { FileMetaData meta = FileMetaData( file_num, /*file_path_id=*/0, /*file_size=*/12, smallest_ikey, largest_ikey, 0, 0, false, Temperature::kUnknown, 0, 0, 0, - kUnknownFileChecksum, kUnknownFileChecksumFuncName, kNullUniqueId64x2); + file_num /* epoch_number */, kUnknownFileChecksum, + kUnknownFileChecksumFuncName, kNullUniqueId64x2); added_files.emplace_back(0, meta); } WriteFileAdditionAndDeletionToManifest( @@ -3381,11 +3395,16 @@ TEST_F(VersionSetTestMissingFiles, ManifestAheadofSst) { TEST_F(VersionSetTestMissingFiles, NoFileMissing) { std::vector existing_files = { - SstInfo(100, kDefaultColumnFamilyName, "a"), - SstInfo(102, 
kDefaultColumnFamilyName, "b"), - SstInfo(103, kDefaultColumnFamilyName, "c"), - SstInfo(107, kDefaultColumnFamilyName, "d"), - SstInfo(110, kDefaultColumnFamilyName, "e")}; + SstInfo(100, kDefaultColumnFamilyName, "a", 0 /* level */, + 100 /* epoch_number */), + SstInfo(102, kDefaultColumnFamilyName, "b", 0 /* level */, + 102 /* epoch_number */), + SstInfo(103, kDefaultColumnFamilyName, "c", 0 /* level */, + 103 /* epoch_number */), + SstInfo(107, kDefaultColumnFamilyName, "d", 0 /* level */, + 107 /* epoch_number */), + SstInfo(110, kDefaultColumnFamilyName, "e", 0 /* level */, + 110 /* epoch_number */)}; std::vector file_metas; CreateDummyTableFiles(existing_files, &file_metas); @@ -3435,7 +3454,8 @@ TEST_F(VersionSetTestMissingFiles, MinLogNumberToKeep2PC) { db_options_.allow_2pc = true; NewDB(); - SstInfo sst(100, kDefaultColumnFamilyName, "a"); + SstInfo sst(100, kDefaultColumnFamilyName, "a", 0 /* level */, + 100 /* epoch_number */); std::vector file_metas; CreateDummyTableFiles({sst}, &file_metas); diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h index 0cdffcd5f..3cdd8bd8a 100644 --- a/include/rocksdb/metadata.h +++ b/include/rocksdb/metadata.h @@ -82,7 +82,7 @@ struct SstFileMetaData : public FileStorageInfo { bool _being_compacted, Temperature _temperature, uint64_t _oldest_blob_file_number, uint64_t _oldest_ancester_time, uint64_t _file_creation_time, - std::string& _file_checksum, + uint64_t _epoch_number, std::string& _file_checksum, std::string& _file_checksum_func_name) : smallest_seqno(_smallest_seqno), largest_seqno(_largest_seqno), @@ -94,7 +94,8 @@ struct SstFileMetaData : public FileStorageInfo { num_deletions(0), oldest_blob_file_number(_oldest_blob_file_number), oldest_ancester_time(_oldest_ancester_time), - file_creation_time(_file_creation_time) { + file_creation_time(_file_creation_time), + epoch_number(_epoch_number) { if (!_file_name.empty()) { if (_file_name[0] == '/') { relative_filename = _file_name.substr(1); @@ -141,7 +142,12 @@ struct SstFileMetaData : public FileStorageInfo { // Timestamp when the SST file is created, provided by // SystemClock::GetCurrentTime(). 0 if the information is not available. uint64_t file_creation_time = 0; - + // The order of a file being flushed or ingested/imported. + // Compaction output file will be assigned with the minimum `epoch_number` + // among input files'. + // For L0, larger `epoch_number` indicates newer L0 file. + // 0 if the information is not available. + uint64_t epoch_number = 0; // DEPRECATED: The name of the file within its directory with a // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct // instead. diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc index 44ce70b1b..cdea325cd 100644 --- a/utilities/checkpoint/checkpoint_impl.cc +++ b/utilities/checkpoint/checkpoint_impl.cc @@ -382,6 +382,7 @@ Status CheckpointImpl::ExportColumnFamily( live_file_metadata.largestkey = std::move(file_metadata.largestkey); live_file_metadata.oldest_blob_file_number = file_metadata.oldest_blob_file_number; + live_file_metadata.epoch_number = file_metadata.epoch_number; live_file_metadata.level = level_metadata.level; result_metadata->files.push_back(live_file_metadata); }
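
For readers tracing the new field through the patch, here is a minimal standalone C++ sketch of the two `epoch_number` invariants documented in the `FileMetaData` and `SstFileMetaData` comments above: L0 is ordered newest-first by epoch number, and a compaction output takes the minimum epoch number among its inputs. `FileMeta`, `SortL0NewestFirst`, and `OutputEpochNumber` are hypothetical names invented for this illustration; they are not RocksDB APIs.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <limits>
    #include <vector>

    // Trimmed-down stand-in for FileMetaData (illustration only).
    struct FileMeta {
      uint64_t file_number;
      uint64_t epoch_number;  // 0 plays the role of kUnknownEpochNumber
    };

    // For L0, a larger epoch_number indicates a newer file, so L0 is kept
    // sorted newest-first by epoch_number.
    void SortL0NewestFirst(std::vector<FileMeta>& l0) {
      std::sort(l0.begin(), l0.end(), [](const FileMeta& a, const FileMeta& b) {
        return a.epoch_number > b.epoch_number;
      });
    }

    // A compaction output file is assigned the minimum epoch_number among its
    // input files.
    uint64_t OutputEpochNumber(const std::vector<FileMeta>& inputs) {
      uint64_t min_epoch = std::numeric_limits<uint64_t>::max();
      for (const FileMeta& f : inputs) {
        min_epoch = std::min(min_epoch, f.epoch_number);
      }
      return min_epoch;
    }

    int main() {
      // Three L0 files; file 9 was flushed last, so it carries the largest epoch.
      std::vector<FileMeta> l0 = {{7, 2}, {9, 4}, {8, 3}};
      SortL0NewestFirst(l0);
      assert(l0.front().file_number == 9);  // newest L0 file first
      assert(OutputEpochNumber(l0) == 2);   // output keeps the oldest epoch
      return 0;
    }

Under the min-of-inputs rule, an intra-L0 compaction output never sorts above an L0 file flushed after the compaction's inputs, which is the ordering that the `CheckConsistencyForL0FilesSortedByEpochNumber` test added above exercises.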