diff --git a/HISTORY.md b/HISTORY.md
index 40bae8c74..0698f0cc9 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -6,6 +6,7 @@
 ### Bug Fixes
 * Fixed an issue for backward iteration when user defined timestamp is enabled in combination with BlobDB.
 * Fixed a couple of cases where a Merge operand encountered during iteration wasn't reflected in the `internal_merge_count` PerfContext counter.
+* Fixed a bug in CreateColumnFamilyWithImport()/ExportColumnFamily() which did not support range tombstones (#11252).
 
 ### New Features
 * Add statistics rocksdb.secondary.cache.filter.hits, rocksdb.secondary.cache.index.hits, and rocksdb.secondary.cache.data.hits
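For context, a minimal sketch of the round trip this changelog entry refers to, using the public Checkpoint and DB APIs. The paths and column family name are illustrative, not taken from the patch:

```cpp
// Hypothetical round trip exercised by this fix: export a column family that
// contains range tombstones, then import it into a new column family.
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/utilities/checkpoint.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  DB* db = nullptr;
  Options options;
  options.create_if_missing = true;
  Status s = DB::Open(options, "/tmp/source_db", &db);  // illustrative path
  assert(s.ok());

  // A range tombstone in the default column family; before this fix its
  // bounds were not reflected in the exported file metadata.
  s = db->DeleteRange(WriteOptions(), db->DefaultColumnFamily(), "k3", "k7");
  assert(s.ok());

  Checkpoint* checkpoint = nullptr;
  s = Checkpoint::Create(db, &checkpoint);
  assert(s.ok());
  ExportImportFilesMetaData* metadata = nullptr;
  s = checkpoint->ExportColumnFamily(db->DefaultColumnFamily(),
                                     "/tmp/export_dir", &metadata);
  assert(s.ok());

  // Import the exported files into a new column family of the same DB.
  ColumnFamilyHandle* imported = nullptr;
  s = db->CreateColumnFamilyWithImport(ColumnFamilyOptions(), "imported",
                                       ImportColumnFamilyOptions(), *metadata,
                                       &imported);
  assert(s.ok());

  delete metadata;
  delete checkpoint;
  s = db->DestroyColumnFamilyHandle(imported);
  assert(s.ok());
  delete db;
  return 0;
}
```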
diff --git a/db/external_sst_file_ingestion_job.cc b/db/external_sst_file_ingestion_job.cc
index 448e70738..428c8bc6a 100644
--- a/db/external_sst_file_ingestion_job.cc
+++ b/db/external_sst_file_ingestion_job.cc
@@ -746,12 +746,6 @@ Status ExternalSstFileIngestionJob::GetIngestedFileInfo(
 
   ParsedInternalKey key;
   ReadOptions ro;
-  // During reading the external file we can cache blocks that we read into
-  // the block cache, if we later change the global seqno of this file, we will
-  // have block in cache that will include keys with wrong seqno.
-  // We need to disable fill_cache so that we read from the file without
-  // updating the block cache.
-  ro.fill_cache = false;
   std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
       ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
       /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
diff --git a/db/import_column_family_job.cc b/db/import_column_family_job.cc
index 05b2f6d1d..68e54ab69 100644
--- a/db/import_column_family_job.cc
+++ b/db/import_column_family_job.cc
@@ -34,8 +34,8 @@ Status ImportColumnFamilyJob::Prepare(uint64_t next_file_number,
   for (const auto& file_metadata : metadata_) {
     const auto file_path = file_metadata.db_path + "/" + file_metadata.name;
     IngestedFileInfo file_to_import;
-    status =
-        GetIngestedFileInfo(file_path, next_file_number++, &file_to_import, sv);
+    status = GetIngestedFileInfo(file_path, next_file_number++, sv,
+                                 file_metadata, &file_to_import);
     if (!status.ok()) {
       return status;
     }
@@ -212,16 +212,20 @@ void ImportColumnFamilyJob::Cleanup(const Status& status) {
 
 Status ImportColumnFamilyJob::GetIngestedFileInfo(
     const std::string& external_file, uint64_t new_file_number,
-    IngestedFileInfo* file_to_import, SuperVersion* sv) {
+    SuperVersion* sv, const LiveFileMetaData& file_meta,
+    IngestedFileInfo* file_to_import) {
   file_to_import->external_file_path = external_file;
-
-  // Get external file size
-  Status status = fs_->GetFileSize(external_file, IOOptions(),
-                                   &file_to_import->file_size, nullptr);
-  if (!status.ok()) {
-    return status;
+  Status status;
+  if (file_meta.size > 0) {
+    file_to_import->file_size = file_meta.size;
+  } else {
+    // Get external file size
+    status = fs_->GetFileSize(external_file, IOOptions(),
+                              &file_to_import->file_size, nullptr);
+    if (!status.ok()) {
+      return status;
+    }
   }
-
   // Assign FD with number
   file_to_import->fd =
       FileDescriptor(new_file_number, 0, file_to_import->file_size);
@@ -262,37 +266,61 @@ Status ImportColumnFamilyJob::GetIngestedFileInfo(
   // Get number of entries in table
   file_to_import->num_entries = props->num_entries;
 
-  ParsedInternalKey key;
-  ReadOptions ro;
-  // During reading the external file we can cache blocks that we read into
-  // the block cache, if we later change the global seqno of this file, we will
-  // have block in cache that will include keys with wrong seqno.
-  // We need to disable fill_cache so that we read from the file without
-  // updating the block cache.
-  ro.fill_cache = false;
-  std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
-      ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
-      /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
-
-  // Get first (smallest) key from file
-  iter->SeekToFirst();
-  Status pik_status =
-      ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors);
-  if (!pik_status.ok()) {
-    return Status::Corruption("Corrupted Key in external file. ",
-                              pik_status.getState());
-  }
-  file_to_import->smallest_internal_key.SetFrom(key);
-
-  // Get last (largest) key from file
-  iter->SeekToLast();
-  pik_status =
-      ParseInternalKey(iter->key(), &key, db_options_.allow_data_in_errors);
-  if (!pik_status.ok()) {
-    return Status::Corruption("Corrupted Key in external file. ",
-                              pik_status.getState());
+  // If the importing files were exported with
+  // Checkpoint::ExportColumnFamily(), we cannot simply recompute smallest and
+  // largest, which were used to truncate range tombstones, from the file
+  // content; instead we expect smallest and largest to be populated in
+  // file_meta.
+  if (file_meta.smallest.empty()) {
+    assert(file_meta.largest.empty());
+    ReadOptions ro;
+    std::unique_ptr<InternalIterator> iter(table_reader->NewIterator(
+        ro, sv->mutable_cf_options.prefix_extractor.get(), /*arena=*/nullptr,
+        /*skip_filters=*/false, TableReaderCaller::kExternalSSTIngestion));
+
+    // Get first (smallest) key from file
+    iter->SeekToFirst();
+    bool bound_set = false;
+    if (iter->Valid()) {
+      file_to_import->smallest_internal_key.DecodeFrom(iter->key());
+      iter->SeekToLast();
+      file_to_import->largest_internal_key.DecodeFrom(iter->key());
+      bound_set = true;
+    }
+
+    std::unique_ptr<FragmentedRangeTombstoneIterator> range_del_iter{
+        table_reader->NewRangeTombstoneIterator(ro)};
+    if (range_del_iter != nullptr) {
+      range_del_iter->SeekToFirst();
+      if (range_del_iter->Valid()) {
+        ParsedInternalKey key;
+        Status pik_status = ParseInternalKey(range_del_iter->key(), &key,
+                                             db_options_.allow_data_in_errors);
+        if (!pik_status.ok()) {
+          return Status::Corruption("Corrupted key in external file. ",
+                                    pik_status.getState());
+        }
+        RangeTombstone tombstone(key, range_del_iter->value());
+        InternalKey start_key = tombstone.SerializeKey();
+        const InternalKeyComparator* icmp = &cfd_->internal_comparator();
+        if (!bound_set ||
+            icmp->Compare(start_key, file_to_import->smallest_internal_key) <
+                0) {
+          file_to_import->smallest_internal_key = start_key;
+        }
+        InternalKey end_key = tombstone.SerializeEndKey();
+        if (!bound_set ||
+            icmp->Compare(end_key, file_to_import->largest_internal_key) > 0) {
+          file_to_import->largest_internal_key = end_key;
+        }
+        bound_set = true;
+      }
+    }
+    assert(bound_set);
+  } else {
+    assert(!file_meta.largest.empty());
+    file_to_import->smallest_internal_key.DecodeFrom(file_meta.smallest);
+    file_to_import->largest_internal_key.DecodeFrom(file_meta.largest);
   }
-  file_to_import->largest_internal_key.SetFrom(key);
 
   file_to_import->cf_id = static_cast<uint32_t>(props->column_family_id);
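The new bound computation above boils down to: start from the point-key bounds (if any) and widen them by each range tombstone's start/end keys, so that a tombstone-only file still gets valid bounds. A simplified standalone sketch of that invariant, not RocksDB code, using plain lexicographic user-key comparison where the patch uses the column family's InternalKeyComparator on serialized internal keys:

```cpp
#include <algorithm>
#include <cassert>
#include <optional>
#include <string>
#include <utility>

struct KeyRange {
  std::string smallest;
  std::string largest;
};

// `point_bounds` is empty for a range-tombstone-only file; `tombstone` is the
// [start, end) user-key span of a tombstone in the file, if one exists.
std::optional<KeyRange> ComputeBounds(
    const std::optional<KeyRange>& point_bounds,
    const std::optional<std::pair<std::string, std::string>>& tombstone) {
  std::optional<KeyRange> bounds = point_bounds;
  if (tombstone) {
    if (!bounds) {
      // No point keys: the tombstone alone defines the file's range.
      bounds = KeyRange{tombstone->first, tombstone->second};
    } else {
      // Widen the point-key range to cover the tombstone.
      bounds->smallest = std::min(bounds->smallest, tombstone->first);
      bounds->largest = std::max(bounds->largest, tombstone->second);
    }
  }
  return bounds;
}

int main() {
  // Point keys span ["K1", "K2"], but the tombstone ["K3", "K4") extends the
  // file's effective range to ["K1", "K4"] -- the case the new tests check.
  auto b = ComputeBounds(KeyRange{"K1", "K2"},
                         std::make_pair(std::string("K3"), std::string("K4")));
  assert(b && b->smallest == "K1" && b->largest == "K4");

  // A tombstone-only file still gets valid bounds.
  b = ComputeBounds(std::nullopt,
                    std::make_pair(std::string("a"), std::string("z")));
  assert(b && b->smallest == "a" && b->largest == "z");
  return 0;
}
```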
   Status GetIngestedFileInfo(const std::string& external_file,
-                             uint64_t new_file_number,
-                             IngestedFileInfo* file_to_import,
-                             SuperVersion* sv);
+                             uint64_t new_file_number, SuperVersion* sv,
+                             const LiveFileMetaData& file_meta,
+                             IngestedFileInfo* file_to_import);
 
   SystemClock* clock_;
   VersionSet* versions_;
diff --git a/db/import_column_family_test.cc b/db/import_column_family_test.cc
index f012483be..c7940a374 100644
--- a/db/import_column_family_test.cc
+++ b/db/import_column_family_test.cc
@@ -280,6 +280,57 @@ TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithOverlap) {
   }
 }
 
+TEST_F(ImportColumnFamilyTest, ImportSSTFileWriterFilesWithRangeTombstone) {
+  // Test for a bug where an imported file's smallest and largest keys did
+  // not take range tombstones into account.
+  Options options = CurrentOptions();
+  CreateAndReopenWithCF({"koko"}, options);
+
+  SstFileWriter sfw_cf1(EnvOptions(), options, handles_[1]);
+  // cf1.sst
+  const std::string cf1_sst_name = "cf1.sst";
+  const std::string cf1_sst = sst_files_dir_ + cf1_sst_name;
+  ASSERT_OK(sfw_cf1.Open(cf1_sst));
+  ASSERT_OK(sfw_cf1.Put("K1", "V1"));
+  ASSERT_OK(sfw_cf1.Put("K2", "V2"));
+  ASSERT_OK(sfw_cf1.DeleteRange("K3", "K4"));
+  ASSERT_OK(sfw_cf1.Finish());
+
+  // Import sst file corresponding to cf1 onto a new cf and verify
+  ExportImportFilesMetaData metadata;
+  metadata.files.push_back(
+      LiveFileMetaDataInit(cf1_sst_name, sst_files_dir_, 0, 0, 19));
+  metadata.db_comparator_name = options.comparator->Name();
+
+  ASSERT_OK(db_->CreateColumnFamilyWithImport(
+      options, "toto", ImportColumnFamilyOptions(), metadata, &import_cfh_));
+  ASSERT_NE(import_cfh_, nullptr);
+
+  ColumnFamilyMetaData import_cf_meta;
+  db_->GetColumnFamilyMetaData(import_cfh_, &import_cf_meta);
+  ASSERT_EQ(import_cf_meta.file_count, 1);
+  const SstFileMetaData* file_meta = nullptr;
+  for (const auto& level_meta : import_cf_meta.levels) {
+    if (!level_meta.files.empty()) {
+      file_meta = &(level_meta.files[0]);
+      break;
+    }
+  }
+  ASSERT_TRUE(file_meta != nullptr);
+  InternalKey largest;
+  largest.DecodeFrom(file_meta->largest);
+  ASSERT_EQ(largest.user_key(), "K4");
+
+  std::string value;
+  ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K1", &value));
+  ASSERT_EQ(value, "V1");
+  ASSERT_OK(db_->Get(ReadOptions(), import_cfh_, "K2", &value));
+  ASSERT_EQ(value, "V2");
+  ASSERT_OK(db_->DropColumnFamily(import_cfh_));
+  ASSERT_OK(db_->DestroyColumnFamilyHandle(import_cfh_));
+  import_cfh_ = nullptr;
+}
+
 TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherCF) {
   Options options = CurrentOptions();
   CreateAndReopenWithCF({"koko"}, options);
@@ -444,6 +495,70 @@ TEST_F(ImportColumnFamilyTest, ImportExportedSSTFromAnotherDB) {
   ASSERT_OK(DestroyDir(env_, dbname_ + "/db_copy"));
 }
 
+TEST_F(ImportColumnFamilyTest,
+       ImportExportedSSTFromAnotherCFWithRangeTombstone) {
+  // Test for a bug where an imported file's smallest and largest keys did
+  // not take range tombstones into account.
+  Options options = CurrentOptions();
+  options.disable_auto_compactions = true;
+  CreateAndReopenWithCF({"koko"}, options);
+
+  for (int i = 10; i < 20; ++i) {
+    ASSERT_OK(Put(1, Key(i), Key(i) + "_val"));
+  }
+  ASSERT_OK(Flush(1 /* cf */));
+  MoveFilesToLevel(1 /* level */, 1 /* cf */);
+  const Snapshot* snapshot = db_->GetSnapshot();
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), handles_[1], Key(0), Key(25)));
+  ASSERT_OK(Put(1, Key(1), "t"));
+  ASSERT_OK(Flush(1));
+  // Tests importing a range-tombstone-only file
+  ASSERT_OK(db_->DeleteRange(WriteOptions(), handles_[1], Key(0), Key(2)));
+
+  Checkpoint* checkpoint;
+  ASSERT_OK(Checkpoint::Create(db_, &checkpoint));
+  ASSERT_OK(checkpoint->ExportColumnFamily(handles_[1], export_files_dir_,
+                                           &metadata_ptr_));
+  ASSERT_NE(metadata_ptr_, nullptr);
+  delete checkpoint;
+
+  ImportColumnFamilyOptions import_options;
+  import_options.move_files = false;
+  ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "toto", import_options,
+                                              *metadata_ptr_, &import_cfh_));
+  ASSERT_NE(import_cfh_, nullptr);
+
+  import_options.move_files = true;
+  ASSERT_OK(db_->CreateColumnFamilyWithImport(options, "yoyo", import_options,
+                                              *metadata_ptr_, &import_cfh2_));
+  ASSERT_NE(import_cfh2_, nullptr);
+  delete metadata_ptr_;
+  metadata_ptr_ = nullptr;
+
+  std::string value1, value2;
+  ReadOptions ro_latest;
+  ReadOptions ro_snapshot;
+  ro_snapshot.snapshot = snapshot;
+
+  for (int i = 10; i < 20; ++i) {
+    ASSERT_TRUE(db_->Get(ro_latest, import_cfh_, Key(i), &value1).IsNotFound());
+    ASSERT_OK(db_->Get(ro_snapshot, import_cfh_, Key(i), &value1));
+    ASSERT_EQ(Get(1, Key(i), snapshot), value1);
+  }
+  ASSERT_TRUE(db_->Get(ro_latest, import_cfh_, Key(1), &value1).IsNotFound());
+
+  for (int i = 10; i < 20; ++i) {
+    ASSERT_TRUE(
+        db_->Get(ro_latest, import_cfh2_, Key(i), &value1).IsNotFound());
+
+    ASSERT_OK(db_->Get(ro_snapshot, import_cfh2_, Key(i), &value2));
+    ASSERT_EQ(Get(1, Key(i), snapshot), value2);
+  }
+  ASSERT_TRUE(db_->Get(ro_latest, import_cfh2_, Key(1), &value1).IsNotFound());
+
+  db_->ReleaseSnapshot(snapshot);
+}
+
 TEST_F(ImportColumnFamilyTest, LevelFilesOverlappingAtEndpoints) {
   // Imports a column family containing a level where two files overlap at their
   // endpoints. "Overlap" means the largest user key in one file is the same as
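The first test above drives the fix through SstFileWriter. For reference, a trimmed-down sketch of just the file-writing step, runnable outside the test harness (the output path is illustrative):

```cpp
#include <cassert>

#include "rocksdb/options.h"
#include "rocksdb/sst_file_writer.h"

using namespace ROCKSDB_NAMESPACE;

int main() {
  Options options;
  SstFileWriter writer(EnvOptions(), options);
  Status s = writer.Open("/tmp/cf1.sst");  // illustrative path
  assert(s.ok());
  // Point keys cover ["K1", "K2"]; the range deletion below stretches the
  // file's effective key range to ["K1", "K4").
  s = writer.Put("K1", "V1");
  assert(s.ok());
  s = writer.Put("K2", "V2");
  assert(s.ok());
  s = writer.DeleteRange("K3", "K4");
  assert(s.ok());
  ExternalSstFileInfo info;
  s = writer.Finish(&info);
  assert(s.ok());
  return 0;
}
```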
diff --git a/db/version_set.cc b/db/version_set.cc
index 1119575fd..acccf54d5 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1776,6 +1776,8 @@ void Version::GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta) {
           file->file_checksum, file->file_checksum_func_name);
       files.back().num_entries = file->num_entries;
       files.back().num_deletions = file->num_deletions;
+      files.back().smallest = file->smallest.Encode().ToString();
+      files.back().largest = file->largest.Encode().ToString();
       level_size += file->fd.GetFileSize();
     }
     cf_meta->levels.emplace_back(level, level_size, std::move(files));
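With the two lines added above, GetColumnFamilyMetaData() now surfaces each file's internal-key bounds. A sketch of reading them back; note that InternalKey lives in the internal header db/dbformat.h, so this is for illustration only, and `db`/`cfh` are assumed to be already open:

```cpp
#include <iostream>

#include "db/dbformat.h"  // internal header, provides InternalKey
#include "rocksdb/db.h"
#include "rocksdb/metadata.h"

using namespace ROCKSDB_NAMESPACE;

void DumpFileBounds(DB* db, ColumnFamilyHandle* cfh) {
  ColumnFamilyMetaData cf_meta;
  db->GetColumnFamilyMetaData(cfh, &cf_meta);
  for (const auto& level : cf_meta.levels) {
    for (const auto& file : level.files) {
      // `smallest`/`largest` are internal-key encoded; decode to get the
      // user-key portion.
      InternalKey smallest, largest;
      smallest.DecodeFrom(file.smallest);
      largest.DecodeFrom(file.largest);
      std::cout << file.relative_filename << " L" << level.level << " ["
                << smallest.user_key().ToString() << ", "
                << largest.user_key().ToString() << "]\n";
    }
  }
}
```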
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 09307efb3..fa6469231 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1717,11 +1717,12 @@ class DB {
       const std::vector<IngestExternalFileArg>& args) = 0;
 
   // CreateColumnFamilyWithImport() will create a new column family with
-  // column_family_name and import external SST files specified in metadata into
-  // this column family.
+  // column_family_name and import external SST files specified in `metadata`
+  // into this column family.
   // (1) External SST files can be created using SstFileWriter.
   // (2) External SST files can be exported from a particular column family in
-  //     an existing DB using Checkpoint::ExportColumnFamily.
+  //     an existing DB using Checkpoint::ExportColumnFamily. `metadata` should
+  //     be the output from Checkpoint::ExportColumnFamily.
   // Option in import_options specifies whether the external files are copied or
   // moved (default is copy). When option specifies copy, managing files at
   // external_file_path is caller's responsibility. When option specifies a
diff --git a/include/rocksdb/metadata.h b/include/rocksdb/metadata.h
index 3cdd8bd8a..4ab3842dd 100644
--- a/include/rocksdb/metadata.h
+++ b/include/rocksdb/metadata.h
@@ -148,6 +148,13 @@ struct SstFileMetaData : public FileStorageInfo {
   // For L0, larger `epoch_number` indicates newer L0 file.
   // 0 if the information is not available.
   uint64_t epoch_number = 0;
+
+  // These bounds define the effective key range for range tombstones
+  // in this file.
+  // Currently only used by CreateColumnFamilyWithImport().
+  std::string smallest{};  // Smallest internal key served by table
+  std::string largest{};   // Largest internal key served by table
+
   // DEPRECATED: The name of the file within its directory with a
   // leading slash (e.g. "/123456.sst"). Use relative_filename from base struct
   // instead.
diff --git a/utilities/checkpoint/checkpoint_impl.cc b/utilities/checkpoint/checkpoint_impl.cc
index 6c1a73819..4a0cc7159 100644
--- a/utilities/checkpoint/checkpoint_impl.cc
+++ b/utilities/checkpoint/checkpoint_impl.cc
@@ -372,17 +372,19 @@ Status CheckpointImpl::ExportColumnFamily(
     for (const auto& file_metadata : level_metadata.files) {
       LiveFileMetaData live_file_metadata;
       live_file_metadata.size = file_metadata.size;
-      live_file_metadata.name = std::move(file_metadata.name);
+      live_file_metadata.name = file_metadata.name;
       live_file_metadata.file_number = file_metadata.file_number;
       live_file_metadata.db_path = export_dir;
       live_file_metadata.smallest_seqno = file_metadata.smallest_seqno;
       live_file_metadata.largest_seqno = file_metadata.largest_seqno;
-      live_file_metadata.smallestkey = std::move(file_metadata.smallestkey);
-      live_file_metadata.largestkey = std::move(file_metadata.largestkey);
+      live_file_metadata.smallestkey = file_metadata.smallestkey;
+      live_file_metadata.largestkey = file_metadata.largestkey;
       live_file_metadata.oldest_blob_file_number =
           file_metadata.oldest_blob_file_number;
       live_file_metadata.epoch_number = file_metadata.epoch_number;
       live_file_metadata.level = level_metadata.level;
+      live_file_metadata.smallest = file_metadata.smallest;
+      live_file_metadata.largest = file_metadata.largest;
       result_metadata->files.push_back(live_file_metadata);
     }
     *metadata = result_metadata;
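(The `std::move` calls removed above were no-ops: `file_metadata` is a const reference, so moving from its members silently degraded to copies; plain assignment says what actually happens.) Finally, a sketch of what the exported metadata now carries: each LiveFileMetaData has the internal-key `smallest`/`largest` populated above, next to the pre-existing user-key `smallestkey`/`largestkey`. Assumes `metadata` came from Checkpoint::ExportColumnFamily():

```cpp
#include <iostream>

#include "rocksdb/metadata.h"

using namespace ROCKSDB_NAMESPACE;

void DumpExportedFiles(const ExportImportFilesMetaData& metadata) {
  std::cout << "comparator: " << metadata.db_comparator_name << "\n";
  for (const LiveFileMetaData& f : metadata.files) {
    // `smallestkey`/`largestkey` are user keys; `smallest`/`largest` are
    // internal-key encoded, so only their sizes are printed here.
    std::cout << f.name << " level=" << f.level << " user keys ["
              << f.smallestkey << ", " << f.largestkey << "]"
              << " internal bounds " << f.smallest.size() << "/"
              << f.largest.size() << " bytes\n";
  }
}
```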