From 8d8eb0e77e13a3902d23fbda742dc47aa7bc418f Mon Sep 17 00:00:00 2001 From: "mayue.fight" Date: Thu, 18 May 2023 13:25:01 -0700 Subject: [PATCH] Support Clip DB to KeyRange (#11379) Summary: This PR is part of the request https://github.com/facebook/rocksdb/issues/11317. (Another part is https://github.com/facebook/rocksdb/pull/11378) ClipDB() will clip the entries in the CF according to the range [begin_key, end_key). All the entries outside this range will be completely deleted (including tombstones). This feature is mainly used to ensure that there is no overlapping Key when calling CreateColumnFamilyWithImports() to import multiple CFs. When Calling ClipDB [begin, end), there are the following steps 1. Quickly and directly delete files without overlap DeleteFilesInRanges(nullptr, begin) + DeleteFilesInRanges(end, nullptr) 2. Delete the Key outside the range Delete[smallest_key, begin) + Delete[end, largest_key] 3. Delete the tombstone through Manul Compact CompactRange(option, nullptr, nullptr) Pull Request resolved: https://github.com/facebook/rocksdb/pull/11379 Reviewed By: ajkr Differential Revision: D45840358 Pulled By: cbi42 fbshipit-source-id: 54152e8a45fd8ede137f99787eb252f0b51440a4 --- CMakeLists.txt | 1 + HISTORY.md | 1 + Makefile | 3 + TARGETS | 6 + db/db_clip_test.cc | 142 +++++++++++++++++++++++ db/db_impl/compacted_db_impl.h | 7 ++ db/db_impl/db_impl.cc | 76 +++++++++++- db/db_impl/db_impl.h | 5 + db/db_impl/db_impl_readonly.h | 7 ++ db/db_test.cc | 7 ++ db/version_set.cc | 43 +++++++ db/version_set.h | 3 + include/rocksdb/db.h | 19 +++ include/rocksdb/utilities/stackable_db.h | 7 ++ src.mk | 1 + 15 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 db/db_clip_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index 109981c1b..7797bde2c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1290,6 +1290,7 @@ if(WITH_TESTS) db/db_bloom_filter_test.cc db/db_compaction_filter_test.cc db/db_compaction_test.cc + db/db_clip_test.cc db/db_dynamic_level_test.cc db/db_encryption_test.cc db/db_flush_test.cc diff --git a/HISTORY.md b/HISTORY.md index d76c99e59..51533d204 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -9,6 +9,7 @@ * New statistics `rocksdb.file.read.db.open.micros` that measures read time of block-based SST tables or blob files during db open. ### Public API Changes +* EXPERIMENTAL: Add new API `DB::ClipColumnFamily` to clip the key in CF to a certain range. It will physically deletes all keys outside the range including tombstones. * Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists. ### Behavior changes diff --git a/Makefile b/Makefile index 75a5d7359..b499e8be1 100644 --- a/Makefile +++ b/Makefile @@ -1480,6 +1480,9 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) +db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY) + $(AM_LINK) + db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY) $(AM_LINK) diff --git a/TARGETS b/TARGETS index 52a29b774..ff5321312 100644 --- a/TARGETS +++ b/TARGETS @@ -4750,6 +4750,12 @@ cpp_unittest_wrapper(name="db_bloom_filter_test", extra_compiler_flags=[]) +cpp_unittest_wrapper(name="db_clip_test", + srcs=["db/db_clip_test.cc"], + deps=[":rocksdb_test_lib"], + extra_compiler_flags=[]) + + cpp_unittest_wrapper(name="db_compaction_filter_test", srcs=["db/db_compaction_filter_test.cc"], deps=[":rocksdb_test_lib"], diff --git a/db/db_clip_test.cc b/db/db_clip_test.cc new file mode 100644 index 000000000..fd0bb5717 --- /dev/null +++ b/db/db_clip_test.cc @@ -0,0 +1,142 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// This source code is licensed under both the GPLv2 (found in the +// COPYING file in the root directory) and Apache 2.0 License +// (found in the LICENSE.Apache file in the root directory). + +#include "db/db_test_util.h" +#include "port/port.h" +#include "util/random.h" + +namespace ROCKSDB_NAMESPACE { + +class DBClipTest : public DBTestBase { + public: + DBClipTest() : DBTestBase("db_clip_test", /*env_do_fsync=*/true) {} +}; + +TEST_F(DBClipTest, TestClipRange) { + Options options = CurrentOptions(); + options.write_buffer_size = 10 * 1024 * 1024; + options.max_bytes_for_level_multiplier = 2; + options.num_levels = 3; + options.max_background_compactions = 3; + options.disable_auto_compactions = true; + options.statistics = CreateDBStatistics(); + + DestroyAndReopen(options); + int32_t value_size = 10 * 1024; // 10 KB + + Random rnd(301); + std::map values; + + // file [0 => 100), [100 => 200), ... [900, 1000) + for (auto i = 0; i < 10; i++) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + values[k] = rnd.RandomString(value_size); + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("10", FilesPerLevel(0)); + auto begin_key = Key(251), end_key = Key(751); + ASSERT_OK( + db_->ClipColumnFamily(db_->DefaultColumnFamily(), begin_key, end_key)); + + for (auto i = 0; i < 251; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 251; i < 751; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 751; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + + std::vector all_metadata; + db_->GetLiveFilesMetaData(&all_metadata); + for (auto& md : all_metadata) { + // make sure clip_begin_key <= file_smallestkey <= file_largestkey <= + // clip_end_key + bool in_range = false; + + if (options.comparator->Compare(begin_key, md.smallestkey) <= 0 && + options.comparator->Compare(end_key, md.largestkey) > 0) { + in_range = true; + } + ASSERT_TRUE(in_range); + } + + CompactRangeOptions compact_options; + compact_options.change_level = true; + compact_options.target_level = 2; + ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr)); + ASSERT_EQ("0,0,3", FilesPerLevel(0)); + + for (auto i = 0; i < 10; i += 2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,0,3", FilesPerLevel(0)); + ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr)); + ASSERT_EQ("0,5,3", FilesPerLevel(0)); + + for (auto i = 1; i < 10; i += 2) { + for (auto j = 0; j < 100; j++) { + auto k = i * 100 + j; + ASSERT_OK(Put(Key(k), values[k])); + } + ASSERT_OK(Flush()); + } + ASSERT_EQ("5,5,3", FilesPerLevel(0)); + + auto begin_key_2 = Key(222), end_key_2 = Key(888); + + ASSERT_OK(db_->ClipColumnFamily(db_->DefaultColumnFamily(), begin_key_2, + end_key_2)); + + for (auto i = 0; i < 222; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + for (auto i = 222; i < 888; i++) { + ASSERT_EQ(Get(Key(i)), values[i]); + } + for (auto i = 888; i < 1000; i++) { + ReadOptions ropts; + std::string result; + auto s = db_->Get(ropts, Key(i), &result); + ASSERT_TRUE(s.IsNotFound()); + } + + std::vector all_metadata_2; + db_->GetLiveFilesMetaData(&all_metadata_2); + for (auto& md : all_metadata_2) { + // make sure clip_begin_key <= file_smallestkey <= file_largestkey <= + // clip_end_key + bool in_range = false; + if (begin_key_2.compare(md.smallestkey) <= 0 && + end_key_2.compare(md.largestkey) > 0) { + in_range = true; + } + ASSERT_TRUE(in_range); + } +} +} // namespace ROCKSDB_NAMESPACE + +int main(int argc, char** argv) { + ROCKSDB_NAMESPACE::port::InstallStackTraceHandler(); + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} \ No newline at end of file diff --git a/db/db_impl/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h index e0e0d5664..9879d81b6 100644 --- a/db/db_impl/compacted_db_impl.h +++ b/db/db_impl/compacted_db_impl.h @@ -128,6 +128,13 @@ class CompactedDBImpl : public DBImpl { return Status::NotSupported("Not supported in compacted db mode."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not supported in compacted db mode."); + } + // FIXME: some missing overrides for more "write" functions // Share with DBImplReadOnly? diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc index 07aee535d..2ed0caf7f 100644 --- a/db/db_impl/db_impl.cc +++ b/db/db_impl/db_impl.cc @@ -4522,7 +4522,6 @@ void DBImpl::GetAllColumnFamilyMetaData( } } - Status DBImpl::CheckConsistency() { mutex_.AssertHeld(); std::vector metadata; @@ -5705,6 +5704,81 @@ Status DBImpl::CreateColumnFamilyWithImport( return status; } +Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, const Slice& end_key) { + assert(column_family); + Status status; + // Flush memtable + FlushOptions flush_opts; + flush_opts.allow_write_stall = true; + auto* cfd = + static_cast_with_check(column_family)->cfd(); + if (immutable_db_options_.atomic_flush) { + status = AtomicFlushMemTables(flush_opts, FlushReason::kDeleteFiles, + {} /* provided_candidate_cfds */, + false /* entered_write_thread */); + } else { + status = FlushMemTable(cfd, flush_opts, FlushReason::kDeleteFiles, + false /* entered_write_thread */); + } + + if (status.ok()) { + // DeleteFilesInRanges non-overlap files except L0 + std::vector ranges; + ranges.push_back(RangePtr(nullptr, &begin_key)); + ranges.push_back(RangePtr(&end_key, nullptr)); + status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size()); + } + + // DeleteRange the remaining overlapping keys + bool empty_after_delete = false; + if (status.ok()) { + Slice smallest_user_key, largest_user_key; + { + // Lock db mutex + InstrumentedMutexLock l(&mutex_); + cfd->current()->GetSstFilesBoundaryKeys(&smallest_user_key, + &largest_user_key); + } + // all the files has been deleted after DeleteFilesInRanges; + if (smallest_user_key.empty() && largest_user_key.empty()) { + empty_after_delete = true; + } else { + const Comparator* const ucmp = column_family->GetComparator(); + WriteOptions wo; + // Delete [smallest_user_key, clip_begin_key) + if (ucmp->Compare(smallest_user_key, begin_key) < 0) { + status = DeleteRange(wo, column_family, smallest_user_key, begin_key); + } + + if (status.ok()) { + // Delete [clip_end_key, largest_use_key] + if (ucmp->Compare(end_key, largest_user_key) < 0) { + status = DeleteRange(wo, column_family, end_key, largest_user_key); + if (status.ok()) { + status = Delete(wo, column_family, largest_user_key); + } + } + } + } + } + + if (status.ok() && !empty_after_delete) { + // CompactRange delete all the tombstones + CompactRangeOptions compact_options; + compact_options.exclusive_manual_compaction = true; + compact_options.bottommost_level_compaction = + BottommostLevelCompaction::kForceOptimized; + // We could just compact the ranges [null, clip_begin_key] and + // [clip_end_key, null]. But due to how manual compaction calculates the + // last level to compact to and that range tombstones are not dropped + // during non-bottommost compactions, calling CompactRange() on these two + // ranges may not clear all range tombstones. + status = CompactRange(compact_options, nullptr, nullptr); + } + return status; +} + Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) { return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true); } diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h index 8f782a34d..f2da7467d 100644 --- a/db/db_impl/db_impl.h +++ b/db/db_impl/db_impl.h @@ -534,6 +534,11 @@ class DBImpl : public DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) override; + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override; + using DB::VerifyFileChecksums; Status VerifyFileChecksums(const ReadOptions& read_options) override; diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h index 1cc374198..a694acc00 100644 --- a/db/db_impl/db_impl_readonly.h +++ b/db/db_impl/db_impl_readonly.h @@ -142,6 +142,13 @@ class DBImplReadOnly : public DBImpl { return Status::NotSupported("Not supported operation in read only mode."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not supported operation in read only mode."); + } + // FIXME: some missing overrides for more "write" functions protected: diff --git a/db/db_test.cc b/db/db_test.cc index fdf82153c..d23daa55d 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -3107,6 +3107,13 @@ class ModelDB : public DB { return Status::NotSupported("Not implemented."); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/, + const Slice& /*begin*/, + const Slice& /*end*/) override { + return Status::NotSupported("Not implemented."); + } + using DB::GetPropertiesOfAllTables; Status GetPropertiesOfAllTables( ColumnFamilyHandle* /*column_family*/, diff --git a/db/version_set.cc b/db/version_set.cc index b9610331e..674c0e4aa 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1826,6 +1826,49 @@ uint64_t Version::GetSstFilesSize() { return sst_files_size; } +void Version::GetSstFilesBoundaryKeys(Slice* smallest_user_key, + Slice* largest_user_key) { + smallest_user_key->clear(); + largest_user_key->clear(); + bool initialized = false; + const Comparator* ucmp = storage_info_.user_comparator_; + for (int level = 0; level < cfd_->NumberLevels(); level++) { + if (storage_info_.LevelFiles(level).size() == 0) { + continue; + } + if (level == 0) { + // we need to consider all files on level 0 + for (const auto& file : storage_info_.LevelFiles(level)) { + const Slice& start_user_key = file->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = file->largest.user_key(); + if (!initialized || + ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } else { + // we only need to consider the first and last file + const Slice& start_user_key = + storage_info_.LevelFiles(level)[0]->smallest.user_key(); + if (!initialized || + ucmp->Compare(start_user_key, *smallest_user_key) < 0) { + *smallest_user_key = start_user_key; + } + const Slice& end_user_key = + storage_info_.LevelFiles(level).back()->largest.user_key(); + if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) { + *largest_user_key = end_user_key; + } + initialized = true; + } + } +} + void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) { uint64_t oldest_time = std::numeric_limits::max(); for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) { diff --git a/db/version_set.h b/db/version_set.h index e7e96bc6c..25ad36583 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -992,6 +992,9 @@ class Version { void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta); + void GetSstFilesBoundaryKeys(Slice* smallest_user_key, + Slice* largest_user_key); + uint64_t GetSstFilesSize(); // Retrieves the file_creation_time of the oldest file in the DB. diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 6539eb8ae..a42f5b8b6 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -1763,6 +1763,25 @@ class DB { const ExportImportFilesMetaData& metadata, ColumnFamilyHandle** handle) = 0; + // EXPERIMENTAL + // ClipColumnFamily() will clip the entries in the CF according to the range + // [begin_key, + // end_key). + // Returns OK on success, and a non-OK status on error. + // Any entries outside this range will be completely deleted (including + // tombstones). + // The main difference between ClipColumnFamily(begin, end) and + // DeleteRange(begin, end) + // is that the former physically deletes all keys outside the range, but is + // more heavyweight than the latter. + // This feature is mainly used to ensure that there is no overlapping Key when + // calling + // CreateColumnFamilyWithImports() to import multiple CFs. + // Note that: concurrent updates cannot be performed during Clip. + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) = 0; + // Verify the checksums of files in db. Currently the whole-file checksum of // table files are checked. virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) { diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index abb365ad3..1a87a1136 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -178,6 +178,13 @@ class StackableDB : public DB { import_options, metadata, handle); } + using DB::ClipColumnFamily; + virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family, + const Slice& begin_key, + const Slice& end_key) override { + return db_->ClipColumnFamily(column_family, begin_key, end_key); + } + using DB::VerifyFileChecksums; Status VerifyFileChecksums(const ReadOptions& read_opts) override { return db_->VerifyFileChecksums(read_opts); diff --git a/src.mk b/src.mk index baaea29be..eb70ac04b 100644 --- a/src.mk +++ b/src.mk @@ -460,6 +460,7 @@ TEST_MAIN_SOURCES = \ db/db_bloom_filter_test.cc \ db/db_compaction_filter_test.cc \ db/db_compaction_test.cc \ + db/db_clip_test.cc \ db/db_dynamic_level_test.cc \ db/db_encryption_test.cc \ db/db_flush_test.cc \