diff --git a/CMakeLists.txt b/CMakeLists.txt
index 109981c1b..7797bde2c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1290,6 +1290,7 @@ if(WITH_TESTS)
         db/db_bloom_filter_test.cc
         db/db_compaction_filter_test.cc
         db/db_compaction_test.cc
+        db/db_clip_test.cc
         db/db_dynamic_level_test.cc
         db/db_encryption_test.cc
         db/db_flush_test.cc
diff --git a/HISTORY.md b/HISTORY.md
index d76c99e59..51533d204 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -9,6 +9,7 @@
 * New statistics `rocksdb.file.read.db.open.micros` that measures read time of block-based SST tables or blob files during db open.

 ### Public API Changes
+* EXPERIMENTAL: Add new API `DB::ClipColumnFamily` to clip the keys in a CF to a certain range. It physically deletes all keys outside the range, including tombstones.
 * Add `MakeSharedCache()` construction functions to various cache Options objects, and deprecated the `NewWhateverCache()` functions with long parameter lists.

 ### Behavior changes
diff --git a/Makefile b/Makefile
index 75a5d7359..b499e8be1 100644
--- a/Makefile
+++ b/Makefile
@@ -1480,6 +1480,9 @@ db_compaction_filter_test: $(OBJ_DIR)/db/db_compaction_filter_test.o $(TEST_LIBR
 db_compaction_test: $(OBJ_DIR)/db/db_compaction_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)

+db_clip_test: $(OBJ_DIR)/db/db_clip_test.o $(TEST_LIBRARY) $(LIBRARY)
+	$(AM_LINK)
+
 db_dynamic_level_test: $(OBJ_DIR)/db/db_dynamic_level_test.o $(TEST_LIBRARY) $(LIBRARY)
 	$(AM_LINK)

diff --git a/TARGETS b/TARGETS
index 52a29b774..ff5321312 100644
--- a/TARGETS
+++ b/TARGETS
@@ -4750,6 +4750,12 @@ cpp_unittest_wrapper(name="db_bloom_filter_test",
                      extra_compiler_flags=[])


+cpp_unittest_wrapper(name="db_clip_test",
+                     srcs=["db/db_clip_test.cc"],
+                     deps=[":rocksdb_test_lib"],
+                     extra_compiler_flags=[])
+
+
 cpp_unittest_wrapper(name="db_compaction_filter_test",
                      srcs=["db/db_compaction_filter_test.cc"],
                      deps=[":rocksdb_test_lib"],
diff --git a/db/db_clip_test.cc b/db/db_clip_test.cc
new file mode 100644
index 000000000..fd0bb5717
--- /dev/null
+++ b/db/db_clip_test.cc
@@ -0,0 +1,142 @@
+// Copyright (c) Meta Platforms, Inc. and affiliates.
+// This source code is licensed under both the GPLv2 (found in the
+// COPYING file in the root directory) and Apache 2.0 License
+// (found in the LICENSE.Apache file in the root directory).
+
+#include "db/db_test_util.h"
+#include "port/port.h"
+#include "util/random.h"
+
+namespace ROCKSDB_NAMESPACE {
+
+class DBClipTest : public DBTestBase {
+ public:
+  DBClipTest() : DBTestBase("db_clip_test", /*env_do_fsync=*/true) {}
+};
+
+TEST_F(DBClipTest, TestClipRange) {
+  Options options = CurrentOptions();
+  options.write_buffer_size = 10 * 1024 * 1024;
+  options.max_bytes_for_level_multiplier = 2;
+  options.num_levels = 3;
+  options.max_background_compactions = 3;
+  options.disable_auto_compactions = true;
+  options.statistics = CreateDBStatistics();
+
+  DestroyAndReopen(options);
+  int32_t value_size = 10 * 1024;  // 10 KB
+
+  Random rnd(301);
+  std::map<int, std::string> values;
+
+  // file [0 => 100), [100 => 200), ... [900, 1000)
+  for (auto i = 0; i < 10; i++) {
+    for (auto j = 0; j < 100; j++) {
+      auto k = i * 100 + j;
+      values[k] = rnd.RandomString(value_size);
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("10", FilesPerLevel(0));
+  auto begin_key = Key(251), end_key = Key(751);
+  ASSERT_OK(
+      db_->ClipColumnFamily(db_->DefaultColumnFamily(), begin_key, end_key));
+
+  for (auto i = 0; i < 251; i++) {
+    ReadOptions ropts;
+    std::string result;
+    auto s = db_->Get(ropts, Key(i), &result);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+  for (auto i = 251; i < 751; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+  for (auto i = 751; i < 1000; i++) {
+    ReadOptions ropts;
+    std::string result;
+    auto s = db_->Get(ropts, Key(i), &result);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+
+  std::vector<LiveFileMetaData> all_metadata;
+  db_->GetLiveFilesMetaData(&all_metadata);
+  for (auto& md : all_metadata) {
+    // make sure clip_begin_key <= file_smallestkey <= file_largestkey <=
+    // clip_end_key
+    bool in_range = false;
+
+    if (options.comparator->Compare(begin_key, md.smallestkey) <= 0 &&
+        options.comparator->Compare(end_key, md.largestkey) > 0) {
+      in_range = true;
+    }
+    ASSERT_TRUE(in_range);
+  }
+
+  CompactRangeOptions compact_options;
+  compact_options.change_level = true;
+  compact_options.target_level = 2;
+  ASSERT_OK(db_->CompactRange(compact_options, nullptr, nullptr));
+  ASSERT_EQ("0,0,3", FilesPerLevel(0));
+
+  for (auto i = 0; i < 10; i += 2) {
+    for (auto j = 0; j < 100; j++) {
+      auto k = i * 100 + j;
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("5,0,3", FilesPerLevel(0));
+  ASSERT_OK(dbfull()->TEST_CompactRange(0, nullptr, nullptr));
+  ASSERT_EQ("0,5,3", FilesPerLevel(0));
+
+  for (auto i = 1; i < 10; i += 2) {
+    for (auto j = 0; j < 100; j++) {
+      auto k = i * 100 + j;
+      ASSERT_OK(Put(Key(k), values[k]));
+    }
+    ASSERT_OK(Flush());
+  }
+  ASSERT_EQ("5,5,3", FilesPerLevel(0));
+
+  auto begin_key_2 = Key(222), end_key_2 = Key(888);
+
+  ASSERT_OK(db_->ClipColumnFamily(db_->DefaultColumnFamily(), begin_key_2,
+                                  end_key_2));
+
+  for (auto i = 0; i < 222; i++) {
+    ReadOptions ropts;
+    std::string result;
+    auto s = db_->Get(ropts, Key(i), &result);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+  for (auto i = 222; i < 888; i++) {
+    ASSERT_EQ(Get(Key(i)), values[i]);
+  }
+  for (auto i = 888; i < 1000; i++) {
+    ReadOptions ropts;
+    std::string result;
+    auto s = db_->Get(ropts, Key(i), &result);
+    ASSERT_TRUE(s.IsNotFound());
+  }
+
+  std::vector<LiveFileMetaData> all_metadata_2;
+  db_->GetLiveFilesMetaData(&all_metadata_2);
+  for (auto& md : all_metadata_2) {
+    // make sure clip_begin_key <= file_smallestkey <= file_largestkey <=
+    // clip_end_key
+    bool in_range = false;
+    if (begin_key_2.compare(md.smallestkey) <= 0 &&
+        end_key_2.compare(md.largestkey) > 0) {
+      in_range = true;
+    }
+    ASSERT_TRUE(in_range);
+  }
+}
+}  // namespace ROCKSDB_NAMESPACE
+
+int main(int argc, char** argv) {
+  ROCKSDB_NAMESPACE::port::InstallStackTraceHandler();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
\ No newline at end of file
diff --git a/db/db_impl/compacted_db_impl.h b/db/db_impl/compacted_db_impl.h
index e0e0d5664..9879d81b6 100644
--- a/db/db_impl/compacted_db_impl.h
+++ b/db/db_impl/compacted_db_impl.h
@@ -128,6 +128,13 @@ class CompactedDBImpl : public DBImpl {
     return Status::NotSupported("Not supported in compacted db mode.");
   }

+  using DB::ClipColumnFamily;
+  virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/,
+                                  const Slice& /*begin*/,
+                                  const Slice& /*end*/) override {
+    return Status::NotSupported("Not supported in compacted db mode.");
+  }
+
   // FIXME: some missing overrides for more "write" functions

   // Share with DBImplReadOnly?
diff --git a/db/db_impl/db_impl.cc b/db/db_impl/db_impl.cc
index 07aee535d..2ed0caf7f 100644
--- a/db/db_impl/db_impl.cc
+++ b/db/db_impl/db_impl.cc
@@ -4522,7 +4522,6 @@ void DBImpl::GetAllColumnFamilyMetaData(
   }
 }

-
 Status DBImpl::CheckConsistency() {
   mutex_.AssertHeld();
   std::vector<LiveFileMetaData> metadata;
@@ -5705,6 +5704,81 @@ Status DBImpl::CreateColumnFamilyWithImport(
   return status;
 }

+Status DBImpl::ClipColumnFamily(ColumnFamilyHandle* column_family,
+                                const Slice& begin_key, const Slice& end_key) {
+  assert(column_family);
+  Status status;
+  // Flush memtable
+  FlushOptions flush_opts;
+  flush_opts.allow_write_stall = true;
+  auto* cfd =
+      static_cast_with_check<ColumnFamilyHandleImpl>(column_family)->cfd();
+  if (immutable_db_options_.atomic_flush) {
+    status = AtomicFlushMemTables(flush_opts, FlushReason::kDeleteFiles,
+                                  {} /* provided_candidate_cfds */,
+                                  false /* entered_write_thread */);
+  } else {
+    status = FlushMemTable(cfd, flush_opts, FlushReason::kDeleteFiles,
+                           false /* entered_write_thread */);
+  }
+
+  if (status.ok()) {
+    // DeleteFilesInRanges for the non-overlapping files (except on L0)
+    std::vector<RangePtr> ranges;
+    ranges.push_back(RangePtr(nullptr, &begin_key));
+    ranges.push_back(RangePtr(&end_key, nullptr));
+    status = DeleteFilesInRanges(column_family, ranges.data(), ranges.size());
+  }
+
+  // DeleteRange the remaining overlapping keys
+  bool empty_after_delete = false;
+  if (status.ok()) {
+    Slice smallest_user_key, largest_user_key;
+    {
+      // Lock db mutex
+      InstrumentedMutexLock l(&mutex_);
+      cfd->current()->GetSstFilesBoundaryKeys(&smallest_user_key,
+                                              &largest_user_key);
+    }
+    // all the files have been deleted after DeleteFilesInRanges
+    if (smallest_user_key.empty() && largest_user_key.empty()) {
+      empty_after_delete = true;
+    } else {
+      const Comparator* const ucmp = column_family->GetComparator();
+      WriteOptions wo;
+      // Delete [smallest_user_key, clip_begin_key)
+      if (ucmp->Compare(smallest_user_key, begin_key) < 0) {
+        status = DeleteRange(wo, column_family, smallest_user_key, begin_key);
+      }
+
+      if (status.ok()) {
+        // Delete [clip_end_key, largest_user_key]
+        if (ucmp->Compare(end_key, largest_user_key) < 0) {
+          status = DeleteRange(wo, column_family, end_key, largest_user_key);
+          if (status.ok()) {
+            status = Delete(wo, column_family, largest_user_key);
+          }
+        }
+      }
+    }
+  }
+
+  if (status.ok() && !empty_after_delete) {
+    // CompactRange to drop all the tombstones
+    CompactRangeOptions compact_options;
+    compact_options.exclusive_manual_compaction = true;
+    compact_options.bottommost_level_compaction =
+        BottommostLevelCompaction::kForceOptimized;
+    // We could just compact the ranges [null, clip_begin_key] and
+    // [clip_end_key, null]. But due to how manual compaction calculates the
+    // last level to compact to and that range tombstones are not dropped
+    // during non-bottommost compactions, calling CompactRange() on these two
+    // ranges may not clear all range tombstones.
+    status = CompactRange(compact_options, nullptr, nullptr);
+  }
+  return status;
+}
+
 Status DBImpl::VerifyFileChecksums(const ReadOptions& read_options) {
   return VerifyChecksumInternal(read_options, /*use_file_checksum=*/true);
 }
diff --git a/db/db_impl/db_impl.h b/db/db_impl/db_impl.h
index 8f782a34d..f2da7467d 100644
--- a/db/db_impl/db_impl.h
+++ b/db/db_impl/db_impl.h
@@ -534,6 +534,11 @@ class DBImpl : public DB {
       const ExportImportFilesMetaData& metadata,
       ColumnFamilyHandle** handle) override;

+  using DB::ClipColumnFamily;
+  virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family,
+                                  const Slice& begin_key,
+                                  const Slice& end_key) override;
+
   using DB::VerifyFileChecksums;
   Status VerifyFileChecksums(const ReadOptions& read_options) override;

diff --git a/db/db_impl/db_impl_readonly.h b/db/db_impl/db_impl_readonly.h
index 1cc374198..a694acc00 100644
--- a/db/db_impl/db_impl_readonly.h
+++ b/db/db_impl/db_impl_readonly.h
@@ -142,6 +142,13 @@ class DBImplReadOnly : public DBImpl {
     return Status::NotSupported("Not supported operation in read only mode.");
   }

+  using DB::ClipColumnFamily;
+  virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/,
+                                  const Slice& /*begin*/,
+                                  const Slice& /*end*/) override {
+    return Status::NotSupported("Not supported operation in read only mode.");
+  }
+
   // FIXME: some missing overrides for more "write" functions

 protected:
diff --git a/db/db_test.cc b/db/db_test.cc
index fdf82153c..d23daa55d 100644
--- a/db/db_test.cc
+++ b/db/db_test.cc
@@ -3107,6 +3107,13 @@ class ModelDB : public DB {
     return Status::NotSupported("Not implemented.");
   }

+  using DB::ClipColumnFamily;
+  virtual Status ClipColumnFamily(ColumnFamilyHandle* /*column_family*/,
+                                  const Slice& /*begin*/,
+                                  const Slice& /*end*/) override {
+    return Status::NotSupported("Not implemented.");
+  }
+
   using DB::GetPropertiesOfAllTables;
   Status GetPropertiesOfAllTables(
       ColumnFamilyHandle* /*column_family*/,
diff --git a/db/version_set.cc b/db/version_set.cc
index b9610331e..674c0e4aa 100644
--- a/db/version_set.cc
+++ b/db/version_set.cc
@@ -1826,6 +1826,49 @@ uint64_t Version::GetSstFilesSize() {
   return sst_files_size;
 }

+void Version::GetSstFilesBoundaryKeys(Slice* smallest_user_key,
+                                      Slice* largest_user_key) {
+  smallest_user_key->clear();
+  largest_user_key->clear();
+  bool initialized = false;
+  const Comparator* ucmp = storage_info_.user_comparator_;
+  for (int level = 0; level < cfd_->NumberLevels(); level++) {
+    if (storage_info_.LevelFiles(level).size() == 0) {
+      continue;
+    }
+    if (level == 0) {
+      // we need to consider all files on level 0
+      for (const auto& file : storage_info_.LevelFiles(level)) {
+        const Slice& start_user_key = file->smallest.user_key();
+        if (!initialized ||
+            ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+          *smallest_user_key = start_user_key;
+        }
+        const Slice& end_user_key = file->largest.user_key();
+        if (!initialized ||
+            ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+          *largest_user_key = end_user_key;
+        }
+        initialized = true;
+      }
+    } else {
+      // we only need to consider the first and last file
+      const Slice& start_user_key =
+          storage_info_.LevelFiles(level)[0]->smallest.user_key();
+      if (!initialized ||
+          ucmp->Compare(start_user_key, *smallest_user_key) < 0) {
+        *smallest_user_key = start_user_key;
+      }
+      const Slice& end_user_key =
+          storage_info_.LevelFiles(level).back()->largest.user_key();
+      if (!initialized || ucmp->Compare(end_user_key, *largest_user_key) > 0) {
+        *largest_user_key = end_user_key;
+      }
+      initialized = true;
+    }
+  }
+}
+
 void Version::GetCreationTimeOfOldestFile(uint64_t* creation_time) {
   uint64_t oldest_time = std::numeric_limits<uint64_t>::max();
   for (int level = 0; level < storage_info_.num_non_empty_levels_; level++) {
diff --git a/db/version_set.h b/db/version_set.h
index e7e96bc6c..25ad36583 100644
--- a/db/version_set.h
+++ b/db/version_set.h
@@ -992,6 +992,9 @@ class Version {

   void GetColumnFamilyMetaData(ColumnFamilyMetaData* cf_meta);

+  void GetSstFilesBoundaryKeys(Slice* smallest_user_key,
+                               Slice* largest_user_key);
+
   uint64_t GetSstFilesSize();

   // Retrieves the file_creation_time of the oldest file in the DB.
diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h
index 6539eb8ae..a42f5b8b6 100644
--- a/include/rocksdb/db.h
+++ b/include/rocksdb/db.h
@@ -1763,6 +1763,22 @@ class DB {
       const ExportImportFilesMetaData& metadata,
       ColumnFamilyHandle** handle) = 0;

+  // EXPERIMENTAL
+  // ClipColumnFamily() will clip the entries in the CF according to the range
+  // [begin_key, end_key).
+  // Returns OK on success, and a non-OK status on error.
+  // Any entries outside this range will be completely deleted (including
+  // tombstones).
+  // The main difference between ClipColumnFamily(begin, end) and
+  // DeleteRange(begin, end) is that the former physically deletes all keys
+  // outside the range, but is more heavyweight than the latter.
+  // This feature is mainly used to ensure that there are no overlapping keys
+  // when calling CreateColumnFamilyWithImport() to import multiple CFs.
+  // Note that concurrent updates cannot be performed during Clip.
+  virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family,
+                                  const Slice& begin_key,
+                                  const Slice& end_key) = 0;
+
   // Verify the checksums of files in db. Currently the whole-file checksum of
   // table files are checked.
   virtual Status VerifyFileChecksums(const ReadOptions& /*read_options*/) {
diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h
index abb365ad3..1a87a1136 100644
--- a/include/rocksdb/utilities/stackable_db.h
+++ b/include/rocksdb/utilities/stackable_db.h
@@ -178,6 +178,13 @@ class StackableDB : public DB {
                                            import_options, metadata, handle);
   }

+  using DB::ClipColumnFamily;
+  virtual Status ClipColumnFamily(ColumnFamilyHandle* column_family,
+                                  const Slice& begin_key,
+                                  const Slice& end_key) override {
+    return db_->ClipColumnFamily(column_family, begin_key, end_key);
+  }
+
   using DB::VerifyFileChecksums;
   Status VerifyFileChecksums(const ReadOptions& read_opts) override {
     return db_->VerifyFileChecksums(read_opts);
diff --git a/src.mk b/src.mk
index baaea29be..eb70ac04b 100644
--- a/src.mk
+++ b/src.mk
@@ -460,6 +460,7 @@ TEST_MAIN_SOURCES = \
   db/db_bloom_filter_test.cc \
   db/db_compaction_filter_test.cc \
   db/db_compaction_test.cc \
+  db/db_clip_test.cc \
  db/db_dynamic_level_test.cc \
   db/db_encryption_test.cc \
   db/db_flush_test.cc \
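
For reference, a minimal, hypothetical caller-side sketch of the new API (not part of the patch): the database path, options, and clip boundaries below are illustrative assumptions; only DB::Open(), DefaultColumnFamily(), and the ClipColumnFamily() signature come from the change above.

// clip_example.cc -- hypothetical caller of DB::ClipColumnFamily().
#include <cassert>

#include "rocksdb/db.h"
#include "rocksdb/options.h"

int main() {
  rocksdb::DB* db = nullptr;
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/clip_example", &db);
  assert(s.ok());

  // ... populate the default column family, e.g. with regular writes or
  // CreateColumnFamilyWithImport() ...

  // Keep only keys in ["b", "m"); everything outside that range, including
  // tombstones, is physically removed (internally: flush, DeleteFilesInRanges
  // on the non-overlapping files, DeleteRange on the remainder, then a full
  // manual compaction to drop the tombstones).
  s = db->ClipColumnFamily(db->DefaultColumnFamily(), "b", "m");
  assert(s.ok());

  delete db;
  return 0;
}

As the db.h comment above notes, concurrent updates to the column family must not run while the clip is in progress.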