From a207c278096d2d7a37fa28198e8c3a0f7e60a48c Mon Sep 17 00:00:00 2001 From: Peter Dillinger Date: Mon, 16 Aug 2021 20:36:19 -0700 Subject: [PATCH] Stable cache keys using DB session ids in SSTs (#8659) Summary: Use DB session ids in SST table properties to make cache keys stable across DB re-open and copy / move / restore / etc. These new cache keys are currently only enabled when FileSystem does not provide GetUniqueId. For now, they are typically larger, so slightly less efficient. Relevant to https://github.com/facebook/rocksdb/issues/7405 This change has a minor regression in PersistentCache functionality: metaindex blocks are no longer cached in PersistentCache. Table properties blocks already were not but ideally should be. I didn't spent effort to fix & test these issues because we don't believe PersistentCache is used much if at all and expect SecondaryCache to replace it. (Though PRs are welcome.) FIXME: there is more to be fixed for stable cache keys on external SST files Pull Request resolved: https://github.com/facebook/rocksdb/pull/8659 Test Plan: new unit test added, which fails when disabling new functionality Reviewed By: zhichao-cao Differential Revision: D30297705 Pulled By: pdillinger fbshipit-source-id: e8539a5c8802a79340405629870f2e3fb3822d3a --- db/db_basic_test.cc | 5 + db/db_block_cache_test.cc | 99 +++++++++++++++++++ .../block_based/block_based_table_builder.cc | 1 + table/block_based/block_based_table_reader.cc | 75 ++++++++++---- table/block_based/block_based_table_reader.h | 13 +-- table/meta_blocks.cc | 2 + 6 files changed, 170 insertions(+), 25 deletions(-) diff --git a/db/db_basic_test.cc b/db/db_basic_test.cc index 13644025b..b22010528 100644 --- a/db/db_basic_test.cc +++ b/db/db_basic_test.cc @@ -2756,6 +2756,11 @@ class DBBasicTestMultiGet : public DBTestBase { EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf])); } } + // Clear compressed cache, which is always pre-populated + if (compressed_cache_) { + compressed_cache_->SetCapacity(0); + compressed_cache_->SetCapacity(1048576); + } } bool CheckValue(int i, const std::string& value) { diff --git a/db/db_block_cache_test.cc b/db/db_block_cache_test.cc index 0b4389fd1..c7c100f63 100644 --- a/db/db_block_cache_test.cc +++ b/db/db_block_cache_test.cc @@ -7,6 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #include +#include #include #include "cache/cache_entry_roles.h" @@ -14,9 +15,11 @@ #include "db/column_family.h" #include "db/db_test_util.h" #include "port/stack_trace.h" +#include "rocksdb/statistics.h" #include "rocksdb/table.h" #include "util/compression.h" #include "util/random.h" +#include "utilities/fault_injection_fs.h" namespace ROCKSDB_NAMESPACE { @@ -1298,6 +1301,102 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) { #endif // ROCKSDB_LITE +// Disable LinkFile so that we can physically copy a DB using Checkpoint. +// Disable file GetUniqueId to enable stable cache keys. +class StableCacheKeyTestFS : public FaultInjectionTestFS { + public: + explicit StableCacheKeyTestFS(const std::shared_ptr& base) + : FaultInjectionTestFS(base) { + SetFailGetUniqueId(true); + } + + virtual ~StableCacheKeyTestFS() {} + + IOStatus LinkFile(const std::string&, const std::string&, const IOOptions&, + IODebugContext*) override { + return IOStatus::NotSupported("Disabled"); + } +}; + +TEST_F(DBBlockCacheTest, StableCacheKeys) { + std::shared_ptr test_fs{ + new StableCacheKeyTestFS(env_->GetFileSystem())}; + std::unique_ptr test_env{ + new CompositeEnvWrapper(env_, test_fs)}; + + for (bool compressed : {false, true}) { + Options options = CurrentOptions(); + options.create_if_missing = true; + options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics(); + options.env = test_env.get(); + + BlockBasedTableOptions table_options; + + std::function verify_stats; + if (compressed) { + if (!Snappy_Supported()) { + fprintf(stderr, "skipping compressed test, snappy unavailable\n"); + continue; + } + options.compression = CompressionType::kSnappyCompression; + table_options.no_block_cache = true; + table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false); + verify_stats = [&options] { + ASSERT_EQ( + 1, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD)); + }; + } else { + table_options.cache_index_and_filter_blocks = true; + table_options.block_cache = NewLRUCache(1 << 25, 0, false); + verify_stats = [&options] { + ASSERT_EQ(1, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD)); + ASSERT_EQ(1, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD)); + ASSERT_EQ(1, + options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD)); + }; + } + + table_options.filter_policy.reset(NewBloomFilterPolicy(10, false)); + options.table_factory.reset(NewBlockBasedTableFactory(table_options)); + DestroyAndReopen(options); + + ASSERT_OK(Put("key1", "abc")); + std::string something_compressible(500U, 'x'); + ASSERT_OK(Put("key2", something_compressible)); + ASSERT_OK(Flush()); + + ASSERT_EQ(Get("key1"), std::string("abc")); + verify_stats(); + + // Make sure we can cache hit after re-open + Reopen(options); + + ASSERT_EQ(Get("key1"), std::string("abc")); + verify_stats(); + + // Make sure we can cache hit even on a full copy of the DB. Using + // StableCacheKeyTestFS, Checkpoint will resort to full copy not hard link. + // (Checkpoint not available in LITE mode to test this.) +#ifndef ROCKSDB_LITE + auto db_copy_name = dbname_ + "-copy"; + Checkpoint* checkpoint; + ASSERT_OK(Checkpoint::Create(db_, &checkpoint)); + ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name)); + delete checkpoint; + + Close(); + Destroy(options); + dbname_ = db_copy_name; + Reopen(options); + + ASSERT_EQ(Get("key1"), std::string("abc")); + verify_stats(); +#endif // !ROCKSDB_LITE + + Close(); + } +} + class DBBlockCachePinningTest : public DBTestBase, public testing::WithParamInterface< diff --git a/table/block_based/block_based_table_builder.cc b/table/block_based/block_based_table_builder.cc index 07c545656..2b20cdc46 100644 --- a/table/block_based/block_based_table_builder.cc +++ b/table/block_based/block_based_table_builder.cc @@ -1398,6 +1398,7 @@ void DeleteEntryCached(const Slice& /*key*/, void* value) { // Helper function to setup the cache key's prefix for the Table. void BlockBasedTableBuilder::SetupCacheKeyPrefix( const TableBuilderOptions& tbo) { + // FIXME: Unify with BlockBasedTable::SetupCacheKeyPrefix if (rep_->table_options.block_cache.get() != nullptr) { BlockBasedTable::GenerateCachePrefix( rep_->table_options.block_cache.get(), rep_->file->writable_file(), diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index f6eca75e3..5c020084d 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -57,6 +57,7 @@ #include "table/meta_blocks.h" #include "table/multiget_context.h" #include "table/persistent_cache_helper.h" +#include "table/persistent_cache_options.h" #include "table/sst_file_writer_collectors.h" #include "table/two_level_iterator.h" #include "test_util/sync_point.h" @@ -371,7 +372,7 @@ Cache::Handle* BlockBasedTable::GetEntryFromCache( // Helper function to setup the cache key's prefix for the Table. void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, const std::string& db_session_id, - uint64_t cur_file_num) { + uint64_t file_num) { assert(kMaxCacheKeyPrefixSize >= 10); rep->cache_key_prefix_size = 0; rep->compressed_cache_key_prefix_size = 0; @@ -379,19 +380,28 @@ void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep, GenerateCachePrefix( rep->table_options.block_cache.get(), rep->file->file(), &rep->cache_key_prefix[0], &rep->cache_key_prefix_size, db_session_id, - cur_file_num); - } - if (rep->table_options.persistent_cache != nullptr) { - GenerateCachePrefix( - rep->table_options.persistent_cache.get(), rep->file->file(), - &rep->persistent_cache_key_prefix[0], - &rep->persistent_cache_key_prefix_size, "", cur_file_num); + file_num); } if (rep->table_options.block_cache_compressed != nullptr) { GenerateCachePrefix( rep->table_options.block_cache_compressed.get(), rep->file->file(), &rep->compressed_cache_key_prefix[0], - &rep->compressed_cache_key_prefix_size, "", cur_file_num); + &rep->compressed_cache_key_prefix_size, db_session_id, file_num); + } + if (rep->table_options.persistent_cache != nullptr) { + char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize]; + size_t persistent_cache_key_prefix_size = 0; + + GenerateCachePrefix( + rep->table_options.persistent_cache.get(), rep->file->file(), + &persistent_cache_key_prefix[0], &persistent_cache_key_prefix_size, + db_session_id, file_num); + + rep->persistent_cache_options = + PersistentCacheOptions(rep->table_options.persistent_cache, + std::string(persistent_cache_key_prefix, + persistent_cache_key_prefix_size), + rep->ioptions.stats); } } @@ -513,7 +523,7 @@ Status BlockBasedTable::Open( const SequenceNumber largest_seqno, const bool force_direct_prefetch, TailPrefetchStats* tail_prefetch_stats, BlockCacheTracer* const block_cache_tracer, - size_t max_file_size_for_l0_meta_pin, const std::string& db_session_id, + size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id, uint64_t cur_file_num) { table_reader->reset(); @@ -588,16 +598,11 @@ Status BlockBasedTable::Open( rep->internal_prefix_transform.reset( new InternalKeySliceTransform(prefix_extractor)); } - SetupCacheKeyPrefix(rep, db_session_id, cur_file_num); - std::unique_ptr new_table( - new BlockBasedTable(rep, block_cache_tracer)); - // page cache options - rep->persistent_cache_options = - PersistentCacheOptions(rep->table_options.persistent_cache, - std::string(rep->persistent_cache_key_prefix, - rep->persistent_cache_key_prefix_size), - rep->ioptions.stats); + // For fully portable/stable cache keys, we need to read the properties + // block before setting up cache keys. TODO: consider setting up a bootstrap + // cache key for PersistentCache to use for metaindex and properties blocks. + rep->persistent_cache_options = PersistentCacheOptions(); // Meta-blocks are not dictionary compressed. Explicitly set the dictionary // handle to null, otherwise it may be seen as uninitialized during the below @@ -605,6 +610,8 @@ Status BlockBasedTable::Open( rep->compression_dict_handle = BlockHandle::NullBlockHandle(); // Read metaindex + std::unique_ptr new_table( + new BlockBasedTable(rep, block_cache_tracer)); std::unique_ptr metaindex; std::unique_ptr metaindex_iter; s = new_table->ReadMetaIndexBlock(ro, prefetch_buffer.get(), &metaindex, @@ -620,6 +627,36 @@ Status BlockBasedTable::Open( if (!s.ok()) { return s; } + + // With properties loaded, we can set up portable/stable cache keys if + // necessary info is available + std::string db_session_id = cur_db_session_id; + uint64_t file_num = cur_file_num; + if (rep->table_properties && !rep->table_properties->db_session_id.empty()) { + const auto& uprops = rep->table_properties->user_collected_properties; + auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion); + if (version_iter == uprops.end()) { + // Normal (non-external) SST file - can only use embedded db_session_id + // with current file number (which should be original file number) + if (file_num > 0) { + db_session_id = rep->table_properties->db_session_id; + } + } else { + // External (ingested) SST file - should not use current file number + // (which is changed from original), so that same file ingested into + // different DBs can share block cache entries. Although they can modify + // the embedded global_seqno, that information is not currently cached + // under these portable/stable keys. + // Note: For now, each external SST file gets its own unique session id, + // so we can use a fixed file number under than session id. + // ... except FIXME (peterd): sst_file_writer currently uses wrong + // format for db_session_ids so this approach doesn't work yet. + db_session_id = rep->table_properties->db_session_id; + file_num = 1; + } + } + SetupCacheKeyPrefix(rep, db_session_id, file_num); + s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(), metaindex_iter.get(), internal_comparator, &lookup_context); diff --git a/table/block_based/block_based_table_reader.h b/table/block_based/block_based_table_reader.h index 43b56a68c..e64d09d20 100644 --- a/table/block_based/block_based_table_reader.h +++ b/table/block_based/block_based_table_reader.h @@ -9,6 +9,8 @@ #pragma once +#include + #include "db/range_tombstone_fragmenter.h" #include "file/filename.h" #include "table/block_based/block_based_table_factory.h" @@ -19,7 +21,6 @@ #include "table/table_properties_internal.h" #include "table/table_reader.h" #include "table/two_level_iterator.h" - #include "trace_replay/block_cache_tracer.h" namespace ROCKSDB_NAMESPACE { @@ -100,7 +101,7 @@ class BlockBasedTable : public TableReader { TailPrefetchStats* tail_prefetch_stats = nullptr, BlockCacheTracer* const block_cache_tracer = nullptr, size_t max_file_size_for_l0_meta_pin = 0, - const std::string& db_session_id = "", + const std::string& cur_db_session_id = "", uint64_t cur_file_num = 0); bool PrefixMayMatch(const Slice& internal_key, @@ -555,11 +556,11 @@ struct BlockBasedTable::Rep { Status status; std::unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t cache_key_prefix_size = 0; - char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t persistent_cache_key_prefix_size = 0; + // SIZE_MAX -> assert not used without re-assignment + size_t cache_key_prefix_size = SIZE_MAX; char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; - size_t compressed_cache_key_prefix_size = 0; + // SIZE_MAX -> assert not used without re-assignment + size_t compressed_cache_key_prefix_size = SIZE_MAX; PersistentCacheOptions persistent_cache_options; // Footer contains the fixed table information diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 52e56be81..fff1397c8 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -228,6 +228,8 @@ Status ReadProperties(const ReadOptions& read_options, BlockContents block_contents; Status s; + // FIXME: should be a parameter for reading table properties to use persistent + // cache PersistentCacheOptions cache_options; ReadOptions ro = read_options; ro.verify_checksums = verify_checksum;