Stable cache keys using DB session ids in SSTs (#8659)

Summary:
Use DB session ids in SST table properties to make cache keys
stable across DB re-open and copy / move / restore / etc.

These new cache keys are currently only enabled when FileSystem does not
provide GetUniqueId. For now, they are typically larger, so slightly
less efficient.

Relevant to https://github.com/facebook/rocksdb/issues/7405

This change has a minor regression in PersistentCache functionality:
metaindex blocks are no longer cached in PersistentCache. Table properties
blocks already were not but ideally should be. I didn't spend effort to
fix & test these issues because we don't believe PersistentCache is used much
if at all and expect SecondaryCache to replace it. (Though PRs are welcome.)

FIXME: there is more to be fixed for stable cache keys on external SST files

Pull Request resolved: https://github.com/facebook/rocksdb/pull/8659

Test Plan:
new unit test added, which fails when disabling new
functionality

Reviewed By: zhichao-cao

Differential Revision: D30297705

Pulled By: pdillinger

fbshipit-source-id: e8539a5c8802a79340405629870f2e3fb3822d3a
main
Peter Dillinger 3 years ago committed by Facebook GitHub Bot
parent 5de333fd99
commit a207c27809
  1. 5
      db/db_basic_test.cc
  2. 99
      db/db_block_cache_test.cc
  3. 1
      table/block_based/block_based_table_builder.cc
  4. 75
      table/block_based/block_based_table_reader.cc
  5. 13
      table/block_based/block_based_table_reader.h
  6. 2
      table/meta_blocks.cc

@ -2756,6 +2756,11 @@ class DBBasicTestMultiGet : public DBTestBase {
EXPECT_OK(dbfull()->Flush(FlushOptions(), handles_[cf]));
}
}
// Clear compressed cache, which is always pre-populated
if (compressed_cache_) {
compressed_cache_->SetCapacity(0);
compressed_cache_->SetCapacity(1048576);
}
}
bool CheckValue(int i, const std::string& value) {

@ -7,6 +7,7 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include <cstdlib>
#include <functional>
#include <memory>
#include "cache/cache_entry_roles.h"
@ -14,9 +15,11 @@
#include "db/column_family.h"
#include "db/db_test_util.h"
#include "port/stack_trace.h"
#include "rocksdb/statistics.h"
#include "rocksdb/table.h"
#include "util/compression.h"
#include "util/random.h"
#include "utilities/fault_injection_fs.h"
namespace ROCKSDB_NAMESPACE {
@ -1298,6 +1301,102 @@ TEST_F(DBBlockCacheTest, CacheEntryRoleStats) {
#endif // ROCKSDB_LITE
// Test FileSystem used by the stable cache key tests.
//  * LinkFile is disabled so that a DB can be physically copied using
//    Checkpoint (no hard links).
//  * File GetUniqueId is made to fail, which enables stable cache keys.
class StableCacheKeyTestFS : public FaultInjectionTestFS {
 public:
  explicit StableCacheKeyTestFS(const std::shared_ptr<FileSystem>& base)
      : FaultInjectionTestFS(base) {
    SetFailGetUniqueId(true);
  }

  virtual ~StableCacheKeyTestFS() = default;

  // Always refuses to link; both paths and the IO arguments are ignored.
  IOStatus LinkFile(const std::string& /*src*/, const std::string& /*target*/,
                    const IOOptions& /*options*/,
                    IODebugContext* /*dbg*/) override {
    return IOStatus::NotSupported("Disabled");
  }
};
// Checks that block cache entries inserted by one DB instance are found again
// (a) after the DB is re-opened and (b) when a full physical copy of the DB is
// opened, by asserting that the relevant *_ADD tickers stay at 1 after each
// subsequent read. Runs once with an uncompressed block cache and once with a
// compressed block cache only (skipped if Snappy is unavailable).
TEST_F(DBBlockCacheTest, StableCacheKeys) {
  std::shared_ptr<StableCacheKeyTestFS> test_fs{
      new StableCacheKeyTestFS(env_->GetFileSystem())};
  std::unique_ptr<CompositeEnvWrapper> test_env{
      new CompositeEnvWrapper(env_, test_fs)};

  for (bool compressed : {false, true}) {
    Options options = CurrentOptions();
    options.create_if_missing = true;
    // One statistics object per iteration; it is shared by every open of the
    // DB below, so ticker counts accumulate across re-open and copy.
    options.statistics = ROCKSDB_NAMESPACE::CreateDBStatistics();
    options.env = test_env.get();

    BlockBasedTableOptions table_options;

    // Set per configuration: asserts the expected number of cache insertions.
    std::function<void()> verify_stats;
    if (compressed) {
      if (!Snappy_Supported()) {
        fprintf(stderr, "skipping compressed test, snappy unavailable\n");
        continue;
      }
      options.compression = CompressionType::kSnappyCompression;
      table_options.no_block_cache = true;
      table_options.block_cache_compressed = NewLRUCache(1 << 25, 0, false);
      verify_stats = [&options] {
        ASSERT_EQ(
            1, options.statistics->getTickerCount(BLOCK_CACHE_COMPRESSED_ADD));
      };
    } else {
      table_options.cache_index_and_filter_blocks = true;
      table_options.block_cache = NewLRUCache(1 << 25, 0, false);
      verify_stats = [&options] {
        ASSERT_EQ(1, options.statistics->getTickerCount(BLOCK_CACHE_DATA_ADD));
        ASSERT_EQ(1, options.statistics->getTickerCount(BLOCK_CACHE_INDEX_ADD));
        ASSERT_EQ(1,
                  options.statistics->getTickerCount(BLOCK_CACHE_FILTER_ADD));
      };
    }

    table_options.filter_policy.reset(NewBloomFilterPolicy(10, false));
    options.table_factory.reset(NewBlockBasedTableFactory(table_options));
    DestroyAndReopen(options);

    ASSERT_OK(Put("key1", "abc"));
    std::string something_compressible(500U, 'x');
    ASSERT_OK(Put("key2", something_compressible));
    ASSERT_OK(Flush());

    ASSERT_EQ(Get("key1"), std::string("abc"));
    verify_stats();

    // Make sure we can cache hit after re-open
    Reopen(options);

    ASSERT_EQ(Get("key1"), std::string("abc"));
    verify_stats();

    // Make sure we can cache hit even on a full copy of the DB. Using
    // StableCacheKeyTestFS, Checkpoint will resort to full copy not hard link.
    // (Checkpoint not available in LITE mode to test this.)
#ifndef ROCKSDB_LITE
    auto db_copy_name = dbname_ + "-copy";
    {
      Checkpoint* checkpoint_raw = nullptr;
      ASSERT_OK(Checkpoint::Create(db_, &checkpoint_raw));
      // Take ownership right away so the Checkpoint is released even if the
      // assertion below fails and returns early from the test body. The scope
      // ends before Close(), matching the original point of destruction.
      std::unique_ptr<Checkpoint> checkpoint(checkpoint_raw);
      ASSERT_OK(checkpoint->CreateCheckpoint(db_copy_name));
    }

    Close();
    Destroy(options);
    // Switch the fixture over to the copied DB for the final verification.
    dbname_ = db_copy_name;
    Reopen(options);

    ASSERT_EQ(Get("key1"), std::string("abc"));
    verify_stats();
#endif  // !ROCKSDB_LITE

    Close();
  }
}
class DBBlockCachePinningTest
: public DBTestBase,
public testing::WithParamInterface<

@ -1398,6 +1398,7 @@ void DeleteEntryCached(const Slice& /*key*/, void* value) {
// Helper function to setup the cache key's prefix for the Table.
void BlockBasedTableBuilder::SetupCacheKeyPrefix(
const TableBuilderOptions& tbo) {
// FIXME: Unify with BlockBasedTable::SetupCacheKeyPrefix
if (rep_->table_options.block_cache.get() != nullptr) {
BlockBasedTable::GenerateCachePrefix<Cache, FSWritableFile>(
rep_->table_options.block_cache.get(), rep_->file->writable_file(),

@ -57,6 +57,7 @@
#include "table/meta_blocks.h"
#include "table/multiget_context.h"
#include "table/persistent_cache_helper.h"
#include "table/persistent_cache_options.h"
#include "table/sst_file_writer_collectors.h"
#include "table/two_level_iterator.h"
#include "test_util/sync_point.h"
@ -371,7 +372,7 @@ Cache::Handle* BlockBasedTable::GetEntryFromCache(
// Helper function to setup the cache key's prefix for the Table.
void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep,
const std::string& db_session_id,
uint64_t cur_file_num) {
uint64_t file_num) {
assert(kMaxCacheKeyPrefixSize >= 10);
rep->cache_key_prefix_size = 0;
rep->compressed_cache_key_prefix_size = 0;
@ -379,19 +380,28 @@ void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep,
GenerateCachePrefix<Cache, FSRandomAccessFile>(
rep->table_options.block_cache.get(), rep->file->file(),
&rep->cache_key_prefix[0], &rep->cache_key_prefix_size, db_session_id,
cur_file_num);
}
if (rep->table_options.persistent_cache != nullptr) {
GenerateCachePrefix<PersistentCache, FSRandomAccessFile>(
rep->table_options.persistent_cache.get(), rep->file->file(),
&rep->persistent_cache_key_prefix[0],
&rep->persistent_cache_key_prefix_size, "", cur_file_num);
file_num);
}
if (rep->table_options.block_cache_compressed != nullptr) {
GenerateCachePrefix<Cache, FSRandomAccessFile>(
rep->table_options.block_cache_compressed.get(), rep->file->file(),
&rep->compressed_cache_key_prefix[0],
&rep->compressed_cache_key_prefix_size, "", cur_file_num);
&rep->compressed_cache_key_prefix_size, db_session_id, file_num);
}
if (rep->table_options.persistent_cache != nullptr) {
char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
size_t persistent_cache_key_prefix_size = 0;
GenerateCachePrefix<PersistentCache, FSRandomAccessFile>(
rep->table_options.persistent_cache.get(), rep->file->file(),
&persistent_cache_key_prefix[0], &persistent_cache_key_prefix_size,
db_session_id, file_num);
rep->persistent_cache_options =
PersistentCacheOptions(rep->table_options.persistent_cache,
std::string(persistent_cache_key_prefix,
persistent_cache_key_prefix_size),
rep->ioptions.stats);
}
}
@ -513,7 +523,7 @@ Status BlockBasedTable::Open(
const SequenceNumber largest_seqno, const bool force_direct_prefetch,
TailPrefetchStats* tail_prefetch_stats,
BlockCacheTracer* const block_cache_tracer,
size_t max_file_size_for_l0_meta_pin, const std::string& db_session_id,
size_t max_file_size_for_l0_meta_pin, const std::string& cur_db_session_id,
uint64_t cur_file_num) {
table_reader->reset();
@ -588,16 +598,11 @@ Status BlockBasedTable::Open(
rep->internal_prefix_transform.reset(
new InternalKeySliceTransform(prefix_extractor));
}
SetupCacheKeyPrefix(rep, db_session_id, cur_file_num);
std::unique_ptr<BlockBasedTable> new_table(
new BlockBasedTable(rep, block_cache_tracer));
// page cache options
rep->persistent_cache_options =
PersistentCacheOptions(rep->table_options.persistent_cache,
std::string(rep->persistent_cache_key_prefix,
rep->persistent_cache_key_prefix_size),
rep->ioptions.stats);
// For fully portable/stable cache keys, we need to read the properties
// block before setting up cache keys. TODO: consider setting up a bootstrap
// cache key for PersistentCache to use for metaindex and properties blocks.
rep->persistent_cache_options = PersistentCacheOptions();
// Meta-blocks are not dictionary compressed. Explicitly set the dictionary
// handle to null, otherwise it may be seen as uninitialized during the below
@ -605,6 +610,8 @@ Status BlockBasedTable::Open(
rep->compression_dict_handle = BlockHandle::NullBlockHandle();
// Read metaindex
std::unique_ptr<BlockBasedTable> new_table(
new BlockBasedTable(rep, block_cache_tracer));
std::unique_ptr<Block> metaindex;
std::unique_ptr<InternalIterator> metaindex_iter;
s = new_table->ReadMetaIndexBlock(ro, prefetch_buffer.get(), &metaindex,
@ -620,6 +627,36 @@ Status BlockBasedTable::Open(
if (!s.ok()) {
return s;
}
// With properties loaded, we can set up portable/stable cache keys if
// necessary info is available
std::string db_session_id = cur_db_session_id;
uint64_t file_num = cur_file_num;
if (rep->table_properties && !rep->table_properties->db_session_id.empty()) {
const auto& uprops = rep->table_properties->user_collected_properties;
auto version_iter = uprops.find(ExternalSstFilePropertyNames::kVersion);
if (version_iter == uprops.end()) {
// Normal (non-external) SST file - can only use embedded db_session_id
// with current file number (which should be original file number)
if (file_num > 0) {
db_session_id = rep->table_properties->db_session_id;
}
} else {
// External (ingested) SST file - should not use current file number
// (which is changed from original), so that same file ingested into
// different DBs can share block cache entries. Although they can modify
// the embedded global_seqno, that information is not currently cached
// under these portable/stable keys.
// Note: For now, each external SST file gets its own unique session id,
// so we can use a fixed file number under that session id.
// ... except FIXME (peterd): sst_file_writer currently uses wrong
// format for db_session_ids so this approach doesn't work yet.
db_session_id = rep->table_properties->db_session_id;
file_num = 1;
}
}
SetupCacheKeyPrefix(rep, db_session_id, file_num);
s = new_table->ReadRangeDelBlock(ro, prefetch_buffer.get(),
metaindex_iter.get(), internal_comparator,
&lookup_context);

@ -9,6 +9,8 @@
#pragma once
#include <cstdint>
#include "db/range_tombstone_fragmenter.h"
#include "file/filename.h"
#include "table/block_based/block_based_table_factory.h"
@ -19,7 +21,6 @@
#include "table/table_properties_internal.h"
#include "table/table_reader.h"
#include "table/two_level_iterator.h"
#include "trace_replay/block_cache_tracer.h"
namespace ROCKSDB_NAMESPACE {
@ -100,7 +101,7 @@ class BlockBasedTable : public TableReader {
TailPrefetchStats* tail_prefetch_stats = nullptr,
BlockCacheTracer* const block_cache_tracer = nullptr,
size_t max_file_size_for_l0_meta_pin = 0,
const std::string& db_session_id = "",
const std::string& cur_db_session_id = "",
uint64_t cur_file_num = 0);
bool PrefixMayMatch(const Slice& internal_key,
@ -555,11 +556,11 @@ struct BlockBasedTable::Rep {
Status status;
std::unique_ptr<RandomAccessFileReader> file;
char cache_key_prefix[kMaxCacheKeyPrefixSize];
size_t cache_key_prefix_size = 0;
char persistent_cache_key_prefix[kMaxCacheKeyPrefixSize];
size_t persistent_cache_key_prefix_size = 0;
// SIZE_MAX -> assert not used without re-assignment
size_t cache_key_prefix_size = SIZE_MAX;
char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize];
size_t compressed_cache_key_prefix_size = 0;
// SIZE_MAX -> assert not used without re-assignment
size_t compressed_cache_key_prefix_size = SIZE_MAX;
PersistentCacheOptions persistent_cache_options;
// Footer contains the fixed table information

@ -228,6 +228,8 @@ Status ReadProperties(const ReadOptions& read_options,
BlockContents block_contents;
Status s;
// FIXME: should be a parameter for reading table properties to use persistent
// cache
PersistentCacheOptions cache_options;
ReadOptions ro = read_options;
ro.verify_checksums = verify_checksum;

Loading…
Cancel
Save