From cba398df8ad4e7ce189e2fe8850027d934f3b572 Mon Sep 17 00:00:00 2001 From: Gang Liao Date: Tue, 14 Jun 2022 14:19:26 -0700 Subject: [PATCH] Add blob cache option in the column family options (#10155) Summary: There is currently no caching mechanism for blobs, which is not ideal especially when the database resides on remote storage (where we cannot rely on the OS page cache). As part of this task, we would like to make it possible for the application to configure a blob cache. This PR is a part of https://github.com/facebook/rocksdb/issues/10156 Pull Request resolved: https://github.com/facebook/rocksdb/pull/10155 Reviewed By: ltamasi Differential Revision: D37150819 Pulled By: gangliao fbshipit-source-id: b807c7916ea5d411588128f8e22a49f171388fe2 --- db/c.cc | 5 +++ db/db_options_test.cc | 1 + include/rocksdb/advanced_options.h | 10 +++++- include/rocksdb/c.h | 3 ++ include/rocksdb/file_system.h | 2 +- include/rocksdb/options.h | 1 - options/cf_options.cc | 13 +++++++- options/cf_options.h | 2 ++ options/options.cc | 9 +++++- options/options_helper.cc | 1 + options/options_settable_test.cc | 9 ++++-- options/options_test.cc | 32 +++++++++++++++++++ table/block_based/block_based_table_reader.cc | 2 +- 13 files changed, 82 insertions(+), 8 deletions(-) diff --git a/db/c.cc b/db/c.cc index d745d5aee..d1d0aafe1 100644 --- a/db/c.cc +++ b/db/c.cc @@ -3048,6 +3048,11 @@ int rocksdb_options_get_blob_file_starting_level(rocksdb_options_t* opt) { return opt->rep.blob_file_starting_level; } +void rocksdb_options_set_blob_cache(rocksdb_options_t* opt, + rocksdb_cache_t* blob_cache) { + opt->rep.blob_cache = blob_cache->rep; +} + void rocksdb_options_set_num_levels(rocksdb_options_t* opt, int n) { opt->rep.num_levels = n; } diff --git a/db/db_options_test.cc b/db/db_options_test.cc index 46aa25211..229ad904e 100644 --- a/db/db_options_test.cc +++ b/db/db_options_test.cc @@ -220,6 +220,7 @@ TEST_F(DBOptionsTest, SetMutableTableOptions) { ColumnFamilyHandle* cfh = dbfull()->DefaultColumnFamily(); Options c_opts = dbfull()->GetOptions(cfh); + const auto* c_bbto = c_opts.table_factory->GetOptions(); ASSERT_NE(c_bbto, nullptr); diff --git a/include/rocksdb/advanced_options.h b/include/rocksdb/advanced_options.h index e3f4ccee7..9cdd947be 100644 --- a/include/rocksdb/advanced_options.h +++ b/include/rocksdb/advanced_options.h @@ -10,6 +10,7 @@ #include +#include "rocksdb/cache.h" #include "rocksdb/compression_type.h" #include "rocksdb/memtablerep.h" #include "rocksdb/universal_compaction.h" @@ -227,7 +228,7 @@ enum class Temperature : uint8_t { }; // The control option of how the cache tiers will be used. Currently rocksdb -// support block cahe (volatile tier), secondary cache (non-volatile tier). +// support block cache (volatile tier), secondary cache (non-volatile tier). // In the future, we may add more caching layers. enum class CacheTier : uint8_t { kVolatileTier = 0, @@ -953,6 +954,13 @@ struct AdvancedColumnFamilyOptions { // Dynamically changeable through the SetOptions() API int blob_file_starting_level = 0; + // This feature is WORK IN PROGRESS + // If non-NULL use the specified cache for blobs. + // If NULL, rocksdb will not use a blob cache. + // + // Default: nullptr (disabled) + std::shared_ptr blob_cache = nullptr; + // Create ColumnFamilyOptions with default values for all fields AdvancedColumnFamilyOptions(); // Create ColumnFamilyOptions from Options diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 556333551..93737c4bd 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -1264,6 +1264,9 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_file_starting_level( extern ROCKSDB_LIBRARY_API int rocksdb_options_get_blob_file_starting_level( rocksdb_options_t* opt); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_blob_cache( + rocksdb_options_t* opt, rocksdb_cache_t* blob_cache); + /* returns a pointer to a malloc()-ed, null terminated string */ extern ROCKSDB_LIBRARY_API char* rocksdb_options_statistics_get_string( rocksdb_options_t* opt); diff --git a/include/rocksdb/file_system.h b/include/rocksdb/file_system.h index 7bc19976b..19f4a402a 100644 --- a/include/rocksdb/file_system.h +++ b/include/rocksdb/file_system.h @@ -762,7 +762,7 @@ struct FSReadRequest { // returns fewer bytes if end of file is hit (or `status` is not OK). size_t len; - // A buffer that MultiRead() can optionally place data in. It can + // A buffer that MultiRead() can optionally place data in. It can // ignore this and allocate its own buffer. // The lifecycle of scratch will be until IO is completed. // diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index cc175dccc..19bc3cb19 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -1412,7 +1412,6 @@ struct Options : public DBOptions, public ColumnFamilyOptions { Options* DisableExtraChecks(); }; -// // An application can issue a read request (via Get/Iterators) and specify // if that read should process data that ALREADY resides on a specified cache // level. For example, if an application specifies kBlockCacheTier then the diff --git a/options/cf_options.cc b/options/cf_options.cc index 2da55a6fe..1e9865265 100644 --- a/options/cf_options.cc +++ b/options/cf_options.cc @@ -732,6 +732,16 @@ static std::unordered_map OptionTypeInfo::AsCustomSharedPtr( offsetof(struct ImmutableCFOptions, sst_partitioner_factory), OptionVerificationType::kByName, OptionTypeFlags::kAllowNull)}, + {"blob_cache", + {offsetof(struct ImmutableCFOptions, blob_cache), OptionType::kUnknown, + OptionVerificationType::kNormal, + (OptionTypeFlags::kCompareNever | OptionTypeFlags::kDontSerialize), + // Parses the input value as a Cache + [](const ConfigOptions& opts, const std::string&, + const std::string& value, void* addr) { + auto* cache = static_cast*>(addr); + return Cache::CreateFromString(opts, value, cache); + }}}, }; const std::string OptionsHelper::kCFOptionsName = "ColumnFamilyOptions"; @@ -870,7 +880,8 @@ ImmutableCFOptions::ImmutableCFOptions(const ColumnFamilyOptions& cf_options) cf_options.memtable_insert_with_hint_prefix_extractor), cf_paths(cf_options.cf_paths), compaction_thread_limiter(cf_options.compaction_thread_limiter), - sst_partitioner_factory(cf_options.sst_partitioner_factory) {} + sst_partitioner_factory(cf_options.sst_partitioner_factory), + blob_cache(cf_options.blob_cache) {} ImmutableOptions::ImmutableOptions() : ImmutableOptions(Options()) {} diff --git a/options/cf_options.h b/options/cf_options.h index c6bfe8f78..bfdc2e102 100644 --- a/options/cf_options.h +++ b/options/cf_options.h @@ -78,6 +78,8 @@ struct ImmutableCFOptions { std::shared_ptr compaction_thread_limiter; std::shared_ptr sst_partitioner_factory; + + std::shared_ptr blob_cache; }; struct ImmutableOptions : public ImmutableDBOptions, public ImmutableCFOptions { diff --git a/options/options.cc b/options/options.cc index 8424549b7..bba166be4 100644 --- a/options/options.cc +++ b/options/options.cc @@ -101,7 +101,8 @@ AdvancedColumnFamilyOptions::AdvancedColumnFamilyOptions(const Options& options) blob_garbage_collection_force_threshold( options.blob_garbage_collection_force_threshold), blob_compaction_readahead_size(options.blob_compaction_readahead_size), - blob_file_starting_level(options.blob_file_starting_level) { + blob_file_starting_level(options.blob_file_starting_level), + blob_cache(options.blob_cache) { assert(memtable_factory.get() != nullptr); if (max_bytes_for_level_multiplier_additional.size() < static_cast(num_levels)) { @@ -417,6 +418,12 @@ void ColumnFamilyOptions::Dump(Logger* log) const { blob_compaction_readahead_size); ROCKS_LOG_HEADER(log, " Options.blob_file_starting_level: %d", blob_file_starting_level); + if (blob_cache) { + ROCKS_LOG_HEADER(log, " Options.blob_cache: %s", + blob_cache->Name()); + ROCKS_LOG_HEADER(log, " blob_cache options: %s", + blob_cache->GetPrintableOptions().c_str()); + } } // ColumnFamilyOptions::Dump void Options::Dump(Logger* log) const { diff --git a/options/options_helper.cc b/options/options_helper.cc index 65eb708c1..6af73c840 100644 --- a/options/options_helper.cc +++ b/options/options_helper.cc @@ -303,6 +303,7 @@ void UpdateColumnFamilyOptions(const ImmutableCFOptions& ioptions, cf_opts->cf_paths = ioptions.cf_paths; cf_opts->compaction_thread_limiter = ioptions.compaction_thread_limiter; cf_opts->sst_partitioner_factory = ioptions.sst_partitioner_factory; + cf_opts->blob_cache = ioptions.blob_cache; // TODO(yhchiang): find some way to handle the following derived options // * max_file_size diff --git a/options/options_settable_test.cc b/options/options_settable_test.cc index 944248da2..42a6fd577 100644 --- a/options/options_settable_test.cc +++ b/options/options_settable_test.cc @@ -377,7 +377,7 @@ TEST_F(OptionsSettableTest, DBOptionsAllFieldsSettable) { // test is not updated accordingly. // After adding an option, we need to make sure it is settable by // GetColumnFamilyOptionsFromString() and add the option to the input -// string passed to GetColumnFamilyOptionsFromString()in this test. +// string passed to GetColumnFamilyOptionsFromString() in this test. // If it is a complicated type, you also need to add the field to // kColumnFamilyOptionsExcluded, and maybe add customized verification // for it. @@ -400,6 +400,8 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { {offsetof(struct ColumnFamilyOptions, table_properties_collector_factories), sizeof(ColumnFamilyOptions::TablePropertiesCollectorFactories)}, + {offsetof(struct ColumnFamilyOptions, blob_cache), + sizeof(std::shared_ptr)}, {offsetof(struct ColumnFamilyOptions, comparator), sizeof(Comparator*)}, {offsetof(struct ColumnFamilyOptions, merge_operator), sizeof(std::shared_ptr)}, @@ -523,9 +525,12 @@ TEST_F(OptionsSettableTest, ColumnFamilyOptionsAllFieldsSettable) { "blob_file_starting_level=1;" "bottommost_temperature=kWarm;" "compaction_options_fifo={max_table_files_size=3;allow_" - "compaction=false;age_for_warm=1;};", + "compaction=false;age_for_warm=1;};" + "blob_cache=1M;", new_options)); + ASSERT_NE(new_options->blob_cache.get(), nullptr); + ASSERT_EQ(unset_bytes_base, NumUnsetBytes(new_options_ptr, sizeof(ColumnFamilyOptions), kColumnFamilyOptionsExcluded)); diff --git a/options/options_test.cc b/options/options_test.cc index 1992e39a5..7c688f290 100644 --- a/options/options_test.cc +++ b/options/options_test.cc @@ -601,6 +601,22 @@ TEST_F(OptionsTest, GetColumnFamilyOptionsFromStringTest) { ASSERT_TRUE(new_cf_opt.memtable_factory != nullptr); ASSERT_EQ(std::string(new_cf_opt.memtable_factory->Name()), "SkipListFactory"); ASSERT_TRUE(new_cf_opt.memtable_factory->IsInstanceOf("SkipListFactory")); + + // blob cache + ASSERT_OK(GetColumnFamilyOptionsFromString( + config_options, base_cf_opt, + "blob_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};", + &new_cf_opt)); + ASSERT_NE(new_cf_opt.blob_cache, nullptr); + ASSERT_EQ(new_cf_opt.blob_cache->GetCapacity(), 1024UL * 1024UL); + ASSERT_EQ(static_cast(new_cf_opt.blob_cache.get()) + ->GetNumShardBits(), + 4); + ASSERT_EQ(new_cf_opt.blob_cache->HasStrictCapacityLimit(), true); + ASSERT_EQ(static_cast(new_cf_opt.blob_cache.get()) + ->GetHighPriPoolRatio(), + 0.5); } TEST_F(OptionsTest, CompressionOptionsFromString) { @@ -2767,6 +2783,22 @@ TEST_F(OptionsOldApiTest, GetColumnFamilyOptionsFromStringTest) { &new_cf_opt)); ASSERT_TRUE(new_cf_opt.memtable_factory != nullptr); ASSERT_TRUE(new_cf_opt.memtable_factory->IsInstanceOf("SkipListFactory")); + + // blob cache + ASSERT_OK(GetColumnFamilyOptionsFromString( + base_cf_opt, + "blob_cache={capacity=1M;num_shard_bits=4;" + "strict_capacity_limit=true;high_pri_pool_ratio=0.5;};", + &new_cf_opt)); + ASSERT_NE(new_cf_opt.blob_cache, nullptr); + ASSERT_EQ(new_cf_opt.blob_cache->GetCapacity(), 1024UL * 1024UL); + ASSERT_EQ(static_cast(new_cf_opt.blob_cache.get()) + ->GetNumShardBits(), + 4); + ASSERT_EQ(new_cf_opt.blob_cache->HasStrictCapacityLimit(), true); + ASSERT_EQ(static_cast(new_cf_opt.blob_cache.get()) + ->GetHighPriPoolRatio(), + 0.5); } TEST_F(OptionsTest, SliceTransformCreateFromString) { diff --git a/table/block_based/block_based_table_reader.cc b/table/block_based/block_based_table_reader.cc index e119d1bc7..31b75cf5e 100644 --- a/table/block_based/block_based_table_reader.cc +++ b/table/block_based/block_based_table_reader.cc @@ -563,7 +563,7 @@ void BlockBasedTable::SetupBaseCacheKey(const TableProperties* properties, // assert(!db_id.empty()); // Minimum block size is 5 bytes; therefore we can trim off two lower bits - // from offets. See GetCacheKey. + // from offsets. See GetCacheKey. *out_base_cache_key = OffsetableCacheKey(db_id, db_session_id, file_num, /*max_offset*/ file_size >> 2); }