diff --git a/CMakeLists.txt b/CMakeLists.txt index 573511088..b49a88be3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -265,6 +265,7 @@ set(SOURCES utilities/merge_operators/max.cc utilities/merge_operators/uint64add.cc utilities/options/options_util.cc + utilities/persistent_cache/persistent_cache_tier.cc utilities/redis/redis_lists.cc utilities/spatialdb/spatial_db.cc utilities/table_properties_collectors/compact_on_deletion_collector.cc diff --git a/src.mk b/src.mk index c81983b42..f92cea6fd 100644 --- a/src.mk +++ b/src.mk @@ -131,6 +131,7 @@ LIB_SOURCES = \ utilities/merge_operators/string_append/stringappend.cc \ utilities/merge_operators/uint64add.cc \ utilities/options/options_util.cc \ + utilities/persistent_cache/persistent_cache_tier.cc \ utilities/redis/redis_lists.cc \ utilities/simulator_cache/sim_cache.cc \ utilities/spatialdb/spatial_db.cc \ diff --git a/utilities/persistent_cache/persistent_cache_tier.cc b/utilities/persistent_cache/persistent_cache_tier.cc new file mode 100644 index 000000000..217bcbea7 --- /dev/null +++ b/utilities/persistent_cache/persistent_cache_tier.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#ifndef ROCKSDB_LITE + +#include "utilities/persistent_cache/persistent_cache_tier.h" + +#include + +namespace rocksdb { + +// +// PersistentCacheTier implementation +// +Status PersistentCacheTier::Open() { + if (next_tier_) { + return next_tier_->Open(); + } + return Status::OK(); +} + +Status PersistentCacheTier::Close() { + if (next_tier_) { + return next_tier_->Close(); + } + return Status::OK(); +} + +void PersistentCacheTier::Flush() { + if (next_tier_) { + next_tier_->Flush(); + } +} + +bool PersistentCacheTier::Reserve(const size_t size) { + // default implementation is a pass through + return true; +} + +bool PersistentCacheTier::Erase(const Slice& key) { + // default implementation is a pass through since not all cache tiers might + // support erase + return true; +} + +std::string PersistentCacheTier::PrintStats() { + if (next_tier_) { + return next_tier_->PrintStats(); + } + return std::string(); +} + +// +// PersistentTieredCache implementation +// +PersistentTieredCache::~PersistentTieredCache() { assert(tiers_.empty()); } + +Status PersistentTieredCache::Open() { + assert(!tiers_.empty()); + return tiers_.front()->Open(); +} + +Status PersistentTieredCache::Close() { + assert(!tiers_.empty()); + Status status = tiers_.front()->Close(); + if (status.ok()) { + tiers_.clear(); + } + return status; +} + +void PersistentTieredCache::Flush() { + assert(!tiers_.empty()); + tiers_.front()->Flush(); +} + +bool PersistentTieredCache::Erase(const Slice& key) { + assert(!tiers_.empty()); + return tiers_.front()->Erase(key); +} + +std::string PersistentTieredCache::PrintStats() { + assert(!tiers_.empty()); + return tiers_.front()->PrintStats(); +} + +Status PersistentTieredCache::Insert(const Slice& page_key, const char* data, + const size_t size) { + assert(!tiers_.empty()); + return tiers_.front()->Insert(page_key, data, size); +} + +Status PersistentTieredCache::Lookup(const Slice& page_key, + std::unique_ptr* data, + size_t* size) { + assert(!tiers_.empty()); + return tiers_.front()->Lookup(page_key, data, size); +} + +void PersistentTieredCache::AddTier(const Tier& tier) { + if (!tiers_.empty()) { + tiers_.back()->set_next_tier(tier); + } + tiers_.push_back(tier); +} + +} // namespace rocksdb + +#endif diff --git a/utilities/persistent_cache/persistent_cache_tier.h b/utilities/persistent_cache/persistent_cache_tier.h new file mode 100644 index 000000000..7e1afce43 --- /dev/null +++ b/utilities/persistent_cache/persistent_cache_tier.h @@ -0,0 +1,306 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include + +#include "rocksdb/env.h" +#include "rocksdb/persistent_cache.h" +#include "rocksdb/status.h" +#include "util/histogram.h" + +// Persistent Cache +// +// Persistent cache is tiered key-value cache that can use persistent medium. It +// is a generic design and can leverage any storage medium -- disk/SSD/NVM/RAM. +// The code has been kept generic but significant benchmark/design/development +// time has been spent to make sure the cache performs appropriately for +// respective storage medium. +// The file defines +// PersistentCacheOptions : Options for persistent cache +// PersistentCacheTier : Implementation that handles individual cache tier +// PersistentTieresCache : Implementation that handles all tiers as a logical +// unit +// +// PersistentTieredCache architecture: +// +--------------------------+ PersistentCacheTier that handles multiple tiers +// | +----------------+ | +// | | RAM | PersistentCacheTier that handles RAM (VolatileCacheImpl) +// | +----------------+ | +// | | next | +// | v | +// | +----------------+ | +// | | NVM | PersistentCacheTier implementation that handles NVM +// | +----------------+ (BlockCacheImpl) +// | | next | +// | V | +// | +----------------+ | +// | | LE-SSD | PersistentCacheTier implementation that handles LE-SSD +// | +----------------+ (BlockCacheImpl) +// | | | +// | V | +// | null | +// +--------------------------+ +// | +// V +// null +namespace rocksdb { + +// Persistent Cache Options +// +// This struct captures all the options that are used to configure persistent +// cache. Some of the terminologies used in naming the options are +// +// dispatch size : +// This is the size in which IO is dispatched to the device +// +// write buffer size : +// This is the size of an individual write buffer size. Write buffers are +// grouped to form buffered file. +// +// cache size : +// This is the logical maximum for the cache size +// +// qdepth : +// This is the max number of IOs that can issues to the device in parallel +// +// pepeling : +// The writer code path follows pipelined architecture, which means the +// operations are handed off from one stage to another +// +// pipelining backlog size : +// With the pipelined architecture, there can always be backlogging of ops in +// pipeline queues. This is the maximum backlog size after which ops are dropped +// from queue +struct PersistentCacheOptions { + explicit PersistentCacheOptions(Env* const _env, const std::string& _path, + const uint64_t _cache_size, + const std::shared_ptr& _log, + const uint32_t _write_buffer_size = 1 * 1024 * + 1024) { + env = _env; + path = _path; + log = _log; + cache_size = _cache_size; + writer_dispatch_size = write_buffer_size = _write_buffer_size; + } + + // + // Validate the settings. Our intentions are to catch erroneous settings ahead + // of time instead going violating invariants or causing dead locks. + // + Status ValidateSettings() const { + // (1) check pre-conditions for variables + if (!env || path.empty()) { + return Status::InvalidArgument("empty or null args"); + } + + // (2) assert size related invariants + // - cache size cannot be less than cache file size + // - individual write buffer size cannot be greater than cache file size + // - total write buffer size cannot be less than 2X cache file size + if (cache_size < cache_file_size || write_buffer_size >= cache_file_size || + write_buffer_size * write_buffer_count() < 2 * cache_file_size) { + return Status::InvalidArgument("invalid cache size"); + } + + // (2) check writer settings + // - Queue depth cannot be 0 + // - writer_dispatch_size cannot be greater than writer_buffer_size + // - dispatch size and buffer size need to be aligned + if (!writer_qdepth || writer_dispatch_size > write_buffer_size || + write_buffer_size % writer_dispatch_size) { + return Status::InvalidArgument("invalid writer settings"); + } + + return Status::OK(); + } + + // + // Env abstraction to use for systmer level operations + // + Env* env; + + // + // Path for the block cache where blocks are persisted + // + std::string path; + + // + // Log handle for logging messages + // + std::shared_ptr log; + + // + // Logical cache size + // + uint64_t cache_size = std::numeric_limits::max(); + + // cache-file-size + // + // Cache consists of multiples of small files. This parameter defines the + // size of an individual cache file + // + // default: 1M + uint32_t cache_file_size = 100ULL * 1024 * 1024; + + // writer-qdepth + // + // The writers can issues IO to the devices in parallel. This parameter + // controls the max number if IOs that can issues in parallel to the block + // device + // + // default :1 + uint32_t writer_qdepth = 1; + + // pipeline-writes + // + // The write optionally follow pipelined architecture. This helps + // avoid regression in the eviction code path of the primary tier. This + // parameter defines if pipelining is enabled or disabled + // + // default: true + bool pipeline_writes_ = true; + + // max-write-pipeline-backlog-size + // + // Max pipeline buffer size. This is the maximum backlog we can accumulate + // while waiting for writes. After the limit, new ops will be dropped. + // + // Default: 1GiB + uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024; + + // write-buffer-size + // + // This is the size in which buffer slabs are allocated. + // + // Default: 1M + uint32_t write_buffer_size = 1ULL * 1024 * 1024; + + // write-buffer-count + // + // This is the total number of buffer slabs. This is calculated as a factor of + // file size in order to avoid dead lock. + size_t write_buffer_count() const { + assert(write_buffer_size); + return (writer_qdepth + 1.2) * cache_file_size / write_buffer_size; + } + + // writer-dispatch-size + // + // The writer thread will dispatch the IO at the specified IO size + // + // default: 1M + uint64_t writer_dispatch_size = 1ULL * 1024 * 1024; + + PersistentCacheOptions MakePersistentCacheOptions( + const std::string& path, const uint64_t size, + const std::shared_ptr& log); +}; + +// Persistent Cache Tier +// +// This a logical abstraction that defines a tier of the persistent cache. Tiers +// can be stacked over one another. PersistentCahe provides the basic definition +// for accessing/storing in the cache. PersistentCacheTier extends the interface +// to enable management and stacking of tiers. +class PersistentCacheTier : public PersistentCache { + public: + typedef std::shared_ptr Tier; + + struct TierStats { + std::vector histograms_; + std::map counters_; + }; + + virtual ~PersistentCacheTier() {} + + // Open the persistent cache tier + virtual Status Open(); + + // Close the persistent cache tier + virtual Status Close(); + + // Flush the pending writes + virtual void Flush(); + + // Reserve space up to 'size' bytes + virtual bool Reserve(const size_t size); + + // Erase a key from the cache + virtual bool Erase(const Slice& key); + + // Print stats to string recursively + virtual std::string PrintStats(); + + // Expose stats + virtual std::vector Stats() = 0; + + // Insert to page cache + virtual Status Insert(const Slice& page_key, const char* data, + const size_t size) = 0; + + // Lookup page cache by page identifier + virtual Status Lookup(const Slice& page_key, std::unique_ptr* data, + size_t* size) = 0; + + // Return a reference to next tier + virtual Tier& next_tier() { return next_tier_; } + + // Set the value for next tier + virtual void set_next_tier(const Tier& tier) { + assert(!next_tier_); + next_tier_ = tier; + } + + private: + Tier next_tier_; // next tier +}; + +// PersistentTieredCache +// +// Abstraction that helps you construct a tiers of persistent caches as a +// unified cache. The tier(s) of cache will act a single tier for management +// ease and support PersistentCache methods for accessing data. +class PersistentTieredCache : public PersistentCacheTier { + public: + virtual ~PersistentTieredCache(); + + Status Open() override; + Status Close() override; + void Flush() override; + bool Erase(const Slice& key) override; + std::string PrintStats() override; + Status Insert(const Slice& page_key, const char* data, + const size_t size) override; + Status Lookup(const Slice& page_key, std::unique_ptr* data, + size_t* size) override; + + void AddTier(const Tier& tier); + + Tier& next_tier() override { + auto it = tiers_.end(); + return (*it)->next_tier(); + } + + void set_next_tier(const Tier& tier) override { + auto it = tiers_.end(); + (*it)->set_next_tier(tier); + } + + protected: + std::list tiers_; // list of tiers top-down +}; + +} // namespace rocksdb + +#endif