rocksdb/utilities/persistent_cache/persistent_cache_tier.h


// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#ifndef ROCKSDB_LITE
#include <limits>
#include <list>
#include <map>
#include <string>
#include <vector>
#include "rocksdb/env.h"
#include "rocksdb/persistent_cache.h"
#include "rocksdb/status.h"
#include "util/histogram.h"
// Persistent Cache
//
// Persistent cache is a tiered key-value cache that can use a persistent
// medium. It is a generic design and can leverage any storage medium --
// disk/SSD/NVM/RAM. The code has been kept generic, but significant
// benchmark/design/development time has been spent to make sure the cache
// performs appropriately for the respective storage medium.
// The file defines
// PersistentCacheOptions : Options for persistent cache
// PersistentCacheTier : Implementation that handles an individual cache tier
// PersistentTieredCache : Implementation that handles all tiers as a logical
// unit
//
// PersistentTieredCache architecture:
// +--------------------------+ PersistentCacheTier that handles multiple tiers
// | +----------------+ |
// | | RAM | PersistentCacheTier that handles RAM (VolatileCacheImpl)
// | +----------------+ |
// | | next |
// | v |
// | +----------------+ |
// | | NVM | PersistentCacheTier implementation that handles NVM
// | +----------------+ (BlockCacheImpl)
// | | next |
// | V |
// | +----------------+ |
// | | LE-SSD | PersistentCacheTier implementation that handles LE-SSD
// | +----------------+ (BlockCacheImpl)
// | | |
// | V |
// | null |
// +--------------------------+
// |
// V
// null
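//
// Illustrative sketch (not part of the original header): tiers are chained
// through set_next_tier()/next_tier(), as in the diagram above. "ram_tier"
// and "nvm_tier" are hypothetical placeholders for concrete
// PersistentCacheTier implementations.
//
//   std::shared_ptr<rocksdb::PersistentCacheTier> ram_tier = /* RAM tier */;
//   std::shared_ptr<rocksdb::PersistentCacheTier> nvm_tier = /* NVM tier */;
//   ram_tier->set_next_tier(nvm_tier);  // RAM misses fall through to NVM
//   // nvm_tier->next_tier() is left null, so NVM is the last tier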
namespace rocksdb {
// Persistent Cache Options
//
// This struct captures all the options that are used to configure persistent
// cache. Some of the terminologies used in naming the options are
//
// dispatch size :
// This is the size in which IO is dispatched to the device
//
// write buffer size :
// This is the size of an individual write buffer. Write buffers are
// grouped to form a buffered file.
//
// cache size :
// This is the logical maximum for the cache size
//
// qdepth :
// This is the max number of IOs that can be issued to the device in parallel
//
// pipelining :
// The writer code path follows a pipelined architecture, which means the
// operations are handed off from one stage to another
//
// pipelining backlog size :
// With the pipelined architecture, there can always be a backlog of ops in
// the pipeline queues. This is the maximum backlog size after which ops are
// dropped from the queue
struct PersistentCacheOptions {
explicit PersistentCacheOptions(Env* const _env, const std::string& _path,
const uint64_t _cache_size,
const std::shared_ptr<Logger>& _log,
const uint32_t _write_buffer_size = 1 * 1024 *
1024) {
env = _env;
path = _path;
log = _log;
cache_size = _cache_size;
writer_dispatch_size = write_buffer_size = _write_buffer_size;
}
//
// Validate the settings. Our intention is to catch erroneous settings ahead
// of time instead of violating invariants or causing deadlocks.
//
Status ValidateSettings() const {
// (1) check pre-conditions for variables
if (!env || path.empty()) {
return Status::InvalidArgument("empty or null args");
}
// (2) assert size related invariants
// - cache size cannot be less than cache file size
// - individual write buffer size must be less than cache file size
// - total write buffer size cannot be less than 2X cache file size
if (cache_size < cache_file_size || write_buffer_size >= cache_file_size ||
write_buffer_size * write_buffer_count() < 2 * cache_file_size) {
return Status::InvalidArgument("invalid cache size");
}
// (3) check writer settings
// - queue depth cannot be 0
// - writer_dispatch_size cannot be greater than write_buffer_size
// - dispatch size and buffer size need to be aligned
if (!writer_qdepth || writer_dispatch_size > write_buffer_size ||
write_buffer_size % writer_dispatch_size) {
return Status::InvalidArgument("invalid writer settings");
}
return Status::OK();
}
//
// Env abstraction to use for system-level operations
//
Env* env;
//
// Path for the block cache where blocks are persisted
//
std::string path;
//
// Log handle for logging messages
//
std::shared_ptr<Logger> log;
//
// Logical cache size
//
uint64_t cache_size = std::numeric_limits<uint64_t>::max();
// cache-file-size
//
// Cache consists of multiple small files. This parameter defines the
// size of an individual cache file
//
// default: 100MiB
uint32_t cache_file_size = 100ULL * 1024 * 1024;
// writer-qdepth
//
// The writers can issue IO to the device in parallel. This parameter
// controls the max number of IOs that can be issued in parallel to the
// block device
//
// default: 1
uint32_t writer_qdepth = 1;
// pipeline-writes
//
// The writes optionally follow a pipelined architecture. This helps
// avoid regression in the eviction code path of the primary tier. This
// parameter defines if pipelining is enabled or disabled
//
// default: true
bool pipeline_writes_ = true;
// max-write-pipeline-backlog-size
//
// Max pipeline buffer size. This is the maximum backlog we can accumulate
// while waiting for writes. After the limit, new ops will be dropped.
//
// Default: 1GiB
uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024;
// write-buffer-size
//
// This is the size in which buffer slabs are allocated.
//
// Default: 1M
uint32_t write_buffer_size = 1ULL * 1024 * 1024;
// write-buffer-count
//
// This is the total number of buffer slabs. This is calculated as a factor
// of the cache file size in order to avoid deadlock.
size_t write_buffer_count() const {
assert(write_buffer_size);
return static_cast<size_t>((writer_qdepth + 1.2) * cache_file_size /
write_buffer_size);
}
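//
// For example, with the defaults above (writer_qdepth = 1,
// cache_file_size = 100MiB, write_buffer_size = 1MiB), write_buffer_count()
// returns (1 + 1.2) * 100 = 220 buffer slabs, i.e. about 220MiB of buffer
// memory, which satisfies the 2X cache file size invariant checked in
// ValidateSettings().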
// writer-dispatch-size
//
// The writer thread will dispatch the IO at the specified IO size
//
// default: 1M
uint64_t writer_dispatch_size = 1ULL * 1024 * 1024;
PersistentCacheOptions MakePersistentCacheOptions(
const std::string& path, const uint64_t size,
const std::shared_ptr<Logger>& log);
};
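// Illustrative sketch (not part of the original header): constructing and
// validating a PersistentCacheOptions instance. The path "/tmp/pcache" and
// the 4GiB cache size are arbitrary example values.
//
//   rocksdb::PersistentCacheOptions opt(
//       rocksdb::Env::Default(), "/tmp/pcache",
//       /*_cache_size=*/4ULL * 1024 * 1024 * 1024,
//       /*_log=*/nullptr);
//   rocksdb::Status s = opt.ValidateSettings();
//   assert(s.ok());  // the defaults satisfy the size and writer invariants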
// Persistent Cache Tier
//
// This is a logical abstraction that defines a tier of the persistent cache.
// Tiers can be stacked over one another. PersistentCache provides the basic
// definition for accessing/storing in the cache. PersistentCacheTier extends
// the interface to enable management and stacking of tiers.
class PersistentCacheTier : public PersistentCache {
public:
typedef std::shared_ptr<PersistentCacheTier> Tier;
typedef std::map<std::string, double> TierStats;
virtual ~PersistentCacheTier() {}
// Open the persistent cache tier
virtual Status Open();
// Close the persistent cache tier
virtual Status Close();
// Flush the pending writes
virtual void Flush();
// Reserve space up to 'size' bytes
virtual bool Reserve(const size_t size);
// Erase a key from the cache
virtual bool Erase(const Slice& key);
// Print stats to string recursively
virtual std::string PrintStats();
// Expose stats
virtual std::vector<TierStats> Stats() = 0;
// Insert to page cache
virtual Status Insert(const Slice& page_key, const char* data,
const size_t size) = 0;
// Lookup page cache by page identifier
virtual Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
size_t* size) = 0;
// Return a reference to next tier
virtual Tier& next_tier() { return next_tier_; }
// Set the value for next tier
virtual void set_next_tier(const Tier& tier) {
assert(!next_tier_);
next_tier_ = tier;
}
private:
Tier next_tier_; // next tier
};
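// Illustrative sketch (not part of the original header): a minimal custom
// tier that falls through to the next tier on a local miss. "MyTier" and its
// local storage are hypothetical; a real tier would also implement the
// remaining virtual methods (e.g. Stats()) and persist data to its medium.
//
//   class MyTier : public rocksdb::PersistentCacheTier {
//    public:
//     rocksdb::Status Insert(const rocksdb::Slice& key, const char* data,
//                            const size_t size) override {
//       // ... store the page locally ...
//       return rocksdb::Status::OK();
//     }
//     rocksdb::Status Lookup(const rocksdb::Slice& key,
//                            std::unique_ptr<char[]>* data,
//                            size_t* size) override {
//       // ... return OK if found locally, otherwise try the tier below ...
//       return next_tier() ? next_tier()->Lookup(key, data, size)
//                          : rocksdb::Status::NotFound();
//     }
//   };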
// PersistentTieredCache
//
// Abstraction that helps you construct tiers of persistent caches as a
// unified cache. The tiers of the cache act as a single tier for ease of
// management and support the PersistentCache methods for accessing data.
class PersistentTieredCache : public PersistentCacheTier {
public:
virtual ~PersistentTieredCache();
Status Open() override;
Status Close() override;
void Flush() override;
bool Erase(const Slice& key) override;
std::string PrintStats() override;
Status Insert(const Slice& page_key, const char* data,
const size_t size) override;
Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
size_t* size) override;
void AddTier(const Tier& tier);
// The next tier of the tiered cache is the next tier of its last
// (bottom-most) tier
Tier& next_tier() override {
assert(!tiers_.empty());
return tiers_.back()->next_tier();
}
void set_next_tier(const Tier& tier) override {
assert(!tiers_.empty());
tiers_.back()->set_next_tier(tier);
}
protected:
std::list<Tier> tiers_; // list of tiers top-down
};
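// Illustrative sketch (not part of the original header): composing tiers into
// a PersistentTieredCache and using it through the PersistentCache interface.
// "tiered" is assumed to point at a concrete implementation (the class above
// still has pure virtuals, e.g. Stats()); "ram_tier" and "ssd_tier" are
// hypothetical concrete tier instances.
//
//   std::shared_ptr<rocksdb::PersistentTieredCache> tiered = /* concrete */;
//   tiered->AddTier(ram_tier);   // fastest tier first (top-down)
//   tiered->AddTier(ssd_tier);   // RAM misses fall through to SSD
//   rocksdb::Status s = tiered->Open();
//   s = tiered->Insert(page_key, data, size);
//   std::unique_ptr<char[]> value;
//   size_t value_size = 0;
//   s = tiered->Lookup(page_key, &value, &value_size);
//   tiered->Close();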
} // namespace rocksdb
#endif  // ROCKSDB_LITE