Persistent Read Cache (part 6) Block Cache Tier Implementation

Summary:
The patch is a continuation of part 5. It glues the abstraction for
file layout and metadata, and flush out the implementation of the API. It
adds unit tests for the implementation.

Test Plan: Run unit tests

Subscribers: andrewkr, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D57549
main
krad 9 years ago
parent 64046e581c
commit c116b47804
  1. 1
      src.mk
  2. 3
      util/io_posix.cc
  3. 358
      utilities/persistent_cache/block_cache_tier.cc
  4. 145
      utilities/persistent_cache/block_cache_tier.h
  5. 17
      utilities/persistent_cache/block_cache_tier_file.cc
  6. 16
      utilities/persistent_cache/block_cache_tier_file.h
  7. 18
      utilities/persistent_cache/block_cache_tier_file_buffer.h
  8. 14
      utilities/persistent_cache/block_cache_tier_metadata.cc
  9. 3
      utilities/persistent_cache/block_cache_tier_metadata.h
  10. 338
      utilities/persistent_cache/persistent_cache_test.cc
  11. 158
      utilities/persistent_cache/persistent_cache_test.h
  12. 26
      utilities/persistent_cache/persistent_cache_tier.cc
  13. 190
      utilities/persistent_cache/persistent_cache_tier.h

@ -136,6 +136,7 @@ LIB_SOURCES = \
utilities/persistent_cache/volatile_tier_impl.cc \ utilities/persistent_cache/volatile_tier_impl.cc \
utilities/persistent_cache/block_cache_tier_file.cc \ utilities/persistent_cache/block_cache_tier_file.cc \
utilities/persistent_cache/block_cache_tier_metadata.cc \ utilities/persistent_cache/block_cache_tier_metadata.cc \
utilities/persistent_cache/block_cache_tier.cc \
utilities/redis/redis_lists.cc \ utilities/redis/redis_lists.cc \
utilities/simulator_cache/sim_cache.cc \ utilities/simulator_cache/sim_cache.cc \
utilities/spatialdb/spatial_db.cc \ utilities/spatialdb/spatial_db.cc \

@ -145,8 +145,7 @@ Status ReadUnaligned(int fd, Slice* data, const uint64_t offset,
Status DirectIORead(int fd, Slice* result, size_t off, size_t n, Status DirectIORead(int fd, Slice* result, size_t off, size_t n,
char* scratch) { char* scratch) {
if (IsSectorAligned(off) && IsSectorAligned(n) && if (IsSectorAligned(off) && IsSectorAligned(n) && IsPageAligned(scratch)) {
IsPageAligned(result->data())) {
return ReadAligned(fd, result, off, n, scratch); return ReadAligned(fd, result, off, n, scratch);
} }
return ReadUnaligned(fd, result, off, n, scratch); return ReadUnaligned(fd, result, off, n, scratch);

@ -0,0 +1,358 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE
#include "utilities/persistent_cache/block_cache_tier.h"
#include <regex>
#include <utility>
#include <vector>
#include "util/stop_watch.h"
#include "utilities/persistent_cache/block_cache_tier_file.h"
namespace rocksdb {
//
// BlockCacheImpl
//
Status BlockCacheTier::Open() {
Status status;
WriteLock _(&lock_);
assert(!size_);
// Check the validity of the options
status = opt_.ValidateSettings();
assert(status.ok());
if (!status.ok()) {
Error(opt_.log, "Invalid block cache options");
return status;
}
// Create base directory or cleanup existing directory
status = opt_.env->CreateDirIfMissing(opt_.path);
if (!status.ok()) {
Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(),
status.ToString().c_str());
return status;
}
// Create base/<cache dir> directory
status = opt_.env->CreateDir(GetCachePath());
if (!status.ok()) {
// directory already exisits, clean it up
status = CleanupCacheFolder(GetCachePath());
assert(status.ok());
if (!status.ok()) {
Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(),
status.ToString().c_str());
return status;
}
}
assert(!cache_file_);
NewCacheFile();
assert(cache_file_);
if (opt_.pipeline_writes_) {
assert(!insert_th_.joinable());
insert_th_ = std::thread(&BlockCacheTier::InsertMain, this);
}
return Status::OK();
}
Status BlockCacheTier::CleanupCacheFolder(const std::string& folder) {
std::vector<std::string> files;
Status status = opt_.env->GetChildren(folder, &files);
if (!status.ok()) {
Error(opt_.log, "Error getting files for %s. %s", folder.c_str(),
status.ToString().c_str());
return status;
}
// cleanup files with the patter :digi:.rc
for (auto file : files) {
try {
const std::regex cache_file_regex("(0-9)+\\.rc$");
if (std::regex_match(file, cache_file_regex)) {
// cache file
Info(opt_.log, "Removing file %s.", file.c_str());
status = opt_.env->DeleteFile(folder + "/" + file);
if (!status.ok()) {
Error(opt_.log, "Error deleting file %s. %s", file.c_str(),
status.ToString().c_str());
return Status::IOError("Error deleting file " + file);
}
} else {
Info(opt_.log, "Skipping file %s.", file.c_str());
}
} catch (const std::regex_error& e) {
// Since std library is evolving, you can potentially get an exception for
// certain older compiler version. It is safer to exit cleanly.
return Status::IOError(e.what());
}
}
return Status::OK();
}
Status BlockCacheTier::Close() {
// stop the insert thread
if (opt_.pipeline_writes_ && insert_th_.joinable()) {
InsertOp op(/*quit=*/true);
insert_ops_.Push(std::move(op));
insert_th_.join();
}
// stop the writer before
writer_.Stop();
// clear all metadata
WriteLock _(&lock_);
metadata_.Clear();
return Status::OK();
}
std::string BlockCacheTier::PrintStats() {
std::ostringstream os;
os << "persistentcache.blockcachetier.bytes_piplined: "
<< stats_.bytes_pipelined_.ToString() << std::endl
<< "persistentcache.blockcachetier.bytes_written: "
<< stats_.bytes_written_.ToString() << std::endl
<< "persistentcache.blockcachetier.bytes_read: "
<< stats_.bytes_read_.ToString() << std::endl
<< "persistentcache.blockcachetier.insert_dropped"
<< stats_.insert_dropped_ << std::endl
<< "persistentcache.blockcachetier.cache_hits: " << stats_.cache_hits_
<< std::endl
<< "persistentcache.blockcachetier.cache_misses: " << stats_.cache_misses_
<< std::endl
<< "persistentcache.blockcachetier.cache_errors: " << stats_.cache_errors_
<< std::endl
<< "persistentcache.blockcachetier.cache_hits_pct: "
<< stats_.CacheHitPct() << std::endl
<< "persistentcache.blockcachetier.cache_misses_pct: "
<< stats_.CacheMissPct() << std::endl
<< "persistentcache.blockcachetier.read_hit_latency: "
<< stats_.read_hit_latency_.ToString() << std::endl
<< "persistentcache.blockcachetier.read_miss_latency: "
<< stats_.read_miss_latency_.ToString() << std::endl
<< "persistenetcache.blockcachetier.write_latency: "
<< stats_.write_latency_.ToString() << std::endl
<< PersistentCacheTier::PrintStats();
return os.str();
}
Status BlockCacheTier::Insert(const Slice& key, const char* data,
const size_t size) {
// update stats
stats_.bytes_pipelined_.Add(size);
if (opt_.pipeline_writes_) {
// off load the write to the write thread
insert_ops_.Push(
InsertOp(key.ToString(), std::move(std::string(data, size))));
return Status::OK();
}
assert(!opt_.pipeline_writes_);
return InsertImpl(key, Slice(data, size));
}
void BlockCacheTier::InsertMain() {
while (true) {
InsertOp op(insert_ops_.Pop());
if (op.signal_) {
// that is a secret signal to exit
break;
}
size_t retry = 0;
Status s;
while ((s = InsertImpl(Slice(op.key_), Slice(op.data_))).IsTryAgain()) {
if (retry > kMaxRetry) {
break;
}
// this can happen when the buffers are full, we wait till some buffers
// are free. Why don't we wait inside the code. This is because we want
// to support both pipelined and non-pipelined mode
buffer_allocator_.WaitUntilUsable();
retry++;
}
if (!s.ok()) {
stats_.insert_dropped_++;
}
}
}
Status BlockCacheTier::InsertImpl(const Slice& key, const Slice& data) {
// pre-condition
assert(key.size());
assert(data.size());
assert(cache_file_);
StopWatchNano timer(opt_.env);
WriteLock _(&lock_);
LBA lba;
if (metadata_.Lookup(key, &lba)) {
// the key already exisits, this is duplicate insert
return Status::OK();
}
while (!cache_file_->Append(key, data, &lba)) {
if (!cache_file_->Eof()) {
Debug(opt_.log, "Error inserting to cache file %d",
cache_file_->cacheid());
stats_.write_latency_.Add(timer.ElapsedNanos() / 1000);
return Status::TryAgain();
}
assert(cache_file_->Eof());
NewCacheFile();
}
// Insert into lookup index
BlockInfo* info = metadata_.Insert(key, lba);
assert(info);
if (!info) {
return Status::IOError("Unexpected error inserting to index");
}
// insert to cache file reverse mapping
cache_file_->Add(info);
// update stats
stats_.bytes_written_.Add(data.size());
stats_.write_latency_.Add(timer.ElapsedNanos() / 1000);
return Status::OK();
}
Status BlockCacheTier::Lookup(const Slice& key, unique_ptr<char[]>* val,
size_t* size) {
StopWatchNano timer(opt_.env);
LBA lba;
bool status;
status = metadata_.Lookup(key, &lba);
if (!status) {
stats_.cache_misses_++;
stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
return Status::NotFound("blockcache: key not found");
}
BlockCacheFile* const file = metadata_.Lookup(lba.cache_id_);
if (!file) {
// this can happen because the block index and cache file index are
// different, and the cache file might be removed between the two lookups
stats_.cache_misses_++;
stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
return Status::NotFound("blockcache: cache file not found");
}
assert(file->refs_);
unique_ptr<char[]> scratch(new char[lba.size_]);
Slice blk_key;
Slice blk_val;
status = file->Read(lba, &blk_key, &blk_val, scratch.get());
--file->refs_;
assert(status);
if (!status) {
stats_.cache_misses_++;
stats_.cache_errors_++;
stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
return Status::NotFound("blockcache: error reading data");
}
assert(blk_key == key);
val->reset(new char[blk_val.size()]);
memcpy(val->get(), blk_val.data(), blk_val.size());
*size = blk_val.size();
stats_.bytes_read_.Add(*size);
stats_.cache_hits_++;
stats_.read_hit_latency_.Add(timer.ElapsedNanos() / 1000);
return Status::OK();
}
bool BlockCacheTier::Erase(const Slice& key) {
WriteLock _(&lock_);
BlockInfo* info = metadata_.Remove(key);
assert(info);
delete info;
return true;
}
void BlockCacheTier::NewCacheFile() {
lock_.AssertHeld();
Info(opt_.log, "Creating cache file %d", writer_cache_id_);
writer_cache_id_++;
cache_file_ = new WriteableCacheFile(opt_.env, &buffer_allocator_, &writer_,
GetCachePath(), writer_cache_id_,
opt_.cache_file_size, opt_.log);
bool status;
status =
cache_file_->Create(opt_.enable_direct_writes, opt_.enable_direct_reads);
assert(status);
// insert to cache files tree
status = metadata_.Insert(cache_file_);
(void)status;
assert(status);
}
bool BlockCacheTier::Reserve(const size_t size) {
WriteLock _(&lock_);
assert(size_ <= opt_.cache_size);
if (size + size_ <= opt_.cache_size) {
// there is enough space to write
size_ += size;
return true;
}
assert(size + size_ >= opt_.cache_size);
// there is not enough space to fit the requested data
// we can clear some space by evicting cold data
const double retain_fac = (100 - kEvictPct) / static_cast<double>(100);
while (size + size_ > opt_.cache_size * retain_fac) {
unique_ptr<BlockCacheFile> f(metadata_.Evict());
if (!f) {
// nothing is evictable
return false;
}
assert(!f->refs_);
size_t file_size;
if (!f->Delete(&file_size).ok()) {
// unable to delete file
return false;
}
assert(file_size <= size_);
size_ -= file_size;
}
size_ += size;
assert(size_ <= opt_.cache_size * 0.9);
return true;
}
} // namespace rocksdb
#endif // ifndef ROCKSDB_LITE

@ -0,0 +1,145 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once
#include <unistd.h>
#include <list>
#include <memory>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/persistent_cache.h"
#include "utilities/persistent_cache/block_cache_tier_file.h"
#include "utilities/persistent_cache/block_cache_tier_metadata.h"
#include "utilities/persistent_cache/persistent_cache_util.h"
#include "db/skiplist.h"
#include "port/port_posix.h"
#include "util/arena.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/histogram.h"
#include "util/mutexlock.h"
namespace rocksdb {
//
// Block cache tier implementation
//
class BlockCacheTier : public PersistentCacheTier {
public:
explicit BlockCacheTier(const PersistentCacheConfig& opt)
: opt_(opt),
insert_ops_(opt_.max_write_pipeline_backlog_size),
buffer_allocator_(opt.write_buffer_size, opt.write_buffer_count()),
writer_(this, opt_.writer_qdepth, opt_.writer_dispatch_size) {
Info(opt_.log, "Initializing allocator. size=%d B count=%d",
opt_.write_buffer_size, opt_.write_buffer_count());
}
virtual ~BlockCacheTier() {
// By contract, the user should have called stop before destroying the
// object
assert(!insert_th_.joinable());
}
Status Insert(const Slice& key, const char* data, const size_t size) override;
Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
size_t* size) override;
Status Open() override;
Status Close() override;
bool Erase(const Slice& key) override;
bool Reserve(const size_t size) override;
bool IsCompressed() override { return opt_.is_compressed; }
std::string PrintStats() override;
void TEST_Flush() override {
while (insert_ops_.Size()) {
/* sleep override */ sleep(1);
}
}
private:
// Percentage of cache to be evicted when the cache is full
static const size_t kEvictPct = 10;
// Max attempts to insert key, value to cache in pipelined mode
static const size_t kMaxRetry = 3;
// Pipelined operation
struct InsertOp {
explicit InsertOp(const bool signal) : signal_(signal) {}
explicit InsertOp(std::string&& key, const std::string& data)
: key_(std::move(key)), data_(data) {}
~InsertOp() {}
InsertOp() = delete;
InsertOp(InsertOp&& rhs) = default;
InsertOp& operator=(InsertOp&& rhs) = default;
// used for estimating size by bounded queue
size_t Size() { return data_.size() + key_.size(); }
std::string key_;
std::string data_;
const bool signal_ = false; // signal to request processing thread to exit
};
// entry point for insert thread
void InsertMain();
// insert implementation
Status InsertImpl(const Slice& key, const Slice& data);
// Create a new cache file
void NewCacheFile();
// Get cache directory path
std::string GetCachePath() const { return opt_.path + "/cache"; }
// Cleanup folder
Status CleanupCacheFolder(const std::string& folder);
// Statistics
struct Stats {
HistogramImpl bytes_pipelined_;
HistogramImpl bytes_written_;
HistogramImpl bytes_read_;
HistogramImpl read_hit_latency_;
HistogramImpl read_miss_latency_;
HistogramImpl write_latency_;
uint64_t cache_hits_ = 0;
uint64_t cache_misses_ = 0;
uint64_t cache_errors_ = 0;
uint64_t insert_dropped_ = 0;
double CacheHitPct() const {
const auto lookups = cache_hits_ + cache_misses_;
return lookups ? 100 * cache_hits_ / static_cast<double>(lookups) : 0.0;
}
double CacheMissPct() const {
const auto lookups = cache_hits_ + cache_misses_;
return lookups ? 100 * cache_misses_ / static_cast<double>(lookups) : 0.0;
}
};
port::RWMutex lock_; // Synchronization
const PersistentCacheConfig opt_; // BlockCache options
BoundedQueue<InsertOp> insert_ops_; // Ops waiting for insert
std::thread insert_th_; // Insert thread
uint32_t writer_cache_id_ = 0; // Current cache file identifier
WriteableCacheFile* cache_file_ = nullptr; // Current cache file reference
CacheWriteBufferAllocator buffer_allocator_; // Buffer provider
ThreadedWriter writer_; // Writer threads
BlockCacheTierMetadata metadata_; // Cache meta data manager
std::atomic<uint64_t> size_{0}; // Size of the cache
Stats stats_; // Statistics
};
} // namespace rocksdb

@ -189,12 +189,12 @@ bool CacheRecord::Deserialize(const Slice& data) {
// RandomAccessFile // RandomAccessFile
// //
bool RandomAccessCacheFile::Open() { bool RandomAccessCacheFile::Open(const bool enable_direct_reads) {
WriteLock _(&rwlock_); WriteLock _(&rwlock_);
return OpenImpl(); return OpenImpl(enable_direct_reads);
} }
bool RandomAccessCacheFile::OpenImpl() { bool RandomAccessCacheFile::OpenImpl(const bool enable_direct_reads) {
rwlock_.AssertHeld(); rwlock_.AssertHeld();
Debug(log_, "Opening cache file %s", Path().c_str()); Debug(log_, "Opening cache file %s", Path().c_str());
@ -265,9 +265,12 @@ WriteableCacheFile::~WriteableCacheFile() {
ClearBuffers(); ClearBuffers();
} }
bool WriteableCacheFile::Create() { bool WriteableCacheFile::Create(const bool enable_direct_writes,
const bool enable_direct_reads) {
WriteLock _(&rwlock_); WriteLock _(&rwlock_);
enable_direct_reads_ = enable_direct_reads;
Debug(log_, "Creating new cache %s (max size is %d B)", Path().c_str(), Debug(log_, "Creating new cache %s (max size is %d B)", Path().c_str(),
max_size_); max_size_);
@ -388,7 +391,7 @@ void WriteableCacheFile::DispatchBuffer() {
// pad it with zero for direct IO // pad it with zero for direct IO
buf->FillTrailingZeros(); buf->FillTrailingZeros();
assert(buf->Used() % FILE_ALIGNMENT_SIZE == 0); assert(buf->Used() % kFileAlignmentSize == 0);
writer_->Write(file_.get(), buf, file_off, writer_->Write(file_.get(), buf, file_off,
std::bind(&WriteableCacheFile::BufferWriteDone, this)); std::bind(&WriteableCacheFile::BufferWriteDone, this));
@ -417,7 +420,7 @@ void WriteableCacheFile::CloseAndOpenForReading() {
// Our env abstraction do not allow reading from a file opened for appending // Our env abstraction do not allow reading from a file opened for appending
// We need close the file and re-open it for reading // We need close the file and re-open it for reading
Close(); Close();
RandomAccessCacheFile::OpenImpl(); RandomAccessCacheFile::OpenImpl(enable_direct_reads_);
} }
bool WriteableCacheFile::ReadBuffer(const LBA& lba, Slice* key, Slice* block, bool WriteableCacheFile::ReadBuffer(const LBA& lba, Slice* key, Slice* block,
@ -523,7 +526,9 @@ void ThreadedWriter::Stop() {
// wait for all threads to exit // wait for all threads to exit
for (auto& th : threads_) { for (auto& th : threads_) {
th.join(); th.join();
assert(!th.joinable());
} }
threads_.clear();
} }
void ThreadedWriter::Write(WritableFile* const file, CacheWriteBuffer* buf, void ThreadedWriter::Write(WritableFile* const file, CacheWriteBuffer* buf,

@ -114,7 +114,9 @@ class BlockCacheFile : public LRUElement<BlockCacheFile> {
} }
// get file path // get file path
std::string Path() const { return dir_ + "/" + std::to_string(cache_id_); } std::string Path() const {
return dir_ + "/" + std::to_string(cache_id_) + ".rc";
}
// get cache ID // get cache ID
uint32_t cacheid() const { return cache_id_; } uint32_t cacheid() const { return cache_id_; }
// Add block information to file data // Add block information to file data
@ -150,7 +152,7 @@ class RandomAccessCacheFile : public BlockCacheFile {
virtual ~RandomAccessCacheFile() {} virtual ~RandomAccessCacheFile() {}
// open file for reading // open file for reading
bool Open(); bool Open(const bool enable_direct_reads);
// read data from the disk // read data from the disk
bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override; bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override;
@ -158,7 +160,7 @@ class RandomAccessCacheFile : public BlockCacheFile {
std::unique_ptr<RandomAccessFile> file_; std::unique_ptr<RandomAccessFile> file_;
protected: protected:
bool OpenImpl(); bool OpenImpl(const bool enable_direct_reads);
bool ParseRec(const LBA& lba, Slice* key, Slice* val, char* scratch); bool ParseRec(const LBA& lba, Slice* key, Slice* val, char* scratch);
std::shared_ptr<Logger> log_; // log file std::shared_ptr<Logger> log_; // log file
@ -183,7 +185,7 @@ class WriteableCacheFile : public RandomAccessCacheFile {
virtual ~WriteableCacheFile(); virtual ~WriteableCacheFile();
// create file on disk // create file on disk
bool Create(); bool Create(const bool enable_direct_writes, const bool enable_direct_reads);
// read data from logical file // read data from logical file
bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override { bool Read(const LBA& lba, Slice* key, Slice* block, char* scratch) override {
@ -205,7 +207,7 @@ class WriteableCacheFile : public RandomAccessCacheFile {
private: private:
friend class ThreadedWriter; friend class ThreadedWriter;
static const size_t FILE_ALIGNMENT_SIZE = 4 * 1024; // align file size static const size_t kFileAlignmentSize = 4 * 1024; // align file size
bool ReadBuffer(const LBA& lba, Slice* key, Slice* block, char* scratch); bool ReadBuffer(const LBA& lba, Slice* key, Slice* block, char* scratch);
bool ReadBuffer(const LBA& lba, char* data); bool ReadBuffer(const LBA& lba, char* data);
@ -240,6 +242,8 @@ class WriteableCacheFile : public RandomAccessCacheFile {
size_t buf_woff_ = 0; // off into bufs_ to write size_t buf_woff_ = 0; // off into bufs_ to write
size_t buf_doff_ = 0; // off into bufs_ to dispatch size_t buf_doff_ = 0; // off into bufs_ to dispatch
size_t pending_ios_ = 0; // Number of ios to disk in-progress size_t pending_ios_ = 0; // Number of ios to disk in-progress
bool enable_direct_reads_ = false; // Should we enable direct reads
// when reading from disk
}; };
// //
@ -267,7 +271,7 @@ class ThreadedWriter : public Writer {
explicit ThreadedWriter(PersistentCacheTier* const cache, const size_t qdepth, explicit ThreadedWriter(PersistentCacheTier* const cache, const size_t qdepth,
const size_t io_size); const size_t io_size);
virtual ~ThreadedWriter() {} virtual ~ThreadedWriter() { assert(threads_.empty()); }
void Stop() override; void Stop() override;
void Write(WritableFile* const file, CacheWriteBuffer* buf, void Write(WritableFile* const file, CacheWriteBuffer* buf,

@ -61,9 +61,9 @@ class CacheWriteBuffer {
// //
class CacheWriteBufferAllocator { class CacheWriteBufferAllocator {
public: public:
explicit CacheWriteBufferAllocator(const uint32_t buffer_size, explicit CacheWriteBufferAllocator(const size_t buffer_size,
const uint32_t buffer_count) const size_t buffer_count)
: buffer_size_(buffer_size) { : cond_empty_(&lock_), buffer_size_(buffer_size) {
MutexLock _(&lock_); MutexLock _(&lock_);
buffer_size_ = buffer_size; buffer_size_ = buffer_size;
for (uint32_t i = 0; i < buffer_count; i++) { for (uint32_t i = 0; i < buffer_count; i++) {
@ -71,6 +71,7 @@ class CacheWriteBufferAllocator {
assert(buf); assert(buf);
if (buf) { if (buf) {
bufs_.push_back(buf); bufs_.push_back(buf);
cond_empty_.Signal();
} }
} }
} }
@ -93,7 +94,6 @@ class CacheWriteBufferAllocator {
assert(!bufs_.empty()); assert(!bufs_.empty());
CacheWriteBuffer* const buf = bufs_.front(); CacheWriteBuffer* const buf = bufs_.front();
bufs_.pop_front(); bufs_.pop_front();
return buf; return buf;
} }
@ -102,6 +102,15 @@ class CacheWriteBufferAllocator {
MutexLock _(&lock_); MutexLock _(&lock_);
buf->Reset(); buf->Reset();
bufs_.push_back(buf); bufs_.push_back(buf);
cond_empty_.Signal();
}
void WaitUntilUsable() {
// We are asked to wait till we have buffers available
MutexLock _(&lock_);
while (bufs_.empty()) {
cond_empty_.Wait();
}
} }
size_t Capacity() const { return bufs_.size() * buffer_size_; } size_t Capacity() const { return bufs_.size() * buffer_size_; }
@ -110,6 +119,7 @@ class CacheWriteBufferAllocator {
private: private:
port::Mutex lock_; // Sync lock port::Mutex lock_; // Sync lock
port::CondVar cond_empty_; // Condition var for empty buffers
size_t buffer_size_; // Size of each buffer size_t buffer_size_; // Size of each buffer
std::list<CacheWriteBuffer*> bufs_; // Buffer stash std::list<CacheWriteBuffer*> bufs_; // Buffer stash
}; };

@ -36,8 +36,12 @@ void BlockCacheTierMetadata::Clear() {
block_index_.Clear([](BlockInfo* arg){ delete arg; }); block_index_.Clear([](BlockInfo* arg){ delete arg; });
} }
bool BlockCacheTierMetadata::Insert(BlockInfo* binfo) { BlockInfo* BlockCacheTierMetadata::Insert(const Slice& key, const LBA& lba) {
return block_index_.Insert(binfo); std::unique_ptr<BlockInfo> binfo(new BlockInfo(key, lba));
if (!block_index_.Insert(binfo.get())) {
return nullptr;
}
return binfo.release();
} }
bool BlockCacheTierMetadata::Lookup(const Slice& key, LBA* lba) { bool BlockCacheTierMetadata::Lookup(const Slice& key, LBA* lba) {
@ -59,10 +63,8 @@ bool BlockCacheTierMetadata::Lookup(const Slice& key, LBA* lba) {
BlockInfo* BlockCacheTierMetadata::Remove(const Slice& key) { BlockInfo* BlockCacheTierMetadata::Remove(const Slice& key) {
BlockInfo lookup_key(key); BlockInfo lookup_key(key);
BlockInfo* binfo = nullptr; BlockInfo* binfo = nullptr;
bool status __attribute__((__unused__)) = bool ok __attribute__((__unused__)) = block_index_.Erase(&lookup_key, &binfo);
block_index_.Erase(&lookup_key, &binfo); assert(ok);
(void)status;
assert(status);
return binfo; return binfo;
} }

@ -60,7 +60,8 @@ class BlockCacheTierMetadata {
BlockCacheFile* Lookup(const uint32_t cache_id); BlockCacheFile* Lookup(const uint32_t cache_id);
// Insert block information to block index // Insert block information to block index
bool Insert(BlockInfo* binfo); BlockInfo* Insert(const Slice& key, const LBA& lba);
// bool Insert(BlockInfo* binfo);
// Lookup block information from block index // Lookup block information from block index
bool Lookup(const Slice& key, LBA* lba); bool Lookup(const Slice& key, LBA* lba);

@ -14,36 +14,356 @@
#include <memory> #include <memory>
#include <thread> #include <thread>
#include "utilities/persistent_cache/block_cache_tier.h"
namespace rocksdb { namespace rocksdb {
#if !(defined(__clang__) && defined(OS_LINUX)) static const double kStressFactor = .125;
static void OnOpenForRead(void* arg) {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"NewRandomAccessFile:O_DIRECT",
std::bind(OnOpenForRead, std::placeholders::_1));
}
static void OnOpenForWrite(void* arg) {
int* val = static_cast<int*>(arg);
*val &= ~O_DIRECT;
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"NewWritableFile:O_DIRECT",
std::bind(OnOpenForWrite, std::placeholders::_1));
}
//
// Simple logger that prints message on stdout
//
class ConsoleLogger : public Logger {
public:
using Logger::Logv;
ConsoleLogger() : Logger(InfoLogLevel::ERROR_LEVEL) {}
void Logv(const char* format, va_list ap) override {
MutexLock _(&lock_);
vprintf(format, ap);
printf("\n");
}
port::Mutex lock_;
};
// construct a tiered RAM+Block cache
std::unique_ptr<PersistentTieredCache> NewTieredCache(
const size_t mem_size, const PersistentCacheConfig& opt) {
std::unique_ptr<PersistentTieredCache> tcache(new PersistentTieredCache());
// create primary tier
assert(mem_size);
auto pcache = std::shared_ptr<PersistentCacheTier>(new VolatileCacheTier(
/*is_compressed*/ true, mem_size));
tcache->AddTier(pcache);
// create secondary tier
auto scache = std::shared_ptr<PersistentCacheTier>(new BlockCacheTier(opt));
tcache->AddTier(scache);
Status s = tcache->Open();
assert(s.ok());
return tcache;
}
// create block cache
std::unique_ptr<PersistentCacheTier> NewBlockCache(
Env* env, const std::string& path,
const uint64_t max_size = std::numeric_limits<uint64_t>::max(),
const bool enable_direct_writes = false) {
const uint32_t max_file_size = 12 * 1024 * 1024 * kStressFactor;
auto log = std::make_shared<ConsoleLogger>();
PersistentCacheConfig opt(env, path, max_size, log);
opt.cache_file_size = max_file_size;
opt.max_write_pipeline_backlog_size = std::numeric_limits<uint64_t>::max();
opt.enable_direct_writes = enable_direct_writes;
std::unique_ptr<PersistentCacheTier> scache(new BlockCacheTier(opt));
Status s = scache->Open();
assert(s.ok());
return scache;
}
// create a new cache tier
std::unique_ptr<PersistentTieredCache> NewTieredCache(
Env* env, const std::string& path, const uint64_t max_volatile_cache_size,
const uint64_t max_block_cache_size =
std::numeric_limits<uint64_t>::max()) {
const uint32_t max_file_size = 12 * 1024 * 1024 * kStressFactor;
auto log = std::make_shared<ConsoleLogger>();
auto opt = PersistentCacheConfig(env, path, max_block_cache_size, log);
opt.cache_file_size = max_file_size;
opt.max_write_pipeline_backlog_size = std::numeric_limits<uint64_t>::max();
// create tier out of the two caches
auto cache = NewTieredCache(max_volatile_cache_size, opt);
return cache;
}
PersistentCacheTierTest::PersistentCacheTierTest()
: path_(test::TmpDir(Env::Default()) + "/cache_test") {
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
rocksdb::SyncPoint::GetInstance()->SetCallBack("NewRandomAccessFile:O_DIRECT",
OnOpenForRead);
rocksdb::SyncPoint::GetInstance()->SetCallBack("NewWritableFile:O_DIRECT",
OnOpenForWrite);
}
// Volatile cache tests // Volatile cache tests
TEST_F(PersistentCacheTierTest, VolatileCacheInsert) { TEST_F(PersistentCacheTierTest, VolatileCacheInsert) {
for (auto nthreads : {1, 5}) { for (auto nthreads : {1, 5}) {
for (auto max_keys : {10 * 1024, 1 * 1024 * 1024}) { for (auto max_keys :
{10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) {
cache_ = std::make_shared<VolatileCacheTier>(); cache_ = std::make_shared<VolatileCacheTier>();
RunInsertTest(nthreads, max_keys); RunInsertTest(nthreads, max_keys);
} }
} }
} }
#endif // !(defined(__clang__) && defined(OS_LINUX))
TEST_F(PersistentCacheTierTest, VolatileCacheInsertWithEviction) { TEST_F(PersistentCacheTierTest, VolatileCacheInsertWithEviction) {
for (auto nthreads : {1, 5}) { for (auto nthreads : {1, 5}) {
for (auto max_keys : {1 * 1024 * 1024}) { for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) {
cache_ = std::make_shared<VolatileCacheTier>(/*compressed=*/true, cache_ = std::make_shared<VolatileCacheTier>(
/*size=*/1 * 1024 * 1024); /*compressed=*/true, /*size=*/1 * 1024 * 1024 * kStressFactor);
RunInsertTestWithEviction(nthreads, max_keys); RunInsertTestWithEviction(nthreads, max_keys);
} }
} }
} }
#if !(defined(__clang__) && defined(OS_LINUX)) // Block cache tests
TEST_F(PersistentCacheTierTest, BlockCacheInsert) {
for (auto direct_writes : {true, false}) {
for (auto nthreads : {1, 5}) {
for (auto max_keys :
{10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) {
cache_ = NewBlockCache(Env::Default(), path_,
/*size=*/std::numeric_limits<uint64_t>::max(),
direct_writes);
RunInsertTest(nthreads, max_keys);
}
}
}
}
TEST_F(PersistentCacheTierTest, BlockCacheInsertWithEviction) {
for (auto nthreads : {1, 5}) {
for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) {
cache_ = NewBlockCache(Env::Default(), path_,
/*max_size=*/200 * 1024 * 1024 * kStressFactor);
RunInsertTestWithEviction(nthreads, max_keys);
}
}
}
// Tiered cache tests
TEST_F(PersistentCacheTierTest, TieredCacheInsert) {
for (auto nthreads : {1, 5}) {
for (auto max_keys :
{10 * 1024 * kStressFactor, 1 * 1024 * 1024 * kStressFactor}) {
cache_ = NewTieredCache(Env::Default(), path_,
/*memory_size=*/1 * 1024 * 1024 * kStressFactor);
RunInsertTest(nthreads, max_keys);
}
}
}
TEST_F(PersistentCacheTierTest, TieredCacheInsertWithEviction) {
for (auto nthreads : {1, 5}) {
for (auto max_keys : {1 * 1024 * 1024 * kStressFactor}) {
cache_ = NewTieredCache(
Env::Default(), path_,
/*memory_size=*/1 * 1024 * 1024 * kStressFactor,
/*block_cache_size*/ 200 * 1024 * 1024 * kStressFactor);
RunInsertTestWithEviction(nthreads, max_keys);
}
}
}
std::shared_ptr<PersistentCacheTier> MakeVolatileCache(
const std::string& /*dbname*/) {
return std::make_shared<VolatileCacheTier>();
}
std::shared_ptr<PersistentCacheTier> MakeBlockCache(const std::string& dbname) {
return NewBlockCache(Env::Default(), dbname);
}
std::shared_ptr<PersistentCacheTier> MakeTieredCache(
const std::string& dbname) {
const auto memory_size = 1 * 1024 * 1024 * kStressFactor;
return NewTieredCache(Env::Default(), dbname, memory_size);
}
static void UniqueIdCallback(void* arg) {
int* result = reinterpret_cast<int*>(arg);
if (*result == -1) {
*result = 0;
}
rocksdb::SyncPoint::GetInstance()->ClearTrace();
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
}
PersistentCacheDBTest::PersistentCacheDBTest() : DBTestBase("/cache_test") {
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
rocksdb::SyncPoint::GetInstance()->SetCallBack("NewRandomAccessFile:O_DIRECT",
OnOpenForRead);
}
// test template
void PersistentCacheDBTest::RunTest(
const std::function<std::shared_ptr<PersistentCacheTier>(bool)>&
new_pcache) {
if (!Snappy_Supported()) {
return;
}
// number of insertion interations
int num_iter = 100 * 1024 * kStressFactor;
for (int iter = 0; iter < 5; iter++) {
Options options;
options.write_buffer_size =
64 * 1024 * kStressFactor; // small write buffer
options.statistics = rocksdb::CreateDBStatistics();
options = CurrentOptions(options);
// setup page cache
std::shared_ptr<PersistentCacheTier> pcache;
BlockBasedTableOptions table_options;
table_options.cache_index_and_filter_blocks = true;
const uint64_t uint64_max = std::numeric_limits<uint64_t>::max();
switch (iter) {
case 0:
// page cache, block cache, no-compressed cache
pcache = new_pcache(/*is_compressed=*/true);
table_options.persistent_cache = pcache;
table_options.block_cache = NewLRUCache(uint64_max);
table_options.block_cache_compressed = nullptr;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
break;
case 1:
// page cache, block cache, compressed cache
pcache = new_pcache(/*is_compressed=*/true);
table_options.persistent_cache = pcache;
table_options.block_cache = NewLRUCache(uint64_max);
table_options.block_cache_compressed = NewLRUCache(uint64_max);
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
break;
case 2:
// page cache, block cache, compressed cache + KNoCompression
// both block cache and compressed cache, but DB is not compressed
// also, make block cache sizes bigger, to trigger block cache hits
pcache = new_pcache(/*is_compressed=*/true);
table_options.persistent_cache = pcache;
table_options.block_cache = NewLRUCache(uint64_max);
table_options.block_cache_compressed = NewLRUCache(uint64_max);
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.compression = kNoCompression;
break;
case 3:
// page cache, no block cache, no compressed cache
pcache = new_pcache(/*is_compressed=*/false);
table_options.persistent_cache = pcache;
table_options.block_cache = nullptr;
table_options.block_cache_compressed = nullptr;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
break;
case 4:
// page cache, no block cache, no compressed cache
// Page cache caches compressed blocks
pcache = new_pcache(/*is_compressed=*/true);
table_options.persistent_cache = pcache;
table_options.block_cache = nullptr;
table_options.block_cache_compressed = nullptr;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
break;
default:
ASSERT_TRUE(false);
}
std::vector<std::string> values;
// insert data
Insert(options, table_options, num_iter, &values);
// flush all data in cache to device
pcache->TEST_Flush();
// verify data
Verify(num_iter, values);
auto block_miss = TestGetTickerCount(options, BLOCK_CACHE_MISS);
auto compressed_block_hit =
TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
auto compressed_block_miss =
TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
auto page_hit = TestGetTickerCount(options, PERSISTENT_CACHE_HIT);
auto page_miss = TestGetTickerCount(options, PERSISTENT_CACHE_MISS);
// check that we triggered the appropriate code paths in the cache
switch (iter) {
case 0:
// page cache, block cache, no-compressed cache
ASSERT_GT(page_miss, 0);
ASSERT_GT(page_hit, 0);
ASSERT_GT(block_miss, 0);
ASSERT_EQ(compressed_block_miss, 0);
ASSERT_EQ(compressed_block_hit, 0);
break;
case 1:
// page cache, block cache, compressed cache
ASSERT_GT(page_miss, 0);
ASSERT_GT(block_miss, 0);
ASSERT_GT(compressed_block_miss, 0);
break;
case 2:
// page cache, block cache, compressed cache + KNoCompression
ASSERT_GT(page_miss, 0);
ASSERT_GT(page_hit, 0);
ASSERT_GT(block_miss, 0);
ASSERT_GT(compressed_block_miss, 0);
// remember kNoCompression
ASSERT_EQ(compressed_block_hit, 0);
break;
case 3:
case 4:
// page cache, no block cache, no compressed cache
ASSERT_GT(page_miss, 0);
ASSERT_GT(page_hit, 0);
ASSERT_EQ(compressed_block_hit, 0);
ASSERT_EQ(compressed_block_miss, 0);
break;
default:
ASSERT_TRUE(false);
}
options.create_if_missing = true;
DestroyAndReopen(options);
pcache->Close();
}
}
// test table with volatile page cache // test table with volatile page cache
TEST_F(PersistentCacheDBTest, VolatileCacheTest) { TEST_F(PersistentCacheDBTest, VolatileCacheTest) {
RunTest(std::bind(&PersistentCacheDBTest::MakeVolatileCache, this)); RunTest(std::bind(&MakeVolatileCache, dbname_));
}
// test table with block page cache
TEST_F(PersistentCacheDBTest, BlockCacheTest) {
RunTest(std::bind(&MakeBlockCache, dbname_));
}
// test table with tiered page cache
TEST_F(PersistentCacheDBTest, TieredCacheTest) {
RunTest(std::bind(&MakeTieredCache, dbname_));
} }
#endif // !(defined(__clang__) && defined(OS_LINUX))
} // namespace rocksdb } // namespace rocksdb

@ -32,9 +32,7 @@ namespace rocksdb {
// //
class PersistentCacheTierTest : public testing::Test { class PersistentCacheTierTest : public testing::Test {
public: public:
explicit PersistentCacheTierTest() PersistentCacheTierTest();
: path_(test::TmpDir(Env::Default()) + "/cache_test") {}
virtual ~PersistentCacheTierTest() { virtual ~PersistentCacheTierTest() {
if (cache_) { if (cache_) {
Status s = cache_->Close(); Status s = cache_->Close();
@ -46,7 +44,7 @@ class PersistentCacheTierTest : public testing::Test {
// Flush cache // Flush cache
void Flush() { void Flush() {
if (cache_) { if (cache_) {
cache_->Flush(); cache_->TEST_Flush();
} }
} }
@ -208,27 +206,7 @@ class PersistentCacheTierTest : public testing::Test {
// //
class PersistentCacheDBTest : public DBTestBase { class PersistentCacheDBTest : public DBTestBase {
public: public:
PersistentCacheDBTest() : DBTestBase("/cache_test") { PersistentCacheDBTest();
rocksdb::SyncPoint::GetInstance()->EnableProcessing();
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"GetUniqueIdFromFile:FS_IOC_GETVERSION",
PersistentCacheDBTest::UniqueIdCallback);
}
static void UniqueIdCallback(void* arg) {
int* result = reinterpret_cast<int*>(arg);
if (*result == -1) {
*result = 0;
}
rocksdb::SyncPoint::GetInstance()->ClearTrace();
rocksdb::SyncPoint::GetInstance()->SetCallBack(
"GetUniqueIdFromFile:FS_IOC_GETVERSION", UniqueIdCallback);
}
std::shared_ptr<PersistentCacheTier> MakeVolatileCache() {
return std::make_shared<VolatileCacheTier>();
}
static uint64_t TestGetTickerCount(const Options& options, static uint64_t TestGetTickerCount(const Options& options,
Tickers ticker_type) { Tickers ticker_type) {
@ -281,135 +259,7 @@ class PersistentCacheDBTest : public DBTestBase {
// test template // test template
void RunTest(const std::function<std::shared_ptr<PersistentCacheTier>(bool)>& void RunTest(const std::function<std::shared_ptr<PersistentCacheTier>(bool)>&
new_pcache) { new_pcache);
if (!Snappy_Supported()) {
return;
}
// number of insertion interations
int num_iter = 100 * 1024;
for (int iter = 0; iter < 5; iter++) {
Options options;
options.write_buffer_size = 64 * 1024; // small write buffer
options.statistics = rocksdb::CreateDBStatistics();
options = CurrentOptions(options);
// setup page cache
std::shared_ptr<PersistentCacheTier> pcache;
BlockBasedTableOptions table_options;
table_options.cache_index_and_filter_blocks = true;
const uint64_t uint64_max = std::numeric_limits<uint64_t>::max();
switch (iter) {
case 0:
// page cache, block cache, no-compressed cache
pcache = new_pcache(/*is_compressed=*/true);
table_options.persistent_cache = pcache;
table_options.block_cache = NewLRUCache(uint64_max);
table_options.block_cache_compressed = nullptr;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
break;
case 1:
// page cache, block cache, compressed cache
pcache = new_pcache(/*is_compressed=*/true);
table_options.persistent_cache = pcache;
table_options.block_cache = NewLRUCache(uint64_max);
table_options.block_cache_compressed = NewLRUCache(uint64_max);
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
break;
case 2:
// page cache, block cache, compressed cache + KNoCompression
// both block cache and compressed cache, but DB is not compressed
// also, make block cache sizes bigger, to trigger block cache hits
pcache = new_pcache(/*is_compressed=*/true);
table_options.persistent_cache = pcache;
table_options.block_cache = NewLRUCache(uint64_max);
table_options.block_cache_compressed = NewLRUCache(uint64_max);
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
options.compression = kNoCompression;
break;
case 3:
// page cache, no block cache, no compressed cache
pcache = new_pcache(/*is_compressed=*/false);
table_options.persistent_cache = pcache;
table_options.block_cache = nullptr;
table_options.block_cache_compressed = nullptr;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
break;
case 4:
// page cache, no block cache, no compressed cache
// Page cache caches compressed blocks
pcache = new_pcache(/*is_compressed=*/true);
table_options.persistent_cache = pcache;
table_options.block_cache = nullptr;
table_options.block_cache_compressed = nullptr;
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
break;
default:
ASSERT_TRUE(false);
}
std::vector<std::string> values;
// insert data
Insert(options, table_options, num_iter, &values);
// flush all data in cache to device
pcache->Flush();
// verify data
Verify(num_iter, values);
auto block_miss = TestGetTickerCount(options, BLOCK_CACHE_MISS);
auto compressed_block_hit =
TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_HIT);
auto compressed_block_miss =
TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS);
auto page_hit = TestGetTickerCount(options, PERSISTENT_CACHE_HIT);
auto page_miss = TestGetTickerCount(options, PERSISTENT_CACHE_MISS);
// check that we triggered the appropriate code paths in the cache
switch (iter) {
case 0:
// page cache, block cache, no-compressed cache
ASSERT_GT(page_miss, 0);
ASSERT_GT(page_hit, 0);
ASSERT_GT(block_miss, 0);
ASSERT_EQ(compressed_block_miss, 0);
ASSERT_EQ(compressed_block_hit, 0);
break;
case 1:
// page cache, block cache, compressed cache
ASSERT_GT(page_miss, 0);
ASSERT_GT(block_miss, 0);
ASSERT_GT(compressed_block_miss, 0);
break;
case 2:
// page cache, block cache, compressed cache + KNoCompression
ASSERT_GT(page_miss, 0);
ASSERT_GT(page_hit, 0);
ASSERT_GT(block_miss, 0);
ASSERT_GT(compressed_block_miss, 0);
// remember kNoCompression
ASSERT_EQ(compressed_block_hit, 0);
break;
case 3:
case 4:
// page cache, no block cache, no compressed cache
ASSERT_GT(page_miss, 0);
ASSERT_GT(page_hit, 0);
ASSERT_EQ(compressed_block_hit, 0);
ASSERT_EQ(compressed_block_miss, 0);
break;
default:
ASSERT_TRUE(false);
}
options.create_if_missing = true;
DestroyAndReopen(options);
pcache->Close();
}
}
}; };
} // namespace rocksdb } // namespace rocksdb

@ -28,12 +28,6 @@ Status PersistentCacheTier::Close() {
return Status::OK(); return Status::OK();
} }
void PersistentCacheTier::Flush() {
if (next_tier_) {
next_tier_->Flush();
}
}
bool PersistentCacheTier::Reserve(const size_t size) { bool PersistentCacheTier::Reserve(const size_t size) {
// default implementation is a pass through // default implementation is a pass through
return true; return true;
@ -52,6 +46,13 @@ std::string PersistentCacheTier::PrintStats() {
return std::string(); return std::string();
} }
std::vector<PersistentCacheTier::TierStats> PersistentCacheTier::Stats() {
if (next_tier_) {
return next_tier_->Stats();
}
return std::vector<TierStats>{};
}
// //
// PersistentTieredCache implementation // PersistentTieredCache implementation
// //
@ -71,14 +72,14 @@ Status PersistentTieredCache::Close() {
return status; return status;
} }
void PersistentTieredCache::Flush() { bool PersistentTieredCache::Erase(const Slice& key) {
assert(!tiers_.empty()); assert(!tiers_.empty());
tiers_.front()->Flush(); return tiers_.front()->Erase(key);
} }
bool PersistentTieredCache::Erase(const Slice& key) { std::vector<PersistentCacheTier::TierStats> PersistentTieredCache::Stats() {
assert(!tiers_.empty()); assert(!tiers_.empty());
return tiers_.front()->Erase(key); return tiers_.front()->Stats();
} }
std::string PersistentTieredCache::PrintStats() { std::string PersistentTieredCache::PrintStats() {
@ -106,6 +107,11 @@ void PersistentTieredCache::AddTier(const Tier& tier) {
tiers_.push_back(tier); tiers_.push_back(tier);
} }
bool PersistentTieredCache::IsCompressed() {
assert(tiers_.size());
return tiers_.front()->IsCompressed();
}
} // namespace rocksdb } // namespace rocksdb
#endif #endif

@ -54,6 +54,173 @@
// null // null
namespace rocksdb { namespace rocksdb {
// Persistent Cache Config
//
// This struct captures all the options that are used to configure persistent
// cache. Some of the terminologies used in naming the options are
//
// dispatch size :
// This is the size in which IO is dispatched to the device
//
// write buffer size :
// This is the size of an individual write buffer size. Write buffers are
// grouped to form buffered file.
//
// cache size :
// This is the logical maximum for the cache size
//
// qdepth :
// This is the max number of IOs that can issues to the device in parallel
//
// pepeling :
// The writer code path follows pipelined architecture, which means the
// operations are handed off from one stage to another
//
// pipelining backlog size :
// With the pipelined architecture, there can always be backlogging of ops in
// pipeline queues. This is the maximum backlog size after which ops are dropped
// from queue
struct PersistentCacheConfig {
explicit PersistentCacheConfig(
Env* const _env, const std::string& _path, const uint64_t _cache_size,
const std::shared_ptr<Logger>& _log,
const uint32_t _write_buffer_size = 1 * 1024 * 1024 /*1MB*/) {
env = _env;
path = _path;
log = _log;
cache_size = _cache_size;
writer_dispatch_size = write_buffer_size = _write_buffer_size;
}
//
// Validate the settings. Our intentions are to catch erroneous settings ahead
// of time instead going violating invariants or causing dead locks.
//
Status ValidateSettings() const {
// (1) check pre-conditions for variables
if (!env || path.empty()) {
return Status::InvalidArgument("empty or null args");
}
// (2) assert size related invariants
// - cache size cannot be less than cache file size
// - individual write buffer size cannot be greater than cache file size
// - total write buffer size cannot be less than 2X cache file size
if (cache_size < cache_file_size || write_buffer_size >= cache_file_size ||
write_buffer_size * write_buffer_count() < 2 * cache_file_size) {
return Status::InvalidArgument("invalid cache size");
}
// (2) check writer settings
// - Queue depth cannot be 0
// - writer_dispatch_size cannot be greater than writer_buffer_size
// - dispatch size and buffer size need to be aligned
if (!writer_qdepth || writer_dispatch_size > write_buffer_size ||
write_buffer_size % writer_dispatch_size) {
return Status::InvalidArgument("invalid writer settings");
}
return Status::OK();
}
//
// Env abstraction to use for systmer level operations
//
Env* env;
//
// Path for the block cache where blocks are persisted
//
std::string path;
//
// Log handle for logging messages
//
std::shared_ptr<Logger> log;
//
// Enable direct IO for reading
//
bool enable_direct_reads = true;
//
// Enable direct IO for writing
//
bool enable_direct_writes = false;
//
// Logical cache size
//
uint64_t cache_size = std::numeric_limits<uint64_t>::max();
// cache-file-size
//
// Cache consists of multiples of small files. This parameter defines the
// size of an individual cache file
//
// default: 1M
uint32_t cache_file_size = 100ULL * 1024 * 1024;
// writer-qdepth
//
// The writers can issues IO to the devices in parallel. This parameter
// controls the max number if IOs that can issues in parallel to the block
// device
//
// default :1
uint32_t writer_qdepth = 1;
// pipeline-writes
//
// The write optionally follow pipelined architecture. This helps
// avoid regression in the eviction code path of the primary tier. This
// parameter defines if pipelining is enabled or disabled
//
// default: true
bool pipeline_writes_ = true;
// max-write-pipeline-backlog-size
//
// Max pipeline buffer size. This is the maximum backlog we can accumulate
// while waiting for writes. After the limit, new ops will be dropped.
//
// Default: 1GiB
uint64_t max_write_pipeline_backlog_size = 1ULL * 1024 * 1024 * 1024;
// write-buffer-size
//
// This is the size in which buffer slabs are allocated.
//
// Default: 1M
uint32_t write_buffer_size = 1ULL * 1024 * 1024;
// write-buffer-count
//
// This is the total number of buffer slabs. This is calculated as a factor of
// file size in order to avoid dead lock.
size_t write_buffer_count() const {
assert(write_buffer_size);
return (writer_qdepth + 1.2) * cache_file_size / write_buffer_size;
}
// writer-dispatch-size
//
// The writer thread will dispatch the IO at the specified IO size
//
// default: 1M
uint64_t writer_dispatch_size = 1ULL * 1024 * 1024;
// is_compressed
//
// This option determines if the cache will run in compressed mode or
// uncompressed mode
bool is_compressed = true;
PersistentCacheConfig MakePersistentCacheConfig(
const std::string& path, const uint64_t size,
const std::shared_ptr<Logger>& log);
};
// Persistent Cache Tier // Persistent Cache Tier
// //
// This a logical abstraction that defines a tier of the persistent cache. Tiers // This a logical abstraction that defines a tier of the persistent cache. Tiers
@ -73,9 +240,6 @@ class PersistentCacheTier : public PersistentCache {
// Close the persistent cache tier // Close the persistent cache tier
virtual Status Close(); virtual Status Close();
// Flush the pending writes
virtual void Flush();
// Reserve space up to 'size' bytes // Reserve space up to 'size' bytes
virtual bool Reserve(const size_t size); virtual bool Reserve(const size_t size);
@ -86,7 +250,7 @@ class PersistentCacheTier : public PersistentCache {
virtual std::string PrintStats(); virtual std::string PrintStats();
// Expose stats // Expose stats
virtual std::vector<TierStats> Stats() = 0; virtual std::vector<TierStats> Stats();
// Insert to page cache // Insert to page cache
virtual Status Insert(const Slice& page_key, const char* data, virtual Status Insert(const Slice& page_key, const char* data,
@ -96,6 +260,9 @@ class PersistentCacheTier : public PersistentCache {
virtual Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data, virtual Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
size_t* size) = 0; size_t* size) = 0;
// Does it store compressed data ?
virtual bool IsCompressed() = 0;
// Return a reference to next tier // Return a reference to next tier
virtual Tier& next_tier() { return next_tier_; } virtual Tier& next_tier() { return next_tier_; }
@ -105,6 +272,12 @@ class PersistentCacheTier : public PersistentCache {
next_tier_ = tier; next_tier_ = tier;
} }
virtual void TEST_Flush() {
if (next_tier_) {
next_tier_->TEST_Flush();
}
}
private: private:
Tier next_tier_; // next tier Tier next_tier_; // next tier
}; };
@ -120,13 +293,14 @@ class PersistentTieredCache : public PersistentCacheTier {
Status Open() override; Status Open() override;
Status Close() override; Status Close() override;
void Flush() override;
bool Erase(const Slice& key) override; bool Erase(const Slice& key) override;
std::string PrintStats() override; std::string PrintStats() override;
std::vector<TierStats> Stats() override;
Status Insert(const Slice& page_key, const char* data, Status Insert(const Slice& page_key, const char* data,
const size_t size) override; const size_t size) override;
Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data, Status Lookup(const Slice& page_key, std::unique_ptr<char[]>* data,
size_t* size) override; size_t* size) override;
bool IsCompressed() override;
void AddTier(const Tier& tier); void AddTier(const Tier& tier);
@ -140,6 +314,12 @@ class PersistentTieredCache : public PersistentCacheTier {
(*it)->set_next_tier(tier); (*it)->set_next_tier(tier);
} }
void TEST_Flush() override {
assert(!tiers_.empty());
tiers_.front()->TEST_Flush();
PersistentCacheTier::TEST_Flush();
}
protected: protected:
std::list<Tier> tiers_; // list of tiers top-down std::list<Tier> tiers_; // list of tiers top-down
}; };

Loading…
Cancel
Save