Summary: This patch is a continuation of part 5. It glues together the abstractions for file layout and metadata, fleshes out the implementation of the API, and adds unit tests for the implementation.

Test Plan: Run unit tests

Subscribers: andrewkr, dhruba, leveldb

Differential Revision: https://reviews.facebook.net/D57549
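For reference, a minimal sketch of driving the API implemented by this patch. The PersistentCacheConfig constructor arguments and the `log` variable are assumptions for illustration, not taken verbatim from this diff; error handling is elided.

    // Sketch only: config construction is assumed, not part of this patch.
    std::shared_ptr<Logger> log;  // assumed to be created elsewhere
    PersistentCacheConfig config(Env::Default(), "/tmp/block_cache",
                                 /*cache_size=*/1024 * 1024 * 1024, log);
    BlockCacheTier tier(config);

    Status s = tier.Open();                  // creates cache dir and first cache file
    assert(s.ok());

    const Slice key("block-key");
    const std::string block("raw block contents");
    s = tier.Insert(key, block.data(), block.size());

    std::unique_ptr<char[]> data;
    size_t size = 0;
    s = tier.Lookup(key, &data, &size);      // copies the block back out

    s = tier.Close();                        // stops insert thread and writer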
parent 64046e581c
commit c116b47804
@@ -0,0 +1,358 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#ifndef ROCKSDB_LITE

#include "utilities/persistent_cache/block_cache_tier.h"

#include <regex>
#include <utility>
#include <vector>

#include "util/stop_watch.h"
#include "utilities/persistent_cache/block_cache_tier_file.h"

namespace rocksdb {

//
// BlockCacheTier
//
Status BlockCacheTier::Open() {
  Status status;

  WriteLock _(&lock_);

  assert(!size_);

  // Check the validity of the options
  status = opt_.ValidateSettings();
  assert(status.ok());
  if (!status.ok()) {
    Error(opt_.log, "Invalid block cache options");
    return status;
  }

  // Create base directory or cleanup existing directory
  status = opt_.env->CreateDirIfMissing(opt_.path);
  if (!status.ok()) {
    Error(opt_.log, "Error creating directory %s. %s", opt_.path.c_str(),
          status.ToString().c_str());
    return status;
  }

  // Create base/<cache dir> directory
  status = opt_.env->CreateDir(GetCachePath());
  if (!status.ok()) {
    // directory already exists, clean it up
    status = CleanupCacheFolder(GetCachePath());
    assert(status.ok());
    if (!status.ok()) {
      Error(opt_.log, "Error cleaning up directory %s. %s", opt_.path.c_str(),
            status.ToString().c_str());
      return status;
    }
  }

  assert(!cache_file_);
  NewCacheFile();
  assert(cache_file_);

  if (opt_.pipeline_writes_) {
    assert(!insert_th_.joinable());
    insert_th_ = std::thread(&BlockCacheTier::InsertMain, this);
  }

  return Status::OK();
}

Status BlockCacheTier::CleanupCacheFolder(const std::string& folder) {
  std::vector<std::string> files;
  Status status = opt_.env->GetChildren(folder, &files);
  if (!status.ok()) {
    Error(opt_.log, "Error getting files for %s. %s", folder.c_str(),
          status.ToString().c_str());
    return status;
  }

  // cleanup files matching the pattern <digits>.rc, e.g. 123.rc
  for (auto file : files) {
    try {
      const std::regex cache_file_regex("[0-9]+\\.rc$");
      if (std::regex_match(file, cache_file_regex)) {
        // cache file
        Info(opt_.log, "Removing file %s.", file.c_str());
        status = opt_.env->DeleteFile(folder + "/" + file);
        if (!status.ok()) {
          Error(opt_.log, "Error deleting file %s. %s", file.c_str(),
                status.ToString().c_str());
          return Status::IOError("Error deleting file " + file);
        }
      } else {
        Info(opt_.log, "Skipping file %s.", file.c_str());
      }
    } catch (const std::regex_error& e) {
      // Since the std library is still evolving, certain older compiler
      // versions can throw here. It is safer to exit cleanly.
      return Status::IOError(e.what());
    }
  }
  return Status::OK();
}

Status BlockCacheTier::Close() {
  // stop the insert thread
  if (opt_.pipeline_writes_ && insert_th_.joinable()) {
    InsertOp op(/*quit=*/true);
    insert_ops_.Push(std::move(op));
    insert_th_.join();
  }

  // stop the writer before clearing the metadata
  writer_.Stop();

  // clear all metadata
  WriteLock _(&lock_);
  metadata_.Clear();
  return Status::OK();
}

std::string BlockCacheTier::PrintStats() {
  std::ostringstream os;
  os << "persistentcache.blockcachetier.bytes_pipelined: "
     << stats_.bytes_pipelined_.ToString() << std::endl
     << "persistentcache.blockcachetier.bytes_written: "
     << stats_.bytes_written_.ToString() << std::endl
     << "persistentcache.blockcachetier.bytes_read: "
     << stats_.bytes_read_.ToString() << std::endl
     << "persistentcache.blockcachetier.insert_dropped: "
     << stats_.insert_dropped_ << std::endl
     << "persistentcache.blockcachetier.cache_hits: " << stats_.cache_hits_
     << std::endl
     << "persistentcache.blockcachetier.cache_misses: " << stats_.cache_misses_
     << std::endl
     << "persistentcache.blockcachetier.cache_errors: " << stats_.cache_errors_
     << std::endl
     << "persistentcache.blockcachetier.cache_hits_pct: "
     << stats_.CacheHitPct() << std::endl
     << "persistentcache.blockcachetier.cache_misses_pct: "
     << stats_.CacheMissPct() << std::endl
     << "persistentcache.blockcachetier.read_hit_latency: "
     << stats_.read_hit_latency_.ToString() << std::endl
     << "persistentcache.blockcachetier.read_miss_latency: "
     << stats_.read_miss_latency_.ToString() << std::endl
     << "persistentcache.blockcachetier.write_latency: "
     << stats_.write_latency_.ToString() << std::endl
     << PersistentCacheTier::PrintStats();
  return os.str();
}

Status BlockCacheTier::Insert(const Slice& key, const char* data,
                              const size_t size) {
  // update stats
  stats_.bytes_pipelined_.Add(size);

  if (opt_.pipeline_writes_) {
    // offload the write to the insert thread
    insert_ops_.Push(InsertOp(key.ToString(), std::string(data, size)));
    return Status::OK();
  }

  assert(!opt_.pipeline_writes_);
  return InsertImpl(key, Slice(data, size));
}

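// In pipelined mode, Insert() only enqueues the key/value into insert_ops_;
// InsertMain() below runs on insert_th_, drains the queue, and retries
// InsertImpl() up to kMaxRetry times while the write buffers are full.
// An insert that still cannot be serviced is dropped and counted in
// stats_.insert_dropped_.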
void BlockCacheTier::InsertMain() {
  while (true) {
    InsertOp op(insert_ops_.Pop());

    if (op.signal_) {
      // this op is the signal from Close() to exit
      break;
    }

    size_t retry = 0;
    Status s;
    while ((s = InsertImpl(Slice(op.key_), Slice(op.data_))).IsTryAgain()) {
      if (retry > kMaxRetry) {
        break;
      }

      // This can happen when all write buffers are in use; wait until some
      // are freed. We wait here rather than inside InsertImpl() because the
      // same insert path has to support both pipelined and non-pipelined
      // modes.
      buffer_allocator_.WaitUntilUsable();
      retry++;
    }

    if (!s.ok()) {
      stats_.insert_dropped_++;
    }
  }
}

Status BlockCacheTier::InsertImpl(const Slice& key, const Slice& data) {
  // pre-condition
  assert(key.size());
  assert(data.size());
  assert(cache_file_);

  StopWatchNano timer(opt_.env);

  WriteLock _(&lock_);

  LBA lba;
  if (metadata_.Lookup(key, &lba)) {
    // the key already exists, this is a duplicate insert
    return Status::OK();
  }

  while (!cache_file_->Append(key, data, &lba)) {
    if (!cache_file_->Eof()) {
      Debug(opt_.log, "Error inserting to cache file %d",
            cache_file_->cacheid());
      stats_.write_latency_.Add(timer.ElapsedNanos() / 1000);
      return Status::TryAgain();
    }

    assert(cache_file_->Eof());
    NewCacheFile();
  }

  // Insert into lookup index
  BlockInfo* info = metadata_.Insert(key, lba);
  assert(info);
  if (!info) {
    return Status::IOError("Unexpected error inserting to index");
  }

  // insert to cache file reverse mapping
  cache_file_->Add(info);

  // update stats
  stats_.bytes_written_.Add(data.size());
  stats_.write_latency_.Add(timer.ElapsedNanos() / 1000);
  return Status::OK();
}

Status BlockCacheTier::Lookup(const Slice& key, unique_ptr<char[]>* val,
                              size_t* size) {
  StopWatchNano timer(opt_.env);

  LBA lba;
  bool status;
  status = metadata_.Lookup(key, &lba);
  if (!status) {
    stats_.cache_misses_++;
    stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
    return Status::NotFound("blockcache: key not found");
  }

  BlockCacheFile* const file = metadata_.Lookup(lba.cache_id_);
  if (!file) {
    // this can happen because the block index and cache file index are
    // different, and the cache file might be removed between the two lookups
    stats_.cache_misses_++;
    stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
    return Status::NotFound("blockcache: cache file not found");
  }

  assert(file->refs_);

  unique_ptr<char[]> scratch(new char[lba.size_]);
  Slice blk_key;
  Slice blk_val;

  status = file->Read(lba, &blk_key, &blk_val, scratch.get());
  --file->refs_;
  assert(status);
  if (!status) {
    stats_.cache_misses_++;
    stats_.cache_errors_++;
    stats_.read_miss_latency_.Add(timer.ElapsedNanos() / 1000);
    return Status::NotFound("blockcache: error reading data");
  }

  assert(blk_key == key);

  val->reset(new char[blk_val.size()]);
  memcpy(val->get(), blk_val.data(), blk_val.size());
  *size = blk_val.size();

  stats_.bytes_read_.Add(*size);
  stats_.cache_hits_++;
  stats_.read_hit_latency_.Add(timer.ElapsedNanos() / 1000);

  return Status::OK();
}

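// Note that Lookup() does not take lock_ (presumably BlockCacheTierMetadata
// synchronizes its own lookups); the cache file is kept alive for the
// duration of the read via its reference count (file->refs_), which is
// released after file->Read().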
bool BlockCacheTier::Erase(const Slice& key) {
  WriteLock _(&lock_);
  BlockInfo* info = metadata_.Remove(key);
  assert(info);
  delete info;
  return true;
}

void BlockCacheTier::NewCacheFile() {
  lock_.AssertHeld();

  writer_cache_id_++;

  Info(opt_.log, "Creating cache file %d", writer_cache_id_);

  cache_file_ = new WriteableCacheFile(opt_.env, &buffer_allocator_, &writer_,
                                       GetCachePath(), writer_cache_id_,
                                       opt_.cache_file_size, opt_.log);
  bool status;
  status =
      cache_file_->Create(opt_.enable_direct_writes, opt_.enable_direct_reads);
  assert(status);

  // insert to cache files tree
  status = metadata_.Insert(cache_file_);
  (void)status;
  assert(status);
}

bool BlockCacheTier::Reserve(const size_t size) {
  WriteLock _(&lock_);
  assert(size_ <= opt_.cache_size);

  if (size + size_ <= opt_.cache_size) {
    // there is enough space to write
    size_ += size;
    return true;
  }

  assert(size + size_ > opt_.cache_size);
  // there is not enough space to fit the requested data; clear some space by
  // evicting cold data
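  // With kEvictPct = 10, retain_fac is 0.9: whole cache files are evicted
  // until the requested size plus current usage fits within 90% of capacity,
  // leaving some headroom for subsequent writes.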
  const double retain_fac = (100 - kEvictPct) / static_cast<double>(100);
  while (size + size_ > opt_.cache_size * retain_fac) {
    unique_ptr<BlockCacheFile> f(metadata_.Evict());
    if (!f) {
      // nothing is evictable
      return false;
    }
    assert(!f->refs_);
    size_t file_size;
    if (!f->Delete(&file_size).ok()) {
      // unable to delete file
      return false;
    }

    assert(file_size <= size_);
    size_ -= file_size;
  }

  size_ += size;
  assert(size_ <= opt_.cache_size * 0.9);
  return true;
}

}  // namespace rocksdb

#endif  // ifndef ROCKSDB_LITE
@@ -0,0 +1,145 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#pragma once

#include <unistd.h>
#include <list>
#include <memory>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/persistent_cache.h"

#include "utilities/persistent_cache/block_cache_tier_file.h"
#include "utilities/persistent_cache/block_cache_tier_metadata.h"
#include "utilities/persistent_cache/persistent_cache_util.h"

#include "db/skiplist.h"
#include "port/port_posix.h"
#include "util/arena.h"
#include "util/coding.h"
#include "util/crc32c.h"
#include "util/histogram.h"
#include "util/mutexlock.h"

namespace rocksdb {

//
// Block cache tier implementation
//
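// BlockCacheTier implements a block-oriented persistent cache tier. Blocks
// are appended to the current WriteableCacheFile (rotated via NewCacheFile()
// when full), an in-memory index (BlockCacheTierMetadata) maps keys to their
// logical block address (LBA), and whole files are evicted when Reserve()
// runs out of space. Writes can optionally be pipelined through a bounded
// queue serviced by a dedicated insert thread.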
class BlockCacheTier : public PersistentCacheTier {
 public:
  explicit BlockCacheTier(const PersistentCacheConfig& opt)
      : opt_(opt),
        insert_ops_(opt_.max_write_pipeline_backlog_size),
        buffer_allocator_(opt.write_buffer_size, opt.write_buffer_count()),
        writer_(this, opt_.writer_qdepth, opt_.writer_dispatch_size) {
    Info(opt_.log, "Initializing allocator. size=%d B count=%d",
         opt_.write_buffer_size, opt_.write_buffer_count());
  }

  virtual ~BlockCacheTier() {
    // By contract, the user should have called Close() before destroying the
    // object
    assert(!insert_th_.joinable());
  }

  Status Insert(const Slice& key, const char* data, const size_t size) override;
  Status Lookup(const Slice& key, std::unique_ptr<char[]>* data,
                size_t* size) override;
  Status Open() override;
  Status Close() override;
  bool Erase(const Slice& key) override;
  bool Reserve(const size_t size) override;

  bool IsCompressed() override { return opt_.is_compressed; }

  std::string PrintStats() override;

  void TEST_Flush() override {
    while (insert_ops_.Size()) {
      /* sleep override */ sleep(1);
    }
  }

 private:
  // Percentage of cache to be evicted when the cache is full
  static const size_t kEvictPct = 10;
  // Max attempts to insert a key/value to the cache in pipelined mode
  static const size_t kMaxRetry = 3;

  // Pipelined operation
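  // An InsertOp either carries a key/value pair to be written, or acts as a
  // quit signal (signal_ == true) telling InsertMain() to exit; Close()
  // pushes such a signal op to stop the insert thread.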
  struct InsertOp {
    explicit InsertOp(const bool signal) : signal_(signal) {}
    explicit InsertOp(std::string&& key, std::string&& data)
        : key_(std::move(key)), data_(std::move(data)) {}
    ~InsertOp() {}

    InsertOp() = delete;
    InsertOp(InsertOp&& rhs) = default;
    InsertOp& operator=(InsertOp&& rhs) = default;

    // used by the bounded queue to estimate the op's size
    size_t Size() { return data_.size() + key_.size(); }

    std::string key_;
    std::string data_;
    const bool signal_ = false;  // signal to request the processing thread to exit
  };

  // entry point for the insert thread
  void InsertMain();
  // insert implementation
  Status InsertImpl(const Slice& key, const Slice& data);
  // Create a new cache file
  void NewCacheFile();
  // Get cache directory path
  std::string GetCachePath() const { return opt_.path + "/cache"; }
  // Cleanup folder
  Status CleanupCacheFolder(const std::string& folder);

  // Statistics
  struct Stats {
    HistogramImpl bytes_pipelined_;
    HistogramImpl bytes_written_;
    HistogramImpl bytes_read_;
    HistogramImpl read_hit_latency_;
    HistogramImpl read_miss_latency_;
    HistogramImpl write_latency_;
    uint64_t cache_hits_ = 0;
    uint64_t cache_misses_ = 0;
    uint64_t cache_errors_ = 0;
    uint64_t insert_dropped_ = 0;

    double CacheHitPct() const {
      const auto lookups = cache_hits_ + cache_misses_;
      return lookups ? 100 * cache_hits_ / static_cast<double>(lookups) : 0.0;
    }

    double CacheMissPct() const {
      const auto lookups = cache_hits_ + cache_misses_;
      return lookups ? 100 * cache_misses_ / static_cast<double>(lookups) : 0.0;
    }
  };

  port::RWMutex lock_;                          // Synchronization
  const PersistentCacheConfig opt_;             // BlockCache options
  BoundedQueue<InsertOp> insert_ops_;           // Ops waiting for insert
  std::thread insert_th_;                       // Insert thread
  uint32_t writer_cache_id_ = 0;                // Current cache file identifier
  WriteableCacheFile* cache_file_ = nullptr;    // Current cache file reference
  CacheWriteBufferAllocator buffer_allocator_;  // Buffer provider
  ThreadedWriter writer_;                       // Writer threads
  BlockCacheTierMetadata metadata_;             // Cache metadata manager
  std::atomic<uint64_t> size_{0};               // Size of the cache
  Stats stats_;                                 // Statistics
};

}  // namespace rocksdb