From d85ff4953c29faed7b278268fe73e6b57787a8f4 Mon Sep 17 00:00:00 2001 From: Anirban Rahut Date: Wed, 10 May 2017 14:54:35 -0700 Subject: [PATCH] Blob storage pr Summary: The final pull request for Blob Storage. Closes https://github.com/facebook/rocksdb/pull/2269 Differential Revision: D5033189 Pulled By: yiwu-arbug fbshipit-source-id: 6356b683ccd58cbf38a1dc55e2ea400feecd5d06 --- CMakeLists.txt | 5 + Makefile | 5 +- TARGETS | 6 + db/compaction_iterator.cc | 4 +- db/db_impl.cc | 22 + db/db_impl.h | 5 + env/env_posix.cc | 21 +- include/rocksdb/env.h | 2 - src.mk | 7 + tools/db_bench_tool.cc | 32 +- util/file_reader_writer.h | 2 + util/mpsc.h | 158 ++ util/timer_queue.h | 217 ++ util/timer_queue_test.cc | 72 + utilities/blob_db/blob_db.cc | 307 +-- utilities/blob_db/blob_db.h | 183 +- utilities/blob_db/blob_db_impl.cc | 2210 +++++++++++++++++ utilities/blob_db/blob_db_impl.h | 657 +++++ utilities/blob_db/blob_db_options_impl.cc | 66 + utilities/blob_db/blob_db_options_impl.h | 73 + utilities/blob_db/blob_db_test.cc | 547 +++- utilities/blob_db/blob_file.cc | 225 ++ utilities/blob_db/blob_log_format.cc | 313 +++ utilities/blob_db/blob_log_format.h | 226 ++ utilities/blob_db/blob_log_reader.cc | 163 ++ utilities/blob_db/blob_log_reader.h | 93 + utilities/blob_db/blob_log_writer.cc | 172 ++ utilities/blob_db/blob_log_writer.h | 98 + .../optimistic_transaction_db_impl.h | 11 +- 29 files changed, 5687 insertions(+), 215 deletions(-) create mode 100644 util/mpsc.h create mode 100644 util/timer_queue.h create mode 100644 util/timer_queue_test.cc create mode 100644 utilities/blob_db/blob_db_impl.cc create mode 100644 utilities/blob_db/blob_db_impl.h create mode 100644 utilities/blob_db/blob_db_options_impl.cc create mode 100644 utilities/blob_db/blob_db_options_impl.h create mode 100644 utilities/blob_db/blob_file.cc create mode 100644 utilities/blob_db/blob_log_format.cc create mode 100644 utilities/blob_db/blob_log_format.h create mode 100644 utilities/blob_db/blob_log_reader.cc create mode 100644 utilities/blob_db/blob_log_reader.h create mode 100644 utilities/blob_db/blob_log_writer.cc create mode 100644 utilities/blob_db/blob_log_writer.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c2cbd19ae..057f6ed7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -446,6 +446,10 @@ set(SOURCES util/xxhash.cc utilities/backupable/backupable_db.cc utilities/blob_db/blob_db.cc + utilities/blob_db/blob_db_impl.cc + utilities/blob_db/blob_log_reader.cc + utilities/blob_db/blob_log_writer.cc + utilities/blob_db/blob_log_format.cc utilities/checkpoint/checkpoint_impl.cc utilities/col_buf_decoder.cc utilities/col_buf_encoder.cc @@ -658,6 +662,7 @@ set(TESTS util/heap_test.cc util/rate_limiter_test.cc util/slice_transform_test.cc + util/timer_queue_test.cc util/thread_list_test.cc util/thread_local_test.cc utilities/backupable/backupable_db_test.cc diff --git a/Makefile b/Makefile index 8d60f90e3..c68ac141f 100644 --- a/Makefile +++ b/Makefile @@ -403,7 +403,6 @@ TESTS = \ ttl_test \ date_tiered_test \ backupable_db_test \ - blob_db_test \ document_db_test \ json_document_test \ sim_cache_test \ @@ -424,6 +423,7 @@ TESTS = \ options_settable_test \ options_util_test \ event_logger_test \ + timer_queue_test \ cuckoo_table_builder_test \ cuckoo_table_reader_test \ cuckoo_table_db_test \ @@ -1307,6 +1307,9 @@ db_bench_tool_test: tools/db_bench_tool_test.o $(BENCHTOOLOBJECTS) $(TESTHARNESS event_logger_test: util/event_logger_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) +timer_queue_test: 
util/timer_queue_test.o $(LIBOBJECTS) $(TESTHARNESS) + $(AM_LINK) + sst_dump_test: tools/sst_dump_test.o $(LIBOBJECTS) $(TESTHARNESS) $(AM_LINK) diff --git a/TARGETS b/TARGETS index ce532d7d8..8ddeec34f 100644 --- a/TARGETS +++ b/TARGETS @@ -196,6 +196,12 @@ cpp_library( "util/xxhash.cc", "utilities/backupable/backupable_db.cc", "utilities/blob_db/blob_db.cc", + "utilities/blob_db/blob_db_impl.cc", + "utilities/blob_db/blob_db_options_impl.cc", + "utilities/blob_db/blob_file.cc", + "utilities/blob_db/blob_log_reader.cc", + "utilities/blob_db/blob_log_writer.cc", + "utilities/blob_db/blob_log_format.cc", "utilities/checkpoint/checkpoint_impl.cc", "utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc", "utilities/convenience/info_log_finder.cc", diff --git a/db/compaction_iterator.cc b/db/compaction_iterator.cc index e742813cf..4332acb5a 100644 --- a/db/compaction_iterator.cc +++ b/db/compaction_iterator.cc @@ -93,7 +93,9 @@ CompactionIterator::CompactionIterator( latest_snapshot_ = snapshots_->back(); } if (compaction_filter_ != nullptr) { - if (compaction_filter_->IgnoreSnapshots()) ignore_snapshots_ = true; + if (compaction_filter_->IgnoreSnapshots()) { + ignore_snapshots_ = true; + } } else { ignore_snapshots_ = false; } diff --git a/db/db_impl.cc b/db/db_impl.cc index 3ceb3bbcd..348b3b40d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1577,6 +1577,14 @@ void DBImpl::ReleaseSnapshot(const Snapshot* s) { delete casted_s; } +bool DBImpl::HasActiveSnapshotLaterThanSN(SequenceNumber sn) { + InstrumentedMutexLock l(&mutex_); + if (snapshots_.empty()) { + return false; + } + return (snapshots_.newest()->GetSequenceNumber() > sn); +} + #ifndef ROCKSDB_LITE Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family, TablePropertiesCollection* props) { @@ -1821,6 +1829,20 @@ ColumnFamilyHandle* DBImpl::GetColumnFamilyHandle(uint32_t column_family_id) { return cf_memtables->GetColumnFamilyHandle(); } +// REQUIRED: mutex is NOT held. +ColumnFamilyHandle* DBImpl::GetColumnFamilyHandleUnlocked( + uint32_t column_family_id) { + ColumnFamilyMemTables* cf_memtables = column_family_memtables_.get(); + + InstrumentedMutexLock l(&mutex_); + + if (!cf_memtables->Seek(column_family_id)) { + return nullptr; + } + + return cf_memtables->GetColumnFamilyHandle(); +} + void DBImpl::GetApproximateMemTableStats(ColumnFamilyHandle* column_family, const Range& range, uint64_t* const count, diff --git a/db/db_impl.h b/db/db_impl.h index 20c3c0ae6..689ca575f 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -203,6 +203,8 @@ class DBImpl : public DB { virtual SequenceNumber GetLatestSequenceNumber() const override; + bool HasActiveSnapshotLaterThanSN(SequenceNumber sn); + #ifndef ROCKSDB_LITE using DB::ResetStats; virtual Status ResetStats() override; @@ -465,6 +467,9 @@ class DBImpl : public DB { // mutex is released. ColumnFamilyHandle* GetColumnFamilyHandle(uint32_t column_family_id); + // Same as above, should called without mutex held and not on write thread. + ColumnFamilyHandle* GetColumnFamilyHandleUnlocked(uint32_t column_family_id); + // Returns the number of currently running flushes. // REQUIREMENT: mutex_ must be held when calling this function. 
int num_running_flushes() { diff --git a/env/env_posix.cc b/env/env_posix.cc index b0befba8c..7d726176a 100644 --- a/env/env_posix.cc +++ b/env/env_posix.cc @@ -253,13 +253,14 @@ class PosixEnv : public Env { return s; } - virtual Status NewWritableFile(const std::string& fname, - unique_ptr* result, - const EnvOptions& options) override { + virtual Status OpenWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options, + bool reopen = false) { result->reset(); Status s; int fd = -1; - int flags = O_CREAT | O_TRUNC; + int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC); // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX) if (options.use_direct_writes && !options.use_mmap_writes) { // Note: we should avoid O_APPEND here due to ta the following bug: @@ -333,6 +334,18 @@ class PosixEnv : public Env { return s; } + virtual Status NewWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) override { + return OpenWritableFile(fname, result, options, false); + } + + virtual Status ReopenWritableFile(const std::string& fname, + unique_ptr* result, + const EnvOptions& options) override { + return OpenWritableFile(fname, result, options, true); + } + virtual Status ReuseWritableFile(const std::string& fname, const std::string& old_fname, unique_ptr* result, diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 9af0261fa..ad59dd1a0 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -468,8 +468,6 @@ class SequentialFile { // aligned buffer for Direct I/O virtual size_t GetRequiredBufferAlignment() const { return kDefaultPageSize; } - virtual void Rewind() {} - // Remove any kind of caching of data from the offset to offset+length // of this file. If the length is 0, then it refers to the end of file. // If the system is not caching the file contents, then this is a noop. diff --git a/src.mk b/src.mk index 4a570d1bd..677b18b18 100644 --- a/src.mk +++ b/src.mk @@ -150,6 +150,12 @@ LIB_SOURCES = \ util/xxhash.cc \ utilities/backupable/backupable_db.cc \ utilities/blob_db/blob_db.cc \ + utilities/blob_db/blob_db_impl.cc \ + utilities/blob_db/blob_db_options_impl.cc \ + utilities/blob_db/blob_file.cc \ + utilities/blob_db/blob_log_reader.cc \ + utilities/blob_db/blob_log_writer.cc \ + utilities/blob_db/blob_log_format.cc \ utilities/checkpoint/checkpoint_impl.cc \ utilities/compaction_filters/remove_emptyvalue_compactionfilter.cc \ utilities/convenience/info_log_finder.cc \ @@ -308,6 +314,7 @@ MAIN_SOURCES = \ util/log_write_bench.cc \ util/rate_limiter_test.cc \ util/slice_transform_test.cc \ + util/timer_queue_test.cc \ util/thread_list_test.cc \ util/thread_local_test.cc \ utilities/backupable/backupable_db_test.cc \ diff --git a/tools/db_bench_tool.cc b/tools/db_bench_tool.cc index 029747271..195c1c660 100644 --- a/tools/db_bench_tool.cc +++ b/tools/db_bench_tool.cc @@ -583,6 +583,10 @@ DEFINE_bool(optimistic_transaction_db, false, "Open a OptimisticTransactionDB instance. " "Required for randomtransaction benchmark."); +DEFINE_bool(use_blob_db, false, + "Open a BlobDB instance. " + "Required for largevalue benchmark."); + DEFINE_bool(transaction_db, false, "Open a TransactionDB instance. " "Required for randomtransaction benchmark."); @@ -630,8 +634,6 @@ DEFINE_bool(report_bg_io_stats, false, DEFINE_bool(use_stderr_info_logger, false, "Write info logs to stderr instead of to LOG file. "); -DEFINE_bool(use_blob_db, false, "Whether to use BlobDB. 
"); - static enum rocksdb::CompressionType StringToCompressionType(const char* ctype) { assert(ctype); @@ -1128,6 +1130,15 @@ class RandomGenerator { pos_ += len; return Slice(data_.data() + pos_ - len, len); } + + Slice GenerateWithTTL(unsigned int len) { + assert(len <= data_.size()); + if (pos_ + len > data_.size()) { + pos_ = 0; + } + pos_ += len; + return Slice(data_.data() + pos_ - len, len); + } }; static void AppendWithSpace(std::string* str, Slice msg) { @@ -3227,9 +3238,14 @@ void VerifyDBFromDB(std::string& truth_db_name) { if (s.ok()) { db->db = ptr; } -#endif // ROCKSDB_LITE } else if (FLAGS_use_blob_db) { - s = NewBlobDB(options, db_name, &db->db); + blob_db::BlobDBOptions blob_db_options; + blob_db::BlobDB* ptr; + s = blob_db::BlobDB::Open(options, blob_db_options, db_name, &ptr); + if (s.ok()) { + db->db = ptr; + } +#endif // ROCKSDB_LITE } else { s = DB::Open(options, db_name, &db->db); } @@ -3406,8 +3422,12 @@ void VerifyDBFromDB(std::string& truth_db_name) { int64_t rand_num = key_gens[id]->Next(); GenerateKeyFromInt(rand_num, FLAGS_num, &key); if (FLAGS_use_blob_db) { - s = db_with_cfh->db->Put(write_options_, key, - gen.Generate(value_size_)); + Slice val = gen.Generate(value_size_); + int ttl = rand() % 86400; + blob_db::BlobDB* blobdb = + static_cast(db_with_cfh->db); + s = blobdb->PutWithTTL(write_options_, key, val, ttl); + } else if (FLAGS_num_column_families <= 1) { batch.Put(key, gen.Generate(value_size_)); } else { diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h index 8204b4303..8de41a203 100644 --- a/util/file_reader_writer.h +++ b/util/file_reader_writer.h @@ -48,6 +48,8 @@ class SequentialFileReader { Status Skip(uint64_t n); + void Rewind(); + SequentialFile* file() { return file_.get(); } bool use_direct_io() const { return file_->use_direct_io(); } diff --git a/util/mpsc.h b/util/mpsc.h new file mode 100644 index 000000000..b81492738 --- /dev/null +++ b/util/mpsc.h @@ -0,0 +1,158 @@ +// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Large parts of this file is borrowed from the public domain code below. +// from https://github.com/mstump/queues + +// C++ implementation of Dmitry Vyukov's non-intrusive +// lock free unbound MPSC queue +// http://www.1024cores.net/home/ +// lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue + +// License from mstump/queues +// This is free and unencumbered software released into the public domain. +// +// Anyone is free to copy, modify, publish, use, compile, sell, or +// distribute this software, either in source code form or as a compiled +// binary, for any purpose, commercial or non-commercial, and by any +// means. +// +// In jurisdictions that recognize copyright laws, the author or authors +// of this software dedicate any and all copyright interest in the +// software to the public domain. We make this dedication for the benefit +// of the public at large and to the detriment of our heirs and +// successors. We intend this dedication to be an overt act of +// relinquishment in perpetuity of all present and future rights to this +// software under copyright law. 
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+// IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+// OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+// OTHER DEALINGS IN THE SOFTWARE.
+//
+// For more information, please refer to <http://unlicense.org>
+
+// License from http://www.1024cores.net/home/
+// lock-free-algorithms/queues/non-intrusive-mpsc-node-based-queue
+// Copyright (c) 2010-2011 Dmitry Vyukov. All rights reserved.
+// Redistribution and use in source and binary forms, with or
+// without modification, are permitted provided that the following
+// conditions are met:
+// 1. Redistributions of source code must retain the above copyright notice,
+// this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY DMITRY VYUKOV "AS IS" AND ANY EXPRESS OR
+// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+// IN NO EVENT SHALL DMITRY VYUKOV OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+//
+// The views and conclusions contained in the software and documentation
+// are those of the authors and should not be interpreted as representing
+// official policies, either expressed or implied, of Dmitry Vyukov.
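+//
+// Usage sketch (illustrative; `q` and `v` are placeholder names): any
+// number of threads may call enqueue() concurrently, but only one thread
+// may call dequeue() or pop_all().
+//
+//   mpsc_queue_t<int> q;
+//   q.enqueue(42);           // producers: any thread
+//   int v;
+//   while (q.dequeue(&v)) {  // consumer: one thread only
+//   }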
+//
+
+#ifndef UTIL_MPSC_H_
+#define UTIL_MPSC_H_
+
+#include <atomic>
+#include <cassert>
+#include <type_traits>
+
+/**
+ * Multiple Producer Single Consumer Lockless Q
+ */
+template <typename T>
+class mpsc_queue_t {
+ public:
+  struct buffer_node_t {
+    T data;
+    std::atomic<buffer_node_t*> next;
+  };
+
+  mpsc_queue_t() {
+    buffer_node_aligned_t* al_st = new buffer_node_aligned_t;
+    buffer_node_t* node = new (al_st) buffer_node_t();
+    _head.store(node);
+    _tail.store(node);
+
+    node->next.store(nullptr, std::memory_order_relaxed);
+  }
+
+  ~mpsc_queue_t() {
+    T output;
+    while (this->dequeue(&output)) {
+    }
+    buffer_node_t* front = _head.load(std::memory_order_relaxed);
+    front->~buffer_node_t();
+
+    ::operator delete(front);
+  }
+
+  void enqueue(const T& input) {
+    buffer_node_aligned_t* al_st = new buffer_node_aligned_t;
+    buffer_node_t* node = new (al_st) buffer_node_t();
+
+    node->data = input;
+    node->next.store(nullptr, std::memory_order_relaxed);
+
+    buffer_node_t* prev_head = _head.exchange(node, std::memory_order_acq_rel);
+    prev_head->next.store(node, std::memory_order_release);
+  }
+
+  bool dequeue(T* output) {
+    buffer_node_t* tail = _tail.load(std::memory_order_relaxed);
+    buffer_node_t* next = tail->next.load(std::memory_order_acquire);
+
+    if (next == nullptr) {
+      return false;
+    }
+
+    *output = next->data;
+    _tail.store(next, std::memory_order_release);
+
+    tail->~buffer_node_t();
+
+    ::operator delete(tail);
+    return true;
+  }
+
+  // you can only use pop_all if the queue is SPSC
+  buffer_node_t* pop_all() {
+    // nobody else can move the tail pointer.
+    buffer_node_t* tptr = _tail.load(std::memory_order_relaxed);
+    buffer_node_t* next =
+        tptr->next.exchange(nullptr, std::memory_order_acquire);
+    _head.exchange(tptr, std::memory_order_acquire);
+
+    // there is a race condition here
+    return next;
+  }
+
+ private:
+  typedef typename std::aligned_storage<
+      sizeof(buffer_node_t), std::alignment_of<buffer_node_t>::value>::type
+      buffer_node_aligned_t;
+
+  std::atomic<buffer_node_t*> _head;
+  std::atomic<buffer_node_t*> _tail;
+
+  mpsc_queue_t(const mpsc_queue_t&) = delete;
+  mpsc_queue_t& operator=(const mpsc_queue_t&) = delete;
+};
+
+#endif  // UTIL_MPSC_H_
diff --git a/util/timer_queue.h b/util/timer_queue.h
new file mode 100644
index 000000000..72b44dc2d
--- /dev/null
+++ b/util/timer_queue.h
@@ -0,0 +1,217 @@
+// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+// Timer Queue
+//
+// License
+//
+// The source code in this article is licensed under the CC0 license, so feel
+// free to copy, modify, share, do whatever you want with it.
+// No attribution is required, but I'll be happy if you do.
+// CC0 license
+
+// The person who associated a work with this deed has dedicated the work to
+// the public domain by waiving all of his or her rights to the work worldwide
+// under copyright law, including all related and neighboring rights, to the
+// extent allowed by law. You can copy, modify, distribute and perform the
+// work, even for commercial purposes, all without asking permission.
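+//
+// Usage sketch (illustrative; `tq` is a placeholder name). A handler
+// returns {reschedule, next_period_ms}; returning -1 as the period keeps
+// the previous one.
+//
+//   TimerQueue tq;
+//   tq.add(1000, [](bool aborted) {
+//     // runs once per second until cancelled or the queue is destroyed
+//     return std::make_pair(!aborted, int64_t(1000));
+//   });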
+
+#pragma once
+#include <assert.h>
+#include <chrono>
+#include <condition_variable>
+#include <functional>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <utility>
+
+// Allows execution of handlers at a specified time in the future
+// Guarantees:
+//  - All handlers are executed ONCE, even if cancelled (aborted parameter
+//    will be set to true)
+//  - If TimerQueue is destroyed, it will cancel all handlers.
+//  - Handlers are ALWAYS executed in the Timer Queue worker thread.
+//  - Handlers execution order is NOT guaranteed
+//
+////////////////////////////////////////////////////////////////////////////////
+// borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+class TimerQueue {
+ public:
+  TimerQueue() : m_th(&TimerQueue::run, this) {}
+
+  ~TimerQueue() {
+    cancelAll();
+    // Abusing the timer queue to trigger the shutdown.
+    add(0, [this](bool) {
+      m_finish = true;
+      return std::make_pair(false, 0);
+    });
+    m_th.join();
+  }
+
+  // Adds a new timer
+  // \return
+  //  Returns the ID of the new timer. You can use this ID to cancel the
+  //  timer
+  uint64_t add(int64_t milliseconds,
+               std::function<std::pair<bool, int64_t>(bool)> handler) {
+    WorkItem item;
+    Clock::time_point tp = Clock::now();
+    item.end = tp + std::chrono::milliseconds(milliseconds);
+    item.period = milliseconds;
+    item.handler = std::move(handler);
+
+    std::unique_lock<std::mutex> lk(m_mtx);
+    uint64_t id = ++m_idcounter;
+    item.id = id;
+    m_items.push(std::move(item));
+
+    // Something changed, so wake up timer thread
+    m_checkWork.notify_one();
+    return id;
+  }
+
+  // Cancels the specified timer
+  // \return
+  //  1 if the timer was cancelled.
+  //  0 if you were too late to cancel (or the timer ID was never valid to
+  //  start with)
+  size_t cancel(uint64_t id) {
+    // Instead of removing the item from the container (thus breaking the
+    // heap integrity), we set the item as having no handler, and put
+    // that handler on a new item at the top for immediate execution
+    // The timer thread will then ignore the original item, since it has no
+    // handler.
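+    //
+    // In other words, a cancelled timer goes from
+    //   {end: T, id: N, handler: h}    (left in the heap, now inert)
+    // to a new top-of-heap item
+    //   {end: 0, id: 0, handler: h}
+    // so h still runs exactly once, with aborted == true (since id == 0).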
+    std::unique_lock<std::mutex> lk(m_mtx);
+    for (auto&& item : m_items.getContainer()) {
+      if (item.id == id && item.handler) {
+        WorkItem newItem;
+        // Zero time, so it stays at the top for immediate execution
+        newItem.end = Clock::time_point();
+        newItem.id = 0;  // Means it is a canceled item
+        // Move the handler from item to newitem (thus clearing item)
+        newItem.handler = std::move(item.handler);
+        m_items.push(std::move(newItem));
+
+        // Something changed, so wake up timer thread
+        m_checkWork.notify_one();
+        return 1;
+      }
+    }
+    return 0;
+  }
+
+  // Cancels all timers
+  // \return
+  //  The number of timers cancelled
+  size_t cancelAll() {
+    // Setting all "end" to 0 (for immediate execution) is ok,
+    // since it maintains the heap integrity
+    std::unique_lock<std::mutex> lk(m_mtx);
+    m_cancel = true;
+    for (auto&& item : m_items.getContainer()) {
+      if (item.id && item.handler) {
+        item.end = Clock::time_point();
+        item.id = 0;
+      }
+    }
+    auto ret = m_items.size();
+
+    m_checkWork.notify_one();
+    return ret;
+  }
+
+ private:
+  using Clock = std::chrono::steady_clock;
+  TimerQueue(const TimerQueue&) = delete;
+  TimerQueue& operator=(const TimerQueue&) = delete;
+
+  void run() {
+    std::unique_lock<std::mutex> lk(m_mtx);
+    while (!m_finish) {
+      auto end = calcWaitTime_lock();
+      if (end.first) {
+        // Timers found, so wait until it expires (or something else
+        // changes)
+        m_checkWork.wait_until(lk, end.second);
+      } else {
+        // No timers exist, so wait forever until something changes
+        m_checkWork.wait(lk);
+      }
+
+      // Check and execute as much work as possible, such as, all expired
+      // timers
+      checkWork(&lk);
+    }
+
+    // If we are shutting down, we should not have any items left,
+    // since the shutdown cancels all items
+    assert(m_items.size() == 0);
+  }
+
+  std::pair<bool, Clock::time_point> calcWaitTime_lock() {
+    while (m_items.size()) {
+      if (m_items.top().handler) {
+        // Item present, so return the new wait time
+        return std::make_pair(true, m_items.top().end);
+      } else {
+        // Discard empty handlers (they were cancelled)
+        m_items.pop();
+      }
+    }
+
+    // No items found, so return no wait time (causes the thread to wait
+    // indefinitely)
+    return std::make_pair(false, Clock::time_point());
+  }
+
+  void checkWork(std::unique_lock<std::mutex>* lk) {
+    while (m_items.size() && m_items.top().end <= Clock::now()) {
+      WorkItem item(m_items.top());
+      m_items.pop();
+
+      if (item.handler) {
+        (*lk).unlock();
+        auto reschedule_pair = item.handler(item.id == 0);
+        (*lk).lock();
+        if (!m_cancel && reschedule_pair.first) {
+          int64_t new_period = (reschedule_pair.second == -1)
+                                   ? item.period
+                                   : reschedule_pair.second;
+
+          item.period = new_period;
+          item.end = Clock::now() + std::chrono::milliseconds(new_period);
+          m_items.push(std::move(item));
+        }
+      }
+    }
+  }
+
+  bool m_finish = false;
+  bool m_cancel = false;
+  uint64_t m_idcounter = 0;
+  std::condition_variable m_checkWork;
+
+  struct WorkItem {
+    Clock::time_point end;
+    int64_t period;
+    uint64_t id;  // id==0 means it was cancelled
+    std::function<std::pair<bool, int64_t>(bool)> handler;
+    bool operator>(const WorkItem& other) const { return end > other.end; }
+  };
+
+  std::mutex m_mtx;
+  // Inheriting from priority_queue, so we can access the internal container
+  class Queue : public std::priority_queue<WorkItem, std::vector<WorkItem>,
+                                           std::greater<WorkItem>> {
+   public:
+    std::vector<WorkItem>& getContainer() { return this->c; }
+  } m_items;
+  std::thread m_th;
+};
diff --git a/util/timer_queue_test.cc b/util/timer_queue_test.cc
new file mode 100644
index 000000000..e0c545d0d
--- /dev/null
+++ b/util/timer_queue_test.cc
@@ -0,0 +1,72 @@
+// Portions Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+// borrowed from
+// http://www.crazygaze.com/blog/2016/03/24/portable-c-timer-queue/
+// Timer Queue
+//
+// License
+//
+// The source code in this article is licensed under the CC0 license, so feel
+// free to copy, modify, share, do whatever you want with it.
+// No attribution is required, but I'll be happy if you do.
+// CC0 license
+
+// The person who associated a work with this deed has dedicated the work to
+// the public domain by waiving all of his or her rights to the work worldwide
+// under copyright law, including all related and neighboring rights, to the
+// extent allowed by law. You can copy, modify, distribute and perform the
+// work, even for commercial purposes, all without asking permission. See
+// Other Information below.
+//
+
+#include "util/timer_queue.h"
+#include <cstdio>
+
+namespace Timing {
+
+using Clock = std::chrono::high_resolution_clock;
+double now() {
+  static auto start = Clock::now();
+  return std::chrono::duration<double, std::milli>(Clock::now() - start)
+      .count();
+}
+
+}  // namespace Timing
+
+int main() {
+  TimerQueue q;
+
+  double tnow = Timing::now();
+
+  q.add(10000, [tnow](bool aborted) mutable {
+    printf("T 1: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(false, 0);
+  });
+  q.add(10001, [tnow](bool aborted) mutable {
+    printf("T 2: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(false, 0);
+  });
+
+  q.add(1000, [tnow](bool aborted) mutable {
+    printf("T 3: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(!aborted, 1000);
+  });
+
+  auto id = q.add(2000, [tnow](bool aborted) mutable {
+    printf("T 4: %d, Elapsed %4.2fms\n", aborted, Timing::now() - tnow);
+    return std::make_pair(!aborted, 2000);
+  });
+
+  (void)id;
+  // auto ret = q.cancel(id);
+  // assert(ret == 1);
+  // q.cancelAll();
+
+  return 0;
+}
+//////////////////////////////////////////
diff --git a/utilities/blob_db/blob_db.cc b/utilities/blob_db/blob_db.cc
index dcc9b5e3f..b3ef96bf9 100644
--- a/utilities/blob_db/blob_db.cc
+++ b/utilities/blob_db/blob_db.cc
@@ -1,12 +1,15 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
See the AUTHORS file for names of contributors. -#include "utilities/blob_db/blob_db.h" - +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// #ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_db.h" #include "db/write_batch_internal.h" #include "monitoring/instrumented_mutex.h" #include "options/cf_options.h" +#include "rocksdb/compaction_filter.h" #include "rocksdb/convenience.h" #include "rocksdb/env.h" #include "rocksdb/iterator.h" @@ -17,194 +20,152 @@ #include "util/crc32c.h" #include "util/file_reader_writer.h" #include "util/filename.h" +#include "utilities/blob_db/blob_db_impl.h" namespace rocksdb { -namespace { -int kBlockBasedTableVersionFormat = 2; -} // namespace - -class BlobDB : public StackableDB { - public: - using rocksdb::StackableDB::Put; - Status Put(const WriteOptions& options, const Slice& key, - const Slice& value) override; - - using rocksdb::StackableDB::Get; - Status Get(const ReadOptions& options, const Slice& key, - std::string* value) override; - - Status Open(); - - explicit BlobDB(DB* db); - - private: - std::string dbname_; - ImmutableCFOptions ioptions_; - InstrumentedMutex mutex_; - std::unique_ptr file_reader_; - std::unique_ptr file_writer_; - size_t writer_offset_; - size_t next_sync_offset_; - - static const std::string kFileName; - static const size_t kBlockHeaderSize; - static const size_t kBytesPerSync; -}; - -Status NewBlobDB(Options options, std::string dbname, DB** blob_db) { - DB* db; - Status s = DB::Open(options, dbname, &db); - if (!s.ok()) { - return s; - } - BlobDB* bdb = new BlobDB(db); - s = bdb->Open(); - if (!s.ok()) { - delete bdb; +namespace blob_db { +port::Mutex listener_mutex; +typedef std::shared_ptr FlushBeginListener_t; +typedef std::shared_ptr ReconcileWalFilter_t; +typedef std::shared_ptr + CompactionListener_t; + +// to ensure the lifetime of the listeners +std::vector> all_blobdb_listeners; +std::vector all_wal_filters; + +Status BlobDB::OpenAndLoad(const Options& options, + const BlobDBOptions& bdb_options, + const std::string& dbname, BlobDB** blob_db, + Options* changed_options) { + *changed_options = options; + *blob_db = nullptr; + + FlushBeginListener_t fblistener = + std::make_shared(); + ReconcileWalFilter_t rw_filter = std::make_shared(); + CompactionListener_t ce_listener = + std::make_shared(); + + { + MutexLock l(&listener_mutex); + all_blobdb_listeners.push_back(fblistener); + all_blobdb_listeners.push_back(ce_listener); + all_wal_filters.push_back(rw_filter); } + + changed_options->listeners.emplace_back(fblistener); + changed_options->listeners.emplace_back(ce_listener); + changed_options->wal_filter = rw_filter.get(); + + DBOptions db_options(*changed_options); + + // we need to open blob db first so that recovery can happen + BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, db_options); + + fblistener->SetImplPtr(bdb); + ce_listener->SetImplPtr(bdb); + rw_filter->SetImplPtr(bdb); + + Status s = bdb->OpenPhase1(); + if (!s.ok()) return s; + *blob_db = bdb; return s; } -const std::string BlobDB::kFileName = "blob_log"; -const size_t BlobDB::kBlockHeaderSize = 8; -const size_t BlobDB::kBytesPerSync = 1024 * 1024 * 128; - -BlobDB::BlobDB(DB* db) - : StackableDB(db), - ioptions_(db->GetOptions()), - writer_offset_(0), - next_sync_offset_(kBytesPerSync) {} 
- -Status BlobDB::Open() { - unique_ptr wfile; - EnvOptions env_options(db_->GetOptions()); - Status s = ioptions_.env->NewWritableFile(db_->GetName() + "/" + kFileName, - &wfile, env_options); - if (!s.ok()) { - return s; +Status BlobDB::Open(const Options& options, const BlobDBOptions& bdb_options, + const std::string& dbname, BlobDB** blob_db) { + *blob_db = nullptr; + DBOptions db_options(options); + ColumnFamilyOptions cf_options(options); + std::vector column_families; + column_families.push_back( + ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options)); + std::vector handles; + Status s = BlobDB::Open(db_options, bdb_options, dbname, column_families, + &handles, blob_db); + if (s.ok()) { + assert(handles.size() == 1); + // i can delete the handle since DBImpl is always holding a reference to + // default column family + delete handles[0]; } - file_writer_.reset(new WritableFileWriter(std::move(wfile), env_options)); - - // Write version - std::string version; - PutFixed64(&version, 0); - s = file_writer_->Append(Slice(version)); - if (!s.ok()) { - return s; - } - writer_offset_ += version.size(); - - std::unique_ptr rfile; - s = ioptions_.env->NewRandomAccessFile(db_->GetName() + "/" + kFileName, - &rfile, env_options); - if (!s.ok()) { - return s; - } - file_reader_.reset(new RandomAccessFileReader(std::move(rfile))); return s; } -Status BlobDB::Put(const WriteOptions& options, const Slice& key, - const Slice& value) { - BlockBuilder block_builder(1, false); - block_builder.Add(key, value); - - CompressionType compression = CompressionType::kLZ4Compression; - CompressionOptions compression_opts; - - Slice block_contents; - std::string compression_output; +Status BlobDB::Open(const DBOptions& db_options, + const BlobDBOptions& bdb_options, const std::string& dbname, + const std::vector& column_families, + std::vector* handles, BlobDB** blob_db, + bool no_base_db) { + *blob_db = nullptr; - block_contents = CompressBlock(block_builder.Finish(), compression_opts, - &compression, kBlockBasedTableVersionFormat, - Slice() /* dictionary */, &compression_output); + DBOptions my_db_options(db_options); + FlushBeginListener_t fblistener = + std::make_shared(); + CompactionListener_t ce_listener = + std::make_shared(); + ReconcileWalFilter_t rw_filter = std::make_shared(); - char header[kBlockHeaderSize]; - char trailer[kBlockTrailerSize]; - trailer[0] = compression; - auto crc = crc32c::Value(block_contents.data(), block_contents.size()); - crc = crc32c::Extend(crc, trailer, 1); // Extend to cover block type - EncodeFixed32(trailer + 1, crc32c::Mask(crc)); + my_db_options.listeners.emplace_back(fblistener); + my_db_options.listeners.emplace_back(ce_listener); + my_db_options.wal_filter = rw_filter.get(); - BlockHandle handle; - std::string index_entry; - Status s; { - InstrumentedMutexLock l(&mutex_); - auto raw_block_size = block_contents.size(); - EncodeFixed64(header, raw_block_size); - s = file_writer_->Append(Slice(header, kBlockHeaderSize)); - writer_offset_ += kBlockHeaderSize; - if (s.ok()) { - handle.set_offset(writer_offset_); - handle.set_size(raw_block_size); - s = file_writer_->Append(block_contents); - } - if (s.ok()) { - s = file_writer_->Append(Slice(trailer, kBlockTrailerSize)); - } - if (s.ok()) { - s = file_writer_->Flush(); - } - if (s.ok() && writer_offset_ > next_sync_offset_) { - // Sync every kBytesPerSync. This is a hacky way to limit unsynced data. 
- next_sync_offset_ += kBytesPerSync; - s = file_writer_->Sync(db_->GetOptions().use_fsync); - } - if (s.ok()) { - writer_offset_ += block_contents.size() + kBlockTrailerSize; - // Put file number - PutVarint64(&index_entry, 0); - handle.EncodeTo(&index_entry); - s = db_->Put(options, key, index_entry); - } + MutexLock l(&listener_mutex); + all_blobdb_listeners.push_back(fblistener); + all_blobdb_listeners.push_back(ce_listener); + all_wal_filters.push_back(rw_filter); } - return s; -} -Status BlobDB::Get(const ReadOptions& options, const Slice& key, - std::string* value) { - Status s; - std::string index_entry; - s = db_->Get(options, key, &index_entry); - if (!s.ok()) { - return s; - } - BlockHandle handle; - Slice index_entry_slice(index_entry); - uint64_t file_number; - if (!GetVarint64(&index_entry_slice, &file_number)) { - return Status::Corruption(); - } - assert(file_number == 0); - s = handle.DecodeFrom(&index_entry_slice); - if (!s.ok()) { - return s; - } - Footer footer(0, kBlockBasedTableVersionFormat); - BlockContents contents; - s = ReadBlockContents(file_reader_.get(), footer, options, handle, &contents, - ioptions_); + // we need to open blob db first so that recovery can happen + BlobDBImpl* bdb = new BlobDBImpl(dbname, bdb_options, my_db_options); + fblistener->SetImplPtr(bdb); + ce_listener->SetImplPtr(bdb); + rw_filter->SetImplPtr(bdb); + + Status s = bdb->OpenPhase1(); + if (!s.ok()) return s; + + if (no_base_db) return s; + + DB* db = nullptr; + s = DB::Open(my_db_options, dbname, column_families, handles, &db); + if (!s.ok()) return s; + + // set the implementation pointer + s = bdb->LinkToBaseDB(db); if (!s.ok()) { - return s; - } - Block block(std::move(contents), kDisableGlobalSequenceNumber); - BlockIter bit; - InternalIterator* it = block.NewIterator(nullptr, &bit); - it->SeekToFirst(); - if (!it->status().ok()) { - return it->status(); + delete bdb; + bdb = nullptr; } - *value = it->value().ToString(); + *blob_db = bdb; return s; } + +BlobDB::BlobDB(DB* db) : StackableDB(db) {} + +//////////////////////////////////////////////////////////////////////////////// +// +// +// std::function fnCaller = +// std::bind(&A::fn, &anInstance, std::placeholders::_1); +//////////////////////////////////////////////////////////////////////////////// +BlobDBOptions::BlobDBOptions() + : blob_dir("blob_dir"), + path_relative(true), + is_fifo(false), + blob_dir_size(1000ULL * 1024ULL * 1024ULL * 1024ULL), + ttl_range_secs(3600), + min_blob_size(512), + bytes_per_sync(0), + blob_file_size(256 * 1024 * 1024), + num_concurrent_simple_blobs(4), + default_ttl_extractor(false), + compression(kNoCompression) {} + +} // namespace blob_db } // namespace rocksdb -#else -namespace rocksdb { -Status NewBlobDB(Options options, std::string dbname, DB** blob_db) { - return Status::NotSupported(); -} -} // namespace rocksdb -#endif // ROCKSDB_LITE +#endif diff --git a/utilities/blob_db/blob_db.h b/utilities/blob_db/blob_db.h index 43111fa0e..fea8063a4 100644 --- a/utilities/blob_db/blob_db.h +++ b/utilities/blob_db/blob_db.h @@ -7,12 +7,19 @@ #pragma once +#ifndef ROCKSDB_LITE + +#include #include +#include #include "rocksdb/db.h" #include "rocksdb/status.h" +#include "rocksdb/utilities/stackable_db.h" namespace rocksdb { -// EXPERIMENAL ONLY + +namespace blob_db { + // A wrapped database which puts values of KV pairs in a separate log // and store location to the log in the underlying DB. // It lacks lots of importatant functionalities, e.g. 
DB restarts, @@ -20,5 +27,177 @@ namespace rocksdb { // // The factory needs to be moved to include/rocksdb/utilities to allow // users to use blob DB. -extern Status NewBlobDB(Options options, std::string dbname, DB** blob_db); + +struct BlobDBOptions { + // name of the directory under main db, where blobs will be stored. + // default is "blob_dir" + std::string blob_dir; + + // whether the blob_dir path is relative or absolute. + bool path_relative; + + // is the eviction strategy fifo based + bool is_fifo; + + // maximum size of the blob dir. Once this gets used, up + // evict the blob file which is oldest (is_fifo ) + // 0 means no limits + uint64_t blob_dir_size; + + // a new bucket is opened, for ttl_range. So if ttl_range is 600seconds + // (10 minutes), and the first bucket starts at 1471542000 + // then the blob buckets will be + // first bucket is 1471542000 - 1471542600 + // second bucket is 1471542600 - 1471543200 + // and so on + uint32_t ttl_range_secs; + + // at what size will the blobs be stored in separate log rather than + // inline + uint64_t min_blob_size; + + // at what bytes will the blob files be synced to blob log. + uint64_t bytes_per_sync; + + // the target size of each blob file. File will become immutable + // after it exceeds that size + uint64_t blob_file_size; + + // how many files to use for simple blobs at one time + uint32_t num_concurrent_simple_blobs; + + // this function is to be provided by client if they intend to + // use Put API to provide TTL. + // the first argument is the value in the Put API + // in case you want to do some modifications to the value, + // return a new Slice in the second. + // otherwise just copy the input value into output. + // the ttl should be extracted and returned in last pointer. + // otherwise assign it to -1 + std::function extract_ttl_fn; + + // eviction callback. + // this function will be called for every blob that is getting + // evicted. 
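+  // Sketch of installing the callback (hypothetical names; the exact
+  // argument list is whatever the std::function signature below declares):
+  //   bdb_options.gc_evict_cb_fn =
+  //       [&](const ColumnFamilyHandle* cfh, const Slice& key,
+  //           const Slice& value) { ++evicted_blob_count; };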
+ std::function + gc_evict_cb_fn; + + // default ttl extactor + bool default_ttl_extractor; + + // what compression to use for Blob's + CompressionType compression; + + // default constructor + BlobDBOptions(); + + BlobDBOptions(const BlobDBOptions& in) = default; + + virtual ~BlobDBOptions() = default; +}; + +class BlobDB : public StackableDB { + public: + // the suffix to a blob value to represent "ttl:TTLVAL" + static const uint64_t kTTLSuffixLength = 8; + + public: + using rocksdb::StackableDB::Put; + + // This function needs to be called before destroying + // the base DB + static Status DestroyBlobDB(const std::string& dbname, const Options& options, + const BlobDBOptions& bdb_options); + + virtual Status Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override = 0; + + using rocksdb::StackableDB::Delete; + virtual Status Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key) override = 0; + + virtual Status PutWithTTL(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, int32_t ttl) = 0; + + virtual Status PutWithTTL(const WriteOptions& options, const Slice& key, + const Slice& value, int32_t ttl) { + return PutWithTTL(options, DefaultColumnFamily(), key, value, ttl); + } + + virtual Status PutUntil(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value, int32_t expiration) = 0; + + virtual Status PutUntil(const WriteOptions& options, const Slice& key, + const Slice& value, int32_t expiration) { + return PutUntil(options, DefaultColumnFamily(), key, value, expiration); + } + + using rocksdb::StackableDB::Get; + virtual Status Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) override = 0; + + using rocksdb::StackableDB::MultiGet; + virtual std::vector MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, + std::vector* values) override = 0; + + using rocksdb::StackableDB::SingleDelete; + virtual Status SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) override = 0; + + using rocksdb::StackableDB::Merge; + virtual Status Merge(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) override { + return Status::NotSupported("Not supported operation in blob db."); + } + + virtual Status Write(const WriteOptions& opts, + WriteBatch* updates) override = 0; + + // Starting point for opening a Blob DB. + // changed_options - critical. Blob DB loads and inserts listeners + // into options which are necessary for recovery and atomicity + // Use this pattern if you need control on step 2, i.e. your + // BaseDB is not just a simple rocksdb but a stacked DB + // 1. ::OpenAndLoad + // 2. Open Base DB with the changed_options + // 3. ::LinkToBaseDB + static Status OpenAndLoad(const Options& options, + const BlobDBOptions& bdb_options, + const std::string& dbname, BlobDB** blob_db, + Options* changed_options); + + // This is another way to open BLOB DB which do not have other + // Stackable DB's in play + // Steps. + // 1. 
::Open + static Status Open(const Options& options, const BlobDBOptions& bdb_options, + const std::string& dbname, BlobDB** blob_db); + + static Status Open(const DBOptions& db_options, + const BlobDBOptions& bdb_options, + const std::string& dbname, + const std::vector& column_families, + std::vector* handles, + BlobDB** blob_db, bool no_base_db = false); + + virtual ~BlobDB() {} + + virtual Status LinkToBaseDB(DB* db_base) = 0; + + protected: + explicit BlobDB(DB* db); +}; + +} // namespace blob_db } // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_impl.cc b/utilities/blob_db/blob_db_impl.cc new file mode 100644 index 000000000..72c5d0e7e --- /dev/null +++ b/utilities/blob_db/blob_db_impl.cc @@ -0,0 +1,2210 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +#ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_db_impl.h" +#include +#include +#include +#include +#include +#include +#include + +#include "db/db_impl.h" +#include "db/write_batch_internal.h" +#include "monitoring/instrumented_mutex.h" +#include "rocksdb/convenience.h" +#include "rocksdb/env.h" +#include "rocksdb/iterator.h" +#include "rocksdb/utilities/stackable_db.h" +#include "rocksdb/utilities/transaction.h" +#include "table/block.h" +#include "table/block_based_table_builder.h" +#include "table/block_builder.h" +#include "table/meta_blocks.h" +#include "util/crc32c.h" +#include "util/file_reader_writer.h" +#include "util/filename.h" +#include "util/random.h" +#include "util/timer_queue.h" +#include "utilities/transactions/optimistic_transaction_db_impl.h" +#include "utilities/transactions/optimistic_transaction_impl.h" + +namespace { +int kBlockBasedTableVersionFormat = 2; + +void extendTTL(rocksdb::blob_db::ttlrange_t* ttl_range, uint32_t ttl) { + ttl_range->first = std::min(ttl_range->first, ttl); + ttl_range->second = std::max(ttl_range->second, ttl); +} + +void extendTimestamps(rocksdb::blob_db::tsrange_t* ts_range, uint64_t ts) { + ts_range->first = std::min(ts_range->first, ts); + ts_range->second = std::max(ts_range->second, ts); +} + +void extendSN(rocksdb::blob_db::snrange_t* sn_range, + rocksdb::SequenceNumber sn) { + sn_range->first = std::min(sn_range->first, sn); + sn_range->second = std::max(sn_range->second, sn); +} +} // end namespace + +namespace rocksdb { + +namespace blob_db { + +struct GCStats { + uint64_t blob_count; + uint64_t num_deletes; + uint64_t deleted_size; + uint64_t num_relocs; + uint64_t succ_deletes_lsm; + uint64_t succ_relocs; + std::shared_ptr newfile; + GCStats() + : blob_count(0), + num_deletes(0), + deleted_size(0), + num_relocs(0), + succ_deletes_lsm(0), + succ_relocs(0) {} +}; + +// BlobHandle is a pointer to the blob that is stored in the LSM +class BlobHandle { + public: + BlobHandle() + : file_number_(std::numeric_limits::max()), + offset_(std::numeric_limits::max()), + size_(std::numeric_limits::max()), + compression_(kNoCompression) {} + + uint64_t filenumber() const { return file_number_; } + void set_filenumber(uint64_t fn) { file_number_ = fn; } + + // The offset of the block in the file. 
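+  // (The offset, together with the file number and size, is what EncodeTo()
+  // below serializes: three varint64s followed by one CompressionType byte.)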
+ uint64_t offset() const { return offset_; } + void set_offset(uint64_t _offset) { offset_ = _offset; } + + // The size of the stored block + uint64_t size() const { return size_; } + void set_size(uint64_t _size) { size_ = _size; } + + CompressionType compression() const { return compression_; } + void set_compression(CompressionType t) { compression_ = t; } + + void EncodeTo(std::string* dst) const; + Status DecodeFrom(Slice* input); + + void clear(); + + private: + uint64_t file_number_; + uint64_t offset_; + uint64_t size_; + CompressionType compression_; +}; + +void BlobHandle::EncodeTo(std::string* dst) const { + // Sanity check that all fields have been set + assert(offset_ != std::numeric_limits::max()); + assert(size_ != std::numeric_limits::max()); + assert(file_number_ != std::numeric_limits::max()); + + dst->reserve(30); + PutVarint64(dst, file_number_); + PutVarint64(dst, offset_); + PutVarint64(dst, size_); + dst->push_back(static_cast(compression_)); +} + +void BlobHandle::clear() { + file_number_ = std::numeric_limits::max(); + offset_ = std::numeric_limits::max(); + size_ = std::numeric_limits::max(); + compression_ = kNoCompression; +} + +Status BlobHandle::DecodeFrom(Slice* input) { + if (GetVarint64(input, &file_number_) && GetVarint64(input, &offset_) && + GetVarint64(input, &size_)) { + compression_ = static_cast(input->data()[0]); + return Status::OK(); + } else { + clear(); + return Status::Corruption("bad blob handle"); + } +} + +Random blob_rgen(static_cast(time(nullptr))); + +void BlobDBFlushBeginListener::OnFlushBegin(DB* db, const FlushJobInfo& info) { + if (impl_) impl_->OnFlushBeginHandler(db, info); +} + +WalFilter::WalProcessingOption BlobReconcileWalFilter::LogRecordFound( + unsigned long long log_number, const std::string& log_file_name, + const WriteBatch& batch, WriteBatch* new_batch, bool* batch_changed) { + return WalFilter::WalProcessingOption::kContinueProcessing; +} + +bool blobf_compare_ttl::operator()(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) const { + if (lhs->ttl_range_.first < rhs->ttl_range_.first) return true; + + if (lhs->ttl_range_.first > rhs->ttl_range_.first) return false; + + return lhs->BlobFileNumber() > rhs->BlobFileNumber(); +} + +void EvictAllVersionsCompactionListener::InternalListener::OnCompaction( + int level, const Slice& key, + CompactionEventListener::CompactionListenerValueType value_type, + const Slice& existing_value, const SequenceNumber& sn, bool is_new) { + if (!is_new && + value_type == + CompactionEventListener::CompactionListenerValueType::kValue) { + BlobHandle handle; + Slice lsmval(existing_value); + Status s = handle.DecodeFrom(&lsmval); + if (s.ok()) { + if (impl_->debug_level_ >= 3) + Log(InfoLogLevel::INFO_LEVEL, impl_->db_options_.info_log, + "CALLBACK COMPACTED OUT KEY: %s SN: %d " + "NEW: %d FN: %" PRIu64 " OFFSET: %" PRIu64 " SIZE: %" PRIu64, + key.ToString().c_str(), sn, is_new, handle.filenumber(), + handle.offset(), handle.size()); + + impl_->override_vals_q_.enqueue({handle.filenumber(), key.size(), + handle.offset(), handle.size(), sn}); + } + } else { + if (impl_->debug_level_ >= 3) + Log(InfoLogLevel::INFO_LEVEL, impl_->db_options_.info_log, + "CALLBACK NEW KEY: %s SN: %d NEW: %d", key.ToString().c_str(), sn, + is_new); + } +} + +Status BlobDB::DestroyBlobDB(const std::string& dbname, const Options& options, + const BlobDBOptions& bdb_options) { + const ImmutableDBOptions soptions(SanitizeOptions(dbname, options)); + Env* env = soptions.env; + + Status result; + std::string 
blobdir; + blobdir = (bdb_options.path_relative) ? dbname + "/" + bdb_options.blob_dir + : bdb_options.blob_dir; + + std::vector filenames; + Status status = env->GetChildren(blobdir, &filenames); + + for (const auto& f : filenames) { + uint64_t number; + FileType type; + if (ParseFileName(f, &number, &type) && type == kBlobFile) { + Status del = env->DeleteFile(blobdir + "/" + f); + if (result.ok() && !del.ok()) { + result = del; + } + } + } + + env->DeleteDir(blobdir); + return result; +} + +BlobDBImpl::BlobDBImpl(const std::string& dbname, + const BlobDBOptions& blob_db_options, + const DBOptions& db_options) + : BlobDB(nullptr), + db_impl_(nullptr), + myenv_(db_options.env), + wo_set_(false), + bdb_options_(blob_db_options), + db_options_(db_options), + env_options_(db_options), + dir_change_(false), + next_file_number_(1), + epoch_of_(0), + shutdown_(false), + current_epoch_(0), + open_file_count_(0), + last_period_write_(0), + last_period_ampl_(0), + total_periods_write_(0), + total_periods_ampl_(0), + total_blob_space_(0), + open_p1_done_(false), + debug_level_(0) { + const BlobDBOptionsImpl* options_impl = + dynamic_cast(&blob_db_options); + if (options_impl) { + bdb_options_ = *options_impl; + } + blob_dir_ = (bdb_options_.path_relative) + ? dbname + "/" + bdb_options_.blob_dir + : bdb_options_.blob_dir; + + if (bdb_options_.default_ttl_extractor) { + bdb_options_.extract_ttl_fn = &BlobDBImpl::ExtractTTLFromBlob; + } +} + +Status BlobDBImpl::LinkToBaseDB(DB* db) { + assert(db_ == nullptr); + assert(open_p1_done_); + + db_ = db; + + // the Base DB in-itself can be a stackable DB + StackableDB* sdb = dynamic_cast(db_); + if (sdb) { + db_impl_ = dynamic_cast(sdb->GetBaseDB()); + } else { + db_impl_ = dynamic_cast(db); + } + + myenv_ = db_->GetEnv(); + + opt_db_.reset(new OptimisticTransactionDBImpl(db, false)); + + Status s = myenv_->CreateDirIfMissing(blob_dir_); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Failed to create blob directory: %s status: '%s'", blob_dir_.c_str(), + s.ToString().c_str()); + } + s = myenv_->NewDirectory(blob_dir_, &dir_ent_); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Failed to open blob directory: %s status: '%s'", blob_dir_.c_str(), + s.ToString().c_str()); + } + + StartBackgroundTasks(); + return s; +} + +BlobDBImpl::BlobDBImpl(DB* db, const BlobDBOptions& blob_db_options) + : BlobDB(db), + db_impl_(dynamic_cast(db)), + opt_db_(new OptimisticTransactionDBImpl(db, false)), + wo_set_(false), + bdb_options_(blob_db_options), + db_options_(db->GetOptions()), + env_options_(db_->GetOptions()), + dir_change_(false), + next_file_number_(1), + epoch_of_(0), + shutdown_(false), + current_epoch_(0), + open_file_count_(0), + last_period_write_(0), + last_period_ampl_(0), + total_periods_write_(0), + total_periods_ampl_(0), + total_blob_space_(0) { + assert(db_impl_ != nullptr); + const BlobDBOptionsImpl* options_impl = + dynamic_cast(&blob_db_options); + if (options_impl) { + bdb_options_ = *options_impl; + } + + if (!bdb_options_.blob_dir.empty()) + blob_dir_ = (bdb_options_.path_relative) + ? 
db_->GetName() + "/" + bdb_options_.blob_dir + : bdb_options_.blob_dir; + + if (bdb_options_.default_ttl_extractor) { + bdb_options_.extract_ttl_fn = &BlobDBImpl::ExtractTTLFromBlob; + } +} + +BlobDBImpl::~BlobDBImpl() { + // CancelAllBackgroundWork(db_, true); + + Shutdown(); +} + +Status BlobDBImpl::OpenPhase1() { + assert(db_ == nullptr); + if (blob_dir_.empty()) + return Status::NotSupported("No blob directory in options"); + + std::unique_ptr dir_ent; + Status s = myenv_->NewDirectory(blob_dir_, &dir_ent); + if (!s.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Failed to open blob directory: %s status: '%s'", blob_dir_.c_str(), + s.ToString().c_str()); + open_p1_done_ = true; + return Status::OK(); + } + + s = OpenAllFiles(); + open_p1_done_ = true; + return s; +} + +void BlobDBImpl::StartBackgroundTasks() { + // store a call to a member function and object + tqueue_.add( + bdb_options_.reclaim_of_period_millisecs, + std::bind(&BlobDBImpl::ReclaimOpenFiles, this, std::placeholders::_1)); + tqueue_.add(bdb_options_.gc_check_period_millisecs, + std::bind(&BlobDBImpl::RunGC, this, std::placeholders::_1)); + tqueue_.add( + bdb_options_.deletion_check_period_millisecs, + std::bind(&BlobDBImpl::EvictDeletions, this, std::placeholders::_1)); + tqueue_.add( + bdb_options_.deletion_check_period_millisecs, + std::bind(&BlobDBImpl::EvictCompacted, this, std::placeholders::_1)); + tqueue_.add( + bdb_options_.delete_obsf_period_millisecs, + std::bind(&BlobDBImpl::DeleteObsFiles, this, std::placeholders::_1)); + tqueue_.add(bdb_options_.sanity_check_period_millisecs, + std::bind(&BlobDBImpl::SanityCheck, this, std::placeholders::_1)); + tqueue_.add(bdb_options_.wa_stats_period_millisecs, + std::bind(&BlobDBImpl::WaStats, this, std::placeholders::_1)); + tqueue_.add(bdb_options_.fsync_files_period_millisecs, + std::bind(&BlobDBImpl::FsyncFiles, this, std::placeholders::_1)); + tqueue_.add( + bdb_options_.check_seqf_period_millisecs, + std::bind(&BlobDBImpl::CheckSeqFiles, this, std::placeholders::_1)); +} + +void BlobDBImpl::Shutdown() { shutdown_.store(true); } + +void BlobDBImpl::OnFlushBeginHandler(DB* db, const FlushJobInfo& info) { + if (shutdown_.load()) return; + + // a callback that happens too soon needs to be ignored + if (!db_) return; + + FsyncFiles(false); +} + +Status BlobDBImpl::GetAllLogFiles( + std::set>* file_nums) { + std::vector all_files; + Status status = myenv_->GetChildren(blob_dir_, &all_files); + if (!status.ok()) { + return status; + } + + for (const auto& f : all_files) { + uint64_t number; + FileType type; + bool psucc = ParseFileName(f, &number, &type); + if (psucc && type == kBlobFile) { + file_nums->insert(std::make_pair(number, f)); + } else { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Skipping file in blob directory %s parse: %d type: %d", f.c_str(), + psucc, ((psucc) ? type : -1)); + } + } + + return status; +} + +Status BlobDBImpl::OpenAllFiles() { + WriteLock wl(&mutex_); + + std::set> file_nums; + Status status = GetAllLogFiles(&file_nums); + + if (!status.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to collect files from blob dir: %s status: '%s'", + blob_dir_.c_str(), status.ToString().c_str()); + return status; + } + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "BlobDir files path: %s count: %d min: %" PRIu64 " max: %" PRIu64, + blob_dir_.c_str(), static_cast(file_nums.size()), + (file_nums.empty()) ? -1 : (file_nums.begin())->first, + (file_nums.empty()) ? 
-1 : (file_nums.end())->first); + + if (!file_nums.empty()) + next_file_number_.store((file_nums.rbegin())->first + 1); + + for (auto f_iter : file_nums) { + std::string bfpath = BlobFileName(blob_dir_, f_iter.first); + uint64_t size_bytes; + Status s1 = myenv_->GetFileSize(bfpath, &size_bytes); + if (!s1.ok()) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Unable to get size of %s. File skipped from open status: '%s'", + bfpath.c_str(), s1.ToString().c_str()); + continue; + } + + if (debug_level_ >= 1) + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Blob File open: %s size: %" PRIu64, bfpath.c_str(), size_bytes); + + std::shared_ptr bfptr = + std::make_shared(this, blob_dir_, f_iter.first); + bfptr->SetFileSize(size_bytes); + + // since this file already existed, we will try to reconcile + // deleted count with LSM + bfptr->gc_once_after_open_ = true; + + // read header + std::shared_ptr reader; + reader = bfptr->OpenSequentialReader(myenv_, db_options_, env_options_); + s1 = reader->ReadHeader(&bfptr->header_); + if (!s1.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failure to read header for blob-file %s " + "status: '%s' size: %" PRIu64, + bfpath.c_str(), s1.ToString().c_str(), size_bytes); + continue; + } + bfptr->header_valid_ = true; + + std::shared_ptr ra_reader = + GetOrOpenRandomAccessReader(bfptr, myenv_, env_options_); + + BlobLogFooter bf; + s1 = bfptr->ReadFooter(&bf); + + bfptr->CloseRandomAccessLocked(); + if (s1.ok()) { + s1 = bfptr->SetFromFooterLocked(bf); + if (!s1.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Header Footer mismatch for blob-file %s " + "status: '%s' size: %" PRIu64, + bfpath.c_str(), s1.ToString().c_str(), size_bytes); + continue; + } + } else { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "File found incomplete (w/o footer) %s", bfpath.c_str()); + + // sequentially iterate over the file and read all the records + ttlrange_t ttl_range(std::numeric_limits::max(), + std::numeric_limits::min()); + tsrange_t ts_range(std::numeric_limits::max(), + std::numeric_limits::min()); + snrange_t sn_range(std::numeric_limits::max(), + std::numeric_limits::min()); + + uint64_t blob_count = 0; + BlobLogRecord record; + Reader::ReadLevel shallow = Reader::kReadHdrKeyFooter; + + uint64_t record_start = reader->GetNextByte(); + // TODO(arahut) - when we detect corruption, we should truncate + while (reader->ReadRecord(&record, shallow).ok()) { + ++blob_count; + if (bfptr->HasTTL()) { + extendTTL(&ttl_range, record.GetTTL()); + } + if (bfptr->HasTimestamp()) { + extendTimestamps(&ts_range, record.GetTimeVal()); + } + extendSN(&sn_range, record.GetSN()); + record_start = reader->GetNextByte(); + } + + if (record_start != bfptr->GetFileSize()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Blob file is corrupted or crashed during write %s" + " good_size: %" PRIu64 " file_size: %" PRIu64, + bfpath.c_str(), record_start, bfptr->GetFileSize()); + } + + if (!blob_count) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "BlobCount = 0 in file %s", bfpath.c_str()); + continue; + } + + bfptr->SetBlobCount(blob_count); + bfptr->SetSNRange(sn_range); + + if (bfptr->HasTimestamp()) bfptr->set_time_range(ts_range); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Blob File: %s blob_count: %" PRIu64 " size_bytes: %" PRIu64 + " sn_range: (%d, %d) ts: %d ttl: %d", + bfpath.c_str(), blob_count, size_bytes, sn_range.first, + sn_range.second, bfptr->HasTimestamp(), bfptr->HasTTL()); + + if 
(bfptr->HasTTL()) { + ttl_range.second = + std::max(ttl_range.second, + ttl_range.first + (uint32_t)bdb_options_.ttl_range_secs); + bfptr->set_ttl_range(ttl_range); + + std::time_t epoch_now = std::chrono::system_clock::to_time_t( + std::chrono::system_clock::now()); + if (ttl_range.second < epoch_now) { + Status fstatus = CreateWriterLocked(bfptr); + if (fstatus.ok()) fstatus = bfptr->WriteFooterAndCloseLocked(); + if (!fstatus.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to close Blob File: %s status: '%s'. Skipped", + bfpath.c_str(), fstatus.ToString().c_str()); + continue; + } else { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Blob File Closed: %s now: %d ttl_range: (%d, %d)", + bfpath.c_str(), epoch_now, ttl_range.first, ttl_range.second); + } + } else { + open_blob_files_.insert(bfptr); + } + } + } + + blob_files_.insert(std::make_pair(f_iter.first, bfptr)); + } + + return status; +} + +void BlobDBImpl::CloseRandomAccessLocked( + const std::shared_ptr& bfile) { + bfile->CloseRandomAccessLocked(); + open_file_count_--; +} + +std::shared_ptr BlobDBImpl::GetOrOpenRandomAccessReader( + const std::shared_ptr& bfile, Env* env, + const EnvOptions& env_options) { + bool fresh_open = false; + auto rar = bfile->GetOrOpenRandomAccessReader(env, env_options, &fresh_open); + if (fresh_open) open_file_count_++; + return rar; +} + +std::shared_ptr BlobDBImpl::NewBlobFile(const std::string& reason) { + uint64_t file_num = next_file_number_++; + auto bfile = std::make_shared(this, blob_dir_, file_num); + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "New blob file created: %s reason='%s'", bfile->PathName().c_str(), + reason.c_str()); + LogFlush(db_options_.info_log); + return bfile; +} + +Status BlobDBImpl::CreateWriterLocked(const std::shared_ptr& bfile) { + std::string fpath(bfile->PathName()); + std::unique_ptr wfile; + + // We are having issue that we write duplicate blob to blob file and the bug + // is related to writable file buffer. Force no buffer until we fix the bug. 
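+  // The workaround: take a local copy of the env options and zero out
+  // writable_file_max_buffer_size, which should make the WritableFileWriter
+  // created below push each append straight to the file rather than staging
+  // it in a user-space buffer. The shared env_options_ member is untouched.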
+ EnvOptions env_options = env_options_; + env_options.writable_file_max_buffer_size = 0; + + Status s = myenv_->ReopenWritableFile(fpath, &wfile, env_options); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to open blob file for write: %s status: '%s'" + " exists: '%s'", + fpath.c_str(), s.ToString().c_str(), + myenv_->FileExists(fpath).ToString().c_str()); + return s; + } + + std::unique_ptr fwriter; + fwriter.reset(new WritableFileWriter(std::move(wfile), env_options)); + + uint64_t boffset = bfile->GetFileSize(); + if (debug_level_ >= 2 && boffset) { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Open blob file: %s with offset: %d", fpath.c_str(), boffset); + } + + Writer::ElemType et = Writer::kEtNone; + if (bfile->file_size_ == BlobLogHeader::kHeaderSize) + et = Writer::kEtFileHdr; + else if (bfile->file_size_ > BlobLogHeader::kHeaderSize) + et = Writer::kEtFooter; + else if (bfile->file_size_) { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Open blob file: %s with wrong size: %d", fpath.c_str(), boffset); + return Status::Corruption("Invalid blob file size"); + } + + bfile->log_writer_ = std::make_shared( + std::move(fwriter), bfile->file_number_, bdb_options_.bytes_per_sync, + db_options_.use_fsync, boffset); + bfile->log_writer_->last_elem_type_ = et; + + return s; +} + +std::shared_ptr BlobDBImpl::FindBlobFileLocked( + uint32_t expiration) const { + if (open_blob_files_.empty()) return nullptr; + + std::shared_ptr tmp = std::make_shared(); + tmp->ttl_range_ = std::make_pair(expiration, 0); + + auto citr = open_blob_files_.equal_range(tmp); + if (citr.first == open_blob_files_.end()) { + assert(citr.second == open_blob_files_.end()); + + std::shared_ptr check = *(open_blob_files_.rbegin()); + return (check->ttl_range_.second < expiration) ? nullptr : check; + } + + if (citr.first != citr.second) return *(citr.first); + + auto finditr = citr.second; + if (finditr != open_blob_files_.begin()) --finditr; + + bool b2 = (*finditr)->ttl_range_.second < expiration; + bool b1 = (*finditr)->ttl_range_.first > expiration; + + return (b1 || b2) ? 
nullptr : (*finditr); +} + +std::shared_ptr BlobDBImpl::CheckOrCreateWriterLocked( + const std::shared_ptr& bfile) { + std::shared_ptr writer = bfile->GetWriter(); + if (writer) return writer; + + Status s = CreateWriterLocked(bfile); + if (!s.ok()) return nullptr; + + writer = bfile->GetWriter(); + return writer; +} + +void BlobDBImpl::UpdateWriteOptions(const WriteOptions& options) { + if (!wo_set_.load(std::memory_order_relaxed)) { + // DCLP + WriteLock wl(&mutex_); + if (!wo_set_.load(std::memory_order_acquire)) { + wo_set_.store(true, std::memory_order_release); + write_options_ = options; + } + } +} + +std::shared_ptr BlobDBImpl::SelectBlobFile() { + uint32_t val = blob_rgen.Next(); + { + ReadLock rl(&mutex_); + if (open_simple_files_.size() == bdb_options_.num_concurrent_simple_blobs) + return open_simple_files_[val % bdb_options_.num_concurrent_simple_blobs]; + } + + std::shared_ptr bfile = NewBlobFile("SelectBlobFile"); + assert(bfile); + + // file not visible, hence no lock + std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); + if (!writer) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to get writer from blob file: %s", bfile->PathName().c_str()); + return nullptr; + } + + bfile->file_size_ = BlobLogHeader::kHeaderSize; + bfile->header_.compression_ = bdb_options_.compression; + bfile->header_valid_ = true; + + // CHECK again + WriteLock wl(&mutex_); + if (open_simple_files_.size() == bdb_options_.num_concurrent_simple_blobs) { + return open_simple_files_[val % bdb_options_.num_concurrent_simple_blobs]; + } + + Status s = writer->WriteHeader(bfile->header_); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to write header to new blob file: %s" + " status: '%s'", + bfile->PathName().c_str(), s.ToString().c_str()); + return nullptr; + } + + dir_change_.store(true); + blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); + open_simple_files_.push_back(bfile); + return bfile; +} + +std::shared_ptr BlobDBImpl::SelectBlobFileTTL(uint32_t expiration) { + uint64_t epoch_read = 0; + std::shared_ptr bfile; + { + ReadLock rl(&mutex_); + bfile = FindBlobFileLocked(expiration); + epoch_read = epoch_of_.load(); + } + + if (bfile) { + assert(!bfile->Immutable()); + return bfile; + } + + uint32_t exp_low = + (expiration / bdb_options_.ttl_range_secs) * bdb_options_.ttl_range_secs; + uint32_t exp_high = exp_low + bdb_options_.ttl_range_secs; + ttlrange_t ttl_guess = std::make_pair(exp_low, exp_high); + + bfile = NewBlobFile("SelectBlobFileTTL"); + assert(bfile); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "New blob file TTL range: %s %d %d", bfile->PathName().c_str(), exp_low, + exp_high); + LogFlush(db_options_.info_log); + + // we don't need to take lock as no other thread is seeing bfile yet + std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); + if (!writer) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to get writer from blob file with TTL: %s", + bfile->PathName().c_str()); + return nullptr; + } + + bfile->header_.set_ttl_guess(ttl_guess); + bfile->header_.compression_ = bdb_options_.compression; + bfile->header_valid_ = true; + bfile->file_size_ = BlobLogHeader::kHeaderSize; + + // set the first value of the range, since that is + // concrete at this time. also necessary to add to open_blob_files_ + bfile->ttl_range_ = ttl_guess; + + WriteLock wl(&mutex_); + // in case the epoch has shifted in the interim, then check + // check condition again - should be rare. 
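+  // This is optimistic creation: the epoch was sampled under the read lock,
+  // the candidate file was built with no lock held, and only now, under the
+  // write lock, is the epoch compared again. If another thread won the race
+  // and already registered a file covering this expiration, that file is
+  // returned and the locally built one is simply dropped; it was never
+  // published in blob_files_ or open_blob_files_, so nothing needs undoing.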
+ if (epoch_of_.load() != epoch_read) { + auto bfile2 = FindBlobFileLocked(expiration); + if (bfile2) return bfile2; + } + + Status s = writer->WriteHeader(bfile->header_); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to write header to new blob file: %s" + " status: '%s'", + bfile->PathName().c_str(), s.ToString().c_str()); + return nullptr; + } + + dir_change_.store(true); + blob_files_.insert(std::make_pair(bfile->BlobFileNumber(), bfile)); + open_blob_files_.insert(bfile); + epoch_of_++; + + return bfile; +} + +bool BlobDBImpl::ExtractTTLFromBlob(const Slice& value, Slice* newval, + int32_t* ttl_val) { + *newval = value; + *ttl_val = -1; + if (value.size() <= BlobDB::kTTLSuffixLength) return false; + + int32_t ttl_tmp = + DecodeFixed32(value.data() + value.size() - sizeof(int32_t)); + std::string ttl_exp(value.data() + value.size() - BlobDB::kTTLSuffixLength, + 4); + if (ttl_exp != "ttl:") return false; + + newval->remove_suffix(BlobDB::kTTLSuffixLength); + *ttl_val = ttl_tmp; + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +// A specific pattern is looked up at the end of the value part. +// ttl:TTLVAL . if this pattern is found, PutWithTTL is called, otherwise +// regular Put is called. +//////////////////////////////////////////////////////////////////////////////// +Status BlobDBImpl::Put(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value) { + Slice newval; + int32_t ttl_val; + if (bdb_options_.extract_ttl_fn) { + bdb_options_.extract_ttl_fn(value, &newval, &ttl_val); + return PutWithTTL(options, column_family, key, newval, ttl_val); + } + + return PutWithTTL(options, column_family, key, value, -1); +} + +Status BlobDBImpl::Delete(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key) { + SequenceNumber lsn = db_impl_->GetLatestSequenceNumber(); + Status s = db_->Delete(options, column_family, key); + + // add deleted key to list of keys that have been deleted for book-keeping + delete_keys_q_.enqueue({column_family, key.ToString(), lsn}); + return s; +} + +Status BlobDBImpl::SingleDelete(const WriteOptions& wopts, + ColumnFamilyHandle* column_family, + const Slice& key) { + SequenceNumber lsn = db_impl_->GetLatestSequenceNumber(); + Status s = db_->SingleDelete(wopts, column_family, key); + + delete_keys_q_.enqueue({column_family, key.ToString(), lsn}); + return s; +} + +Status BlobDBImpl::Write(const WriteOptions& opts, WriteBatch* updates) { + class Handler1 : public WriteBatch::Handler { + public: + explicit Handler1(BlobDBImpl* i) : impl(i), previous_put(false) {} + + BlobDBImpl* impl; + WriteBatch updates_blob; + Status batch_rewrite_status; + std::shared_ptr last_file; + bool previous_put; + + virtual Status PutCF(uint32_t column_family_id, const Slice& key, + const Slice& value_unc) override { + Slice newval; + int32_t ttl_val = -1; + if (impl->bdb_options_.extract_ttl_fn) { + impl->bdb_options_.extract_ttl_fn(value_unc, &newval, &ttl_val); + } else { + newval = value_unc; + } + + int32_t expiration = -1; + if (ttl_val != -1) { + std::time_t cur_t = std::chrono::system_clock::to_time_t( + std::chrono::system_clock::now()); + expiration = ttl_val + static_cast(cur_t); + } + std::shared_ptr bfile = + (ttl_val != -1) ? impl->SelectBlobFileTTL(expiration) + : ((last_file) ? 
last_file : impl->SelectBlobFile()); + if (last_file && last_file != bfile) { + batch_rewrite_status = Status::NotFound("too many blob files"); + return batch_rewrite_status; + } + + if (!bfile) { + batch_rewrite_status = Status::NotFound("blob file not found"); + return batch_rewrite_status; + } + + Slice value = value_unc; + std::string compression_output; + if (impl->bdb_options_.compression != kNoCompression) { + CompressionType ct = impl->bdb_options_.compression; + CompressionOptions compression_opts; + value = CompressBlock(value_unc, compression_opts, &ct, + kBlockBasedTableVersionFormat, Slice(), + &compression_output); + } + + std::string headerbuf; + Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1); + + if (previous_put) { + impl->AppendSN(last_file, -1); + previous_put = false; + } + + last_file = bfile; + + std::string index_entry; + Status st = impl->AppendBlob(bfile, headerbuf, key, value, &index_entry); + + if (expiration != -1) + extendTTL(&(bfile->ttl_range_), (uint32_t)expiration); + + if (!st.ok()) { + batch_rewrite_status = st; + } else { + previous_put = true; + WriteBatchInternal::Put(&updates_blob, column_family_id, key, + index_entry); + } + return Status::OK(); + } + + virtual Status MergeCF(uint32_t column_family_id, const Slice& key, + const Slice& value) override { + batch_rewrite_status = + Status::NotSupported("Not supported operation in blob db."); + return batch_rewrite_status; + } + + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + WriteBatchInternal::Delete(&updates_blob, column_family_id, key); + return Status::OK(); + } + + virtual void LogData(const Slice& blob) override { + updates_blob.PutLogData(blob); + } + + private: + }; + + Handler1 handler1(this); + updates->Iterate(&handler1); + + Status s; + SequenceNumber lsn = db_impl_->GetLatestSequenceNumber(); + + if (!handler1.batch_rewrite_status.ok()) { + return handler1.batch_rewrite_status; + } else { + s = db_->Write(opts, &(handler1.updates_blob)); + } + + if (!s.ok()) return s; + + if (handler1.previous_put) { + // this is the sequence number of the write. + SequenceNumber sn = WriteBatchInternal::Sequence(&handler1.updates_blob); + AppendSN(handler1.last_file, sn); + + CloseIf(handler1.last_file); + } + + // add deleted key to list of keys that have been deleted for book-keeping + class Handler2 : public WriteBatch::Handler { + public: + explicit Handler2(BlobDBImpl* i, const SequenceNumber& sn) + : impl(i), lsn(sn) {} + + virtual Status DeleteCF(uint32_t column_family_id, + const Slice& key) override { + ColumnFamilyHandle* cfh = + impl->db_impl_->GetColumnFamilyHandleUnlocked(column_family_id); + + impl->delete_keys_q_.enqueue({cfh, key.ToString(), lsn}); + return Status::OK(); + } + + private: + BlobDBImpl* impl; + SequenceNumber lsn; + }; + + // add deleted key to list of keys that have been deleted for book-keeping + Handler2 handler2(this, lsn); + updates->Iterate(&handler2); + + return Status::OK(); +} + +Status BlobDBImpl::PutWithTTL(const WriteOptions& options, + ColumnFamilyHandle* column_family, + const Slice& key, const Slice& value, + int32_t ttl) { + return PutUntil( + options, column_family, key, value, + (ttl != -1) + ? 
ttl + static_cast(std::chrono::system_clock::to_time_t( + std::chrono::system_clock::now())) + : -1); +} + +Status BlobDBImpl::PutUntil(const WriteOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + const Slice& value_unc, int32_t expiration) { + UpdateWriteOptions(options); + + std::shared_ptr bfile = + (expiration != -1) ? SelectBlobFileTTL(expiration) : SelectBlobFile(); + + if (!bfile) return Status::NotFound("Blob file not found"); + + Slice value = value_unc; + std::string compression_output; + if (bdb_options_.compression != kNoCompression) { + CompressionType ct = bdb_options_.compression; + CompressionOptions compression_opts; + value = CompressBlock(value_unc, compression_opts, &ct, + kBlockBasedTableVersionFormat, Slice(), + &compression_output); + } + + std::string headerbuf; + Writer::ConstructBlobHeader(&headerbuf, key, value, expiration, -1); + + // this is another more safer way to do it, where you keep the writeLock + // for the entire write path. this will increase latency and reduce + // throughput + // WriteLock lockbfile_w(&bfile->mutex_); + // std::shared_ptr writer = + // CheckOrCreateWriterLocked(bfile); + + if (debug_level_ >= 3) + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + ">Adding KEY FILE: %s: KEY: %s VALSZ: %d", bfile->PathName().c_str(), + key.ToString().c_str(), value.size()); + + std::string index_entry; + Status s = AppendBlob(bfile, headerbuf, key, value, &index_entry); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to append blob to FILE: %s: KEY: %s VALSZ: %d" + " status: '%s' blob_file: '%s'", + bfile->PathName().c_str(), key.ToString().c_str(), value.size(), + s.ToString().c_str(), bfile->DumpState().c_str()); + // Fallback just write to the LSM and get going + WriteBatch batch; + batch.Put(column_family, key, value); + return db_->Write(options, &batch); + } + + WriteBatch batch; + batch.Put(column_family, key, index_entry); + + // this goes to the base db and can be expensive + s = db_->Write(options, &batch); + + // this is the sequence number of the write. + SequenceNumber sn = WriteBatchInternal::Sequence(&batch); + + if (debug_level_ >= 3) + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "PathName().c_str(), + key.ToString().c_str(), sn); + + s = AppendSN(bfile, sn); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to append SN to FILE: %s: KEY: %s VALSZ: %d" + " status: '%s' blob_file: '%s'", + bfile->PathName().c_str(), key.ToString().c_str(), value.size(), + s.ToString().c_str(), bfile->DumpState().c_str()); + } + + if (expiration != -1) extendTTL(&(bfile->ttl_range_), (uint32_t)expiration); + + CloseIf(bfile); + + return s; +} + +Status BlobDBImpl::AppendBlob(const std::shared_ptr& bfile, + const std::string& headerbuf, const Slice& key, + const Slice& value, std::string* index_entry) { + Status s; + + uint64_t blob_offset = 0; + uint64_t key_offset = 0; + { + WriteLock lockbfile_w(&bfile->mutex_); + std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); + if (!writer) return Status::IOError("Failed to create blob writer"); + + // write the blob to the blob log. 
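+    // On disk this lays the record out as [record header][key][blob]; the
+    // record footer carrying the sequence number is appended separately by
+    // AppendSN() once the LSM write has assigned one. key_offset and
+    // blob_offset come back as the absolute file offsets of the key and
+    // blob payloads (blob_offset is what goes into the index entry below).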
+ s = writer->EmitPhysicalRecord(headerbuf, key, value, &key_offset, + &blob_offset); + } + + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Invalid status in AppendBlob: %s status: '%s'", + bfile->PathName().c_str(), s.ToString().c_str()); + return s; + } + + // increment blob count + bfile->blob_count_++; + auto size_put = BlobLogRecord::kHeaderSize + key.size() + value.size(); + + bfile->file_size_ += size_put; + last_period_write_ += size_put; + total_blob_space_ += size_put; + + BlobHandle handle; + handle.set_filenumber(bfile->BlobFileNumber()); + handle.set_size(value.size()); + handle.set_offset(blob_offset); + handle.set_compression(bdb_options_.compression); + handle.EncodeTo(index_entry); + + if (debug_level_ >= 3) + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + ">Adding KEY FILE: %s: BC: %d OFFSET: %d SZ: %d", + bfile->PathName().c_str(), bfile->blob_count_.load(), blob_offset, + value.size()); + + return s; +} + +Status BlobDBImpl::AppendSN(const std::shared_ptr& bfile, + const SequenceNumber& sn) { + Status s; + { + WriteLock lockbfile_w(&bfile->mutex_); + std::shared_ptr writer = CheckOrCreateWriterLocked(bfile); + if (!writer) return Status::IOError("Failed to create blob writer"); + + s = writer->AddRecordFooter(sn); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Invalid status in AppendSN: %s status: '%s'", + bfile->PathName().c_str(), s.ToString().c_str()); + return s; + } + + if (sn != std::numeric_limits::max()) + extendSN(&(bfile->sn_range_), sn); + } + + bfile->file_size_ += BlobLogRecord::kFooterSize; + last_period_write_ += BlobLogRecord::kFooterSize; + total_blob_space_ += BlobLogRecord::kFooterSize; + return s; +} + +std::vector BlobDBImpl::MultiGet( + const ReadOptions& options, + const std::vector& column_family, + const std::vector& keys, std::vector* values) { + std::vector values_lsm; + values_lsm.resize(keys.size()); + auto statuses = db_->MultiGet(options, column_family, keys, &values_lsm); + + for (size_t i = 0; i < keys.size(); ++i) { + if (!statuses[i].ok()) continue; + + auto cfh = reinterpret_cast(column_family[i]); + auto cfd = cfh->cfd(); + + Status s = CommonGet(cfd, keys[i], values_lsm[i], &((*values)[i])); + statuses[i] = s; + } + return statuses; +} + +Status BlobDBImpl::CommonGet(const ColumnFamilyData* cfd, const Slice& key, + const std::string& index_entry, + std::string* value) { + Slice index_entry_slice(index_entry); + BlobHandle handle; + Status s = handle.DecodeFrom(&index_entry_slice); + if (!s.ok()) return s; + + // offset has to have certain min, as we will read CRC + // later from the Blob Header, which needs to be also a + // valid offset. 
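+  // Concretely: the blob payload can only start after the file header, the
+  // record header and the key, so any offset below
+  // BlobLogHeader::kHeaderSize + BlobLogRecord::kHeaderSize + key.size()
+  // is malformed. The same layout is relied on further down, where the
+  // record CRC is read back from
+  // handle.offset() - (key.size() + sizeof(uint32_t)).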
+ if (handle.offset() < + (BlobLogHeader::kHeaderSize + BlobLogRecord::kHeaderSize + key.size())) { + if (debug_level_ >= 2) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Invalid blob handle file_number: %" PRIu64 " blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " key: %s", + handle.filenumber(), handle.offset(), handle.size(), key.data()); + } + return Status::NotFound("Blob Not Found, although found in LSM"); + } + + std::shared_ptr bfile; + { + ReadLock rl(&mutex_); + auto hitr = blob_files_.find(handle.filenumber()); + + // file was deleted + if (hitr == blob_files_.end()) { + return Status::NotFound("Blob Not Found as blob file missing"); + } + + bfile = hitr->second; + } + + if (bfile->Obsolete()) { + return Status::NotFound( + "Blob Not Found as blob file was garbage collected"); + } + + // 0 - size + if (!handle.size()) { + value->clear(); + return Status::OK(); + } + + // takes locks when called + std::shared_ptr reader = + GetOrOpenRandomAccessReader(bfile, myenv_, env_options_); + + std::string* valueptr = value; + std::string value_c; + if (bdb_options_.compression != kNoCompression) { + valueptr = &value_c; + } + + // allocate the buffer. This is safe in C++11 + valueptr->resize(handle.size()); + char* buffer = &(*valueptr)[0]; + + Slice blob_value; + s = reader->Read(handle.offset(), handle.size(), &blob_value, buffer); + if (!s.ok() || blob_value.size() != handle.size()) { + if (debug_level_ >= 2) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to read blob from file: %s blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " read: %d key: %s status: '%s'", + bfile->PathName().c_str(), handle.offset(), handle.size(), + static_cast(blob_value.size()), key.data(), + s.ToString().c_str()); + } + return Status::NotFound("Blob Not Found as couldnt retrieve Blob"); + } + + Slice crc_slice; + uint32_t crc_exp; + std::string crc_str; + crc_str.resize(sizeof(uint32_t)); + char* crc_buffer = &(crc_str[0]); + s = reader->Read(handle.offset() - (key.size() + sizeof(uint32_t)), + sizeof(uint32_t), &crc_slice, crc_buffer); + if (!s.ok() || !GetFixed32(&crc_slice, &crc_exp)) { + if (debug_level_ >= 2) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failed to fetch blob crc file: %s blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " key: %s status: '%s'", + bfile->PathName().c_str(), handle.offset(), handle.size(), key.data(), + s.ToString().c_str()); + } + return Status::NotFound("Blob Not Found as couldnt retrieve CRC"); + } + + uint32_t crc = crc32c::Extend(0, blob_value.data(), blob_value.size()); + crc = crc32c::Mask(crc); // Adjust for storage + if (crc != crc_exp) { + if (debug_level_ >= 2) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Blob crc mismatch file: %s blob_offset: %" PRIu64 + " blob_size: %" PRIu64 " key: %s status: '%s'", + bfile->PathName().c_str(), handle.offset(), handle.size(), key.data(), + s.ToString().c_str()); + } + return Status::Corruption("Corruption. 
Blob CRC mismatch"); + } + + if (bdb_options_.compression != kNoCompression) { + BlockContents contents; + s = UncompressBlockContentsForCompressionType( + blob_value.data(), blob_value.size(), &contents, + kBlockBasedTableVersionFormat, Slice(), bdb_options_.compression, + *(cfd->ioptions())); + *value = contents.data.ToString(); + } + + return s; +} + +Status BlobDBImpl::Get(const ReadOptions& options, + ColumnFamilyHandle* column_family, const Slice& key, + std::string* value) { + auto cfh = reinterpret_cast(column_family); + auto cfd = cfh->cfd(); + + Status s; + std::string index_entry; + s = db_->Get(options, column_family, key, &index_entry); + if (!s.ok()) { + if (debug_level_ >= 3) + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "Get Failed on LSM KEY: %s status: '%s'", key.ToString().c_str(), + s.ToString().c_str()); + return s; + } + + s = CommonGet(cfd, key, index_entry, value); + return s; +} + +Slice BlobDBIterator::value() const { + Slice index_entry = iter_->value(); + + auto cfh = reinterpret_cast(cfh_); + auto cfd = cfh->cfd(); + + Status s = db_impl_->CommonGet(cfd, iter_->key(), index_entry.ToString(false), + &vpart_); + return Slice(vpart_); +} + +std::pair BlobDBImpl::SanityCheck(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Starting Sanity Check"); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Number of files %" PRIu64, blob_files_.size()); + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Number of open files %" PRIu64, open_blob_files_.size()); + + for (auto bfile : open_blob_files_) { + assert(!bfile->Immutable()); + } + + std::time_t epoch_now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + + for (auto bfile_pair : blob_files_) { + auto bfile = bfile_pair.second; + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Blob File %s %" PRIu64 " %" PRIu64 " %" PRIu64 " %" PRIu64 " %d", + bfile->PathName().c_str(), bfile->GetFileSize(), bfile->BlobCount(), + bfile->deleted_count_, bfile->deleted_size_, + (bfile->ttl_range_.second - epoch_now)); + } + + // reschedule + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::CloseSeqWrite( + std::shared_ptr bfile, bool aborted) { + { + WriteLock wl(&mutex_); + + // this prevents others from picking up this file + open_blob_files_.erase(bfile); + + auto findit = + std::find(open_simple_files_.begin(), open_simple_files_.end(), bfile); + if (findit != open_simple_files_.end()) open_simple_files_.erase(findit); + } + + if (!bfile->closed_.load()) { + WriteLock lockbfile_w(&bfile->mutex_); + bfile->WriteFooterAndCloseLocked(); + } + + return std::make_pair(false, -1); +} + +void BlobDBImpl::CloseIf(const std::shared_ptr& bfile) { + // atomic read + bool close = bfile->GetFileSize() > bdb_options_.blob_file_size; + if (!close) return; + + if (debug_level_ >= 2) { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Scheduling file for close %s fsize: %" PRIu64 " limit: %" PRIu64, + bfile->PathName().c_str(), bfile->GetFileSize(), + bdb_options_.blob_file_size); + } + + { + WriteLock wl(&mutex_); + + open_blob_files_.erase(bfile); + auto findit = + std::find(open_simple_files_.begin(), open_simple_files_.end(), bfile); + if (findit != open_simple_files_.end()) { + open_simple_files_.erase(findit); + } else { + Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, + "File not found while closing %s fsize: %" PRIu64 + " Multithreaded Writes?", + bfile->PathName().c_str(), 
bfile->GetFileSize()); + } + } + + tqueue_.add(0, std::bind(&BlobDBImpl::CloseSeqWrite, this, bfile, + std::placeholders::_1)); +} + +bool BlobDBImpl::FileDeleteOk_SnapshotCheckLocked( + const std::shared_ptr& bfile) { + assert(bfile->Obsolete()); + + SequenceNumber esn = bfile->GetSNRange().first; + + // this is not correct. + // you want to check that there are no snapshots in the + bool notok = db_impl_->HasActiveSnapshotLaterThanSN(esn); + if (notok) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Could not delete file due to snapshot failure %s", + bfile->PathName().c_str()); + return false; + } else { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Will delete file due to snapshot success %s", + bfile->PathName().c_str()); + return true; + } +} + +bool BlobDBImpl::FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size, + uint64_t blob_offset, + uint64_t blob_size) { + (void)blob_offset; + std::shared_ptr bfile; + { + ReadLock rl(&mutex_); + auto hitr = blob_files_.find(file_number); + + // file was deleted + if (hitr == blob_files_.end()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Could not find file_number %" PRIu64, file_number); + return false; + } + + bfile = hitr->second; + } + + WriteLock lockbfile_w(&bfile->mutex_); + + bfile->deleted_count_++; + bfile->deleted_size_ += key_size + blob_size + BlobLogRecord::kHeaderSize + + BlobLogRecord::kFooterSize; + return true; +} + +bool BlobDBImpl::MarkBlobDeleted(const Slice& key, const Slice& lsmValue) { + Slice val(lsmValue); + BlobHandle handle; + Status s = handle.DecodeFrom(&val); + if (!s.ok()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Could not parse lsm val in MarkBlobDeleted %s", + lsmValue.ToString().c_str()); + return false; + } + bool succ = FindFileAndEvictABlob(handle.filenumber(), key.size(), + handle.offset(), handle.size()); + return succ; +} + +std::pair BlobDBImpl::EvictCompacted(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + override_packet_t packet; + while (override_vals_q_.dequeue(&packet)) { + bool succ = FindFileAndEvictABlob(packet.file_number_, packet.key_size_, + packet.blob_offset_, packet.blob_size_); + + if (!succ) + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "EVICT COMPACTION FAILURE SN: %d FN: %d OFFSET: %d SIZE: %d", + packet.dsn_, packet.file_number_, packet.blob_offset_, + packet.blob_size_); + + if (debug_level_ >= 3) + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "EVICT COMPACTED SN: %d FN: %d OFFSET: %d SIZE: %d SUCC: %d", + packet.dsn_, packet.file_number_, packet.blob_offset_, + packet.blob_size_, succ); + } + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::EvictDeletions(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + ColumnFamilyHandle* last_cfh = nullptr; + Options last_op; + + Arena arena; + ScopedArenaIterator iter; + + // we will use same RangeDelAggregator for all cf's. 
+ // essentially we do not support Range Deletes now + std::unique_ptr range_del_agg; + delete_packet_t dpacket; + while (delete_keys_q_.dequeue(&dpacket)) { + if (last_cfh != dpacket.cfh_) { + if (!range_del_agg) { + auto cfhi = reinterpret_cast(dpacket.cfh_); + auto cfd = cfhi->cfd(); + range_del_agg.reset(new RangeDelAggregator(cfd->internal_comparator(), + kMaxSequenceNumber)); + } + + // this can be expensive + last_cfh = dpacket.cfh_; + last_op = db_impl_->GetOptions(last_cfh); + iter.set(db_impl_->NewInternalIterator(&arena, range_del_agg.get(), + dpacket.cfh_)); + // this will not work for multiple CF's. + } + + Slice user_key(dpacket.key_); + InternalKey target(user_key, dpacket.dsn_, kTypeValue); + + Slice eslice = target.Encode(); + iter->Seek(eslice); + + if (!iter->status().ok()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Invalid iterator seek %s", dpacket.key_.c_str()); + continue; + } + + const Comparator* bwc = BytewiseComparator(); + while (iter->Valid()) { + if (!bwc->Equal(ExtractUserKey(iter->key()), ExtractUserKey(eslice))) + break; + + ParsedInternalKey ikey(Slice(), 0, kTypeValue); + if (!ParseInternalKey(iter->key(), &ikey)) { + continue; + } + + // once you hit a DELETE, assume the keys below have been + // processed previously + if (ikey.type == kTypeDeletion || ikey.type == kTypeSingleDeletion) break; + + Slice val = iter->value(); + MarkBlobDeleted(ikey.user_key, val); + + iter->Next(); + } + } + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::CheckSeqFiles(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + std::vector> process_files; + { + std::time_t epoch_now = + std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + + ReadLock rl(&mutex_); + for (auto bfile : open_blob_files_) { + { + ReadLock lockbfile_r(&bfile->mutex_); + + if (bfile->ttl_range_.second > epoch_now) continue; + process_files.push_back(bfile); + } + } + } + + for (auto bfile : process_files) CloseSeqWrite(bfile, false); + + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::FsyncFiles(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + std::vector> process_files; + { + ReadLock rl(&mutex_); + for (auto fitr : open_blob_files_) { + if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync)) + process_files.push_back(fitr); + } + + for (auto fitr : open_simple_files_) { + if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync)) + process_files.push_back(fitr); + } + } + + for (auto fitr : process_files) { + if (fitr->NeedsFsync(true, bdb_options_.bytes_per_sync)) fitr->Fsync(); + } + + bool expected = true; + if (dir_change_.compare_exchange_weak(expected, false)) dir_ent_->Fsync(); + + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::ReclaimOpenFiles(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + if (open_file_count_.load() < bdb_options_.open_files_trigger) + return std::make_pair(true, -1); + + // in the future, we should sort by last_access_ + // instead of closing every file + ReadLock rl(&mutex_); + for (auto const& ent : blob_files_) { + auto bfile = ent.second; + if (bfile->last_access_.load() == -1) continue; + + WriteLock lockbfile_w(&bfile->mutex_); + CloseRandomAccessLocked(bfile); + } + + return std::make_pair(true, -1); +} + +std::pair BlobDBImpl::WaStats(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + WriteLock wl(&mutex_); + + if (all_periods_write_.size() < bdb_options_.wa_num_stats_periods) { + total_periods_write_ -= 
(*all_periods_write_.begin()); + total_periods_ampl_ = (*all_periods_ampl_.begin()); + + all_periods_write_.pop_front(); + all_periods_ampl_.pop_front(); + } + + uint64_t val1 = last_period_write_.load(); + uint64_t val2 = last_period_ampl_.load(); + + all_periods_write_.push_back(val1); + all_periods_ampl_.push_back(val2); + + last_period_write_ = 0; + last_period_ampl_ = 0; + + total_periods_write_ += val1; + total_periods_ampl_ += val2; + + return std::make_pair(true, -1); +} + +//////////////////////////////////////////////////////////////////////////////// +// iterate over the blobs sequentially and check if the blob sequence number +// is the latest. If it is the latest, preserve it, otherwise delete it +// if it is TTL based, and the TTL has expired, then +// we can blow the entity if the key is still the latest or the Key is not +// found +// WHAT HAPPENS IF THE KEY HAS BEEN OVERRIDEN. Then we can drop the blob +// without doing anything if the earliest snapshot is not +// referring to that sequence number, i.e. it is later than the sequence number +// of the new key +// +// if it is not TTL based, then we can blow the key if the key has been +// DELETED in the LSM +//////////////////////////////////////////////////////////////////////////////// +Status BlobDBImpl::GCFileAndUpdateLSM(const std::shared_ptr& bfptr, + GCStats* gcstats) { + std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); + std::time_t tt = std::chrono::system_clock::to_time_t(now); + + std::shared_ptr reader = + bfptr->OpenSequentialReader(myenv_, db_options_, env_options_); + if (!reader) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "File sequential reader could not be opened", + bfptr->PathName().c_str()); + return Status::IOError("failed to create sequential reader"); + } + + BlobLogHeader header; + Status s = reader->ReadHeader(&header); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failure to read header for blob-file %s", bfptr->PathName().c_str()); + return s; + } + + bool first_gc = bfptr->gc_once_after_open_; + + ColumnFamilyHandle* cfh = bfptr->GetColumnFamily(db_); + auto cfhi = reinterpret_cast(cfh); + auto cfd = cfhi->cfd(); + bool has_ttl = header.HasTTL(); + + // this reads the key but skips the blob + Reader::ReadLevel shallow = Reader::kReadHdrKeyFooter; + + assert(opt_db_); + + bool no_relocation_ttl = (has_ttl && tt > bfptr->GetTTLRange().second); + + bool no_relocation_lsmdel = false; + { + ReadLock lockbfile_r(&bfptr->mutex_); + no_relocation_lsmdel = (bfptr->GetFileSize() == + (BlobLogHeader::kHeaderSize + bfptr->deleted_size_ + + BlobLogFooter::kFooterSize)); + } + + bool no_relocation = no_relocation_ttl || no_relocation_lsmdel; + if (!no_relocation) { + // read the blob because you have to write it back to new file + shallow = Reader::kReadHdrKeyBlobFooter; + } + + BlobLogRecord record; + std::shared_ptr newfile; + std::shared_ptr new_writer; + + while (reader->ReadRecord(&record, shallow).ok()) { + gcstats->blob_count++; + + bool del_this = false; + // this particular TTL has expired + if (no_relocation_ttl || (has_ttl && tt > record.GetTTL())) { + del_this = true; + } else { + SequenceNumber seq = kMaxSequenceNumber; + bool found_record_for_key = false; + SuperVersion* sv = db_impl_->GetAndRefSuperVersion(cfd); + if (sv == nullptr) { + Status result = + Status::InvalidArgument("Could not access column family 0"); + return result; + } + Status s1 = db_impl_->GetLatestSequenceForKey( + sv, record.Key(), false, &seq, 
&found_record_for_key); + if (s1.IsNotFound() || (!found_record_for_key || seq != record.GetSN())) { + del_this = true; + } + db_impl_->ReturnAndCleanupSuperVersion(cfd, sv); + } + + if (del_this) { + gcstats->num_deletes++; + gcstats->deleted_size += record.GetBlobSize(); + if (first_gc) continue; + + Transaction* txn = static_cast(opt_db_.get()) + ->BeginTransaction(write_options_); + txn->Delete(cfh, record.Key()); + Status s1 = txn->Commit(); + // chances that this DELETE will fail is low. If it fails, it would be + // because + // a new version of the key came in at this time, which will override + // the current version being iterated on. + if (s1.IsBusy()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "Optimistic transaction failed delete: %s bn: %" PRIu32, + bfptr->PathName().c_str(), gcstats->blob_count); + } else { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Successfully added delete back into LSM: %s bn: %" PRIu32, + bfptr->PathName().c_str(), gcstats->blob_count); + + // assume that failures happen due to new writes. + gcstats->succ_deletes_lsm++; + } + delete txn; + continue; + } else if (first_gc) { + continue; + } + + if (!newfile) { + // new file + std::string reason("GC of "); + reason += bfptr->PathName(); + newfile = NewBlobFile(reason); + gcstats->newfile = newfile; + + new_writer = CheckOrCreateWriterLocked(newfile); + newfile->header_ = std::move(header); + // Can't use header beyond this point + newfile->header_valid_ = true; + newfile->file_size_ = BlobLogHeader::kHeaderSize; + s = new_writer->WriteHeader(newfile->header_); + + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "File: %s - header writing failed", newfile->PathName().c_str()); + return s; + } + + WriteLock wl(&mutex_); + + dir_change_.store(true); + blob_files_.insert(std::make_pair(newfile->BlobFileNumber(), newfile)); + } + + gcstats->num_relocs++; + std::string index_entry; + + uint64_t blob_offset = 0; + uint64_t key_offset = 0; + // write the blob to the blob log. + s = new_writer->AddRecord(record.Key(), record.Blob(), &key_offset, + &blob_offset, record.GetTTL()); + + BlobHandle handle; + handle.set_filenumber(newfile->BlobFileNumber()); + handle.set_size(record.Blob().size()); + handle.set_offset(blob_offset); + handle.set_compression(bdb_options_.compression); + handle.EncodeTo(&index_entry); + + new_writer->AddRecordFooter(record.GetSN()); + newfile->blob_count_++; + newfile->file_size_ += BlobLogRecord::kHeaderSize + record.Key().size() + + record.Blob().size() + BlobLogRecord::kFooterSize; + + Transaction* txn = static_cast(opt_db_.get()) + ->BeginTransaction(write_options_); + txn->Put(cfh, record.Key(), index_entry); + Status s1 = txn->Commit(); + // chances that this Put will fail is low. If it fails, it would be because + // a new version of the key came in at this time, which will override + // the current version being iterated on. 
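+    // An optimistic transaction validates only at Commit(), which reports a
+    // conflict as Status::Busy(). Losing that race here is harmless: a newer
+    // user write supersedes the relocated blob, so the miss is just logged
+    // and the GC pass moves on.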
+    if (s1.IsBusy()) {
+      Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+          "Optimistic transaction failed: %s put bn: %" PRIu32,
+          bfptr->PathName().c_str(), gcstats->blob_count);
+    } else {
+      gcstats->succ_relocs++;
+      Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
+          "Successfully added put back into LSM: %s bn: %" PRIu32,
+          bfptr->PathName().c_str(), gcstats->blob_count);
+    }
+    delete txn;
+  }
+
+  if (gcstats->newfile) total_blob_space_ += newfile->file_size_;
+
+  Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+      "File: %s Num deletes %" PRIu32 " Num relocs: %" PRIu32
+      " Succ Deletes: %" PRIu32 " Succ relocs: %" PRIu32,
+      bfptr->PathName().c_str(), gcstats->num_deletes, gcstats->num_relocs,
+      gcstats->succ_deletes_lsm, gcstats->succ_relocs);
+
+  return s;
+}
+
+// Ideally we should hold the lock during the entire function,
+// but under the assumption that this is only called when a
+// file is Immutable, we can reduce the critical section
+bool BlobDBImpl::ShouldGCFile(std::shared_ptr<BlobFile> bfile, std::time_t tt,
+                              uint64_t last_id, std::string* reason) {
+  if (bfile->HasTTL()) {
+    ttlrange_t ttl_range = bfile->GetTTLRange();
+    if (tt > ttl_range.second) {
+      *reason = "entire file ttl expired";
+      return true;
+    }
+
+    if (!bfile->file_size_.load()) {
+      Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
+          "Invalid file size = 0 %s", bfile->PathName().c_str());
+      *reason = "file is empty";
+      return false;
+    }
+
+    if (bfile->gc_once_after_open_.load()) {
+      return true;
+    }
+
+    if (bdb_options_.ttl_range_secs <
+        bdb_options_.partial_expiration_gc_range_secs) {
+      *reason = "has ttl but partial expiration not turned on";
+      return false;
+    }
+
+    ReadLock lockbfile_r(&bfile->mutex_);
+    bool ret = ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) >
+                bdb_options_.partial_expiration_pct);
+    if (ret) {
+      *reason = "deleted blobs beyond threshold";
+    } else {
+      *reason = "deleted blobs below threshold";
+    }
+    return ret;
+  }
+
+  // when crash happens, we lose the in-memory account of deleted blobs.
+ // we are therefore forced to do one GC to make sure delete accounting + // is OK + if (bfile->gc_once_after_open_.load()) { + return true; + } + + ReadLock lockbfile_r(&bfile->mutex_); + + if ((bfile->deleted_size_ * 100.0 / bfile->file_size_.load()) > + bdb_options_.partial_expiration_pct) { + *reason = "deleted simple blobs beyond threshold"; + return true; + } + + // if we haven't reached limits of disk space, don't DELETE + if (total_blob_space_.load() < bdb_options_.blob_dir_size) { + *reason = "disk space not exceeded"; + return false; + } + + bool ret = bfile->BlobFileNumber() == last_id; + if (ret) { + *reason = "eligible last simple blob file"; + } else { + *reason = "not eligible since not last simple blob file"; + } + return ret; +} + +std::pair BlobDBImpl::DeleteObsFiles(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + { + ReadLock rl(&mutex_); + if (obsolete_files_.empty()) return std::make_pair(true, -1); + } + + std::list> tobsolete; + { + WriteLock wl(&mutex_); + tobsolete.swap(obsolete_files_); + } + + bool file_deleted = false; + for (auto iter = tobsolete.begin(); iter != tobsolete.end();) { + auto bfile = *iter; + { + ReadLock lockbfile_r(&bfile->mutex_); + if (!FileDeleteOk_SnapshotCheckLocked(bfile)) { + ++iter; + continue; + } + } + + Status s = myenv_->DeleteFile(bfile->PathName()); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "File failed to be deleted as obsolete %s", + bfile->PathName().c_str()); + ++iter; + continue; + } + + file_deleted = true; + total_blob_space_ -= bfile->file_size_; + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "File deleted as obsolete from blob dir %s", bfile->PathName().c_str()); + + iter = tobsolete.erase(iter); + } + + // directory change. 
Fsync + if (file_deleted) dir_ent_->Fsync(); + + // put files back into obsolete if for some reason, delete failed + if (!tobsolete.empty()) { + WriteLock wl(&mutex_); + for (auto bfile : tobsolete) obsolete_files_.push_front(bfile); + } + + return std::make_pair(!aborted, -1); +} + +bool BlobDBImpl::CallbackEvictsImpl(std::shared_ptr bfile) { + std::shared_ptr reader = + bfile->OpenSequentialReader(myenv_, db_options_, env_options_); + if (!reader) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "File sequential reader could not be opened for evict callback: %s", + bfile->PathName().c_str()); + return false; + } + + ReadLock lockbfile_r(&bfile->mutex_); + + BlobLogHeader header; + Status s = reader->ReadHeader(&header); + if (!s.ok()) { + Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, + "Failure to read header for blob-file during evict callback %s", + bfile->PathName().c_str()); + return false; + } + + ColumnFamilyHandle* cfh = bfile->GetColumnFamily(db_); + BlobLogRecord record; + Reader::ReadLevel full = Reader::kReadHdrKeyBlobFooter; + while (reader->ReadRecord(&record, full).ok()) { + bdb_options_.gc_evict_cb_fn(cfh, record.Key(), record.Blob()); + } + + return true; +} + +std::pair BlobDBImpl::RemoveTimerQ(TimerQueue* tq, + bool aborted) { + WriteLock wl(&mutex_); + for (auto itr = cb_threads_.begin(); itr != cb_threads_.end(); ++itr) { + if ((*itr).get() != tq) continue; + + cb_threads_.erase(itr); + break; + } + return std::make_pair(false, -1); +} + +std::pair BlobDBImpl::CallbackEvicts( + TimerQueue* tq, std::shared_ptr bfile, bool aborted) { + if (aborted) return std::make_pair(false, -1); + bool succ = CallbackEvictsImpl(bfile); + if (succ) { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "Eviction callbacks completed %s", bfile->PathName().c_str()); + } + + WriteLock wl(&mutex_); + bfile->SetCanBeDeleted(); + obsolete_files_.push_front(bfile); + if (tq) { + // all of the callbacks have been processed + tqueue_.add(0, std::bind(&BlobDBImpl::RemoveTimerQ, this, tq, + std::placeholders::_1)); + } + return std::make_pair(false, -1); +} + +void BlobDBImpl::CopyBlobFiles( + std::vector>* bfiles_copy, uint64_t* last_id) { + ReadLock rl(&mutex_); + + // take a copy + bfiles_copy->reserve(blob_files_.size()); + for (auto const& ent : blob_files_) { + bfiles_copy->push_back(ent.second); + + // A. has ttl is immutable, once set, hence no locks required + // B. blob files are sorted based on number(i.e. index of creation ) + // so we will return the last blob file + if (!ent.second->HasTTL()) *last_id = ent.second->BlobFileNumber(); + } +} + +void BlobDBImpl::FilterSubsetOfFiles( + const std::vector>& blob_files, + std::vector>* to_process, uint64_t epoch, + uint64_t last_id, size_t files_to_collect) { + // 100.0 / 15.0 = 7 + uint64_t next_epoch_increment = static_cast( + std::ceil(100 / static_cast(bdb_options_.gc_file_pct))); + std::chrono::system_clock::time_point now = std::chrono::system_clock::now(); + std::time_t tt = std::chrono::system_clock::to_time_t(now); + + size_t files_processed = 0; + for (auto bfile : blob_files) { + if (files_processed >= files_to_collect) break; + // if this is the first time processing the file + // i.e. gc_epoch == -1, process it. + // else process the file if its processing epoch matches + // the current epoch. 
Typically the #of epochs should be + // around 5-10 + if (bfile->gc_epoch_ != -1 && (uint64_t)bfile->gc_epoch_ != epoch) { + continue; + } + + files_processed++; + // reset the epoch + bfile->gc_epoch_ = epoch + next_epoch_increment; + + // file has already been GC'd or is still open for append, + // then it should not be GC'd + if (bfile->Obsolete() || !bfile->Immutable()) continue; + + std::string reason; + bool shouldgc = ShouldGCFile(bfile, tt, last_id, &reason); + if (!shouldgc) { + Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log, + "File has been skipped for GC ttl %s %d %d reason='%s'", + bfile->PathName().c_str(), tt, bfile->GetTTLRange().second, + reason.c_str()); + continue; + } + + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "File has been chosen for GC ttl %s %d %d reason='%s'", + bfile->PathName().c_str(), tt, bfile->GetTTLRange().second, + reason.c_str()); + to_process->push_back(bfile); + } +} + +std::pair BlobDBImpl::RunGC(bool aborted) { + if (aborted) return std::make_pair(false, -1); + + current_epoch_++; + + // collect the ID of the last regular file, in case we need to GC it. + uint64_t last_id = std::numeric_limits::max(); + + std::vector> blob_files; + CopyBlobFiles(&blob_files, &last_id); + + if (!blob_files.size()) return std::make_pair(true, -1); + + // 15% of files are collected each call to space out the IO and CPU + // consumption. + size_t files_to_collect = + (bdb_options_.gc_file_pct * blob_files.size()) / 100; + + std::vector> to_process; + FilterSubsetOfFiles(blob_files, &to_process, current_epoch_, last_id, + files_to_collect); + + // in this collect the set of files, which became obsolete + std::vector> obsoletes; + for (auto bfile : to_process) { + GCStats gcstats; + Status s = GCFileAndUpdateLSM(bfile, &gcstats); + if (!s.ok()) continue; + + if (bfile->gc_once_after_open_.load()) { + WriteLock lockbfile_w(&bfile->mutex_); + + bfile->deleted_size_ = gcstats.deleted_size; + bfile->deleted_count_ = gcstats.num_deletes; + bfile->gc_once_after_open_ = false; + } else { + obsoletes.push_back(bfile); + } + } + + if (!obsoletes.empty()) { + bool evict_cb = (!!bdb_options_.gc_evict_cb_fn); + std::shared_ptr tq; + if (evict_cb) tq = std::make_shared(); + + // if evict callback is present, first schedule the callback thread + WriteLock wl(&mutex_); + for (auto bfile : obsoletes) { + bool last_file = (bfile == obsoletes.back()); + // remove from global list so writers + blob_files_.erase(bfile->BlobFileNumber()); + + if (!evict_cb) { + bfile->SetCanBeDeleted(); + obsolete_files_.push_front(bfile); + } else { + tq->add(0, std::bind(&BlobDBImpl::CallbackEvicts, this, + (last_file) ? tq.get() : nullptr, bfile, + std::placeholders::_1)); + } + } + if (evict_cb) cb_threads_.emplace_back(tq); + } + + // reschedule + return std::make_pair(true, -1); +} + +Iterator* BlobDBImpl::NewIterator(const ReadOptions& opts, + ColumnFamilyHandle* column_family) { + return new BlobDBIterator(db_->NewIterator(opts, column_family), + column_family, this); +} + +} // namespace blob_db +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_db_impl.h b/utilities/blob_db/blob_db_impl.h new file mode 100644 index 000000000..5b9d1fba7 --- /dev/null +++ b/utilities/blob_db/blob_db_impl.h @@ -0,0 +1,657 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. + +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rocksdb/compaction_filter.h" +#include "rocksdb/db.h" +#include "rocksdb/listener.h" +#include "rocksdb/options.h" +#include "rocksdb/wal_filter.h" +#include "util/file_reader_writer.h" +#include "util/mpsc.h" +#include "util/mutexlock.h" +#include "util/timer_queue.h" +#include "utilities/blob_db/blob_db.h" +#include "utilities/blob_db/blob_db_options_impl.h" +#include "utilities/blob_db/blob_log_format.h" +#include "utilities/blob_db/blob_log_reader.h" +#include "utilities/blob_db/blob_log_writer.h" + +namespace rocksdb { + +class DBImpl; +class ColumnFamilyHandle; +class ColumnFamilyData; +class OptimisticTransactionDBImpl; +struct FlushJobInfo; + +namespace blob_db { + +class BlobFile; +class BlobDBImpl; +struct GCStats; + +class BlobDBFlushBeginListener : public EventListener { + public: + explicit BlobDBFlushBeginListener() : impl_(nullptr) {} + + void OnFlushBegin(DB* db, const FlushJobInfo& info) override; + + void SetImplPtr(BlobDBImpl* p) { impl_ = p; } + + protected: + BlobDBImpl* impl_; +}; + +// this implements the callback from the WAL which ensures that the +// blob record is present in the blob log. If fsync/fdatasync in not +// happening on every write, there is the probability that keys in the +// blob log can lag the keys in blobs +class BlobReconcileWalFilter : public WalFilter { + public: + virtual WalFilter::WalProcessingOption LogRecordFound( + unsigned long long log_number, const std::string& log_file_name, + const WriteBatch& batch, WriteBatch* new_batch, + bool* batch_changed) override; + + virtual const char* Name() const override { return "BlobDBWalReconciler"; } + + void SetImplPtr(BlobDBImpl* p) { impl_ = p; } + + protected: + BlobDBImpl* impl_; +}; + +class EvictAllVersionsCompactionListener : public EventListener { + public: + class InternalListener : public CompactionEventListener { + friend class BlobDBImpl; + + public: + virtual void OnCompaction(int level, const Slice& key, + CompactionListenerValueType value_type, + const Slice& existing_value, + const SequenceNumber& sn, bool is_new) override; + + void SetImplPtr(BlobDBImpl* p) { impl_ = p; } + + private: + BlobDBImpl* impl_; + }; + + explicit EvictAllVersionsCompactionListener() + : internal_listener_(new InternalListener()) {} + + virtual CompactionEventListener* GetCompactionEventListener() override { + return internal_listener_.get(); + } + + void SetImplPtr(BlobDBImpl* p) { internal_listener_->SetImplPtr(p); } + + private: + std::unique_ptr internal_listener_; +}; + +#if 0 +class EvictAllVersionsFilterFactory : public CompactionFilterFactory { + private: + BlobDBImpl* impl_; + + public: + EvictAllVersionsFilterFactory() : impl_(nullptr) {} + + void SetImplPtr(BlobDBImpl* p) { impl_ = p; } + + virtual std::unique_ptr CreateCompactionFilter( + const CompactionFilter::Context& context) override; + + virtual const char* Name() const override { + return "EvictAllVersionsFilterFactory"; + } +}; +#endif + +// Comparator to sort "TTL" aware Blob files based on the lower value of +// TTL range. +struct blobf_compare_ttl { + bool operator()(const std::shared_ptr& lhs, + const std::shared_ptr& rhs) const; +}; + +/** + * The implementation class for BlobDB. This manages the value + * part in TTL aware sequentially written files. 
These files are
+ * Garbage Collected.
+ */
+class BlobDBImpl : public BlobDB {
+  friend class BlobDBFlushBeginListener;
+  friend class EvictAllVersionsCompactionListener;
+  friend class BlobDB;
+  friend class BlobFile;
+  friend class BlobDBIterator;
+
+ public:
+  using rocksdb::StackableDB::Put;
+  Status Put(const WriteOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, const Slice& value) override;
+
+  using rocksdb::StackableDB::Delete;
+  Status Delete(const WriteOptions& options, ColumnFamilyHandle* column_family,
+                const Slice& key) override;
+
+  using rocksdb::StackableDB::SingleDelete;
+  virtual Status SingleDelete(const WriteOptions& wopts,
+                              ColumnFamilyHandle* column_family,
+                              const Slice& key) override;
+
+  using rocksdb::StackableDB::Get;
+  Status Get(const ReadOptions& options, ColumnFamilyHandle* column_family,
+             const Slice& key, std::string* value) override;
+
+  using rocksdb::StackableDB::NewIterator;
+  virtual Iterator* NewIterator(const ReadOptions& opts,
+                                ColumnFamilyHandle* column_family) override;
+
+  using rocksdb::StackableDB::MultiGet;
+  virtual std::vector<Status> MultiGet(
+      const ReadOptions& options,
+      const std::vector<ColumnFamilyHandle*>& column_family,
+      const std::vector<Slice>& keys,
+      std::vector<std::string>* values) override;
+
+  virtual Status Write(const WriteOptions& opts, WriteBatch* updates) override;
+
+  using BlobDB::PutWithTTL;
+  Status PutWithTTL(const WriteOptions& options,
+                    ColumnFamilyHandle* column_family, const Slice& key,
+                    const Slice& value, int32_t ttl) override;
+
+  using BlobDB::PutUntil;
+  Status PutUntil(const WriteOptions& options,
+                  ColumnFamilyHandle* column_family, const Slice& key,
+                  const Slice& value_unc, int32_t expiration) override;
+
+  Status LinkToBaseDB(DB* db) override;
+
+  BlobDBImpl(DB* db, const BlobDBOptions& bdb_options);
+
+  BlobDBImpl(const std::string& dbname, const BlobDBOptions& bdb_options,
+             const DBOptions& db_options);
+
+  ~BlobDBImpl();
+
+ private:
+  static bool ExtractTTLFromBlob(const Slice& value, Slice* newval,
+                                 int32_t* ttl_val);
+
+  Status OpenPhase1();
+
+  Status CommonGet(const ColumnFamilyData* cfd, const Slice& key,
+                   const std::string& index_entry, std::string* value);
+
+  // Just before flush starts acting on memtable files,
+  // this handler is called.
+  void OnFlushBeginHandler(DB* db, const FlushJobInfo& info);
+
+  // timer queue callback to close a file by appending a footer;
+  // removes the file from the open files list
+  std::pair<bool, int64_t> CloseSeqWrite(std::shared_ptr<BlobFile> bfile,
+                                         bool aborted);
+
+  // is this file ready for Garbage collection: true if the TTL of the file
+  // has expired or if the eviction threshold of the file has been reached
+  // tt - current time
+  // last_id - the id of the non-TTL file to evict
+  bool ShouldGCFile(std::shared_ptr<BlobFile> bfile, std::time_t tt,
+                    uint64_t last_id, std::string* reason);
+
+  // collect all the blob log files from the blob directory
+  Status GetAllLogFiles(std::set<std::pair<uint64_t, std::string>>* file_nums);
+
+  // appends a task into timer queue to close the file
+  void CloseIf(const std::shared_ptr<BlobFile>& bfile);
+
+  Status AppendBlob(const std::shared_ptr<BlobFile>& bfile,
+                    const std::string& headerbuf, const Slice& key,
+                    const Slice& value, std::string* index_entry);
+
+  Status AppendSN(const std::shared_ptr<BlobFile>& bfile,
+                  const SequenceNumber& sn);
+
+  // find an existing blob log file based on the expiration unix epoch
+  // if such a file does not exist, return nullptr
+  std::shared_ptr<BlobFile> SelectBlobFileTTL(uint32_t expiration);
+
+  // find an existing blob log file to append the value to
+  std::shared_ptr<BlobFile> SelectBlobFile();
+
+  std::shared_ptr<BlobFile> FindBlobFileLocked(uint32_t expiration) const;
+
+  void UpdateWriteOptions(const WriteOptions& options);
+
+  void Shutdown();
+
+  // periodic sanity check. Bunch of checks
+  std::pair<bool, int64_t> SanityCheck(bool aborted);
+
+  // delete files which have been garbage collected and marked
+  // obsolete. Check whether any snapshots exist which refer to
+  // the same
+  std::pair<bool, int64_t> DeleteObsFiles(bool aborted);
+
+  // Major task to garbage collect expired and deleted blobs
+  std::pair<bool, int64_t> RunGC(bool aborted);
+
+  // asynchronous task to fsync/fdatasync the open blob files
+  std::pair<bool, int64_t> FsyncFiles(bool aborted);
+
+  // periodically check if the TTLs of open blob files have expired;
+  // if expired, close the sequential writer and make the file immutable
+  std::pair<bool, int64_t> CheckSeqFiles(bool aborted);
+
+  // if the number of open files approaches the ULIMIT, this
+  // task will close random readers, which are kept around for
+  // efficiency
+  std::pair<bool, int64_t> ReclaimOpenFiles(bool aborted);
+
+  // periodically print write amplification statistics
+  std::pair<bool, int64_t> WaStats(bool aborted);
+
+  // background task to do book-keeping of deleted keys
+  std::pair<bool, int64_t> EvictDeletions(bool aborted);
+
+  std::pair<bool, int64_t> EvictCompacted(bool aborted);
+
+  bool CallbackEvictsImpl(std::shared_ptr<BlobFile> bfile);
+
+  std::pair<bool, int64_t> RemoveTimerQ(TimerQueue* tq, bool aborted);
+
+  std::pair<bool, int64_t> CallbackEvicts(TimerQueue* tq,
+                                          std::shared_ptr<BlobFile> bfile,
+                                          bool aborted);
+
+  // Adds the background tasks to the timer queue
+  void StartBackgroundTasks();
+
+  // add a new Blob File
+  std::shared_ptr<BlobFile> NewBlobFile(const std::string& reason);
+
+  Status OpenAllFiles();
+
+  // hold write mutex on file and call;
+  // creates a Random Access reader for GET call
+  std::shared_ptr<RandomAccessFileReader> GetOrOpenRandomAccessReader(
+      const std::shared_ptr<BlobFile>& bfile, Env* env,
+      const EnvOptions& env_options);
+
+  // hold write mutex on file and call.
+  // Close the above Random Access reader
+  void CloseRandomAccessLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  // hold write mutex on file and call;
+  // creates a sequential (append) writer for this blobfile
+  Status CreateWriterLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  // returns a Writer object for the file. If writer is not
+  // already present, creates one. Needs write mutex to be held
+  std::shared_ptr<Writer> CheckOrCreateWriterLocked(
+      const std::shared_ptr<BlobFile>& bfile);
+
+  // Iterate through keys and values on Blob and write into
+  // separate file the remaining blobs and delete/update pointers
+  // in LSM atomically
+  Status GCFileAndUpdateLSM(const std::shared_ptr<BlobFile>& bfptr,
+                            GCStats* gcstats);
+
+  // checks if there is no snapshot which is referencing the
+  // blobs
+  bool FileDeleteOk_SnapshotCheckLocked(const std::shared_ptr<BlobFile>& bfile);
+
+  bool MarkBlobDeleted(const Slice& key, const Slice& lsmValue);
+
+  bool FindFileAndEvictABlob(uint64_t file_number, uint64_t key_size,
+                             uint64_t blob_offset, uint64_t blob_size);
+
+  void CopyBlobFiles(std::vector<std::shared_ptr<BlobFile>>* bfiles_copy,
+                     uint64_t* last_id);
+
+  void FilterSubsetOfFiles(
+      const std::vector<std::shared_ptr<BlobFile>>& blob_files,
+      std::vector<std::shared_ptr<BlobFile>>* to_process, uint64_t epoch,
+      uint64_t last_id, size_t files_to_collect);
+
+ private:
+  // the base DB
+  DBImpl* db_impl_;
+
+  Env* myenv_;
+
+  // Optimistic Transaction DB used during Garbage collection
+  // for atomicity
+  std::unique_ptr<OptimisticTransactionDBImpl> opt_db_;
+
+  // a boolean to capture whether write_options has been set
+  std::atomic<bool> wo_set_;
+  WriteOptions write_options_;
+
+  // the options that govern the behavior of Blob Storage
+  BlobDBOptionsImpl bdb_options_;
+  DBOptions db_options_;
+  EnvOptions env_options_;
+
+  // name of the database directory
+  std::string dbname_;
+
+  // by default this is "blob_dir" under dbname_
+  // but can be configured
+  std::string blob_dir_;
+
+  // pointer to directory
+  std::unique_ptr<Directory> dir_ent_;
+
+  std::atomic<bool> dir_change_;
+
+  // Read Write Mutex, which protects all the data structures
+  // HEAVILY TRAFFICKED
+  port::RWMutex mutex_;
+
+  // counter for blob file number
+  std::atomic<uint64_t> next_file_number_;
+
+  // entire metadata of all the BLOB files, held in memory
+  std::unordered_map<uint64_t, std::shared_ptr<BlobFile>> blob_files_;
+
+  // epoch or version of the open files.
+  std::atomic<uint64_t> epoch_of_;
+
+  // typically we keep 4 open blob files (simple i.e. no TTL)
+  std::vector<std::shared_ptr<BlobFile>> open_simple_files_;
+
+  // all the blob files which are currently being appended to based
+  // on variety of incoming TTL's
+  std::multiset<std::shared_ptr<BlobFile>, blobf_compare_ttl> open_blob_files_;
+
+  // packet of information to put in the lockless delete(s) queue
+  struct delete_packet_t {
+    ColumnFamilyHandle* cfh_;
+    std::string key_;
+    SequenceNumber dsn_;
+  };
+
+  struct override_packet_t {
+    uint64_t file_number_;
+    uint64_t key_size_;
+    uint64_t blob_offset_;
+    uint64_t blob_size_;
+    SequenceNumber dsn_;
+  };
+
+  // LOCKLESS multiple producer single consumer queue to quickly append
+  // deletes without taking lock. Can rapidly grow in size!!
+  // deletes happen in LSM, but minor book-keeping needs to happen on
+  // BLOB side (for triggering eviction)
+  mpsc_queue_t<delete_packet_t> delete_keys_q_;
+
+  // LOCKLESS multiple producer single consumer queue for values
+  // that are being compacted
+  mpsc_queue_t<override_packet_t> override_vals_q_;
+
+  // atomic bool to represent shutdown
+  std::atomic<bool> shutdown_;
+
+  // timer based queue to execute tasks
+  TimerQueue tqueue_;
+
+  // timer queues to call eviction callbacks.
+  std::vector<std::shared_ptr<TimerQueue>> cb_threads_;
+
+  // only accessed in GC thread, hence not atomic. The epoch of the
+  // GC task. Each execution is one epoch. Helps us in allocating
+  // files to one execution
+  uint64_t current_epoch_;
+
+  // number of files opened for random access/GET;
+  // this counter is used to monitor and close excess RA files.
+  std::atomic<uint32_t> open_file_count_;
+
+  // should hold mutex to modify.
+  // STATISTICS for WA of Blob Files due to GC;
+  // by default we keep 24 hourly periods
+  std::list<uint64_t> all_periods_write_;
+  std::list<uint64_t> all_periods_ampl_;
+
+  std::atomic<uint64_t> last_period_write_;
+  std::atomic<uint64_t> last_period_ampl_;
+
+  uint64_t total_periods_write_;
+  uint64_t total_periods_ampl_;
+
+  // total size of all blob files at a given time
+  std::atomic<uint64_t> total_blob_space_;
+  std::list<std::shared_ptr<BlobFile>> obsolete_files_;
+  bool open_p1_done_;
+
+  uint32_t debug_level_;
+};
+
+class BlobFile {
+  friend class BlobDBImpl;
+  friend struct blobf_compare_ttl;
+
+ private:
+  // access to parent
+  const BlobDBImpl* parent_;
+
+  // path to blob directory
+  std::string path_to_dir_;
+
+  // the id of the file.
+  // the above 2 are created during file creation and never changed
+  // after that
+  uint64_t file_number_;
+
+  // number of blobs in the file
+  std::atomic<uint64_t> blob_count_;
+
+  // the file will be selected for GC in this future epoch
+  std::atomic<int64_t> gc_epoch_;
+
+  // size of the file
+  std::atomic<uint64_t> file_size_;
+
+  // number of blobs in this particular file which have been evicted
+  uint64_t deleted_count_;
+
+  // size of deleted blobs (used by heuristic to select file for GC)
+  uint64_t deleted_size_;
+
+  BlobLogHeader header_;
+
+  // closed_ = true implies the file is no longer mutable:
+  // no more blobs will be appended and the footer has been written out
+  std::atomic<bool> closed_;
+
+  // has a pass of garbage collection successfully finished on this file;
+  // can_be_deleted_ still needs to do iterator/snapshot checks
+  std::atomic<bool> can_be_deleted_;
+
+  // should this file be gc'd once to reconcile lost deletes/compactions
+  std::atomic<bool> gc_once_after_open_;
+
+  // et - lt of the blobs
+  ttlrange_t ttl_range_;
+
+  // et - lt of the timestamp of the KV pairs.
+  tsrange_t time_range_;
+
+  // ESN - LSN of the blobs
+  snrange_t sn_range_;
+
+  // Sequential/Append writer for blobs
+  std::shared_ptr<Writer> log_writer_;
+
+  // random access file reader for GET calls
+  std::shared_ptr<RandomAccessFileReader> ra_file_reader_;
+
+  // This Read-Write mutex is per file specific and protects
+  // all the data structures
+  port::RWMutex mutex_;
+
+  // time when the random access reader was last created.
+  std::atomic<std::time_t> last_access_;
+
+  // last time the file was fsync'd/fdatasync'd
+  std::atomic<uint64_t> last_fsync_;
+
+  bool header_valid_;
+
+ public:
+  BlobFile();
+
+  BlobFile(const BlobDBImpl* parent, const std::string& bdir, uint64_t fnum);
+
+  ~BlobFile();
+
+  ColumnFamilyHandle* GetColumnFamily(DB* db);
+
+  // Returns log file's pathname relative to the main db dir
+  // Eg. For a live-log-file = blob_dir/000003.blob
+  std::string PathName() const;
+
+  // Primary identifier for blob file.
+  // once the file is created, this never changes
+  uint64_t BlobFileNumber() const { return file_number_; }
+
+  // the following functions are atomic, and don't need
+  // read lock
+  uint64_t BlobCount() const {
+    return blob_count_.load(std::memory_order_acquire);
+  }
+
+  std::string DumpState() const;
+
+  // if the file has gone through GC and blobs have been relocated
+  bool Obsolete() const { return can_be_deleted_.load(); }
+
+  // if the file is not taking any more appends.
+  bool Immutable() const { return closed_.load(); }
+
+  // we will assume this is atomic
+  bool NeedsFsync(bool hard, uint64_t bytes_per_sync) const;
+
+  uint64_t GetFileSize() const {
+    return file_size_.load(std::memory_order_acquire);
+  }
+
+  // All Get functions which are not atomic, will need ReadLock on the mutex
+  tsrange_t GetTimeRange() const {
+    assert(HasTimestamp());
+    return time_range_;
+  }
+
+  ttlrange_t GetTTLRange() const { return ttl_range_; }
+
+  snrange_t GetSNRange() const { return sn_range_; }
+
+  bool HasTTL() const {
+    assert(header_valid_);
+    return header_.HasTTL();
+  }
+
+  bool HasTimestamp() const {
+    assert(header_valid_);
+    return header_.HasTimestamp();
+  }
+
+  std::shared_ptr<Writer> GetWriter() const { return log_writer_; }
+
+  void Fsync();
+
+ private:
+  std::shared_ptr<Reader> OpenSequentialReader(
+      Env* env, const DBOptions& db_options,
+      const EnvOptions& env_options) const;
+
+  Status ReadFooter(BlobLogFooter* footer);
+
+  Status WriteFooterAndCloseLocked();
+
+  std::shared_ptr<RandomAccessFileReader> GetOrOpenRandomAccessReader(
+      Env* env, const EnvOptions& env_options, bool* fresh_open);
+
+  void CloseRandomAccessLocked();
+
+  // this is used when you are reading only the footer of a
+  // previously closed file
+  Status SetFromFooterLocked(const BlobLogFooter& footer);
+
+  void set_time_range(const tsrange_t& tr) { time_range_ = tr; }
+
+  void set_ttl_range(const ttlrange_t& ttl) { ttl_range_ = ttl; }
+
+  void SetSNRange(const snrange_t& snr) { sn_range_ = snr; }
+
+  // The following functions are atomic, and don't need locks
+  void SetFileSize(uint64_t fs) { file_size_ = fs; }
+
+  void SetBlobCount(uint64_t bc) { blob_count_ = bc; }
+
+  void SetCanBeDeleted() { can_be_deleted_ = true; }
+};
+
+class BlobDBIterator : public Iterator {
+ public:
+  explicit BlobDBIterator(Iterator* iter, ColumnFamilyHandle* column_family,
+                          BlobDBImpl* impl)
+      : iter_(iter), cfh_(column_family), db_impl_(impl) {
+    assert(iter_);
+  }
+
+  ~BlobDBIterator() { delete iter_; }
+
+  bool Valid() const override { return iter_->Valid(); }
+
+  void SeekToFirst() override { iter_->SeekToFirst(); }
+
+  void SeekToLast() override { iter_->SeekToLast(); }
+
+  void Seek(const Slice& target) override { iter_->Seek(target); }
+
+  void SeekForPrev(const Slice& target) override { iter_->SeekForPrev(target); }
+
+  void Next() override { iter_->Next(); }
+
+  void Prev() override { iter_->Prev(); }
+
+  Slice key() const override { return iter_->key(); }
+
+  Slice value() const override;
+
+  Status status() const override { return iter_->status(); }
+
+ private:
+  Iterator* iter_;
+  ColumnFamilyHandle* cfh_;
+  BlobDBImpl* db_impl_;
+  mutable std::string vpart_;
+};
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
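To make the surface declared above concrete, here is a minimal usage sketch of the public interface that BlobDBImpl implements. It is illustrative only: it relies on the Open/PutWithTTL/Get signatures introduced by this change, and the database path and TTL value are made up.

// Illustrative sketch; assumes the BlobDB::Open, PutWithTTL and Get
// signatures declared by this patch. Path and TTL are hypothetical.
#include <string>
#include "utilities/blob_db/blob_db.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  rocksdb::blob_db::BlobDBOptions bdb_options;

  rocksdb::blob_db::BlobDB* bdb = nullptr;
  rocksdb::Status s = rocksdb::blob_db::BlobDB::Open(
      options, bdb_options, "/tmp/blobdb_example", &bdb);
  if (!s.ok()) {
    return 1;
  }

  rocksdb::ColumnFamilyHandle* cfh = bdb->DefaultColumnFamily();
  // The value lands in a TTL-bucketed blob file and becomes eligible for
  // GC roughly 60 seconds from now.
  s = bdb->PutWithTTL(rocksdb::WriteOptions(), cfh, "key1", "value1", 60);

  std::string value;
  if (s.ok()) {
    s = bdb->Get(rocksdb::ReadOptions(), cfh, "key1", &value);
  }

  delete bdb;
  return s.ok() ? 0 : 1;
}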
diff --git a/utilities/blob_db/blob_db_options_impl.cc b/utilities/blob_db/blob_db_options_impl.cc
new file mode 100644
index 000000000..fff85a92a
--- /dev/null
+++ b/utilities/blob_db/blob_db_options_impl.cc
@@ -0,0 +1,66 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_db_options_impl.h"
+
+namespace rocksdb {
+
+namespace blob_db {
+
+BlobDBOptionsImpl::BlobDBOptionsImpl(const BlobDBOptions& in)
+    : BlobDBOptions(in),
+      deletion_check_period_millisecs(2 * 1000),
+      gc_file_pct(20),
+      gc_check_period_millisecs(60 * 1000),
+      sanity_check_period_millisecs(20 * 60 * 1000),
+      open_files_trigger(100),
+      wa_num_stats_periods(24),
+      wa_stats_period_millisecs(3600 * 1000),
+      partial_expiration_gc_range_secs(4 * 3600),
+      partial_expiration_pct(75),
+      fsync_files_period_millisecs(10 * 1000),
+      reclaim_of_period_millisecs(1 * 1000),
+      delete_obsf_period_millisecs(10 * 1000),
+      check_seqf_period_millisecs(10 * 1000) {}
+
+BlobDBOptionsImpl::BlobDBOptionsImpl()
+    : deletion_check_period_millisecs(2 * 1000),
+      gc_file_pct(20),
+      gc_check_period_millisecs(60 * 1000),
+      sanity_check_period_millisecs(20 * 60 * 1000),
+      open_files_trigger(100),
+      wa_num_stats_periods(24),
+      wa_stats_period_millisecs(3600 * 1000),
+      partial_expiration_gc_range_secs(4 * 3600),
+      partial_expiration_pct(75),
+      fsync_files_period_millisecs(10 * 1000),
+      reclaim_of_period_millisecs(1 * 1000),
+      delete_obsf_period_millisecs(10 * 1000),
+      check_seqf_period_millisecs(10 * 1000) {}
+
+BlobDBOptionsImpl& BlobDBOptionsImpl::operator=(const BlobDBOptionsImpl& in) {
+  if (this != &in) {
+    BlobDBOptions::operator=(in);
+    deletion_check_period_millisecs = in.deletion_check_period_millisecs;
+    gc_file_pct = in.gc_file_pct;
+    gc_check_period_millisecs = in.gc_check_period_millisecs;
+    sanity_check_period_millisecs = in.sanity_check_period_millisecs;
+    open_files_trigger = in.open_files_trigger;
+    wa_num_stats_periods = in.wa_num_stats_periods;
+    wa_stats_period_millisecs = in.wa_stats_period_millisecs;
+    partial_expiration_gc_range_secs = in.partial_expiration_gc_range_secs;
+    partial_expiration_pct = in.partial_expiration_pct;
+    fsync_files_period_millisecs = in.fsync_files_period_millisecs;
+    reclaim_of_period_millisecs = in.reclaim_of_period_millisecs;
+    delete_obsf_period_millisecs = in.delete_obsf_period_millisecs;
+    check_seqf_period_millisecs = in.check_seqf_period_millisecs;
+  }
+  return *this;
+}
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/utilities/blob_db/blob_db_options_impl.h b/utilities/blob_db/blob_db_options_impl.h
new file mode 100644
index 000000000..9cc887ee2
--- /dev/null
+++ b/utilities/blob_db/blob_db_options_impl.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_db.h"
+
+namespace rocksdb {
+
+namespace blob_db {
+
+struct BlobDBOptionsImpl : public BlobDBOptions {
+  // deletions check period
+  uint32_t deletion_check_period_millisecs;
+
+  // gc percentage each check period
+  uint32_t gc_file_pct;
+
+  // gc period
+  uint32_t gc_check_period_millisecs;
+
+  // sanity check task period
+  uint32_t sanity_check_period_millisecs;
+
+  // how many random access open files can we tolerate
+  uint32_t open_files_trigger;
+
+  // how many periods of stats do we keep.
+  uint32_t wa_num_stats_periods;
+
+  // the length of any one period
+  uint32_t wa_stats_period_millisecs;
+
+  // we will garbage collect blob files in which all blobs have
+  // expired. However, if the ttl_range of a file is very large,
+  // say a day, we would have to wait the entire day before we
+  // recover most of the space.
+  uint32_t partial_expiration_gc_range_secs;
+
+  // this should be based on allowed Write Amplification:
+  // if this percentage of the space of a blob file has been
+  // deleted/expired, the file becomes a candidate for GC
+  uint32_t partial_expiration_pct;
+
+  // how often should we schedule a job to fsync open files
+  uint32_t fsync_files_period_millisecs;
+
+  // how often to schedule the reclaim-open-files task
+  uint32_t reclaim_of_period_millisecs;
+
+  // how often to schedule the delete-obsolete-files task
+  uint32_t delete_obsf_period_millisecs;
+
+  // how often to schedule the check-seq-files task
+  uint32_t check_seqf_period_millisecs;
+
+  // default constructor
+  BlobDBOptionsImpl();
+
+  explicit BlobDBOptionsImpl(const BlobDBOptions& in);
+
+  BlobDBOptionsImpl& operator=(const BlobDBOptionsImpl& in);
+};
+
+}  // namespace blob_db
+
+}  // namespace rocksdb
+
+#endif  // ROCKSDB_LITE
diff --git a/utilities/blob_db/blob_db_test.cc b/utilities/blob_db/blob_db_test.cc
index 4d26ef0e4..17a5ddca7 100644
--- a/utilities/blob_db/blob_db_test.cc
+++ b/utilities/blob_db/blob_db_test.cc
@@ -1,66 +1,567 @@
-// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file. See the AUTHORS file for names of contributors.
-
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
 #ifndef ROCKSDB_LITE
 
 #include "utilities/blob_db/blob_db.h"
+#include <cstdlib>
+#include "db/db_test_util.h"
 #include "util/random.h"
 #include "util/testharness.h"
 #include "util/testutil.h"
+#include "utilities/blob_db/blob_db_options_impl.h"
 
 namespace rocksdb {
+
+namespace blob_db {
+Random s_rnd(301);
+
+void gen_random(char *s, const int len) {
+  static const char alphanum[] =
+      "0123456789"
+      "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+      "abcdefghijklmnopqrstuvwxyz";
+
+  for (int i = 0; i < len; ++i) {
+    s[i] = alphanum[s_rnd.Next() % (sizeof(alphanum) - 1)];
+  }
+
+  s[len] = 0;
+}
+
 class BlobDBTest : public testing::Test {
  public:
-  BlobDBTest() {
+  BlobDBTest() : blobdb_(nullptr) {
     dbname_ = test::TmpDir() + "/blob_db_test";
-    Options options;
-    options.create_if_missing = true;
-    EXPECT_TRUE(NewBlobDB(options, dbname_, &db_).ok());
+    // Reopen1(BlobDBOptionsImpl());
+  }
+
+  ~BlobDBTest() {
+    if (blobdb_) {
+      delete blobdb_;
+      blobdb_ = nullptr;
+    }
   }
 
-  ~BlobDBTest() { delete db_; }
+  void Reopen1(const BlobDBOptionsImpl &bdboptions,
+               const Options &options = Options()) {
+    if (blobdb_) {
+      delete blobdb_;
+      blobdb_ = nullptr;
+    }
+
+    BlobDBOptionsImpl bblobdb_options = bdboptions;
+    Options myoptions = options;
+    BlobDB::DestroyBlobDB(dbname_, myoptions, bblobdb_options);
+
+    DestroyDB(dbname_, myoptions);
+
+    myoptions.create_if_missing = true;
+    EXPECT_TRUE(
+        BlobDB::Open(myoptions, bblobdb_options, dbname_, &blobdb_).ok());
+  }
+
+  void insert_blobs() {
+    WriteOptions wo;
+    ReadOptions ro;
+    std::string value;
+
+    ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+    Random rnd(301);
+    for (size_t i = 0; i < 100000; i++) {
+      int len = rnd.Next() % 16384;
+      if (!len) continue;
+
+      char *val = new char[len + 1];
+      gen_random(val, len);
+
+      std::string key("key");
+      key += std::to_string(i % 500);
+
+      Slice keyslice(key);
+      Slice valslice(val, len + 1);
+
+      int ttl = rnd.Next() % 86400;
+
+      ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, keyslice, valslice, ttl));
+      delete[] val;
+    }
+
+    for (size_t i = 0; i < 10; i++) {
+      std::string key("key");
+      key += std::to_string(i % 500);
+      Slice keyslice(key);
+      blobdb_->Delete(wo, dcfh, keyslice);
+    }
+  }
 
-  DB* db_;
+  BlobDB *blobdb_;
   std::string dbname_;
 };  // class BlobDBTest
 
-TEST_F(BlobDBTest, Basic) {
+TEST_F(BlobDBTest, DeleteComplex) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.partial_expiration_pct = 75;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.blob_file_size = 219 * 1024;
+
+  Reopen1(bdboptions);
+
   WriteOptions wo;
   ReadOptions ro;
   std::string value;
 
-  ASSERT_OK(db_->Put(wo, "foo", "v1"));
-  ASSERT_OK(db_->Put(wo, "bar", "v2"));
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  Random rnd(301);
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    char *val = new char[len + 1];
+    gen_random(val, len);
 
-  ASSERT_OK(db_->Get(ro, "foo", &value));
-  ASSERT_EQ("v1", value);
-  ASSERT_OK(db_->Get(ro, "bar", &value));
-  ASSERT_EQ("v2", value);
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + 1);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  for (size_t i = 0; i < 99; i++) {
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    blobdb_->Delete(wo, dcfh, keyslice);
+  }
+
+  Env::Default()->SleepForMicroseconds(60 * 1000 * 1000);
 }
 
+TEST_F(BlobDBTest, OverrideTest) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.num_concurrent_simple_blobs = 2;
+  bdboptions.blob_file_size = 876 * 1024 * 10;
+
+  Options options;
+  options.write_buffer_size = 256 * 1024;
+  options.info_log_level = INFO_LEVEL;
+
+  Reopen1(bdboptions, options);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+
+  Random rnd(301);
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  for (int i = 0; i < 10000; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    char *val = new char[len + 1];
+    gen_random(val, len);
+
+    std::string key("key");
+    char x[10];
+    std::sprintf(x, "%04d", i);
+    key += std::string(x);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + 1);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  // override all the keys
+  for (int i = 0; i < 10000; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    char *val = new char[len + 1];
+    gen_random(val, len);
+
+    std::string key("key");
+    char x[10];
+    std::sprintf(x, "%04d", i);
+    key += std::string(x);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + 1);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  blobdb_->Flush(FlushOptions());
+
+#if 1
+  blobdb_->GetBaseDB()->CompactRange(CompactRangeOptions(), nullptr, nullptr);
+  reinterpret_cast<DBImpl*>(blobdb_->GetBaseDB())->TEST_WaitForFlushMemTable();
+  reinterpret_cast<DBImpl*>(blobdb_->GetBaseDB())->TEST_WaitForCompact();
+#endif
+
+  Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, DeleteTest) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.partial_expiration_pct = 18;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.num_concurrent_simple_blobs = 1;
+  bdboptions.blob_file_size = 876 * 1024;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+
+  Random rnd(301);
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    char *val = new char[len + 1];
+    gen_random(val, len);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + 1);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  for (size_t i = 0; i < 100; i += 5) {
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    blobdb_->Delete(wo, dcfh, keyslice);
+  }
+
+  Env::Default()->SleepForMicroseconds(60 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, GCTestWithWrite) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.default_ttl_extractor = true;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  WriteBatch WB;
+
+  Random rnd(301);
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    int ttl = 30;
+
+    char *val = new char[len + BlobDB::kTTLSuffixLength];
+    gen_random(val, len);
+    strncpy(val + len, "ttl:", 4);
+    EncodeFixed32(val + len + 4, ttl);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + BlobDB::kTTLSuffixLength);
+
+    WB.Put(dcfh, keyslice, valslice);
+    delete[] val;
+  }
+
+  ASSERT_OK(blobdb_->Write(wo, &WB));
+
+  Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
+}
+
+void cb_evict(const ColumnFamilyHandle *cfh, const Slice &key,
+              const Slice &val) {
+  fprintf(stderr, "key evicted: %s\n", key.ToString().c_str());
+}
+
+static const char *LONG_STRING =
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFJFJFJTWFNLLFKFFMFMFMFMFMFMFMFMFMFMFMFMFMMF "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJAJFJFJFJFJTWBFNMFLLWMFMFMFMWKWMFMFMFMFMFMFM "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH "
+    "AJFJFJFFFFFFFFFFFFFFFFFFFJFHFHFHFHFHFHFHHFHHFHHFH ";
+
+TEST_F(BlobDBTest, GetWithCompression) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.default_ttl_extractor = true;
+  bdboptions.gc_evict_cb_fn = &cb_evict;
+  bdboptions.compression = CompressionType::kLZ4Compression;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+  Random rnd(301);
+
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  std::string orig(LONG_STRING);
+
+  for (size_t i = 0; i < 10000; i++) {
+    int len = orig.length();
+    int ttl = 3000 * (rnd.Next() % 10);
+
+    char *val = new char[len + BlobDB::kTTLSuffixLength];
+    strncpy(val, LONG_STRING, len);
+    strncpy(val + len, "ttl:", 4);
+    EncodeFixed32(val + len + 4, ttl);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + BlobDB::kTTLSuffixLength);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  for (size_t i = 0; i < 10000; i++) {
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    std::string val;
+    Status s = blobdb_->Get(ro, dcfh, keyslice, &val);
+    ASSERT_OK(s);
+    ASSERT_TRUE(orig == val);
+  }
+
+  Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, GCTestWithPutAndCompression) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.default_ttl_extractor = true;
+  bdboptions.gc_evict_cb_fn = &cb_evict;
+  bdboptions.compression = CompressionType::kLZ4Compression;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+  Random rnd(301);
+
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    int ttl = 30;
+
+    char *val = new char[len + BlobDB::kTTLSuffixLength];
+    gen_random(val, len);
+    strncpy(val + len, "ttl:", 4);
+    EncodeFixed32(val + len + 4, ttl);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + BlobDB::kTTLSuffixLength);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, GCTestWithPut) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+  bdboptions.gc_check_period_millisecs = 20 * 1000;
+  bdboptions.default_ttl_extractor = true;
+  bdboptions.gc_evict_cb_fn = &cb_evict;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+  Random rnd(301);
+
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    int ttl = 30;
+
+    char *val = new char[len + BlobDB::kTTLSuffixLength];
+    gen_random(val, len);
+    strncpy(val + len, "ttl:", 4);
+    EncodeFixed32(val + len + 4, ttl);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + BlobDB::kTTLSuffixLength);
+
+    ASSERT_OK(blobdb_->Put(wo, dcfh, keyslice, valslice));
+    delete[] val;
+  }
+
+  Env::Default()->SleepForMicroseconds(120 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, GCTest) {
+  BlobDBOptionsImpl bdboptions;
+  bdboptions.ttl_range_secs = 30;
+  bdboptions.gc_file_pct = 100;
+
+  Reopen1(bdboptions);
+
+  WriteOptions wo;
+  ReadOptions ro;
+  std::string value;
+  Random rnd(301);
+
+  ColumnFamilyHandle *dcfh = blobdb_->DefaultColumnFamily();
+
+  for (size_t i = 0; i < 100; i++) {
+    int len = rnd.Next() % 16384;
+    if (!len) continue;
+
+    char *val = new char[len + 1];
+    gen_random(val, len);
+
+    std::string key("key");
+    key += std::to_string(i);
+
+    Slice keyslice(key);
+    Slice valslice(val, len + 1);
+
+    int ttl = 30;
+
+    ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, keyslice, valslice, ttl));
+    delete[] val;
+  }
+
+  Env::Default()->SleepForMicroseconds(240 * 1000 * 1000);
+}
+
+TEST_F(BlobDBTest, DISABLED_MultipleWriters) {
+  BlobDBOptionsImpl bdboptions;
+  Reopen1(bdboptions);
+
+  ASSERT_TRUE(blobdb_ != nullptr);
+
+  std::vector<std::thread> workers;
+  for (size_t ii = 0; ii < 10; ii++)
+    workers.push_back(std::thread(&BlobDBTest::insert_blobs, this));
+
+  for (std::thread &t : workers) {
+    if (t.joinable()) {
+      t.join();
+    }
+  }
+
+  Env::Default()->SleepForMicroseconds(180 * 1000 * 1000);
+  // ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "bar", "v2", 60));
+  // ASSERT_OK(blobdb_->Get(ro, dcfh, "foo", &value));
+  // ASSERT_EQ("v1", value);
+  // ASSERT_OK(blobdb_->Get(ro, dcfh, "bar", &value));
+  // ASSERT_EQ("v2", value);
+}
+
+#if 0
 TEST_F(BlobDBTest, Large) {
+  ASSERT_TRUE(blobdb_ != nullptr);
+
   WriteOptions wo;
   ReadOptions ro;
   std::string value1, value2, value3;
   Random rnd(301);
+  ColumnFamilyHandle* dcfh = blobdb_->DefaultColumnFamily();
 
   value1.assign(8999, '1');
-  ASSERT_OK(db_->Put(wo, "foo", value1));
+  ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "foo", value1, 3600));
   value2.assign(9001, '2');
-  ASSERT_OK(db_->Put(wo, "bar", value2));
+  ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "bar", value2, 3600));
   test::RandomString(&rnd, 13333, &value3);
-  ASSERT_OK(db_->Put(wo, "barfoo", value3));
+  ASSERT_OK(blobdb_->PutWithTTL(wo, dcfh, "barfoo", value3, 3600));
 
   std::string value;
-  ASSERT_OK(db_->Get(ro, "foo", &value));
+  ASSERT_OK(blobdb_->Get(ro, dcfh, "foo", &value));
   ASSERT_EQ(value1, value);
-  ASSERT_OK(db_->Get(ro, "bar", &value));
+  ASSERT_OK(blobdb_->Get(ro, dcfh, "bar", &value));
   ASSERT_EQ(value2, value);
-  ASSERT_OK(db_->Get(ro, "barfoo", &value));
+  ASSERT_OK(blobdb_->Get(ro, dcfh, "barfoo", &value));
   ASSERT_EQ(value3, value);
 }
+#endif
 
+}  // namespace blob_db
 }  // namespace rocksdb
 
 // A black-box test for the ttl wrapper around rocksdb
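Several of the GC tests above hand-roll the default-TTL-extractor value format: the payload is followed by an 8-byte suffix made of the 4 characters "ttl:" plus a fixed32 TTL, which is what BlobDB::kTTLSuffixLength accounts for. A condensed sketch of that convention, equivalent to the inline strncpy/EncodeFixed32 dance in the tests (the helper name is made up):

// Sketch of the value-with-TTL-suffix convention used by the tests when
// default_ttl_extractor is enabled; assumes BlobDB::kTTLSuffixLength == 8
// ("ttl:" + EncodeFixed32(ttl)).
#include <cstdint>
#include <string>
#include "util/coding.h"

std::string AppendTTLSuffix(const std::string& payload, uint32_t ttl) {
  std::string out(payload);
  out.append("ttl:", 4);           // 4-byte tag
  rocksdb::PutFixed32(&out, ttl);  // 4-byte little-endian TTL
  return out;  // pass the result to Put(); BlobDB strips the suffix on write
}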
diff --git a/utilities/blob_db/blob_file.cc b/utilities/blob_db/blob_file.cc
new file mode 100644
index 000000000..5c8d6864f
--- /dev/null
+++ b/utilities/blob_db/blob_file.cc
@@ -0,0 +1,225 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+
+#ifndef ROCKSDB_LITE
+
+#include <chrono>
+#include <cinttypes>
+#include <cstdio>
+#include "utilities/blob_db/blob_db_impl.h"
+
+#include "util/filename.h"
+
+namespace rocksdb {
+
+namespace blob_db {
+
+BlobFile::BlobFile()
+    : parent_(nullptr),
+      file_number_(0),
+      blob_count_(0),
+      gc_epoch_(-1),
+      file_size_(0),
+      deleted_count_(0),
+      deleted_size_(0),
+      closed_(false),
+      can_be_deleted_(false),
+      gc_once_after_open_(false),
+      ttl_range_(std::make_pair(0, 0)),
+      time_range_(std::make_pair(0, 0)),
+      sn_range_(std::make_pair(0, 0)),
+      last_access_(-1),
+      last_fsync_(0),
+      header_valid_(false) {}
+
+BlobFile::BlobFile(const BlobDBImpl* p, const std::string& bdir, uint64_t fn)
+    : parent_(p),
+      path_to_dir_(bdir),
+      file_number_(fn),
+      blob_count_(0),
+      gc_epoch_(-1),
+      file_size_(0),
+      deleted_count_(0),
+      deleted_size_(0),
+      closed_(false),
+      can_be_deleted_(false),
+      gc_once_after_open_(false),
+      ttl_range_(std::make_pair(0, 0)),
+      time_range_(std::make_pair(0, 0)),
+      sn_range_(std::make_pair(0, 0)),
+      last_access_(-1),
+      last_fsync_(0),
+      header_valid_(false) {}
+
+BlobFile::~BlobFile() {
+  if (can_be_deleted_) {
+    std::string pn(PathName());
+    Status s = Env::Default()->DeleteFile(PathName());
+    if (!s.ok()) {
+      // Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
+      // "File could not be deleted %s", pn.c_str());
+    }
+  }
+}
+
+std::string BlobFile::PathName() const {
+  return BlobFileName(path_to_dir_, file_number_);
+}
+
+std::shared_ptr<Reader> BlobFile::OpenSequentialReader(
+    Env* env, const DBOptions& db_options,
+    const EnvOptions& env_options) const {
+  std::unique_ptr<SequentialFile> sfile;
+  Status s = env->NewSequentialFile(PathName(), &sfile, env_options);
+  if (!s.ok()) {
+    // report something here.
+    return nullptr;
+  }
+
+  std::unique_ptr<SequentialFileReader> sfile_reader;
+  sfile_reader.reset(new SequentialFileReader(std::move(sfile)));
+
+  std::shared_ptr<Reader> log_reader = std::make_shared<Reader>(
+      db_options.info_log, std::move(sfile_reader));
+
+  return log_reader;
+}
+
+std::string BlobFile::DumpState() const {
+  char str[1000];
+  std::snprintf(str, sizeof(str),
+                "path: %s fn: %" PRIu64 " blob_count: %" PRIu64
+                " gc_epoch: %" PRIu64 " file_size: %" PRIu64
+                " deleted_count: %" PRIu64 " deleted_size: %" PRIu64
+                " closed: %d can_be_deleted: %d ttl_range: (%d, %d)"
+                " sn_range: (%" PRIu64 " %" PRIu64 "), writer: %d reader: %d",
+                path_to_dir_.c_str(), file_number_, blob_count_.load(),
+                gc_epoch_.load(), file_size_.load(), deleted_count_,
+                deleted_size_, closed_.load(), can_be_deleted_.load(),
+                ttl_range_.first, ttl_range_.second, sn_range_.first,
+                sn_range_.second, (!!log_writer_), (!!ra_file_reader_));
+  return str;
+}
+
+bool BlobFile::NeedsFsync(bool hard, uint64_t bytes_per_sync) const {
+  assert(last_fsync_ <= file_size_);
+  return (hard) ? file_size_ > last_fsync_
+                : (file_size_ - last_fsync_) >= bytes_per_sync;
+}
+
+Status BlobFile::WriteFooterAndCloseLocked() {
+  Log(InfoLogLevel::INFO_LEVEL, parent_->db_options_.info_log,
+      "File is being closed after footer %s", PathName().c_str());
+
+  BlobLogFooter footer;
+  footer.blob_count_ = blob_count_;
+  if (HasTTL()) footer.set_ttl_range(ttl_range_);
+
+  footer.sn_range_ = sn_range_;
+  if (HasTimestamp()) footer.set_time_range(time_range_);
+
+  // this will close the file and reset the Writable File Pointer.
+  Status s = log_writer_->AppendFooter(footer);
+  if (s.ok()) {
+    closed_ = true;
+    file_size_ += BlobLogFooter::kFooterSize;
+  } else {
+    Log(InfoLogLevel::ERROR_LEVEL, parent_->db_options_.info_log,
+        "Failure to write footer for blob-file %s", PathName().c_str());
+  }
+  // delete the sequential writer
+  log_writer_.reset();
+  return s;
+}
+
+Status BlobFile::ReadFooter(BlobLogFooter* bf) {
+  if (file_size_ < (BlobLogHeader::kHeaderSize + BlobLogFooter::kFooterSize)) {
+    return Status::IOError("File does not have footer", PathName());
+  }
+
+  uint64_t footer_offset = file_size_ - BlobLogFooter::kFooterSize;
+  // assume that ra_file_reader_ is valid before we enter this
+  assert(ra_file_reader_);
+
+  Slice result;
+  char scratch[BlobLogFooter::kFooterSize + 10];
+  Status s = ra_file_reader_->Read(footer_offset, BlobLogFooter::kFooterSize,
+                                   &result, scratch);
+  if (!s.ok()) return s;
+  if (result.size() != BlobLogFooter::kFooterSize) {
+    // should not happen
+    return Status::IOError("EOF reached before footer");
+  }
+
+  s = bf->DecodeFrom(&result);
+  return s;
+}
+
+Status BlobFile::SetFromFooterLocked(const BlobLogFooter& footer) {
+  if (footer.HasTTL() != header_.HasTTL()) {
+    return Status::Corruption("has_ttl mismatch");
+  }
+  if (footer.HasTimestamp() != header_.HasTimestamp()) {
+    return Status::Corruption("has_ts mismatch");
+  }
+
+  // assume that file has been fully fsync'd
+  last_fsync_.store(file_size_);
+  blob_count_ = footer.GetBlobCount();
+  ttl_range_ = footer.GetTTLRange();
+  time_range_ = footer.GetTimeRange();
+  sn_range_ = footer.GetSNRange();
+  closed_ = true;
+
+  return Status::OK();
+}
+
+void BlobFile::Fsync() {
+  if (log_writer_.get()) {
+    log_writer_->Sync();
+    last_fsync_.store(file_size_.load());
+  }
+}
+
+void BlobFile::CloseRandomAccessLocked() {
+  ra_file_reader_.reset();
+  last_access_ = -1;
+}
+
+std::shared_ptr<RandomAccessFileReader> BlobFile::GetOrOpenRandomAccessReader(
+    Env* env, const EnvOptions& env_options, bool* fresh_open) {
+  *fresh_open = false;
+  last_access_ =
+      std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
+  {
+    ReadLock lockbfile_r(&mutex_);
+    if (ra_file_reader_) return ra_file_reader_;
+  }
+
+  WriteLock lockbfile_w(&mutex_);
+  if (ra_file_reader_) return ra_file_reader_;
+
+  std::unique_ptr<RandomAccessFile> rfile;
+  Status s = env->NewRandomAccessFile(PathName(), &rfile, env_options);
+  if (!s.ok()) {
+    Log(InfoLogLevel::ERROR_LEVEL, parent_->db_options_.info_log,
+        "Failed to open blob file for random-read: %s status: '%s'"
+        " exists: '%s'",
+        PathName().c_str(), s.ToString().c_str(),
+        env->FileExists(PathName()).ToString().c_str());
+    return nullptr;
+  }
+
+  ra_file_reader_ = std::make_shared<RandomAccessFileReader>(std::move(rfile));
+  *fresh_open = true;
+  return ra_file_reader_;
+}
+
+ColumnFamilyHandle* BlobFile::GetColumnFamily(DB* db) {
+  return db->DefaultColumnFamily();
+}
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
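ReadFooter above locates the footer at file_size_ - BlobLogFooter::kFooterSize, so the fixed sizes declared in blob_log_format.h (the next file) have to match the encoded layout exactly. As a sanity sketch, the arithmetic can be restated at compile time, mirroring the field order the encoder writes:

// Layout sanity sketch for the on-disk format introduced by this patch.
// Field widths mirror BlobLogFooter::EncodeTo and BlobLogRecord's header.
static_assert(4 /* flags */ + 8 /* blob count */ + 4 + 4 /* ttl range */ +
                      8 + 8 /* sn range */ + 8 + 8 /* ts range */ +
                      4 /* magic */ ==
                  56,  // == BlobLogFooter::kFooterSize
              "blob log footer layout");
static_assert(4 /* key len */ + 8 /* blob len */ + 4 /* ttl */ +
                      8 /* timestamp */ + 1 /* type */ + 1 /* subtype */ +
                      4 /* header crc */ + 4 /* blob crc */ ==
                  34,  // == BlobLogRecord::kHeaderSize
              "blob record header layout");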
diff --git a/utilities/blob_db/blob_log_format.cc b/utilities/blob_db/blob_log_format.cc
new file mode 100644
index 000000000..051e9bb01
--- /dev/null
+++ b/utilities/blob_db/blob_log_format.cc
@@ -0,0 +1,313 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_log_format.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+
+namespace rocksdb {
+namespace blob_db {
+
+const uint32_t kMagicNumber = 2395959;
+const uint32_t kVersion1 = 1;
+const size_t kBlockSize = 32768;
+
+BlobLogHeader::BlobLogHeader()
+    : magic_number_(kMagicNumber), compression_(kNoCompression) {}
+
+BlobLogHeader& BlobLogHeader::operator=(BlobLogHeader&& in) noexcept {
+  if (this != &in) {
+    magic_number_ = in.magic_number_;
+    version_ = in.version_;
+    ttl_guess_ = std::move(in.ttl_guess_);
+    ts_guess_ = std::move(in.ts_guess_);
+    compression_ = in.compression_;
+  }
+  return *this;
+}
+
+BlobLogFooter::BlobLogFooter() : magic_number_(kMagicNumber), blob_count_(0) {}
+
+Status BlobLogFooter::DecodeFrom(Slice* input) {
+  uint32_t val;
+  if (!GetFixed32(input, &val)) {
+    return Status::Corruption("Invalid Blob Footer: flags");
+  }
+
+  bool has_ttl = false;
+  bool has_ts = false;
+  val >>= 8;
+  RecordSubType st = static_cast<RecordSubType>(val);
+  switch (st) {
+    case kRegularType:
+      break;
+    case kTTLType:
+      has_ttl = true;
+      break;
+    case kTimestampType:
+      has_ts = true;
+      break;
+    default:
+      return Status::Corruption("Invalid Blob Footer: flags_val");
+  }
+
+  if (!GetFixed64(input, &blob_count_)) {
+    return Status::Corruption("Invalid Blob Footer: blob_count");
+  }
+
+  ttlrange_t temp_ttl;
+  if (!GetFixed32(input, &temp_ttl.first) ||
+      !GetFixed32(input, &temp_ttl.second)) {
+    return Status::Corruption("Invalid Blob Footer: ttl_range");
+  }
+  if (has_ttl) {
+    ttl_range_.reset(new ttlrange_t(temp_ttl));
+  }
+
+  if (!GetFixed64(input, &sn_range_.first) ||
+      !GetFixed64(input, &sn_range_.second)) {
+    return Status::Corruption("Invalid Blob Footer: sn_range");
+  }
+
+  tsrange_t temp_ts;
+  if (!GetFixed64(input, &temp_ts.first) ||
+      !GetFixed64(input, &temp_ts.second)) {
+    return Status::Corruption("Invalid Blob Footer: ts_range");
+  }
+  if (has_ts) ts_range_.reset(new tsrange_t(temp_ts));
+
+  if (!GetFixed32(input, &magic_number_) || magic_number_ != kMagicNumber) {
+    return Status::Corruption("Invalid Blob Footer: magic");
+  }
+
+  return Status::OK();
+}
+
+void BlobLogFooter::EncodeTo(std::string* dst) const {
+  dst->reserve(kFooterSize);
+
+  RecordType rt = kFullType;
+  RecordSubType st = kRegularType;
+  if (HasTTL()) {
+    st = kTTLType;
+  } else if (HasTimestamp()) {
+    st = kTimestampType;
+  }
+  uint32_t val =
+      static_cast<uint32_t>(rt) | (static_cast<uint32_t>(st) << 8);
+  PutFixed32(dst, val);
+
+  PutFixed64(dst, blob_count_);
+  bool has_ttl = HasTTL();
+  bool has_ts = HasTimestamp();
+
+  if (has_ttl) {
+    PutFixed32(dst, ttl_range_.get()->first);
+    PutFixed32(dst, ttl_range_.get()->second);
+  } else {
+    PutFixed32(dst, 0);
+    PutFixed32(dst, 0);
+  }
+  PutFixed64(dst, sn_range_.first);
+  PutFixed64(dst, sn_range_.second);
+
+  if (has_ts) {
+    PutFixed64(dst, ts_range_.get()->first);
+    PutFixed64(dst, ts_range_.get()->second);
+  } else {
+    PutFixed64(dst, 0);
+    PutFixed64(dst, 0);
+  }
+
+  PutFixed32(dst, magic_number_);
+}
+
+void BlobLogHeader::EncodeTo(std::string* dst) const {
+  dst->reserve(kHeaderSize);
+
+  PutFixed32(dst, magic_number_);
+
+  PutFixed32(dst, version_);
+
+  RecordSubType st = kRegularType;
+  bool has_ttl = HasTTL();
+  bool has_ts = HasTimestamp();
+
+  if (has_ttl) {
+    st = kTTLType;
+  } else if (has_ts) {
+    st = kTimestampType;
+  }
+  uint32_t val = static_cast<uint32_t>(st) |
+                 (static_cast<uint32_t>(compression_) << 8);
+  PutFixed32(dst, val);
+
+  if (has_ttl) {
+    PutFixed32(dst, ttl_guess_.get()->first);
+    PutFixed32(dst, ttl_guess_.get()->second);
+  } else {
+    PutFixed32(dst, 0);
+    PutFixed32(dst, 0);
+  }
+
+  if (has_ts) {
+    PutFixed64(dst, ts_guess_.get()->first);
+    PutFixed64(dst, ts_guess_.get()->second);
+  } else {
+    PutFixed64(dst, 0);
+    PutFixed64(dst, 0);
+  }
+}
+
+Status BlobLogHeader::DecodeFrom(Slice* input) {
+  if (!GetFixed32(input, &magic_number_) || magic_number_ != kMagicNumber) {
+    return Status::Corruption("Invalid Blob Log Header: magic");
+  }
+
+  // as of today, we only support 1 version
+  if (!GetFixed32(input, &version_) || version_ != kVersion1) {
+    return Status::Corruption("Invalid Blob Log Header: version");
+  }
+
+  uint32_t val;
+  if (!GetFixed32(input, &val)) {
+    return Status::Corruption("Invalid Blob Log Header: subtype");
+  }
+
+  bool has_ttl = false;
+  bool has_ts = false;
+  RecordSubType st = static_cast<RecordSubType>(val & 0xff);
+  compression_ = static_cast<CompressionType>((val >> 8) & 0xff);
+  switch (st) {
+    case kRegularType:
+      break;
+    case kTTLType:
+      has_ttl = true;
+      break;
+    case kTimestampType:
+      has_ts = true;
+      break;
+    default:
+      return Status::Corruption("Invalid Blob Log Header: subtype_2");
+  }
+
+  ttlrange_t temp_ttl;
+  if (!GetFixed32(input, &temp_ttl.first) ||
+      !GetFixed32(input, &temp_ttl.second)) {
+    return Status::Corruption("Invalid Blob Log Header: ttl");
+  }
+  if (has_ttl) set_ttl_guess(temp_ttl);
+
+  tsrange_t temp_ts;
+  if (!GetFixed64(input, &temp_ts.first) ||
+      !GetFixed64(input, &temp_ts.second)) {
+    return Status::Corruption("Invalid Blob Log Header: timestamp");
+  }
+  if (has_ts) set_ts_guess(temp_ts);
+
+  return Status::OK();
+}
+
+BlobLogRecord::BlobLogRecord()
+    : checksum_(0),
+      header_cksum_(0),
+      key_size_(0),
+      blob_size_(0),
+      time_val_(0),
+      ttl_val_(0),
+      sn_(0),
+      type_(0),
+      subtype_(0) {}
+
+BlobLogRecord::~BlobLogRecord() {}
+
+void BlobLogRecord::ResizeKeyBuffer(size_t kbs) {
+  if (kbs > key_buffer_.size()) {
+    key_buffer_.resize(kbs);
+  }
+}
+
+void BlobLogRecord::ResizeBlobBuffer(size_t bbs) {
+  if (bbs > blob_buffer_.size()) {
+    blob_buffer_.resize(bbs);
+  }
+}
+
+void BlobLogRecord::Clear() {
+  checksum_ = 0;
+  header_cksum_ = 0;
+  key_size_ = 0;
+  blob_size_ = 0;
+  time_val_ = 0;
+  ttl_val_ = 0;
+  sn_ = 0;
+  type_ = subtype_ = 0;
+  key_.clear();
+  blob_.clear();
+}
+
+Status BlobLogRecord::DecodeHeaderFrom(const Slice& hdrslice) {
+  Slice input = hdrslice;
+  if (input.size() < kHeaderSize) {
+    return Status::Corruption("Invalid Blob Record Header: size");
+  }
+
+  if (!GetFixed32(&input, &key_size_)) {
+    return Status::Corruption("Invalid Blob Record Header: key_size");
+  }
+  if (!GetFixed64(&input, &blob_size_)) {
+    return Status::Corruption("Invalid Blob Record Header: blob_size");
+  }
+  if (!GetFixed32(&input, &ttl_val_)) {
+    return Status::Corruption("Invalid Blob Record Header: ttl_val");
+  }
+  if (!GetFixed64(&input, &time_val_)) {
+    return Status::Corruption("Invalid Blob Record Header: time_val");
+  }
+
+  type_ = *(input.data());
+  input.remove_prefix(1);
+  subtype_ = *(input.data());
+  input.remove_prefix(1);
+
+  if (!GetFixed32(&input, &header_cksum_)) {
+    return Status::Corruption("Invalid Blob Record Header: header_cksum");
+  }
+  if (!GetFixed32(&input, &checksum_)) {
+    return Status::Corruption("Invalid Blob Record Header: checksum");
+  }
+
+  return Status::OK();
+}
+
+Status BlobLogRecord::DecodeFooterFrom(const Slice& footerslice) {
+  Slice input = footerslice;
+  if (input.size() < kFooterSize) {
+    return Status::Corruption("Invalid Blob Record Footer: size");
+  }
+
+  uint32_t f_crc = crc32c::Extend(0, input.data(), 8);
+  f_crc = crc32c::Mask(f_crc);
+
+  if (!GetFixed64(&input, &sn_)) {
+    return Status::Corruption("Invalid Blob Record Footer: sn");
+  }
+
+  if (!GetFixed32(&input, &footer_cksum_)) {
+    return Status::Corruption("Invalid Blob Record Footer: cksum");
+  }
+
+  if (f_crc != footer_cksum_) {
+    return Status::Corruption("Record Checksum mismatch: footer_cksum");
+  }
+
+  return Status::OK();
+}
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
diff --git a/utilities/blob_db/blob_log_format.h b/utilities/blob_db/blob_log_format.h
new file mode 100644
index 000000000..c688ed400
--- /dev/null
+++ b/utilities/blob_db/blob_log_format.h
@@ -0,0 +1,226 @@
+// Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
+//  This source code is licensed under the BSD-style license found in the
+//  LICENSE file in the root directory of this source tree. An additional grant
+//  of patent rights can be found in the PATENTS file in the same directory.
+//
+// Log format information shared by reader and writer.
+
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include "rocksdb/options.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+
+namespace rocksdb {
+
+namespace blob_db {
+class BlobFile;
+class BlobDBImpl;
+
+enum RecordType : uint8_t {
+  // Zero is reserved for preallocated files
+  kFullType = 0,
+
+  // For fragments
+  kFirstType = 1,
+  kMiddleType = 2,
+  kLastType = 3,
+  kMaxRecordType = kLastType
+};
+
+enum RecordSubType : uint8_t {
+  kRegularType = 0,
+  kTTLType = 1,
+  kTimestampType = 2,
+};
+
+extern const uint32_t kMagicNumber;
+
+class Reader;
+
+typedef std::pair<uint32_t, uint32_t> ttlrange_t;
+typedef std::pair<uint64_t, uint64_t> tsrange_t;
+typedef std::pair<SequenceNumber, SequenceNumber> snrange_t;
+
+class BlobLogHeader {
+  friend class BlobFile;
+  friend class BlobDBImpl;
+
+ private:
+  uint32_t magic_number_ = 0;
+  uint32_t version_ = 1;
+  CompressionType compression_;
+  std::unique_ptr<ttlrange_t> ttl_guess_;
+  std::unique_ptr<tsrange_t> ts_guess_;
+
+ private:
+  void set_ttl_guess(const ttlrange_t& ttl) {
+    ttl_guess_.reset(new ttlrange_t(ttl));
+  }
+
+  void set_version(uint32_t v) { version_ = v; }
+
+  void set_ts_guess(const tsrange_t& ts) { ts_guess_.reset(new tsrange_t(ts)); }
+
+ public:
+  // magic number + version + flags + ttl guess + timestamp range
+  // = 32
+  static const size_t kHeaderSize = 4 + 4 + 4 + 4 * 2 + 8 * 2;
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* input);
+
+  BlobLogHeader();
+
+  bool HasTTL() const { return !!ttl_guess_; }
+
+  bool HasTimestamp() const { return !!ts_guess_; }
+
+  BlobLogHeader& operator=(BlobLogHeader&& in) noexcept;
+};
+
+// Footer encapsulates the fixed information stored at the tail
+// end of every blob log file.
+class BlobLogFooter {
+  friend class BlobFile;
+
+ public:
+  // Use this constructor when you plan to write out the footer using
+  // EncodeTo(). Never use this constructor with DecodeFrom().
+  BlobLogFooter();
+
+  uint64_t magic_number() const { return magic_number_; }
+
+  void EncodeTo(std::string* dst) const;
+
+  Status DecodeFrom(Slice* input);
+
+  // convert this object to a human readable form
+  std::string ToString() const;
+
+  // footer size = 4 byte flags
+  //               8 bytes count
+  //               4, 4 - ttl range
+  //               8, 8 - sn range
+  //               8, 8 - ts range
+  //               4 byte magic number
+  //             = 56
+  static const size_t kFooterSize = 4 + 4 + 8 + (4 * 2) + (8 * 2) + (8 * 2);
+
+  bool HasTTL() const { return !!ttl_range_; }
+
+  bool HasTimestamp() const { return !!ts_range_; }
+
+  uint64_t GetBlobCount() const { return blob_count_; }
+
+  ttlrange_t GetTTLRange() const {
+    if (ttl_range_) {
+      return *ttl_range_;
+    }
+    return {0, 0};
+  }
+
+  tsrange_t GetTimeRange() const {
+    if (ts_range_) {
+      return *ts_range_;
+    }
+    return {0, 0};
+  }
+
+  const snrange_t& GetSNRange() const { return sn_range_; }
+
+ private:
+  uint32_t magic_number_ = 0;
+  uint64_t blob_count_ = 0;
+
+  std::unique_ptr<ttlrange_t> ttl_range_;
+  std::unique_ptr<tsrange_t> ts_range_;
+  snrange_t sn_range_;
+
+ private:
+  void set_ttl_range(const ttlrange_t& ttl) {
+    ttl_range_.reset(new ttlrange_t(ttl));
+  }
+  void set_time_range(const tsrange_t& ts) {
+    ts_range_.reset(new tsrange_t(ts));
+  }
+};
+
+extern const size_t kBlockSize;
+
+class BlobLogRecord {
+  friend class Reader;
+
+ private:
+  // this might not be set.
+  uint32_t checksum_;
+  uint32_t header_cksum_;
+  uint32_t key_size_;
+  uint64_t blob_size_;
+  uint64_t time_val_;
+  uint32_t ttl_val_;
+  SequenceNumber sn_;
+  uint32_t footer_cksum_;
+  char type_;
+  char subtype_;
+  Slice key_;
+  Slice blob_;
+  std::string key_buffer_;
+  std::string blob_buffer_;
+
+ private:
+  void Clear();
+
+  char* GetKeyBuffer() { return &(key_buffer_[0]); }
+
+  char* GetBlobBuffer() { return &(blob_buffer_[0]); }
+
+  void ResizeKeyBuffer(size_t kbs);
+
+  void ResizeBlobBuffer(size_t bbs);
+
+ public:
+  // Header is
+  // key length (4 bytes), blob length (8 bytes),
+  // ttl (4 bytes), timestamp (8 bytes),
+  // type (1 byte), subtype (1 byte),
+  // header checksum (4 bytes), blob checksum (4 bytes)
+  // = 34
+  static const size_t kHeaderSize = 4 + 4 + 4 + 8 + 4 + 8 + 1 + 1;
+
+  static const size_t kFooterSize = 8 + 4;
+
+ public:
+  BlobLogRecord();
+
+  ~BlobLogRecord();
+
+  const Slice& Key() const { return key_; }
+
+  const Slice& Blob() const { return blob_; }
+
+  uint32_t GetKeySize() const { return key_size_; }
+
+  uint64_t GetBlobSize() const { return blob_size_; }
+
+  uint32_t GetTTL() const { return ttl_val_; }
+
+  uint64_t GetTimeVal() const { return time_val_; }
+
+  SequenceNumber GetSN() const { return sn_; }
+
+  Status DecodeHeaderFrom(const Slice& hdrslice);
+
+  Status DecodeFooterFrom(const Slice& footerslice);
+};
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
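The Reader introduced in the next two files is the consumer of this format. As a hypothetical sketch of how a full scan of one blob log could drive that API (the Reader construction glue is assumed, and end-of-file surfaces as the IOError statuses seen in blob_log_reader.cc):

// Hypothetical full scan of a single blob log using this patch's Reader.
// Assumes `reader` was already constructed from a SequentialFileReader.
rocksdb::Status DumpBlobLog(rocksdb::blob_db::Reader* reader) {
  rocksdb::blob_db::BlobLogHeader header;
  rocksdb::Status s = reader->ReadHeader(&header);

  rocksdb::blob_db::BlobLogRecord record;
  while (s.ok()) {
    // kReadHdrKeyBlobFooter materializes key and blob and verifies both CRCs.
    s = reader->ReadRecord(&record,
                           rocksdb::blob_db::Reader::kReadHdrKeyBlobFooter);
    if (!s.ok()) {
      break;  // an IOError here is the normal end-of-file signal
    }
    // record.Key() / record.Blob() stay valid until the next ReadRecord.
  }
  return s.IsIOError() ? rocksdb::Status::OK() : s;
}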
+// +#ifndef ROCKSDB_LITE + +#include "utilities/blob_db/blob_log_reader.h" + +#include +#include "rocksdb/env.h" +#include "util/coding.h" +#include "util/crc32c.h" +#include "util/file_reader_writer.h" + +namespace rocksdb { +namespace blob_db { + +Reader::Reader(std::shared_ptr info_log, + unique_ptr&& _file) + : info_log_(info_log), file_(std::move(_file)), buffer_(), next_byte_(0) { + backing_store_.resize(kBlockSize); +} + +Reader::~Reader() {} + +Status Reader::ReadHeader(BlobLogHeader* header) { + assert(file_.get() != nullptr); + assert(next_byte_ == 0); + Status status = + file_->Read(BlobLogHeader::kHeaderSize, &buffer_, GetReadBuffer()); + next_byte_ += buffer_.size(); + if (!status.ok()) return status; + + if (buffer_.size() != BlobLogHeader::kHeaderSize) { + return Status::IOError("EOF reached before file header"); + } + + status = header->DecodeFrom(&buffer_); + return status; +} + +Status Reader::ReadRecord(BlobLogRecord* record, ReadLevel level, + WALRecoveryMode wal_recovery_mode) { + record->Clear(); + buffer_.clear(); + backing_store_[0] = '\0'; + + Status status = + file_->Read(BlobLogRecord::kHeaderSize, &buffer_, GetReadBuffer()); + next_byte_ += buffer_.size(); + if (!status.ok()) return status; + if (buffer_.size() != BlobLogRecord::kHeaderSize) { + return Status::IOError("EOF reached before record header"); + } + + status = record->DecodeHeaderFrom(buffer_); + if (!status.ok()) return status; + + uint32_t header_crc = 0; + uint32_t blob_crc = 0; + size_t crc_data_size = BlobLogRecord::kHeaderSize - 2 * sizeof(uint32_t); + header_crc = crc32c::Extend(header_crc, buffer_.data(), crc_data_size); + + uint64_t kb_size = record->GetKeySize() + record->GetBlobSize(); + switch (level) { + case kReadHdrFooter: + file_->Skip(kb_size); + next_byte_ += kb_size; + status = + file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer()); + next_byte_ += buffer_.size(); + if (!status.ok()) return status; + if (buffer_.size() != BlobLogRecord::kFooterSize) { + return Status::IOError("EOF reached before record footer"); + } + + status = record->DecodeFooterFrom(buffer_); + return status; + + case kReadHdrKeyFooter: + record->ResizeKeyBuffer(record->GetKeySize()); + status = file_->Read(record->GetKeySize(), &record->key_, + record->GetKeyBuffer()); + next_byte_ += record->key_.size(); + if (!status.ok()) return status; + if (record->key_.size() != record->GetKeySize()) { + return Status::IOError("EOF reached before key read"); + } + + header_crc = + crc32c::Extend(header_crc, record->key_.data(), record->GetKeySize()); + header_crc = crc32c::Mask(header_crc); + if (header_crc != record->header_cksum_) { + return Status::Corruption("Record Checksum mismatch: header_cksum"); + } + + file_->Skip(record->GetBlobSize()); + next_byte_ += record->GetBlobSize(); + + status = + file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer()); + next_byte_ += buffer_.size(); + if (!status.ok()) return status; + if (buffer_.size() != BlobLogRecord::kFooterSize) { + return Status::IOError("EOF reached during footer read"); + } + + status = record->DecodeFooterFrom(buffer_); + return status; + + case kReadHdrKeyBlobFooter: + record->ResizeKeyBuffer(record->GetKeySize()); + status = file_->Read(record->GetKeySize(), &record->key_, + record->GetKeyBuffer()); + next_byte_ += record->key_.size(); + if (!status.ok()) return status; + if (record->key_.size() != record->GetKeySize()) { + return Status::IOError("EOF reached before key read"); + } + + header_crc = + crc32c::Extend(header_crc, 
record->key_.data(), record->GetKeySize()); + header_crc = crc32c::Mask(header_crc); + if (header_crc != record->header_cksum_) { + return Status::Corruption("Record Checksum mismatch: header_cksum"); + } + + record->ResizeBlobBuffer(record->GetBlobSize()); + status = file_->Read(record->GetBlobSize(), &record->blob_, + record->GetBlobBuffer()); + next_byte_ += record->blob_.size(); + if (!status.ok()) return status; + if (record->blob_.size() != record->GetBlobSize()) { + return Status::IOError("EOF reached during blob read"); + } + + blob_crc = + crc32c::Extend(blob_crc, record->blob_.data(), record->blob_.size()); + blob_crc = crc32c::Mask(blob_crc); + if (blob_crc != record->checksum_) { + return Status::Corruption("Blob Checksum mismatch"); + } + + status = + file_->Read(BlobLogRecord::kFooterSize, &buffer_, GetReadBuffer()); + next_byte_ += buffer_.size(); + if (!status.ok()) return status; + if (buffer_.size() != BlobLogRecord::kFooterSize) { + return Status::IOError("EOF reached during blob footer read"); + } + + status = record->DecodeFooterFrom(buffer_); + return status; + default: + assert(0); + return status; + } +} + +} // namespace blob_db +} // namespace rocksdb +#endif // ROCKSDB_LITE diff --git a/utilities/blob_db/blob_log_reader.h b/utilities/blob_db/blob_log_reader.h new file mode 100644 index 000000000..5805ceb5e --- /dev/null +++ b/utilities/blob_db/blob_log_reader.h @@ -0,0 +1,93 @@ +// Copyright (c) 2011-present, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +#pragma once + +#ifndef ROCKSDB_LITE + +#include +#include +#include + +#include "rocksdb/options.h" +#include "rocksdb/slice.h" +#include "rocksdb/status.h" +#include "utilities/blob_db/blob_log_format.h" + +namespace rocksdb { + +class SequentialFileReader; +class Logger; + +namespace blob_db { + +/** + * Reader is a general purpose log stream reader implementation. The actual job + * of reading from the device is implemented by the SequentialFile interface. + * + * Please see Writer for details on the file and record layout. + */ +class Reader { + public: + enum ReadLevel { + kReadHdrFooter, + kReadHdrKeyFooter, + kReadHdrKeyBlobFooter, + }; + + // Create a reader that will return log records from "*file". + // "*file" must remain live while this Reader is in use. + // + // If "reporter" is non-nullptr, it is notified whenever some data is + // dropped due to a detected corruption. "*reporter" must remain + // live while this Reader is in use. + // + // If "checksum" is true, verify checksums if available. + // + // The Reader will start reading at the first record located at physical + // position >= initial_offset within the file. + Reader(std::shared_ptr info_log, + std::unique_ptr&& file); + + ~Reader(); + + Status ReadHeader(BlobLogHeader* header); + + // Read the next record into *record. Returns true if read + // successfully, false if we hit end of the input. May use + // "*scratch" as temporary storage. The contents filled in *record + // will only be valid until the next mutating operation on this + // reader or the next mutation to *scratch. 
diff --git a/utilities/blob_db/blob_log_writer.cc b/utilities/blob_db/blob_log_writer.cc
new file mode 100644
index 000000000..295624ddc
--- /dev/null
+++ b/utilities/blob_db/blob_log_writer.cc
@@ -0,0 +1,172 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#ifndef ROCKSDB_LITE
+
+#include "utilities/blob_db/blob_log_writer.h"
+
+#include <cstdint>
+#include <limits>
+#include <string>
+
+#include "rocksdb/env.h"
+#include "util/coding.h"
+#include "util/crc32c.h"
+#include "util/file_reader_writer.h"
+
+namespace rocksdb {
+namespace blob_db {
+
+Writer::Writer(unique_ptr<WritableFileWriter>&& dest, uint64_t log_number,
+               uint64_t bpsync, bool use_fs, uint64_t boffset)
+    : dest_(std::move(dest)),
+      log_number_(log_number),
+      block_offset_(boffset),
+      bytes_per_sync_(bpsync),
+      next_sync_offset_(0),
+      use_fsync_(use_fs),
+      last_elem_type_(kEtNone) {
+  for (int i = 0; i <= kMaxRecordType; i++) {
+    char t = static_cast<char>(i);
+    type_crc_[i] = crc32c::Value(&t, 1);
+  }
+}
+
+Writer::~Writer() {}
+
+void Writer::Sync() { dest_->Sync(use_fsync_); }
+
+Status Writer::WriteHeader(const BlobLogHeader& header) {
+  assert(block_offset_ == 0);
+  assert(last_elem_type_ == kEtNone);
+  std::string str;
+  header.EncodeTo(&str);
+
+  Status s = dest_->Append(Slice(str));
+  if (s.ok()) {
+    block_offset_ += str.size();
+    s = dest_->Flush();
+  }
+  last_elem_type_ = kEtFileHdr;
+  return s;
+}
+
+Status Writer::AppendFooter(const BlobLogFooter& footer) {
+  assert(block_offset_ != 0);
+  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);
+
+  std::string str;
+  footer.EncodeTo(&str);
+
+  Status s = dest_->Append(Slice(str));
+  if (s.ok()) {
+    block_offset_ += str.size();
+    s = dest_->Close();
+    dest_.reset();
+  }
+
+  last_elem_type_ = kEtFileFooter;
+  return s;
+}
+
+Status Writer::AddRecord(const Slice& key, const Slice& val,
+                         uint64_t* key_offset, uint64_t* blob_offset,
+                         uint32_t ttl) {
+  assert(block_offset_ != 0);
+  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);
+
+  std::string buf;
+  ConstructBlobHeader(&buf, key, val, ttl, -1);
+
+  Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+  return s;
+}
+
+Status Writer::AddRecord(const Slice& key, const Slice& val,
+                         uint64_t* key_offset, uint64_t* blob_offset) {
+  assert(block_offset_ != 0);
+  assert(last_elem_type_ == kEtFileHdr || last_elem_type_ == kEtFooter);
+
+  std::string buf;
+  ConstructBlobHeader(&buf, key, val, -1, -1);
+
+  Status s = EmitPhysicalRecord(buf, key, val, key_offset, blob_offset);
+  return s;
+}
+
+void Writer::ConstructBlobHeader(std::string* headerbuf, const Slice& key,
+                                 const Slice& val, int32_t ttl, int64_t ts) {
+  headerbuf->reserve(BlobLogRecord::kHeaderSize);
+
+  uint32_t key_size = static_cast<uint32_t>(key.size());
+  PutFixed32(headerbuf, key_size);
+  PutFixed64(headerbuf, val.size());
+
+  uint32_t ttl_write = (ttl != -1) ? static_cast<uint32_t>(ttl)
+                                   : std::numeric_limits<uint32_t>::max();
+  PutFixed32(headerbuf, ttl_write);
+
+  uint64_t ts_write = (ts != -1) ? static_cast<uint64_t>(ts)
+                                 : std::numeric_limits<uint64_t>::max();
+  PutFixed64(headerbuf, ts_write);
+
+  RecordType t = kFullType;
+  headerbuf->push_back(static_cast<char>(t));
+
+  RecordSubType st = kRegularType;
+  if (ttl != -1) st = kTTLType;
+  headerbuf->push_back(static_cast<char>(st));
+
+  uint32_t header_crc = 0;
+  header_crc =
+      crc32c::Extend(header_crc, headerbuf->c_str(), headerbuf->size());
+  header_crc = crc32c::Extend(header_crc, key.data(), key.size());
+  header_crc = crc32c::Mask(header_crc);
+  PutFixed32(headerbuf, header_crc);
+
+  uint32_t crc = 0;
+  // Compute the crc of the record type and the payload.
+  crc = crc32c::Extend(crc, val.data(), val.size());
+  crc = crc32c::Mask(crc);  // Adjust for storage
+  PutFixed32(headerbuf, crc);
+}
+
+Status Writer::EmitPhysicalRecord(const std::string& headerbuf,
+                                  const Slice& key, const Slice& val,
+                                  uint64_t* key_offset,
+                                  uint64_t* blob_offset) {
+  Status s = dest_->Append(Slice(headerbuf));
+  if (s.ok()) {
+    s = dest_->Append(key);
+    if (s.ok()) s = dest_->Append(val);
+  }
+
+  *key_offset = block_offset_ + BlobLogRecord::kHeaderSize;
+  *blob_offset = *key_offset + key.size();
+  block_offset_ = *blob_offset + val.size();
+  last_elem_type_ = kEtRecord;
+  return s;
+}
+
+Status Writer::AddRecordFooter(const SequenceNumber& seq) {
+  assert(last_elem_type_ == kEtRecord);
+
+  std::string buf;
+  PutFixed64(&buf, seq);
+
+  uint32_t footer_crc = crc32c::Extend(0, buf.c_str(), buf.size());
+  footer_crc = crc32c::Mask(footer_crc);
+  PutFixed32(&buf, footer_crc);
+
+  Status s = dest_->Append(Slice(buf));
+  block_offset_ += BlobLogRecord::kFooterSize;
+
+  if (s.ok()) dest_->Flush();
+
+  last_elem_type_ = kEtFooter;
+  return s;
+}
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
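A minimal sketch of the intended call sequence, matching the asserts on
last_elem_type_ above. The helper name is hypothetical; it assumes
WriteHeader() has already been called on a fresh file and that the caller
checks every Status.

    // File header first, then (record, record footer) pairs, and finally
    // AppendFooter(), which also closes the file.
    rocksdb::Status WriteOneBlob(rocksdb::blob_db::Writer* writer,
                                 const rocksdb::Slice& key,
                                 const rocksdb::Slice& blob,
                                 rocksdb::SequenceNumber seq) {
      uint64_t key_offset = 0;
      uint64_t blob_offset = 0;
      rocksdb::Status s =
          writer->AddRecord(key, blob, &key_offset, &blob_offset);
      if (s.ok()) {
        // Every AddRecord() must be followed by AddRecordFooter(), which
        // appends the sequence number plus its CRC after the payload.
        s = writer->AddRecordFooter(seq);
      }
      return s;
    }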
diff --git a/utilities/blob_db/blob_log_writer.h b/utilities/blob_db/blob_log_writer.h
new file mode 100644
index 000000000..4443c4eeb
--- /dev/null
+++ b/utilities/blob_db/blob_log_writer.h
@@ -0,0 +1,98 @@
+// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+#pragma once
+
+#ifndef ROCKSDB_LITE
+
+#include <cstdint>
+#include <memory>
+#include <string>
+
+#include "rocksdb/slice.h"
+#include "rocksdb/status.h"
+#include "rocksdb/types.h"
+#include "utilities/blob_db/blob_log_format.h"
+
+namespace rocksdb {
+
+class WritableFileWriter;
+
+namespace blob_db {
+
+/**
+ * Writer is the blob log stream writer. It provides an append-only
+ * abstraction for writing blob data.
+ *
+ * Look at blob_log_format.h to see the details of the record formats.
+ */
+class Writer {
+ public:
+  // Create a writer that will append data to "*dest".
+  // "*dest" must be initially empty.
+  // "*dest" must remain live while this Writer is in use.
+  explicit Writer(std::unique_ptr<WritableFileWriter>&& dest,
+                  uint64_t log_number, uint64_t bpsync, bool use_fsync,
+                  uint64_t boffset = 0);
+  ~Writer();
+
+  static void ConstructBlobHeader(std::string* headerbuf, const Slice& key,
+                                  const Slice& val, int32_t ttl, int64_t ts);
+
+  Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
+                   uint64_t* blob_offset);
+
+  Status AddRecord(const Slice& key, const Slice& val, uint64_t* key_offset,
+                   uint64_t* blob_offset, uint32_t ttl);
+
+  Status EmitPhysicalRecord(const std::string& headerbuf, const Slice& key,
+                            const Slice& val, uint64_t* key_offset,
+                            uint64_t* blob_offset);
+
+  Status AddRecordFooter(const SequenceNumber& sn);
+
+  Status AppendFooter(const BlobLogFooter& footer);
+
+  Status WriteHeader(const BlobLogHeader& header);
+
+  WritableFileWriter* file() { return dest_.get(); }
+
+  const WritableFileWriter* file() const { return dest_.get(); }
+
+  uint64_t get_log_number() const { return log_number_; }
+
+  bool ShouldSync() const { return block_offset_ > next_sync_offset_; }
+
+  void Sync();
+
+  void ResetSyncPointer() { next_sync_offset_ += bytes_per_sync_; }
+
+ private:
+  std::unique_ptr<WritableFileWriter> dest_;
+  uint64_t log_number_;
+  uint64_t block_offset_;  // Current offset in block
+  uint64_t bytes_per_sync_;
+  uint64_t next_sync_offset_;
+  bool use_fsync_;
+
+  // crc32c values for all supported record types. These are
+  // pre-computed to reduce the overhead of computing the crc of the
+  // record type stored in the header.
+  uint32_t type_crc_[kMaxRecordType + 1];
+
+  // No copying allowed
+  Writer(const Writer&) = delete;
+  Writer& operator=(const Writer&) = delete;
+
+ public:
+  enum ElemType { kEtNone, kEtFileHdr, kEtRecord, kEtFooter, kEtFileFooter };
+  ElemType last_elem_type_;
+};
+
+}  // namespace blob_db
+}  // namespace rocksdb
+#endif  // ROCKSDB_LITE
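For reference while reviewing the offset arithmetic in EmitPhysicalRecord(),
the record header that ConstructBlobHeader() emits lays out as follows.
The field widths are read off the PutFixed calls above; the 34-byte total
and the static_assert are a derivation, not something stated by the patch,
so treat them as an assumption to verify against blob_log_format.h.

    // key size        fixed32   4 bytes
    // blob size       fixed64   8 bytes
    // ttl             fixed32   4 bytes  (UINT32_MAX when no TTL is set)
    // timestamp       fixed64   8 bytes  (UINT64_MAX when unset)
    // record type     char      1 byte
    // record subtype  char      1 byte
    // header crc      fixed32   4 bytes  (covers the fields above + key)
    // blob crc        fixed32   4 bytes  (covers the blob bytes only)
    //
    // The reader's "kHeaderSize - 2 * sizeof(uint32_t)" CRC span is then
    // exactly the 26 bytes preceding the two CRC fields.
    #include "utilities/blob_db/blob_log_format.h"
    static_assert(rocksdb::blob_db::BlobLogRecord::kHeaderSize ==
                      4 + 8 + 4 + 8 + 1 + 1 + 4 + 4,
                  "blob record header fields should sum to kHeaderSize");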
diff --git a/utilities/transactions/optimistic_transaction_db_impl.h b/utilities/transactions/optimistic_transaction_db_impl.h
index 51d2950ed..5721e499e 100644
--- a/utilities/transactions/optimistic_transaction_db_impl.h
+++ b/utilities/transactions/optimistic_transaction_db_impl.h
@@ -16,10 +16,14 @@ namespace rocksdb {
 class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
  public:
-  explicit OptimisticTransactionDBImpl(DB* db)
-      : OptimisticTransactionDB(db), db_(db) {}
+  explicit OptimisticTransactionDBImpl(DB* db, bool take_ownership = true)
+      : OptimisticTransactionDB(db), db_(db), db_owner_(take_ownership) {}
 
-  ~OptimisticTransactionDBImpl() {}
+  ~OptimisticTransactionDBImpl() {
+    if (!db_owner_) {
+      db_.release();
+    }
+  }
 
   Transaction* BeginTransaction(const WriteOptions& write_options,
                                 const OptimisticTransactionOptions& txn_options,
@@ -29,6 +33,7 @@ class OptimisticTransactionDBImpl : public OptimisticTransactionDB {
 
  private:
   std::unique_ptr<DB> db_;
+  bool db_owner_;
 
   void ReinitializeTransaction(Transaction* txn,
                                const WriteOptions& write_options,