cache SuperVersion in thread local storage to avoid mutex lock

Summary: as title

Test Plan:
asan_check
will post results later

Reviewers: haobo, igor, dhruba, sdong

Reviewed By: haobo

CC: leveldb

Differential Revision: https://reviews.facebook.net/D16257
main
Lei Jin 11 years ago
parent e41c060a06
commit ad0c3747cb
  1. 146
      db/db_impl.cc
  2. 31
      db/db_impl.h
  3. 4
      include/rocksdb/options.h
  4. 5
      include/rocksdb/statistics.h
  5. 21
      port/likely.h
  6. 3
      util/options.cc
  7. 2
      util/statistics.h
  8. 6
      util/thread_local.cc

@ -38,6 +38,7 @@
#include "db/version_set.h" #include "db/version_set.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "port/port.h" #include "port/port.h"
#include "port/likely.h"
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
@ -270,6 +271,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
logfile_number_(0), logfile_number_(0),
super_version_(nullptr), super_version_(nullptr),
super_version_number_(0), super_version_number_(0),
local_sv_(new ThreadLocalPtr(&SuperVersionUnrefHandle)),
tmp_batch_(), tmp_batch_(),
bg_compaction_scheduled_(0), bg_compaction_scheduled_(0),
bg_manual_only_(0), bg_manual_only_(0),
@ -288,7 +290,8 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
delayed_writes_(0), delayed_writes_(0),
storage_options_(options), storage_options_(options),
bg_work_gate_closed_(false), bg_work_gate_closed_(false),
refitting_level_(false) { refitting_level_(false),
opened_successfully_(false) {
mem_->Ref(); mem_->Ref();
env_->GetAbsolutePath(dbname, &db_absolute_path_); env_->GetAbsolutePath(dbname, &db_absolute_path_);
@ -319,12 +322,11 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
} }
DBImpl::~DBImpl() { DBImpl::~DBImpl() {
autovector<MemTable*> to_delete;
// Wait for background work to finish // Wait for background work to finish
if (flush_on_destroy_ && mem_->GetFirstSequenceNumber() != 0) { if (flush_on_destroy_ && mem_->GetFirstSequenceNumber() != 0) {
FlushMemTable(FlushOptions()); FlushMemTable(FlushOptions());
} }
mutex_.Lock(); mutex_.Lock();
shutting_down_.Release_Store(this); // Any non-nullptr value is ok shutting_down_.Release_Store(this); // Any non-nullptr value is ok
while (bg_compaction_scheduled_ || while (bg_compaction_scheduled_ ||
@ -332,6 +334,34 @@ DBImpl::~DBImpl() {
bg_logstats_scheduled_) { bg_logstats_scheduled_) {
bg_cv_.Wait(); bg_cv_.Wait();
} }
mutex_.Unlock();
// Release SuperVersion reference kept in ThreadLocalPtr.
// This must be done outside of mutex_ since unref handler can lock mutex.
// It also needs to be done after FlushMemTable, which can trigger local_sv_
// access.
delete local_sv_;
mutex_.Lock();
if (options_.allow_thread_local) {
// Clean up obsolete files due to SuperVersion release.
// (1) Need to delete obsolete files before closing because RepairDB()
// scans all existing files in the file system and builds manifest file.
// Keeping obsolete files confuses the repair process.
// (2) Need to check if we Open()/Recover() the DB successfully before
// deleting because if VersionSet recover fails (may be due to corrupted
// manifest file), it is not able to identify live files correctly. As a
// result, all "live" files can get deleted by accident. However, corrupted
// manifest is recoverable by RepairDB().
if (opened_successfully_) {
DeletionState deletion_state;
FindObsoleteFiles(deletion_state, true);
// manifest number starting from 2
deletion_state.manifest_file_number = 1;
PurgeObsoleteFiles(deletion_state);
}
}
if (super_version_ != nullptr) { if (super_version_ != nullptr) {
bool is_last_reference __attribute__((unused)); bool is_last_reference __attribute__((unused));
is_last_reference = super_version_->Unref(); is_last_reference = super_version_->Unref();
@ -349,6 +379,7 @@ DBImpl::~DBImpl() {
delete mem_->Unref(); delete mem_->Unref();
} }
autovector<MemTable*> to_delete;
imm_.current()->Unref(&to_delete); imm_.current()->Unref(&to_delete);
for (MemTable* m: to_delete) { for (MemTable* m: to_delete) {
delete m; delete m;
@ -1286,6 +1317,10 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
if (s.ok()) { if (s.ok()) {
InstallSuperVersion(deletion_state); InstallSuperVersion(deletion_state);
// Reset SuperVersions cached in thread local storage
if (options_.allow_thread_local) {
ResetThreadLocalSuperVersions(&deletion_state);
}
if (madeProgress) { if (madeProgress) {
*madeProgress = 1; *madeProgress = 1;
} }
@ -2811,26 +2846,21 @@ Status DBImpl::Get(const ReadOptions& options,
// DeletionState gets created and destructed outside of the lock -- we // DeletionState gets created and destructed outside of the lock -- we
// use this convinently to: // use this convinently to:
// * malloc one SuperVersion() outside of the lock -- new_superversion // * malloc one SuperVersion() outside of the lock -- new_superversion
// * delete one SuperVersion() outside of the lock -- superversion_to_free // * delete SuperVersion()s outside of the lock -- superversions_to_free
// //
// However, if InstallSuperVersion() gets called twice with the same, // However, if InstallSuperVersion() gets called twice with the same,
// deletion_state, we can't reuse the SuperVersion() that got malloced because // deletion_state, we can't reuse the SuperVersion() that got malloced because
// first call already used it. In that rare case, we take a hit and create a // first call already used it. In that rare case, we take a hit and create a
// new SuperVersion() inside of the mutex. We do similar thing // new SuperVersion() inside of the mutex.
// for superversion_to_free
void DBImpl::InstallSuperVersion(DeletionState& deletion_state) { void DBImpl::InstallSuperVersion(DeletionState& deletion_state) {
mutex_.AssertHeld();
// if new_superversion == nullptr, it means somebody already used it // if new_superversion == nullptr, it means somebody already used it
SuperVersion* new_superversion = SuperVersion* new_superversion =
(deletion_state.new_superversion != nullptr) ? (deletion_state.new_superversion != nullptr) ?
deletion_state.new_superversion : new SuperVersion(); deletion_state.new_superversion : new SuperVersion();
SuperVersion* old_superversion = InstallSuperVersion(new_superversion); SuperVersion* old_superversion = InstallSuperVersion(new_superversion);
deletion_state.new_superversion = nullptr; deletion_state.new_superversion = nullptr;
if (deletion_state.superversion_to_free != nullptr) { deletion_state.superversions_to_free.push_back(old_superversion);
// somebody already put it there
delete old_superversion;
} else {
deletion_state.superversion_to_free = old_superversion;
}
} }
DBImpl::SuperVersion* DBImpl::InstallSuperVersion( DBImpl::SuperVersion* DBImpl::InstallSuperVersion(
@ -2839,7 +2869,10 @@ DBImpl::SuperVersion* DBImpl::InstallSuperVersion(
new_superversion->Init(mem_, imm_.current(), versions_->current()); new_superversion->Init(mem_, imm_.current(), versions_->current());
SuperVersion* old_superversion = super_version_; SuperVersion* old_superversion = super_version_;
super_version_ = new_superversion; super_version_ = new_superversion;
super_version_->db = this;
++super_version_number_; ++super_version_number_;
super_version_->version_number = super_version_number_;
if (old_superversion != nullptr && old_superversion->Unref()) { if (old_superversion != nullptr && old_superversion->Unref()) {
old_superversion->Cleanup(); old_superversion->Cleanup();
return old_superversion; // will let caller delete outside of mutex return old_superversion; // will let caller delete outside of mutex
@ -2847,6 +2880,20 @@ DBImpl::SuperVersion* DBImpl::InstallSuperVersion(
return nullptr; return nullptr;
} }
// Drops the references held on SuperVersions that are currently cached in
// thread local storage (local_sv_).
//
// REQUIRES: mutex_ is held by the caller (asserted below).
//
// Every pointer handed back by local_sv_->Scrape() is unreferenced. When that
// was the last reference, the SuperVersion is cleaned up here and queued on
// deletion_state->superversions_to_free instead of being deleted in place.
void DBImpl::ResetThreadLocalSuperVersions(DeletionState* deletion_state) {
  mutex_.AssertHeld();

  autovector<void*> scraped;
  local_sv_->Scrape(&scraped);

  for (void* raw : scraped) {
    assert(raw);
    SuperVersion* const cached = static_cast<SuperVersion*>(raw);
    if (!cached->Unref()) {
      // Someone else still holds a reference; nothing to free.
      continue;
    }
    cached->Cleanup();
    deletion_state->superversions_to_free.push_back(cached);
  }
}
Status DBImpl::GetImpl(const ReadOptions& options, Status DBImpl::GetImpl(const ReadOptions& options,
const Slice& key, const Slice& key,
std::string* value, std::string* value,
@ -2864,10 +2911,41 @@ Status DBImpl::GetImpl(const ReadOptions& options,
snapshot = versions_->LastSequence(); snapshot = versions_->LastSequence();
} }
// This can be replaced by using atomics and spinlock instead of big mutex // Acquire SuperVersion
mutex_.Lock(); SuperVersion* sv = nullptr;
SuperVersion* get_version = super_version_->Ref(); if (LIKELY(options_.allow_thread_local)) {
mutex_.Unlock(); // The SuperVersion is cached in thread local storage to avoid acquiring
// mutex when SuperVersion does not change since the last use. When a new
// SuperVersion is installed, the compaction or flush thread cleans up
// cached SuperVersion in all existing thread local storage. To avoid
// acquiring mutex for this operation, we use atomic Swap() on the thread
// local pointer to guarantee exclusive access. If the thread local pointer
// is being used while a new SuperVersion is installed, the cached
// SuperVersion can become stale. It will eventually get refreshed either
// on the next GetImpl() call or next SuperVersion installation.
sv = static_cast<SuperVersion*>(local_sv_->Swap(nullptr));
if (!sv || sv->version_number !=
super_version_number_.load(std::memory_order_relaxed)) {
RecordTick(options_.statistics.get(), NUMBER_SUPERVERSION_UPDATES);
SuperVersion* sv_to_delete = nullptr;
if (sv && sv->Unref()) {
mutex_.Lock();
sv->Cleanup();
sv_to_delete = sv;
} else {
mutex_.Lock();
}
sv = super_version_->Ref();
mutex_.Unlock();
delete sv_to_delete;
}
} else {
mutex_.Lock();
sv = super_version_->Ref();
mutex_.Unlock();
}
bool have_stat_update = false; bool have_stat_update = false;
Version::GetStats stats; Version::GetStats stats;
@ -2880,18 +2958,18 @@ Status DBImpl::GetImpl(const ReadOptions& options,
// merge_operands will contain the sequence of merges in the latter case. // merge_operands will contain the sequence of merges in the latter case.
LookupKey lkey(key, snapshot); LookupKey lkey(key, snapshot);
BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer); BumpPerfTime(&perf_context.get_snapshot_time, &snapshot_timer);
if (get_version->mem->Get(lkey, value, &s, merge_context, options_)) { if (sv->mem->Get(lkey, value, &s, merge_context, options_)) {
// Done // Done
RecordTick(options_.statistics.get(), MEMTABLE_HIT); RecordTick(options_.statistics.get(), MEMTABLE_HIT);
} else if (get_version->imm->Get(lkey, value, &s, merge_context, options_)) { } else if (sv->imm->Get(lkey, value, &s, merge_context, options_)) {
// Done // Done
RecordTick(options_.statistics.get(), MEMTABLE_HIT); RecordTick(options_.statistics.get(), MEMTABLE_HIT);
} else { } else {
StopWatchNano from_files_timer(env_, false); StopWatchNano from_files_timer(env_, false);
StartPerfTimer(&from_files_timer); StartPerfTimer(&from_files_timer);
get_version->current->Get(options, lkey, value, &s, &merge_context, &stats, sv->current->Get(options, lkey, value, &s, &merge_context, &stats,
options_, value_found); options_, value_found);
have_stat_update = true; have_stat_update = true;
BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer); BumpPerfTime(&perf_context.get_from_output_files_time, &from_files_timer);
RecordTick(options_.statistics.get(), MEMTABLE_MISS); RecordTick(options_.statistics.get(), MEMTABLE_MISS);
@ -2900,31 +2978,32 @@ Status DBImpl::GetImpl(const ReadOptions& options,
StopWatchNano post_process_timer(env_, false); StopWatchNano post_process_timer(env_, false);
StartPerfTimer(&post_process_timer); StartPerfTimer(&post_process_timer);
bool delete_get_version = false;
if (!options_.disable_seek_compaction && have_stat_update) { if (!options_.disable_seek_compaction && have_stat_update) {
mutex_.Lock(); mutex_.Lock();
if (get_version->current->UpdateStats(stats)) { if (sv->current->UpdateStats(stats)) {
MaybeScheduleFlushOrCompaction(); MaybeScheduleFlushOrCompaction();
} }
if (get_version->Unref()) {
get_version->Cleanup();
delete_get_version = true;
}
mutex_.Unlock(); mutex_.Unlock();
}
// Release SuperVersion
if (LIKELY(options_.allow_thread_local)) {
// Put the SuperVersion back
local_sv_->Reset(static_cast<void*>(sv));
} else { } else {
if (get_version->Unref()) { bool delete_sv = false;
if (sv->Unref()) {
mutex_.Lock(); mutex_.Lock();
get_version->Cleanup(); sv->Cleanup();
mutex_.Unlock(); mutex_.Unlock();
delete_get_version = true; delete_sv = true;
}
if (delete_sv) {
delete sv;
} }
}
if (delete_get_version) {
delete get_version;
} }
// Note, tickers are atomic now - no lock protection needed any more. // Note, tickers are atomic now - no lock protection needed any more.
RecordTick(options_.statistics.get(), NUMBER_KEYS_READ); RecordTick(options_.statistics.get(), NUMBER_KEYS_READ);
RecordTick(options_.statistics.get(), BYTES_READ, value->size()); RecordTick(options_.statistics.get(), BYTES_READ, value->size());
BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer); BumpPerfTime(&perf_context.get_post_process_time, &post_process_timer);
@ -3772,6 +3851,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
impl->mutex_.Unlock(); impl->mutex_.Unlock();
if (s.ok()) { if (s.ok()) {
impl->opened_successfully_ = true;
*dbptr = impl; *dbptr = impl;
} else { } else {
delete impl; delete impl;

@ -26,6 +26,7 @@
#include "rocksdb/transaction_log.h" #include "rocksdb/transaction_log.h"
#include "util/autovector.h" #include "util/autovector.h"
#include "util/stats_logger.h" #include "util/stats_logger.h"
#include "util/thread_local.h"
#include "db/internal_stats.h" #include "db/internal_stats.h"
namespace rocksdb { namespace rocksdb {
@ -152,6 +153,9 @@ class DBImpl : public DB {
// all memtables that we need to free through this vector. We then // all memtables that we need to free through this vector. We then
// delete all those memtables outside of mutex, during destruction // delete all those memtables outside of mutex, during destruction
autovector<MemTable*> to_delete; autovector<MemTable*> to_delete;
// Version number of the current SuperVersion
uint64_t version_number;
DBImpl* db;
// should be called outside the mutex // should be called outside the mutex
SuperVersion() = default; SuperVersion() = default;
@ -170,6 +174,16 @@ class DBImpl : public DB {
Version* new_current); Version* new_current);
}; };
// Unref handler passed to the ThreadLocalPtr that backs local_sv_.
//
// `ptr` is a SuperVersion* stored as void*. One reference is dropped; if it
// was the last one, Cleanup() is run while holding the owning DB's mutex_
// (reached through sv->db) and the SuperVersion is then deleted with the
// mutex released.
static void SuperVersionUnrefHandle(void* ptr) {
  auto* const cached = static_cast<DBImpl::SuperVersion*>(ptr);
  if (!cached->Unref()) {
    // Still referenced elsewhere.
    return;
  }
  cached->db->mutex_.Lock();
  cached->Cleanup();
  cached->db->mutex_.Unlock();
  delete cached;
}
// needed for CleanupIteratorState // needed for CleanupIteratorState
struct DeletionState { struct DeletionState {
inline bool HaveSomethingToDelete() const { inline bool HaveSomethingToDelete() const {
@ -195,7 +209,7 @@ class DBImpl : public DB {
// a list of memtables to be free // a list of memtables to be free
autovector<MemTable*> memtables_to_free; autovector<MemTable*> memtables_to_free;
SuperVersion* superversion_to_free; // if nullptr nothing to free autovector<SuperVersion*> superversions_to_free;
SuperVersion* new_superversion; // if nullptr no new superversion SuperVersion* new_superversion; // if nullptr no new superversion
@ -207,7 +221,6 @@ class DBImpl : public DB {
manifest_file_number = 0; manifest_file_number = 0;
log_number = 0; log_number = 0;
prev_log_number = 0; prev_log_number = 0;
superversion_to_free = nullptr;
new_superversion = new_superversion =
create_superversion ? new SuperVersion() : nullptr; create_superversion ? new SuperVersion() : nullptr;
} }
@ -217,8 +230,10 @@ class DBImpl : public DB {
for (auto m : memtables_to_free) { for (auto m : memtables_to_free) {
delete m; delete m;
} }
// free superversion. if nullptr, this will be noop // free superversions
delete superversion_to_free; for (auto s : superversions_to_free) {
delete s;
}
// if new_superversion was not used, it will be non-nullptr and needs // if new_superversion was not used, it will be non-nullptr and needs
// to be freed here // to be freed here
delete new_superversion; delete new_superversion;
@ -400,6 +415,9 @@ class DBImpl : public DB {
// InstallSuperVersion(), i.e. incremented every time super_version_ // InstallSuperVersion(), i.e. incremented every time super_version_
// changes. // changes.
std::atomic<uint64_t> super_version_number_; std::atomic<uint64_t> super_version_number_;
// Thread's local copy of SuperVersion pointer
// This needs to be destructed before mutex_, because its unref handler
// (SuperVersionUnrefHandle) can lock mutex_.
ThreadLocalPtr* local_sv_;
std::string host_name_; std::string host_name_;
@ -489,6 +507,9 @@ class DBImpl : public DB {
// Guard against multiple concurrent refitting // Guard against multiple concurrent refitting
bool refitting_level_; bool refitting_level_;
// Indicate DB was opened successfully
bool opened_successfully_;
// No copying allowed // No copying allowed
DBImpl(const DBImpl&); DBImpl(const DBImpl&);
void operator=(const DBImpl&); void operator=(const DBImpl&);
@ -515,6 +536,8 @@ class DBImpl : public DB {
// deletion_state which can have new_superversion already allocated. // deletion_state which can have new_superversion already allocated.
void InstallSuperVersion(DeletionState& deletion_state); void InstallSuperVersion(DeletionState& deletion_state);
void ResetThreadLocalSuperVersions(DeletionState* deletion_state);
virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props) virtual Status GetPropertiesOfAllTables(TablePropertiesCollection* props)
override; override;

@ -714,6 +714,10 @@ struct Options {
// //
// Default: 0 (disabled) // Default: 0 (disabled)
size_t max_successive_merges; size_t max_successive_merges;
// Allow RocksDB to use thread local storage to optimize performance.
// Default: true
bool allow_thread_local;
}; };
// //

@ -122,6 +122,7 @@ enum Tickers {
// Number of table's properties loaded directly from file, without creating // Number of table's properties loaded directly from file, without creating
// table reader object. // table reader object.
NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
NUMBER_SUPERVERSION_UPDATES,
TICKER_ENUM_MAX TICKER_ENUM_MAX
}; };
@ -176,7 +177,9 @@ const std::vector<std::pair<Tickers, std::string>> TickersNameMap = {
{COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"}, {COMPACT_READ_BYTES, "rocksdb.compact.read.bytes"},
{COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"}, {COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes"},
{NUMBER_DIRECT_LOAD_TABLE_PROPERTIES, {NUMBER_DIRECT_LOAD_TABLE_PROPERTIES,
"rocksdb.number.direct.load.table.properties"}, }; "rocksdb.number.direct.load.table.properties"},
{NUMBER_SUPERVERSION_UPDATES, "rocksdb.number.superversion_updates"},
};
/** /**
* Keep adding histogram's here. * Keep adding histogram's here.

@ -0,0 +1,21 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#ifndef PORT_LIKELY_H_
#define PORT_LIKELY_H_
// Branch-prediction hints. Both macros evaluate to the truth value of `x`
// (0 or 1) and are meant to be used directly as a condition:
//   if (UNLIKELY(ptr == nullptr)) { ... }
//
// The operand is normalized with `!!` before it reaches __builtin_expect:
//  * __builtin_expect takes a `long`, so pointer operands would not compile
//    in C++ and operands wider than `long` could be truncated;
//  * the hint compares against the literal 1 / 0, so an un-normalized
//    non-zero value other than 1 would be hinted as "unexpected".
// The fallback branch applies the same normalization so that the macros
// yield the same value on every compiler.
#if defined(__GNUC__) && __GNUC__ >= 4
#define LIKELY(x) (__builtin_expect(!!(x), 1))
#define UNLIKELY(x) (__builtin_expect(!!(x), 0))
#else
#define LIKELY(x) (!!(x))
#define UNLIKELY(x) (!!(x))
#endif
#endif // PORT_LIKELY_H_

@ -110,7 +110,8 @@ Options::Options()
inplace_callback(nullptr), inplace_callback(nullptr),
memtable_prefix_bloom_bits(0), memtable_prefix_bloom_bits(0),
memtable_prefix_bloom_probes(6), memtable_prefix_bloom_probes(6),
max_successive_merges(0) { max_successive_merges(0),
allow_thread_local(true) {
assert(memtable_factory.get() != nullptr); assert(memtable_factory.get() != nullptr);
} }

@ -7,11 +7,11 @@
#include "rocksdb/statistics.h" #include "rocksdb/statistics.h"
#include "util/histogram.h" #include "util/histogram.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "port/likely.h"
#include <vector> #include <vector>
#include <atomic> #include <atomic>
#define UNLIKELY(val) (__builtin_expect((val), 0))
namespace rocksdb { namespace rocksdb {

@ -9,12 +9,8 @@
#include "util/thread_local.h" #include "util/thread_local.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "port/likely.h"
#if defined(__GNUC__) && __GNUC__ >= 4
#define UNLIKELY(x) (__builtin_expect((x), 0))
#else
#define UNLIKELY(x) (x)
#endif
namespace rocksdb { namespace rocksdb {

Loading…
Cancel
Save