|
|
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
|
|
|
// This source code is licensed under the BSD-style license found in the
|
|
|
|
// LICENSE file in the root directory of this source tree. An additional grant
|
|
|
|
// of patent rights can be found in the PATENTS file in the same directory.
|
|
|
|
//
|
|
|
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
|
|
|
// Use of this source code is governed by a BSD-style license that can be
|
|
|
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
|
|
|
|
|
|
|
#include "db/db_impl.h"
|
|
|
|
|
|
|
|
#ifndef __STDC_FORMAT_MACROS
|
|
|
|
#define __STDC_FORMAT_MACROS
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#include <inttypes.h>
|
|
|
|
#include <algorithm>
|
|
|
|
#include <climits>
|
|
|
|
#include <cstdio>
|
|
|
|
#include <set>
|
|
|
|
#include <stdexcept>
|
|
|
|
#include <stdint.h>
|
|
|
|
#include <string>
|
[RocksDB] [Performance] Speed up FindObsoleteFiles
Summary:
FindObsoleteFiles was slow, holding the single big lock, resulted in bad p99 behavior.
Didn't profile anything, but several things could be improved:
1. VersionSet::AddLiveFiles works with std::set, which is by itself slow (a tree).
You also don't know how many dynamic allocations occur just for building up this tree.
switched to std::vector, also added logic to pre-calculate total size and do just one allocation
2. Don't see why env_->GetChildren() needs to be mutex proteced, moved to PurgeObsoleteFiles where
mutex could be unlocked.
3. switched std::set to std:unordered_set, the conversion from vector is also inside PurgeObsoleteFiles
I have a feeling this should pretty much fix it.
Test Plan: make check; db_stress
Reviewers: dhruba, heyongqiang, MarkCallaghan
Reviewed By: dhruba
CC: leveldb, zshao
Differential Revision: https://reviews.facebook.net/D10197
12 years ago
|
|
|
#include <unordered_set>
|
|
|
|
#include <unordered_map>
|
|
|
|
#include <utility>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include "db/builder.h"
|
|
|
|
#include "db/flush_job.h"
|
|
|
|
#include "db/compaction_job.h"
|
|
|
|
#include "db/db_iter.h"
|
|
|
|
#include "db/dbformat.h"
|
|
|
|
#include "db/filename.h"
|
|
|
|
#include "db/log_reader.h"
|
|
|
|
#include "db/log_writer.h"
|
|
|
|
#include "db/memtable.h"
|
|
|
|
#include "db/memtable_list.h"
|
|
|
|
#include "db/merge_context.h"
|
|
|
|
#include "db/merge_helper.h"
|
|
|
|
#include "db/table_cache.h"
|
|
|
|
#include "db/table_properties_collector.h"
|
|
|
|
#include "db/forward_iterator.h"
|
|
|
|
#include "db/transaction_log_impl.h"
|
|
|
|
#include "db/version_set.h"
|
|
|
|
#include "db/write_batch_internal.h"
|
|
|
|
#include "port/port.h"
|
[CF] Rethink table cache
Summary:
Adapting table cache to column families is interesting. We want table cache to be global LRU, so if some column families are use not as often as others, we want them to be evicted from cache. However, current TableCache object also constructs tables on its own. If table is not found in the cache, TableCache automatically creates new table. We want each column family to be able to specify different table factory.
To solve the problem, we still have a single LRU, but we provide the LRUCache object to TableCache on construction. We have one TableCache per column family, but the underyling cache is shared by all TableCache objects.
This allows us to have a global LRU, but still be able to support different table factories for different column families. Also, in the future it will also be able to support different directories for different column families.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15915
11 years ago
|
|
|
#include "rocksdb/cache.h"
|
|
|
|
#include "port/likely.h"
|
|
|
|
#include "rocksdb/compaction_filter.h"
|
|
|
|
#include "rocksdb/db.h"
|
|
|
|
#include "rocksdb/env.h"
|
|
|
|
#include "rocksdb/merge_operator.h"
|
|
|
|
#include "rocksdb/version.h"
|
|
|
|
#include "rocksdb/statistics.h"
|
|
|
|
#include "rocksdb/status.h"
|
|
|
|
#include "rocksdb/table.h"
|
|
|
|
#include "table/block.h"
|
|
|
|
#include "table/block_based_table_factory.h"
|
|
|
|
#include "table/merger.h"
|
|
|
|
#include "table/table_builder.h"
|
|
|
|
#include "table/two_level_iterator.h"
|
|
|
|
#include "util/auto_roll_logger.h"
|
|
|
|
#include "util/autovector.h"
|
|
|
|
#include "util/build_version.h"
|
|
|
|
#include "util/coding.h"
|
|
|
|
#include "util/hash_skiplist_rep.h"
|
|
|
|
#include "util/hash_linklist_rep.h"
|
|
|
|
#include "util/logging.h"
|
|
|
|
#include "util/log_buffer.h"
|
|
|
|
#include "util/mutexlock.h"
|
|
|
|
#include "util/perf_context_imp.h"
|
|
|
|
#include "util/iostats_context_imp.h"
|
|
|
|
#include "util/stop_watch.h"
|
|
|
|
#include "util/sync_point.h"
|
|
|
|
|
|
|
|
namespace rocksdb {
|
|
|
|
|
|
|
|
const std::string kDefaultColumnFamilyName("default");
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
|
|
|
|
void DumpRocksDBBuildVersion(Logger * log);
|
|
|
|
|
|
|
|
struct DBImpl::WriteContext {
|
|
|
|
autovector<SuperVersion*> superversions_to_free_;
|
|
|
|
autovector<log::Writer*> logs_to_free_;
|
|
|
|
|
|
|
|
~WriteContext() {
|
|
|
|
for (auto& sv : superversions_to_free_) {
|
|
|
|
delete sv;
|
|
|
|
}
|
|
|
|
for (auto& log : logs_to_free_) {
|
|
|
|
delete log;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
Options SanitizeOptions(const std::string& dbname,
|
|
|
|
const InternalKeyComparator* icmp,
|
|
|
|
const Options& src) {
|
|
|
|
auto db_options = SanitizeOptions(dbname, DBOptions(src));
|
|
|
|
auto cf_options = SanitizeOptions(icmp, ColumnFamilyOptions(src));
|
|
|
|
return Options(db_options, cf_options);
|
|
|
|
}
|
|
|
|
|
|
|
|
DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
|
|
|
|
DBOptions result = src;
|
|
|
|
|
|
|
|
// result.max_open_files means an "infinite" open files.
|
|
|
|
if (result.max_open_files != -1) {
|
|
|
|
ClipToRange(&result.max_open_files, 20, 1000000);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.info_log == nullptr) {
|
|
|
|
Status s = CreateLoggerFromOptions(dbname, result.db_log_dir, src.env,
|
|
|
|
result, &result.info_log);
|
|
|
|
if (!s.ok()) {
|
|
|
|
// No place suitable for logging
|
|
|
|
result.info_log = nullptr;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
result.env->IncBackgroundThreadsIfNeeded(src.max_background_compactions,
|
|
|
|
Env::Priority::LOW);
|
|
|
|
result.env->IncBackgroundThreadsIfNeeded(src.max_background_flushes,
|
|
|
|
Env::Priority::HIGH);
|
|
|
|
|
|
|
|
if (!result.rate_limiter) {
|
|
|
|
if (result.bytes_per_sync == 0) {
|
|
|
|
result.bytes_per_sync = 1024 * 1024;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.wal_dir.empty()) {
|
|
|
|
// Use dbname as default
|
|
|
|
result.wal_dir = dbname;
|
|
|
|
}
|
|
|
|
if (result.wal_dir.back() == '/') {
|
|
|
|
result.wal_dir = result.wal_dir.substr(0, result.wal_dir.size() - 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (result.db_paths.size() == 0) {
|
|
|
|
result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
|
|
|
|
}
|
|
|
|
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
Status SanitizeOptionsByTable(
|
|
|
|
const DBOptions& db_opts,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families) {
|
|
|
|
Status s;
|
|
|
|
for (auto cf : column_families) {
|
|
|
|
s = cf.options.table_factory->SanitizeOptions(db_opts, cf.options);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
CompressionType GetCompressionFlush(const ImmutableCFOptions& ioptions) {
|
|
|
|
// Compressing memtable flushes might not help unless the sequential load
|
|
|
|
// optimization is used for leveled compaction. Otherwise the CPU and
|
|
|
|
// latency overhead is not offset by saving much space.
|
|
|
|
|
|
|
|
bool can_compress;
|
|
|
|
|
|
|
|
if (ioptions.compaction_style == kCompactionStyleUniversal) {
|
|
|
|
can_compress =
|
|
|
|
(ioptions.compaction_options_universal.compression_size_percent < 0);
|
|
|
|
} else {
|
|
|
|
// For leveled compress when min_level_to_compress == 0.
|
|
|
|
can_compress = ioptions.compression_per_level.empty() ||
|
|
|
|
ioptions.compression_per_level[0] != kNoCompression;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (can_compress) {
|
|
|
|
return ioptions.compression;
|
|
|
|
} else {
|
|
|
|
return kNoCompression;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
DBImpl::DBImpl(const DBOptions& options, const std::string& dbname)
|
|
|
|
: env_(options.env),
|
|
|
|
dbname_(dbname),
|
|
|
|
db_options_(SanitizeOptions(dbname, options)),
|
|
|
|
stats_(db_options_.statistics.get()),
|
|
|
|
db_lock_(nullptr),
|
|
|
|
mutex_(options.use_adaptive_mutex),
|
|
|
|
shutting_down_(false),
|
|
|
|
bg_cv_(&mutex_),
|
|
|
|
logfile_number_(0),
|
|
|
|
log_empty_(true),
|
|
|
|
default_cf_handle_(nullptr),
|
|
|
|
total_log_size_(0),
|
|
|
|
max_total_in_memory_state_(0),
|
|
|
|
tmp_batch_(),
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
11 years ago
|
|
|
bg_schedule_needed_(false),
|
|
|
|
bg_compaction_scheduled_(0),
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
11 years ago
|
|
|
bg_manual_only_(0),
|
|
|
|
bg_flush_scheduled_(0),
|
|
|
|
manual_compaction_(nullptr),
|
|
|
|
disable_delete_obsolete_files_(0),
|
|
|
|
delete_obsolete_files_last_run_(options.env->NowMicros()),
|
|
|
|
last_stats_dump_time_microsec_(0),
|
|
|
|
flush_on_destroy_(false),
|
|
|
|
env_options_(options),
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
wal_manager_(db_options_, env_options_),
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
bg_work_gate_closed_(false),
|
|
|
|
refitting_level_(false),
|
|
|
|
opened_successfully_(false) {
|
|
|
|
env_->GetAbsolutePath(dbname, &db_absolute_path_);
|
|
|
|
|
|
|
|
// Reserve ten files or so for other uses and give the rest to TableCache.
|
|
|
|
// Give a large number for setting of "infinite" open files.
|
|
|
|
const int table_cache_size = (db_options_.max_open_files == -1) ?
|
|
|
|
4194304 : db_options_.max_open_files - 10;
|
|
|
|
// Reserve ten files or so for other uses and give the rest to TableCache.
|
|
|
|
table_cache_ =
|
|
|
|
NewLRUCache(table_cache_size, db_options_.table_cache_numshardbits,
|
|
|
|
db_options_.table_cache_remove_scan_count_limit);
|
|
|
|
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
versions_.reset(new VersionSet(dbname_, &db_options_, env_options_,
|
|
|
|
table_cache_.get(), &write_controller_));
|
|
|
|
column_family_memtables_.reset(new ColumnFamilyMemTablesImpl(
|
|
|
|
versions_->GetColumnFamilySet(), &flush_scheduler_));
|
|
|
|
|
|
|
|
DumpRocksDBBuildVersion(db_options_.info_log.get());
|
|
|
|
DumpDBFileSummary(db_options_, dbname_);
|
|
|
|
db_options_.Dump(db_options_.info_log.get());
|
|
|
|
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
}
|
|
|
|
|
|
|
|
DBImpl::~DBImpl() {
|
|
|
|
mutex_.Lock();
|
|
|
|
if (flush_on_destroy_) {
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (!cfd->mem()->IsEmpty()) {
|
|
|
|
cfd->Ref();
|
|
|
|
mutex_.Unlock();
|
|
|
|
FlushMemTable(cfd, FlushOptions());
|
|
|
|
mutex_.Lock();
|
|
|
|
cfd->Unref();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Wait for background work to finish
|
|
|
|
shutting_down_.store(true, std::memory_order_release);
|
|
|
|
while (bg_compaction_scheduled_ || bg_flush_scheduled_) {
|
|
|
|
bg_cv_.Wait();
|
|
|
|
}
|
|
|
|
|
|
|
|
flush_scheduler_.Clear();
|
|
|
|
|
|
|
|
if (default_cf_handle_ != nullptr) {
|
|
|
|
// we need to delete handle outside of lock because it does its own locking
|
|
|
|
mutex_.Unlock();
|
|
|
|
delete default_cf_handle_;
|
|
|
|
mutex_.Lock();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Clean up obsolete files due to SuperVersion release.
|
|
|
|
// (1) Need to delete to obsolete files before closing because RepairDB()
|
|
|
|
// scans all existing files in the file system and builds manifest file.
|
|
|
|
// Keeping obsolete files confuses the repair process.
|
|
|
|
// (2) Need to check if we Open()/Recover() the DB successfully before
|
|
|
|
// deleting because if VersionSet recover fails (may be due to corrupted
|
|
|
|
// manifest file), it is not able to identify live files correctly. As a
|
|
|
|
// result, all "live" files can get deleted by accident. However, corrupted
|
|
|
|
// manifest is recoverable by RepairDB().
|
|
|
|
if (opened_successfully_) {
|
|
|
|
JobContext job_context;
|
|
|
|
FindObsoleteFiles(&job_context, true);
|
|
|
|
// manifest number starting from 2
|
|
|
|
job_context.manifest_file_number = 1;
|
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
PurgeObsoleteFiles(job_context);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// versions need to be destroyed before table_cache since it can hold
|
|
|
|
// references to table_cache.
|
|
|
|
versions_.reset();
|
|
|
|
mutex_.Unlock();
|
|
|
|
if (db_lock_ != nullptr) {
|
|
|
|
env_->UnlockFile(db_lock_);
|
|
|
|
}
|
|
|
|
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::NewDB() {
|
|
|
|
VersionEdit new_db;
|
|
|
|
new_db.SetLogNumber(0);
|
|
|
|
new_db.SetNextFile(2);
|
|
|
|
new_db.SetLastSequence(0);
|
|
|
|
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL,
|
|
|
|
db_options_.info_log, "Creating manifest 1 \n");
|
|
|
|
const std::string manifest = DescriptorFileName(dbname_, 1);
|
|
|
|
unique_ptr<WritableFile> file;
|
|
|
|
Status s = env_->NewWritableFile(
|
|
|
|
manifest, &file, env_->OptimizeForManifestWrite(env_options_));
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size);
|
|
|
|
{
|
|
|
|
log::Writer log(std::move(file));
|
|
|
|
std::string record;
|
|
|
|
new_db.EncodeTo(&record);
|
|
|
|
s = log.AddRecord(record);
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
// Make "CURRENT" file that points to the new manifest file.
|
|
|
|
s = SetCurrentFile(env_, dbname_, 1, db_directory_.get());
|
|
|
|
} else {
|
|
|
|
env_->DeleteFile(manifest);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::MaybeIgnoreError(Status* s) const {
|
|
|
|
if (s->ok() || db_options_.paranoid_checks) {
|
|
|
|
// No change needed
|
|
|
|
} else {
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL,
|
|
|
|
db_options_.info_log, "Ignoring error %s", s->ToString().c_str());
|
|
|
|
*s = Status::OK();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const Status DBImpl::CreateArchivalDirectory() {
|
|
|
|
if (db_options_.WAL_ttl_seconds > 0 || db_options_.WAL_size_limit_MB > 0) {
|
|
|
|
std::string archivalPath = ArchivalDirectory(db_options_.wal_dir);
|
|
|
|
return env_->CreateDirIfMissing(archivalPath);
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::PrintStatistics() {
|
|
|
|
auto dbstats = db_options_.statistics.get();
|
|
|
|
if (dbstats) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"STATISTCS:\n %s",
|
|
|
|
dbstats->ToString().c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::MaybeDumpStats() {
|
|
|
|
if (db_options_.stats_dump_period_sec == 0) return;
|
|
|
|
|
|
|
|
const uint64_t now_micros = env_->NowMicros();
|
|
|
|
|
|
|
|
if (last_stats_dump_time_microsec_ +
|
|
|
|
db_options_.stats_dump_period_sec * 1000000
|
|
|
|
<= now_micros) {
|
|
|
|
// Multiple threads could race in here simultaneously.
|
|
|
|
// However, the last one will update last_stats_dump_time_microsec_
|
|
|
|
// atomically. We could see more than one dump during one dump
|
|
|
|
// period in rare cases.
|
|
|
|
last_stats_dump_time_microsec_ = now_micros;
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
|
|
|
|
bool tmp1 = false;
|
|
|
|
bool tmp2 = false;
|
|
|
|
DBPropertyType cf_property_type =
|
|
|
|
GetPropertyType("rocksdb.cfstats", &tmp1, &tmp2);
|
|
|
|
DBPropertyType db_property_type =
|
|
|
|
GetPropertyType("rocksdb.dbstats", &tmp1, &tmp2);
|
|
|
|
std::string stats;
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
cfd->internal_stats()->GetStringProperty(cf_property_type,
|
|
|
|
"rocksdb.cfstats", &stats);
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
}
|
|
|
|
default_cf_internal_stats_->GetStringProperty(db_property_type,
|
|
|
|
"rocksdb.dbstats", &stats);
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
}
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL,
|
|
|
|
db_options_.info_log, "------- DUMPING STATS -------");
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL,
|
|
|
|
db_options_.info_log, "%s", stats.c_str());
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
|
|
|
|
PrintStatistics();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returns the list of live files in 'sst_live' and the list
|
|
|
|
// of all files in the filesystem in 'candidate_files'.
|
|
|
|
// no_full_scan = true -- never do the full scan using GetChildren()
|
|
|
|
// force = false -- don't force the full scan, except every
|
|
|
|
// db_options_.delete_obsolete_files_period_micros
|
|
|
|
// force = true -- force the full scan
|
|
|
|
void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
|
|
|
|
bool no_full_scan) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
|
|
|
|
// if deletion is disabled, do nothing
|
|
|
|
if (disable_delete_obsolete_files_ > 0) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool doing_the_full_scan = false;
|
|
|
|
|
|
|
|
// logic for figurint out if we're doing the full scan
|
|
|
|
if (no_full_scan) {
|
|
|
|
doing_the_full_scan = false;
|
|
|
|
} else if (force || db_options_.delete_obsolete_files_period_micros == 0) {
|
|
|
|
doing_the_full_scan = true;
|
|
|
|
} else {
|
|
|
|
const uint64_t now_micros = env_->NowMicros();
|
|
|
|
if (delete_obsolete_files_last_run_ +
|
|
|
|
db_options_.delete_obsolete_files_period_micros < now_micros) {
|
|
|
|
doing_the_full_scan = true;
|
|
|
|
delete_obsolete_files_last_run_ = now_micros;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// get obsolete files
|
|
|
|
versions_->GetObsoleteFiles(&job_context->sst_delete_files);
|
|
|
|
|
|
|
|
// store the current filenum, lognum, etc
|
|
|
|
job_context->manifest_file_number = versions_->manifest_file_number();
|
|
|
|
job_context->pending_manifest_file_number =
|
|
|
|
versions_->pending_manifest_file_number();
|
|
|
|
job_context->log_number = versions_->MinLogNumber();
|
|
|
|
job_context->prev_log_number = versions_->prev_log_number();
|
|
|
|
|
|
|
|
if (!doing_the_full_scan && !job_context->HaveSomethingToDelete()) {
|
|
|
|
// avoid filling up sst_live if we're sure that we
|
|
|
|
// are not going to do the full scan and that we don't have
|
|
|
|
// anything to delete at the moment
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// don't delete live files
|
|
|
|
for (auto pair : pending_outputs_) {
|
|
|
|
job_context->sst_live.emplace_back(pair.first, pair.second, 0);
|
|
|
|
}
|
|
|
|
versions_->AddLiveFiles(&job_context->sst_live);
|
|
|
|
|
|
|
|
if (doing_the_full_scan) {
|
|
|
|
for (uint32_t path_id = 0;
|
|
|
|
path_id < db_options_.db_paths.size(); path_id++) {
|
|
|
|
// set of all files in the directory. We'll exclude files that are still
|
|
|
|
// alive in the subsequent processings.
|
|
|
|
std::vector<std::string> files;
|
|
|
|
env_->GetChildren(db_options_.db_paths[path_id].path,
|
|
|
|
&files); // Ignore errors
|
|
|
|
for (std::string file : files) {
|
|
|
|
// TODO(icanadi) clean up this mess to avoid having one-off "/" prefixes
|
|
|
|
job_context->candidate_files.emplace_back("/" + file, path_id);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
//Add log files in wal_dir
|
|
|
|
if (db_options_.wal_dir != dbname_) {
|
|
|
|
std::vector<std::string> log_files;
|
|
|
|
env_->GetChildren(db_options_.wal_dir, &log_files); // Ignore errors
|
|
|
|
for (std::string log_file : log_files) {
|
|
|
|
job_context->candidate_files.emplace_back(log_file, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Add info log files in db_log_dir
|
|
|
|
if (!db_options_.db_log_dir.empty() && db_options_.db_log_dir != dbname_) {
|
|
|
|
std::vector<std::string> info_log_files;
|
|
|
|
// Ignore errors
|
|
|
|
env_->GetChildren(db_options_.db_log_dir, &info_log_files);
|
|
|
|
for (std::string log_file : info_log_files) {
|
|
|
|
job_context->candidate_files.emplace_back(log_file, 0);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
bool CompareCandidateFile(const JobContext::CandidateFileInfo& first,
|
|
|
|
const JobContext::CandidateFileInfo& second) {
|
|
|
|
if (first.file_name > second.file_name) {
|
|
|
|
return true;
|
|
|
|
} else if (first.file_name < second.file_name) {
|
|
|
|
return false;
|
|
|
|
} else {
|
|
|
|
return (first.path_id > second.path_id);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}; // namespace
|
|
|
|
|
|
|
|
// Diffs the files listed in filenames and those that do not
|
|
|
|
// belong to live files are posibly removed. Also, removes all the
|
|
|
|
// files in sst_delete_files and log_delete_files.
|
|
|
|
// It is not necessary to hold the mutex when invoking this method.
|
|
|
|
void DBImpl::PurgeObsoleteFiles(const JobContext& state) {
|
|
|
|
// we'd better have sth to delete
|
|
|
|
assert(state.HaveSomethingToDelete());
|
|
|
|
|
|
|
|
// this checks if FindObsoleteFiles() was run before. If not, don't do
|
|
|
|
// PurgeObsoleteFiles(). If FindObsoleteFiles() was run, we need to also
|
|
|
|
// run PurgeObsoleteFiles(), even if disable_delete_obsolete_files_ is true
|
|
|
|
if (state.manifest_file_number == 0) {
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now, convert live list to an unordered map, WITHOUT mutex held;
|
[RocksDB] [Performance] Speed up FindObsoleteFiles
Summary:
FindObsoleteFiles was slow, holding the single big lock, resulted in bad p99 behavior.
Didn't profile anything, but several things could be improved:
1. VersionSet::AddLiveFiles works with std::set, which is by itself slow (a tree).
You also don't know how many dynamic allocations occur just for building up this tree.
switched to std::vector, also added logic to pre-calculate total size and do just one allocation
2. Don't see why env_->GetChildren() needs to be mutex proteced, moved to PurgeObsoleteFiles where
mutex could be unlocked.
3. switched std::set to std:unordered_set, the conversion from vector is also inside PurgeObsoleteFiles
I have a feeling this should pretty much fix it.
Test Plan: make check; db_stress
Reviewers: dhruba, heyongqiang, MarkCallaghan
Reviewed By: dhruba
CC: leveldb, zshao
Differential Revision: https://reviews.facebook.net/D10197
12 years ago
|
|
|
// set is slow.
|
|
|
|
std::unordered_map<uint64_t, const FileDescriptor*> sst_live_map;
|
|
|
|
for (const FileDescriptor& fd : state.sst_live) {
|
|
|
|
sst_live_map[fd.GetNumber()] = &fd;
|
|
|
|
}
|
|
|
|
|
|
|
|
auto candidate_files = state.candidate_files;
|
|
|
|
candidate_files.reserve(candidate_files.size() +
|
|
|
|
state.sst_delete_files.size() +
|
|
|
|
state.log_delete_files.size());
|
|
|
|
// We may ignore the dbname when generating the file names.
|
|
|
|
const char* kDumbDbName = "";
|
|
|
|
for (auto file : state.sst_delete_files) {
|
|
|
|
candidate_files.emplace_back(
|
|
|
|
MakeTableFileName(kDumbDbName, file->fd.GetNumber()),
|
|
|
|
file->fd.GetPathId());
|
|
|
|
delete file;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto file_num : state.log_delete_files) {
|
|
|
|
if (file_num > 0) {
|
|
|
|
candidate_files.emplace_back(LogFileName(kDumbDbName, file_num).substr(1),
|
|
|
|
0);
|
|
|
|
}
|
|
|
|
}
|
[RocksDB] [Performance] Speed up FindObsoleteFiles
Summary:
FindObsoleteFiles was slow, holding the single big lock, resulted in bad p99 behavior.
Didn't profile anything, but several things could be improved:
1. VersionSet::AddLiveFiles works with std::set, which is by itself slow (a tree).
You also don't know how many dynamic allocations occur just for building up this tree.
switched to std::vector, also added logic to pre-calculate total size and do just one allocation
2. Don't see why env_->GetChildren() needs to be mutex proteced, moved to PurgeObsoleteFiles where
mutex could be unlocked.
3. switched std::set to std:unordered_set, the conversion from vector is also inside PurgeObsoleteFiles
I have a feeling this should pretty much fix it.
Test Plan: make check; db_stress
Reviewers: dhruba, heyongqiang, MarkCallaghan
Reviewed By: dhruba
CC: leveldb, zshao
Differential Revision: https://reviews.facebook.net/D10197
12 years ago
|
|
|
|
|
|
|
// dedup state.candidate_files so we don't try to delete the same
|
|
|
|
// file twice
|
|
|
|
sort(candidate_files.begin(), candidate_files.end(), CompareCandidateFile);
|
|
|
|
candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()),
|
|
|
|
candidate_files.end());
|
|
|
|
|
|
|
|
std::vector<std::string> old_info_log_files;
|
|
|
|
InfoLogPrefix info_log_prefix(!db_options_.db_log_dir.empty(), dbname_);
|
|
|
|
for (const auto& candidate_file : candidate_files) {
|
|
|
|
std::string to_delete = candidate_file.file_name;
|
|
|
|
uint32_t path_id = candidate_file.path_id;
|
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
|
|
|
// Ignore file if we cannot recognize it.
|
|
|
|
if (!ParseFileName(to_delete, &number, info_log_prefix.prefix, &type)) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool keep = true;
|
|
|
|
switch (type) {
|
|
|
|
case kLogFile:
|
|
|
|
keep = ((number >= state.log_number) ||
|
|
|
|
(number == state.prev_log_number));
|
|
|
|
break;
|
|
|
|
case kDescriptorFile:
|
|
|
|
// Keep my manifest file, and any newer incarnations'
|
|
|
|
// (can happen during manifest roll)
|
|
|
|
keep = (number >= state.manifest_file_number);
|
|
|
|
break;
|
|
|
|
case kTableFile:
|
|
|
|
keep = (sst_live_map.find(number) != sst_live_map.end());
|
|
|
|
break;
|
|
|
|
case kTempFile:
|
|
|
|
// Any temp files that are currently being written to must
|
|
|
|
// be recorded in pending_outputs_, which is inserted into "live".
|
|
|
|
// Also, SetCurrentFile creates a temp file when writing out new
|
|
|
|
// manifest, which is equal to state.pending_manifest_file_number. We
|
|
|
|
// should not delete that file
|
|
|
|
keep = (sst_live_map.find(number) != sst_live_map.end()) ||
|
|
|
|
(number == state.pending_manifest_file_number);
|
|
|
|
break;
|
|
|
|
case kInfoLogFile:
|
|
|
|
keep = true;
|
|
|
|
if (number != 0) {
|
|
|
|
old_info_log_files.push_back(to_delete);
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case kCurrentFile:
|
|
|
|
case kDBLockFile:
|
|
|
|
case kIdentityFile:
|
|
|
|
case kMetaDatabase:
|
|
|
|
keep = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (keep) {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::string fname;
|
|
|
|
if (type == kTableFile) {
|
|
|
|
// evict from cache
|
|
|
|
TableCache::Evict(table_cache_.get(), number);
|
|
|
|
fname = TableFileName(db_options_.db_paths, number, path_id);
|
|
|
|
} else {
|
|
|
|
fname = ((type == kLogFile) ?
|
|
|
|
db_options_.wal_dir : dbname_) + "/" + to_delete;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef ROCKSDB_LITE
|
|
|
|
Status s = env_->DeleteFile(fname);
|
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"Delete %s type=%d #%" PRIu64 " -- %s\n",
|
|
|
|
fname.c_str(), type, number, s.ToString().c_str());
|
|
|
|
#else // not ROCKSDB_LITE
|
|
|
|
if (type == kLogFile && (db_options_.WAL_ttl_seconds > 0 ||
|
|
|
|
db_options_.WAL_size_limit_MB > 0)) {
|
|
|
|
wal_manager_.ArchiveWALFile(fname, number);
|
|
|
|
} else {
|
|
|
|
Status s = env_->DeleteFile(fname);
|
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"Delete %s type=%d #%" PRIu64 " -- %s\n",
|
|
|
|
fname.c_str(), type, number, s.ToString().c_str());
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
}
|
|
|
|
|
|
|
|
// Delete old info log files.
|
|
|
|
size_t old_info_log_file_count = old_info_log_files.size();
|
|
|
|
if (old_info_log_file_count >= db_options_.keep_log_file_num) {
|
|
|
|
std::sort(old_info_log_files.begin(), old_info_log_files.end());
|
|
|
|
size_t end = old_info_log_file_count - db_options_.keep_log_file_num;
|
|
|
|
for (unsigned int i = 0; i <= end; i++) {
|
|
|
|
std::string& to_delete = old_info_log_files.at(i);
|
|
|
|
std::string full_path_to_delete = (db_options_.db_log_dir.empty() ?
|
|
|
|
dbname_ : db_options_.db_log_dir) + "/" + to_delete;
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"Delete info log file %s\n",
|
|
|
|
full_path_to_delete.c_str());
|
|
|
|
Status s = env_->DeleteFile(full_path_to_delete);
|
|
|
|
if (!s.ok()) {
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL,
|
|
|
|
db_options_.info_log, "Delete info log file %s FAILED -- %s\n",
|
|
|
|
to_delete.c_str(), s.ToString().c_str());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
wal_manager_.PurgeObsoleteWALFiles();
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::DeleteObsoleteFiles() {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
JobContext job_context;
|
|
|
|
FindObsoleteFiles(&job_context, true);
|
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
PurgeObsoleteFiles(job_context);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::Recover(
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
|
|
|
|
bool error_if_log_file_exist) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
|
|
|
|
bool is_new_db = false;
|
|
|
|
assert(db_lock_ == nullptr);
|
Refactor Recover() code
Summary:
This diff does two things:
* Rethinks how we call Recover() with read_only option. Before, we call it with pointer to memtable where we'd like to apply those changes to. This memtable is set in db_impl_readonly.cc and it's actually DBImpl::mem_. Why don't we just apply updates to mem_ right away? It seems more intuitive.
* Changes when we apply updates to manifest. Before, the process is to recover all the logs, flush it to sst files and then do one giant commit that atomically adds all recovered sst files and sets the next log number. This works good enough, but causes some small troubles for my column family approach, since I can't have one VersionEdit apply to more than single column family[1]. The change here is to commit the files recovered from logs right away. Here is the state of the world before the change:
1. Recover log 5, add new sst files to edit
2. Recover log 7, add new sst files to edit
3. Recover log 8, add new sst files to edit
4. Commit all added sst files to manifest and mark log files 5, 7 and 8 as recoverd (via SetLogNumber(9) function)
After the change, we'll do:
1. Recover log 5, commit the new sst files and set log 5 as recovered
2. Recover log 7, commit the new sst files and set log 7 as recovered
3. Recover log 8, commit the new sst files and set log 8 as recovered
The added (small) benefit is that if we fail after (2), the new recovery will only have to recover log 8. In previous case, we'll have to restart the recovery from the beginning. The bigger benefit will be to enable easier integration of multiple column families in Recovery code path.
[1] I'm happy to dicuss this decison, but I believe this is the cleanest way to go. It also makes backward compatibility much easier. We don't have a requirement of adding multiple column families atomically.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15237
11 years ago
|
|
|
if (!read_only) {
|
|
|
|
// We call CreateDirIfMissing() as the directory may already exist (if we
|
|
|
|
// are reopening a DB), when this happens we don't want creating the
|
|
|
|
// directory to cause an error. However, we need to check if creating the
|
|
|
|
// directory fails or else we may get an obscure message about the lock
|
|
|
|
// file not existing. One real-world example of this occurring is if
|
|
|
|
// env->CreateDirIfMissing() doesn't create intermediate directories, e.g.
|
|
|
|
// when dbname_ is "dir/db" but when "dir" doesn't exist.
|
|
|
|
Status s = env_->CreateDirIfMissing(dbname_);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto& db_path : db_options_.db_paths) {
|
|
|
|
s = env_->CreateDirIfMissing(db_path.path);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
s = env_->NewDirectory(dbname_, &db_directory_);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
s = env_->LockFile(LockFileName(dbname_), &db_lock_);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!env_->FileExists(CurrentFileName(dbname_))) {
|
|
|
|
if (db_options_.create_if_missing) {
|
|
|
|
s = NewDB();
|
|
|
|
is_new_db = true;
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
dbname_, "does not exist (create_if_missing is false)");
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if (db_options_.error_if_exists) {
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
dbname_, "exists (error_if_exists is true)");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Check for the IDENTITY file and create it if not there
|
|
|
|
if (!env_->FileExists(IdentityFileName(dbname_))) {
|
|
|
|
s = SetIdentityFile(env_, dbname_);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status s = versions_->Recover(column_families, read_only);
|
|
|
|
if (db_options_.paranoid_checks && s.ok()) {
|
|
|
|
s = CheckConsistency();
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
SequenceNumber max_sequence(0);
|
|
|
|
default_cf_handle_ = new ColumnFamilyHandleImpl(
|
|
|
|
versions_->GetColumnFamilySet()->GetDefault(), this, &mutex_);
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
default_cf_internal_stats_ = default_cf_handle_->cfd()->internal_stats();
|
|
|
|
single_column_family_mode_ =
|
|
|
|
versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1;
|
|
|
|
|
|
|
|
// Recover from all newer log files than the ones named in the
|
|
|
|
// descriptor (new log files may have been added by the previous
|
|
|
|
// incarnation without registering them in the descriptor).
|
|
|
|
//
|
|
|
|
// Note that prev_log_number() is no longer used, but we pay
|
|
|
|
// attention to it in case we are recovering a database
|
|
|
|
// produced by an older version of rocksdb.
|
|
|
|
const uint64_t min_log = versions_->MinLogNumber();
|
|
|
|
const uint64_t prev_log = versions_->prev_log_number();
|
|
|
|
std::vector<std::string> filenames;
|
|
|
|
s = env_->GetChildren(db_options_.wal_dir, &filenames);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<uint64_t> logs;
|
|
|
|
for (size_t i = 0; i < filenames.size(); i++) {
|
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
|
|
|
if (ParseFileName(filenames[i], &number, &type) && type == kLogFile) {
|
|
|
|
if (is_new_db) {
|
|
|
|
return Status::Corruption(
|
|
|
|
"While creating a new Db, wal_dir contains "
|
|
|
|
"existing log file: ",
|
|
|
|
filenames[i]);
|
|
|
|
} else if ((number >= min_log) || (number == prev_log)) {
|
|
|
|
logs.push_back(number);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (logs.size() > 0 && error_if_log_file_exist) {
|
|
|
|
return Status::Corruption(""
|
|
|
|
"The db was opened in readonly mode with error_if_log_file_exist"
|
|
|
|
"flag but a log file already exists");
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!logs.empty()) {
|
|
|
|
// Recover in the order in which the logs were generated
|
|
|
|
std::sort(logs.begin(), logs.end());
|
|
|
|
s = RecoverLogFiles(logs, &max_sequence, read_only);
|
|
|
|
if (!s.ok()) {
|
|
|
|
// Clear memtables if recovery failed
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
SetTickerCount(stats_, SEQUENCE_NUMBER, versions_->LastSequence());
|
|
|
|
}
|
|
|
|
|
|
|
|
// Initial value
|
|
|
|
max_total_in_memory_state_ = 0;
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
|
|
|
|
max_total_in_memory_state_ += mutable_cf_options->write_buffer_size *
|
|
|
|
mutable_cf_options->max_write_buffer_number;
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// REQUIRES: log_numbers are sorted in ascending order
|
|
|
|
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& log_numbers,
|
|
|
|
SequenceNumber* max_sequence, bool read_only) {
|
|
|
|
struct LogReporter : public log::Reader::Reporter {
|
|
|
|
Env* env;
|
|
|
|
Logger* info_log;
|
|
|
|
const char* fname;
|
|
|
|
Status* status; // nullptr if db_options_.paranoid_checks==false or
|
|
|
|
// db_options_.skip_log_error_on_recovery==true
|
|
|
|
virtual void Corruption(size_t bytes, const Status& s) {
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL,
|
|
|
|
info_log, "%s%s: dropping %d bytes; %s",
|
|
|
|
(this->status == nullptr ? "(ignoring error) " : ""),
|
|
|
|
fname, static_cast<int>(bytes), s.ToString().c_str());
|
|
|
|
if (this->status != nullptr && this->status->ok()) *this->status = s;
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
Status status;
|
|
|
|
std::unordered_map<int, VersionEdit> version_edits;
|
|
|
|
// no need to refcount because iteration is under mutex
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
VersionEdit edit;
|
|
|
|
edit.SetColumnFamily(cfd->GetID());
|
|
|
|
version_edits.insert({cfd->GetID(), edit});
|
|
|
|
}
|
Refactor Recover() code
Summary:
This diff does two things:
* Rethinks how we call Recover() with read_only option. Before, we call it with pointer to memtable where we'd like to apply those changes to. This memtable is set in db_impl_readonly.cc and it's actually DBImpl::mem_. Why don't we just apply updates to mem_ right away? It seems more intuitive.
* Changes when we apply updates to manifest. Before, the process is to recover all the logs, flush it to sst files and then do one giant commit that atomically adds all recovered sst files and sets the next log number. This works good enough, but causes some small troubles for my column family approach, since I can't have one VersionEdit apply to more than single column family[1]. The change here is to commit the files recovered from logs right away. Here is the state of the world before the change:
1. Recover log 5, add new sst files to edit
2. Recover log 7, add new sst files to edit
3. Recover log 8, add new sst files to edit
4. Commit all added sst files to manifest and mark log files 5, 7 and 8 as recoverd (via SetLogNumber(9) function)
After the change, we'll do:
1. Recover log 5, commit the new sst files and set log 5 as recovered
2. Recover log 7, commit the new sst files and set log 7 as recovered
3. Recover log 8, commit the new sst files and set log 8 as recovered
The added (small) benefit is that if we fail after (2), the new recovery will only have to recover log 8. In previous case, we'll have to restart the recovery from the beginning. The bigger benefit will be to enable easier integration of multiple column families in Recovery code path.
[1] I'm happy to dicuss this decison, but I believe this is the cleanest way to go. It also makes backward compatibility much easier. We don't have a requirement of adding multiple column families atomically.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15237
11 years ago
|
|
|
|
|
|
|
for (auto log_number : log_numbers) {
|
|
|
|
// The previous incarnation may not have written any MANIFEST
|
|
|
|
// records after allocating this log number. So we manually
|
|
|
|
// update the file number allocation counter in VersionSet.
|
|
|
|
versions_->MarkFileNumberUsed(log_number);
|
|
|
|
// Open the log file
|
|
|
|
std::string fname = LogFileName(db_options_.wal_dir, log_number);
|
|
|
|
unique_ptr<SequentialFile> file;
|
|
|
|
status = env_->NewSequentialFile(fname, &file, env_options_);
|
|
|
|
if (!status.ok()) {
|
|
|
|
MaybeIgnoreError(&status);
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
} else {
|
|
|
|
// Fail with one log file, but that's ok.
|
|
|
|
// Try next one.
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create the log reader.
|
|
|
|
LogReporter reporter;
|
|
|
|
reporter.env = env_;
|
|
|
|
reporter.info_log = db_options_.info_log.get();
|
|
|
|
reporter.fname = fname.c_str();
|
|
|
|
reporter.status =
|
|
|
|
(db_options_.paranoid_checks && !db_options_.skip_log_error_on_recovery
|
|
|
|
? &status
|
|
|
|
: nullptr);
|
|
|
|
// We intentially make log::Reader do checksumming even if
|
|
|
|
// paranoid_checks==false so that corruptions cause entire commits
|
|
|
|
// to be skipped instead of propagating bad information (like overly
|
|
|
|
// large sequence numbers).
|
|
|
|
log::Reader reader(std::move(file), &reporter, true /*checksum*/,
|
|
|
|
0 /*initial_offset*/);
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL,
|
|
|
|
db_options_.info_log, "Recovering log #%" PRIu64 "", log_number);
|
|
|
|
|
|
|
|
// Read all the records and add to a memtable
|
|
|
|
std::string scratch;
|
|
|
|
Slice record;
|
|
|
|
WriteBatch batch;
|
|
|
|
while (reader.ReadRecord(&record, &scratch)) {
|
|
|
|
if (record.size() < 12) {
|
|
|
|
reporter.Corruption(record.size(),
|
|
|
|
Status::Corruption("log record too small"));
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
WriteBatchInternal::SetContents(&batch, record);
|
|
|
|
|
|
|
|
// If column family was not found, it might mean that the WAL write
|
|
|
|
// batch references to the column family that was dropped after the
|
|
|
|
// insert. We don't want to fail the whole write batch in that case --
|
|
|
|
// we just ignore the update.
|
|
|
|
// That's why we set ignore missing column families to true
|
|
|
|
status = WriteBatchInternal::InsertInto(
|
|
|
|
&batch, column_family_memtables_.get(), true, log_number);
|
|
|
|
|
|
|
|
MaybeIgnoreError(&status);
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
const SequenceNumber last_seq = WriteBatchInternal::Sequence(&batch) +
|
|
|
|
WriteBatchInternal::Count(&batch) - 1;
|
|
|
|
if (last_seq > *max_sequence) {
|
|
|
|
*max_sequence = last_seq;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!read_only) {
|
|
|
|
// we can do this because this is called before client has access to the
|
|
|
|
// DB and there is only a single thread operating on DB
|
|
|
|
ColumnFamilyData* cfd;
|
|
|
|
|
|
|
|
while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) {
|
|
|
|
cfd->Unref();
|
|
|
|
// If this asserts, it means that InsertInto failed in
|
|
|
|
// filtering updates to already-flushed column families
|
|
|
|
assert(cfd->GetLogNumber() <= log_number);
|
|
|
|
auto iter = version_edits.find(cfd->GetID());
|
|
|
|
assert(iter != version_edits.end());
|
|
|
|
VersionEdit* edit = &iter->second;
|
|
|
|
status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
|
|
|
|
if (!status.ok()) {
|
|
|
|
// Reflect errors immediately so that conditions like full
|
|
|
|
// file-systems cause the DB::Open() to fail.
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
flush_scheduler_.Clear();
|
|
|
|
if (versions_->LastSequence() < *max_sequence) {
|
|
|
|
versions_->SetLastSequence(*max_sequence);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!read_only) {
|
|
|
|
// no need to refcount since client still doesn't have access
|
|
|
|
// to the DB and can not drop column families while we iterate
|
|
|
|
auto max_log_number = log_numbers.back();
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
auto iter = version_edits.find(cfd->GetID());
|
|
|
|
assert(iter != version_edits.end());
|
|
|
|
VersionEdit* edit = &iter->second;
|
|
|
|
|
|
|
|
if (cfd->GetLogNumber() > max_log_number) {
|
|
|
|
// Column family cfd has already flushed the data
|
|
|
|
// from all logs. Memtable has to be empty because
|
|
|
|
// we filter the updates based on log_number
|
|
|
|
// (in WriteBatch::InsertInto)
|
|
|
|
assert(cfd->mem()->GetFirstSequenceNumber() == 0);
|
|
|
|
assert(edit->NumEntries() == 0);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
// flush the final memtable (if non-empty)
|
|
|
|
if (cfd->mem()->GetFirstSequenceNumber() != 0) {
|
|
|
|
status = WriteLevel0TableForRecovery(cfd, cfd->mem(), edit);
|
|
|
|
if (!status.ok()) {
|
|
|
|
// Recovery failed
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions());
|
|
|
|
}
|
|
|
|
|
|
|
|
// write MANIFEST with update
|
|
|
|
// writing log_number in the manifest means that any log file
|
|
|
|
// with number strongly less than (log_number + 1) is already
|
|
|
|
// recovered and should be ignored on next reincarnation.
|
|
|
|
// Since we already recovered max_log_number, we want all logs
|
|
|
|
// with numbers `<= max_log_number` (includes this one) to be ignored
|
|
|
|
edit->SetLogNumber(max_log_number + 1);
|
|
|
|
// we must mark the next log number as used, even though it's
|
|
|
|
// not actually used. that is because VersionSet assumes
|
|
|
|
// VersionSet::next_file_number_ always to be strictly greater than any
|
|
|
|
// log number
|
|
|
|
versions_->MarkFileNumberUsed(max_log_number + 1);
|
|
|
|
status = versions_->LogAndApply(
|
|
|
|
cfd, *cfd->GetLatestMutableCFOptions(), edit, &mutex_);
|
|
|
|
if (!status.ok()) {
|
|
|
|
// Recovery failed
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
Refactor Recover() code
Summary:
This diff does two things:
* Rethinks how we call Recover() with read_only option. Before, we call it with pointer to memtable where we'd like to apply those changes to. This memtable is set in db_impl_readonly.cc and it's actually DBImpl::mem_. Why don't we just apply updates to mem_ right away? It seems more intuitive.
* Changes when we apply updates to manifest. Before, the process is to recover all the logs, flush it to sst files and then do one giant commit that atomically adds all recovered sst files and sets the next log number. This works good enough, but causes some small troubles for my column family approach, since I can't have one VersionEdit apply to more than single column family[1]. The change here is to commit the files recovered from logs right away. Here is the state of the world before the change:
1. Recover log 5, add new sst files to edit
2. Recover log 7, add new sst files to edit
3. Recover log 8, add new sst files to edit
4. Commit all added sst files to manifest and mark log files 5, 7 and 8 as recoverd (via SetLogNumber(9) function)
After the change, we'll do:
1. Recover log 5, commit the new sst files and set log 5 as recovered
2. Recover log 7, commit the new sst files and set log 7 as recovered
3. Recover log 8, commit the new sst files and set log 8 as recovered
The added (small) benefit is that if we fail after (2), the new recovery will only have to recover log 8. In previous case, we'll have to restart the recovery from the beginning. The bigger benefit will be to enable easier integration of multiple column families in Recovery code path.
[1] I'm happy to dicuss this decison, but I believe this is the cleanest way to go. It also makes backward compatibility much easier. We don't have a requirement of adding multiple column families atomically.
Test Plan: make check
Reviewers: dhruba, haobo, kailiu, sdong
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15237
11 years ago
|
|
|
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem,
|
|
|
|
VersionEdit* edit) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
const uint64_t start_micros = env_->NowMicros();
|
|
|
|
FileMetaData meta;
|
|
|
|
meta.fd = FileDescriptor(versions_->NewFileNumber(), 0, 0);
|
|
|
|
pending_outputs_[meta.fd.GetNumber()] = 0; // path 0 for level 0 file.
|
|
|
|
ReadOptions ro;
|
|
|
|
ro.total_order_seek = true;
|
|
|
|
Arena arena;
|
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
ScopedArenaIterator iter(mem->NewIterator(ro, &arena));
|
|
|
|
const SequenceNumber newest_snapshot = snapshots_.GetNewest();
|
|
|
|
const SequenceNumber earliest_seqno_in_memtable =
|
|
|
|
mem->GetFirstSequenceNumber();
|
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] [WriteLevel0TableForRecovery]"
|
|
|
|
" Level-0 table #%" PRIu64 ": started",
|
|
|
|
cfd->GetName().c_str(), meta.fd.GetNumber());
|
|
|
|
|
|
|
|
{
|
|
|
|
mutex_.Unlock();
|
|
|
|
s = BuildTable(
|
|
|
|
dbname_, env_, *cfd->ioptions(), env_options_, cfd->table_cache(),
|
|
|
|
iter.get(), &meta, cfd->internal_comparator(), newest_snapshot,
|
|
|
|
earliest_seqno_in_memtable, GetCompressionFlush(*cfd->ioptions()),
|
|
|
|
cfd->ioptions()->compression_opts, Env::IO_HIGH);
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
mutex_.Lock();
|
|
|
|
}
|
|
|
|
|
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] [WriteLevel0TableForRecovery]"
|
|
|
|
" Level-0 table #%" PRIu64 ": %" PRIu64 " bytes %s",
|
|
|
|
cfd->GetName().c_str(), meta.fd.GetNumber(), meta.fd.GetFileSize(),
|
|
|
|
s.ToString().c_str());
|
|
|
|
}
|
|
|
|
pending_outputs_.erase(meta.fd.GetNumber());
|
|
|
|
|
|
|
|
// Note that if file_size is zero, the file has been deleted and
|
|
|
|
// should not be added to the manifest.
|
|
|
|
int level = 0;
|
|
|
|
if (s.ok() && meta.fd.GetFileSize() > 0) {
|
|
|
|
edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetPathId(),
|
|
|
|
meta.fd.GetFileSize(), meta.smallest, meta.largest,
|
|
|
|
meta.smallest_seqno, meta.largest_seqno);
|
|
|
|
}
|
|
|
|
|
|
|
|
InternalStats::CompactionStats stats(1);
|
|
|
|
stats.micros = env_->NowMicros() - start_micros;
|
|
|
|
stats.bytes_written = meta.fd.GetFileSize();
|
|
|
|
stats.files_out_levelnp1 = 1;
|
|
|
|
cfd->internal_stats()->AddCompactionStats(level, stats);
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
cfd->internal_stats()->AddCFStats(
|
|
|
|
InternalStats::BYTES_FLUSHED, meta.fd.GetFileSize());
|
|
|
|
RecordTick(stats_, COMPACT_WRITE_BYTES, meta.fd.GetFileSize());
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::FlushMemTableToOutputFile(
|
|
|
|
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
|
|
|
|
bool* madeProgress, JobContext* job_context, LogBuffer* log_buffer) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
assert(cfd->imm()->size() != 0);
|
|
|
|
assert(cfd->imm()->IsFlushPending());
|
|
|
|
|
|
|
|
FlushJob flush_job(dbname_, cfd, db_options_, mutable_cf_options,
|
|
|
|
env_options_, versions_.get(), &mutex_, &shutting_down_,
|
|
|
|
&pending_outputs_, snapshots_.GetNewest(), job_context,
|
|
|
|
log_buffer, db_directory_.get(),
|
|
|
|
GetCompressionFlush(*cfd->ioptions()), stats_);
|
|
|
|
|
|
|
|
Status s = flush_job.Run();
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
InstallSuperVersionBackground(cfd, job_context, mutable_cf_options);
|
|
|
|
if (madeProgress) {
|
|
|
|
*madeProgress = 1;
|
|
|
|
}
|
|
|
|
VersionStorageInfo::LevelSummaryStorage tmp;
|
|
|
|
LogToBuffer(log_buffer, "[%s] Level summary: %s\n", cfd->GetName().c_str(),
|
|
|
|
cfd->current()->storage_info()->LevelSummary(&tmp));
|
|
|
|
|
|
|
|
if (disable_delete_obsolete_files_ == 0) {
|
|
|
|
// add to deletion state
|
|
|
|
while (alive_log_files_.size() &&
|
|
|
|
alive_log_files_.begin()->number < versions_->MinLogNumber()) {
|
|
|
|
const auto& earliest = *alive_log_files_.begin();
|
|
|
|
job_context->log_delete_files.push_back(earliest.number);
|
|
|
|
total_log_size_ -= earliest.size;
|
|
|
|
alive_log_files_.pop_front();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!s.ok() && !s.IsShutdownInProgress() && db_options_.paranoid_checks &&
|
|
|
|
bg_error_.ok()) {
|
|
|
|
// if a bad error happened (not ShutdownInProgress) and paranoid_checks is
|
|
|
|
// true, mark DB read-only
|
|
|
|
bg_error_ = s;
|
|
|
|
}
|
|
|
|
RecordFlushIOStats();
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::CompactRange(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice* begin, const Slice* end,
|
|
|
|
bool reduce_level, int target_level,
|
|
|
|
uint32_t target_path_id) {
|
|
|
|
if (target_path_id >= db_options_.db_paths.size()) {
|
|
|
|
return Status::InvalidArgument("Invalid target path ID");
|
|
|
|
}
|
|
|
|
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
|
|
|
|
Status s = FlushMemTable(cfd, FlushOptions());
|
|
|
|
if (!s.ok()) {
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
int max_level_with_files = 0;
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
Version* base = cfd->current();
|
|
|
|
for (int level = 1; level < cfd->NumberLevels(); level++) {
|
|
|
|
if (base->storage_info()->OverlapInLevel(level, begin, end)) {
|
|
|
|
max_level_with_files = level;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
for (int level = 0; level <= max_level_with_files; level++) {
|
|
|
|
// in case the compaction is unversal or if we're compacting the
|
|
|
|
// bottom-most level, the output level will be the same as input one.
|
|
|
|
// level 0 can never be the bottommost level (i.e. if all files are in level
|
|
|
|
// 0, we will compact to level 1)
|
|
|
|
if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
|
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleFIFO ||
|
|
|
|
(level == max_level_with_files && level > 0)) {
|
|
|
|
s = RunManualCompaction(cfd, level, level, target_path_id, begin, end);
|
|
|
|
} else {
|
|
|
|
s = RunManualCompaction(cfd, level, level + 1, target_path_id, begin,
|
|
|
|
end);
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (reduce_level) {
|
|
|
|
s = ReFitLevel(cfd, max_level_with_files, target_level);
|
|
|
|
}
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
// an automatic compaction that has been scheduled might have been
|
|
|
|
// preempted by the manual compactions. Need to schedule it back.
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::SetOptions(ColumnFamilyHandle* column_family,
|
|
|
|
const std::unordered_map<std::string, std::string>& options_map) {
|
|
|
|
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family)->cfd();
|
|
|
|
if (options_map.empty()) {
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL,
|
|
|
|
db_options_.info_log, "SetOptions() on column family [%s], empty input",
|
|
|
|
cfd->GetName().c_str());
|
|
|
|
return Status::InvalidArgument("empty input");
|
|
|
|
}
|
|
|
|
|
|
|
|
MutableCFOptions new_options;
|
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
s = cfd->SetOptions(options_map);
|
|
|
|
if (s.ok()) {
|
|
|
|
new_options = *cfd->GetLatestMutableCFOptions();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"SetOptions() on column family [%s], inputs:",
|
|
|
|
cfd->GetName().c_str());
|
|
|
|
for (const auto& o : options_map) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"%s: %s\n", o.first.c_str(), o.second.c_str());
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL,
|
|
|
|
db_options_.info_log, "[%s] SetOptions succeeded",
|
|
|
|
cfd->GetName().c_str());
|
|
|
|
new_options.Dump(db_options_.info_log.get());
|
|
|
|
} else {
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] SetOptions failed", cfd->GetName().c_str());
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// return the same level if it cannot be moved
|
|
|
|
int DBImpl::FindMinimumEmptyLevelFitting(ColumnFamilyData* cfd,
|
|
|
|
const MutableCFOptions& mutable_cf_options, int level) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
const auto* vstorage = cfd->current()->storage_info();
|
|
|
|
int minimum_level = level;
|
|
|
|
for (int i = level - 1; i > 0; --i) {
|
|
|
|
// stop if level i is not empty
|
|
|
|
if (vstorage->NumLevelFiles(i) > 0) break;
|
|
|
|
// stop if level i is too small (cannot fit the level files)
|
|
|
|
if (mutable_cf_options.MaxBytesForLevel(i) <
|
|
|
|
vstorage->NumLevelBytes(level)) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
minimum_level = i;
|
|
|
|
}
|
|
|
|
return minimum_level;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) {
|
|
|
|
assert(level < cfd->NumberLevels());
|
|
|
|
|
|
|
|
SuperVersion* superversion_to_free = nullptr;
|
|
|
|
SuperVersion* new_superversion = new SuperVersion();
|
|
|
|
|
|
|
|
mutex_.Lock();
|
|
|
|
|
|
|
|
// only allow one thread refitting
|
|
|
|
if (refitting_level_) {
|
|
|
|
mutex_.Unlock();
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"[ReFitLevel] another thread is refitting");
|
|
|
|
delete new_superversion;
|
|
|
|
return Status::NotSupported("another thread is refitting");
|
|
|
|
}
|
|
|
|
refitting_level_ = true;
|
|
|
|
|
|
|
|
// wait for all background threads to stop
|
|
|
|
bg_work_gate_closed_ = true;
|
|
|
|
while (bg_compaction_scheduled_ > 0 || bg_flush_scheduled_) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"[RefitLevel] waiting for background threads to stop: %d %d",
|
|
|
|
bg_compaction_scheduled_, bg_flush_scheduled_);
|
|
|
|
bg_cv_.Wait();
|
|
|
|
}
|
|
|
|
|
|
|
|
const MutableCFOptions mutable_cf_options =
|
|
|
|
*cfd->GetLatestMutableCFOptions();
|
|
|
|
// move to a smaller level
|
|
|
|
int to_level = target_level;
|
|
|
|
if (target_level < 0) {
|
|
|
|
to_level = FindMinimumEmptyLevelFitting(cfd, mutable_cf_options, level);
|
|
|
|
}
|
|
|
|
|
|
|
|
assert(to_level <= level);
|
|
|
|
|
|
|
|
Status status;
|
|
|
|
if (to_level < level) {
|
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] Before refitting:\n%s",
|
|
|
|
cfd->GetName().c_str(), cfd->current()->DebugString().data());
|
|
|
|
|
|
|
|
VersionEdit edit;
|
|
|
|
edit.SetColumnFamily(cfd->GetID());
|
|
|
|
for (const auto& f : cfd->current()->storage_info()->LevelFiles(level)) {
|
|
|
|
edit.DeleteFile(level, f->fd.GetNumber());
|
|
|
|
edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetPathId(),
|
|
|
|
f->fd.GetFileSize(), f->smallest, f->largest,
|
|
|
|
f->smallest_seqno, f->largest_seqno);
|
|
|
|
}
|
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] Apply version edit:\n%s",
|
|
|
|
cfd->GetName().c_str(), edit.DebugString().data());
|
|
|
|
|
|
|
|
status = versions_->LogAndApply(cfd,
|
|
|
|
mutable_cf_options, &edit, &mutex_, db_directory_.get());
|
|
|
|
superversion_to_free = InstallSuperVersion(
|
|
|
|
cfd, new_superversion, mutable_cf_options);
|
|
|
|
new_superversion = nullptr;
|
|
|
|
|
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] LogAndApply: %s\n", cfd->GetName().c_str(),
|
|
|
|
status.ToString().data());
|
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] After refitting:\n%s",
|
|
|
|
cfd->GetName().c_str(), cfd->current()->DebugString().data());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
refitting_level_ = false;
|
|
|
|
bg_work_gate_closed_ = false;
|
|
|
|
|
|
|
|
mutex_.Unlock();
|
|
|
|
delete superversion_to_free;
|
|
|
|
delete new_superversion;
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
int DBImpl::NumberLevels(ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
return cfh->cfd()->NumberLevels();
|
|
|
|
}
|
|
|
|
|
|
|
|
int DBImpl::MaxMemCompactionLevel(ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
return cfh->cfd()->GetSuperVersion()->
|
|
|
|
mutable_cf_options.max_mem_compaction_level;
|
|
|
|
}
|
|
|
|
|
|
|
|
int DBImpl::Level0StopWriteTrigger(ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
return cfh->cfd()->GetSuperVersion()->
|
|
|
|
mutable_cf_options.level0_stop_writes_trigger;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::Flush(const FlushOptions& flush_options,
|
|
|
|
ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
return FlushMemTable(cfh->cfd(), flush_options);
|
|
|
|
}
|
|
|
|
|
|
|
|
SequenceNumber DBImpl::GetLatestSequenceNumber() const {
|
|
|
|
return versions_->LastSequence();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::RunManualCompaction(ColumnFamilyData* cfd, int input_level,
|
|
|
|
int output_level, uint32_t output_path_id,
|
|
|
|
const Slice* begin, const Slice* end) {
|
|
|
|
assert(input_level >= 0);
|
|
|
|
|
|
|
|
InternalKey begin_storage, end_storage;
|
|
|
|
|
|
|
|
ManualCompaction manual;
|
|
|
|
manual.cfd = cfd;
|
|
|
|
manual.input_level = input_level;
|
|
|
|
manual.output_level = output_level;
|
|
|
|
manual.output_path_id = output_path_id;
|
|
|
|
manual.done = false;
|
|
|
|
manual.in_progress = false;
|
|
|
|
// For universal compaction, we enforce every manual compaction to compact
|
|
|
|
// all files.
|
|
|
|
if (begin == nullptr ||
|
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
|
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
|
|
|
|
manual.begin = nullptr;
|
|
|
|
} else {
|
|
|
|
begin_storage = InternalKey(*begin, kMaxSequenceNumber, kValueTypeForSeek);
|
|
|
|
manual.begin = &begin_storage;
|
|
|
|
}
|
|
|
|
if (end == nullptr ||
|
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
|
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
|
|
|
|
manual.end = nullptr;
|
|
|
|
} else {
|
|
|
|
end_storage = InternalKey(*end, 0, static_cast<ValueType>(0));
|
|
|
|
manual.end = &end_storage;
|
|
|
|
}
|
|
|
|
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
11 years ago
|
|
|
// When a manual compaction arrives, temporarily disable scheduling of
|
|
|
|
// non-manual compactions and wait until the number of scheduled compaction
|
|
|
|
// jobs drops to zero. This is needed to ensure that this manual compaction
|
|
|
|
// can compact any range of keys/files.
|
|
|
|
//
|
|
|
|
// bg_manual_only_ is non-zero when at least one thread is inside
|
|
|
|
// RunManualCompaction(), i.e. during that time no other compaction will
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
11 years ago
|
|
|
// get scheduled (see MaybeScheduleFlushOrCompaction).
|
|
|
|
//
|
|
|
|
// Note that the following loop doesn't stop more that one thread calling
|
|
|
|
// RunManualCompaction() from getting to the second while loop below.
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
11 years ago
|
|
|
// However, only one of them will actually schedule compaction, while
|
|
|
|
// others will wait on a condition variable until it completes.
|
|
|
|
|
|
|
|
++bg_manual_only_;
|
|
|
|
while (bg_compaction_scheduled_ > 0) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] Manual compaction waiting for all other scheduled background "
|
|
|
|
"compactions to finish",
|
|
|
|
cfd->GetName().c_str());
|
|
|
|
bg_cv_.Wait();
|
|
|
|
}
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
11 years ago
|
|
|
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] Manual compaction starting",
|
|
|
|
cfd->GetName().c_str());
|
|
|
|
|
Don't run background jobs (flush, compactions) when bg_error_ is set
Summary:
If bg_error_ is set, that means that we mark DB read only. However, current behavior still continues the flushes and compactions, even though bg_error_ is set.
On the other hand, if bg_error_ is set, we will return Status::OK() from CompactRange(), although the compaction didn't actually succeed.
This is clearly not desired behavior. I found this when I was debugging t5132159, although I'm pretty sure these aren't related.
Also, when we're shutting down, it's dangerous to exit RunManualCompaction(), since that will destruct ManualCompaction object. Background compaction job might still hold a reference to manual_compaction_ and this will lead to undefined behavior. I changed the behavior so that we only exit RunManualCompaction when manual compaction job is marked done.
Test Plan: make check
Reviewers: sdong, ljin, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23223
10 years ago
|
|
|
// We don't check bg_error_ here, because if we get the error in compaction,
|
|
|
|
// the compaction will set manual.status to bg_error_ and set manual.done to
|
|
|
|
// true.
|
|
|
|
while (!manual.done) {
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
11 years ago
|
|
|
assert(bg_manual_only_ > 0);
|
|
|
|
if (manual_compaction_ != nullptr) {
|
|
|
|
// Running either this or some other manual compaction
|
|
|
|
bg_cv_.Wait();
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
11 years ago
|
|
|
} else {
|
|
|
|
manual_compaction_ = &manual;
|
|
|
|
bg_compaction_scheduled_++;
|
|
|
|
env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Fix a deadlock in CompactRange()
Summary:
The way DBImpl::TEST_CompactRange() throttles down the number of bg compactions
can cause it to deadlock when CompactRange() is called concurrently from
multiple threads. Imagine a following scenario with only two threads
(max_background_compactions is 10 and bg_compaction_scheduled_ is initially 0):
1. Thread #1 increments bg_compaction_scheduled_ (to LargeNumber), sets
bg_compaction_scheduled_ to 9 (newvalue), schedules the compaction
(bg_compaction_scheduled_ is now 10) and waits for it to complete.
2. Thread #2 calls TEST_CompactRange(), increments bg_compaction_scheduled_
(now LargeNumber + 10) and waits on a cv for bg_compaction_scheduled_ to
drop to LargeNumber.
3. BG thread completes the first manual compaction, decrements
bg_compaction_scheduled_ and wakes up all threads waiting on bg_cv_.
Thread #1 runs, increments bg_compaction_scheduled_ by LargeNumber again
(now 2*LargeNumber + 9). Since that's more than LargeNumber + newvalue,
thread #2 also goes to sleep (waiting on bg_cv_), without resetting
bg_compaction_scheduled_.
This diff attempts to address the problem by introducing a new counter
bg_manual_only_ (when positive, MaybeScheduleFlushOrCompaction() will only
schedule manual compactions).
Test Plan:
I could pretty much consistently reproduce the deadlock with a program that
calls CompactRange(nullptr, nullptr) immediately after Write() from multiple
threads. This no longer happens with this patch.
Tests (make check) pass.
Reviewers: dhruba, igor, sdong, haobo
Reviewed By: igor
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14799
11 years ago
|
|
|
assert(!manual.in_progress);
|
|
|
|
assert(bg_manual_only_ > 0);
|
|
|
|
--bg_manual_only_;
|
|
|
|
return manual.status;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
|
|
|
|
const FlushOptions& flush_options) {
|
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
WriteContext context;
|
|
|
|
MutexLock guard_lock(&mutex_);
|
|
|
|
|
|
|
|
if (cfd->imm()->size() == 0 && cfd->mem()->IsEmpty()) {
|
|
|
|
// Nothing to flush
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteThread::Writer w(&mutex_);
|
|
|
|
s = write_thread_.EnterWriteThread(&w, 0);
|
|
|
|
assert(s.ok() && !w.done); // No timeout and nobody should do our job
|
|
|
|
|
|
|
|
// SetNewMemtableAndNewLogFile() will release and reacquire mutex
|
|
|
|
// during execution
|
|
|
|
s = SetNewMemtableAndNewLogFile(cfd, &context);
|
|
|
|
cfd->imm()->FlushRequested();
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
|
|
|
|
write_thread_.ExitWriteThread(&w, &w, s);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok() && flush_options.wait) {
|
|
|
|
// Wait until the compaction completes
|
|
|
|
s = WaitForFlushMemTable(cfd);
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::WaitForFlushMemTable(ColumnFamilyData* cfd) {
|
|
|
|
Status s;
|
|
|
|
// Wait until the compaction completes
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
while (cfd->imm()->size() > 0 && bg_error_.ok()) {
|
|
|
|
bg_cv_.Wait();
|
|
|
|
}
|
|
|
|
if (!bg_error_.ok()) {
|
|
|
|
s = bg_error_;
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::MaybeScheduleFlushOrCompaction() {
|
|
|
|
mutex_.AssertHeld();
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
11 years ago
|
|
|
bg_schedule_needed_ = false;
|
|
|
|
if (bg_work_gate_closed_) {
|
|
|
|
// gate closed for backgrond work
|
|
|
|
} else if (shutting_down_.load(std::memory_order_acquire)) {
|
|
|
|
// DB is being deleted; no more background compactions
|
|
|
|
} else {
|
|
|
|
bool is_flush_pending = false;
|
|
|
|
// no need to refcount since we're under a mutex
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (cfd->imm()->IsFlushPending()) {
|
|
|
|
is_flush_pending = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (is_flush_pending) {
|
|
|
|
// memtable flush needed
|
|
|
|
if (bg_flush_scheduled_ < db_options_.max_background_flushes) {
|
|
|
|
bg_flush_scheduled_++;
|
|
|
|
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH);
|
|
|
|
} else if (db_options_.max_background_flushes > 0) {
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
11 years ago
|
|
|
bg_schedule_needed_ = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
bool is_compaction_needed = false;
|
|
|
|
// no need to refcount since we're under a mutex
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (cfd->current()->storage_info()->NeedsCompaction()) {
|
|
|
|
is_compaction_needed = true;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Schedule BGWorkCompaction if there's a compaction pending (or a memtable
|
|
|
|
// flush, but the HIGH pool is not enabled)
|
|
|
|
// Do it only if max_background_compactions hasn't been reached and
|
|
|
|
// bg_manual_only_ == 0
|
|
|
|
if (!bg_manual_only_ &&
|
|
|
|
(is_compaction_needed ||
|
|
|
|
(is_flush_pending && db_options_.max_background_flushes == 0))) {
|
|
|
|
if (bg_compaction_scheduled_ < db_options_.max_background_compactions) {
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
11 years ago
|
|
|
bg_compaction_scheduled_++;
|
|
|
|
env_->Schedule(&DBImpl::BGWorkCompaction, this, Env::Priority::LOW);
|
|
|
|
} else {
|
|
|
|
bg_schedule_needed_ = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::RecordFlushIOStats() {
|
|
|
|
RecordTick(stats_, FLUSH_WRITE_BYTES, IOSTATS(bytes_written));
|
|
|
|
IOSTATS_RESET(bytes_written);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::BGWorkFlush(void* db) {
|
|
|
|
IOSTATS_SET_THREAD_POOL_ID(Env::Priority::HIGH);
|
|
|
|
reinterpret_cast<DBImpl*>(db)->BackgroundCallFlush();
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::BGWorkCompaction(void* db) {
|
|
|
|
IOSTATS_SET_THREAD_POOL_ID(Env::Priority::LOW);
|
|
|
|
reinterpret_cast<DBImpl*>(db)->BackgroundCallCompaction();
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::BackgroundFlush(bool* madeProgress, JobContext* job_context,
|
|
|
|
LogBuffer* log_buffer) {
|
|
|
|
mutex_.AssertHeld();
|
Don't run background jobs (flush, compactions) when bg_error_ is set
Summary:
If bg_error_ is set, that means that we mark DB read only. However, current behavior still continues the flushes and compactions, even though bg_error_ is set.
On the other hand, if bg_error_ is set, we will return Status::OK() from CompactRange(), although the compaction didn't actually succeed.
This is clearly not desired behavior. I found this when I was debugging t5132159, although I'm pretty sure these aren't related.
Also, when we're shutting down, it's dangerous to exit RunManualCompaction(), since that will destruct ManualCompaction object. Background compaction job might still hold a reference to manual_compaction_ and this will lead to undefined behavior. I changed the behavior so that we only exit RunManualCompaction when manual compaction job is marked done.
Test Plan: make check
Reviewers: sdong, ljin, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23223
10 years ago
|
|
|
|
|
|
|
if (!bg_error_.ok()) {
|
|
|
|
return bg_error_;
|
|
|
|
}
|
|
|
|
|
|
|
|
// call_status is failure if at least one flush was a failure. even if
|
|
|
|
// flushing one column family reports a failure, we will continue flushing
|
|
|
|
// other column families. however, call_status will be a failure in that case.
|
|
|
|
Status call_status;
|
|
|
|
// refcounting in iteration
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
cfd->Ref();
|
|
|
|
Status flush_status;
|
|
|
|
const MutableCFOptions mutable_cf_options =
|
|
|
|
*cfd->GetLatestMutableCFOptions();
|
|
|
|
while (flush_status.ok() && cfd->imm()->IsFlushPending()) {
|
|
|
|
LogToBuffer(
|
|
|
|
log_buffer,
|
|
|
|
"BackgroundCallFlush doing FlushMemTableToOutputFile with column "
|
|
|
|
"family [%s], flush slots available %d",
|
|
|
|
cfd->GetName().c_str(),
|
|
|
|
db_options_.max_background_flushes - bg_flush_scheduled_);
|
|
|
|
flush_status = FlushMemTableToOutputFile(
|
|
|
|
cfd, mutable_cf_options, madeProgress, job_context, log_buffer);
|
|
|
|
}
|
|
|
|
if (call_status.ok() && !flush_status.ok()) {
|
|
|
|
call_status = flush_status;
|
|
|
|
}
|
|
|
|
cfd->Unref();
|
|
|
|
}
|
|
|
|
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
|
|
|
return call_status;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::BackgroundCallFlush() {
|
|
|
|
bool madeProgress = false;
|
|
|
|
JobContext job_context(true);
|
|
|
|
assert(bg_flush_scheduled_);
|
|
|
|
|
|
|
|
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
if (!shutting_down_.load(std::memory_order_acquire)) {
|
|
|
|
s = BackgroundFlush(&madeProgress, &job_context, &log_buffer);
|
|
|
|
if (!s.ok()) {
|
|
|
|
// Wait a little bit before retrying background compaction in
|
|
|
|
// case this is an environmental problem and we do not want to
|
|
|
|
// chew up resources for failed compactions for the duration of
|
|
|
|
// the problem.
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
uint64_t error_cnt =
|
|
|
|
default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
|
|
|
|
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
|
|
|
|
mutex_.Unlock();
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Waiting after background flush error: %s"
|
|
|
|
"Accumulated background error counts: %" PRIu64,
|
|
|
|
s.ToString().c_str(), error_cnt);
|
|
|
|
log_buffer.FlushBufferToLog();
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
env_->SleepForMicroseconds(1000000);
|
|
|
|
mutex_.Lock();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// If !s.ok(), this means that Flush failed. In that case, we want
|
|
|
|
// to delete all obsolete files and we force FindObsoleteFiles()
|
|
|
|
FindObsoleteFiles(&job_context, !s.ok());
|
|
|
|
// delete unnecessary files if any, this is done outside the mutex
|
|
|
|
if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
|
|
|
|
mutex_.Unlock();
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
11 years ago
|
|
|
// Have to flush the info logs before bg_flush_scheduled_--
|
|
|
|
// because if bg_flush_scheduled_ becomes 0 and the lock is
|
|
|
|
// released, the deconstructor of DB can kick in and destroy all the
|
|
|
|
// states of DB so info_log might not be available after that point.
|
|
|
|
// It also applies to access other states that DB owns.
|
|
|
|
log_buffer.FlushBufferToLog();
|
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
PurgeObsoleteFiles(job_context);
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
11 years ago
|
|
|
}
|
|
|
|
mutex_.Lock();
|
|
|
|
}
|
|
|
|
|
|
|
|
bg_flush_scheduled_--;
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
11 years ago
|
|
|
// Any time the mutex is released After finding the work to do, another
|
|
|
|
// thread might execute MaybeScheduleFlushOrCompaction(). It is possible
|
|
|
|
// that there is a pending job but it is not scheduled because of the
|
|
|
|
// max thread limit.
|
|
|
|
if (madeProgress || bg_schedule_needed_) {
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
RecordFlushIOStats();
|
|
|
|
bg_cv_.SignalAll();
|
|
|
|
// IMPORTANT: there should be no code after calling SignalAll. This call may
|
|
|
|
// signal the DB destructor that it's OK to proceed with destruction. In
|
|
|
|
// that case, all DB variables will be dealloacated and referencing them
|
|
|
|
// will cause trouble.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::BackgroundCallCompaction() {
|
|
|
|
bool madeProgress = false;
|
|
|
|
JobContext job_context(true);
|
|
|
|
|
|
|
|
MaybeDumpStats();
|
|
|
|
LogBuffer log_buffer(InfoLogLevel::INFO_LEVEL, db_options_.info_log.get());
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
assert(bg_compaction_scheduled_);
|
|
|
|
Status s;
|
|
|
|
if (!shutting_down_.load(std::memory_order_acquire)) {
|
|
|
|
s = BackgroundCompaction(&madeProgress, &job_context, &log_buffer);
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
if (!s.ok()) {
|
|
|
|
// Wait a little bit before retrying background compaction in
|
|
|
|
// case this is an environmental problem and we do not want to
|
|
|
|
// chew up resources for failed compactions for the duration of
|
|
|
|
// the problem.
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
uint64_t error_cnt =
|
|
|
|
default_cf_internal_stats_->BumpAndGetBackgroundErrorCount();
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
bg_cv_.SignalAll(); // In case a waiter can proceed despite the error
|
|
|
|
mutex_.Unlock();
|
|
|
|
log_buffer.FlushBufferToLog();
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Waiting after background compaction error: %s, "
|
|
|
|
"Accumulated background error counts: %" PRIu64,
|
|
|
|
s.ToString().c_str(), error_cnt);
|
|
|
|
LogFlush(db_options_.info_log);
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
env_->SleepForMicroseconds(1000000);
|
|
|
|
mutex_.Lock();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
// If !s.ok(), this means that Compaction failed. In that case, we want
|
|
|
|
// to delete all obsolete files we might have created and we force
|
|
|
|
// FindObsoleteFiles(). This is because job_context does not
|
|
|
|
// catch all created files if compaction failed.
|
|
|
|
FindObsoleteFiles(&job_context, !s.ok());
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
|
|
|
|
// delete unnecessary files if any, this is done outside the mutex
|
|
|
|
if (job_context.HaveSomethingToDelete() || !log_buffer.IsEmpty()) {
|
|
|
|
mutex_.Unlock();
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
11 years ago
|
|
|
// Have to flush the info logs before bg_compaction_scheduled_--
|
|
|
|
// because if bg_flush_scheduled_ becomes 0 and the lock is
|
|
|
|
// released, the deconstructor of DB can kick in and destroy all the
|
|
|
|
// states of DB so info_log might not be available after that point.
|
|
|
|
// It also applies to access other states that DB owns.
|
|
|
|
log_buffer.FlushBufferToLog();
|
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
PurgeObsoleteFiles(job_context);
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
11 years ago
|
|
|
}
|
|
|
|
mutex_.Lock();
|
|
|
|
}
|
|
|
|
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
bg_compaction_scheduled_--;
|
|
|
|
|
|
|
|
versions_->GetColumnFamilySet()->FreeDeadColumnFamilies();
|
|
|
|
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
// Previous compaction may have produced too many files in a level,
|
|
|
|
// So reschedule another compaction if we made progress in the
|
|
|
|
// last compaction.
|
Fix data race against logging data structure because of LogBuffer
Summary:
@igor pointed out that there is a potential data race because of the way we use the newly introduced LogBuffer. After "bg_compaction_scheduled_--" or "bg_flush_scheduled_--", they can both become 0. As soon as the lock is released after that, DBImpl's deconstructor can go ahead and deconstruct all the states inside DB, including the info_log object hold in a shared pointer of the options object it keeps. At that point it is not safe anymore to continue using the info logger to write the delayed logs.
With the patch, lock is released temporarily for log buffer to be flushed before "bg_compaction_scheduled_--" or "bg_flush_scheduled_--". In order to make sure we don't miss any pending flush or compaction, a new flag bg_schedule_needed_ is added, which is set to be true if there is a pending flush or compaction but not scheduled because of the max thread limit. If the flag is set to be true, the scheduling function will be called before compaction or flush thread finishes.
Thanks @igor for this finding!
Test Plan: make all check
Reviewers: haobo, igor
Reviewed By: haobo
CC: dhruba, ljin, yhchiang, igor, leveldb
Differential Revision: https://reviews.facebook.net/D16767
11 years ago
|
|
|
//
|
|
|
|
// Also, any time the mutex is released After finding the work to do,
|
|
|
|
// another thread might execute MaybeScheduleFlushOrCompaction(). It is
|
|
|
|
// possible that there is a pending job but it is not scheduled because of
|
|
|
|
// the max thread limit.
|
|
|
|
if (madeProgress || bg_schedule_needed_) {
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
if (madeProgress || bg_compaction_scheduled_ == 0 || bg_manual_only_ > 0) {
|
|
|
|
// signal if
|
|
|
|
// * madeProgress -- need to wakeup DelayWrite
|
|
|
|
// * bg_compaction_scheduled_ == 0 -- need to wakeup ~DBImpl
|
|
|
|
// * bg_manual_only_ > 0 -- need to wakeup RunManualCompaction
|
|
|
|
// If none of this is true, there is no need to signal since nobody is
|
|
|
|
// waiting for it
|
|
|
|
bg_cv_.SignalAll();
|
|
|
|
}
|
|
|
|
// IMPORTANT: there should be no code after calling SignalAll. This call may
|
|
|
|
// signal the DB destructor that it's OK to proceed with destruction. In
|
|
|
|
// that case, all DB variables will be dealloacated and referencing them
|
|
|
|
// will cause trouble.
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::BackgroundCompaction(bool* madeProgress, JobContext* job_context,
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
LogBuffer* log_buffer) {
|
|
|
|
*madeProgress = false;
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
|
|
|
|
bool is_manual = (manual_compaction_ != nullptr) &&
|
|
|
|
(manual_compaction_->in_progress == false);
|
|
|
|
|
Don't run background jobs (flush, compactions) when bg_error_ is set
Summary:
If bg_error_ is set, that means that we mark DB read only. However, current behavior still continues the flushes and compactions, even though bg_error_ is set.
On the other hand, if bg_error_ is set, we will return Status::OK() from CompactRange(), although the compaction didn't actually succeed.
This is clearly not desired behavior. I found this when I was debugging t5132159, although I'm pretty sure these aren't related.
Also, when we're shutting down, it's dangerous to exit RunManualCompaction(), since that will destruct ManualCompaction object. Background compaction job might still hold a reference to manual_compaction_ and this will lead to undefined behavior. I changed the behavior so that we only exit RunManualCompaction when manual compaction job is marked done.
Test Plan: make check
Reviewers: sdong, ljin, yhchiang
Reviewed By: yhchiang
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D23223
10 years ago
|
|
|
if (!bg_error_.ok()) {
|
|
|
|
if (is_manual) {
|
|
|
|
manual_compaction_->status = bg_error_;
|
|
|
|
manual_compaction_->done = true;
|
|
|
|
manual_compaction_->in_progress = false;
|
|
|
|
manual_compaction_ = nullptr;
|
|
|
|
}
|
|
|
|
return bg_error_;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_manual) {
|
|
|
|
// another thread cannot pick up the same work
|
|
|
|
manual_compaction_->in_progress = true;
|
|
|
|
} else if (manual_compaction_ != nullptr) {
|
|
|
|
// there should be no automatic compactions running when manual compaction
|
|
|
|
// is running
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// FLUSH preempts compaction
|
|
|
|
Status flush_stat;
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
const MutableCFOptions mutable_cf_options =
|
|
|
|
*cfd->GetLatestMutableCFOptions();
|
|
|
|
while (cfd->imm()->IsFlushPending()) {
|
|
|
|
LogToBuffer(
|
|
|
|
log_buffer,
|
|
|
|
"BackgroundCompaction doing FlushMemTableToOutputFile, "
|
|
|
|
"compaction slots available %d",
|
|
|
|
db_options_.max_background_compactions - bg_compaction_scheduled_);
|
|
|
|
cfd->Ref();
|
|
|
|
flush_stat = FlushMemTableToOutputFile(
|
|
|
|
cfd, mutable_cf_options, madeProgress, job_context, log_buffer);
|
|
|
|
cfd->Unref();
|
|
|
|
if (!flush_stat.ok()) {
|
|
|
|
if (is_manual) {
|
|
|
|
manual_compaction_->status = flush_stat;
|
|
|
|
manual_compaction_->done = true;
|
|
|
|
manual_compaction_->in_progress = false;
|
|
|
|
manual_compaction_ = nullptr;
|
|
|
|
}
|
|
|
|
return flush_stat;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Compaction makes a copy of the latest MutableCFOptions. It should be used
|
|
|
|
// throughout the compaction procedure to make sure consistency. It will
|
|
|
|
// eventually be installed into SuperVersion
|
|
|
|
unique_ptr<Compaction> c;
|
|
|
|
InternalKey manual_end_storage;
|
|
|
|
InternalKey* manual_end = &manual_end_storage;
|
|
|
|
if (is_manual) {
|
|
|
|
ManualCompaction* m = manual_compaction_;
|
|
|
|
assert(m->in_progress);
|
|
|
|
c.reset(m->cfd->CompactRange(
|
|
|
|
*m->cfd->GetLatestMutableCFOptions(), m->input_level, m->output_level,
|
|
|
|
m->output_path_id, m->begin, m->end, &manual_end));
|
|
|
|
if (!c) {
|
|
|
|
m->done = true;
|
|
|
|
}
|
|
|
|
LogToBuffer(log_buffer,
|
|
|
|
"[%s] Manual compaction from level-%d to level-%d from %s .. "
|
|
|
|
"%s; will stop at %s\n",
|
|
|
|
m->cfd->GetName().c_str(), m->input_level, m->output_level,
|
|
|
|
(m->begin ? m->begin->DebugString().c_str() : "(begin)"),
|
|
|
|
(m->end ? m->end->DebugString().c_str() : "(end)"),
|
|
|
|
((m->done || manual_end == nullptr)
|
|
|
|
? "(end)"
|
|
|
|
: manual_end->DebugString().c_str()));
|
|
|
|
} else {
|
|
|
|
// no need to refcount in iteration since it's always under a mutex
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
// Pick up latest mutable CF Options and use it throughout the
|
|
|
|
// compaction job
|
|
|
|
auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
|
|
|
|
if (!mutable_cf_options->disable_auto_compactions) {
|
|
|
|
// NOTE: try to avoid unnecessary copy of MutableCFOptions if
|
|
|
|
// compaction is not necessary. Need to make sure mutex is held
|
|
|
|
// until we make a copy in the following code
|
|
|
|
c.reset(cfd->PickCompaction(*mutable_cf_options, log_buffer));
|
|
|
|
if (c != nullptr) {
|
|
|
|
// update statistics
|
|
|
|
MeasureTime(stats_, NUM_FILES_IN_SINGLE_COMPACTION,
|
|
|
|
c->inputs(0)->size());
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status status;
|
|
|
|
if (!c) {
|
|
|
|
// Nothing to do
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
LogToBuffer(log_buffer, "Compaction nothing to do");
|
|
|
|
} else if (c->IsDeletionCompaction()) {
|
|
|
|
// TODO(icanadi) Do we want to honor snapshots here? i.e. not delete old
|
|
|
|
// file if there is alive snapshot pointing to it
|
|
|
|
assert(c->num_input_files(1) == 0);
|
|
|
|
assert(c->level() == 0);
|
|
|
|
assert(c->column_family_data()->ioptions()->compaction_style ==
|
|
|
|
kCompactionStyleFIFO);
|
|
|
|
for (const auto& f : *c->inputs(0)) {
|
|
|
|
c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
|
|
|
|
}
|
|
|
|
status = versions_->LogAndApply(
|
|
|
|
c->column_family_data(), *c->mutable_cf_options(), c->edit(),
|
|
|
|
&mutex_, db_directory_.get());
|
|
|
|
InstallSuperVersionBackground(c->column_family_data(), job_context,
|
|
|
|
*c->mutable_cf_options());
|
|
|
|
LogToBuffer(log_buffer, "[%s] Deleted %d files\n",
|
|
|
|
c->column_family_data()->GetName().c_str(),
|
|
|
|
c->num_input_files(0));
|
|
|
|
c->ReleaseCompactionFiles(status);
|
|
|
|
*madeProgress = true;
|
|
|
|
} else if (!is_manual && c->IsTrivialMove()) {
|
|
|
|
// Move file to next level
|
|
|
|
assert(c->num_input_files(0) == 1);
|
|
|
|
FileMetaData* f = c->input(0, 0);
|
|
|
|
c->edit()->DeleteFile(c->level(), f->fd.GetNumber());
|
|
|
|
c->edit()->AddFile(c->level() + 1, f->fd.GetNumber(), f->fd.GetPathId(),
|
|
|
|
f->fd.GetFileSize(), f->smallest, f->largest,
|
|
|
|
f->smallest_seqno, f->largest_seqno);
|
|
|
|
status = versions_->LogAndApply(c->column_family_data(),
|
|
|
|
*c->mutable_cf_options(),
|
|
|
|
c->edit(), &mutex_, db_directory_.get());
|
|
|
|
// Use latest MutableCFOptions
|
|
|
|
InstallSuperVersionBackground(c->column_family_data(), job_context,
|
|
|
|
*c->mutable_cf_options());
|
|
|
|
|
|
|
|
VersionStorageInfo::LevelSummaryStorage tmp;
|
|
|
|
LogToBuffer(log_buffer, "[%s] Moved #%" PRIu64 " to level-%d %" PRIu64
|
|
|
|
" bytes %s: %s\n",
|
|
|
|
c->column_family_data()->GetName().c_str(), f->fd.GetNumber(),
|
|
|
|
c->level() + 1, f->fd.GetFileSize(), status.ToString().c_str(),
|
|
|
|
c->input_version()->storage_info()->LevelSummary(&tmp));
|
|
|
|
c->ReleaseCompactionFiles(status);
|
|
|
|
*madeProgress = true;
|
|
|
|
} else {
|
|
|
|
MaybeScheduleFlushOrCompaction(); // do more compaction work in parallel.
|
|
|
|
|
|
|
|
auto yield_callback = [&]() {
|
|
|
|
return CallFlushDuringCompaction(c->column_family_data(),
|
|
|
|
*c->mutable_cf_options(), job_context,
|
|
|
|
log_buffer);
|
|
|
|
};
|
|
|
|
CompactionJob compaction_job(
|
|
|
|
c.get(), db_options_, *c->mutable_cf_options(), env_options_,
|
|
|
|
versions_.get(), &mutex_, &shutting_down_, &pending_outputs_,
|
|
|
|
log_buffer, db_directory_.get(), stats_, &snapshots_,
|
|
|
|
IsSnapshotSupported(), table_cache_, std::move(yield_callback));
|
|
|
|
compaction_job.Prepare();
|
|
|
|
mutex_.Unlock();
|
|
|
|
status = compaction_job.Run();
|
|
|
|
mutex_.Lock();
|
|
|
|
status = compaction_job.Install(status);
|
|
|
|
if (status.ok()) {
|
|
|
|
InstallSuperVersionBackground(c->column_family_data(), job_context,
|
|
|
|
*c->mutable_cf_options());
|
|
|
|
}
|
|
|
|
c->ReleaseCompactionFiles(status);
|
|
|
|
c->ReleaseInputs();
|
|
|
|
*madeProgress = true;
|
|
|
|
}
|
|
|
|
c.reset();
|
|
|
|
|
|
|
|
if (status.ok()) {
|
|
|
|
// Done
|
|
|
|
} else if (status.IsShutdownInProgress()) {
|
|
|
|
// Ignore compaction errors found during shutting down
|
|
|
|
} else {
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log, "Compaction error: %s",
|
Buffer info logs when picking compactions and write them out after releasing the mutex
Summary: Now while the background thread is picking compactions, it writes out multiple info_logs, especially for universal compaction, which introduces a chance of waiting log writing in mutex, which is bad. To remove this risk, write all those info logs to a buffer and flush it after releasing the mutex.
Test Plan:
make all check
check the log lines while running some tests that trigger compactions.
Reviewers: haobo, igor, dhruba
Reviewed By: dhruba
CC: i.am.jin.lei, dhruba, yhchiang, leveldb, nkg-
Differential Revision: https://reviews.facebook.net/D16515
11 years ago
|
|
|
status.ToString().c_str());
|
|
|
|
if (db_options_.paranoid_checks && bg_error_.ok()) {
|
|
|
|
bg_error_ = status;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (is_manual) {
|
|
|
|
ManualCompaction* m = manual_compaction_;
|
|
|
|
if (!status.ok()) {
|
|
|
|
m->status = status;
|
|
|
|
m->done = true;
|
|
|
|
}
|
|
|
|
// For universal compaction:
|
|
|
|
// Because universal compaction always happens at level 0, so one
|
|
|
|
// compaction will pick up all overlapped files. No files will be
|
|
|
|
// filtered out due to size limit and left for a successive compaction.
|
|
|
|
// So we can safely conclude the current compaction.
|
|
|
|
//
|
|
|
|
// Also note that, if we don't stop here, then the current compaction
|
|
|
|
// writes a new file back to level 0, which will be used in successive
|
|
|
|
// compaction. Hence the manual compaction will never finish.
|
|
|
|
//
|
|
|
|
// Stop the compaction if manual_end points to nullptr -- this means
|
|
|
|
// that we compacted the whole range. manual_end should always point
|
|
|
|
// to nullptr in case of universal compaction
|
|
|
|
if (manual_end == nullptr) {
|
|
|
|
m->done = true;
|
|
|
|
}
|
|
|
|
if (!m->done) {
|
|
|
|
// We only compacted part of the requested range. Update *m
|
|
|
|
// to the range that is left to be compacted.
|
|
|
|
// Universal and FIFO compactions should always compact the whole range
|
|
|
|
assert(m->cfd->ioptions()->compaction_style != kCompactionStyleUniversal);
|
|
|
|
assert(m->cfd->ioptions()->compaction_style != kCompactionStyleFIFO);
|
|
|
|
m->tmp_storage = *manual_end;
|
|
|
|
m->begin = &m->tmp_storage;
|
|
|
|
}
|
|
|
|
m->in_progress = false; // not being processed anymore
|
|
|
|
manual_compaction_ = nullptr;
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t DBImpl::CallFlushDuringCompaction(
|
|
|
|
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
|
|
|
|
JobContext* job_context, LogBuffer* log_buffer) {
|
|
|
|
if (db_options_.max_background_flushes > 0) {
|
|
|
|
// flush thread will take care of this
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
if (cfd->imm()->imm_flush_needed.load(std::memory_order_relaxed)) {
|
|
|
|
const uint64_t imm_start = env_->NowMicros();
|
|
|
|
mutex_.Lock();
|
|
|
|
if (cfd->imm()->IsFlushPending()) {
|
|
|
|
cfd->Ref();
|
|
|
|
FlushMemTableToOutputFile(cfd, mutable_cf_options, nullptr, job_context,
|
|
|
|
log_buffer);
|
|
|
|
cfd->Unref();
|
|
|
|
bg_cv_.SignalAll(); // Wakeup DelayWrite() if necessary
|
|
|
|
}
|
|
|
|
mutex_.Unlock();
|
|
|
|
log_buffer->FlushBufferToLog();
|
|
|
|
return env_->NowMicros() - imm_start;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
struct IterState {
|
|
|
|
IterState(DBImpl* _db, port::Mutex* _mu, SuperVersion* _super_version)
|
|
|
|
: db(_db), mu(_mu), super_version(_super_version) {}
|
|
|
|
|
|
|
|
DBImpl* db;
|
|
|
|
port::Mutex* mu;
|
|
|
|
SuperVersion* super_version;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void CleanupIteratorState(void* arg1, void* arg2) {
|
|
|
|
IterState* state = reinterpret_cast<IterState*>(arg1);
|
|
|
|
|
|
|
|
if (state->super_version->Unref()) {
|
|
|
|
JobContext job_context;
|
|
|
|
|
|
|
|
state->mu->Lock();
|
|
|
|
state->super_version->Cleanup();
|
|
|
|
state->db->FindObsoleteFiles(&job_context, false, true);
|
|
|
|
state->mu->Unlock();
|
|
|
|
|
|
|
|
delete state->super_version;
|
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
state->db->PurgeObsoleteFiles(job_context);
|
|
|
|
}
|
MemTableListVersion
Summary:
MemTableListVersion is to MemTableList what Version is to VersionSet. I took almost the same ideas to develop MemTableListVersion. The reason is to have copying std::list done in background, while flushing, rather than in foreground (MultiGet() and NewIterator()) under a mutex! Also, whenever we copied MemTableList, we copied also some MemTableList metadata (flush_requested_, commit_in_progress_, etc.), which was wasteful.
This diff avoids std::list copy under a mutex in both MultiGet() and NewIterator(). I created a small database with some number of immutable memtables, and creating 100.000 iterators in a single-thread (!) decreased from {188739, 215703, 198028} to {154352, 164035, 159817}. A lot of the savings come from code under a mutex, so we should see much higher savings with multiple threads. Creating new iterator is very important to LogDevice team.
I also think this diff will make SuperVersion obsolete for performance reasons. I will try it in the next diff. SuperVersion gave us huge savings on Get() code path, but I think that most of the savings came from copying MemTableList under a mutex. If we had MemTableListVersion, we would never need to copy the entire object (like we still do in NewIterator() and MultiGet())
Test Plan: `make check` works. I will also do `make valgrind_check` before commit
Reviewers: dhruba, haobo, kailiu, sdong, emayanke, tnovak
Reviewed By: kailiu
CC: leveldb
Differential Revision: https://reviews.facebook.net/D15255
11 years ago
|
|
|
}
|
|
|
|
|
|
|
|
delete state;
|
|
|
|
}
|
|
|
|
} // namespace
|
|
|
|
|
|
|
|
Iterator* DBImpl::NewInternalIterator(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyData* cfd,
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
11 years ago
|
|
|
SuperVersion* super_version,
|
|
|
|
Arena* arena) {
|
|
|
|
Iterator* internal_iter;
|
|
|
|
assert(arena != nullptr);
|
|
|
|
// Need to create internal iterator from the arena.
|
|
|
|
MergeIteratorBuilder merge_iter_builder(&cfd->internal_comparator(), arena);
|
|
|
|
// Collect iterator for mutable mem
|
|
|
|
merge_iter_builder.AddIterator(
|
|
|
|
super_version->mem->NewIterator(read_options, arena));
|
|
|
|
// Collect all needed child iterators for immutable memtables
|
|
|
|
super_version->imm->AddIterators(read_options, &merge_iter_builder);
|
|
|
|
// Collect iterators for files in L0 - Ln
|
|
|
|
super_version->current->AddIterators(read_options, env_options_,
|
|
|
|
&merge_iter_builder);
|
|
|
|
internal_iter = merge_iter_builder.Finish();
|
|
|
|
IterState* cleanup = new IterState(this, &mutex_, super_version);
|
|
|
|
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
|
|
|
|
|
|
|
|
return internal_iter;
|
|
|
|
}
|
|
|
|
|
|
|
|
ColumnFamilyHandle* DBImpl::DefaultColumnFamily() const {
|
|
|
|
return default_cf_handle_;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::Get(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
std::string* value) {
|
|
|
|
return GetImpl(read_options, column_family, key, value);
|
|
|
|
}
|
|
|
|
|
|
|
|
// JobContext gets created and destructed outside of the lock --
|
|
|
|
// we
|
|
|
|
// use this convinently to:
|
|
|
|
// * malloc one SuperVersion() outside of the lock -- new_superversion
|
|
|
|
// * delete SuperVersion()s outside of the lock -- superversions_to_free
|
|
|
|
//
|
|
|
|
// However, if InstallSuperVersion() gets called twice with the same
|
|
|
|
// job_context, we can't reuse the SuperVersion() that got
|
|
|
|
// malloced
|
|
|
|
// because
|
|
|
|
// first call already used it. In that rare case, we take a hit and create a
|
|
|
|
// new SuperVersion() inside of the mutex. We do similar thing
|
|
|
|
// for superversion_to_free
|
|
|
|
void DBImpl::InstallSuperVersionBackground(
|
|
|
|
ColumnFamilyData* cfd, JobContext* job_context,
|
|
|
|
const MutableCFOptions& mutable_cf_options) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
SuperVersion* old_superversion = InstallSuperVersion(
|
|
|
|
cfd, job_context->new_superversion, mutable_cf_options);
|
|
|
|
job_context->new_superversion = nullptr;
|
|
|
|
job_context->superversions_to_free.push_back(old_superversion);
|
|
|
|
}
|
|
|
|
|
|
|
|
SuperVersion* DBImpl::InstallSuperVersion(
|
|
|
|
ColumnFamilyData* cfd, SuperVersion* new_sv,
|
|
|
|
const MutableCFOptions& mutable_cf_options) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
auto* old = cfd->InstallSuperVersion(
|
|
|
|
new_sv ? new_sv : new SuperVersion(), &mutex_, mutable_cf_options);
|
|
|
|
|
|
|
|
// We want to schedule potential flush or compactions since new options may
|
|
|
|
// have been picked up in this new version. New options may cause flush
|
|
|
|
// compaction trigger condition to change.
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
|
|
|
|
// Update max_total_in_memory_state_
|
|
|
|
auto old_memtable_size = 0;
|
|
|
|
if (old) {
|
|
|
|
old_memtable_size = old->mutable_cf_options.write_buffer_size *
|
|
|
|
old->mutable_cf_options.max_write_buffer_number;
|
|
|
|
}
|
|
|
|
max_total_in_memory_state_ =
|
|
|
|
max_total_in_memory_state_ - old_memtable_size +
|
|
|
|
mutable_cf_options.write_buffer_size *
|
|
|
|
mutable_cf_options.max_write_buffer_number;
|
|
|
|
return old;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::GetImpl(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
std::string* value, bool* value_found) {
|
|
|
|
StopWatch sw(env_, stats_, DB_GET);
|
|
|
|
PERF_TIMER_GUARD(get_snapshot_time);
|
|
|
|
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
|
|
|
|
SequenceNumber snapshot;
|
|
|
|
if (read_options.snapshot != nullptr) {
|
|
|
|
snapshot = reinterpret_cast<const SnapshotImpl*>(
|
|
|
|
read_options.snapshot)->number_;
|
|
|
|
} else {
|
|
|
|
snapshot = versions_->LastSequence();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Acquire SuperVersion
|
|
|
|
SuperVersion* sv = GetAndRefSuperVersion(cfd);
|
|
|
|
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
12 years ago
|
|
|
// Prepare to store a list of merge operations if merge occurs.
|
|
|
|
MergeContext merge_context;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
12 years ago
|
|
|
|
|
|
|
Status s;
|
|
|
|
// First look in the memtable, then in the immutable memtable (if any).
|
|
|
|
// s is both in/out. When in, s could either be OK or MergeInProgress.
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
12 years ago
|
|
|
// merge_operands will contain the sequence of merges in the latter case.
|
|
|
|
LookupKey lkey(key, snapshot);
|
|
|
|
PERF_TIMER_STOP(get_snapshot_time);
|
|
|
|
|
|
|
|
if (sv->mem->Get(lkey, value, &s, &merge_context)) {
|
|
|
|
// Done
|
|
|
|
RecordTick(stats_, MEMTABLE_HIT);
|
|
|
|
} else if (sv->imm->Get(lkey, value, &s, &merge_context)) {
|
|
|
|
// Done
|
|
|
|
RecordTick(stats_, MEMTABLE_HIT);
|
|
|
|
} else {
|
|
|
|
PERF_TIMER_GUARD(get_from_output_files_time);
|
|
|
|
sv->current->Get(read_options, lkey, value, &s, &merge_context,
|
|
|
|
value_found);
|
|
|
|
RecordTick(stats_, MEMTABLE_MISS);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
PERF_TIMER_GUARD(get_post_process_time);
|
|
|
|
|
|
|
|
ReturnAndCleanupSuperVersion(cfd, sv);
|
|
|
|
|
|
|
|
RecordTick(stats_, NUMBER_KEYS_READ);
|
|
|
|
RecordTick(stats_, BYTES_READ, value->size());
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
std::vector<Status> DBImpl::MultiGet(
|
|
|
|
const ReadOptions& read_options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_family,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
const std::vector<Slice>& keys, std::vector<std::string>* values) {
|
|
|
|
|
|
|
|
StopWatch sw(env_, stats_, DB_MULTIGET);
|
|
|
|
PERF_TIMER_GUARD(get_snapshot_time);
|
|
|
|
|
|
|
|
SequenceNumber snapshot;
|
|
|
|
|
|
|
|
struct MultiGetColumnFamilyData {
|
|
|
|
ColumnFamilyData* cfd;
|
|
|
|
SuperVersion* super_version;
|
|
|
|
};
|
|
|
|
std::unordered_map<uint32_t, MultiGetColumnFamilyData*> multiget_cf_data;
|
|
|
|
// fill up and allocate outside of mutex
|
|
|
|
for (auto cf : column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(cf);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
if (multiget_cf_data.find(cfd->GetID()) == multiget_cf_data.end()) {
|
|
|
|
auto mgcfd = new MultiGetColumnFamilyData();
|
|
|
|
mgcfd->cfd = cfd;
|
|
|
|
multiget_cf_data.insert({cfd->GetID(), mgcfd});
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
mutex_.Lock();
|
|
|
|
if (read_options.snapshot != nullptr) {
|
|
|
|
snapshot = reinterpret_cast<const SnapshotImpl*>(
|
|
|
|
read_options.snapshot)->number_;
|
|
|
|
} else {
|
|
|
|
snapshot = versions_->LastSequence();
|
|
|
|
}
|
|
|
|
for (auto mgd_iter : multiget_cf_data) {
|
|
|
|
mgd_iter.second->super_version =
|
|
|
|
mgd_iter.second->cfd->GetSuperVersion()->Ref();
|
|
|
|
}
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
// Contain a list of merge operations if merge occurs.
|
|
|
|
MergeContext merge_context;
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
12 years ago
|
|
|
|
|
|
|
// Note: this always resizes the values array
|
|
|
|
size_t num_keys = keys.size();
|
|
|
|
std::vector<Status> stat_list(num_keys);
|
|
|
|
values->resize(num_keys);
|
|
|
|
|
|
|
|
// Keep track of bytes that we read for statistics-recording later
|
|
|
|
uint64_t bytes_read = 0;
|
|
|
|
PERF_TIMER_STOP(get_snapshot_time);
|
|
|
|
|
|
|
|
// For each of the given keys, apply the entire "get" process as follows:
|
|
|
|
// First look in the memtable, then in the immutable memtable (if any).
|
|
|
|
// s is both in/out. When in, s could either be OK or MergeInProgress.
|
[RocksDB] [MergeOperator] The new Merge Interface! Uses merge sequences.
Summary:
Here are the major changes to the Merge Interface. It has been expanded
to handle cases where the MergeOperator is not associative. It does so by stacking
up merge operations while scanning through the key history (i.e.: during Get() or
Compaction), until a valid Put/Delete/end-of-history is encountered; it then
applies all of the merge operations in the correct sequence starting with the
base/sentinel value.
I have also introduced an "AssociativeMerge" function which allows the user to
take advantage of associative merge operations (such as in the case of counters).
The implementation will always attempt to merge the operations/operands themselves
together when they are encountered, and will resort to the "stacking" method if
and only if the "associative-merge" fails.
This implementation is conjectured to allow MergeOperator to handle the general
case, while still providing the user with the ability to take advantage of certain
efficiencies in their own merge-operator / data-structure.
NOTE: This is a preliminary diff. This must still go through a lot of review,
revision, and testing. Feedback welcome!
Test Plan:
-This is a preliminary diff. I have only just begun testing/debugging it.
-I will be testing this with the existing MergeOperator use-cases and unit-tests
(counters, string-append, and redis-lists)
-I will be "desk-checking" and walking through the code with the help gdb.
-I will find a way of stress-testing the new interface / implementation using
db_bench, db_test, merge_test, and/or db_stress.
-I will ensure that my tests cover all cases: Get-Memtable,
Get-Immutable-Memtable, Get-from-Disk, Iterator-Range-Scan, Flush-Memtable-to-L0,
Compaction-L0-L1, Compaction-Ln-L(n+1), Put/Delete found, Put/Delete not-found,
end-of-history, end-of-file, etc.
-A lot of feedback from the reviewers.
Reviewers: haobo, dhruba, zshao, emayanke
Reviewed By: haobo
CC: leveldb
Differential Revision: https://reviews.facebook.net/D11499
12 years ago
|
|
|
// merge_operands will contain the sequence of merges in the latter case.
|
|
|
|
for (size_t i = 0; i < num_keys; ++i) {
|
|
|
|
merge_context.Clear();
|
|
|
|
Status& s = stat_list[i];
|
|
|
|
std::string* value = &(*values)[i];
|
|
|
|
|
|
|
|
LookupKey lkey(keys[i], snapshot);
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family[i]);
|
|
|
|
auto mgd_iter = multiget_cf_data.find(cfh->cfd()->GetID());
|
|
|
|
assert(mgd_iter != multiget_cf_data.end());
|
|
|
|
auto mgd = mgd_iter->second;
|
|
|
|
auto super_version = mgd->super_version;
|
|
|
|
if (super_version->mem->Get(lkey, value, &s, &merge_context)) {
|
|
|
|
// Done
|
|
|
|
} else if (super_version->imm->Get(lkey, value, &s, &merge_context)) {
|
|
|
|
// Done
|
|
|
|
} else {
|
|
|
|
PERF_TIMER_GUARD(get_from_output_files_time);
|
|
|
|
super_version->current->Get(read_options, lkey, value, &s,
|
|
|
|
&merge_context);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
bytes_read += value->size();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Post processing (decrement reference counts and record statistics)
|
|
|
|
PERF_TIMER_GUARD(get_post_process_time);
|
|
|
|
autovector<SuperVersion*> superversions_to_delete;
|
|
|
|
|
|
|
|
// TODO(icanadi) do we need lock here or just around Cleanup()?
|
|
|
|
mutex_.Lock();
|
|
|
|
for (auto mgd_iter : multiget_cf_data) {
|
|
|
|
auto mgd = mgd_iter.second;
|
|
|
|
if (mgd->super_version->Unref()) {
|
|
|
|
mgd->super_version->Cleanup();
|
|
|
|
superversions_to_delete.push_back(mgd->super_version);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
for (auto td : superversions_to_delete) {
|
|
|
|
delete td;
|
|
|
|
}
|
|
|
|
for (auto mgd : multiget_cf_data) {
|
|
|
|
delete mgd.second;
|
|
|
|
}
|
|
|
|
|
|
|
|
RecordTick(stats_, NUMBER_MULTIGET_CALLS);
|
|
|
|
RecordTick(stats_, NUMBER_MULTIGET_KEYS_READ, num_keys);
|
|
|
|
RecordTick(stats_, NUMBER_MULTIGET_BYTES_READ, bytes_read);
|
|
|
|
PERF_TIMER_STOP(get_post_process_time);
|
|
|
|
|
|
|
|
return stat_list;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
|
|
|
|
const std::string& column_family_name,
|
|
|
|
ColumnFamilyHandle** handle) {
|
|
|
|
*handle = nullptr;
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
|
|
|
|
if (versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name) !=
|
|
|
|
nullptr) {
|
|
|
|
return Status::InvalidArgument("Column family already exists");
|
|
|
|
}
|
|
|
|
VersionEdit edit;
|
|
|
|
edit.AddColumnFamily(column_family_name);
|
|
|
|
uint32_t new_id = versions_->GetColumnFamilySet()->GetNextColumnFamilyID();
|
|
|
|
edit.SetColumnFamily(new_id);
|
|
|
|
edit.SetLogNumber(logfile_number_);
|
|
|
|
edit.SetComparatorName(cf_options.comparator->Name());
|
|
|
|
|
|
|
|
// LogAndApply will both write the creation in MANIFEST and create
|
|
|
|
// ColumnFamilyData object
|
|
|
|
Options opt(db_options_, cf_options);
|
|
|
|
Status s = versions_->LogAndApply(nullptr,
|
|
|
|
MutableCFOptions(opt, ImmutableCFOptions(opt)),
|
|
|
|
&edit, &mutex_, db_directory_.get(), false, &cf_options);
|
|
|
|
if (s.ok()) {
|
|
|
|
single_column_family_mode_ = false;
|
|
|
|
auto cfd =
|
|
|
|
versions_->GetColumnFamilySet()->GetColumnFamily(column_family_name);
|
|
|
|
assert(cfd != nullptr);
|
|
|
|
delete InstallSuperVersion(cfd, nullptr, *cfd->GetLatestMutableCFOptions());
|
|
|
|
*handle = new ColumnFamilyHandleImpl(cfd, this, &mutex_);
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"Created column family [%s] (ID %u)",
|
|
|
|
column_family_name.c_str(), (unsigned)cfd->GetID());
|
|
|
|
} else {
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Creating column family [%s] FAILED -- %s",
|
|
|
|
column_family_name.c_str(), s.ToString().c_str());
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::DropColumnFamily(ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
if (cfd->GetID() == 0) {
|
|
|
|
return Status::InvalidArgument("Can't drop default column family");
|
|
|
|
}
|
|
|
|
|
|
|
|
VersionEdit edit;
|
|
|
|
edit.DropColumnFamily();
|
|
|
|
edit.SetColumnFamily(cfd->GetID());
|
|
|
|
|
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
if (cfd->IsDropped()) {
|
|
|
|
s = Status::InvalidArgument("Column family already dropped!\n");
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
// we drop column family from a single write thread
|
|
|
|
WriteThread::Writer w(&mutex_);
|
|
|
|
s = write_thread_.EnterWriteThread(&w, 0);
|
|
|
|
assert(s.ok() && !w.done); // No timeout and nobody should do our job
|
|
|
|
s = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
|
|
|
|
&edit, &mutex_);
|
|
|
|
write_thread_.ExitWriteThread(&w, &w, s);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
assert(cfd->IsDropped());
|
|
|
|
auto* mutable_cf_options = cfd->GetLatestMutableCFOptions();
|
|
|
|
max_total_in_memory_state_ -= mutable_cf_options->write_buffer_size *
|
|
|
|
mutable_cf_options->max_write_buffer_number;
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"Dropped column family with id %u\n",
|
|
|
|
cfd->GetID());
|
|
|
|
} else {
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"Dropping column family with id %u FAILED -- %s\n",
|
|
|
|
cfd->GetID(), s.ToString().c_str());
|
|
|
|
}
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool DBImpl::KeyMayExist(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key,
|
|
|
|
std::string* value, bool* value_found) {
|
|
|
|
if (value_found != nullptr) {
|
|
|
|
// falsify later if key-may-exist but can't fetch value
|
|
|
|
*value_found = true;
|
|
|
|
}
|
|
|
|
ReadOptions roptions = read_options;
|
|
|
|
roptions.read_tier = kBlockCacheTier; // read from block cache only
|
|
|
|
auto s = GetImpl(roptions, column_family, key, value, value_found);
|
|
|
|
|
|
|
|
// If block_cache is enabled and the index block of the table didn't
|
|
|
|
// not present in block_cache, the return value will be Status::Incomplete.
|
|
|
|
// In this case, key may still exist in the table.
|
|
|
|
return s.ok() || s.IsIncomplete();
|
|
|
|
}
|
|
|
|
|
|
|
|
Iterator* DBImpl::NewIterator(const ReadOptions& read_options,
|
|
|
|
ColumnFamilyHandle* column_family) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
|
|
|
|
if (read_options.tailing) {
|
|
|
|
#ifdef ROCKSDB_LITE
|
|
|
|
// not supported in lite version
|
|
|
|
return nullptr;
|
|
|
|
#else
|
|
|
|
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
|
|
|
auto iter = new ForwardIterator(this, read_options, cfd, sv);
|
|
|
|
return NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter,
|
|
|
|
kMaxSequenceNumber,
|
|
|
|
sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
|
|
|
read_options.iterate_upper_bound);
|
|
|
|
#endif
|
|
|
|
} else {
|
|
|
|
SequenceNumber latest_snapshot = versions_->LastSequence();
|
|
|
|
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
|
|
|
|
|
|
|
auto snapshot =
|
|
|
|
read_options.snapshot != nullptr
|
|
|
|
? reinterpret_cast<const SnapshotImpl*>(
|
|
|
|
read_options.snapshot)->number_
|
|
|
|
: latest_snapshot;
|
|
|
|
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
11 years ago
|
|
|
// Try to generate a DB iterator tree in continuous memory area to be
|
|
|
|
// cache friendly. Here is an example of result:
|
|
|
|
// +-------------------------------+
|
|
|
|
// | |
|
|
|
|
// | ArenaWrappedDBIter |
|
|
|
|
// | + |
|
|
|
|
// | +---> Inner Iterator ------------+
|
|
|
|
// | | | |
|
|
|
|
// | | +-- -- -- -- -- -- -- --+ |
|
|
|
|
// | +--- | Arena | |
|
|
|
|
// | | | |
|
|
|
|
// | Allocated Memory: | |
|
|
|
|
// | | +-------------------+ |
|
|
|
|
// | | | DBIter | <---+
|
|
|
|
// | | + |
|
|
|
|
// | | | +-> iter_ ------------+
|
|
|
|
// | | | | |
|
|
|
|
// | | +-------------------+ |
|
|
|
|
// | | | MergingIterator | <---+
|
|
|
|
// | | + |
|
|
|
|
// | | | +->child iter1 ------------+
|
|
|
|
// | | | | | |
|
|
|
|
// | | +->child iter2 ----------+ |
|
|
|
|
// | | | | | | |
|
|
|
|
// | | | +->child iter3 --------+ | |
|
|
|
|
// | | | | | |
|
|
|
|
// | | +-------------------+ | | |
|
|
|
|
// | | | Iterator1 | <--------+
|
|
|
|
// | | +-------------------+ | |
|
|
|
|
// | | | Iterator2 | <------+
|
|
|
|
// | | +-------------------+ |
|
|
|
|
// | | | Iterator3 | <----+
|
|
|
|
// | | +-------------------+
|
|
|
|
// | | |
|
|
|
|
// +-------+-----------------------+
|
|
|
|
//
|
|
|
|
// ArenaWrappedDBIter inlines an arena area where all the iterartor in the
|
|
|
|
// the iterator tree is allocated in the order of being accessed when
|
|
|
|
// querying.
|
|
|
|
// Laying out the iterators in the order of being accessed makes it more
|
|
|
|
// likely that any iterator pointer is close to the iterator it points to so
|
|
|
|
// that they are likely to be in the same cache line and/or page.
|
|
|
|
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
|
|
|
|
env_, *cfd->ioptions(), cfd->user_comparator(),
|
|
|
|
snapshot, sv->mutable_cf_options.max_sequential_skip_in_iterations,
|
|
|
|
read_options.iterate_upper_bound);
|
|
|
|
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
11 years ago
|
|
|
Iterator* internal_iter =
|
|
|
|
NewInternalIterator(read_options, cfd, sv, db_iter->GetArena());
|
In DB::NewIterator(), try to allocate the whole iterator tree in an arena
Summary:
In this patch, try to allocate the whole iterator tree starting from DBIter from an arena
1. ArenaWrappedDBIter is created when serves as the entry point of an iterator tree, with an arena in it.
2. Add an option to create iterator from arena for following iterators: DBIter, MergingIterator, MemtableIterator, all mem table's iterators, all table reader's iterators and two level iterator.
3. MergeIteratorBuilder is created to incrementally build the tree of internal iterators. It is passed to mem table list and version set and add iterators to it.
Limitations:
(1) Only DB::NewIterator() without tailing uses the arena. Other cases, including readonly DB and compactions are still from malloc
(2) Two level iterator itself is allocated in arena, but not iterators inside it.
Test Plan: make all check
Reviewers: ljin, haobo
Reviewed By: haobo
Subscribers: leveldb, dhruba, yhchiang, igor
Differential Revision: https://reviews.facebook.net/D18513
11 years ago
|
|
|
db_iter->SetIterUnderDBIter(internal_iter);
|
|
|
|
|
|
|
|
return db_iter;
|
|
|
|
}
|
|
|
|
// To stop compiler from complaining
|
|
|
|
return nullptr;
|
|
|
|
}
|
|
|
|
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
Status DBImpl::NewIterators(
|
|
|
|
const ReadOptions& read_options,
|
|
|
|
const std::vector<ColumnFamilyHandle*>& column_families,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
std::vector<Iterator*>* iterators) {
|
|
|
|
iterators->clear();
|
|
|
|
iterators->reserve(column_families.size());
|
|
|
|
|
|
|
|
if (read_options.tailing) {
|
|
|
|
#ifdef ROCKSDB_LITE
|
|
|
|
return Status::InvalidArgument(
|
|
|
|
"Tailing interator not supported in RocksDB lite");
|
|
|
|
#else
|
|
|
|
for (auto cfh : column_families) {
|
|
|
|
auto cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(cfh)->cfd();
|
|
|
|
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
|
|
|
auto iter = new ForwardIterator(this, read_options, cfd, sv);
|
|
|
|
iterators->push_back(
|
|
|
|
NewDBIterator(env_, *cfd->ioptions(), cfd->user_comparator(), iter,
|
|
|
|
kMaxSequenceNumber,
|
|
|
|
sv->mutable_cf_options.max_sequential_skip_in_iterations));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
} else {
|
|
|
|
SequenceNumber latest_snapshot = versions_->LastSequence();
|
|
|
|
|
|
|
|
for (size_t i = 0; i < column_families.size(); ++i) {
|
|
|
|
auto* cfd = reinterpret_cast<ColumnFamilyHandleImpl*>(
|
|
|
|
column_families[i])->cfd();
|
|
|
|
SuperVersion* sv = cfd->GetReferencedSuperVersion(&mutex_);
|
|
|
|
|
|
|
|
auto snapshot =
|
|
|
|
read_options.snapshot != nullptr
|
|
|
|
? reinterpret_cast<const SnapshotImpl*>(
|
|
|
|
read_options.snapshot)->number_
|
|
|
|
: latest_snapshot;
|
|
|
|
|
|
|
|
ArenaWrappedDBIter* db_iter = NewArenaWrappedDbIterator(
|
|
|
|
env_, *cfd->ioptions(), cfd->user_comparator(), snapshot,
|
|
|
|
sv->mutable_cf_options.max_sequential_skip_in_iterations);
|
|
|
|
Iterator* internal_iter = NewInternalIterator(
|
|
|
|
read_options, cfd, sv, db_iter->GetArena());
|
|
|
|
db_iter->SetIterUnderDBIter(internal_iter);
|
|
|
|
iterators->push_back(db_iter);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return Status::OK();
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
}
|
|
|
|
|
Add a new mem-table representation based on cuckoo hash.
Summary:
= Major Changes =
* Add a new mem-table representation, HashCuckooRep, which is based cuckoo hash.
Cuckoo hash uses multiple hash functions. This allows each key to have multiple
possible locations in the mem-table.
- Put: When insert a key, it will try to find whether one of its possible
locations is vacant and store the key. If none of its possible
locations are available, then it will kick out a victim key and
store at that location. The kicked-out victim key will then be
stored at a vacant space of its possible locations or kick-out
another victim. In this diff, the kick-out path (known as
cuckoo-path) is found using BFS, which guarantees to be the shortest.
- Get: Simply tries all possible locations of a key --- this guarantees
worst-case constant time complexity.
- Time complexity: O(1) for Get, and average O(1) for Put if the
fullness of the mem-table is below 80%.
- Default using two hash functions, the number of hash functions used
by the cuckoo-hash may dynamically increase if it fails to find a
short-enough kick-out path.
- Currently, HashCuckooRep does not support iteration and snapshots,
as our current main purpose of this is to optimize point access.
= Minor Changes =
* Add IsSnapshotSupported() to DB to indicate whether the current DB
supports snapshots. If it returns false, then DB::GetSnapshot() will
always return nullptr.
Test Plan:
Run existing tests. Will develop a test specifically for cuckoo hash in
the next diff.
Reviewers: sdong, haobo
Reviewed By: sdong
CC: leveldb, dhruba, igor
Differential Revision: https://reviews.facebook.net/D16155
11 years ago
|
|
|
bool DBImpl::IsSnapshotSupported() const {
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (!cfd->mem()->IsSnapshotSupported()) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
const Snapshot* DBImpl::GetSnapshot() {
|
|
|
|
MutexLock l(&mutex_);
|
Add a new mem-table representation based on cuckoo hash.
Summary:
= Major Changes =
* Add a new mem-table representation, HashCuckooRep, which is based cuckoo hash.
Cuckoo hash uses multiple hash functions. This allows each key to have multiple
possible locations in the mem-table.
- Put: When insert a key, it will try to find whether one of its possible
locations is vacant and store the key. If none of its possible
locations are available, then it will kick out a victim key and
store at that location. The kicked-out victim key will then be
stored at a vacant space of its possible locations or kick-out
another victim. In this diff, the kick-out path (known as
cuckoo-path) is found using BFS, which guarantees to be the shortest.
- Get: Simply tries all possible locations of a key --- this guarantees
worst-case constant time complexity.
- Time complexity: O(1) for Get, and average O(1) for Put if the
fullness of the mem-table is below 80%.
- Default using two hash functions, the number of hash functions used
by the cuckoo-hash may dynamically increase if it fails to find a
short-enough kick-out path.
- Currently, HashCuckooRep does not support iteration and snapshots,
as our current main purpose of this is to optimize point access.
= Minor Changes =
* Add IsSnapshotSupported() to DB to indicate whether the current DB
supports snapshots. If it returns false, then DB::GetSnapshot() will
always return nullptr.
Test Plan:
Run existing tests. Will develop a test specifically for cuckoo hash in
the next diff.
Reviewers: sdong, haobo
Reviewed By: sdong
CC: leveldb, dhruba, igor
Differential Revision: https://reviews.facebook.net/D16155
11 years ago
|
|
|
// returns null if the underlying memtable does not support snapshot.
|
|
|
|
if (!IsSnapshotSupported()) return nullptr;
|
|
|
|
return snapshots_.New(versions_->LastSequence());
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::ReleaseSnapshot(const Snapshot* s) {
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
snapshots_.Delete(reinterpret_cast<const SnapshotImpl*>(s));
|
|
|
|
}
|
|
|
|
|
|
|
|
// Convenience methods
|
|
|
|
Status DBImpl::Put(const WriteOptions& o, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& val) {
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
return DB::Put(o, column_family, key, val);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::Merge(const WriteOptions& o, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& val) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
if (!cfh->cfd()->ioptions()->merge_operator) {
|
|
|
|
return Status::NotSupported("Provide a merge_operator when opening DB");
|
|
|
|
} else {
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
return DB::Merge(o, column_family, key, val);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::Delete(const WriteOptions& write_options,
|
|
|
|
ColumnFamilyHandle* column_family, const Slice& key) {
|
|
|
|
return DB::Delete(write_options, column_family, key);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::Write(const WriteOptions& write_options, WriteBatch* my_batch) {
|
|
|
|
if (my_batch == nullptr) {
|
|
|
|
return Status::Corruption("Batch is nullptr!");
|
|
|
|
}
|
|
|
|
PERF_TIMER_GUARD(write_pre_and_post_process_time);
|
|
|
|
WriteThread::Writer w(&mutex_);
|
|
|
|
w.batch = my_batch;
|
|
|
|
w.sync = write_options.sync;
|
|
|
|
w.disableWAL = write_options.disableWAL;
|
|
|
|
w.in_batch_group = false;
|
|
|
|
w.done = false;
|
|
|
|
w.timeout_hint_us = write_options.timeout_hint_us;
|
|
|
|
|
|
|
|
uint64_t expiration_time = 0;
|
|
|
|
bool has_timeout = false;
|
|
|
|
if (w.timeout_hint_us == 0) {
|
|
|
|
w.timeout_hint_us = WriteThread::kNoTimeOut;
|
|
|
|
} else {
|
|
|
|
expiration_time = env_->NowMicros() + w.timeout_hint_us;
|
|
|
|
has_timeout = true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!write_options.disableWAL) {
|
|
|
|
RecordTick(stats_, WRITE_WITH_WAL);
|
|
|
|
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_WITH_WAL, 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
WriteContext context;
|
|
|
|
mutex_.Lock();
|
|
|
|
Status status = write_thread_.EnterWriteThread(&w, expiration_time);
|
|
|
|
assert(status.ok() || status.IsTimedOut());
|
|
|
|
if (status.IsTimedOut()) {
|
|
|
|
mutex_.Unlock();
|
|
|
|
RecordTick(stats_, WRITE_TIMEDOUT);
|
|
|
|
return Status::TimedOut();
|
|
|
|
}
|
|
|
|
if (w.done) { // write was done by someone else
|
|
|
|
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_OTHER,
|
|
|
|
1);
|
|
|
|
mutex_.Unlock();
|
|
|
|
RecordTick(stats_, WRITE_DONE_BY_OTHER);
|
|
|
|
return w.status;
|
|
|
|
}
|
|
|
|
|
|
|
|
RecordTick(stats_, WRITE_DONE_BY_SELF);
|
|
|
|
default_cf_internal_stats_->AddDBStats(InternalStats::WRITE_DONE_BY_SELF, 1);
|
|
|
|
|
|
|
|
// Once reaches this point, the current writer "w" will try to do its write
|
|
|
|
// job. It may also pick up some of the remaining writers in the "writers_"
|
|
|
|
// when it finds suitable, and finish them in the same write batch.
|
|
|
|
// This is how a write job could be done by the other writer.
|
|
|
|
assert(!single_column_family_mode_ ||
|
|
|
|
versions_->GetColumnFamilySet()->NumberOfColumnFamilies() == 1);
|
|
|
|
|
|
|
|
uint64_t max_total_wal_size = (db_options_.max_total_wal_size == 0)
|
|
|
|
? 4 * max_total_in_memory_state_
|
|
|
|
: db_options_.max_total_wal_size;
|
|
|
|
if (UNLIKELY(!single_column_family_mode_) &&
|
|
|
|
alive_log_files_.begin()->getting_flushed == false &&
|
|
|
|
total_log_size_ > max_total_wal_size) {
|
|
|
|
uint64_t flush_column_family_if_log_file = alive_log_files_.begin()->number;
|
|
|
|
alive_log_files_.begin()->getting_flushed = true;
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"Flushing all column families with data in WAL number %" PRIu64
|
|
|
|
". Total log size is %" PRIu64 " while max_total_wal_size is %" PRIu64,
|
|
|
|
flush_column_family_if_log_file, total_log_size_, max_total_wal_size);
|
|
|
|
// no need to refcount because drop is happening in write thread, so can't
|
|
|
|
// happen while we're in the write thread
|
|
|
|
for (auto cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
if (cfd->GetLogNumber() <= flush_column_family_if_log_file) {
|
|
|
|
status = SetNewMemtableAndNewLogFile(cfd, &context);
|
|
|
|
if (!status.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
cfd->imm()->FlushRequested();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(status.ok() && !bg_error_.ok())) {
|
|
|
|
status = bg_error_;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(status.ok() && !flush_scheduler_.Empty())) {
|
|
|
|
status = ScheduleFlushes(&context);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(status.ok()) &&
|
|
|
|
(write_controller_.IsStopped() || write_controller_.GetDelay() > 0)) {
|
|
|
|
status = DelayWrite(expiration_time);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (UNLIKELY(status.ok() && has_timeout &&
|
|
|
|
env_->NowMicros() > expiration_time)) {
|
|
|
|
status = Status::TimedOut();
|
|
|
|
}
|
|
|
|
|
|
|
|
uint64_t last_sequence = versions_->LastSequence();
|
|
|
|
WriteThread::Writer* last_writer = &w;
|
|
|
|
if (status.ok()) {
|
|
|
|
autovector<WriteBatch*> write_batch_group;
|
|
|
|
write_thread_.BuildBatchGroup(&last_writer, &write_batch_group);
|
|
|
|
|
|
|
|
// Add to log and apply to memtable. We can release the lock
|
|
|
|
// during this phase since &w is currently responsible for logging
|
|
|
|
// and protects against concurrent loggers and concurrent writes
|
|
|
|
// into memtables
|
|
|
|
{
|
|
|
|
mutex_.Unlock();
|
|
|
|
WriteBatch* updates = nullptr;
|
|
|
|
if (write_batch_group.size() == 1) {
|
|
|
|
updates = write_batch_group[0];
|
|
|
|
} else {
|
|
|
|
updates = &tmp_batch_;
|
|
|
|
for (size_t i = 0; i < write_batch_group.size(); ++i) {
|
|
|
|
WriteBatchInternal::Append(updates, write_batch_group[i]);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const SequenceNumber current_sequence = last_sequence + 1;
|
|
|
|
WriteBatchInternal::SetSequence(updates, current_sequence);
|
|
|
|
int my_batch_count = WriteBatchInternal::Count(updates);
|
|
|
|
last_sequence += my_batch_count;
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
const uint64_t batch_size = WriteBatchInternal::ByteSize(updates);
|
|
|
|
// Record statistics
|
|
|
|
RecordTick(stats_, NUMBER_KEYS_WRITTEN, my_batch_count);
|
|
|
|
RecordTick(stats_, BYTES_WRITTEN, batch_size);
|
|
|
|
if (write_options.disableWAL) {
|
|
|
|
flush_on_destroy_ = true;
|
|
|
|
}
|
|
|
|
PERF_TIMER_STOP(write_pre_and_post_process_time);
|
|
|
|
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
uint64_t log_size = 0;
|
|
|
|
if (!write_options.disableWAL) {
|
|
|
|
PERF_TIMER_GUARD(write_wal_time);
|
Add monitoring for universal compaction and add counters for compaction IO
Summary:
Adds these counters
{ WAL_FILE_SYNCED, "rocksdb.wal.synced" }
number of writes that request a WAL sync
{ WAL_FILE_BYTES, "rocksdb.wal.bytes" },
number of bytes written to the WAL
{ WRITE_DONE_BY_SELF, "rocksdb.write.self" },
number of writes processed by the calling thread
{ WRITE_DONE_BY_OTHER, "rocksdb.write.other" },
number of writes not processed by the calling thread. Instead these were
processed by the current holder of the write lock
{ WRITE_WITH_WAL, "rocksdb.write.wal" },
number of writes that request WAL logging
{ COMPACT_READ_BYTES, "rocksdb.compact.read.bytes" },
number of bytes read during compaction
{ COMPACT_WRITE_BYTES, "rocksdb.compact.write.bytes" },
number of bytes written during compaction
Per-interval stats output was updated with WAL stats and correct stats for universal compaction
including a correct value for write-amplification. It now looks like:
Compactions
Level Files Size(MB) Score Time(sec) Read(MB) Write(MB) Rn(MB) Rnp1(MB) Wnew(MB) RW-Amplify Read(MB/s) Write(MB/s) Rn Rnp1 Wnp1 NewW Count Ln-stall Stall-cnt
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
0 7 464 46.4 281 3411 3875 3411 0 3875 2.1 12.1 13.8 621 0 240 240 628 0.0 0
Uptime(secs): 310.8 total, 2.0 interval
Writes cumulative: 9999999 total, 9999999 batches, 1.0 per batch, 1.22 ingest GB
WAL cumulative: 9999999 WAL writes, 9999999 WAL syncs, 1.00 writes per sync, 1.22 GB written
Compaction IO cumulative (GB): 1.22 new, 3.33 read, 3.78 write, 7.12 read+write
Compaction IO cumulative (MB/sec): 4.0 new, 11.0 read, 12.5 write, 23.4 read+write
Amplification cumulative: 4.1 write, 6.8 compaction
Writes interval: 100000 total, 100000 batches, 1.0 per batch, 12.5 ingest MB
WAL interval: 100000 WAL writes, 100000 WAL syncs, 1.00 writes per sync, 0.01 MB written
Compaction IO interval (MB): 12.49 new, 14.98 read, 21.50 write, 36.48 read+write
Compaction IO interval (MB/sec): 6.4 new, 7.6 read, 11.0 write, 18.6 read+write
Amplification interval: 101.7 write, 102.9 compaction
Stalls(secs): 142.924 level0_slowdown, 0.000 level0_numfiles, 0.805 memtable_compaction, 0.000 leveln_slowdown
Stalls(count): 132461 level0_slowdown, 0 level0_numfiles, 3 memtable_compaction, 0 leveln_slowdown
Task ID: #3329644, #3301695
Blame Rev:
Test Plan:
Revert Plan:
Database Impact:
Memcache Impact:
Other Notes:
EImportant:
- begin *PUBLIC* platform impact section -
Bugzilla: #
- end platform impact -
Reviewers: dhruba
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14583
11 years ago
|
|
|
Slice log_entry = WriteBatchInternal::Contents(updates);
|
|
|
|
status = log_->AddRecord(log_entry);
|
|
|
|
total_log_size_ += log_entry.size();
|
|
|
|
alive_log_files_.back().AddSize(log_entry.size());
|
|
|
|
log_empty_ = false;
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
log_size = log_entry.size();
|
|
|
|
RecordTick(stats_, WAL_FILE_BYTES, log_size);
|
|
|
|
if (status.ok() && write_options.sync) {
|
|
|
|
RecordTick(stats_, WAL_FILE_SYNCED);
|
|
|
|
StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS);
|
|
|
|
if (db_options_.use_fsync) {
|
|
|
|
status = log_->file()->Fsync();
|
|
|
|
} else {
|
|
|
|
status = log_->file()->Sync();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (status.ok()) {
|
|
|
|
PERF_TIMER_GUARD(write_memtable_time);
|
|
|
|
|
|
|
|
status = WriteBatchInternal::InsertInto(
|
|
|
|
updates, column_family_memtables_.get(),
|
|
|
|
write_options.ignore_missing_column_families, 0, this, false);
|
|
|
|
// A non-OK status here indicates iteration failure (either in-memory
|
|
|
|
// writebatch corruption (very bad), or the client specified invalid
|
|
|
|
// column family). This will later on trigger bg_error_.
|
|
|
|
//
|
|
|
|
// Note that existing logic was not sound. Any partial failure writing
|
|
|
|
// into the memtable would result in a state that some write ops might
|
|
|
|
// have succeeded in memtable but Status reports error for all writes.
|
|
|
|
|
|
|
|
SetTickerCount(stats_, SEQUENCE_NUMBER, last_sequence);
|
|
|
|
}
|
|
|
|
PERF_TIMER_START(write_pre_and_post_process_time);
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
if (updates == &tmp_batch_) {
|
|
|
|
tmp_batch_.Clear();
|
|
|
|
}
|
|
|
|
mutex_.Lock();
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
// internal stats
|
|
|
|
default_cf_internal_stats_->AddDBStats(
|
|
|
|
InternalStats::BYTES_WRITTEN, batch_size);
|
|
|
|
default_cf_internal_stats_->AddDBStats(InternalStats::NUMBER_KEYS_WRITTEN,
|
|
|
|
my_batch_count);
|
|
|
|
if (!write_options.disableWAL) {
|
make internal stats independent of statistics
Summary:
also make it aware of column family
output from db_bench
```
** Compaction Stats [default] **
Level Files Size(MB) Score Read(GB) Rn(GB) Rnp1(GB) Write(GB) Wnew(GB) RW-Amp W-Amp Rd(MB/s) Wr(MB/s) Rn(cnt) Rnp1(cnt) Wnp1(cnt) Wnew(cnt) Comp(sec) Comp(cnt) Avg(sec) Stall(sec) Stall(cnt) Avg(ms)
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
L0 14 956 0.9 0.0 0.0 0.0 2.7 2.7 0.0 0.0 0.0 111.6 0 0 0 0 24 40 0.612 75.20 492387 0.15
L1 21 2001 2.0 5.7 2.0 3.7 5.3 1.6 5.4 2.6 71.2 65.7 31 43 55 12 82 2 41.242 43.72 41183 1.06
L2 217 18974 1.9 16.5 2.0 14.4 15.1 0.7 15.6 7.4 70.1 64.3 17 182 185 3 241 16 15.052 0.00 0 0.00
L3 1641 188245 1.8 9.1 1.1 8.0 8.5 0.5 15.4 7.4 61.3 57.2 9 75 76 1 152 9 16.887 0.00 0 0.00
L4 4447 449025 0.4 13.4 4.8 8.6 9.1 0.5 4.7 1.9 77.8 52.7 38 79 100 21 176 38 4.639 0.00 0 0.00
Sum 6340 659201 0.0 44.7 10.0 34.7 40.6 6.0 32.0 15.2 67.7 61.6 95 379 416 37 676 105 6.439 118.91 533570 0.22
Int 0 0 0.0 1.2 0.4 0.8 1.3 0.5 5.2 2.7 59.1 65.6 3 7 9 2 20 10 2.003 0.00 0 0.00
Stalls(secs): 75.197 level0_slowdown, 0.000 level0_numfiles, 0.000 memtable_compaction, 43.717 leveln_slowdown
Stalls(count): 492387 level0_slowdown, 0 level0_numfiles, 0 memtable_compaction, 41183 leveln_slowdown
** DB Stats **
Uptime(secs): 202.1 total, 13.5 interval
Cumulative writes: 6291456 writes, 6291456 batches, 1.0 writes per batch, 4.90 ingest GB
Cumulative WAL: 6291456 writes, 6291456 syncs, 1.00 writes per sync, 4.90 GB written
Interval writes: 1048576 writes, 1048576 batches, 1.0 writes per batch, 836.0 ingest MB
Interval WAL: 1048576 writes, 1048576 syncs, 1.00 writes per sync, 0.82 MB written
Test Plan: ran it
Reviewers: sdong, yhchiang, igor
Reviewed By: igor
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D19917
11 years ago
|
|
|
default_cf_internal_stats_->AddDBStats(
|
|
|
|
InternalStats::WAL_FILE_SYNCED, 1);
|
|
|
|
default_cf_internal_stats_->AddDBStats(
|
|
|
|
InternalStats::WAL_FILE_BYTES, log_size);
|
|
|
|
}
|
|
|
|
if (status.ok()) {
|
|
|
|
versions_->SetLastSequence(last_sequence);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (db_options_.paranoid_checks && !status.ok() &&
|
|
|
|
!status.IsTimedOut() && bg_error_.ok()) {
|
|
|
|
bg_error_ = status; // stop compaction & fail any further writes
|
|
|
|
}
|
|
|
|
|
|
|
|
write_thread_.ExitWriteThread(&w, last_writer, status);
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
if (status.IsTimedOut()) {
|
|
|
|
RecordTick(stats_, WRITE_TIMEDOUT);
|
|
|
|
}
|
|
|
|
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
// REQUIRES: mutex_ is held
|
|
|
|
// REQUIRES: this thread is currently at the front of the writer queue
|
|
|
|
Status DBImpl::DelayWrite(uint64_t expiration_time) {
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
StopWatch sw(env_, stats_, WRITE_STALL);
|
|
|
|
bool has_timeout = (expiration_time > 0);
|
|
|
|
auto delay = write_controller_.GetDelay();
|
|
|
|
if (write_controller_.IsStopped() == false && delay > 0) {
|
|
|
|
mutex_.Unlock();
|
|
|
|
env_->SleepForMicroseconds(delay);
|
|
|
|
mutex_.Lock();
|
|
|
|
}
|
|
|
|
|
|
|
|
while (bg_error_.ok() && write_controller_.IsStopped()) {
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
if (has_timeout) {
|
|
|
|
bg_cv_.TimedWait(expiration_time);
|
|
|
|
if (env_->NowMicros() > expiration_time) {
|
|
|
|
return Status::TimedOut();
|
Push- instead of pull-model for managing Write stalls
Summary:
Introducing WriteController, which is a source of truth about per-DB write delays. Let's define an DB epoch as a period where there are no flushes and compactions (i.e. new epoch is started when flush or compaction finishes). Each epoch can either:
* proceed with all writes without delay
* delay all writes by fixed time
* stop all writes
The three modes are recomputed at each epoch change (flush, compaction), rather than on every write (which is currently the case).
When we have a lot of column families, our current pull behavior adds a big overhead, since we need to loop over every column family for every write. With new push model, overhead on Write code-path is minimal.
This is just the start. Next step is to also take care of stalls introduced by slow memtable flushes. The final goal is to eliminate function MakeRoomForWrite(), which currently needs to be called for every column family by every write.
Test Plan: make check for now. I'll add some unit tests later. Also, perf test.
Reviewers: dhruba, yhchiang, MarkCallaghan, sdong, ljin
Reviewed By: ljin
Subscribers: leveldb
Differential Revision: https://reviews.facebook.net/D22791
10 years ago
|
|
|
}
|
|
|
|
} else {
|
|
|
|
bg_cv_.Wait();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return bg_error_;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::ScheduleFlushes(WriteContext* context) {
|
|
|
|
bool schedule_bg_work = false;
|
|
|
|
ColumnFamilyData* cfd;
|
|
|
|
while ((cfd = flush_scheduler_.GetNextColumnFamily()) != nullptr) {
|
|
|
|
schedule_bg_work = true;
|
|
|
|
auto status = SetNewMemtableAndNewLogFile(cfd, context);
|
|
|
|
if (cfd->Unref()) {
|
|
|
|
delete cfd;
|
|
|
|
}
|
|
|
|
if (!status.ok()) {
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (schedule_bg_work) {
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// REQUIRES: mutex_ is held
|
|
|
|
// REQUIRES: this thread is currently at the front of the writer queue
|
|
|
|
Status DBImpl::SetNewMemtableAndNewLogFile(ColumnFamilyData* cfd,
|
|
|
|
WriteContext* context) {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
unique_ptr<WritableFile> lfile;
|
|
|
|
log::Writer* new_log = nullptr;
|
|
|
|
MemTable* new_mem = nullptr;
|
|
|
|
|
|
|
|
// Attempt to switch to a new memtable and trigger flush of old.
|
|
|
|
// Do this without holding the dbmutex lock.
|
|
|
|
assert(versions_->prev_log_number() == 0);
|
|
|
|
bool creating_new_log = !log_empty_;
|
|
|
|
uint64_t new_log_number =
|
|
|
|
creating_new_log ? versions_->NewFileNumber() : logfile_number_;
|
|
|
|
SuperVersion* new_superversion = nullptr;
|
|
|
|
const MutableCFOptions mutable_cf_options = *cfd->GetLatestMutableCFOptions();
|
|
|
|
mutex_.Unlock();
|
|
|
|
Status s;
|
|
|
|
{
|
|
|
|
if (creating_new_log) {
|
|
|
|
s = env_->NewWritableFile(
|
|
|
|
LogFileName(db_options_.wal_dir, new_log_number),
|
|
|
|
&lfile, env_->OptimizeForLogWrite(env_options_));
|
|
|
|
if (s.ok()) {
|
|
|
|
// Our final size should be less than write_buffer_size
|
|
|
|
// (compression, etc) but err on the side of caution.
|
|
|
|
lfile->SetPreallocationBlockSize(
|
|
|
|
1.1 * mutable_cf_options.write_buffer_size);
|
|
|
|
new_log = new log::Writer(std::move(lfile));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
new_mem = new MemTable(cfd->internal_comparator(), *cfd->ioptions(),
|
|
|
|
mutable_cf_options);
|
|
|
|
new_superversion = new SuperVersion();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Log(InfoLogLevel::DEBUG_LEVEL, db_options_.info_log,
|
|
|
|
"[%s] New memtable created with log file: #%" PRIu64 "\n",
|
|
|
|
cfd->GetName().c_str(), new_log_number);
|
|
|
|
mutex_.Lock();
|
|
|
|
if (!s.ok()) {
|
|
|
|
// how do we fail if we're not creating new log?
|
|
|
|
assert(creating_new_log);
|
|
|
|
// Avoid chewing through file number space in a tight loop.
|
|
|
|
versions_->ReuseLogFileNumber(new_log_number);
|
|
|
|
assert(!new_mem);
|
|
|
|
assert(!new_log);
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
if (creating_new_log) {
|
|
|
|
logfile_number_ = new_log_number;
|
|
|
|
assert(new_log != nullptr);
|
|
|
|
context->logs_to_free_.push_back(log_.release());
|
|
|
|
log_.reset(new_log);
|
|
|
|
log_empty_ = true;
|
|
|
|
alive_log_files_.push_back(LogFileNumberSize(logfile_number_));
|
|
|
|
for (auto loop_cfd : *versions_->GetColumnFamilySet()) {
|
|
|
|
// all this is just optimization to delete logs that
|
|
|
|
// are no longer needed -- if CF is empty, that means it
|
|
|
|
// doesn't need that particular log to stay alive, so we just
|
|
|
|
// advance the log number. no need to persist this in the manifest
|
|
|
|
if (loop_cfd->mem()->GetFirstSequenceNumber() == 0 &&
|
|
|
|
loop_cfd->imm()->size() == 0) {
|
|
|
|
loop_cfd->SetLogNumber(logfile_number_);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
cfd->mem()->SetNextLogNumber(logfile_number_);
|
|
|
|
cfd->imm()->Add(cfd->mem());
|
|
|
|
new_mem->Ref();
|
|
|
|
cfd->SetMemtable(new_mem);
|
|
|
|
context->superversions_to_free_.push_back(
|
|
|
|
InstallSuperVersion(cfd, new_superversion, mutable_cf_options));
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
Status DBImpl::GetPropertiesOfAllTables(ColumnFamilyHandle* column_family,
|
|
|
|
TablePropertiesCollection* props) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
|
|
|
|
// Increment the ref count
|
|
|
|
mutex_.Lock();
|
|
|
|
auto version = cfd->current();
|
|
|
|
version->Ref();
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
auto s = version->GetPropertiesOfAllTables(props);
|
|
|
|
|
|
|
|
// Decrement the ref count
|
|
|
|
mutex_.Lock();
|
|
|
|
version->Unref();
|
|
|
|
mutex_.Unlock();
|
|
|
|
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
|
[RocksDB] BackupableDB
Summary:
In this diff I present you BackupableDB v1. You can easily use it to backup your DB and it will do incremental snapshots for you.
Let's first describe how you would use BackupableDB. It's inheriting StackableDB interface so you can easily construct it with your DB object -- it will add a method RollTheSnapshot() to the DB object. When you call RollTheSnapshot(), current snapshot of the DB will be stored in the backup dir. To restore, you can just call RestoreDBFromBackup() on a BackupableDB (which is a static method) and it will restore all files from the backup dir. In the next version, it will even support automatic backuping every X minutes.
There are multiple things you can configure:
1. backup_env and db_env can be different, which is awesome because then you can easily backup to HDFS or wherever you feel like.
2. sync - if true, it *guarantees* backup consistency on machine reboot
3. number of snapshots to keep - this will keep last N snapshots around if you want, for some reason, be able to restore from an earlier snapshot. All the backuping is done in incremental fashion - if we already have 00010.sst, we will not copy it again. *IMPORTANT* -- This is based on assumption that 00010.sst never changes - two files named 00010.sst from the same DB will always be exactly the same. Is this true? I always copy manifest, current and log files.
4. You can decide if you want to flush the memtables before you backup, or you're fine with backing up the log files -- either way, you get a complete and consistent view of the database at a time of backup.
5. More things you can find in BackupableDBOptions
Here is the directory structure I use:
backup_dir/CURRENT_SNAPSHOT - just 4 bytes holding the latest snapshot
0, 1, 2, ... - files containing serialized version of each snapshot - containing a list of files
files/*.sst - sst files shared between snapshots - if one snapshot references 00010.sst and another one needs to backup it from the DB, it will just reference the same file
files/ 0/, 1/, 2/, ... - snapshot directories containing private snapshot files - current, manifest and log files
All the files are ref counted and deleted immediatelly when they get out of scope.
Some other stuff in this diff:
1. Added GetEnv() method to the DB. Discussed with @haobo and we agreed that it seems right thing to do.
2. Fixed StackableDB interface. The way it was set up before, I was not able to implement BackupableDB.
Test Plan:
I have a unittest, but please don't look at this yet. I just hacked it up to help me with debugging. I will write a lot of good tests and update the diff.
Also, `make asan_check`
Reviewers: dhruba, haobo, emayanke
Reviewed By: dhruba
CC: leveldb, haobo
Differential Revision: https://reviews.facebook.net/D14295
11 years ago
|
|
|
const std::string& DBImpl::GetName() const {
|
|
|
|
return dbname_;
|
|
|
|
}
|
|
|
|
|
|
|
|
Env* DBImpl::GetEnv() const {
|
|
|
|
return env_;
|
|
|
|
}
|
|
|
|
|
|
|
|
const Options& DBImpl::GetOptions(ColumnFamilyHandle* column_family) const {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
return *cfh->cfd()->options();
|
|
|
|
}
|
|
|
|
|
|
|
|
bool DBImpl::GetProperty(ColumnFamilyHandle* column_family,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
const Slice& property, std::string* value) {
|
|
|
|
bool is_int_property = false;
|
|
|
|
bool need_out_of_mutex = false;
|
|
|
|
DBPropertyType property_type =
|
|
|
|
GetPropertyType(property, &is_int_property, &need_out_of_mutex);
|
|
|
|
|
|
|
|
value->clear();
|
|
|
|
if (is_int_property) {
|
|
|
|
uint64_t int_value;
|
|
|
|
bool ret_value = GetIntPropertyInternal(column_family, property_type,
|
|
|
|
need_out_of_mutex, &int_value);
|
|
|
|
if (ret_value) {
|
|
|
|
*value = std::to_string(int_value);
|
|
|
|
}
|
|
|
|
return ret_value;
|
|
|
|
} else {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
return cfd->internal_stats()->GetStringProperty(property_type, property,
|
|
|
|
value);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool DBImpl::GetIntProperty(ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& property, uint64_t* value) {
|
|
|
|
bool is_int_property = false;
|
|
|
|
bool need_out_of_mutex = false;
|
|
|
|
DBPropertyType property_type =
|
|
|
|
GetPropertyType(property, &is_int_property, &need_out_of_mutex);
|
|
|
|
if (!is_int_property) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
return GetIntPropertyInternal(column_family, property_type, need_out_of_mutex,
|
|
|
|
value);
|
|
|
|
}
|
|
|
|
|
|
|
|
bool DBImpl::GetIntPropertyInternal(ColumnFamilyHandle* column_family,
|
|
|
|
DBPropertyType property_type,
|
|
|
|
bool need_out_of_mutex, uint64_t* value) {
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
|
|
|
|
if (!need_out_of_mutex) {
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
return cfd->internal_stats()->GetIntProperty(property_type, value, this);
|
|
|
|
} else {
|
|
|
|
SuperVersion* sv = GetAndRefSuperVersion(cfd);
|
|
|
|
|
|
|
|
bool ret = cfd->internal_stats()->GetIntPropertyOutOfMutex(
|
|
|
|
property_type, sv->current, value);
|
|
|
|
|
|
|
|
ReturnAndCleanupSuperVersion(cfd, sv);
|
|
|
|
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
SuperVersion* DBImpl::GetAndRefSuperVersion(ColumnFamilyData* cfd) {
|
|
|
|
// TODO(ljin): consider using GetReferencedSuperVersion() directly
|
|
|
|
return cfd->GetThreadLocalSuperVersion(&mutex_);
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::ReturnAndCleanupSuperVersion(ColumnFamilyData* cfd,
|
|
|
|
SuperVersion* sv) {
|
|
|
|
bool unref_sv = !cfd->ReturnThreadLocalSuperVersion(sv);
|
|
|
|
|
|
|
|
if (unref_sv) {
|
|
|
|
// Release SuperVersion
|
|
|
|
if (sv->Unref()) {
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
sv->Cleanup();
|
|
|
|
}
|
|
|
|
delete sv;
|
|
|
|
RecordTick(stats_, NUMBER_SUPERVERSION_CLEANUPS);
|
|
|
|
}
|
|
|
|
RecordTick(stats_, NUMBER_SUPERVERSION_RELEASES);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::GetApproximateSizes(ColumnFamilyHandle* column_family,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
const Range* range, int n, uint64_t* sizes) {
|
|
|
|
// TODO(opt): better implementation
|
|
|
|
Version* v;
|
|
|
|
auto cfh = reinterpret_cast<ColumnFamilyHandleImpl*>(column_family);
|
|
|
|
auto cfd = cfh->cfd();
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
v = cfd->current();
|
|
|
|
v->Ref();
|
|
|
|
}
|
|
|
|
|
|
|
|
for (int i = 0; i < n; i++) {
|
|
|
|
// Convert user_key into a corresponding internal key.
|
|
|
|
InternalKey k1(range[i].start, kMaxSequenceNumber, kValueTypeForSeek);
|
|
|
|
InternalKey k2(range[i].limit, kMaxSequenceNumber, kValueTypeForSeek);
|
|
|
|
uint64_t start = versions_->ApproximateOffsetOf(v, k1);
|
|
|
|
uint64_t limit = versions_->ApproximateOffsetOf(v, k2);
|
|
|
|
sizes[i] = (limit >= start ? limit - start : 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
v->Unref();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifndef ROCKSDB_LITE
|
|
|
|
Status DBImpl::GetUpdatesSince(
|
|
|
|
SequenceNumber seq, unique_ptr<TransactionLogIterator>* iter,
|
|
|
|
const TransactionLogIterator::ReadOptions& read_options) {
|
|
|
|
|
|
|
|
RecordTick(stats_, GET_UPDATES_SINCE_CALLS);
|
|
|
|
if (seq > versions_->LastSequence()) {
|
|
|
|
return Status::NotFound("Requested sequence not yet written in the db");
|
|
|
|
}
|
|
|
|
return wal_manager_.GetUpdatesSince(seq, iter, read_options, versions_.get());
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::DeleteFile(std::string name) {
|
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
|
|
|
WalFileType log_type;
|
|
|
|
if (!ParseFileName(name, &number, &type, &log_type) ||
|
|
|
|
(type != kTableFile && type != kLogFile)) {
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s failed.\n", name.c_str());
|
|
|
|
return Status::InvalidArgument("Invalid file name");
|
|
|
|
}
|
|
|
|
|
|
|
|
Status status;
|
|
|
|
if (type == kLogFile) {
|
|
|
|
// Only allow deleting archived log files
|
|
|
|
if (log_type != kArchivedLogFile) {
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s failed - not archived log.\n",
|
|
|
|
name.c_str());
|
|
|
|
return Status::NotSupported("Delete only supported for archived logs");
|
|
|
|
}
|
|
|
|
status = env_->DeleteFile(db_options_.wal_dir + "/" + name.c_str());
|
|
|
|
if (!status.ok()) {
|
|
|
|
Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s failed -- %s.\n",
|
|
|
|
name.c_str(), status.ToString().c_str());
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
int level;
|
|
|
|
FileMetaData* metadata;
|
|
|
|
ColumnFamilyData* cfd;
|
|
|
|
VersionEdit edit;
|
|
|
|
JobContext job_context(true);
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
status = versions_->GetMetadataForFile(number, &level, &metadata, &cfd);
|
|
|
|
if (!status.ok()) {
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s failed. File not found\n", name.c_str());
|
|
|
|
return Status::InvalidArgument("File not found");
|
|
|
|
}
|
|
|
|
assert(level < cfd->NumberLevels());
|
|
|
|
|
|
|
|
// If the file is being compacted no need to delete.
|
|
|
|
if (metadata->being_compacted) {
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s Skipped. File about to be compacted\n", name.c_str());
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Only the files in the last level can be deleted externally.
|
|
|
|
// This is to make sure that any deletion tombstones are not
|
|
|
|
// lost. Check that the level passed is the last level.
|
|
|
|
auto* vstoreage = cfd->current()->storage_info();
|
|
|
|
for (int i = level + 1; i < cfd->NumberLevels(); i++) {
|
|
|
|
if (vstoreage->NumLevelFiles(i) != 0) {
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s FAILED. File not in last level\n", name.c_str());
|
|
|
|
return Status::InvalidArgument("File not in last level");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// if level == 0, it has to be the oldest file
|
|
|
|
if (level == 0 &&
|
|
|
|
vstoreage->LevelFiles(0).back()->fd.GetNumber() != number) {
|
|
|
|
Log(InfoLogLevel::WARN_LEVEL, db_options_.info_log,
|
|
|
|
"DeleteFile %s failed ---"
|
|
|
|
" target file in level 0 must be the oldest.", name.c_str());
|
|
|
|
return Status::InvalidArgument("File in level 0, but not oldest");
|
|
|
|
}
|
|
|
|
edit.SetColumnFamily(cfd->GetID());
|
|
|
|
edit.DeleteFile(level, number);
|
|
|
|
status = versions_->LogAndApply(cfd, *cfd->GetLatestMutableCFOptions(),
|
|
|
|
&edit, &mutex_, db_directory_.get());
|
|
|
|
if (status.ok()) {
|
|
|
|
InstallSuperVersionBackground(cfd, &job_context,
|
|
|
|
*cfd->GetLatestMutableCFOptions());
|
|
|
|
}
|
|
|
|
FindObsoleteFiles(&job_context, false);
|
|
|
|
} // lock released here
|
|
|
|
LogFlush(db_options_.info_log);
|
|
|
|
// remove files outside the db-lock
|
|
|
|
if (job_context.HaveSomethingToDelete()) {
|
|
|
|
PurgeObsoleteFiles(job_context);
|
|
|
|
}
|
|
|
|
{
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
// schedule flush if file deletion means we freed the space for flushes to
|
|
|
|
// continue
|
|
|
|
MaybeScheduleFlushOrCompaction();
|
|
|
|
}
|
|
|
|
return status;
|
|
|
|
}
|
|
|
|
|
|
|
|
void DBImpl::GetLiveFilesMetaData(std::vector<LiveFileMetaData>* metadata) {
|
|
|
|
MutexLock l(&mutex_);
|
|
|
|
versions_->GetLiveFilesMetaData(metadata);
|
|
|
|
}
|
|
|
|
#endif // ROCKSDB_LITE
|
|
|
|
|
|
|
|
Status DBImpl::CheckConsistency() {
|
|
|
|
mutex_.AssertHeld();
|
|
|
|
std::vector<LiveFileMetaData> metadata;
|
|
|
|
versions_->GetLiveFilesMetaData(&metadata);
|
|
|
|
|
|
|
|
std::string corruption_messages;
|
|
|
|
for (const auto& md : metadata) {
|
|
|
|
std::string file_path = md.db_path + "/" + md.name;
|
|
|
|
|
|
|
|
uint64_t fsize = 0;
|
|
|
|
Status s = env_->GetFileSize(file_path, &fsize);
|
|
|
|
if (!s.ok()) {
|
|
|
|
corruption_messages +=
|
|
|
|
"Can't access " + md.name + ": " + s.ToString() + "\n";
|
|
|
|
} else if (fsize != md.size) {
|
|
|
|
corruption_messages += "Sst file size mismatch: " + file_path +
|
|
|
|
". Size recorded in manifest " +
|
|
|
|
std::to_string(md.size) + ", actual size " +
|
|
|
|
std::to_string(fsize) + "\n";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (corruption_messages.size() == 0) {
|
|
|
|
return Status::OK();
|
|
|
|
} else {
|
|
|
|
return Status::Corruption(corruption_messages);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DBImpl::GetDbIdentity(std::string& identity) {
|
|
|
|
std::string idfilename = IdentityFileName(dbname_);
|
|
|
|
unique_ptr<SequentialFile> idfile;
|
|
|
|
const EnvOptions soptions;
|
|
|
|
Status s = env_->NewSequentialFile(idfilename, &idfile, soptions);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
uint64_t file_size;
|
|
|
|
s = env_->GetFileSize(idfilename, &file_size);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
char buffer[file_size];
|
|
|
|
Slice id;
|
|
|
|
s = idfile->Read(file_size, &id, buffer);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
identity.assign(id.ToString());
|
|
|
|
// If last character is '\n' remove it from identity
|
|
|
|
if (identity.size() > 0 && identity.back() == '\n') {
|
|
|
|
identity.pop_back();
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Default implementations of convenience methods that subclasses of DB
|
|
|
|
// can call if they wish
|
|
|
|
Status DB::Put(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
const Slice& key, const Slice& value) {
|
|
|
|
// Pre-allocate size of write batch conservatively.
|
|
|
|
// 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
|
|
|
|
// and we allocate 11 extra bytes for key length, as well as value length.
|
|
|
|
WriteBatch batch(key.size() + value.size() + 24);
|
|
|
|
batch.Put(column_family, key, value);
|
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::Delete(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key) {
|
|
|
|
WriteBatch batch;
|
|
|
|
batch.Delete(column_family, key);
|
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::Merge(const WriteOptions& opt, ColumnFamilyHandle* column_family,
|
|
|
|
const Slice& key, const Slice& value) {
|
|
|
|
WriteBatch batch;
|
|
|
|
batch.Merge(column_family, key, value);
|
|
|
|
return Write(opt, &batch);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Default implementation -- returns not supported status
|
|
|
|
Status DB::CreateColumnFamily(const ColumnFamilyOptions& cf_options,
|
|
|
|
const std::string& column_family_name,
|
|
|
|
ColumnFamilyHandle** handle) {
|
|
|
|
return Status::NotSupported("");
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
}
|
|
|
|
Status DB::DropColumnFamily(ColumnFamilyHandle* column_family) {
|
|
|
|
return Status::NotSupported("");
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
}
|
|
|
|
|
|
|
|
DB::~DB() { }
|
|
|
|
|
|
|
|
Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
|
|
|
|
DBOptions db_options(options);
|
|
|
|
ColumnFamilyOptions cf_options(options);
|
|
|
|
std::vector<ColumnFamilyDescriptor> column_families;
|
|
|
|
column_families.push_back(
|
|
|
|
ColumnFamilyDescriptor(kDefaultColumnFamilyName, cf_options));
|
|
|
|
std::vector<ColumnFamilyHandle*> handles;
|
|
|
|
Status s = DB::Open(db_options, dbname, column_families, &handles, dbptr);
|
|
|
|
if (s.ok()) {
|
|
|
|
assert(handles.size() == 1);
|
|
|
|
// i can delete the handle since DBImpl is always holding a reference to
|
|
|
|
// default column family
|
|
|
|
delete handles[0];
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::Open(const DBOptions& db_options, const std::string& dbname,
|
|
|
|
const std::vector<ColumnFamilyDescriptor>& column_families,
|
|
|
|
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr) {
|
|
|
|
Status s = SanitizeOptionsByTable(db_options, column_families);
|
|
|
|
if (!s.ok()) {
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
if (db_options.db_paths.size() > 1) {
|
|
|
|
for (auto& cfd : column_families) {
|
|
|
|
if (cfd.options.compaction_style != kCompactionStyleUniversal) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"More than one DB paths are only supported in "
|
|
|
|
"universal compaction style. ");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (db_options.db_paths.size() > 4) {
|
|
|
|
return Status::NotSupported(
|
|
|
|
"More than four DB paths are not supported yet. ");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
*dbptr = nullptr;
|
|
|
|
handles->clear();
|
|
|
|
|
|
|
|
size_t max_write_buffer_size = 0;
|
|
|
|
for (auto cf : column_families) {
|
|
|
|
max_write_buffer_size =
|
|
|
|
std::max(max_write_buffer_size, cf.options.write_buffer_size);
|
|
|
|
}
|
|
|
|
|
|
|
|
DBImpl* impl = new DBImpl(db_options, dbname);
|
|
|
|
s = impl->env_->CreateDirIfMissing(impl->db_options_.wal_dir);
|
|
|
|
if (s.ok()) {
|
|
|
|
for (auto db_path : impl->db_options_.db_paths) {
|
|
|
|
s = impl->env_->CreateDirIfMissing(db_path.path);
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!s.ok()) {
|
|
|
|
delete impl;
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
s = impl->CreateArchivalDirectory();
|
|
|
|
if (!s.ok()) {
|
|
|
|
delete impl;
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
impl->mutex_.Lock();
|
|
|
|
// Handles create_if_missing, error_if_exists
|
|
|
|
s = impl->Recover(column_families);
|
|
|
|
if (s.ok()) {
|
|
|
|
uint64_t new_log_number = impl->versions_->NewFileNumber();
|
|
|
|
unique_ptr<WritableFile> lfile;
|
|
|
|
EnvOptions soptions(db_options);
|
|
|
|
s = impl->db_options_.env->NewWritableFile(
|
|
|
|
LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile,
|
|
|
|
impl->db_options_.env->OptimizeForLogWrite(soptions));
|
|
|
|
if (s.ok()) {
|
|
|
|
lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size);
|
|
|
|
impl->logfile_number_ = new_log_number;
|
|
|
|
impl->log_.reset(new log::Writer(std::move(lfile)));
|
|
|
|
|
|
|
|
// set column family handles
|
|
|
|
for (auto cf : column_families) {
|
|
|
|
auto cfd =
|
|
|
|
impl->versions_->GetColumnFamilySet()->GetColumnFamily(cf.name);
|
|
|
|
if (cfd != nullptr) {
|
|
|
|
handles->push_back(
|
|
|
|
new ColumnFamilyHandleImpl(cfd, impl, &impl->mutex_));
|
|
|
|
} else {
|
|
|
|
if (db_options.create_missing_column_families) {
|
|
|
|
// missing column family, create it
|
|
|
|
ColumnFamilyHandle* handle;
|
|
|
|
impl->mutex_.Unlock();
|
|
|
|
s = impl->CreateColumnFamily(cf.options, cf.name, &handle);
|
|
|
|
impl->mutex_.Lock();
|
|
|
|
if (s.ok()) {
|
|
|
|
handles->push_back(handle);
|
|
|
|
} else {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
s = Status::InvalidArgument("Column family not found: ", cf.name);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (s.ok()) {
|
|
|
|
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
|
|
|
|
delete impl->InstallSuperVersion(
|
|
|
|
cfd, nullptr, *cfd->GetLatestMutableCFOptions());
|
|
|
|
}
|
|
|
|
impl->alive_log_files_.push_back(
|
|
|
|
DBImpl::LogFileNumberSize(impl->logfile_number_));
|
|
|
|
impl->DeleteObsoleteFiles();
|
|
|
|
impl->MaybeScheduleFlushOrCompaction();
|
|
|
|
s = impl->db_directory_->Fsync();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
for (auto cfd : *impl->versions_->GetColumnFamilySet()) {
|
|
|
|
if (cfd->ioptions()->compaction_style == kCompactionStyleUniversal ||
|
|
|
|
cfd->ioptions()->compaction_style == kCompactionStyleFIFO) {
|
|
|
|
auto* vstorage = cfd->current()->storage_info();
|
|
|
|
for (int i = 1; i < vstorage->num_levels(); ++i) {
|
|
|
|
int num_files = vstorage->NumLevelFiles(i);
|
|
|
|
if (num_files > 0) {
|
|
|
|
s = Status::InvalidArgument(
|
|
|
|
"Not all files are at level 0. Cannot "
|
|
|
|
"open with universal or FIFO compaction style.");
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (cfd->ioptions()->merge_operator != nullptr &&
|
Add a new mem-table representation based on cuckoo hash.
Summary:
= Major Changes =
* Add a new mem-table representation, HashCuckooRep, which is based cuckoo hash.
Cuckoo hash uses multiple hash functions. This allows each key to have multiple
possible locations in the mem-table.
- Put: When insert a key, it will try to find whether one of its possible
locations is vacant and store the key. If none of its possible
locations are available, then it will kick out a victim key and
store at that location. The kicked-out victim key will then be
stored at a vacant space of its possible locations or kick-out
another victim. In this diff, the kick-out path (known as
cuckoo-path) is found using BFS, which guarantees to be the shortest.
- Get: Simply tries all possible locations of a key --- this guarantees
worst-case constant time complexity.
- Time complexity: O(1) for Get, and average O(1) for Put if the
fullness of the mem-table is below 80%.
- Default using two hash functions, the number of hash functions used
by the cuckoo-hash may dynamically increase if it fails to find a
short-enough kick-out path.
- Currently, HashCuckooRep does not support iteration and snapshots,
as our current main purpose of this is to optimize point access.
= Minor Changes =
* Add IsSnapshotSupported() to DB to indicate whether the current DB
supports snapshots. If it returns false, then DB::GetSnapshot() will
always return nullptr.
Test Plan:
Run existing tests. Will develop a test specifically for cuckoo hash in
the next diff.
Reviewers: sdong, haobo
Reviewed By: sdong
CC: leveldb, dhruba, igor
Differential Revision: https://reviews.facebook.net/D16155
11 years ago
|
|
|
!cfd->mem()->IsMergeOperatorSupported()) {
|
|
|
|
s = Status::InvalidArgument(
|
|
|
|
"The memtable of column family %s does not support merge operator "
|
|
|
|
"its options.merge_operator is non-null", cfd->GetName().c_str());
|
|
|
|
}
|
|
|
|
if (!s.ok()) {
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl->mutex_.Unlock();
|
|
|
|
|
|
|
|
if (s.ok()) {
|
|
|
|
impl->opened_successfully_ = true;
|
|
|
|
*dbptr = impl;
|
|
|
|
} else {
|
|
|
|
for (auto h : *handles) {
|
|
|
|
delete h;
|
|
|
|
}
|
|
|
|
handles->clear();
|
|
|
|
delete impl;
|
|
|
|
}
|
|
|
|
return s;
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DB::ListColumnFamilies(const DBOptions& db_options,
|
|
|
|
const std::string& name,
|
|
|
|
std::vector<std::string>* column_families) {
|
|
|
|
return VersionSet::ListColumnFamilies(column_families, name, db_options.env);
|
[RocksDB] [Column Family] Interface proposal
Summary:
<This diff is for Column Family branch>
Sharing some of the work I've done so far. This diff compiles and passes the tests.
The biggest change is in options.h - I broke down Options into two parts - DBOptions and ColumnFamilyOptions. DBOptions is DB-specific (env, create_if_missing, block_cache, etc.) and ColumnFamilyOptions is column family-specific (all compaction options, compresion options, etc.). Note that this does not break backwards compatibility at all.
Further, I created DBWithColumnFamily which inherits DB interface and adds new functions with column family support. Clients can transparently switch to DBWithColumnFamily and it will not break their backwards compatibility.
There are few methods worth checking out: ListColumnFamilies(), MultiNewIterator(), MultiGet() and GetSnapshot(). [GetSnapshot() returns the snapshot across all column families for now - I think that's what we agreed on]
Finally, I made small changes to WriteBatch so we are able to atomically insert data across column families.
Please provide feedback.
Test Plan: make check works, the code is backward compatible
Reviewers: dhruba, haobo, sdong, kailiu, emayanke
CC: leveldb
Differential Revision: https://reviews.facebook.net/D14445
11 years ago
|
|
|
}
|
|
|
|
|
|
|
|
Snapshot::~Snapshot() {
|
|
|
|
}
|
|
|
|
|
|
|
|
Status DestroyDB(const std::string& dbname, const Options& options) {
|
|
|
|
const InternalKeyComparator comparator(options.comparator);
|
|
|
|
const Options& soptions(SanitizeOptions(dbname, &comparator, options));
|
|
|
|
Env* env = soptions.env;
|
|
|
|
std::vector<std::string> filenames;
|
|
|
|
std::vector<std::string> archiveFiles;
|
|
|
|
|
|
|
|
std::string archivedir = ArchivalDirectory(dbname);
|
|
|
|
// Ignore error in case directory does not exist
|
|
|
|
env->GetChildren(dbname, &filenames);
|
|
|
|
|
|
|
|
if (dbname != soptions.wal_dir) {
|
|
|
|
std::vector<std::string> logfilenames;
|
|
|
|
env->GetChildren(soptions.wal_dir, &logfilenames);
|
|
|
|
filenames.insert(filenames.end(), logfilenames.begin(), logfilenames.end());
|
|
|
|
archivedir = ArchivalDirectory(soptions.wal_dir);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (filenames.empty()) {
|
|
|
|
return Status::OK();
|
|
|
|
}
|
|
|
|
|
|
|
|
FileLock* lock;
|
|
|
|
const std::string lockname = LockFileName(dbname);
|
|
|
|
Status result = env->LockFile(lockname, &lock);
|
|
|
|
if (result.ok()) {
|
|
|
|
uint64_t number;
|
|
|
|
FileType type;
|
|
|
|
InfoLogPrefix info_log_prefix(!options.db_log_dir.empty(), dbname);
|
|
|
|
for (size_t i = 0; i < filenames.size(); i++) {
|
|
|
|
if (ParseFileName(filenames[i], &number, info_log_prefix.prefix, &type) &&
|
|
|
|
type != kDBLockFile) { // Lock file will be deleted at end
|
|
|
|
Status del;
|
|
|
|
if (type == kMetaDatabase) {
|
|
|
|
del = DestroyDB(dbname + "/" + filenames[i], options);
|
|
|
|
} else if (type == kLogFile) {
|
|
|
|
del = env->DeleteFile(soptions.wal_dir + "/" + filenames[i]);
|
|
|
|
} else {
|
|
|
|
del = env->DeleteFile(dbname + "/" + filenames[i]);
|
|
|
|
}
|
|
|
|
if (result.ok() && !del.ok()) {
|
|
|
|
result = del;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for (auto& db_path : options.db_paths) {
|
|
|
|
env->GetChildren(db_path.path, &filenames);
|
|
|
|
for (size_t i = 0; i < filenames.size(); i++) {
|
|
|
|
if (ParseFileName(filenames[i], &number, &type) &&
|
|
|
|
type == kTableFile) { // Lock file will be deleted at end
|
|
|
|
Status del = env->DeleteFile(db_path.path + "/" + filenames[i]);
|
|
|
|
if (result.ok() && !del.ok()) {
|
|
|
|
result = del;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
env->GetChildren(archivedir, &archiveFiles);
|
|
|
|
// Delete archival files.
|
|
|
|
for (size_t i = 0; i < archiveFiles.size(); ++i) {
|
|
|
|
if (ParseFileName(archiveFiles[i], &number, &type) &&
|
|
|
|
type == kLogFile) {
|
|
|
|
Status del = env->DeleteFile(archivedir + "/" + archiveFiles[i]);
|
|
|
|
if (result.ok() && !del.ok()) {
|
|
|
|
result = del;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// ignore case where no archival directory is present.
|
|
|
|
env->DeleteDir(archivedir);
|
|
|
|
|
|
|
|
env->UnlockFile(lock); // Ignore error since state is already gone
|
|
|
|
env->DeleteFile(lockname);
|
|
|
|
env->DeleteDir(dbname); // Ignore error in case dir contains other files
|
|
|
|
env->DeleteDir(soptions.wal_dir);
|
|
|
|
}
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
//
|
|
|
|
// A global method that can dump out the build version
|
|
|
|
void DumpRocksDBBuildVersion(Logger * log) {
|
|
|
|
#if !defined(IOS_CROSS_COMPILE)
|
|
|
|
// if we compile with Xcode, we don't run build_detect_vesion, so we don't
|
|
|
|
// generate util/build_version.cc
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, log,
|
|
|
|
"RocksDB version: %d.%d.%d\n", ROCKSDB_MAJOR, ROCKSDB_MINOR,
|
|
|
|
ROCKSDB_PATCH);
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, log, "Git sha %s", rocksdb_build_git_sha);
|
|
|
|
Log(InfoLogLevel::INFO_LEVEL, log, "Compile time %s %s",
|
|
|
|
rocksdb_build_compile_time, rocksdb_build_compile_date);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace rocksdb
|