Support group commits of version edits (#3944)

Summary:
This PR supports the group commit of multiple version edit entries corresponding to different column families. Column family drop/creation still cannot be grouped. This PR is a subset of [PR 3752](https://github.com/facebook/rocksdb/pull/3752).
Closes https://github.com/facebook/rocksdb/pull/3944

Differential Revision: D8432536

Pulled By: riversand963

fbshipit-source-id: 8f11bd05193b6c0d9272d82e44b676abfac113cb
main
Yanqin Jin 7 years ago committed by Facebook Github Bot
parent 0a5b5d88b2
commit 26d67e357e
  1. 2
      cache/lru_cache.cc
  2. 18
      db/memtable_list.cc
  3. 350
      db/version_set.cc
  4. 32
      db/version_set.h
  5. 123
      db/version_set_test.cc
  6. 4
      include/rocksdb/utilities/transaction_db.h
  7. 1
      monitoring/statistics.h

@ -510,7 +510,7 @@ void LRUCache::DisownData() {
shards_ = nullptr; shards_ = nullptr;
num_shards_ = 0; num_shards_ = 0;
#endif #endif
#else // __clang__ #else // __clang__
#ifndef __SANITIZE_ADDRESS__ #ifndef __SANITIZE_ADDRESS__
shards_ = nullptr; shards_ = nullptr;
num_shards_ = 0; num_shards_ = 0;

@ -41,7 +41,6 @@ void MemTableListVersion::UnrefMemTable(autovector<MemTable*>* to_delete,
to_delete->push_back(m); to_delete->push_back(m);
assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage()); assert(*parent_memtable_list_memory_usage_ >= m->ApproximateMemoryUsage());
*parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage(); *parent_memtable_list_memory_usage_ -= m->ApproximateMemoryUsage();
} else {
} }
} }
@ -401,7 +400,22 @@ Status MemTableList::InstallMemtableFlushResults(
// All the later memtables that have the same filenum // All the later memtables that have the same filenum
// are part of the same batch. They can be committed now. // are part of the same batch. They can be committed now.
uint64_t mem_id = 1; // how many memtables have been flushed. uint64_t mem_id = 1; // how many memtables have been flushed.
if (s.ok()) { // commit new state
// commit new state only if the column family is NOT dropped.
// The reason is as follows (refer to
// ColumnFamilyTest.FlushAndDropRaceCondition).
// If the column family is dropped, then according to LogAndApply, its
// corrresponding flush operation is NOT written to the MANIFEST. This
// means the DB is not aware of the L0 files generated from the flush.
// By committing the new state, we remove the memtable from the memtable
// list. Creating an iterator on this column family will not be able to
// read full data since the memtable is removed, and the DB is not aware
// of the L0 files, causing MergingIterator unable to build child
// iterators. RocksDB contract requires that the iterator can be created
// on a dropped column family, and we must be able to
// read full data as long as column family handle is not deleted, even if
// the column family is dropped.
if (s.ok() && !cfd->IsDropped()) { // commit new state
while (batch_count-- > 0) { while (batch_count-- > 0) {
MemTable* m = current_->memlist_.back(); MemTable* m = current_->memlist_.back();
ROCKS_LOG_BUFFER(log_buffer, "[%s] Level-0 commit table #%" PRIu64 ROCKS_LOG_BUFFER(log_buffer, "[%s] Level-0 commit table #%" PRIu64

@ -2645,11 +2645,17 @@ struct VersionSet::ManifestWriter {
bool done; bool done;
InstrumentedCondVar cv; InstrumentedCondVar cv;
ColumnFamilyData* cfd; ColumnFamilyData* cfd;
const MutableCFOptions mutable_cf_options;
const autovector<VersionEdit*>& edit_list; const autovector<VersionEdit*>& edit_list;
explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd, explicit ManifestWriter(InstrumentedMutex* mu, ColumnFamilyData* _cfd,
const MutableCFOptions& cf_options,
const autovector<VersionEdit*>& e) const autovector<VersionEdit*>& e)
: done(false), cv(mu), cfd(_cfd), edit_list(e) {} : done(false),
cv(mu),
cfd(_cfd),
mutable_cf_options(cf_options),
edit_list(e) {}
}; };
VersionSet::VersionSet(const std::string& dbname, VersionSet::VersionSet(const std::string& dbname,
@ -2724,90 +2730,78 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
v->next_->prev_ = v; v->next_->prev_ = v;
} }
Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, Status VersionSet::ProcessManifestWrites(
const MutableCFOptions& mutable_cf_options, std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
const autovector<VersionEdit*>& edit_list, Directory* db_directory, bool new_descriptor_log,
InstrumentedMutex* mu, Directory* db_directory, const ColumnFamilyOptions* new_cf_options) {
bool new_descriptor_log, assert(!writers.empty());
const ColumnFamilyOptions* new_cf_options) { ManifestWriter& first_writer = writers.front();
mu->AssertHeld(); ManifestWriter* last_writer = &first_writer;
// num of edits
auto num_edits = edit_list.size();
if (num_edits == 0) {
return Status::OK();
} else if (num_edits > 1) {
#ifndef NDEBUG
// no group commits for column family add or drop
for (auto& edit : edit_list) {
assert(!edit->IsColumnFamilyManipulation());
}
#endif
}
// column_family_data can be nullptr only if this is column_family_add.
// in that case, we also need to specify ColumnFamilyOptions
if (column_family_data == nullptr) {
assert(num_edits == 1);
assert(edit_list[0]->is_column_family_add_);
assert(new_cf_options != nullptr);
}
// queue our request assert(!manifest_writers_.empty());
ManifestWriter w(mu, column_family_data, edit_list); assert(manifest_writers_.front() == &first_writer);
manifest_writers_.push_back(&w);
while (!w.done && &w != manifest_writers_.front()) {
w.cv.Wait();
}
if (w.done) {
return w.status;
}
if (column_family_data != nullptr && column_family_data->IsDropped()) {
// if column family is dropped by the time we get here, no need to write
// anything to the manifest
manifest_writers_.pop_front();
// Notify new head of write queue
if (!manifest_writers_.empty()) {
manifest_writers_.front()->cv.Signal();
}
// we steal this code to also inform about cf-drop
return Status::ShutdownInProgress();
}
autovector<VersionEdit*> batch_edits; autovector<VersionEdit*> batch_edits;
Version* v = nullptr; autovector<Version*> versions;
std::unique_ptr<BaseReferencedVersionBuilder> builder_guard(nullptr); autovector<const MutableCFOptions*> mutable_cf_options_ptrs;
std::vector<std::unique_ptr<BaseReferencedVersionBuilder>> builder_guards;
// process all requests in the queue
ManifestWriter* last_writer = &w; if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
assert(!manifest_writers_.empty()); // No group commits for column family add or drop
assert(manifest_writers_.front() == &w); LogAndApplyCFHelper(first_writer.edit_list.front());
if (w.edit_list.front()->IsColumnFamilyManipulation()) { batch_edits.push_back(first_writer.edit_list.front());
// no group commits for column family add or drop
LogAndApplyCFHelper(w.edit_list.front());
batch_edits.push_back(w.edit_list.front());
} else { } else {
v = new Version(column_family_data, this, env_options_, mutable_cf_options, auto it = manifest_writers_.cbegin();
current_version_number_++); while (it != manifest_writers_.cend()) {
builder_guard.reset(new BaseReferencedVersionBuilder(column_family_data)); if ((*it)->edit_list.front()->IsColumnFamilyManipulation()) {
auto* builder = builder_guard->version_builder();
for (const auto& writer : manifest_writers_) {
if (writer->edit_list.front()->IsColumnFamilyManipulation() ||
writer->cfd->GetID() != column_family_data->GetID()) {
// no group commits for column family add or drop // no group commits for column family add or drop
// also, group commits across column families are not supported
break; break;
} }
last_writer = writer; last_writer = *(it++);
for (const auto& edit : writer->edit_list) { assert(last_writer != nullptr);
LogAndApplyHelper(column_family_data, builder, v, edit, mu); if (last_writer->cfd != nullptr && last_writer->cfd->IsDropped()) {
batch_edits.push_back(edit); continue;
}
// We do a linear search on versions because versions is small.
// TODO(yanqin) maybe consider unordered_map
Version* version = nullptr;
VersionBuilder* builder = nullptr;
for (int i = 0; i != static_cast<int>(versions.size()); ++i) {
uint32_t cf_id = last_writer->cfd->GetID();
if (versions[i]->cfd()->GetID() == cf_id) {
version = versions[i];
assert(!builder_guards.empty() &&
builder_guards.size() == versions.size());
builder = builder_guards[i]->version_builder();
TEST_SYNC_POINT_CALLBACK(
"VersionSet::ProcessManifestWrites:SameColumnFamily", &cf_id);
break;
}
}
if (version == nullptr) {
version = new Version(last_writer->cfd, this, env_options_,
last_writer->mutable_cf_options,
current_version_number_++);
versions.push_back(version);
mutable_cf_options_ptrs.push_back(&last_writer->mutable_cf_options);
builder_guards.emplace_back(
new BaseReferencedVersionBuilder(last_writer->cfd));
builder = builder_guards.back()->version_builder();
}
assert(builder != nullptr); // make checker happy
for (const auto& e : last_writer->edit_list) {
LogAndApplyHelper(last_writer->cfd, builder, version, e, mu);
batch_edits.push_back(e);
} }
} }
builder->SaveTo(v->storage_info()); for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
assert(!builder_guards.empty() &&
builder_guards.size() == versions.size());
auto* builder = builder_guards[i]->version_builder();
builder->SaveTo(versions[i]->storage_info());
}
} }
// Initialize new descriptor log file if necessary by creating
// a temporary file that contains a snapshot of the current version.
uint64_t new_manifest_file_size = 0; uint64_t new_manifest_file_size = 0;
Status s; Status s;
@ -2822,39 +2816,39 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
} }
if (new_descriptor_log) { if (new_descriptor_log) {
// if we're writing out new snapshot make sure to persist max column family // if we are writing out new snapshot make sure to persist max column
// family.
if (column_family_set_->GetMaxColumnFamily() > 0) { if (column_family_set_->GetMaxColumnFamily() > 0) {
w.edit_list.front()->SetMaxColumnFamily( first_writer.edit_list.front()->SetMaxColumnFamily(
column_family_set_->GetMaxColumnFamily()); column_family_set_->GetMaxColumnFamily());
} }
} }
// Unlock during expensive operations. New writes cannot get here
// because &w is ensuring that all new writes get queued.
{ {
EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_); EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_);
// Before releasing mutex, make a copy of mutable_cf_options and pass to
// `PrepareApply` to avoided a potential data race with backgroundflush
MutableCFOptions mutable_cf_options_copy(mutable_cf_options);
mu->Unlock(); mu->Unlock();
TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest"); TEST_SYNC_POINT("VersionSet::LogAndApply:WriteManifest");
if (!w.edit_list.front()->IsColumnFamilyManipulation() && if (!first_writer.edit_list.front()->IsColumnFamilyManipulation() &&
this->GetColumnFamilySet()->get_table_cache()->GetCapacity() == column_family_set_->get_table_cache()->GetCapacity() ==
TableCache::kInfiniteCapacity) { TableCache::kInfiniteCapacity) {
// unlimited table cache. Pre-load table handle now. for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
// Need to do it out of the mutex. assert(!builder_guards.empty() &&
builder_guard->version_builder()->LoadTableHandlers( builder_guards.size() == versions.size());
column_family_data->internal_stats(), assert(!mutable_cf_options_ptrs.empty() &&
column_family_data->ioptions()->optimize_filters_for_hits, builder_guards.size() == versions.size());
true /* prefetch_index_and_filter_in_cache */, ColumnFamilyData* cfd = versions[i]->cfd_;
mutable_cf_options.prefix_extractor.get()); builder_guards[i]->version_builder()->LoadTableHandlers(
cfd->internal_stats(), cfd->ioptions()->optimize_filters_for_hits,
true /* prefetch_index_and_filter_in_cache */,
mutable_cf_options_ptrs[i]->prefix_extractor.get());
}
} }
// This is fine because everything inside of this block is serialized -- // This is fine because everything inside of this block is serialized --
// only one thread can be here at the same time // only one thread can be here at the same time
if (new_descriptor_log) { if (new_descriptor_log) {
// create manifest file // create new manifest file
ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n", ROCKS_LOG_INFO(db_options_->info_log, "Creating manifest %" PRIu64 "\n",
pending_manifest_file_number_); pending_manifest_file_number_);
unique_ptr<WritableFile> descriptor_file; unique_ptr<WritableFile> descriptor_file;
@ -2873,18 +2867,19 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
} }
} }
if (!w.edit_list.front()->IsColumnFamilyManipulation()) { if (!first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
// This is cpu-heavy operations, which should be called outside mutex. for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
v->PrepareApply(mutable_cf_options_copy, true); versions[i]->PrepareApply(*mutable_cf_options_ptrs[i], true);
}
} }
// Write new record to MANIFEST log // Write new records to MANIFEST log
if (s.ok()) { if (s.ok()) {
for (auto& e : batch_edits) { for (auto& e : batch_edits) {
std::string record; std::string record;
if (!e->EncodeTo(&record)) { if (!e->EncodeTo(&record)) {
s = Status::Corruption( s = Status::Corruption("Unable to encode VersionEdit:" +
"Unable to Encode VersionEdit:" + e->DebugString(true)); e->DebugString(true));
break; break;
} }
TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord", TEST_KILL_RANDOM("VersionSet::LogAndApply:BeforeAddRecord",
@ -2898,7 +2893,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
s = SyncManifest(env_, db_options_, descriptor_log_->file()); s = SyncManifest(env_, db_options_, descriptor_log_->file());
} }
if (!s.ok()) { if (!s.ok()) {
ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write: %s\n", ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n",
s.ToString().c_str()); s.ToString().c_str());
} }
} }
@ -2915,7 +2910,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
new_manifest_file_size = descriptor_log_->file()->GetFileSize(); new_manifest_file_size = descriptor_log_->file()->GetFileSize();
} }
if (w.edit_list.front()->is_column_family_drop_) { if (first_writer.edit_list.front()->is_column_family_drop_) {
TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:0");
TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:1");
TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2"); TEST_SYNC_POINT("VersionSet::LogAndApply::ColumnFamilyDrop:2");
@ -2926,25 +2921,24 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
mu->Lock(); mu->Lock();
} }
// Append the old mainfest file to the obsolete_manifests_ list to be deleted // Append the old manifest file to the obsolete_manifest_ list to be deleted
// by PurgeObsoleteFiles later. // by PurgeObsoleteFiles later.
if (s.ok() && new_descriptor_log) { if (s.ok() && new_descriptor_log) {
obsolete_manifests_.emplace_back( obsolete_manifests_.emplace_back(
DescriptorFileName("", manifest_file_number_)); DescriptorFileName("", manifest_file_number_));
} }
// Install the new version // Install the new versions
if (s.ok()) { if (s.ok()) {
if (w.edit_list.front()->is_column_family_add_) { if (first_writer.edit_list.front()->is_column_family_add_) {
// no group commit on column family add
assert(batch_edits.size() == 1); assert(batch_edits.size() == 1);
assert(new_cf_options != nullptr); assert(new_cf_options != nullptr);
CreateColumnFamily(*new_cf_options, w.edit_list.front()); CreateColumnFamily(*new_cf_options, first_writer.edit_list.front());
} else if (w.edit_list.front()->is_column_family_drop_) { } else if (first_writer.edit_list.front()->is_column_family_drop_) {
assert(batch_edits.size() == 1); assert(batch_edits.size() == 1);
column_family_data->SetDropped(); first_writer.cfd->SetDropped();
if (column_family_data->Unref()) { if (first_writer.cfd->Unref()) {
delete column_family_data; delete first_writer.cfd;
} }
} else { } else {
uint64_t max_log_number_in_batch = 0; uint64_t max_log_number_in_batch = 0;
@ -2960,60 +2954,158 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data,
} }
} }
if (max_log_number_in_batch != 0) { if (max_log_number_in_batch != 0) {
assert(column_family_data->GetLogNumber() <= max_log_number_in_batch); for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
column_family_data->SetLogNumber(max_log_number_in_batch); ColumnFamilyData* cfd = versions[i]->cfd_;
assert(cfd->GetLogNumber() <= max_log_number_in_batch);
cfd->SetLogNumber(max_log_number_in_batch);
}
} }
if (last_min_log_number_to_keep != 0) { if (last_min_log_number_to_keep != 0) {
// Should only be set in 2PC mode. // Should only be set in 2PC mode.
MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep); MarkMinLogNumberToKeep2PC(last_min_log_number_to_keep);
} }
AppendVersion(column_family_data, v); for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
ColumnFamilyData* cfd = versions[i]->cfd_;
AppendVersion(cfd, versions[i]);
}
} }
manifest_file_number_ = pending_manifest_file_number_; manifest_file_number_ = pending_manifest_file_number_;
manifest_file_size_ = new_manifest_file_size; manifest_file_size_ = new_manifest_file_size;
prev_log_number_ = w.edit_list.front()->prev_log_number_; prev_log_number_ = first_writer.edit_list.front()->prev_log_number_;
} else { } else {
std::string version_edits; std::string version_edits;
for (auto& e : batch_edits) { for (auto& e : batch_edits) {
version_edits = version_edits + "\n" + e->DebugString(true); version_edits += ("\n" + e->DebugString(true));
}
ROCKS_LOG_ERROR(db_options_->info_log,
"Error in committing version edit to MANIFEST: %s",
version_edits.c_str());
for (auto v : versions) {
delete v;
} }
ROCKS_LOG_ERROR(
db_options_->info_log,
"[%s] Error in committing version edit to MANIFEST: %s",
column_family_data ? column_family_data->GetName().c_str() : "<null>",
version_edits.c_str());
delete v;
if (new_descriptor_log) { if (new_descriptor_log) {
ROCKS_LOG_INFO(db_options_->info_log, "Deleting manifest %" PRIu64 ROCKS_LOG_INFO(db_options_->info_log,
" current manifest %" PRIu64 "\n", "Deleting manifest %" PRIu64 " current manifest %" PRIu64
"\n",
manifest_file_number_, pending_manifest_file_number_); manifest_file_number_, pending_manifest_file_number_);
descriptor_log_.reset(); descriptor_log_.reset();
env_->DeleteFile( env_->DeleteFile(
DescriptorFileName(dbname_, pending_manifest_file_number_)); DescriptorFileName(dbname_, pending_manifest_file_number_));
} }
} }
pending_manifest_file_number_ = 0; pending_manifest_file_number_ = 0;
// wake up all the waiting writers // wake up all the waiting writers
while (true) { while (true) {
ManifestWriter* ready = manifest_writers_.front(); ManifestWriter* ready = manifest_writers_.front();
manifest_writers_.pop_front(); manifest_writers_.pop_front();
if (ready != &w) { bool need_signal = true;
ready->status = s; for (const auto& w : writers) {
ready->done = true; if (&w == ready) {
need_signal = false;
break;
}
}
ready->status = s;
ready->done = true;
if (need_signal) {
ready->cv.Signal(); ready->cv.Signal();
} }
if (ready == last_writer) break; if (ready == last_writer) {
break;
}
} }
// Notify new head of write queue
if (!manifest_writers_.empty()) { if (!manifest_writers_.empty()) {
manifest_writers_.front()->cv.Signal(); manifest_writers_.front()->cv.Signal();
} }
return s; return s;
} }
// 'datas' is gramatically incorrect. We still use this notation is to indicate
// that this variable represents a collection of column_family_data.
Status VersionSet::LogAndApply(
const std::vector<ColumnFamilyData*>& column_family_datas,
const std::vector<MutableCFOptions>& mutable_cf_options_list,
const std::vector<autovector<VersionEdit*>>& edit_lists,
InstrumentedMutex* mu, Directory* db_directory, bool new_descriptor_log,
const ColumnFamilyOptions* new_cf_options) {
mu->AssertHeld();
int num_edits = 0;
for (const auto& elist : edit_lists) {
num_edits += static_cast<int>(elist.size());
}
if (num_edits == 0) {
return Status::OK();
} else if (num_edits > 1) {
#ifndef NDEBUG
for (const auto& edit_list : edit_lists) {
for (const auto& edit : edit_list) {
assert(!edit->IsColumnFamilyManipulation());
}
}
#endif /* ! NDEBUG */
}
int num_cfds = static_cast<int>(column_family_datas.size());
if (num_cfds == 1 && column_family_datas[0] == nullptr) {
assert(edit_lists.size() == 1 && edit_lists[0].size() == 1);
assert(edit_lists[0][0]->is_column_family_add_);
assert(new_cf_options != nullptr);
}
std::deque<ManifestWriter> writers;
if (num_cfds > 0) {
assert(static_cast<size_t>(num_cfds) == mutable_cf_options_list.size());
assert(static_cast<size_t>(num_cfds) == edit_lists.size());
}
for (int i = 0; i < num_cfds; ++i) {
writers.emplace_back(mu, column_family_datas[i], mutable_cf_options_list[i],
edit_lists[i]);
manifest_writers_.push_back(&writers[i]);
}
assert(!writers.empty());
ManifestWriter& first_writer = writers.front();
while (!first_writer.done && &first_writer != manifest_writers_.front()) {
first_writer.cv.Wait();
}
if (first_writer.done) {
// All non-CF-manipulation operations can be grouped together and committed
// to MANIFEST. They should all have finished. The status code is stored in
// the first manifest writer.
#ifndef NDEBUG
for (const auto& writer : writers) {
assert(writer.done);
}
#endif /* !NDEBUG */
return first_writer.status;
}
int num_undropped_cfds = 0;
for (auto cfd : column_family_datas) {
// if cfd == nullptr, it is a column family add.
if (cfd == nullptr || !cfd->IsDropped()) {
++num_undropped_cfds;
}
}
if (0 == num_undropped_cfds) {
// TODO (yanqin) maybe use a different status code to denote column family
// drop other than OK and ShutdownInProgress
for (int i = 0; i != num_cfds; ++i) {
manifest_writers_.pop_front();
}
// Notify new head of manifest write queue.
if (!manifest_writers_.empty()) {
manifest_writers_.front()->cv.Signal();
}
return Status::OK();
}
return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log,
new_cf_options);
}
void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) { void VersionSet::LogAndApplyCFHelper(VersionEdit* edit) {
assert(edit->IsColumnFamilyManipulation()); assert(edit->IsColumnFamilyManipulation());
edit->SetNextFile(next_file_number_.load()); edit->SetNextFile(next_file_number_.load());
@ -4023,7 +4115,7 @@ InternalIterator* VersionSet::MakeInputIterator(
nullptr /* table_reader_ptr */, nullptr /* table_reader_ptr */,
nullptr /* no per level latency histogram */, nullptr /* no per level latency histogram */,
true /* for_compaction */, nullptr /* arena */, true /* for_compaction */, nullptr /* arena */,
false /* skip_filters */, (int)which /* level */); false /* skip_filters */, static_cast<int>(which) /* level */);
} }
} else { } else {
// Create concatenating iterator for the files from this level // Create concatenating iterator for the files from this level
@ -4034,7 +4126,7 @@ InternalIterator* VersionSet::MakeInputIterator(
false /* should_sample */, false /* should_sample */,
nullptr /* no per level latency histogram */, nullptr /* no per level latency histogram */,
true /* for_compaction */, false /* skip_filters */, true /* for_compaction */, false /* skip_filters */,
(int)which /* level */, range_del_agg); static_cast<int>(which) /* level */, range_del_agg);
} }
} }
} }

@ -748,9 +748,11 @@ class VersionSet {
InstrumentedMutex* mu, Directory* db_directory = nullptr, InstrumentedMutex* mu, Directory* db_directory = nullptr,
bool new_descriptor_log = false, bool new_descriptor_log = false,
const ColumnFamilyOptions* column_family_options = nullptr) { const ColumnFamilyOptions* column_family_options = nullptr) {
autovector<VersionEdit*> edit_list; std::vector<ColumnFamilyData*> cfds(1, column_family_data);
edit_list.push_back(edit); std::vector<MutableCFOptions> mutable_cf_options_list(1,
return LogAndApply(column_family_data, mutable_cf_options, edit_list, mu, mutable_cf_options);
std::vector<autovector<VersionEdit*>> edit_lists(1, {edit});
return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
db_directory, new_descriptor_log, column_family_options); db_directory, new_descriptor_log, column_family_options);
} }
// The batch version. If edit_list.size() > 1, caller must ensure that // The batch version. If edit_list.size() > 1, caller must ensure that
@ -760,7 +762,24 @@ class VersionSet {
const MutableCFOptions& mutable_cf_options, const MutableCFOptions& mutable_cf_options,
const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu, const autovector<VersionEdit*>& edit_list, InstrumentedMutex* mu,
Directory* db_directory = nullptr, bool new_descriptor_log = false, Directory* db_directory = nullptr, bool new_descriptor_log = false,
const ColumnFamilyOptions* column_family_options = nullptr); const ColumnFamilyOptions* column_family_options = nullptr) {
std::vector<ColumnFamilyData*> cfds(1, column_family_data);
std::vector<MutableCFOptions> mutable_cf_options_list(1,
mutable_cf_options);
std::vector<autovector<VersionEdit*>> edit_lists(1, edit_list);
return LogAndApply(cfds, mutable_cf_options_list, edit_lists, mu,
db_directory, new_descriptor_log, column_family_options);
}
// The across-multi-cf batch version. If edit_lists contain more than
// 1 version edits, caller must ensure that no edit in the []list is column
// family manipulation.
Status LogAndApply(const std::vector<ColumnFamilyData*>& cfds,
const std::vector<MutableCFOptions>& mutable_cf_options,
const std::vector<autovector<VersionEdit*>>& edit_lists,
InstrumentedMutex* mu, Directory* db_directory = nullptr,
bool new_descriptor_log = false,
const ColumnFamilyOptions* new_cf_options = nullptr);
// Recover the last saved descriptor from persistent storage. // Recover the last saved descriptor from persistent storage.
// If read_only == true, Recover() will not complain if some column families // If read_only == true, Recover() will not complain if some column families
@ -965,6 +984,11 @@ class VersionSet {
ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options, ColumnFamilyData* CreateColumnFamily(const ColumnFamilyOptions& cf_options,
VersionEdit* edit); VersionEdit* edit);
Status ProcessManifestWrites(std::deque<ManifestWriter>& writers,
InstrumentedMutex* mu, Directory* db_directory,
bool new_descriptor_log,
const ColumnFamilyOptions* new_cf_options);
std::unique_ptr<ColumnFamilySet> column_family_set_; std::unique_ptr<ColumnFamilySet> column_family_set_;
Env* const env_; Env* const env_;

@ -8,6 +8,8 @@
// found in the LICENSE file. See the AUTHORS file for names of contributors. // found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/version_set.h" #include "db/version_set.h"
#include "db/log_writer.h"
#include "table/mock_table.h"
#include "util/logging.h" #include "util/logging.h"
#include "util/testharness.h" #include "util/testharness.h"
#include "util/testutil.h" #include "util/testutil.h"
@ -452,6 +454,127 @@ TEST_F(FindLevelFileTest, LevelOverlappingFiles) {
ASSERT_TRUE(Overlaps("600", "700")); ASSERT_TRUE(Overlaps("600", "700"));
} }
class ManifestWriterTest : public testing::Test {
public:
ManifestWriterTest()
: env_(Env::Default()),
dbname_(test::TmpDir() + "/version_set_test"),
db_options_(),
mutable_cf_options_(cf_options_),
table_cache_(NewLRUCache(50000, 16)),
write_buffer_manager_(db_options_.db_write_buffer_size),
versions_(new VersionSet(dbname_, &db_options_, env_options_,
table_cache_.get(), &write_buffer_manager_,
&write_controller_)),
shutting_down_(false),
mock_table_factory_(std::make_shared<mock::MockTableFactory>()) {
EXPECT_OK(env_->CreateDirIfMissing(dbname_));
db_options_.db_paths.emplace_back(dbname_,
std::numeric_limits<uint64_t>::max());
}
// Create DB with 3 column families.
void NewDB() {
VersionEdit new_db;
new_db.SetLogNumber(0);
new_db.SetNextFile(2);
new_db.SetLastSequence(0);
const std::vector<std::string> cf_names = {kDefaultColumnFamilyName,
"alice", "bob"};
const int kInitialNumOfCfs = static_cast<int>(cf_names.size());
autovector<VersionEdit> new_cfs;
uint64_t last_seq = 1;
uint32_t cf_id = 1;
for (int i = 1; i != kInitialNumOfCfs; ++i) {
VersionEdit new_cf;
new_cf.AddColumnFamily(cf_names[i]);
new_cf.SetColumnFamily(cf_id++);
new_cf.SetLogNumber(0);
new_cf.SetNextFile(2);
new_cf.SetLastSequence(last_seq++);
new_cfs.emplace_back(new_cf);
}
const std::string manifest = DescriptorFileName(dbname_, 1);
unique_ptr<WritableFile> file;
Status s = env_->NewWritableFile(
manifest, &file, env_->OptimizeForManifestWrite(env_options_));
ASSERT_OK(s);
unique_ptr<WritableFileWriter> file_writer(
new WritableFileWriter(std::move(file), env_options_));
{
log::Writer log(std::move(file_writer), 0, false);
std::string record;
new_db.EncodeTo(&record);
s = log.AddRecord(record);
for (const auto& e : new_cfs) {
e.EncodeTo(&record);
s = log.AddRecord(record);
ASSERT_OK(s);
}
}
ASSERT_OK(s);
// Make "CURRENT" file point to the new manifest file.
s = SetCurrentFile(env_, dbname_, 1, nullptr);
std::vector<ColumnFamilyDescriptor> column_families;
cf_options_.table_factory = mock_table_factory_;
for (const auto& cf_name : cf_names) {
column_families.emplace_back(cf_name, cf_options_);
}
EXPECT_OK(versions_->Recover(column_families, false));
EXPECT_EQ(kInitialNumOfCfs,
versions_->GetColumnFamilySet()->NumberOfColumnFamilies());
for (auto cfd : *versions_->GetColumnFamilySet()) {
cfds_.emplace_back(cfd);
}
}
Env* env_;
const std::string dbname_;
EnvOptions env_options_;
ImmutableDBOptions db_options_;
ColumnFamilyOptions cf_options_;
MutableCFOptions mutable_cf_options_;
std::shared_ptr<Cache> table_cache_;
WriteController write_controller_;
WriteBufferManager write_buffer_manager_;
std::unique_ptr<VersionSet> versions_;
InstrumentedMutex mutex_;
std::atomic<bool> shutting_down_;
std::shared_ptr<mock::MockTableFactory> mock_table_factory_;
std::vector<ColumnFamilyData*> cfds_;
};
TEST_F(ManifestWriterTest, SameColumnFamilyGroupCommit) {
NewDB();
const int kGroupSize = 5;
std::vector<VersionEdit> edits(kGroupSize);
std::vector<ColumnFamilyData*> cfds(kGroupSize, cfds_[0]);
std::vector<MutableCFOptions> all_mutable_cf_options(kGroupSize,
mutable_cf_options_);
std::vector<autovector<VersionEdit*>> edit_lists(kGroupSize);
for (int i = 0; i != kGroupSize; ++i) {
edit_lists[i].emplace_back(&edits[i]);
}
int count = 0;
SyncPoint::GetInstance()->SetCallBack(
"VersionSet::ProcessManifestWrites:SameColumnFamily", [&](void* arg) {
uint32_t* cf_id = reinterpret_cast<uint32_t*>(arg);
EXPECT_EQ(0, *cf_id);
++count;
});
SyncPoint::GetInstance()->EnableProcessing();
mutex_.Lock();
Status s =
versions_->LogAndApply(cfds, all_mutable_cf_options, edit_lists, &mutex_);
mutex_.Unlock();
EXPECT_OK(s);
EXPECT_EQ(kGroupSize - 1, count);
}
} // namespace rocksdb } // namespace rocksdb
int main(int argc, char** argv) { int main(int argc, char** argv) {

@ -171,8 +171,8 @@ struct DeadlockPath {
bool limit_exceeded; bool limit_exceeded;
int64_t deadlock_time; int64_t deadlock_time;
explicit DeadlockPath( explicit DeadlockPath(std::vector<DeadlockInfo> path_entry,
std::vector<DeadlockInfo> path_entry, const int64_t& dl_time) const int64_t& dl_time)
: path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {} : path(path_entry), limit_exceeded(false), deadlock_time(dl_time) {}
// empty path, limit exceeded constructor and default constructor // empty path, limit exceeded constructor and default constructor

@ -39,7 +39,6 @@ enum HistogramsInternal : uint32_t {
INTERNAL_HISTOGRAM_ENUM_MAX INTERNAL_HISTOGRAM_ENUM_MAX
}; };
class StatisticsImpl : public Statistics { class StatisticsImpl : public Statistics {
public: public:
StatisticsImpl(std::shared_ptr<Statistics> stats, StatisticsImpl(std::shared_ptr<Statistics> stats,

Loading…
Cancel
Save