MakeRoomForWrite() support for column families

Summary: Making room for write will be the hardest part of the column family implementation. For now, I just iterate through all column families and run MakeRoomForWrite() for every one.

Test Plan: make check does not complain

Reviewers: dhruba, haobo

CC: leveldb

Differential Revision: https://reviews.facebook.net/D15597
main
Igor Canadi 11 years ago
parent c37e7de669
commit 6973bb1722
  1. 14
      db/column_family.cc
  2. 14
      db/column_family.h
  3. 91
      db/db_impl.cc
  4. 14
      db/db_impl.h
  5. 4
      db/version_set.cc
  6. 10
      db/version_set.h

@ -17,9 +17,7 @@
namespace rocksdb {
SuperVersion::SuperVersion(const int num_memtables) {
to_delete.resize(num_memtables);
}
SuperVersion::SuperVersion() {}
SuperVersion::~SuperVersion() {
for (auto td : to_delete) {
@ -71,7 +69,8 @@ ColumnFamilyData::ColumnFamilyData(uint32_t id, const std::string& name,
imm_(options.min_write_buffer_number_to_merge),
super_version_(nullptr),
super_version_number_(0),
log_number_(0) {}
log_number_(0),
need_slowdown_for_num_level0_files_(false) {}
ColumnFamilyData::~ColumnFamilyData() {
if (super_version_ != nullptr) {
@ -95,6 +94,13 @@ ColumnFamilyData::~ColumnFamilyData() {
}
}
// Installs `current` as the current version and refreshes the flag that says
// whether writes should slow down because level 0 holds too many files.
// A negative level0_slowdown_writes_trigger leaves the flag false.
void ColumnFamilyData::SetCurrent(Version* current) {
  current_ = current;
  const auto slowdown_trigger = options_.level0_slowdown_writes_trigger;
  need_slowdown_for_num_level0_files_ =
      slowdown_trigger >= 0 && current_->NumLevelFiles(0) >= slowdown_trigger;
}
void ColumnFamilyData::CreateNewMemtable() {
assert(current_ != nullptr);
if (mem_ != nullptr) {

@ -36,7 +36,7 @@ struct SuperVersion {
std::vector<MemTable*> to_delete;
// should be called outside the mutex
explicit SuperVersion(const int num_memtables = 0);
SuperVersion();
~SuperVersion();
SuperVersion* Ref();
// Returns true if this was the last reference and caller should
@ -72,7 +72,7 @@ class ColumnFamilyData {
Version* current() { return current_; }
Version* dummy_versions() { return dummy_versions_; }
void SetMemtable(MemTable* new_mem) { mem_ = new_mem; }
void SetCurrent(Version* current) { current_ = current; }
void SetCurrent(Version* current);
void CreateNewMemtable();
SuperVersion* GetSuperVersion() const { return super_version_; }
@ -85,6 +85,12 @@ class ColumnFamilyData {
// the clients to allocate SuperVersion outside of mutex.
SuperVersion* InstallSuperVersion(SuperVersion* new_superversion);
// Returns the flag indicating whether writes need to slow down because
// there are too many level-0 files.
bool NeedSlowdownForNumLevel0Files() const {
return need_slowdown_for_num_level0_files_;
}
private:
uint32_t id_;
const std::string name_;
@ -105,6 +111,10 @@ class ColumnFamilyData {
// Column Family. All earlier log files must be ignored and not
// recovered from
uint64_t log_number_;
// A flag indicating whether we should delay writes because
// we have too many level 0 files
bool need_slowdown_for_num_level0_files_;
};
// Thread safe only for reading without a writer. All access should be

@ -1298,8 +1298,7 @@ Status DBImpl::ReFitLevel(int level, int target_level) {
assert(level < NumberLevels());
SuperVersion* superversion_to_free = nullptr;
SuperVersion* new_superversion =
new SuperVersion(options_.max_write_buffer_number);
SuperVersion* new_superversion = new SuperVersion();
mutex_.Lock();
@ -2949,6 +2948,13 @@ std::vector<Status> DBImpl::MultiGet(
return statList;
}
// TODO(icanadi) Creating or dropping a column family while a write is in
// progress will cause a data race. In the write code path, we iterate through
// all column families and call MakeRoomForWrite() for each one.
// MakeRoomForWrite() can unlock the mutex and wait (to delay the write). If a
// column family is created or dropped while the mutex is released for that
// delay, the set we are iterating over changes underneath us.
// Solution TODO: enable iteration by chaining column families in
// circular linked lists
Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family_name,
ColumnFamilyHandle* handle) {
@ -3106,9 +3112,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
RecordTick(options_.statistics.get(), WRITE_DONE_BY_SELF, 1);
}
// May temporarily unlock and wait.
SuperVersion* superversion_to_free = nullptr;
Status status = MakeRoomForWrite(my_batch == nullptr, &superversion_to_free);
Status status;
for (auto cfd : *versions_->GetColumnFamilySet()) {
// May temporarily unlock and wait.
status = MakeRoomForWrite(cfd, my_batch == nullptr);
if (!status.ok()) {
break;
}
}
uint64_t last_sequence = versions_->LastSequence();
Writer* last_writer = &w;
if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions
@ -3209,7 +3220,6 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
writers_.front()->cv.Signal();
}
mutex_.Unlock();
delete superversion_to_free;
return status;
}
@ -3295,8 +3305,7 @@ uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) {
// REQUIRES: mutex_ is held
// REQUIRES: this thread is currently at the front of the writer queue
Status DBImpl::MakeRoomForWrite(bool force,
SuperVersion** superversion_to_free) {
Status DBImpl::MakeRoomForWrite(ColumnFamilyData* cfd, bool force) {
mutex_.AssertHeld();
assert(!writers_.empty());
bool allow_delay = !force;
@ -3305,14 +3314,13 @@ Status DBImpl::MakeRoomForWrite(bool force,
uint64_t rate_limit_delay_millis = 0;
Status s;
double score;
*superversion_to_free = nullptr;
while (true) {
if (!bg_error_.ok()) {
// Yield previous error
s = bg_error_;
break;
} else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) {
} else if (allow_delay && cfd->NeedSlowdownForNumLevel0Files()) {
// We are getting close to hitting a hard limit on the number of
// L0 files. Rather than delaying a single write by several
// seconds when we hit the hard limit, start delaying each
@ -3320,9 +3328,9 @@ Status DBImpl::MakeRoomForWrite(bool force,
// this delay hands over some CPU to the compaction thread in
// case it is sharing the same core as the writer.
uint64_t slowdown =
SlowdownAmount(default_cfd_->current()->NumLevelFiles(0),
options_.level0_slowdown_writes_trigger,
options_.level0_stop_writes_trigger);
SlowdownAmount(cfd->current()->NumLevelFiles(0),
cfd->options()->level0_slowdown_writes_trigger,
cfd->options()->level0_stop_writes_trigger);
mutex_.Unlock();
uint64_t delayed;
{
@ -3335,15 +3343,15 @@ Status DBImpl::MakeRoomForWrite(bool force,
allow_delay = false; // Do not delay a single write more than once
mutex_.Lock();
delayed_writes_++;
} else if (!force && (default_cfd_->mem()->ApproximateMemoryUsage() <=
options_.write_buffer_size)) {
} else if (!force && (cfd->mem()->ApproximateMemoryUsage() <=
cfd->options()->write_buffer_size)) {
// There is room in current memtable
if (allow_delay) {
DelayLoggingAndReset();
}
break;
} else if (default_cfd_->imm()->size() ==
options_.max_write_buffer_number - 1) {
} else if (cfd->imm()->size() ==
cfd->options()->max_write_buffer_number - 1) {
// We have filled up the current memtable, but the previous
// ones are still being compacted, so we wait.
DelayLoggingAndReset();
@ -3351,7 +3359,7 @@ Status DBImpl::MakeRoomForWrite(bool force,
uint64_t stall;
{
StopWatch sw(env_, options_.statistics.get(),
STALL_MEMTABLE_COMPACTION_COUNT);
STALL_MEMTABLE_COMPACTION_COUNT);
bg_cv_.Wait();
stall = sw.ElapsedMicros();
}
@ -3359,8 +3367,8 @@ Status DBImpl::MakeRoomForWrite(bool force,
STALL_MEMTABLE_COMPACTION_MICROS, stall);
internal_stats_.RecordWriteStall(InternalStats::MEMTABLE_COMPACTION,
stall);
} else if (default_cfd_->current()->NumLevelFiles(0) >=
options_.level0_stop_writes_trigger) {
} else if (cfd->current()->NumLevelFiles(0) >=
cfd->options()->level0_stop_writes_trigger) {
// There are too many level-0 files.
DelayLoggingAndReset();
Log(options_.info_log, "wait for fewer level0 files...\n");
@ -3374,10 +3382,10 @@ Status DBImpl::MakeRoomForWrite(bool force,
RecordTick(options_.statistics.get(), STALL_L0_NUM_FILES_MICROS, stall);
internal_stats_.RecordWriteStall(InternalStats::LEVEL0_NUM_FILES, stall);
} else if (allow_hard_rate_limit_delay && options_.hard_rate_limit > 1.0 &&
(score = default_cfd_->current()->MaxCompactionScore()) >
options_.hard_rate_limit) {
(score = cfd->current()->MaxCompactionScore()) >
cfd->options()->hard_rate_limit) {
// Delay a write when the compaction score for any level is too large.
int max_level = default_cfd_->current()->MaxCompactionScoreLevel();
int max_level = cfd->current()->MaxCompactionScoreLevel();
mutex_.Unlock();
uint64_t delayed;
{
@ -3392,26 +3400,25 @@ Status DBImpl::MakeRoomForWrite(bool force,
rate_limit_delay_millis += rate_limit;
RecordTick(options_.statistics.get(),
RATE_LIMIT_DELAY_MILLIS, rate_limit);
if (options_.rate_limit_delay_max_milliseconds > 0 &&
if (cfd->options()->rate_limit_delay_max_milliseconds > 0 &&
rate_limit_delay_millis >=
(unsigned)options_.rate_limit_delay_max_milliseconds) {
(unsigned)cfd->options()->rate_limit_delay_max_milliseconds) {
allow_hard_rate_limit_delay = false;
}
mutex_.Lock();
} else if (allow_soft_rate_limit_delay && options_.soft_rate_limit > 0.0 &&
(score = default_cfd_->current()->MaxCompactionScore()) >
options_.soft_rate_limit) {
} else if (allow_soft_rate_limit_delay &&
cfd->options()->soft_rate_limit > 0.0 &&
(score = cfd->current()->MaxCompactionScore()) >
cfd->options()->soft_rate_limit) {
// Delay a write when the compaction score for any level is too large.
// TODO: add statistics
mutex_.Unlock();
{
StopWatch sw(env_, options_.statistics.get(),
SOFT_RATE_LIMIT_DELAY_COUNT);
env_->SleepForMicroseconds(SlowdownAmount(
score,
options_.soft_rate_limit,
options_.hard_rate_limit)
);
env_->SleepForMicroseconds(
SlowdownAmount(score, cfd->options()->soft_rate_limit,
cfd->options()->hard_rate_limit));
rate_limit_delay_millis += sw.ElapsedMicros();
}
allow_soft_rate_limit_delay = false;
@ -3436,9 +3443,10 @@ Status DBImpl::MakeRoomForWrite(bool force,
if (s.ok()) {
// Our final size should be less than write_buffer_size
// (compression, etc) but err on the side of caution.
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size);
memtmp = new MemTable(internal_comparator_, options_);
new_superversion = new SuperVersion(options_.max_write_buffer_number);
lfile->SetPreallocationBlockSize(1.1 *
cfd->options()->write_buffer_size);
memtmp = new MemTable(internal_comparator_, *cfd->options());
new_superversion = new SuperVersion();
}
}
mutex_.Lock();
@ -3450,20 +3458,19 @@ Status DBImpl::MakeRoomForWrite(bool force,
}
logfile_number_ = new_log_number;
log_.reset(new log::Writer(std::move(lfile)));
default_cfd_->mem()->SetNextLogNumber(logfile_number_);
default_cfd_->imm()->Add(default_cfd_->mem());
cfd->mem()->SetNextLogNumber(logfile_number_);
cfd->imm()->Add(cfd->mem());
if (force) {
default_cfd_->imm()->FlushRequested();
cfd->imm()->FlushRequested();
}
memtmp->Ref();
memtmp->SetLogNumber(logfile_number_);
default_cfd_->SetMemtable(memtmp);
cfd->SetMemtable(memtmp);
Log(options_.info_log, "New memtable created with log file: #%lu\n",
(unsigned long)logfile_number_);
force = false; // Do not force another compaction if have room
MaybeScheduleFlushOrCompaction();
*superversion_to_free =
default_cfd_->InstallSuperVersion(new_superversion);
delete cfd->InstallSuperVersion(new_superversion);
}
}
return s;

@ -201,9 +201,9 @@ class DBImpl : public DB {
// a list of memtables to be free
std::vector<MemTable *> memtables_to_free;
SuperVersion* superversion_to_free; // if nullptr nothing to free
SuperVersion* superversion_to_free; // if nullptr nothing to free
SuperVersion* new_superversion; // if nullptr no new superversion
SuperVersion* new_superversion; // if nullptr no new superversion
// the current manifest_file_number, log_number and prev_log_number
// that corresponds to the set of files in 'live'.
@ -216,8 +216,7 @@ class DBImpl : public DB {
prev_log_number = 0;
memtables_to_free.reserve(num_memtables);
superversion_to_free = nullptr;
new_superversion =
create_superversion ? new SuperVersion(num_memtables) : nullptr;
new_superversion = create_superversion ? new SuperVersion() : nullptr;
}
~DeletionState() {
@ -303,11 +302,8 @@ class DBImpl : public DB {
uint64_t* filenumber);
uint64_t SlowdownAmount(int n, double bottom, double top);
// MakeRoomForWrite will return superversion_to_free through an argument,
// which the caller needs to delete. We do it because the caller can delete
// the superversion outside of the mutex.
Status MakeRoomForWrite(bool force /* compact even if there is room? */,
SuperVersion** superversion_to_free);
Status MakeRoomForWrite(ColumnFamilyData* cfd,
bool force /* flush even if there is room? */);
void BuildBatchGroup(Writer** last_writer,
autovector<WriteBatch*>* write_batch_group);

@ -1377,7 +1377,6 @@ VersionSet::VersionSet(const std::string& dbname, const Options* options,
log_number_(0),
prev_log_number_(0),
num_levels_(options_->num_levels),
need_slowdown_for_num_level0_files_(false),
current_version_number_(0),
manifest_file_size_(0),
storage_options_(storage_options),
@ -1413,9 +1412,6 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data,
current->Unref();
}
column_family_data->SetCurrent(v);
need_slowdown_for_num_level0_files_ =
(options_->level0_slowdown_writes_trigger >= 0 &&
v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger);
v->Ref();
// Append to linked list

@ -315,12 +315,6 @@ class VersionSet {
const EnvOptions& storage_options,
int new_levels);
// Returns the flag indicating whether writes need to slow down because
// there are too many level-0 files.
bool NeedSlowdownForNumLevel0Files() const {
return need_slowdown_for_num_level0_files_;
}
// Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; }
@ -482,10 +476,6 @@ class VersionSet {
// Opened lazily
unique_ptr<log::Writer> descriptor_log_;
// A flag indicating whether we should delay writes because
// we have too many level 0 files
bool need_slowdown_for_num_level0_files_;
// An object that keeps all the compaction stats
// and picks the next compaction
std::unique_ptr<CompactionPicker> compaction_picker_;

Loading…
Cancel
Save