Fsync directory after we create a new file

Summary:
@dhruba, I'm not sure where we need to sync the directory. I implemented the function in Env() and added the dir sync just after we close the newly created file in the builder.

Should I also add FsyncDir() to new files that get created by a compaction?

Test Plan: Confirmed that FsyncDir is returning Status::OK()

Reviewers: dhruba, haobo

Reviewed By: dhruba

CC: leveldb, dhruba

Differential Revision: https://reviews.facebook.net/D14751
main
Igor Canadi 11 years ago
parent 6c2ca1d3e6
commit 832158e7f7
  1. 28
      db/db_impl.cc
  2. 2
      db/db_impl.h
  3. 12
      db/memtablelist.cc
  4. 13
      db/memtablelist.h
  5. 6
      db/version_set.cc
  6. 1
      db/version_set.h
  7. 8
      hdfs/env_hdfs.h
  8. 11
      helpers/memenv/memenv.cc
  9. 24
      include/rocksdb/env.h
  10. 5
      util/env_hdfs.cc
  11. 30
      util/env_posix.cc

@ -881,6 +881,11 @@ Status DBImpl::Recover(bool read_only, bool error_if_log_file_exist) {
return s; return s;
} }
s = env_->NewDirectory(dbname_, &db_directory_);
if (!s.ok()) {
return s;
}
s = env_->LockFile(LockFileName(dbname_), &db_lock_); s = env_->LockFile(LockFileName(dbname_), &db_lock_);
if (!s.ok()) { if (!s.ok()) {
return s; return s;
@ -1173,6 +1178,9 @@ Status DBImpl::WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
(unsigned long) meta.number, (unsigned long) meta.number,
(unsigned long) meta.file_size, (unsigned long) meta.file_size,
s.ToString().c_str()); s.ToString().c_str());
if (!options_.disableDataSync) {
db_directory_->Fsync();
}
mutex_.Lock(); mutex_.Lock();
} }
base->Unref(); base->Unref();
@ -1267,8 +1275,8 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
// Replace immutable memtable with the generated Table // Replace immutable memtable with the generated Table
s = imm_.InstallMemtableFlushResults( s = imm_.InstallMemtableFlushResults(
mems, versions_.get(), s, &mutex_, options_.info_log.get(), mems, versions_.get(), s, &mutex_, options_.info_log.get(), file_number,
file_number, pending_outputs_, &deletion_state.memtables_to_free); pending_outputs_, &deletion_state.memtables_to_free, db_directory_.get());
if (s.ok()) { if (s.ok()) {
InstallSuperVersion(deletion_state); InstallSuperVersion(deletion_state);
@ -1397,7 +1405,7 @@ Status DBImpl::ReFitLevel(int level, int target_level) {
Log(options_.info_log, "Apply version edit:\n%s", Log(options_.info_log, "Apply version edit:\n%s",
edit.DebugString().data()); edit.DebugString().data());
status = versions_->LogAndApply(&edit, &mutex_); status = versions_->LogAndApply(&edit, &mutex_, db_directory_.get());
superversion_to_free = InstallSuperVersion(new_superversion); superversion_to_free = InstallSuperVersion(new_superversion);
new_superversion = nullptr; new_superversion = nullptr;
@ -1969,7 +1977,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
c->edit()->AddFile(c->level() + 1, f->number, f->file_size, c->edit()->AddFile(c->level() + 1, f->number, f->file_size,
f->smallest, f->largest, f->smallest, f->largest,
f->smallest_seqno, f->largest_seqno); f->smallest_seqno, f->largest_seqno);
status = versions_->LogAndApply(c->edit(), &mutex_); status = versions_->LogAndApply(c->edit(), &mutex_, db_directory_.get());
InstallSuperVersion(deletion_state); InstallSuperVersion(deletion_state);
Version::LevelSummaryStorage tmp; Version::LevelSummaryStorage tmp;
Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
@ -2217,7 +2225,8 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
compact->compaction->output_level(), out.number, out.file_size, compact->compaction->output_level(), out.number, out.file_size,
out.smallest, out.largest, out.smallest_seqno, out.largest_seqno); out.smallest, out.largest, out.smallest_seqno, out.largest_seqno);
} }
return versions_->LogAndApply(compact->compaction->edit(), &mutex_); return versions_->LogAndApply(compact->compaction->edit(), &mutex_,
db_directory_.get());
} }
// //
@ -2590,6 +2599,9 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
} }
input.reset(); input.reset();
if (!options_.disableDataSync) {
db_directory_->Fsync();
}
CompactionStats stats; CompactionStats stats;
stats.micros = env_->NowMicros() - start_micros - imm_micros; stats.micros = env_->NowMicros() - start_micros - imm_micros;
MeasureTime(options_.statistics.get(), COMPACTION_TIME, stats.micros); MeasureTime(options_.statistics.get(), COMPACTION_TIME, stats.micros);
@ -3827,7 +3839,7 @@ Status DBImpl::DeleteFile(std::string name) {
} }
} }
edit.DeleteFile(level, number); edit.DeleteFile(level, number);
status = versions_->LogAndApply(&edit, &mutex_); status = versions_->LogAndApply(&edit, &mutex_, db_directory_.get());
if (status.ok()) { if (status.ok()) {
InstallSuperVersion(deletion_state); InstallSuperVersion(deletion_state);
} }
@ -3935,7 +3947,8 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
edit.SetLogNumber(new_log_number); edit.SetLogNumber(new_log_number);
impl->logfile_number_ = new_log_number; impl->logfile_number_ = new_log_number;
impl->log_.reset(new log::Writer(std::move(lfile))); impl->log_.reset(new log::Writer(std::move(lfile)));
s = impl->versions_->LogAndApply(&edit, &impl->mutex_); s = impl->versions_->LogAndApply(&edit, &impl->mutex_,
impl->db_directory_.get());
} }
if (s.ok()) { if (s.ok()) {
delete impl->InstallSuperVersion(new DBImpl::SuperVersion()); delete impl->InstallSuperVersion(new DBImpl::SuperVersion());
@ -3943,6 +3956,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) {
impl->DeleteObsoleteFiles(); impl->DeleteObsoleteFiles();
impl->MaybeScheduleFlushOrCompaction(); impl->MaybeScheduleFlushOrCompaction();
impl->MaybeScheduleLogDBDeployStats(); impl->MaybeScheduleLogDBDeployStats();
s = impl->db_directory_->Fsync();
} }
} }

@ -401,6 +401,8 @@ class DBImpl : public DB {
std::string host_name_; std::string host_name_;
std::unique_ptr<Directory> db_directory_;
// Queue of writers. // Queue of writers.
std::deque<Writer*> writers_; std::deque<Writer*> writers_;
WriteBatch tmp_batch_; WriteBatch tmp_batch_;

@ -121,12 +121,10 @@ void MemTableList::PickMemtablesToFlush(std::vector<MemTable*>* ret) {
// Record a successful flush in the manifest file // Record a successful flush in the manifest file
Status MemTableList::InstallMemtableFlushResults( Status MemTableList::InstallMemtableFlushResults(
const std::vector<MemTable*> &mems, const std::vector<MemTable*>& mems, VersionSet* vset, Status flushStatus,
VersionSet* vset, Status flushStatus, port::Mutex* mu, Logger* info_log, uint64_t file_number,
port::Mutex* mu, Logger* info_log, std::set<uint64_t>& pending_outputs, std::vector<MemTable*>* to_delete,
uint64_t file_number, Directory* db_directory) {
std::set<uint64_t>& pending_outputs,
std::vector<MemTable*>* to_delete) {
mu->AssertHeld(); mu->AssertHeld();
// If the flush was not successful, then just reset state. // If the flush was not successful, then just reset state.
@ -178,7 +176,7 @@ Status MemTableList::InstallMemtableFlushResults(
(unsigned long)m->file_number_); (unsigned long)m->file_number_);
// this can release and reacquire the mutex. // this can release and reacquire the mutex.
s = vset->LogAndApply(&m->edit_, mu); s = vset->LogAndApply(&m->edit_, mu, db_directory);
// we will be changing the version in the next code path, // we will be changing the version in the next code path,
// so we better create a new one, since versions are immutable // so we better create a new one, since versions are immutable

@ -90,12 +90,13 @@ class MemTableList {
void PickMemtablesToFlush(std::vector<MemTable*>* mems); void PickMemtablesToFlush(std::vector<MemTable*>* mems);
// Commit a successful flush in the manifest file // Commit a successful flush in the manifest file
Status InstallMemtableFlushResults(const std::vector<MemTable*> &m, Status InstallMemtableFlushResults(const std::vector<MemTable*>& m,
VersionSet* vset, Status flushStatus, VersionSet* vset, Status flushStatus,
port::Mutex* mu, Logger* info_log, port::Mutex* mu, Logger* info_log,
uint64_t file_number, uint64_t file_number,
std::set<uint64_t>& pending_outputs, std::set<uint64_t>& pending_outputs,
std::vector<MemTable*>* to_delete); std::vector<MemTable*>* to_delete,
Directory* db_directory);
// New memtables are inserted at the front of the list. // New memtables are inserted at the front of the list.
// Takes ownership of the referenced held on *m by the caller of Add(). // Takes ownership of the referenced held on *m by the caller of Add().

@ -1419,6 +1419,7 @@ void VersionSet::AppendVersion(Version* v) {
} }
Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
Directory* db_directory,
bool new_descriptor_log) { bool new_descriptor_log) {
mu->AssertHeld(); mu->AssertHeld();
@ -1541,6 +1542,9 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu,
// of it later // of it later
env_->DeleteFile(DescriptorFileName(dbname_, old_manifest_file_number)); env_->DeleteFile(DescriptorFileName(dbname_, old_manifest_file_number));
} }
if (!options_->disableDataSync && db_directory != nullptr) {
db_directory->Fsync();
}
} }
// find offset in manifest file where this version is stored. // find offset in manifest file where this version is stored.
@ -1821,7 +1825,7 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname,
VersionEdit ve; VersionEdit ve;
port::Mutex dummy_mutex; port::Mutex dummy_mutex;
MutexLock l(&dummy_mutex); MutexLock l(&dummy_mutex);
return versions.LogAndApply(&ve, &dummy_mutex, true); return versions.LogAndApply(&ve, &dummy_mutex, nullptr, true);
} }
Status VersionSet::DumpManifest(Options& options, std::string& dscname, Status VersionSet::DumpManifest(Options& options, std::string& dscname,

@ -284,6 +284,7 @@ class VersionSet {
// REQUIRES: *mu is held on entry. // REQUIRES: *mu is held on entry.
// REQUIRES: no other thread concurrently calls LogAndApply() // REQUIRES: no other thread concurrently calls LogAndApply()
Status LogAndApply(VersionEdit* edit, port::Mutex* mu, Status LogAndApply(VersionEdit* edit, port::Mutex* mu,
Directory* db_directory = nullptr,
bool new_descriptor_log = false); bool new_descriptor_log = false);
// Recover the last saved descriptor from persistent storage. // Recover the last saved descriptor from persistent storage.

@ -70,6 +70,9 @@ class HdfsEnv : public Env {
unique_ptr<RandomRWFile>* result, unique_ptr<RandomRWFile>* result,
const EnvOptions& options); const EnvOptions& options);
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result);
virtual bool FileExists(const std::string& fname); virtual bool FileExists(const std::string& fname);
virtual Status GetChildren(const std::string& path, virtual Status GetChildren(const std::string& path,
@ -246,6 +249,11 @@ class HdfsEnv : public Env {
return notsup; return notsup;
} }
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) {
return notsup;
}
virtual bool FileExists(const std::string& fname){return false;} virtual bool FileExists(const std::string& fname){return false;}
virtual Status GetChildren(const std::string& path, virtual Status GetChildren(const std::string& path,

@ -221,6 +221,11 @@ class WritableFileImpl : public WritableFile {
FileState* file_; FileState* file_;
}; };
class InMemoryDirectory : public Directory {
public:
virtual Status Fsync() { return Status::OK(); }
};
class InMemoryEnv : public EnvWrapper { class InMemoryEnv : public EnvWrapper {
public: public:
explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { } explicit InMemoryEnv(Env* base_env) : EnvWrapper(base_env) { }
@ -274,6 +279,12 @@ class InMemoryEnv : public EnvWrapper {
return Status::OK(); return Status::OK();
} }
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) {
result->reset(new InMemoryDirectory());
return Status::OK();
}
virtual bool FileExists(const std::string& fname) { virtual bool FileExists(const std::string& fname) {
MutexLock lock(&mutex_); MutexLock lock(&mutex_);
return file_map_.find(fname) != file_map_.end(); return file_map_.find(fname) != file_map_.end();

@ -33,6 +33,7 @@ class SequentialFile;
class Slice; class Slice;
class WritableFile; class WritableFile;
class RandomRWFile; class RandomRWFile;
class Directory;
struct Options; struct Options;
using std::unique_ptr; using std::unique_ptr;
@ -122,6 +123,16 @@ class Env {
unique_ptr<RandomRWFile>* result, unique_ptr<RandomRWFile>* result,
const EnvOptions& options) = 0; const EnvOptions& options) = 0;
// Create an object that represents a directory. Will fail if directory
// doesn't exist. If the directory exists, it will open the directory
// and create a new Directory object.
//
// On success, stores a pointer to the new Directory in
// *result and returns OK. On failure stores nullptr in *result and
// returns non-OK.
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) = 0;
// Returns true iff the named file exists. // Returns true iff the named file exists.
virtual bool FileExists(const std::string& fname) = 0; virtual bool FileExists(const std::string& fname) = 0;
@ -488,6 +499,15 @@ class RandomRWFile {
void operator=(const RandomRWFile&); void operator=(const RandomRWFile&);
}; };
// Directory object represents collection of files and implements
// filesystem operations that can be executed on directories.
class Directory {
public:
virtual ~Directory() {}
// Fsync directory
virtual Status Fsync() = 0;
};
// An interface for writing log messages. // An interface for writing log messages.
class Logger { class Logger {
public: public:
@ -578,6 +598,10 @@ class EnvWrapper : public Env {
const EnvOptions& options) { const EnvOptions& options) {
return target_->NewRandomRWFile(f, r, options); return target_->NewRandomRWFile(f, r, options);
} }
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) {
return target_->NewDirectory(name, result);
}
bool FileExists(const std::string& f) { return target_->FileExists(f); } bool FileExists(const std::string& f) { return target_->FileExists(f); }
Status GetChildren(const std::string& dir, std::vector<std::string>* r) { Status GetChildren(const std::string& dir, std::vector<std::string>* r) {
return target_->GetChildren(dir, r); return target_->GetChildren(dir, r);

@ -366,6 +366,11 @@ Status HdfsEnv::NewRandomRWFile(const std::string& fname,
return Status::NotSupported("NewRandomRWFile not supported on HdfsEnv"); return Status::NotSupported("NewRandomRWFile not supported on HdfsEnv");
} }
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) {
return Status::NotSupported("NewDirectory not yet supported on HdfsEnv");
}
bool HdfsEnv::FileExists(const std::string& fname) { bool HdfsEnv::FileExists(const std::string& fname) {
int value = hdfsExists(fileSys_, fname.c_str()); int value = hdfsExists(fileSys_, fname.c_str());
if (value == 0) { if (value == 0) {

@ -867,6 +867,24 @@ class PosixRandomRWFile : public RandomRWFile {
#endif #endif
}; };
class PosixDirectory : public Directory {
public:
explicit PosixDirectory(int fd) : fd_(fd) {}
~PosixDirectory() {
close(fd_);
}
virtual Status Fsync() {
if (fsync(fd_) == -1) {
return IOError("directory", errno);
}
return Status::OK();
}
private:
int fd_;
};
static int LockOrUnlock(const std::string& fname, int fd, bool lock) { static int LockOrUnlock(const std::string& fname, int fd, bool lock) {
mutex_lockedFiles.Lock(); mutex_lockedFiles.Lock();
if (lock) { if (lock) {
@ -1038,6 +1056,18 @@ class PosixEnv : public Env {
return s; return s;
} }
virtual Status NewDirectory(const std::string& name,
unique_ptr<Directory>* result) {
result->reset();
const int fd = open(name.c_str(), 0);
if (fd < 0) {
return IOError(name, errno);
} else {
result->reset(new PosixDirectory(fd));
}
return Status::OK();
}
virtual bool FileExists(const std::string& fname) { virtual bool FileExists(const std::string& fname) {
return access(fname.c_str(), F_OK) == 0; return access(fname.c_str(), F_OK) == 0;
} }

Loading…
Cancel
Save