From ae25742af9cf05f3bf2289a25f8fc0d87c25b19c Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 17 Mar 2014 21:50:15 -0700 Subject: [PATCH 1/4] Fix race condition in manifest roll Summary: When the manifest is getting rolled the following happens: 1) manifest_file_number_ is assigned to a new manifest number (even though the old one is still current) 2) mutex is unlocked 3) SetCurrentFile() creates temporary file manifest_file_number_.dbtmp 4) SetCurrentFile() renames manifest_file_number_.dbtmp to CURRENT 5) mutex is locked If FindObsoleteFiles happens between (3) and (4) it will: 1) Delete manifest_file_number_.dbtmp (because it's not in pending_outputs_) 2) Delete old manifest (because the manifest_file_number_ already points to a new one) I introduce the concept of prev_manifest_file_number_ that will avoid the race condition. However, we should discuss the future of MANIFEST file rolling. We found some race conditions with it last week and who knows how many more are there. Nobody is using it in production because we don't trust the implementation. Should we even support it? Test Plan: make check Reviewers: ljin, dhruba, haobo, sdong Reviewed By: haobo CC: leveldb Differential Revision: https://reviews.facebook.net/D16929 --- db/db_impl.cc | 28 ++++++++++++------------ db/db_impl.h | 4 +++- db/version_set.cc | 54 ++++++++++++++++++++++++----------------------- db/version_set.h | 8 ++++++- 4 files changed, 52 insertions(+), 42 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index cc17e743d..06e485d50 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -601,6 +601,8 @@ void DBImpl::FindObsoleteFiles(DeletionState& deletion_state, // store the current filenum, lognum, etc deletion_state.manifest_file_number = versions_->ManifestFileNumber(); + deletion_state.pending_manifest_file_number = + versions_->PendingManifestFileNumber(); deletion_state.log_number = versions_->LogNumber(); deletion_state.prev_log_number = versions_->PrevLogNumber(); @@ -651,12 +653,10 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { return; } - // Now, convert live list to an unordered set, WITHOUT mutex held; // set is slow. - std::unordered_set sst_live( - state.sst_live.begin(), state.sst_live.end() - ); + std::unordered_set sst_live(state.sst_live.begin(), + state.sst_live.end()); auto& candidate_files = state.candidate_files; candidate_files.reserve( @@ -674,19 +674,15 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { for (auto file_num : state.log_delete_files) { if (file_num > 0) { - candidate_files.push_back( - LogFileName(kDumbDbName, file_num).substr(1) - ); + candidate_files.push_back(LogFileName(kDumbDbName, file_num).substr(1)); } } // dedup state.candidate_files so we don't try to delete the same // file twice sort(candidate_files.begin(), candidate_files.end()); - candidate_files.erase( - unique(candidate_files.begin(), candidate_files.end()), - candidate_files.end() - ); + candidate_files.erase(unique(candidate_files.begin(), candidate_files.end()), + candidate_files.end()); std::vector old_info_log_files; @@ -706,7 +702,7 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { break; case kDescriptorFile: // Keep my manifest file, and any newer incarnations' - // (in case there is a race that allows other incarnations) + // (can happen during manifest roll) keep = (number >= state.manifest_file_number); break; case kTableFile: @@ -714,8 +710,12 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { break; case kTempFile: // Any temp files that are currently being written to must - // be recorded in pending_outputs_, which is inserted into "live" - keep = (sst_live.find(number) != sst_live.end()); + // be recorded in pending_outputs_, which is inserted into "live". + // Also, SetCurrentFile creates a temp file when writing out new + // manifest, which is equal to state.pending_manifest_file_number. We + // should not delete that file + keep = (sst_live.find(number) != sst_live.end()) || + (number == state.pending_manifest_file_number); break; case kInfoLogFile: keep = true; diff --git a/db/db_impl.h b/db/db_impl.h index e42848d11..6e6dc425a 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -230,10 +230,12 @@ class DBImpl : public DB { // the current manifest_file_number, log_number and prev_log_number // that corresponds to the set of files in 'live'. - uint64_t manifest_file_number, log_number, prev_log_number; + uint64_t manifest_file_number, pending_manifest_file_number, log_number, + prev_log_number; explicit DeletionState(bool create_superversion = false) { manifest_file_number = 0; + pending_manifest_file_number = 0; log_number = 0; prev_log_number = 0; new_superversion = diff --git a/db/version_set.cc b/db/version_set.cc index 5c4043116..0ce8a7efe 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -7,8 +7,10 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#define __STDC_FORMAT_MACROS #include "db/version_set.h" +#include #include #include #include @@ -1435,6 +1437,7 @@ VersionSet::VersionSet(const std::string& dbname, const Options* options, icmp_(*cmp), next_file_number_(2), manifest_file_number_(0), // Filled by Recover() + pending_manifest_file_number_(0), last_sequence_(0), log_number_(0), prev_log_number_(0), @@ -1527,22 +1530,17 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // Initialize new descriptor log file if necessary by creating // a temporary file that contains a snapshot of the current version. - std::string new_manifest_filename; uint64_t new_manifest_file_size = 0; Status s; - // we will need this if we are creating new manifest - uint64_t old_manifest_file_number = manifest_file_number_; - // No need to perform this check if a new Manifest is being created anyways. + assert(pending_manifest_file_number_ == 0); if (!descriptor_log_ || manifest_file_size_ > options_->max_manifest_file_size) { + pending_manifest_file_number_ = NewFileNumber(); + batch_edits.back()->SetNextFile(next_file_number_); new_descriptor_log = true; - manifest_file_number_ = NewFileNumber(); // Change manifest file no. - } - - if (new_descriptor_log) { - new_manifest_filename = DescriptorFileName(dbname_, manifest_file_number_); - edit->SetNextFile(next_file_number_); + } else { + pending_manifest_file_number_ = manifest_file_number_; } // Unlock during expensive operations. New writes cannot get here @@ -1562,10 +1560,11 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // This is fine because everything inside of this block is serialized -- // only one thread can be here at the same time - if (!new_manifest_filename.empty()) { + if (new_descriptor_log) { unique_ptr descriptor_file; - s = env_->NewWritableFile(new_manifest_filename, &descriptor_file, - storage_options_.AdaptForLogWrite()); + s = env_->NewWritableFile( + DescriptorFileName(dbname_, pending_manifest_file_number_), + &descriptor_file, storage_options_.AdaptForLogWrite()); if (s.ok()) { descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); s = WriteSnapshot(descriptor_log_.get()); @@ -1604,7 +1603,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, for (auto& e : batch_edits) { std::string record; e->EncodeTo(&record); - if (!ManifestContains(record)) { + if (!ManifestContains(pending_manifest_file_number_, record)) { all_records_in = false; break; } @@ -1621,17 +1620,16 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // If we just created a new descriptor file, install it by writing a // new CURRENT file that points to it. - if (s.ok() && !new_manifest_filename.empty()) { - s = SetCurrentFile(env_, dbname_, manifest_file_number_); - if (s.ok() && old_manifest_file_number < manifest_file_number_) { + if (s.ok() && new_descriptor_log) { + s = SetCurrentFile(env_, dbname_, pending_manifest_file_number_); + if (s.ok() && pending_manifest_file_number_ > manifest_file_number_) { // delete old manifest file Log(options_->info_log, - "Deleting manifest %lu current manifest %lu\n", - (unsigned long)old_manifest_file_number, - (unsigned long)manifest_file_number_); + "Deleting manifest %" PRIu64 " current manifest %" PRIu64 "\n", + manifest_file_number_, pending_manifest_file_number_); // we don't care about an error here, PurgeObsoleteFiles will take care // of it later - env_->DeleteFile(DescriptorFileName(dbname_, old_manifest_file_number)); + env_->DeleteFile(DescriptorFileName(dbname_, manifest_file_number_)); } if (!options_->disableDataSync && db_directory != nullptr) { db_directory->Fsync(); @@ -1649,6 +1647,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, // Install the new version if (s.ok()) { + manifest_file_number_ = pending_manifest_file_number_; manifest_file_size_ = new_manifest_file_size; AppendVersion(v); if (max_log_number_in_batch != 0) { @@ -1656,16 +1655,17 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, log_number_ = max_log_number_in_batch; } prev_log_number_ = edit->prev_log_number_; - } else { Log(options_->info_log, "Error in committing version %lu", (unsigned long)v->GetVersionNumber()); delete v; - if (!new_manifest_filename.empty()) { + if (new_descriptor_log) { descriptor_log_.reset(); - env_->DeleteFile(new_manifest_filename); + env_->DeleteFile( + DescriptorFileName(dbname_, pending_manifest_file_number_)); } } + pending_manifest_file_number_ = 0; // wake up all the waiting writers while (true) { @@ -2103,8 +2103,10 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // Opens the mainfest file and reads all records // till it finds the record we are looking for. -bool VersionSet::ManifestContains(const std::string& record) const { - std::string fname = DescriptorFileName(dbname_, manifest_file_number_); +bool VersionSet::ManifestContains(uint64_t manifest_file_number, + const std::string& record) const { + std::string fname = + DescriptorFileName(dbname_, manifest_file_number); Log(options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); unique_ptr file; Status s = env_->NewSequentialFile(fname, &file, storage_options_); diff --git a/db/version_set.h b/db/version_set.h index 317cfe353..19489701f 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -327,6 +327,10 @@ class VersionSet { // Return the current manifest file number uint64_t ManifestFileNumber() const { return manifest_file_number_; } + uint64_t PendingManifestFileNumber() const { + return pending_manifest_file_number_; + } + // Allocate and return a new file number uint64_t NewFileNumber() { return next_file_number_++; } @@ -436,7 +440,8 @@ class VersionSet { void AppendVersion(Version* v); - bool ManifestContains(const std::string& record) const; + bool ManifestContains(uint64_t manifest_file_number, + const std::string& record) const; Env* const env_; const std::string dbname_; @@ -445,6 +450,7 @@ class VersionSet { const InternalKeyComparator icmp_; uint64_t next_file_number_; uint64_t manifest_file_number_; + uint64_t pending_manifest_file_number_; std::atomic last_sequence_; uint64_t log_number_; uint64_t prev_log_number_; // 0 or backing store for memtable being compacted From f26cb0f093393b96659d33d9d1cd5f0f58ca31f9 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 17 Mar 2014 21:52:14 -0700 Subject: [PATCH 2/4] Optimize fallocation Summary: Based on my recent findings (posted in our internal group), if we use fallocate without KEEP_SIZE flag, we get superior performance of fdatasync() in append-only workloads. This diff provides an option for user to not use KEEP_SIZE flag, thus optimizing his sync performance by up to 2x-3x. At one point we also just called posix_fallocate instead of fallocate, which isn't very fast: http://code.woboq.org/userspace/glibc/sysdeps/posix/posix_fallocate.c.html (tl;dr it manually writes out zero bytes to allocate storage). This diff also fixes that, by first calling fallocate and then posix_fallocate if fallocate is not supported. Test Plan: make check Reviewers: dhruba, sdong, haobo, ljin Reviewed By: dhruba CC: leveldb Differential Revision: https://reviews.facebook.net/D16761 --- db/db_impl.cc | 9 ++--- db/repair.cc | 4 +-- db/version_set.cc | 2 +- include/rocksdb/env.h | 22 +++++++++++-- util/env.cc | 10 +++--- util/env_posix.cc | 76 ++++++++++++++++++++++++++++++------------- 6 files changed, 86 insertions(+), 37 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 06e485d50..3354e79c0 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -456,8 +456,8 @@ Status DBImpl::NewDB() { const std::string manifest = DescriptorFileName(dbname_, 1); unique_ptr file; - Status s = env_->NewWritableFile(manifest, &file, - storage_options_.AdaptForLogWrite()); + Status s = env_->NewWritableFile( + manifest, &file, env_->OptimizeForManifestWrite(storage_options_)); if (!s.ok()) { return s; } @@ -3626,7 +3626,8 @@ Status DBImpl::MakeRoomForWrite(bool force, { DelayLoggingAndReset(); s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), - &lfile, storage_options_.AdaptForLogWrite()); + &lfile, + env_->OptimizeForLogWrite(storage_options_)); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. @@ -3912,7 +3913,7 @@ Status DB::Open(const Options& options, const std::string& dbname, DB** dbptr) { EnvOptions soptions(options); s = impl->options_.env->NewWritableFile( LogFileName(impl->options_.wal_dir, new_log_number), &lfile, - soptions.AdaptForLogWrite()); + impl->options_.env->OptimizeForLogWrite(soptions)); if (s.ok()) { lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); VersionEdit edit; diff --git a/db/repair.cc b/db/repair.cc index 235bb8967..f3b95f5e5 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -306,8 +306,8 @@ class Repairer { Status WriteDescriptor() { std::string tmp = TempFileName(dbname_, 1); unique_ptr file; - Status status = - env_->NewWritableFile(tmp, &file, storage_options_.AdaptForLogWrite()); + Status status = env_->NewWritableFile( + tmp, &file, env_->OptimizeForManifestWrite(storage_options_)); if (!status.ok()) { return status; } diff --git a/db/version_set.cc b/db/version_set.cc index 0ce8a7efe..7276cd0b6 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1564,7 +1564,7 @@ Status VersionSet::LogAndApply(VersionEdit* edit, port::Mutex* mu, unique_ptr descriptor_file; s = env_->NewWritableFile( DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, storage_options_.AdaptForLogWrite()); + &descriptor_file, env_->OptimizeForManifestWrite(storage_options_)); if (s.ok()) { descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); s = WriteSnapshot(descriptor_log_.get()); diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 16eb16440..f1c579981 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -49,8 +49,6 @@ struct EnvOptions { // construct from Options explicit EnvOptions(const Options& options); - EnvOptions AdaptForLogWrite() const; - // If true, then allow caching of data in environment buffers bool use_os_buffer = true; @@ -61,13 +59,21 @@ struct EnvOptions { bool use_mmap_writes = true; // If true, set the FD_CLOEXEC on open fd. - bool set_fd_cloexec= true; + bool set_fd_cloexec = true; // Allows OS to incrementally sync files to disk while they are being // written, in the background. Issue one request for every bytes_per_sync // written. 0 turns it off. // Default: 0 uint64_t bytes_per_sync = 0; + + // If true, we will preallocate the file with FALLOC_FL_KEEP_SIZE flag, which + // means that file size won't change as part of preallocation. + // If false, preallocation will also change the file size. This option will + // improve the performance in workloads where you sync the data on every + // write. By default, we set it to true for MANIFEST writes and false for + // WAL writes + bool fallocate_with_keep_size = true; }; class Env { @@ -260,6 +266,16 @@ class Env { // Generates a unique id that can be used to identify a db virtual std::string GenerateUniqueId(); + // OptimizeForLogWrite will create a new EnvOptions object that is a copy of + // the EnvOptions in the parameters, but is optimized for writing log files. + // Default implementation returns the copy of the same object. + virtual EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const; + // OptimizeForManifestWrite will create a new EnvOptions object that is a copy + // of the EnvOptions in the parameters, but is optimized for writing manifest + // files. Default implementation returns the copy of the same object. + virtual EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) + const; + private: // No copying allowed Env(const Env&); diff --git a/util/env.cc b/util/env.cc index 419c8145d..f2ebfcd59 100644 --- a/util/env.cc +++ b/util/env.cc @@ -241,10 +241,12 @@ void AssignEnvOptions(EnvOptions* env_options, const Options& options) { } -EnvOptions EnvOptions::AdaptForLogWrite() const { - EnvOptions adapted = *this; - adapted.use_mmap_writes = false; - return adapted; +EnvOptions Env::OptimizeForLogWrite(const EnvOptions& env_options) const { + return env_options; +} + +EnvOptions Env::OptimizeForManifestWrite(const EnvOptions& env_options) const { + return env_options; } EnvOptions::EnvOptions(const Options& options) { diff --git a/util/env_posix.cc b/util/env_posix.cc index 89d8df68d..c610c1546 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -354,9 +354,9 @@ class PosixMmapFile : public WritableFile { char* dst_; // Where to write next (in range [base_,limit_]) char* last_sync_; // Where have we synced up to uint64_t file_offset_; // Offset of base_ in file - // Have we done an munmap of unsynced data? bool pending_sync_; + bool fallocate_with_keep_size_; // Roundup x to a multiple of y static size_t Roundup(size_t x, size_t y) { @@ -399,7 +399,12 @@ class PosixMmapFile : public WritableFile { assert(base_ == nullptr); TEST_KILL_RANDOM(rocksdb_kill_odds); - int alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + // we can't fallocate with FALLOC_FL_KEEP_SIZE here + int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); + if (alloc_status != 0) { + // fallback to posix_fallocate + alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + } if (alloc_status != 0) { return Status::IOError("Error allocating space to file : " + filename_ + "Error : " + strerror(alloc_status)); @@ -436,7 +441,8 @@ class PosixMmapFile : public WritableFile { dst_(nullptr), last_sync_(nullptr), file_offset_(0), - pending_sync_(false) { + pending_sync_(false), + fallocate_with_keep_size_(options.fallocate_with_keep_size) { assert((page_size & (page_size - 1)) == 0); assert(options.use_mmap_writes); } @@ -584,7 +590,9 @@ class PosixMmapFile : public WritableFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) { TEST_KILL_RANDOM(rocksdb_kill_odds); - if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { return Status::OK(); } else { return IOError(filename_, errno); @@ -606,20 +614,22 @@ class PosixWritableFile : public WritableFile { bool pending_fsync_; uint64_t last_sync_size_; uint64_t bytes_per_sync_; + bool fallocate_with_keep_size_; public: PosixWritableFile(const std::string& fname, int fd, size_t capacity, - const EnvOptions& options) : - filename_(fname), - fd_(fd), - cursize_(0), - capacity_(capacity), - buf_(new char[capacity]), - filesize_(0), - pending_sync_(false), - pending_fsync_(false), - last_sync_size_(0), - bytes_per_sync_(options.bytes_per_sync) { + const EnvOptions& options) + : filename_(fname), + fd_(fd), + cursize_(0), + capacity_(capacity), + buf_(new char[capacity]), + filesize_(0), + pending_sync_(false), + pending_fsync_(false), + last_sync_size_(0), + bytes_per_sync_(options.bytes_per_sync), + fallocate_with_keep_size_(options.fallocate_with_keep_size) { assert(!options.use_mmap_writes); } @@ -771,7 +781,9 @@ class PosixWritableFile : public WritableFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) { TEST_KILL_RANDOM(rocksdb_kill_odds); - if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { return Status::OK(); } else { return IOError(filename_, errno); @@ -797,14 +809,15 @@ class PosixRandomRWFile : public RandomRWFile { int fd_; bool pending_sync_; bool pending_fsync_; + bool fallocate_with_keep_size_; public: - PosixRandomRWFile(const std::string& fname, int fd, - const EnvOptions& options) : - filename_(fname), - fd_(fd), - pending_sync_(false), - pending_fsync_(false) { + PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options) + : filename_(fname), + fd_(fd), + pending_sync_(false), + pending_fsync_(false), + fallocate_with_keep_size_(options.fallocate_with_keep_size) { assert(!options.use_mmap_writes && !options.use_mmap_reads); } @@ -874,7 +887,10 @@ class PosixRandomRWFile : public RandomRWFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) { - if (!fallocate(fd_, FALLOC_FL_KEEP_SIZE, offset, len)) { + TEST_KILL_RANDOM(rocksdb_kill_odds); + int alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); + if (alloc_status == 0) { return Status::OK(); } else { return IOError(filename_, errno); @@ -1332,6 +1348,20 @@ class PosixEnv : public Env { return dummy; } + EnvOptions OptimizeForLogWrite(const EnvOptions& env_options) const { + EnvOptions optimized = env_options; + optimized.use_mmap_writes = false; + optimized.fallocate_with_keep_size = true; + return optimized; + } + + EnvOptions OptimizeForManifestWrite(const EnvOptions& env_options) const { + EnvOptions optimized = env_options; + optimized.use_mmap_writes = false; + optimized.fallocate_with_keep_size = true; + return optimized; + } + private: bool checkedDiskForMmap_; bool forceMmapOff; // do we override Env options? From 7624f43e0ac3dc6fcd90c20149d5d3e078521854 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 18 Mar 2014 12:01:44 -0700 Subject: [PATCH 3/4] Fixed a typo in INSTALL.md Summary: Replace "RocskDB" by "RocksDB" in INSTALL.md Test Plan: No code change. Reviewers: ljin, igor Reviewed By: ljin CC: leveldb Differential Revision: https://reviews.facebook.net/D16977 --- INSTALL.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.md b/INSTALL.md index 472fd2331..86934db69 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -68,7 +68,7 @@ libraries. You are on your own. We did not run any production workloads on it. ## Compilation -`make clean; make` will compile librocksdb.a (RocskDB static library) and all +`make clean; make` will compile librocksdb.a (RocksDB static library) and all the unit tests. You can run all unit tests with `make check`. For shared library builds, exec `make shared_lib` instead. From 63cef90078a324233686ce0d9bb5c255545f2d3d Mon Sep 17 00:00:00 2001 From: Lei Jin Date: Tue, 18 Mar 2014 12:46:29 -0700 Subject: [PATCH 4/4] disable the log_number check in Recover() Summary: There is a chance that an old MANIFEST is corrupted in 2.7 but just not noticed. This check would fail them. Change it to log instead of returning a Corruption status. Test Plan: make Reviewers: haobo, igor Reviewed By: igor CC: leveldb Differential Revision: https://reviews.facebook.net/D16923 --- db/db_impl.cc | 1 + db/version_edit.cc | 17 ++++++++++++++++- db/version_edit.h | 10 ++++++++++ db/version_set.cc | 31 ++++++++++++++++++++++++++----- 4 files changed, 53 insertions(+), 6 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 3354e79c0..4c161be1e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -449,6 +449,7 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() { Status DBImpl::NewDB() { VersionEdit new_db; + new_db.SetVersionNumber(); new_db.SetComparatorName(user_comparator()->Name()); new_db.SetLogNumber(0); new_db.SetNextFile(2); diff --git a/db/version_edit.cc b/db/version_edit.cc index 23f9f7ee5..f949a32ba 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -28,16 +28,19 @@ enum Tag { kPrevLogNumber = 9, // these are new formats divergent from open source leveldb - kNewFile2 = 100 // store smallest & largest seqno + kNewFile2 = 100, // store smallest & largest seqno + kVersionNumber = 101, // manifest version number, available after 2.8 }; void VersionEdit::Clear() { + version_number_ = 0; comparator_.clear(); max_level_ = 0; log_number_ = 0; prev_log_number_ = 0; last_sequence_ = 0; next_file_number_ = 0; + has_version_number_ = false; has_comparator_ = false; has_log_number_ = false; has_prev_log_number_ = false; @@ -48,6 +51,10 @@ void VersionEdit::Clear() { } void VersionEdit::EncodeTo(std::string* dst) const { + if (has_version_number_) { + PutVarint32(dst, kVersionNumber); + PutVarint32(dst, version_number_); + } if (has_comparator_) { PutVarint32(dst, kComparator); PutLengthPrefixedSlice(dst, comparator_); @@ -126,6 +133,14 @@ Status VersionEdit::DecodeFrom(const Slice& src) { while (msg == nullptr && GetVarint32(&input, &tag)) { switch (tag) { + case kVersionNumber: + if (GetVarint32(&input, &version_number_)) { + has_version_number_ = true; + } else { + msg = "version number"; + } + break; + case kComparator: if (GetLengthPrefixedSlice(&input, &str)) { comparator_ = str.ToString(); diff --git a/db/version_edit.h b/db/version_edit.h index f54949fbf..c1a3799f4 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -46,6 +46,10 @@ class VersionEdit { void Clear(); + void SetVersionNumber() { + has_version_number_ = true; + version_number_ = kManifestVersion; + } void SetComparatorName(const Slice& name) { has_comparator_ = true; comparator_ = name.ToString(); @@ -110,11 +114,13 @@ class VersionEdit { bool GetLevel(Slice* input, int* level, const char** msg); int max_level_; + uint32_t version_number_; std::string comparator_; uint64_t log_number_; uint64_t prev_log_number_; uint64_t next_file_number_; SequenceNumber last_sequence_; + bool has_version_number_; bool has_comparator_; bool has_log_number_; bool has_prev_log_number_; @@ -123,6 +129,10 @@ class VersionEdit { DeletedFileSet deleted_files_; std::vector > new_files_; + + enum { + kManifestVersion = 1 + }; }; } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index 7276cd0b6..6c3178523 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -1746,6 +1746,8 @@ Status VersionSet::Recover() { return s; } + bool have_version_number = false; + bool log_number_decrease = false; bool have_log_number = false; bool have_prev_log_number = false; bool have_next_file = false; @@ -1786,17 +1788,21 @@ Status VersionSet::Recover() { builder.Apply(&edit); + if (edit.has_version_number_) { + have_version_number = true; + } + // Only a flush's edit or a new snapshot can write log number during // LogAndApply. Since memtables are flushed and inserted into // manifest_writers_ queue in order, the log number in MANIFEST file // should be monotonically increasing. if (edit.has_log_number_) { - if (have_log_number && log_number > edit.log_number_) { - s = Status::Corruption("log number decreases"); - break; + if (have_log_number && log_number >= edit.log_number_) { + log_number_decrease = true; + } else { + log_number = edit.log_number_; + have_log_number = true; } - log_number = edit.log_number_; - have_log_number = true; } if (edit.has_prev_log_number_) { @@ -1814,6 +1820,20 @@ Status VersionSet::Recover() { have_last_sequence = true; } } + + if (s.ok() && log_number_decrease) { + // Since release 2.8, version number is added into MANIFEST file. + // Prior release 2.8, a bug in LogAndApply() can cause log_number + // to be smaller than the one from previous edit. To ensure backward + // compatibility, only fail for MANIFEST genearated by release 2.8 + // and after. + if (have_version_number) { + s = Status::Corruption("log number decreases"); + } else { + Log(options_->info_log, "decreasing of log_number is detected " + "in MANIFEST\n"); + } + } } if (s.ok()) { @@ -2083,6 +2103,7 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { // Save metadata VersionEdit edit; + edit.SetVersionNumber(); edit.SetComparatorName(icmp_.user_comparator()->Name()); // Save files