From 92060b2153fd32c602e51eefefb59f7c99fd738b Mon Sep 17 00:00:00 2001 From: Maxime Caron Date: Wed, 14 Oct 2015 16:05:55 -0700 Subject: [PATCH 01/26] Fix build error using Visual Studio 12 --- utilities/document/json_document.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utilities/document/json_document.cc b/utilities/document/json_document.cc index 99376d2b5..b8d178bb2 100644 --- a/utilities/document/json_document.cc +++ b/utilities/document/json_document.cc @@ -588,8 +588,8 @@ JSONDocument::const_item_iterator::~const_item_iterator() { JSONDocument::const_item_iterator::value_type JSONDocument::const_item_iterator::operator*() { - return {std::string(it_->getKeyStr(), it_->klen()), - JSONDocument(it_->value(), false)}; + return JSONDocument::const_item_iterator::value_type(std::string(it_->getKeyStr(), it_->klen()), + JSONDocument(it_->value(), false)); } JSONDocument::ItemsIteratorGenerator::ItemsIteratorGenerator( From 1bcafb62f4aa7c0b2b3786a958220411347f27b8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 7 Oct 2015 22:11:09 -0400 Subject: [PATCH 02/26] env: add ReuseWritableFile Add an environment method to reuse an existing file. Provide a generic implementation that does a simple rename + open (writeable), and also a posix variant that is more careful about error handling (if we fail to open, do not rename, etc.). Signed-off-by: Sage Weil --- include/rocksdb/env.h | 6 ++++++ util/env.cc | 11 +++++++++++ util/env_posix.cc | 44 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+) diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 57c60f0c9..1dfd0f997 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -139,6 +139,12 @@ class Env { unique_ptr* result, const EnvOptions& options) = 0; + // Reuse an existing file by renaming it and opening it as writable. 
+ virtual Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + unique_ptr* result, + const EnvOptions& options); + // Create an object that represents a directory. Will fail if directory // doesn't exist. If the directory exists, it will open the directory // and create a new Directory object. diff --git a/util/env.cc b/util/env.cc index effa7f552..df45d9804 100644 --- a/util/env.cc +++ b/util/env.cc @@ -27,6 +27,17 @@ uint64_t Env::GetThreadID() const { return hasher(std::this_thread::get_id()); } +Status Env::ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + unique_ptr* result, + const EnvOptions& options) { + Status s = RenameFile(old_fname, fname); + if (!s.ok()) { + return s; + } + return NewWritableFile(fname, result, options); +} + SequentialFile::~SequentialFile() { } diff --git a/util/env_posix.cc b/util/env_posix.cc index 3c264e1cf..fb4c41f35 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -933,6 +933,50 @@ class PosixEnv : public Env { return s; } + virtual Status ReuseWritableFile(const std::string& fname, + const std::string& old_fname, + unique_ptr* result, + const EnvOptions& options) override { + result->reset(); + Status s; + int fd = -1; + do { + IOSTATS_TIMER_GUARD(open_nanos); + fd = open(old_fname.c_str(), O_RDWR, 0644); + } while (fd < 0 && errno == EINTR); + if (fd < 0) { + s = IOError(fname, errno); + } else { + SetFD_CLOEXEC(fd, &options); + // rename into place + if (rename(old_fname.c_str(), fname.c_str()) != 0) { + Status r = IOError(old_fname, errno); + close(fd); + return r; + } + if (options.use_mmap_writes) { + if (!checkedDiskForMmap_) { + // this will be executed once in the program's lifetime. + // do not use mmapWrite on non ext-3/xfs/tmpfs systems. 
+ if (!SupportsFastAllocate(fname)) { + forceMmapOff = true; + } + checkedDiskForMmap_ = true; + } + } + if (options.use_mmap_writes && !forceMmapOff) { + result->reset(new PosixMmapFile(fname, fd, page_size_, options)); + } else { + // disable mmap writes + EnvOptions no_mmap_writes_options = options; + no_mmap_writes_options.use_mmap_writes = false; + + result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options)); + } + } + return s; + } + virtual Status NewDirectory(const std::string& name, unique_ptr* result) override { result->reset(); From 543c12ab06576ab46b00901f65aab96fe6cb0fbf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 7 Oct 2015 21:06:28 -0400 Subject: [PATCH 03/26] options: add recycle_log_file_num option Signed-off-by: Sage Weil --- db/c.cc | 5 ++++ include/rocksdb/c.h | 2 ++ include/rocksdb/options.h | 10 ++++++++ java/rocksjni/options.cc | 53 +++++++++++++++++++++++++++++++++++++++ util/options.cc | 4 +++ util/options_helper.h | 3 +++ util/options_test.cc | 6 ++++- 7 files changed, 82 insertions(+), 1 deletion(-) diff --git a/db/c.cc b/db/c.cc index 8cd08265e..ec8e4fb99 100644 --- a/db/c.cc +++ b/db/c.cc @@ -1687,6 +1687,11 @@ void rocksdb_options_set_keep_log_file_num(rocksdb_options_t* opt, size_t v) { opt->rep.keep_log_file_num = v; } +void rocksdb_options_set_recycle_log_file_num(rocksdb_options_t* opt, + size_t v) { + opt->rep.recycle_log_file_num = v; +} + void rocksdb_options_set_soft_rate_limit(rocksdb_options_t* opt, double v) { opt->rep.soft_rate_limit = v; } diff --git a/include/rocksdb/c.h b/include/rocksdb/c.h index 782d10b48..76c801e67 100644 --- a/include/rocksdb/c.h +++ b/include/rocksdb/c.h @@ -572,6 +572,8 @@ extern ROCKSDB_LIBRARY_API void rocksdb_options_set_log_file_time_to_roll( rocksdb_options_t*, size_t); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_keep_log_file_num( rocksdb_options_t*, size_t); +extern ROCKSDB_LIBRARY_API void rocksdb_options_set_recycle_log_file_num( + rocksdb_options_t*, size_t); 
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_soft_rate_limit( rocksdb_options_t*, double); extern ROCKSDB_LIBRARY_API void rocksdb_options_set_hard_rate_limit( diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h index 16aa3782b..a9830221a 100644 --- a/include/rocksdb/options.h +++ b/include/rocksdb/options.h @@ -957,6 +957,16 @@ struct DBOptions { // Default: 1000 size_t keep_log_file_num; + // Recycle log files. + // If non-zero, we will reuse previously written log files for new + // logs, overwriting the old data. The value indicates how many + // such files we will keep around at any point in time for later + // use. This is more efficient because the blocks are already + // allocated and fdatasync does not need to update the inode after + // each write. + // Default: 0 + size_t recycle_log_file_num; + // manifest file is rolled over on reaching this limit. // The older manifest file be deleted. // The default value is MAX_INT so that roll-over does not take place. 
diff --git a/java/rocksjni/options.cc b/java/rocksjni/options.cc index 216fa5e8a..de3df942c 100644 --- a/java/rocksjni/options.cc +++ b/java/rocksjni/options.cc @@ -576,6 +576,33 @@ void Java_org_rocksdb_Options_setKeepLogFileNum( } } +/* + * Class: org_rocksdb_Options + * Method: recycleLogFiles + * Signature: (J)J + */ +jlong Java_org_rocksdb_Options_recycleLogFileNum(JNIEnv* env, jobject jobj, + jlong jhandle) { + return reinterpret_cast(jhandle)->recycle_log_file_num; +} + +/* + * Class: org_rocksdb_Options + * Method: setRecycleLogFiles + * Signature: (JJ)V + */ +void Java_org_rocksdb_Options_setRecycleLogFiles(JNIEnv* env, jobject jobj, + jlong jhandle, + jlong recycle_log_file_num) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(recycle_log_file_num); + if (s.ok()) { + reinterpret_cast(jhandle)->recycle_log_file_num = + recycle_log_file_num; + } else { + rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); + } +} + /* * Class: org_rocksdb_Options * Method: maxManifestFileSize @@ -3533,6 +3560,32 @@ jlong Java_org_rocksdb_DBOptions_keepLogFileNum( return reinterpret_cast(jhandle)->keep_log_file_num; } +/* + * Class: org_rocksdb_DBOptions + * Method: setRecycleLogFiles + * Signature: (JJ)V + */ +void Java_org_rocksdb_DBOptions_setRecycleLogFileNum( + JNIEnv* env, jobject jobj, jlong jhandle, jlong recycle_log_file_num) { + rocksdb::Status s = rocksdb::check_if_jlong_fits_size_t(recycle_log_file_num); + if (s.ok()) { + reinterpret_cast(jhandle)->recycle_log_file_num = + recycle_log_file_num; + } else { + rocksdb::IllegalArgumentExceptionJni::ThrowNew(env, s); + } +} + +/* + * Class: org_rocksdb_DBOptions + * Method: recycleLogFiles + * Signature: (J)J + */ +jlong Java_org_rocksdb_DBOptions_recycleLogFileNum(JNIEnv* env, jobject jobj, + jlong jhandle) { + return reinterpret_cast(jhandle)->recycle_log_file_num; +} + /* * Class: org_rocksdb_DBOptions * Method: setMaxManifestFileSize diff --git a/util/options.cc b/util/options.cc index 
14b69e678..70bcf7063 100644 --- a/util/options.cc +++ b/util/options.cc @@ -231,6 +231,7 @@ DBOptions::DBOptions() max_log_file_size(0), log_file_time_to_roll(0), keep_log_file_num(1000), + recycle_log_file_num(0), max_manifest_file_size(std::numeric_limits::max()), table_cache_numshardbits(4), WAL_ttl_seconds(0), @@ -285,6 +286,7 @@ DBOptions::DBOptions(const Options& options) max_log_file_size(options.max_log_file_size), log_file_time_to_roll(options.log_file_time_to_roll), keep_log_file_num(options.keep_log_file_num), + recycle_log_file_num(options.recycle_log_file_num), max_manifest_file_size(options.max_manifest_file_size), table_cache_numshardbits(options.table_cache_numshardbits), WAL_ttl_seconds(options.WAL_ttl_seconds), @@ -338,6 +340,8 @@ void DBOptions::Dump(Logger* log) const { log_file_time_to_roll); Header(log, " Options.keep_log_file_num: %" ROCKSDB_PRIszt, keep_log_file_num); + Header(log, " Options.recycle_log_file_num: %" ROCKSDB_PRIszt, + recycle_log_file_num); Header(log, " Options.allow_os_buffer: %d", allow_os_buffer); Header(log, " Options.allow_mmap_reads: %d", allow_mmap_reads); Header(log, " Options.allow_fallocate: %d", allow_fallocate); diff --git a/util/options_helper.h b/util/options_helper.h index ec567bc7c..3f6eab40a 100644 --- a/util/options_helper.h +++ b/util/options_helper.h @@ -207,6 +207,9 @@ static std::unordered_map db_options_type_info = { {"keep_log_file_num", {offsetof(struct DBOptions, keep_log_file_num), OptionType::kSizeT, OptionVerificationType::kNormal}}, + {"recycle_log_file_num", + {offsetof(struct DBOptions, recycle_log_file_num), OptionType::kSizeT, + OptionVerificationType::kNormal}}, {"log_file_time_to_roll", {offsetof(struct DBOptions, log_file_time_to_roll), OptionType::kSizeT, OptionVerificationType::kNormal}}, diff --git a/util/options_test.cc b/util/options_test.cc index 48f56cea3..e1849bb3f 100644 --- a/util/options_test.cc +++ b/util/options_test.cc @@ -323,6 +323,7 @@ TEST_F(OptionsTest, 
GetOptionsFromMapTest) { {"max_log_file_size", "37"}, {"log_file_time_to_roll", "38"}, {"keep_log_file_num", "39"}, + {"recycle_log_file_num", "5"}, {"max_manifest_file_size", "40"}, {"table_cache_numshardbits", "41"}, {"WAL_ttl_seconds", "43"}, @@ -339,7 +340,8 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { {"new_table_reader_for_compaction_inputs", "true"}, {"compaction_readahead_size", "100"}, {"bytes_per_sync", "47"}, - {"wal_bytes_per_sync", "48"}, }; + {"wal_bytes_per_sync", "48"}, + }; ColumnFamilyOptions base_cf_opt; ColumnFamilyOptions new_cf_opt; @@ -431,6 +433,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) { ASSERT_EQ(new_db_opt.max_log_file_size, 37U); ASSERT_EQ(new_db_opt.log_file_time_to_roll, 38U); ASSERT_EQ(new_db_opt.keep_log_file_num, 39U); + ASSERT_EQ(new_db_opt.recycle_log_file_num, 5U); ASSERT_EQ(new_db_opt.max_manifest_file_size, static_cast(40)); ASSERT_EQ(new_db_opt.table_cache_numshardbits, 41); ASSERT_EQ(new_db_opt.WAL_ttl_seconds, static_cast(43)); @@ -692,6 +695,7 @@ void RandomInitDBOptions(DBOptions* db_opt, Random* rnd) { db_opt->skip_stats_update_on_db_open = rnd->Uniform(2); db_opt->use_adaptive_mutex = rnd->Uniform(2); db_opt->use_fsync = rnd->Uniform(2); + db_opt->recycle_log_file_num = rnd->Uniform(2); // int options db_opt->max_background_compactions = rnd->Uniform(100); From d666225a0a01f531c910b81945329bff397c6537 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 28 Sep 2015 15:29:31 -0400 Subject: [PATCH 04/26] db_impl: disable recycle_log_files if WAL archive is enabled We can't recycle the files if they are being archived. 
Signed-off-by: Sage Weil --- db/db_impl.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/db/db_impl.cc b/db/db_impl.cc index 42df6ec66..dbf97fb7c 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -150,6 +150,10 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) { } } + if (result.WAL_ttl_seconds > 0 || result.WAL_size_limit_MB > 0) { + result.recycle_log_file_num = false; + } + if (result.wal_dir.empty()) { // Use dbname as default result.wal_dir = dbname; From 666376150cdb85e3b32eceebedf59545a9d4a9ac Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 1 Oct 2015 09:12:04 -0400 Subject: [PATCH 05/26] db_impl: recycle log files If log recycling is enabled, put old WAL files on a recycle queue instead of deleting them. When we need a new log file, take a recycled file off the list if one is available. Signed-off-by: Sage Weil --- db/db_impl.cc | 33 +++++++++++++++++++++++++++------ db/db_impl.h | 2 ++ 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index dbf97fb7c..67885f73a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -605,7 +605,13 @@ void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force, // find newly obsoleted log files while (alive_log_files_.begin()->number < min_log_number) { auto& earliest = *alive_log_files_.begin(); - job_context->log_delete_files.push_back(earliest.number); + if (db_options_.recycle_log_file_num > log_recycle_files.size()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "adding log %" PRIu64 " to recycle list\n", earliest.number); + log_recycle_files.push_back(earliest.number); + } else { + job_context->log_delete_files.push_back(earliest.number); + } total_log_size_ -= earliest.size; alive_log_files_.pop_front(); // Current log should always stay alive since it can't have @@ -4076,6 +4082,12 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { // Do this without holding the dbmutex lock. 
assert(versions_->prev_log_number() == 0); bool creating_new_log = !log_empty_; + uint64_t recycle_log_number = 0; + if (creating_new_log && db_options_.recycle_log_file_num && + !log_recycle_files.empty()) { + recycle_log_number = log_recycle_files.front(); + log_recycle_files.pop_front(); + } uint64_t new_log_number = creating_new_log ? versions_->NewFileNumber() : logfile_number_; SuperVersion* new_superversion = nullptr; @@ -4086,14 +4098,23 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { if (creating_new_log) { EnvOptions opt_env_opt = env_->OptimizeForLogWrite(env_options_, db_options_); - s = NewWritableFile(env_, - LogFileName(db_options_.wal_dir, new_log_number), - &lfile, opt_env_opt); + if (recycle_log_number) { + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, + "reusing log %" PRIu64 " from recycle list\n", recycle_log_number); + s = env_->ReuseWritableFile( + LogFileName(db_options_.wal_dir, new_log_number), + LogFileName(db_options_.wal_dir, recycle_log_number), &lfile, + opt_env_opt); + } else { + s = NewWritableFile(env_, + LogFileName(db_options_.wal_dir, new_log_number), + &lfile, opt_env_opt); + } if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
- lfile->SetPreallocationBlockSize( - 1.1 * mutable_cf_options.write_buffer_size); + lfile->SetPreallocationBlockSize(1.1 * + mutable_cf_options.write_buffer_size); unique_ptr file_writer( new WritableFileWriter(std::move(lfile), opt_env_opt)); new_log = new log::Writer(std::move(file_writer)); diff --git a/db/db_impl.h b/db/db_impl.h index 3d8754634..277c54241 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -556,6 +556,8 @@ class DBImpl : public DB { // * whenever there is an error in background flush or compaction InstrumentedCondVar bg_cv_; uint64_t logfile_number_; + std::deque + log_recycle_files; // a list of log files that we can recycle bool log_dir_synced_; bool log_empty_; ColumnFamilyHandleImpl* default_cf_handle_; From 5830c699f2c07a7b4f35d0c4dd86e4a364643fda Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 8 Oct 2015 13:07:15 -0400 Subject: [PATCH 06/26] log_writer: pass log number and whether recycling is enabled to ctor When we recycle log files, we need to mix the log number into the CRC for each record. Note that for logs that don't get recycled (like the manifest), we always pass a log_number of 0 and false. 
Signed-off-by: Sage Weil --- db/compaction_job_test.cc | 2 +- db/db_impl.cc | 13 ++++-- db/db_test.cc | 4 +- db/flush_job_test.cc | 2 +- db/log_test.cc | 97 ++++++++++++++++++++------------------- db/log_writer.cc | 8 +++- db/log_writer.h | 5 +- db/repair.cc | 2 +- db/version_set.cc | 2 +- db/wal_manager_test.cc | 5 +- 10 files changed, 78 insertions(+), 62 deletions(-) diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index b05694017..d4ae5f2f3 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -198,7 +198,7 @@ class CompactionJobTest : public testing::Test { unique_ptr file_writer( new WritableFileWriter(std::move(file), env_options_)); { - log::Writer log(std::move(file_writer)); + log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); diff --git a/db/db_impl.cc b/db/db_impl.cc index 67885f73a..402df5316 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -420,7 +420,7 @@ Status DBImpl::NewDB() { file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); unique_ptr file_writer( new WritableFileWriter(std::move(file), env_options)); - log::Writer log(std::move(file_writer)); + log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); @@ -4117,7 +4117,9 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { mutable_cf_options.write_buffer_size); unique_ptr file_writer( new WritableFileWriter(std::move(lfile), opt_env_opt)); - new_log = new log::Writer(std::move(file_writer)); + new_log = new log::Writer(std::move(file_writer), + new_log_number, + db_options_.recycle_log_file_num > 0); } } @@ -4756,8 +4758,11 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, impl->logfile_number_ = new_log_number; unique_ptr file_writer( new WritableFileWriter(std::move(lfile), opt_env_options)); - impl->logs_.emplace_back(new_log_number, - new 
log::Writer(std::move(file_writer))); + impl->logs_.emplace_back( + new_log_number, + new log::Writer(std::move(file_writer), + new_log_number, + impl->db_options_.recycle_log_file_num > 0)); // set column family handles for (auto cf : column_families) { diff --git a/db/db_test.cc b/db/db_test.cc index 83db98c55..03d4e485a 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -5074,7 +5074,9 @@ class RecoveryTestHelper { ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); unique_ptr file_writer( new WritableFileWriter(std::move(file), env_options)); - current_log_writer.reset(new log::Writer(std::move(file_writer))); + current_log_writer.reset(new log::Writer( + std::move(file_writer), current_log_number, + db_options.recycle_log_file_num > 0)); for (int i = 0; i < kKeysPerWALFile; i++) { std::string key = "key" + ToString(count++); diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index d2c423c36..d20cf5e90 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -61,7 +61,7 @@ class FlushJobTest : public testing::Test { unique_ptr file_writer( new WritableFileWriter(std::move(file), EnvOptions())); { - log::Writer log(std::move(file_writer)); + log::Writer log(std::move(file_writer), 0, false); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); diff --git a/db/log_test.cc b/db/log_test.cc index 5ab41f251..55ecd5f3f 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -43,7 +43,7 @@ static std::string RandomSkewedString(int i, Random* rnd) { return BigString(NumberString(i), rnd->Skewed(17)); } -class LogTest : public testing::Test { +class LogTest : public ::testing::TestWithParam { private: class StringSource : public SequentialFile { public: @@ -158,12 +158,11 @@ class LogTest : public testing::Test { public: LogTest() : reader_contents_(), - dest_holder_( - test::GetWritableFileWriter( - new test::StringSink(&reader_contents_))), + dest_holder_(test::GetWritableFileWriter( + new 
test::StringSink(&reader_contents_))), source_holder_( test::GetSequentialFileReader(new StringSource(reader_contents_))), - writer_(std::move(dest_holder_)), + writer_(std::move(dest_holder_), 123, GetParam()), reader_(std::move(source_holder_), &report_, true /*checksum*/, 0 /*initial_offset*/) {} @@ -298,9 +297,9 @@ uint64_t LogTest::initial_offset_last_record_offsets_[] = 2 * (kHeaderSize + 10000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; -TEST_F(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } +TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, ReadWrite) { +TEST_P(LogTest, ReadWrite) { Write("foo"); Write("bar"); Write(""); @@ -313,7 +312,7 @@ TEST_F(LogTest, ReadWrite) { ASSERT_EQ("EOF", Read()); // Make sure reads at eof work } -TEST_F(LogTest, ManyBlocks) { +TEST_P(LogTest, ManyBlocks) { for (int i = 0; i < 100000; i++) { Write(NumberString(i)); } @@ -323,7 +322,7 @@ TEST_F(LogTest, ManyBlocks) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, Fragmentation) { +TEST_P(LogTest, Fragmentation) { Write("small"); Write(BigString("medium", 50000)); Write(BigString("large", 100000)); @@ -333,7 +332,7 @@ TEST_F(LogTest, Fragmentation) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, MarginalTrailer) { +TEST_P(LogTest, MarginalTrailer) { // Make a trailer that is exactly the same length as an empty record. const int n = kBlockSize - 2*kHeaderSize; Write(BigString("foo", n)); @@ -346,7 +345,7 @@ TEST_F(LogTest, MarginalTrailer) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, MarginalTrailer2) { +TEST_P(LogTest, MarginalTrailer2) { // Make a trailer that is exactly the same length as an empty record. 
const int n = kBlockSize - 2*kHeaderSize; Write(BigString("foo", n)); @@ -359,7 +358,7 @@ TEST_F(LogTest, MarginalTrailer2) { ASSERT_EQ("", ReportMessage()); } -TEST_F(LogTest, ShortTrailer) { +TEST_P(LogTest, ShortTrailer) { const int n = kBlockSize - 2*kHeaderSize + 4; Write(BigString("foo", n)); ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); @@ -371,7 +370,7 @@ TEST_F(LogTest, ShortTrailer) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, AlignedEof) { +TEST_P(LogTest, AlignedEof) { const int n = kBlockSize - 2*kHeaderSize + 4; Write(BigString("foo", n)); ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); @@ -379,7 +378,7 @@ TEST_F(LogTest, AlignedEof) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, RandomRead) { +TEST_P(LogTest, RandomRead) { const int N = 500; Random write_rnd(301); for (int i = 0; i < N; i++) { @@ -394,7 +393,7 @@ TEST_F(LogTest, RandomRead) { // Tests of all the error paths in log_reader.cc follow: -TEST_F(LogTest, ReadError) { +TEST_P(LogTest, ReadError) { Write("foo"); ForceError(); ASSERT_EQ("EOF", Read()); @@ -402,7 +401,7 @@ TEST_F(LogTest, ReadError) { ASSERT_EQ("OK", MatchError("read error")); } -TEST_F(LogTest, BadRecordType) { +TEST_P(LogTest, BadRecordType) { Write("foo"); // Type is stored in header[6] IncrementByte(6, 100); @@ -412,7 +411,7 @@ TEST_F(LogTest, BadRecordType) { ASSERT_EQ("OK", MatchError("unknown record type")); } -TEST_F(LogTest, TruncatedTrailingRecordIsIgnored) { +TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) { Write("foo"); ShrinkSize(4); // Drop all payload as well as a header byte ASSERT_EQ("EOF", Read()); @@ -421,7 +420,7 @@ TEST_F(LogTest, TruncatedTrailingRecordIsIgnored) { ASSERT_EQ("", ReportMessage()); } -TEST_F(LogTest, TruncatedTrailingRecordIsNotIgnored) { +TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) { Write("foo"); ShrinkSize(4); // Drop all payload as well as a header byte ASSERT_EQ("EOF", Read(/*report_eof_inconsistency*/ true)); @@ -430,7 
+429,7 @@ TEST_F(LogTest, TruncatedTrailingRecordIsNotIgnored) { ASSERT_EQ("OK", MatchError("Corruption: truncated header")); } -TEST_F(LogTest, BadLength) { +TEST_P(LogTest, BadLength) { const int kPayloadSize = kBlockSize - kHeaderSize; Write(BigString("bar", kPayloadSize)); Write("foo"); @@ -441,7 +440,7 @@ TEST_F(LogTest, BadLength) { ASSERT_EQ("OK", MatchError("bad record length")); } -TEST_F(LogTest, BadLengthAtEndIsIgnored) { +TEST_P(LogTest, BadLengthAtEndIsIgnored) { Write("foo"); ShrinkSize(1); ASSERT_EQ("EOF", Read()); @@ -449,7 +448,7 @@ TEST_F(LogTest, BadLengthAtEndIsIgnored) { ASSERT_EQ("", ReportMessage()); } -TEST_F(LogTest, BadLengthAtEndIsNotIgnored) { +TEST_P(LogTest, BadLengthAtEndIsNotIgnored) { Write("foo"); ShrinkSize(1); ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true)); @@ -457,7 +456,7 @@ TEST_F(LogTest, BadLengthAtEndIsNotIgnored) { ASSERT_EQ("OK", MatchError("Corruption: truncated header")); } -TEST_F(LogTest, ChecksumMismatch) { +TEST_P(LogTest, ChecksumMismatch) { Write("foo"); IncrementByte(0, 10); ASSERT_EQ("EOF", Read()); @@ -465,7 +464,7 @@ TEST_F(LogTest, ChecksumMismatch) { ASSERT_EQ("OK", MatchError("checksum mismatch")); } -TEST_F(LogTest, UnexpectedMiddleType) { +TEST_P(LogTest, UnexpectedMiddleType) { Write("foo"); SetByte(6, kMiddleType); FixChecksum(0, 3); @@ -474,7 +473,7 @@ TEST_F(LogTest, UnexpectedMiddleType) { ASSERT_EQ("OK", MatchError("missing start")); } -TEST_F(LogTest, UnexpectedLastType) { +TEST_P(LogTest, UnexpectedLastType) { Write("foo"); SetByte(6, kLastType); FixChecksum(0, 3); @@ -483,7 +482,7 @@ TEST_F(LogTest, UnexpectedLastType) { ASSERT_EQ("OK", MatchError("missing start")); } -TEST_F(LogTest, UnexpectedFullType) { +TEST_P(LogTest, UnexpectedFullType) { Write("foo"); Write("bar"); SetByte(6, kFirstType); @@ -494,7 +493,7 @@ TEST_F(LogTest, UnexpectedFullType) { ASSERT_EQ("OK", MatchError("partial record without end")); } -TEST_F(LogTest, UnexpectedFirstType) { +TEST_P(LogTest, 
UnexpectedFirstType) { Write("foo"); Write(BigString("bar", 100000)); SetByte(6, kFirstType); @@ -505,7 +504,7 @@ TEST_F(LogTest, UnexpectedFirstType) { ASSERT_EQ("OK", MatchError("partial record without end")); } -TEST_F(LogTest, MissingLastIsIgnored) { +TEST_P(LogTest, MissingLastIsIgnored) { Write(BigString("bar", kBlockSize)); // Remove the LAST block, including header. ShrinkSize(14); @@ -514,7 +513,7 @@ TEST_F(LogTest, MissingLastIsIgnored) { ASSERT_EQ(0U, DroppedBytes()); } -TEST_F(LogTest, MissingLastIsNotIgnored) { +TEST_P(LogTest, MissingLastIsNotIgnored) { Write(BigString("bar", kBlockSize)); // Remove the LAST block, including header. ShrinkSize(14); @@ -523,7 +522,7 @@ TEST_F(LogTest, MissingLastIsNotIgnored) { ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data")); } -TEST_F(LogTest, PartialLastIsIgnored) { +TEST_P(LogTest, PartialLastIsIgnored) { Write(BigString("bar", kBlockSize)); // Cause a bad record length in the LAST block. ShrinkSize(1); @@ -532,7 +531,7 @@ TEST_F(LogTest, PartialLastIsIgnored) { ASSERT_EQ(0U, DroppedBytes()); } -TEST_F(LogTest, PartialLastIsNotIgnored) { +TEST_P(LogTest, PartialLastIsNotIgnored) { Write(BigString("bar", kBlockSize)); // Cause a bad record length in the LAST block. ShrinkSize(1); @@ -543,7 +542,7 @@ TEST_F(LogTest, PartialLastIsNotIgnored) { "error reading trailing data")); } -TEST_F(LogTest, ErrorJoinsRecords) { +TEST_P(LogTest, ErrorJoinsRecords) { // Consider two fragmented records: // first(R1) last(R1) first(R2) last(R2) // where the middle two fragments disappear. 
We do not want @@ -566,43 +565,43 @@ TEST_F(LogTest, ErrorJoinsRecords) { ASSERT_GE(dropped, 2 * kBlockSize); } -TEST_F(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); } +TEST_P(LogTest, ReadStart) { CheckInitialOffsetRecord(0, 0); } -TEST_F(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); } +TEST_P(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); } -TEST_F(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); } +TEST_P(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); } -TEST_F(LogTest, ReadSecondStart) { CheckInitialOffsetRecord(10007, 1); } +TEST_P(LogTest, ReadSecondStart) { CheckInitialOffsetRecord(10007, 1); } -TEST_F(LogTest, ReadThirdOneOff) { CheckInitialOffsetRecord(10008, 2); } +TEST_P(LogTest, ReadThirdOneOff) { CheckInitialOffsetRecord(10008, 2); } -TEST_F(LogTest, ReadThirdStart) { CheckInitialOffsetRecord(20014, 2); } +TEST_P(LogTest, ReadThirdStart) { CheckInitialOffsetRecord(20014, 2); } -TEST_F(LogTest, ReadFourthOneOff) { CheckInitialOffsetRecord(20015, 3); } +TEST_P(LogTest, ReadFourthOneOff) { CheckInitialOffsetRecord(20015, 3); } -TEST_F(LogTest, ReadFourthFirstBlockTrailer) { +TEST_P(LogTest, ReadFourthFirstBlockTrailer) { CheckInitialOffsetRecord(log::kBlockSize - 4, 3); } -TEST_F(LogTest, ReadFourthMiddleBlock) { +TEST_P(LogTest, ReadFourthMiddleBlock) { CheckInitialOffsetRecord(log::kBlockSize + 1, 3); } -TEST_F(LogTest, ReadFourthLastBlock) { +TEST_P(LogTest, ReadFourthLastBlock) { CheckInitialOffsetRecord(2 * log::kBlockSize + 1, 3); } -TEST_F(LogTest, ReadFourthStart) { +TEST_P(LogTest, ReadFourthStart) { CheckInitialOffsetRecord( 2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize, 3); } -TEST_F(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } +TEST_P(LogTest, ReadEnd) { CheckOffsetPastEndReturnsNoRecords(0); } -TEST_F(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); } +TEST_P(LogTest, ReadPastEnd) { 
CheckOffsetPastEndReturnsNoRecords(5); } -TEST_F(LogTest, ClearEofSingleBlock) { +TEST_P(LogTest, ClearEofSingleBlock) { Write("foo"); Write("bar"); ForceEOF(3 + kHeaderSize + 2); @@ -617,7 +616,7 @@ TEST_F(LogTest, ClearEofSingleBlock) { ASSERT_TRUE(IsEOF()); } -TEST_F(LogTest, ClearEofMultiBlock) { +TEST_P(LogTest, ClearEofMultiBlock) { size_t num_full_blocks = 5; size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25; Write(BigString("foo", n)); @@ -634,7 +633,7 @@ TEST_F(LogTest, ClearEofMultiBlock) { ASSERT_TRUE(IsEOF()); } -TEST_F(LogTest, ClearEofError) { +TEST_P(LogTest, ClearEofError) { // If an error occurs during Read() in UnmarkEOF(), the records contained // in the buffer should be returned on subsequent calls of ReadRecord() // until no more full records are left, whereafter ReadRecord() should return @@ -652,7 +651,7 @@ TEST_F(LogTest, ClearEofError) { ASSERT_EQ("EOF", Read()); } -TEST_F(LogTest, ClearEofError2) { +TEST_P(LogTest, ClearEofError2) { Write("foo"); Write("bar"); UnmarkEOF(); @@ -666,6 +665,8 @@ TEST_F(LogTest, ClearEofError2) { ASSERT_EQ("OK", MatchError("read error")); } +INSTANTIATE_TEST_CASE_P(bool, LogTest, ::testing::Values(0, 2)); + } // namespace log } // namespace rocksdb diff --git a/db/log_writer.cc b/db/log_writer.cc index 32d4afdc9..9bef090f1 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -18,8 +18,12 @@ namespace rocksdb { namespace log { -Writer::Writer(unique_ptr&& dest) - : dest_(std::move(dest)), block_offset_(0) { +Writer::Writer(unique_ptr&& dest, + uint64_t log_number, bool recycle_log_files) + : dest_(std::move(dest)), + block_offset_(0), + log_number_(log_number), + recycle_log_files_(recycle_log_files) { for (int i = 0; i <= kMaxRecordType; i++) { char t = static_cast(i); type_crc_[i] = crc32c::Value(&t, 1); diff --git a/db/log_writer.h b/db/log_writer.h index 6b59bbdd5..a8aa5d476 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -63,7 +63,8 @@ class Writer { // Create a writer that will append 
data to "*dest". // "*dest" must be initially empty. // "*dest" must remain live while this Writer is in use. - explicit Writer(unique_ptr&& dest); + explicit Writer(unique_ptr&& dest, + uint64_t log_number, bool recycle_log_files); ~Writer(); Status AddRecord(const Slice& slice); @@ -74,6 +75,8 @@ class Writer { private: unique_ptr dest_; int block_offset_; // Current offset in block + uint64_t log_number_; + bool recycle_log_files_; // crc32c values for all supported record types. These are // pre-computed to reduce the overhead of computing the crc of the diff --git a/db/repair.cc b/db/repair.cc index ba63850be..d5375f5dd 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -413,7 +413,7 @@ class Repairer { { unique_ptr file_writer( new WritableFileWriter(std::move(file), env_options)); - log::Writer log(std::move(file_writer)); + log::Writer log(std::move(file_writer), 0, false); std::string record; edit_->EncodeTo(&record); status = log.AddRecord(record); diff --git a/db/version_set.cc b/db/version_set.cc index 5ddecbb53..146e93ad0 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2112,7 +2112,7 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, unique_ptr file_writer( new WritableFileWriter(std::move(descriptor_file), opt_env_opts)); - descriptor_log_.reset(new log::Writer(std::move(file_writer))); + descriptor_log_.reset(new log::Writer(std::move(file_writer), 0, false)); s = WriteSnapshot(descriptor_log_.get()); } } diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index ec56c9632..428f4c954 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -77,7 +77,7 @@ class WalManagerTest : public testing::Test { ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); unique_ptr file_writer( new WritableFileWriter(std::move(file), env_options_)); - current_log_writer_.reset(new log::Writer(std::move(file_writer))); + current_log_writer_.reset(new log::Writer(std::move(file_writer), 0, false)); } void 
CreateArchiveLogs(int num_logs, int entries_per_log) { @@ -127,7 +127,8 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) { unique_ptr file_writer( new WritableFileWriter(std::move(file), EnvOptions())); - log::Writer writer(std::move(file_writer)); + log::Writer writer(std::move(file_writer), 1, + db_options_.recycle_log_file_num > 0); WriteBatch batch; batch.Put("foo", "bar"); WriteBatchInternal::SetSequence(&batch, 10); From 3ac13c99d1094c10357ff2a13ad1d09b218f41f7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 8 Oct 2015 13:06:16 -0400 Subject: [PATCH 07/26] log_reader: pass log_number and optional info_log to ctor We will need the log number to validate the recycle-style CRCs. The log is helpful for debugging, but optional, as not all callers have it. Signed-off-by: Sage Weil --- db/db_impl.cc | 4 ++-- db/log_reader.cc | 12 ++++++++---- db/log_reader.h | 11 +++++++++-- db/log_test.cc | 13 +++++++------ db/repair.cc | 4 ++-- db/transaction_log_impl.cc | 6 ++++-- db/version_set.cc | 15 ++++++++------- db/wal_manager.cc | 4 ++-- tools/ldb_cmd.cc | 16 +++++++++++++++- 9 files changed, 57 insertions(+), 28 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 402df5316..fc39e15c6 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1119,8 +1119,8 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // paranoid_checks==false so that corruptions cause entire commits // to be skipped instead of propagating bad information (like overly // large sequence numbers). 
- log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/); + log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, + true /*checksum*/, 0 /*initial_offset*/, log_number); Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number, db_options_.wal_recovery_mode, !continue_replay_log); diff --git a/db/log_reader.cc b/db/log_reader.cc index 296f1d50c..44396eb7a 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -21,9 +21,12 @@ namespace log { Reader::Reporter::~Reporter() { } -Reader::Reader(unique_ptr&& _file, Reporter* reporter, - bool checksum, uint64_t initial_offset) - : file_(std::move(_file)), +Reader::Reader(std::shared_ptr info_log, + unique_ptr&& _file, + Reporter* reporter, bool checksum, uint64_t initial_offset, + uint64_t log_num) + : info_log_(info_log), + file_(std::move(_file)), reporter_(reporter), checksum_(checksum), backing_store_(new char[kBlockSize]), @@ -33,7 +36,8 @@ Reader::Reader(unique_ptr&& _file, Reporter* reporter, eof_offset_(0), last_record_offset_(0), end_of_buffer_offset_(0), - initial_offset_(initial_offset) {} + initial_offset_(initial_offset), + log_number_(log_num) {} Reader::~Reader() { delete[] backing_store_; diff --git a/db/log_reader.h b/db/log_reader.h index 390696b85..1b0a592c9 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -18,6 +18,7 @@ namespace rocksdb { class SequentialFileReader; +class Logger; using std::unique_ptr; namespace log { @@ -51,8 +52,10 @@ class Reader { // // The Reader will start reading at the first record located at physical // position >= initial_offset within the file. 
- Reader(unique_ptr&& file, Reporter* reporter, - bool checksum, uint64_t initial_offset); + Reader(std::shared_ptr info_log, + unique_ptr&& file, + Reporter* reporter, bool checksum, uint64_t initial_offset, + uint64_t log_num); ~Reader(); @@ -84,6 +87,7 @@ class Reader { SequentialFileReader* file() { return file_.get(); } private: + std::shared_ptr info_log_; const unique_ptr file_; Reporter* const reporter_; bool const checksum_; @@ -104,6 +108,9 @@ class Reader { // Offset at which to start looking for the first record to return uint64_t const initial_offset_; + // which log number this is + uint64_t const log_number_; + // Extend record types with the following special values enum { kEof = kMaxRecordType + 1, diff --git a/db/log_test.cc b/db/log_test.cc index 55ecd5f3f..55f8a9682 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -163,8 +163,8 @@ class LogTest : public ::testing::TestWithParam { source_holder_( test::GetSequentialFileReader(new StringSource(reader_contents_))), writer_(std::move(dest_holder_), 123, GetParam()), - reader_(std::move(source_holder_), &report_, true /*checksum*/, - 0 /*initial_offset*/) {} + reader_(NULL, std::move(source_holder_), &report_, + true /*checksum*/, 0 /*initial_offset*/, 123) {} void Write(const std::string& msg) { writer_.AddRecord(Slice(msg)); @@ -258,8 +258,8 @@ class LogTest : public ::testing::TestWithParam { unique_ptr file_reader( test::GetSequentialFileReader(new StringSource(reader_contents_))); unique_ptr offset_reader( - new Reader(std::move(file_reader), &report_, true /*checksum*/, - WrittenBytes() + offset_past_end)); + new Reader(NULL, std::move(file_reader), &report_, + true /*checksum*/, WrittenBytes() + offset_past_end, 123)); Slice record; std::string scratch; ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); @@ -270,8 +270,9 @@ class LogTest : public ::testing::TestWithParam { WriteInitialOffsetLog(); unique_ptr file_reader( test::GetSequentialFileReader(new 
StringSource(reader_contents_))); - unique_ptr offset_reader(new Reader( - std::move(file_reader), &report_, true /*checksum*/, initial_offset)); + unique_ptr offset_reader( + new Reader(NULL, std::move(file_reader), &report_, + true /*checksum*/, initial_offset, 123)); Slice record; std::string scratch; ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); diff --git a/db/repair.cc b/db/repair.cc index d5375f5dd..c27db6e49 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -249,8 +249,8 @@ class Repairer { // corruptions cause entire commits to be skipped instead of // propagating bad information (like overly large sequence // numbers). - log::Reader reader(std::move(lfile_reader), &reporter, - true /*enable checksum*/, 0 /*initial_offset*/); + log::Reader reader(options_.info_log, std::move(lfile_reader), &reporter, + true /*enable checksum*/, 0 /*initial_offset*/, log); // Read all the records and add to a memtable std::string scratch; diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index 23bd6672b..28c4490f5 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -262,8 +262,10 @@ Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) { return s; } assert(file); - currentLogReader_.reset(new log::Reader(std::move(file), &reporter_, - read_options_.verify_checksums_, 0)); + currentLogReader_.reset(new log::Reader(options_->info_log, + std::move(file), &reporter_, + read_options_.verify_checksums_, 0, + logFile->LogNumber())); return Status::OK(); } } // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index 146e93ad0..478adbcb4 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -2376,8 +2376,8 @@ Status VersionSet::Recover( { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(manifest_file_reader), &reporter, - true /*checksum*/, 0 /*initial_offset*/); + log::Reader reader(NULL, std::move(manifest_file_reader), &reporter, + true 
/*checksum*/, 0 /*initial_offset*/, 0); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -2629,8 +2629,8 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/); + log::Reader reader(NULL, std::move(file_reader), &reporter, true /*checksum*/, + 0 /*initial_offset*/, 0); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -2787,8 +2787,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/); + log::Reader reader(NULL, std::move(file_reader), &reporter, + true /*checksum*/, 0 /*initial_offset*/, 0); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -3040,7 +3040,8 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_num, } file_reader.reset(new SequentialFileReader(std::move(file))); } - log::Reader reader(std::move(file_reader), nullptr, true /*checksum*/, 0); + log::Reader reader(NULL, std::move(file_reader), nullptr, + true /*checksum*/, 0, 0); Slice r; std::string scratch; bool result = false; diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 37861ab45..58b0b1643 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -448,8 +448,8 @@ Status WalManager::ReadFirstLine(const std::string& fname, reporter.fname = fname.c_str(); reporter.status = &status; reporter.ignore_error = !db_options_.paranoid_checks; - log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, - 0 /*initial_offset*/); + log::Reader reader(db_options_.info_log, std::move(file_reader), &reporter, + true /*checksum*/, 0 
/*initial_offset*/, *sequence); std::string scratch; Slice record; diff --git a/tools/ldb_cmd.cc b/tools/ldb_cmd.cc index 112014351..60687f98d 100644 --- a/tools/ldb_cmd.cc +++ b/tools/ldb_cmd.cc @@ -1438,7 +1438,21 @@ void DumpWalFile(std::string wal_file, bool print_header, bool print_values, } } else { StdErrReporter reporter; - log::Reader reader(move(wal_file_reader), &reporter, true, 0); + uint64_t log_number; + FileType type; + + // we need the log number, but ParseFilename expects dbname/NNN.log. + string sanitized = wal_file; + size_t lastslash = sanitized.rfind('/'); + if (lastslash != std::string::npos) + sanitized = sanitized.substr(lastslash + 1); + if (!ParseFileName(sanitized, &log_number, &type)) { + // bogus input, carry on as best we can + log_number = 0; + } + DBOptions db_options; + log::Reader reader(db_options.info_log, move(wal_file_reader), &reporter, + true, 0, log_number); string scratch; WriteBatch batch; Slice record; From 71880521073d7a2318078221b5d90d02e9d072ed Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 1 Oct 2015 17:14:20 -0400 Subject: [PATCH 08/26] db_test_util: add recycle_log_files to set of tested options Signed-off-by: Sage Weil --- db/db_test_util.cc | 4 ++++ db/db_test_util.h | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/db/db_test_util.cc b/db/db_test_util.cc index ab0ab4d69..bbb473386 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -343,6 +343,10 @@ Options DBTestBase::CurrentOptions( options.row_cache = NewLRUCache(1024 * 1024); break; } + case kRecycleLogFiles: { + options.recycle_log_file_num = 2; + break; + } case kLevelSubcompactions: { options.max_subcompactions = 4; break; diff --git a/db/db_test_util.h b/db/db_test_util.h index befd6cd8c..6b55c394d 100644 --- a/db/db_test_util.h +++ b/db/db_test_util.h @@ -437,9 +437,10 @@ class DBTestBase : public testing::Test { kFIFOCompaction = 26, kOptimizeFiltersForHits = 27, kRowCache = 28, - kLevelSubcompactions = 29, - 
kUniversalSubcompactions = 30, - kEnd = 29 + kRecycleLogFiles = 29, + kLevelSubcompactions = 30, + kUniversalSubcompactions = 31, + kEnd = 30 }; int option_config_; From 9c33f64d191cd8afe80a31de23d9248be058bc44 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 2 Oct 2015 15:53:59 -0400 Subject: [PATCH 09/26] log_reader: pass in WALRecoveryMode instead of bool report_eof_inconsistency Soon our behavior will depend on more than just whether we are in kAbsoluteConsistency or not. Signed-off-by: Sage Weil --- db/db_impl.cc | 22 ++++------------------ db/log_reader.cc | 24 ++++++++++++++++++------ db/log_reader.h | 9 ++++++--- db/log_test.cc | 13 +++++++------ 4 files changed, 35 insertions(+), 33 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index fc39e15c6..cbed7482d 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -1126,21 +1126,6 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, db_options_.wal_recovery_mode, !continue_replay_log); // Determine if we should tolerate incomplete records at the tail end of the - // log - bool report_eof_inconsistency; - if (db_options_.wal_recovery_mode == - WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files - report_eof_inconsistency = true; - } else { - // for other modes ignore only incomplete records in the last log file - // which is presumably due to write in progress during restart - report_eof_inconsistency = false; - - // TODO krad: Evaluate if we need to move to a more strict mode where we - // restrict the inconsistency to only the last log - } - // Read all the records and add to a memtable std::string scratch; Slice record; @@ -1155,9 +1140,10 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, } } - while (continue_replay_log && - reader.ReadRecord(&record, &scratch, report_eof_inconsistency) && - status.ok()) { + while ( + continue_replay_log && + reader.ReadRecord(&record, &scratch, db_options_.wal_recovery_mode) && + 
status.ok()) { if (record.size() < 12) { reporter.Corruption(record.size(), Status::Corruption("log record too small")); diff --git a/db/log_reader.cc b/db/log_reader.cc index 44396eb7a..cf5c26152 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -66,8 +66,15 @@ bool Reader::SkipToInitialBlock() { return true; } +// For kAbsoluteConsistency, on clean shutdown we don't expect any error +// in the log files. For other modes, we can ignore only incomplete records +// in the last log file, which are presumably due to a write in progress +// during restart (or from log recycling). +// +// TODO krad: Evaluate if we need to move to a more strict mode where we +// restrict the inconsistency to only the last log bool Reader::ReadRecord(Slice* record, std::string* scratch, - const bool report_eof_inconsistency) { + WALRecoveryMode wal_recovery_mode) { if (last_record_offset_ < initial_offset_) { if (!SkipToInitialBlock()) { return false; @@ -85,7 +92,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, while (true) { uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); const unsigned int record_type = - ReadPhysicalRecord(&fragment, report_eof_inconsistency); + ReadPhysicalRecord(&fragment, wal_recovery_mode); switch (record_type) { case kFullType: if (in_fragmented_record && !scratch->empty()) { @@ -137,7 +144,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, case kEof: if (in_fragmented_record) { - if (report_eof_inconsistency) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files ReportCorruption(scratch->size(), "error reading trailing data"); } // This can be caused by the writer dying immediately after @@ -249,7 +257,7 @@ void Reader::ReportDrop(size_t bytes, const Status& reason) { } unsigned int Reader::ReadPhysicalRecord(Slice* result, - const bool report_eof_inconsistency) { + WALRecoveryMode wal_recovery_mode) { while (true) { 
if (buffer_.size() < (size_t)kHeaderSize) { if (!eof_ && !read_error_) { @@ -272,7 +280,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, // end of the file, which can be caused by the writer crashing in the // middle of writing the header. Unless explicitly requested we don't // considering this an error, just report EOF. - if (buffer_.size() && report_eof_inconsistency) { + if (buffer_.size() && + wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files ReportCorruption(buffer_.size(), "truncated header"); } buffer_.clear(); @@ -296,7 +306,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, // If the end of the file has been reached without reading |length| bytes // of payload, assume the writer died in the middle of writing the record. // Don't report a corruption unless requested. - if (drop_size && report_eof_inconsistency) { + if (drop_size && + wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files ReportCorruption(drop_size, "truncated header"); } return kEof; diff --git a/db/log_reader.h b/db/log_reader.h index 1b0a592c9..bb86dfda7 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -14,6 +14,7 @@ #include "db/log_format.h" #include "rocksdb/slice.h" #include "rocksdb/status.h" +#include "rocksdb/options.h" namespace rocksdb { @@ -65,7 +66,8 @@ class Reader { // will only be valid until the next mutating operation on this // reader or the next mutation to *scratch. bool ReadRecord(Slice* record, std::string* scratch, - bool report_eof_inconsistency = false); + WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords); // Returns the physical offset of the last record returned by ReadRecord. 
// @@ -128,8 +130,9 @@ class Reader { bool SkipToInitialBlock(); // Return type, or one of the preceding special values - unsigned int ReadPhysicalRecord(Slice* result, - bool report_eof_inconsistency = false); + unsigned int ReadPhysicalRecord( + Slice* result, WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords); // Reports dropped bytes to the reporter. // buffer_ must be updated to remove the dropped bytes prior to invocation. diff --git a/db/log_test.cc b/db/log_test.cc index 55f8a9682..22921090a 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -174,10 +174,11 @@ class LogTest : public ::testing::TestWithParam { return dest_contents().size(); } - std::string Read(const bool report_eof_inconsistency = false) { + std::string Read(const WALRecoveryMode wal_recovery_mode = + WALRecoveryMode::kTolerateCorruptedTailRecords) { std::string scratch; Slice record; - if (reader_.ReadRecord(&record, &scratch, report_eof_inconsistency)) { + if (reader_.ReadRecord(&record, &scratch, wal_recovery_mode)) { return record.ToString(); } else { return "EOF"; @@ -424,7 +425,7 @@ TEST_P(LogTest, TruncatedTrailingRecordIsIgnored) { TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) { Write("foo"); ShrinkSize(4); // Drop all payload as well as a header byte - ASSERT_EQ("EOF", Read(/*report_eof_inconsistency*/ true)); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); // Truncated last record is ignored, not treated as an error ASSERT_GT(DroppedBytes(), 0U); ASSERT_EQ("OK", MatchError("Corruption: truncated header")); @@ -452,7 +453,7 @@ TEST_P(LogTest, BadLengthAtEndIsIgnored) { TEST_P(LogTest, BadLengthAtEndIsNotIgnored) { Write("foo"); ShrinkSize(1); - ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true)); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); ASSERT_EQ("OK", MatchError("Corruption: truncated header")); } @@ -518,7 +519,7 @@ TEST_P(LogTest, MissingLastIsNotIgnored) { 
Write(BigString("bar", kBlockSize)); // Remove the LAST block, including header. ShrinkSize(14); - ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true)); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); ASSERT_EQ("OK", MatchError("Corruption: error reading trailing data")); } @@ -536,7 +537,7 @@ TEST_P(LogTest, PartialLastIsNotIgnored) { Write(BigString("bar", kBlockSize)); // Cause a bad record length in the LAST block. ShrinkSize(1); - ASSERT_EQ("EOF", Read(/*report_eof_inconsistency=*/true)); + ASSERT_EQ("EOF", Read(WALRecoveryMode::kAbsoluteConsistency)); ASSERT_GT(DroppedBytes(), 0U); ASSERT_EQ("OK", MatchError( "Corruption: truncated headerCorruption: " From 4104e9bb6749506902462e22e503feb9d2ad6b1e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 12 Oct 2015 17:04:33 -0400 Subject: [PATCH 10/26] log_reader: introduce kBadHeader; drop wal mode from ReadPhysicalRecord Move the WAL recovery mode logic out of ReadPhysicalRecord. To do this we introduce a new type indicating when we fail to read a valid header. 
Signed-off-by: Sage Weil --- db/log_reader.cc | 36 ++++++++++++++++++++---------------- db/log_reader.h | 8 ++++---- 2 files changed, 24 insertions(+), 20 deletions(-) diff --git a/db/log_reader.cc b/db/log_reader.cc index cf5c26152..d1b136660 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -91,8 +91,8 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, Slice fragment; while (true) { uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size(); - const unsigned int record_type = - ReadPhysicalRecord(&fragment, wal_recovery_mode); + size_t drop_size; + const unsigned int record_type = ReadPhysicalRecord(&fragment, &drop_size); switch (record_type) { case kFullType: if (in_fragmented_record && !scratch->empty()) { @@ -142,6 +142,13 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, } break; + case kBadHeader: + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files + ReportCorruption(drop_size, "truncated header"); + } + // fall-thru + case kEof: if (in_fragmented_record) { if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { @@ -256,8 +263,7 @@ void Reader::ReportDrop(size_t bytes, const Status& reason) { } } -unsigned int Reader::ReadPhysicalRecord(Slice* result, - WALRecoveryMode wal_recovery_mode) { +unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { while (true) { if (buffer_.size() < (size_t)kHeaderSize) { if (!eof_ && !read_error_) { @@ -280,10 +286,10 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, // end of the file, which can be caused by the writer crashing in the // middle of writing the header. Unless explicitly requested we don't // considering this an error, just report EOF. 
- if (buffer_.size() && - wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files - ReportCorruption(buffer_.size(), "truncated header"); + if (buffer_.size()) { + *drop_size = buffer_.size(); + buffer_.clear(); + return kBadHeader; } buffer_.clear(); return kEof; @@ -297,19 +303,17 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, const unsigned int type = header[6]; const uint32_t length = a | (b << 8); if (kHeaderSize + length > buffer_.size()) { - size_t drop_size = buffer_.size(); + *drop_size = buffer_.size(); buffer_.clear(); if (!eof_) { - ReportCorruption(drop_size, "bad record length"); + ReportCorruption(*drop_size, "bad record length"); return kBadRecord; } // If the end of the file has been reached without reading |length| bytes // of payload, assume the writer died in the middle of writing the record. // Don't report a corruption unless requested. - if (drop_size && - wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { - // in clean shutdown we don't expect any error in the log files - ReportCorruption(drop_size, "truncated header"); + if (*drop_size) { + return kBadHeader; } return kEof; } @@ -333,9 +337,9 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, // been corrupted and if we trust it, we could find some // fragment of a real log record that just happens to look // like a valid log record. 
- size_t drop_size = buffer_.size(); + *drop_size = buffer_.size(); buffer_.clear(); - ReportCorruption(drop_size, "checksum mismatch"); + ReportCorruption(*drop_size, "checksum mismatch"); return kBadRecord; } } diff --git a/db/log_reader.h b/db/log_reader.h index bb86dfda7..422469bab 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -121,7 +121,9 @@ class Reader { // * The record has an invalid CRC (ReadPhysicalRecord reports a drop) // * The record is a 0-length record (No drop is reported) // * The record is below constructor's initial_offset (No drop is reported) - kBadRecord = kMaxRecordType + 2 + kBadRecord = kMaxRecordType + 2, + // Returned when we fail to read a valid header. + kBadHeader = kMaxRecordType + 3, }; // Skips all blocks that are completely before "initial_offset_". @@ -130,9 +132,7 @@ class Reader { bool SkipToInitialBlock(); // Return type, or one of the preceding special values - unsigned int ReadPhysicalRecord( - Slice* result, WALRecoveryMode wal_recovery_mode = - WALRecoveryMode::kTolerateCorruptedTailRecords); + unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size); // Reports dropped bytes to the reporter. // buffer_ must be updated to remove the dropped bytes prior to invocation. From a7b2bedfb0ecab4ebf0084c7de281bafa7dfc5fb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 19 Oct 2015 17:24:05 -0400 Subject: [PATCH 11/26] log_{reader,write}: recyclable record format Introduce new tags for records that have a log_number. This changes the header size from 7 to 11 for these records, making this a backward-incompatible change. If we read a record that belongs to a different log_number (i.e., a previous instantiation of this log file, before it was most recently recycled), we return kOldRecord from ReadPhysicalRecord. ReadRecord will translate this into a kEof or kBadRecord depending on what the WAL recovery mode is. 
We make several adjustments to the log_test.cc tests to compensate for the fact that the header size varies between the two modes. Signed-off-by: Sage Weil --- db/log_format.h | 14 +++++- db/log_reader.cc | 113 +++++++++++++++++++++++++++++++++-------------- db/log_reader.h | 5 +++ db/log_test.cc | 105 ++++++++++++++++++++++++++----------------- db/log_writer.cc | 60 ++++++++++++++++++------- db/log_writer.h | 11 ++++- 6 files changed, 215 insertions(+), 93 deletions(-) diff --git a/db/log_format.h b/db/log_format.h index 919c087e2..97eb13393 100644 --- a/db/log_format.h +++ b/db/log_format.h @@ -22,14 +22,24 @@ enum RecordType { // For fragments kFirstType = 2, kMiddleType = 3, - kLastType = 4 + kLastType = 4, + + // For recycled log files + kRecyclableFullType = 5, + kRecyclableFirstType = 6, + kRecyclableMiddleType = 7, + kRecyclableLastType = 8, }; -static const int kMaxRecordType = kLastType; +static const int kMaxRecordType = kRecyclableLastType; static const unsigned int kBlockSize = 32768; // Header is checksum (4 bytes), type (1 byte), length (2 bytes). static const int kHeaderSize = 4 + 1 + 2; +// Recyclable header is checksum (4 bytes), type (1 byte), log number +// (4 bytes), length (2 bytes). 
+static const int kRecyclableHeaderSize = 4 + 1 + 4 + 2; + } // namespace log } // namespace rocksdb diff --git a/db/log_reader.cc b/db/log_reader.cc index d1b136660..512dd08d3 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -95,6 +95,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, const unsigned int record_type = ReadPhysicalRecord(&fragment, &drop_size); switch (record_type) { case kFullType: + case kRecyclableFullType: if (in_fragmented_record && !scratch->empty()) { // Handle bug in earlier versions of log::Writer where // it could emit an empty kFirstType record at the tail end @@ -109,6 +110,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, return true; case kFirstType: + case kRecyclableFirstType: if (in_fragmented_record && !scratch->empty()) { // Handle bug in earlier versions of log::Writer where // it could emit an empty kFirstType record at the tail end @@ -122,6 +124,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, break; case kMiddleType: + case kRecyclableMiddleType: if (!in_fragmented_record) { ReportCorruption(fragment.size(), "missing start of fragmented record(1)"); @@ -131,6 +134,7 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, break; case kLastType: + case kRecyclableLastType: if (!in_fragmented_record) { ReportCorruption(fragment.size(), "missing start of fragmented record(2)"); @@ -162,6 +166,23 @@ bool Reader::ReadRecord(Slice* record, std::string* scratch, } return false; + case kOldRecord: + if (wal_recovery_mode != WALRecoveryMode::kSkipAnyCorruptedRecords) { + // Treat a record from a previous instance of the log as EOF. 
+ if (in_fragmented_record) { + if (wal_recovery_mode == WALRecoveryMode::kAbsoluteConsistency) { + // in clean shutdown we don't expect any error in the log files + ReportCorruption(scratch->size(), "error reading trailing data"); + } + // This can be caused by the writer dying immediately after + // writing a physical record but before completing the next; don't + // treat it as a corruption, just ignore the entire logical record. + scratch->clear(); + } + return false; + } + // fall-thru + case kBadRecord: if (in_fragmented_record) { ReportCorruption(scratch->size(), "error in middle of record"); @@ -263,37 +284,49 @@ void Reader::ReportDrop(size_t bytes, const Status& reason) { } } +bool Reader::ReadMore(size_t* drop_size, int *error) { + if (!eof_ && !read_error_) { + // Last read was a full read, so this is a trailer to skip + buffer_.clear(); + Status status = file_->Read(kBlockSize, &buffer_, backing_store_); + end_of_buffer_offset_ += buffer_.size(); + if (!status.ok()) { + buffer_.clear(); + ReportDrop(kBlockSize, status); + read_error_ = true; + *error = kEof; + return false; + } else if (buffer_.size() < (size_t)kBlockSize) { + eof_ = true; + eof_offset_ = buffer_.size(); + } + return true; + } else { + // Note that if buffer_ is non-empty, we have a truncated header at the + // end of the file, which can be caused by the writer crashing in the + // middle of writing the header. Unless explicitly requested we don't + // considering this an error, just report EOF. 
+ if (buffer_.size()) { + *drop_size = buffer_.size(); + buffer_.clear(); + *error = kBadHeader; + return false; + } + buffer_.clear(); + *error = kEof; + return false; + } +} + unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { while (true) { + // We need at least the minimum header size if (buffer_.size() < (size_t)kHeaderSize) { - if (!eof_ && !read_error_) { - // Last read was a full read, so this is a trailer to skip - buffer_.clear(); - Status status = file_->Read(kBlockSize, &buffer_, backing_store_); - end_of_buffer_offset_ += buffer_.size(); - if (!status.ok()) { - buffer_.clear(); - ReportDrop(kBlockSize, status); - read_error_ = true; - return kEof; - } else if (buffer_.size() < (size_t)kBlockSize) { - eof_ = true; - eof_offset_ = buffer_.size(); - } - continue; - } else { - // Note that if buffer_ is non-empty, we have a truncated header at the - // end of the file, which can be caused by the writer crashing in the - // middle of writing the header. Unless explicitly requested we don't - // considering this an error, just report EOF. 
- if (buffer_.size()) { - *drop_size = buffer_.size(); - buffer_.clear(); - return kBadHeader; - } - buffer_.clear(); - return kEof; + int r; + if (!ReadMore(drop_size, &r)) { + return r; } + continue; } // Parse the header @@ -302,7 +335,23 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { const uint32_t b = static_cast(header[5]) & 0xff; const unsigned int type = header[6]; const uint32_t length = a | (b << 8); - if (kHeaderSize + length > buffer_.size()) { + int header_size = kHeaderSize; + if (type >= kRecyclableFullType && type <= kRecyclableLastType) { + header_size = kRecyclableHeaderSize; + // We need enough for the larger header + if (buffer_.size() < (size_t)kRecyclableHeaderSize) { + int r; + if (!ReadMore(drop_size, &r)) { + return r; + } + continue; + } + const uint32_t log_num = DecodeFixed32(header + 7); + if (log_num != log_number_) { + return kOldRecord; + } + } + if (header_size + length > buffer_.size()) { *drop_size = buffer_.size(); buffer_.clear(); if (!eof_) { @@ -331,7 +380,7 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { // Check crc if (checksum_) { uint32_t expected_crc = crc32c::Unmask(DecodeFixed32(header)); - uint32_t actual_crc = crc32c::Value(header + 6, 1 + length); + uint32_t actual_crc = crc32c::Value(header + 6, length + header_size - 6); if (actual_crc != expected_crc) { // Drop the rest of the buffer since "length" itself may have // been corrupted and if we trust it, we could find some @@ -344,16 +393,16 @@ unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size) { } } - buffer_.remove_prefix(kHeaderSize + length); + buffer_.remove_prefix(header_size + length); // Skip physical record that started before initial_offset_ - if (end_of_buffer_offset_ - buffer_.size() - kHeaderSize - length < + if (end_of_buffer_offset_ - buffer_.size() - header_size - length < initial_offset_) { result->clear(); return kBadRecord; } - *result = Slice(header + 
kHeaderSize, length); + *result = Slice(header + header_size, length); return type; } } diff --git a/db/log_reader.h b/db/log_reader.h index 422469bab..28f0a2c1e 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -124,6 +124,8 @@ class Reader { kBadRecord = kMaxRecordType + 2, // Returned when we fail to read a valid header. kBadHeader = kMaxRecordType + 3, + // Returned when we read an old record from a previous user of the log. + kOldRecord = kMaxRecordType + 4, }; // Skips all blocks that are completely before "initial_offset_". @@ -134,6 +136,9 @@ class Reader { // Return type, or one of the preceding special values unsigned int ReadPhysicalRecord(Slice* result, size_t* drop_size); + // Read some more + bool ReadMore(size_t* drop_size, int *error); + // Reports dropped bytes to the reporter. // buffer_ must be updated to remove the dropped bytes prior to invocation. void ReportCorruption(size_t bytes, const char* reason); diff --git a/db/log_test.cc b/db/log_test.cc index 22921090a..41f4c8223 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -153,7 +153,7 @@ class LogTest : public ::testing::TestWithParam { // Record metadata for testing initial offset functionality static size_t initial_offset_record_sizes_[]; - static uint64_t initial_offset_last_record_offsets_[]; + uint64_t initial_offset_last_record_offsets_[4]; public: LogTest() @@ -163,8 +163,16 @@ class LogTest : public ::testing::TestWithParam { source_holder_( test::GetSequentialFileReader(new StringSource(reader_contents_))), writer_(std::move(dest_holder_), 123, GetParam()), - reader_(NULL, std::move(source_holder_), &report_, - true /*checksum*/, 0 /*initial_offset*/, 123) {} + reader_(NULL, std::move(source_holder_), &report_, true /*checksum*/, + 0 /*initial_offset*/, 123) { + int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + initial_offset_last_record_offsets_[0] = 0; + initial_offset_last_record_offsets_[1] = header_size + 10000; + initial_offset_last_record_offsets_[2] = 2 * (header_size + 10000); + initial_offset_last_record_offsets_[3] = 2 * (header_size + 10000) + + (2 * log::kBlockSize - 1000) + + 3 * header_size; + } void Write(const std::string& msg) { writer_.AddRecord(Slice(msg)); @@ -200,9 +208,11 @@ class LogTest : public ::testing::TestWithParam { dest->Drop(bytes); } - void FixChecksum(int header_offset, int len) { + void FixChecksum(int header_offset, int len, bool recyclable) { // Compute crc of type/len/data - uint32_t crc = crc32c::Value(&dest_contents()[header_offset+6], 1 + len); + int header_size = recyclable ? kRecyclableHeaderSize : kHeaderSize; + uint32_t crc = crc32c::Value(&dest_contents()[header_offset + 6], + header_size - 6 + len); crc = crc32c::Mask(crc); EncodeFixed32(&dest_contents()[header_offset], crc); } @@ -292,13 +302,6 @@ size_t LogTest::initial_offset_record_sizes_[] = 2 * log::kBlockSize - 1000, // Span three blocks 1}; -uint64_t LogTest::initial_offset_last_record_offsets_[] = - {0, - kHeaderSize + 10000, - 2 * (kHeaderSize + 10000), - 2 * (kHeaderSize + 10000) + - (2 * log::kBlockSize - 1000) + 3 * kHeaderSize}; - TEST_P(LogTest, Empty) { ASSERT_EQ("EOF", Read()); } TEST_P(LogTest, ReadWrite) { @@ -336,9 +339,10 @@ TEST_P(LogTest, Fragmentation) { TEST_P(LogTest, MarginalTrailer) { // Make a trailer that is exactly the same length as an empty record. - const int n = kBlockSize - 2*kHeaderSize; + int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size; Write(BigString("foo", n)); - ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes()); + ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); Write(""); Write("bar"); ASSERT_EQ(BigString("foo", n), Read()); @@ -349,9 +353,10 @@ TEST_P(LogTest, MarginalTrailer) { TEST_P(LogTest, MarginalTrailer2) { // Make a trailer that is exactly the same length as an empty record. - const int n = kBlockSize - 2*kHeaderSize; + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size; Write(BigString("foo", n)); - ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize), WrittenBytes()); + ASSERT_EQ((unsigned int)(kBlockSize - header_size), WrittenBytes()); Write("bar"); ASSERT_EQ(BigString("foo", n), Read()); ASSERT_EQ("bar", Read()); @@ -361,9 +366,10 @@ TEST_P(LogTest, MarginalTrailer2) { } TEST_P(LogTest, ShortTrailer) { - const int n = kBlockSize - 2*kHeaderSize + 4; + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size + 4; Write(BigString("foo", n)); - ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); + ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); Write(""); Write("bar"); ASSERT_EQ(BigString("foo", n), Read()); @@ -373,9 +379,10 @@ TEST_P(LogTest, ShortTrailer) { } TEST_P(LogTest, AlignedEof) { - const int n = kBlockSize - 2*kHeaderSize + 4; + int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + const int n = kBlockSize - 2 * header_size + 4; Write(BigString("foo", n)); - ASSERT_EQ((unsigned int)(kBlockSize - kHeaderSize + 4), WrittenBytes()); + ASSERT_EQ((unsigned int)(kBlockSize - header_size + 4), WrittenBytes()); ASSERT_EQ(BigString("foo", n), Read()); ASSERT_EQ("EOF", Read()); } @@ -407,7 +414,7 @@ TEST_P(LogTest, BadRecordType) { Write("foo"); // Type is stored in header[6] IncrementByte(6, 100); - FixChecksum(0, 3); + FixChecksum(0, 3, false); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); ASSERT_EQ("OK", MatchError("unknown record type")); @@ -432,7 +439,8 @@ TEST_P(LogTest, TruncatedTrailingRecordIsNotIgnored) { } TEST_P(LogTest, BadLength) { - const int kPayloadSize = kBlockSize - kHeaderSize; + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + const int kPayloadSize = kBlockSize - header_size; Write(BigString("bar", kPayloadSize)); Write("foo"); // Least significant size byte is stored in header[4]. @@ -459,17 +467,17 @@ TEST_P(LogTest, BadLengthAtEndIsNotIgnored) { } TEST_P(LogTest, ChecksumMismatch) { - Write("foo"); - IncrementByte(0, 10); + Write("foooooo"); + IncrementByte(0, 14); ASSERT_EQ("EOF", Read()); - ASSERT_EQ(10U, DroppedBytes()); + ASSERT_EQ(14U + 4 * !!GetParam(), DroppedBytes()); ASSERT_EQ("OK", MatchError("checksum mismatch")); } TEST_P(LogTest, UnexpectedMiddleType) { Write("foo"); - SetByte(6, kMiddleType); - FixChecksum(0, 3); + SetByte(6, GetParam() ? kRecyclableMiddleType : kMiddleType); + FixChecksum(0, 3, !!GetParam()); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); ASSERT_EQ("OK", MatchError("missing start")); @@ -477,8 +485,8 @@ TEST_P(LogTest, UnexpectedMiddleType) { TEST_P(LogTest, UnexpectedLastType) { Write("foo"); - SetByte(6, kLastType); - FixChecksum(0, 3); + SetByte(6, GetParam() ? 
kRecyclableLastType : kLastType); + FixChecksum(0, 3, !!GetParam()); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); ASSERT_EQ("OK", MatchError("missing start")); @@ -487,8 +495,8 @@ TEST_P(LogTest, UnexpectedLastType) { TEST_P(LogTest, UnexpectedFullType) { Write("foo"); Write("bar"); - SetByte(6, kFirstType); - FixChecksum(0, 3); + SetByte(6, GetParam() ? kRecyclableFirstType : kFirstType); + FixChecksum(0, 3, !!GetParam()); ASSERT_EQ("bar", Read()); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); @@ -498,8 +506,8 @@ TEST_P(LogTest, UnexpectedFullType) { TEST_P(LogTest, UnexpectedFirstType) { Write("foo"); Write(BigString("bar", 100000)); - SetByte(6, kFirstType); - FixChecksum(0, 3); + SetByte(6, GetParam() ? kRecyclableFirstType : kFirstType); + FixChecksum(0, 3, !!GetParam()); ASSERT_EQ(BigString("bar", 100000), Read()); ASSERT_EQ("EOF", Read()); ASSERT_EQ(3U, DroppedBytes()); @@ -573,13 +581,25 @@ TEST_P(LogTest, ReadSecondOneOff) { CheckInitialOffsetRecord(1, 1); } TEST_P(LogTest, ReadSecondTenThousand) { CheckInitialOffsetRecord(10000, 1); } -TEST_P(LogTest, ReadSecondStart) { CheckInitialOffsetRecord(10007, 1); } +TEST_P(LogTest, ReadSecondStart) { + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + CheckInitialOffsetRecord(10000 + header_size, 1); +} -TEST_P(LogTest, ReadThirdOneOff) { CheckInitialOffsetRecord(10008, 2); } +TEST_P(LogTest, ReadThirdOneOff) { + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + CheckInitialOffsetRecord(10000 + header_size + 1, 2); +} -TEST_P(LogTest, ReadThirdStart) { CheckInitialOffsetRecord(20014, 2); } +TEST_P(LogTest, ReadThirdStart) { + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + CheckInitialOffsetRecord(20000 + 2 * header_size, 2); +} -TEST_P(LogTest, ReadFourthOneOff) { CheckInitialOffsetRecord(20015, 3); } +TEST_P(LogTest, ReadFourthOneOff) { + int header_size = GetParam() ? 
kRecyclableHeaderSize : kHeaderSize; + CheckInitialOffsetRecord(20000 + 2 * header_size + 1, 3); +} TEST_P(LogTest, ReadFourthFirstBlockTrailer) { CheckInitialOffsetRecord(log::kBlockSize - 4, 3); @@ -594,8 +614,9 @@ TEST_P(LogTest, ReadFourthLastBlock) { } TEST_P(LogTest, ReadFourthStart) { + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; CheckInitialOffsetRecord( - 2 * (kHeaderSize + 1000) + (2 * log::kBlockSize - 1000) + 3 * kHeaderSize, + 2 * (header_size + 1000) + (2 * log::kBlockSize - 1000) + 3 * header_size, 3); } @@ -606,7 +627,8 @@ TEST_P(LogTest, ReadPastEnd) { CheckOffsetPastEndReturnsNoRecords(5); } TEST_P(LogTest, ClearEofSingleBlock) { Write("foo"); Write("bar"); - ForceEOF(3 + kHeaderSize + 2); + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + ForceEOF(3 + header_size + 2); ASSERT_EQ("foo", Read()); UnmarkEOF(); ASSERT_EQ("bar", Read()); @@ -620,10 +642,11 @@ TEST_P(LogTest, ClearEofSingleBlock) { TEST_P(LogTest, ClearEofMultiBlock) { size_t num_full_blocks = 5; - size_t n = (kBlockSize - kHeaderSize) * num_full_blocks + 25; + int header_size = GetParam() ? kRecyclableHeaderSize : kHeaderSize; + size_t n = (kBlockSize - header_size) * num_full_blocks + 25; Write(BigString("foo", n)); Write(BigString("bar", n)); - ForceEOF(n + num_full_blocks * kHeaderSize + 10); + ForceEOF(n + num_full_blocks * header_size + header_size + 3); ASSERT_EQ(BigString("foo", n), Read()); ASSERT_TRUE(IsEOF()); UnmarkEOF(); diff --git a/db/log_writer.cc b/db/log_writer.cc index 9bef090f1..6ba285656 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -37,6 +37,10 @@ Status Writer::AddRecord(const Slice& slice) { const char* ptr = slice.data(); size_t left = slice.size(); + // Header size varies depending on whether we are recycling or not. + const int header_size = + recycle_log_files_ ? kRecyclableHeaderSize : kHeaderSize; + // Fragment the record if necessary and emit it. 
Note that if slice // is empty, we still want to iterate once to emit a single // zero-length record @@ -45,32 +49,34 @@ Status Writer::AddRecord(const Slice& slice) { do { const int leftover = kBlockSize - block_offset_; assert(leftover >= 0); - if (leftover < kHeaderSize) { + if (leftover < header_size) { // Switch to a new block if (leftover > 0) { - // Fill the trailer (literal below relies on kHeaderSize being 7) - assert(kHeaderSize == 7); - dest_->Append(Slice("\x00\x00\x00\x00\x00\x00", leftover)); + // Fill the trailer (literal below relies on kHeaderSize and + // kRecyclableHeaderSize being <= 11) + assert(header_size <= 11); + dest_->Append( + Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", leftover)); } block_offset_ = 0; } - // Invariant: we never leave < kHeaderSize bytes in a block. - assert(static_cast(kBlockSize) - block_offset_ >= kHeaderSize); + // Invariant: we never leave < header_size bytes in a block. + assert(static_cast(kBlockSize) - block_offset_ >= header_size); - const size_t avail = kBlockSize - block_offset_ - kHeaderSize; + const size_t avail = kBlockSize - block_offset_ - header_size; const size_t fragment_length = (left < avail) ? left : avail; RecordType type; const bool end = (left == fragment_length); if (begin && end) { - type = kFullType; + type = recycle_log_files_ ? kRecyclableFullType : kFullType; } else if (begin) { - type = kFirstType; + type = recycle_log_files_ ? kRecyclableFirstType : kFirstType; } else if (end) { - type = kLastType; + type = recycle_log_files_ ? kRecyclableLastType : kLastType; } else { - type = kMiddleType; + type = recycle_log_files_ ? 
kRecyclableMiddleType : kMiddleType; } s = EmitPhysicalRecord(type, ptr, fragment_length); @@ -83,28 +89,48 @@ Status Writer::AddRecord(const Slice& slice) { Status Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n) { assert(n <= 0xffff); // Must fit in two bytes - assert(block_offset_ + kHeaderSize + n <= kBlockSize); + + size_t header_size; + char buf[kRecyclableHeaderSize]; // Format the header - char buf[kHeaderSize]; buf[4] = static_cast(n & 0xff); buf[5] = static_cast(n >> 8); buf[6] = static_cast(t); + uint32_t crc = type_crc_[t]; + if (t < kRecyclableFullType) { + // Legacy record format + assert(block_offset_ + kHeaderSize + n <= kBlockSize); + header_size = kHeaderSize; + } else { + // Recyclable record format + assert(block_offset_ + kRecyclableHeaderSize + n <= kBlockSize); + header_size = kRecyclableHeaderSize; + + // Only encode low 32-bits of the 64-bit log number. This means + // we will fail to detect an old record if we recycled a log from + // ~4 billion logs ago, but that is effectively impossible, and + // even if it were we'dbe far more likely to see a false positive + // on the 32-bit CRC. + EncodeFixed32(buf + 7, static_cast(log_number_)); + crc = crc32c::Extend(crc, buf + 7, 4); + } + // Compute the crc of the record type and the payload. 
- uint32_t crc = crc32c::Extend(type_crc_[t], ptr, n); - crc = crc32c::Mask(crc); // Adjust for storage + crc = crc32c::Extend(crc, ptr, n); + crc = crc32c::Mask(crc); // Adjust for storage EncodeFixed32(buf, crc); // Write the header and the payload - Status s = dest_->Append(Slice(buf, kHeaderSize)); + Status s = dest_->Append(Slice(buf, header_size)); if (s.ok()) { s = dest_->Append(Slice(ptr, n)); if (s.ok()) { s = dest_->Flush(); } } - block_offset_ += kHeaderSize + n; + block_offset_ += header_size + n; return s; } diff --git a/db/log_writer.h b/db/log_writer.h index a8aa5d476..8f5c3a10e 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -43,7 +43,7 @@ namespace log { * Data is written out in kBlockSize chunks. If next record does not fit * into the space left, the leftover space will be padded with \0. * - * Record format: + * Legacy record format: * * +---------+-----------+-----------+--- ... ---+ * |CRC (4B) | Size (2B) | Type (1B) | Payload | @@ -57,6 +57,15 @@ namespace log { * blocks that are larger than kBlockSize * Payload = Byte stream as long as specified by the payload size * + * Recyclable record format: + * + * +---------+-----------+-----------+----------------+--- ... ---+ + * |CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload | + * +---------+-----------+-----------+----------------+--- ... ---+ + * + * Same as above, with the addition of + * Log number = 32bit log file number, so that we can distinguish between + * records written by the most recent log writer vs a previous one. 
*/ class Writer { public: From e3b1d23d3ec3f3686dcdd154ad9b9a32f3b5decc Mon Sep 17 00:00:00 2001 From: Islam AbdelRahman Date: Mon, 19 Oct 2015 15:05:59 -0700 Subject: [PATCH 12/26] Bump version to 4.2 Summary: Bump the version to 4.2 ( the unreleased version ), so that when fbcode_unittests run it can differentiate between old and new APIs Test Plan: make check Reviewers: sdong, yhchiang, igor Reviewed By: igor Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D49041 --- include/rocksdb/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/rocksdb/version.h b/include/rocksdb/version.h index 86a19393e..6660bcc3c 100644 --- a/include/rocksdb/version.h +++ b/include/rocksdb/version.h @@ -5,7 +5,7 @@ #pragma once #define ROCKSDB_MAJOR 4 -#define ROCKSDB_MINOR 1 +#define ROCKSDB_MINOR 2 #define ROCKSDB_PATCH 0 // Do not use these. We made the mistake of declaring macros starting with From 6d6776f6b8b093a1ad24871e4ab589e441edbca3 Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 19 Oct 2015 13:07:05 -0700 Subject: [PATCH 13/26] Log more information for the add file with overlapping range failure Summary: crash_test sometimes fails, hitting the add file overlapping assert. Add information in info logs help us to find the bug. Test Plan: Run all test suites. Do some manual tests to make sure printing is correct. 
Reviewers: kradhakrishnan, yhchiang, anthony, IslamAbdelRahman, rven, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D49017 --- db/version_builder.cc | 11 +++++++---- db/version_builder.h | 2 +- db/version_set.cc | 26 ++++++++++++++++++++------ db/version_set.h | 2 +- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/db/version_builder.cc b/db/version_builder.cc index 7444bfc5c..7e6358e5f 100644 --- a/db/version_builder.cc +++ b/db/version_builder.cc @@ -82,6 +82,7 @@ class VersionBuilder::Rep { }; const EnvOptions& env_options_; + Logger* info_log_; TableCache* table_cache_; VersionStorageInfo* base_vstorage_; LevelState* levels_; @@ -89,9 +90,10 @@ class VersionBuilder::Rep { FileComparator level_nonzero_cmp_; public: - Rep(const EnvOptions& env_options, TableCache* table_cache, + Rep(const EnvOptions& env_options, Logger* info_log, TableCache* table_cache, VersionStorageInfo* base_vstorage) : env_options_(env_options), + info_log_(info_log), table_cache_(table_cache), base_vstorage_(base_vstorage) { levels_ = new LevelState[base_vstorage_->num_levels()]; @@ -335,15 +337,16 @@ class VersionBuilder::Rep { if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { // File is deleted: do nothing } else { - vstorage->AddFile(level, f); + vstorage->AddFile(level, f, info_log_); } } }; VersionBuilder::VersionBuilder(const EnvOptions& env_options, TableCache* table_cache, - VersionStorageInfo* base_vstorage) - : rep_(new Rep(env_options, table_cache, base_vstorage)) {} + VersionStorageInfo* base_vstorage, + Logger* info_log) + : rep_(new Rep(env_options, info_log, table_cache, base_vstorage)) {} VersionBuilder::~VersionBuilder() { delete rep_; } void VersionBuilder::CheckConsistency(VersionStorageInfo* vstorage) { rep_->CheckConsistency(vstorage); diff --git a/db/version_builder.h b/db/version_builder.h index c7ef2796c..143da9905 100644 --- a/db/version_builder.h +++ b/db/version_builder.h @@ 
-24,7 +24,7 @@ class InternalStats; class VersionBuilder { public: VersionBuilder(const EnvOptions& env_options, TableCache* table_cache, - VersionStorageInfo* base_vstorage); + VersionStorageInfo* base_vstorage, Logger* info_log = nullptr); ~VersionBuilder(); void CheckConsistency(VersionStorageInfo* vstorage); void CheckConsistencyForDeletes(VersionEdit* edit, uint64_t number, diff --git a/db/version_set.cc b/db/version_set.cc index 34e67aa0f..673450ad1 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -524,7 +524,7 @@ class BaseReferencedVersionBuilder { explicit BaseReferencedVersionBuilder(ColumnFamilyData* cfd) : version_builder_(new VersionBuilder( cfd->current()->version_set()->env_options(), cfd->table_cache(), - cfd->current()->storage_info())), + cfd->current()->storage_info(), cfd->ioptions()->info_log)), version_(cfd->current()) { version_->Ref(); } @@ -1262,13 +1262,27 @@ bool CompareCompensatedSizeDescending(const Fsize& first, const Fsize& second) { } } // anonymous namespace -void VersionStorageInfo::AddFile(int level, FileMetaData* f) { +void VersionStorageInfo::AddFile(int level, FileMetaData* f, Logger* info_log) { auto* level_files = &files_[level]; // Must not overlap - assert(level <= 0 || level_files->empty() || - internal_comparator_->Compare( - (*level_files)[level_files->size() - 1]->largest, f->smallest) < - 0); +#ifndef NDEBUG + if (level > 0 && !level_files->empty() && + internal_comparator_->Compare( + (*level_files)[level_files->size() - 1]->largest, f->smallest) >= 0) { + auto* f2 = (*level_files)[level_files->size() - 1]; + if (info_log != nullptr) { + Error(info_log, "Adding new file %" PRIu64 + " range (%s, %s) to level %d but overlapping " + "with existing file %" PRIu64 " %s %s", + f->fd.GetNumber(), f->smallest.DebugString(true).c_str(), + f->largest.DebugString(true).c_str(), level, f2->fd.GetNumber(), + f2->smallest.DebugString(true).c_str(), + f2->largest.DebugString(true).c_str()); + LogFlush(info_log); + } + 
assert(false); + } +#endif f->refs++; level_files->push_back(f); } diff --git a/db/version_set.h b/db/version_set.h index c250dddbd..0c0f3cc9d 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -97,7 +97,7 @@ class VersionStorageInfo { void Reserve(int level, size_t size) { files_[level].reserve(size); } - void AddFile(int level, FileMetaData* f); + void AddFile(int level, FileMetaData* f, Logger* info_log = nullptr); void SetFinalized(); From 0bf656b90469d01e97cd2ed1d49b79896aa51da1 Mon Sep 17 00:00:00 2001 From: Igor Canadi Date: Mon, 19 Oct 2015 18:47:59 -0700 Subject: [PATCH 14/26] Don't spew warnings when flint doesn't exist Summary: Before this diff `arc lint` on non-fb machine issued warnings. Now it doesn't. Test Plan: `arc lint` is quiet. Reviewers: yhchiang, sdong Reviewed By: sdong Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D49071 --- arcanist_util/cpp_linter/FbcodeCppLinter.php | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arcanist_util/cpp_linter/FbcodeCppLinter.php b/arcanist_util/cpp_linter/FbcodeCppLinter.php index 66eefa004..3dac9bf73 100644 --- a/arcanist_util/cpp_linter/FbcodeCppLinter.php +++ b/arcanist_util/cpp_linter/FbcodeCppLinter.php @@ -88,6 +88,9 @@ class FbcodeCppLinter extends ArcanistLinter { } private function getCppLintOutput($path) { + if (!array_key_exists($path, $this->rawLintOutput)) { + return array(); + } list($output) = $this->rawLintOutput[$path]; $msgs = array(); From 7717ad1afe38f7cb0e6041debee9ee9cd41734b8 Mon Sep 17 00:00:00 2001 From: krad Date: Mon, 19 Oct 2015 20:13:14 -0700 Subject: [PATCH 15/26] Adding artifacts to stress_crash CI job Summary: Adding the ability to upload logs and db content to storage after the completion of the job Test Plan: Manual run Reviewers: CC: leveldb@ Task ID: #8754201 Blame Rev: --- build_tools/rocksdb-lego-determinator | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/build_tools/rocksdb-lego-determinator 
b/build_tools/rocksdb-lego-determinator index 3fed164fd..b48cb65b7 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -70,6 +70,13 @@ TSAN="COMPILE_WITH_TSAN=1" DISABLE_JEMALLOC="DISABLE_JEMALLOC=1" PARSER="'parser':'egrep \'Failure|^#|Abort\''" +ARTIFACTS=" 'artifacts': [ + { + 'name':'database', + 'paths':[ '/dev/shm/rocksdb' ], + } +]" + # # A mechanism to disable tests temporarily # @@ -280,6 +287,7 @@ STRESS_CRASH_TEST_COMMANDS="[ $PARSER } ], + $ARTIFACTS, $REPORT } ]" From 033c6f1add7aad4ae81d944f549ec0f61634b344 Mon Sep 17 00:00:00 2001 From: Shusen Liu Date: Mon, 19 Oct 2015 20:58:38 -0700 Subject: [PATCH 16/26] T7916298, bug fix Summary: dbname => cmd_params['db'] Test Plan: Run `make crash_test` Reviewers: sdong Reviewed By: sdong Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D49077 --- tools/db_crashtest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 6f9a1e867..0233746c9 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -219,7 +219,7 @@ def blackbox_crash_main(args): time.sleep(1) # time to stabilize before the next run # we need to clean up after ourselves -- only do this on test success - shutil.rmtree(dbname, True) + shutil.rmtree(cmd_params['db'], True) # This python script runs db_stress multiple times. 
Some runs with @@ -328,7 +328,7 @@ def whitebox_crash_main(args): if time.time() > half_time: # we need to clean up after ourselves -- only do this on test # success - shutil.rmtree(dbname, True) + shutil.rmtree(cmd_params['db'], True) check_mode = (check_mode + 1) % total_check_mode time.sleep(1) # time to stabilize after a kill From e3d4e140753e19083c777b0c3d42d0ff94bae4bd Mon Sep 17 00:00:00 2001 From: sdong Date: Mon, 19 Oct 2015 23:40:15 -0700 Subject: [PATCH 17/26] DBCompactionTestWithParam.ManualCompaction to verify block cache is not filled in manual compaction Summary: Manual compaction should not fill block cache. Add the verification in unit test Test Plan: Run the test Reviewers: yhchiang, kradhakrishnan, rven, IslamAbdelRahman, anthony, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D49089 --- db/db_compaction_test.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index 615adbdc0..be53fdfde 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -1227,6 +1227,7 @@ TEST_F(DBCompactionTest, L0_CompactionBug_Issue44_b) { TEST_P(DBCompactionTestWithParam, ManualCompaction) { Options options = CurrentOptions(); options.max_subcompactions = max_subcompactions_; + options.statistics = rocksdb::CreateDBStatistics(); CreateAndReopenWithCF({"pikachu"}, options); // iter - 0 with 7 levels @@ -1258,7 +1259,14 @@ TEST_P(DBCompactionTestWithParam, ManualCompaction) { // Compact all MakeTables(1, "a", "z", 1); ASSERT_EQ("1,0,2", FilesPerLevel(1)); + + uint64_t prev_block_cache_add = + options.statistics->getTickerCount(BLOCK_CACHE_ADD); db_->CompactRange(CompactRangeOptions(), handles_[1], nullptr, nullptr); + // Verify manual compaction doesn't fill block cache + ASSERT_EQ(prev_block_cache_add, + options.statistics->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ("0,0,1", FilesPerLevel(1)); if (iter == 0) { @@ -1266,6 +1274,7 @@ 
TEST_P(DBCompactionTestWithParam, ManualCompaction) { options.max_background_flushes = 0; options.num_levels = 3; options.create_if_missing = true; + options.statistics = rocksdb::CreateDBStatistics(); DestroyAndReopen(options); CreateAndReopenWithCF({"pikachu"}, options); } From e154ee08639baa0eddf952201655f6d1313b9b5d Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 20 Oct 2015 13:35:08 -0700 Subject: [PATCH 18/26] Do not build test only code and unit tests in Release builds Test code errors are currently blocking Windows Release builew We do not want spend time building in Release what we can not run We want to eliminate a source of most frequent errors when people check-in test only code which can not be built in Release. This feature will work only if you invoke msbuild against rocksdb.sln Invoking it against ALL_BUILD target will attempt to build everything. --- CMakeLists.txt | 59 +++++++++++++++++++++++++++++++++++------- appveyor.yml | 2 +- appveyordailytests.yml | 2 +- 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 98efe3892..a05e7c887 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,10 +14,15 @@ # 3. Run cmake to generate project files for Windows, add more options to enable required third-party libraries. # See thirdparty.inc for more information. # sample command: cmake -G "Visual Studio 12 Win64" -DGFLAGS=1 -DSNAPPY=1 -DJEMALLOC=1 .. -# 4. Then build the project in debug mode (you may want to add /m: flag to run msbuild in parallel threads) -# msbuild ALL_BUILD.vcxproj +# 4. Then build the project in debug mode (you may want to add /m[:] flag to run msbuild in parallel threads +# or simply /m ot use all avail cores) +# msbuild rocksdb.sln +# +# rocksdb.sln build features exclusions of test only code in Release. If you build ALL_BUILD then everything +# will be attempted but test only code does not build in Release mode. +# # 5. 
And release mode (/m[:] is also supported) -# msbuild ALL_BUILD.vcxproj /p:Configuration=Release +# msbuild rocksdb.sln /p:Configuration=Release # cmake_minimum_required(VERSION 2.6) @@ -83,6 +88,7 @@ set(LIBS ${ROCKSDB_LIBS} ${THIRDPARTY_LIBS} ${SYSTEM_LIBS}) add_subdirectory(third-party/gtest-1.7.0/fused-src/gtest) +# Main library source code set(SOURCES db/builder.cc db/c.cc @@ -100,7 +106,6 @@ set(SOURCES db/db_impl_experimental.cc db/db_impl_readonly.cc db/db_iter.cc - db/db_test_util.cc db/event_helpers.cc db/experimental.cc db/filename.cc @@ -252,6 +257,12 @@ set(SOURCES utilities/write_batch_with_index/write_batch_with_index_internal.cc ) +# For test util library that is build only in DEBUG mode +# and linked to tests. Add test only code that is not #ifdefed for Release here. +set(TESTUTIL_SOURCE + db/db_test_util.cc +) + add_library(rocksdblib${ARTIFACT_SUFFIX} ${SOURCES}) set_target_properties(rocksdblib${ARTIFACT_SUFFIX} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/rocksdblib${ARTIFACT_SUFFIX}.pdb") add_dependencies(rocksdblib${ARTIFACT_SUFFIX} GenerateBuildVersion) @@ -367,7 +378,7 @@ set(TESTS utilities/write_batch_with_index/write_batch_with_index_test.cc ) -set(EXES ${APPS} ${TESTS}) +set(EXES ${APPS}) foreach(sourcefile ${EXES}) string(REPLACE ".cc" "" exename ${sourcefile}) @@ -376,12 +387,42 @@ foreach(sourcefile ${EXES}) target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${LIBS}) endforeach(sourcefile ${EXES}) +# test utilities are only build in debug +set(TESTUTILLIB testutillib${ARTIFACT_SUFFIX}) +add_library(${TESTUTILLIB} STATIC ${TESTUTIL_SOURCE}) +set_target_properties(${TESTUTILLIB} PROPERTIES COMPILE_FLAGS "/Fd${CMAKE_CFG_INTDIR}/testutillib${ARTIFACT_SUFFIX}.pdb") +set_target_properties(${TESTUTILLIB} + PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 + ) + +# Tests are excluded from Release builds +set(TEST_EXES ${TESTS}) + +foreach(sourcefile 
${TEST_EXES}) + string(REPLACE ".cc" "" exename ${sourcefile}) + string(REGEX REPLACE "^((.+)/)+" "" exename ${exename}) + add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile}) + set_target_properties(${exename}${ARTIFACT_SUFFIX} + PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 + ) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} ${LIBS} testutillib${ARTIFACT_SUFFIX}) +endforeach(sourcefile ${TEST_EXES}) + # C executables must link to a shared object -set(C_EXES ${C_TESTS}) +set(C_TEST_EXES ${C_TESTS}) -foreach(sourcefile ${C_EXES}) +foreach(sourcefile ${C_TEST_EXES}) string(REPLACE ".c" "" exename ${sourcefile}) string(REGEX REPLACE "^((.+)/)+" "" exename ${exename}) add_executable(${exename}${ARTIFACT_SUFFIX} ${sourcefile}) - target_link_libraries(${exename}${ARTIFACT_SUFFIX} rocksdb${ARTIFACT_SUFFIX}) -endforeach(sourcefile ${C_TESTS}) + set_target_properties(${exename}${ARTIFACT_SUFFIX} + PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD_RELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_MINRELEASE 1 + EXCLUDE_FROM_DEFAULT_BUILD_RELWITHDEBINFO 1 + ) + target_link_libraries(${exename}${ARTIFACT_SUFFIX} rocksdb${ARTIFACT_SUFFIX} testutillib${ARTIFACT_SUFFIX}) +endforeach(sourcefile ${C_TEST_EXES}) diff --git a/appveyor.yml b/appveyor.yml index e13e2d226..eb2f63172 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -5,7 +5,7 @@ before_build: - cmake -G "Visual Studio 12 Win64" .. - cd .. build: - project: build\ALL_BUILD.vcxproj + project: build\rocksdb.sln parallel: true verbosity: minimal test: off diff --git a/appveyordailytests.yml b/appveyordailytests.yml index a8b4af60c..0a35ac68b 100644 --- a/appveyordailytests.yml +++ b/appveyordailytests.yml @@ -5,7 +5,7 @@ before_build: - cmake -G "Visual Studio 12 Win64" -DOPTDBG=1 .. - cd .. 
build: - project: build\ALL_BUILD.vcxproj + project: build\rocksdb.sln parallel: true verbosity: minimal test: From 5678c05d865010e589105645248d33b814f3fcf7 Mon Sep 17 00:00:00 2001 From: Yueh-Hsuan Chiang Date: Tue, 20 Oct 2015 17:09:09 -0700 Subject: [PATCH 19/26] Use DEBUG_LEVEL=0 in make release and make clean Summary: Use DEBUG_LEVEL=0 in make release and make clean Test Plan: make clean make release -j32 Reviewers: MarkCallaghan, sdong, anthony, IslamAbdelRahman, igor Reviewed By: igor Subscribers: dhruba, leveldb Differential Revision: https://reviews.facebook.net/D49125 --- Makefile | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fb14eab92..47b56cf3b 100644 --- a/Makefile +++ b/Makefile @@ -41,6 +41,14 @@ ifeq ($(MAKECMDGOALS),dbg) DEBUG_LEVEL=2 endif +ifeq ($(MAKECMDGOALS),clean) + DEBUG_LEVEL=0 +endif + +ifeq ($(MAKECMDGOALS),release) + DEBUG_LEVEL=0 +endif + ifeq ($(MAKECMDGOALS),shared_lib) DEBUG_LEVEL=0 endif @@ -404,7 +412,7 @@ dbg: $(LIBRARY) $(BENCHMARKS) tools $(TESTS) # creates static library and programs release: $(MAKE) clean - OPT="-DNDEBUG -O2" $(MAKE) static_lib tools db_bench + DEBUG_LEVEL=0 $(MAKE) static_lib tools db_bench coverage: $(MAKE) clean From d0d13ebf6777ea9775e6f2c510ebe65822955145 Mon Sep 17 00:00:00 2001 From: Shusen Liu Date: Tue, 20 Oct 2015 11:31:27 -0700 Subject: [PATCH 20/26] fix bug in db_crashtest.py Summary: in tools/db_crashtest.py, cmd_params['db'] by default is a lambda expression, not the actual db_name. fix by get the db_name before passing it to gen_cmd. 
Test Plan: run `make crashtest` Reviewers: sdong Reviewed By: sdong Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D49119 --- tools/db_crashtest.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/tools/db_crashtest.py b/tools/db_crashtest.py index 0233746c9..5fbb39e16 100644 --- a/tools/db_crashtest.py +++ b/tools/db_crashtest.py @@ -57,7 +57,6 @@ def get_dbname(test_name): return dbname blackbox_default_params = { - 'db': lambda: get_dbname('blackbox'), # total time for this script to test db_stress "duration": 6000, # time for one db_stress instance to run @@ -69,7 +68,6 @@ blackbox_default_params = { } whitebox_default_params = { - 'db': lambda: get_dbname('whitebox'), "duration": 10000, "log2_keys_per_lock": 10, "nooverwritepercent": 1, @@ -110,7 +108,6 @@ simple_default_params = { } blackbox_simple_default_params = { - 'db': lambda: get_dbname('blackbox'), "duration": 6000, "interval": 120, "open_files": -1, @@ -120,7 +117,6 @@ blackbox_simple_default_params = { } whitebox_simple_default_params = { - 'db': lambda: get_dbname('whitebox'), "duration": 10000, "log2_keys_per_lock": 10, "nooverwritepercent": 1, @@ -166,7 +162,7 @@ def gen_cmd(params): # in case of unsafe crashes in RocksDB. 
def blackbox_crash_main(args): cmd_params = gen_cmd_params(args) - + dbname = get_dbname('blackbox') exit_time = time.time() + cmd_params['duration'] print("Running blackbox-crash-test with \n" @@ -180,7 +176,7 @@ def blackbox_crash_main(args): run_had_errors = False killtime = time.time() + cmd_params['interval'] - cmd = gen_cmd(cmd_params) + cmd = gen_cmd(dict(cmd_params.items() + {'db': dbname}.items())) child = subprocess.Popen([cmd], stderr=subprocess.PIPE, shell=True) @@ -219,13 +215,14 @@ def blackbox_crash_main(args): time.sleep(1) # time to stabilize before the next run # we need to clean up after ourselves -- only do this on test success - shutil.rmtree(cmd_params['db'], True) + shutil.rmtree(dbname, True) # This python script runs db_stress multiple times. Some runs with # kill_random_test that causes rocksdb to crash at various points in code. def whitebox_crash_main(args): cmd_params = gen_cmd_params(args) + dbname = get_dbname('whitebox') cur_time = time.time() exit_time = cur_time + cmd_params['duration'] @@ -285,7 +282,8 @@ def whitebox_crash_main(args): "ops_per_thread": cmd_params['ops_per_thread'], } - cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items())) + cmd = gen_cmd(dict(cmd_params.items() + additional_opts.items() + + {'db': dbname}.items())) print "Running:" + cmd + "\n" @@ -328,7 +326,7 @@ def whitebox_crash_main(args): if time.time() > half_time: # we need to clean up after ourselves -- only do this on test # success - shutil.rmtree(cmd_params['db'], True) + shutil.rmtree(dbname, True) check_mode = (check_mode + 1) % total_check_mode time.sleep(1) # time to stabilize after a kill From 980a82ee2fa2c0e16b2278e9f732d50b9df7ce3c Mon Sep 17 00:00:00 2001 From: Alexey Maykov Date: Wed, 21 Oct 2015 18:34:25 -0700 Subject: [PATCH 21/26] Fix a bug in GetApproximateSizes Summary: Need to pass through the memtable parameter. 
Test Plan: built, tested through myrocks Reviewers: igor, sdong, rven Reviewed By: rven Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D49167 --- include/rocksdb/utilities/stackable_db.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/rocksdb/utilities/stackable_db.h b/include/rocksdb/utilities/stackable_db.h index 50b5538f7..75e1da458 100644 --- a/include/rocksdb/utilities/stackable_db.h +++ b/include/rocksdb/utilities/stackable_db.h @@ -148,7 +148,8 @@ class StackableDB : public DB { virtual void GetApproximateSizes(ColumnFamilyHandle* column_family, const Range* r, int n, uint64_t* sizes, bool include_memtable = false) override { - return db_->GetApproximateSizes(column_family, r, n, sizes); + return db_->GetApproximateSizes(column_family, r, n, sizes, + include_memtable); } using DB::CompactRange; From 6e6dd5f6f9f65582a77014d064d4cfc554f6854a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Wed, 14 Oct 2015 10:14:53 +0200 Subject: [PATCH 22/26] Split posix storage backend into Env and library Summary: This patch splits the posix storage backend into Env and the actual *File implementations. The motivation is to allow other Envs to use posix as a library. This enables a storage backend different from posix to split its secondary storage between a normal file system partition managed by posix, and it own media. Test Plan: No new functionality is added to posix Env or the library, thus the current tests should suffice. 
--- build_tools/build_detect_platform | 2 +- build_tools/fbcode_config.sh | 2 +- build_tools/fbcode_config4.8.1.sh | 2 +- include/posix/io_posix.h | 214 +++++++++ src.mk | 1 + util/env_posix.cc | 742 +----------------------------- util/io_posix.cc | 614 ++++++++++++++++++++++++ 7 files changed, 839 insertions(+), 738 deletions(-) create mode 100644 include/posix/io_posix.h create mode 100644 util/io_posix.cc diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index f81c96cc3..fc099a540 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -45,7 +45,7 @@ fi # we depend on C++11 PLATFORM_CXXFLAGS="-std=c++11" # we currently depend on POSIX platform -COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX" +COMMON_FLAGS="-DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX" # Default to fbcode gcc on internal fb machines if [ -z "$ROCKSDB_NO_FBCODE" -a -d /mnt/gvfs/third-party ]; then diff --git a/build_tools/fbcode_config.sh b/build_tools/fbcode_config.sh index 572c0fe68..d6c7b2a28 100644 --- a/build_tools/fbcode_config.sh +++ b/build_tools/fbcode_config.sh @@ -116,7 +116,7 @@ else fi CFLAGS+=" $DEPS_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE" CXXFLAGS+=" $CFLAGS" EXEC_LDFLAGS=" $SNAPPY_LIBS $ZLIB_LIBS $BZIP_LIBS $LZ4_LIBS $ZSTD_LIBS $GFLAGS_LIBS $NUMA_LIB" diff --git a/build_tools/fbcode_config4.8.1.sh b/build_tools/fbcode_config4.8.1.sh index 524a5ed7f..41babf180 100644 --- a/build_tools/fbcode_config4.8.1.sh +++ b/build_tools/fbcode_config4.8.1.sh @@ -91,7 +91,7 @@ else fi CFLAGS+=" $DEPS_INCLUDE" -CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE" +CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX -DROCKSDB_FALLOCATE_PRESENT -DROCKSDB_MALLOC_USABLE_SIZE" CFLAGS+=" -DSNAPPY -DGFLAGS=google -DZLIB 
-DBZIP2 -DLZ4 -DZSTD -DNUMA" CXXFLAGS+=" $CFLAGS" diff --git a/include/posix/io_posix.h b/include/posix/io_posix.h new file mode 100644 index 000000000..aaa6b4048 --- /dev/null +++ b/include/posix/io_posix.h @@ -0,0 +1,214 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef OS_LINUX +#include +#include +#endif +#if defined(OS_LINUX) +#include +#endif +#include "rocksdb/env.h" +#include "util/sync_point.h" +#include "util/iostats_context_imp.h" +#include "util/coding.h" +#include "rocksdb/slice.h" +#include "port/port.h" +#include "util/logging.h" +#include "util/posix_logger.h" +#include "util/random.h" +#include "util/string_util.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" + +// For non linux platform, the following macros are used only as place +// holder. 
+#if !(defined OS_LINUX) && !(defined CYGWIN) +#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */ +#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */ +#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ +#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */ +#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */ +#endif + +namespace rocksdb { + +static Status IOError(const std::string& context, int err_number) { + return Status::IOError(context, strerror(err_number)); +} + +class PosixSequentialFile : public SequentialFile { + private: + std::string filename_; + FILE* file_; + int fd_; + bool use_os_buffer_; + + public: + PosixSequentialFile(const std::string& fname, FILE* f, + const EnvOptions& options); + virtual ~PosixSequentialFile(); + + virtual Status Read(size_t n, Slice* result, char* scratch) override; + virtual Status Skip(uint64_t n) override; + virtual Status InvalidateCache(size_t offset, size_t length) override; +}; + +class PosixRandomAccessFile : public RandomAccessFile { + private: + std::string filename_; + int fd_; + bool use_os_buffer_; + + public: + PosixRandomAccessFile(const std::string& fname, int fd, + const EnvOptions& options); + virtual ~PosixRandomAccessFile(); + + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; +#ifdef OS_LINUX + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +#endif + virtual void Hint(AccessPattern pattern) override; + virtual Status InvalidateCache(size_t offset, size_t length) override; +}; + +class PosixWritableFile : public WritableFile { + private: + const std::string filename_; + int fd_; + uint64_t filesize_; +#ifdef ROCKSDB_FALLOCATE_PRESENT + bool allow_fallocate_; + bool fallocate_with_keep_size_; +#endif + + public: + PosixWritableFile(const std::string& fname, int fd, + const EnvOptions& options); + ~PosixWritableFile(); + + // Means Close() will properly take care of 
truncate + // and it does not need any additional information + virtual Status Truncate(uint64_t size) override { return Status::OK(); } + virtual Status Close() override; + virtual Status Append(const Slice& data) override; + virtual Status Flush() override; + virtual Status Sync() override; + virtual Status Fsync() override; + virtual bool IsSyncThreadSafe() const override; + virtual uint64_t GetFileSize() override; + virtual Status InvalidateCache(size_t offset, size_t length) override; +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual Status Allocate(off_t offset, off_t len) override; + virtual Status RangeSync(off_t offset, off_t nbytes) override; + virtual size_t GetUniqueId(char* id, size_t max_size) const override; +#endif +}; + +class PosixMmapReadableFile : public RandomAccessFile { + private: + int fd_; + std::string filename_; + void* mmapped_region_; + size_t length_; + + public: + PosixMmapReadableFile(const int fd, const std::string& fname, void* base, + size_t length, const EnvOptions& options); + virtual ~PosixMmapReadableFile(); + virtual Status Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const override; + virtual Status InvalidateCache(size_t offset, size_t length) override; +}; + +class PosixMmapFile : public WritableFile { + private: + std::string filename_; + int fd_; + size_t page_size_; + size_t map_size_; // How much extra memory to map at a time + char* base_; // The mapped region + char* limit_; // Limit of the mapped region + char* dst_; // Where to write next (in range [base_,limit_]) + char* last_sync_; // Where have we synced up to + uint64_t file_offset_; // Offset of base_ in file +#ifdef ROCKSDB_FALLOCATE_PRESENT + bool allow_fallocate_; // If false, fallocate calls are bypassed + bool fallocate_with_keep_size_; +#endif + + // Roundup x to a multiple of y + static size_t Roundup(size_t x, size_t y) { return ((x + y - 1) / y) * y; } + + size_t TruncateToPageBoundary(size_t s) { + s -= (s & (page_size_ - 1)); + 
assert((s % page_size_) == 0); + return s; + } + + Status MapNewRegion(); + Status UnmapCurrentRegion(); + Status Msync(); + + public: + PosixMmapFile(const std::string& fname, int fd, size_t page_size, + const EnvOptions& options); + ~PosixMmapFile(); + + // Means Close() will properly take care of truncate + // and it does not need any additional information + virtual Status Truncate(uint64_t size) override { return Status::OK(); } + virtual Status Close() override; + virtual Status Append(const Slice& data) override; + virtual Status Flush() override; + virtual Status Sync() override; + virtual Status Fsync() override; + virtual uint64_t GetFileSize() override; + virtual Status InvalidateCache(size_t offset, size_t length) override; +#ifdef ROCKSDB_FALLOCATE_PRESENT + virtual Status Allocate(off_t offset, off_t len) override; +#endif +}; + +class PosixDirectory : public Directory { + public: + explicit PosixDirectory(int fd) : fd_(fd) {} + ~PosixDirectory() { close(fd_); } + + virtual Status Fsync() override { + if (fsync(fd_) == -1) { + return IOError("directory", errno); + } + return Status::OK(); + } + + private: + int fd_; +}; + +} // namespace rocksdb diff --git a/src.mk b/src.mk index cb43744be..9b87d6ede 100644 --- a/src.mk +++ b/src.mk @@ -95,6 +95,7 @@ LIB_SOURCES = \ util/env.cc \ util/env_hdfs.cc \ util/env_posix.cc \ + util/io_posix.cc \ util/file_util.cc \ util/file_reader_writer.cc \ util/filter_policy.cc \ diff --git a/util/env_posix.cc b/util/env_posix.cc index 7d241ca63..359c848b6 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -7,43 +7,7 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. 
-#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef OS_LINUX -#include -#include -#endif -#include -#include -#include -#include -#if defined(OS_LINUX) -#include -#endif -#include -#include -#include "rocksdb/env.h" -#include "rocksdb/slice.h" -#include "port/port.h" -#include "util/coding.h" -#include "util/logging.h" -#include "util/posix_logger.h" -#include "util/random.h" -#include "util/iostats_context_imp.h" -#include "util/string_util.h" -#include "util/sync_point.h" -#include "util/thread_status_updater.h" -#include "util/thread_status_util.h" +#include "posix/io_posix.h" // Get nano time includes #if defined(OS_LINUX) || defined(OS_FREEBSD) @@ -64,31 +28,10 @@ #define EXT4_SUPER_MAGIC 0xEF53 #endif -// For non linux platform, the following macros are used only as place -// holder. -#if !(defined OS_LINUX) && !(defined CYGWIN) -#define POSIX_FADV_NORMAL 0 /* [MC1] no further special treatment */ -#define POSIX_FADV_RANDOM 1 /* [MC1] expect random page refs */ -#define POSIX_FADV_SEQUENTIAL 2 /* [MC1] expect sequential page refs */ -#define POSIX_FADV_WILLNEED 3 /* [MC1] will need these pages */ -#define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */ -#endif - - namespace rocksdb { namespace { -// A wrapper for fadvise, if the platform doesn't support fadvise, -// it will simply return Status::NotSupport. -int Fadvise(int fd, off_t offset, size_t len, int advice) { -#ifdef OS_LINUX - return posix_fadvise(fd, offset, len, advice); -#else - return 0; // simply do nothing. 
-#endif -} - ThreadStatusUpdater* CreateThreadStatusUpdater() { return new ThreadStatusUpdater(); } @@ -97,677 +40,6 @@ ThreadStatusUpdater* CreateThreadStatusUpdater() { static std::set lockedFiles; static port::Mutex mutex_lockedFiles; -static Status IOError(const std::string& context, int err_number) { - return Status::IOError(context, strerror(err_number)); -} - -#if defined(OS_LINUX) -namespace { - static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) { - if (max_size < kMaxVarint64Length*3) { - return 0; - } - - struct stat buf; - int result = fstat(fd, &buf); - if (result == -1) { - return 0; - } - - long version = 0; - result = ioctl(fd, FS_IOC_GETVERSION, &version); - if (result == -1) { - return 0; - } - uint64_t uversion = (uint64_t)version; - - char* rid = id; - rid = EncodeVarint64(rid, buf.st_dev); - rid = EncodeVarint64(rid, buf.st_ino); - rid = EncodeVarint64(rid, uversion); - assert(rid >= id); - return static_cast(rid-id); - } -} -#endif - -class PosixSequentialFile: public SequentialFile { - private: - std::string filename_; - FILE* file_; - int fd_; - bool use_os_buffer_; - - public: - PosixSequentialFile(const std::string& fname, FILE* f, - const EnvOptions& options) - : filename_(fname), file_(f), fd_(fileno(f)), - use_os_buffer_(options.use_os_buffer) { - } - virtual ~PosixSequentialFile() { fclose(file_); } - - virtual Status Read(size_t n, Slice* result, char* scratch) override { - Status s; - size_t r = 0; - do { - r = fread_unlocked(scratch, 1, n, file_); - } while (r == 0 && ferror(file_) && errno == EINTR); - *result = Slice(scratch, r); - if (r < n) { - if (feof(file_)) { - // We leave status as ok if we hit the end of the file - // We also clear the error so that the reads can continue - // if a new data is written to the file - clearerr(file_); - } else { - // A partial read with an error: return a non-ok status - s = IOError(filename_, errno); - } - } - if (!use_os_buffer_) { - // we need to fadvise away the entire 
range of pages because - // we do not want readahead pages to be cached. - Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages - } - return s; - } - - virtual Status Skip(uint64_t n) override { - if (fseek(file_, static_cast(n), SEEK_CUR)) { - return IOError(filename_, errno); - } - return Status::OK(); - } - - virtual Status InvalidateCache(size_t offset, size_t length) override { -#ifndef OS_LINUX - return Status::OK(); -#else - // free OS pages - int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); - if (ret == 0) { - return Status::OK(); - } - return IOError(filename_, errno); -#endif - } -}; - -// pread() based random-access -class PosixRandomAccessFile: public RandomAccessFile { - private: - std::string filename_; - int fd_; - bool use_os_buffer_; - - public: - PosixRandomAccessFile(const std::string& fname, int fd, - const EnvOptions& options) - : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) { - assert(!options.use_mmap_reads || sizeof(void*) < 8); - } - virtual ~PosixRandomAccessFile() { close(fd_); } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - Status s; - ssize_t r = -1; - size_t left = n; - char* ptr = scratch; - while (left > 0) { - r = pread(fd_, ptr, left, static_cast(offset)); - - if (r <= 0) { - if (errno == EINTR) { - continue; - } - break; - } - ptr += r; - offset += r; - left -= r; - } - - *result = Slice(scratch, (r < 0) ? 0 : n - left); - if (r < 0) { - // An error: return a non-ok status - s = IOError(filename_, errno); - } - if (!use_os_buffer_) { - // we need to fadvise away the entire range of pages because - // we do not want readahead pages to be cached. 
- Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages - } - return s; - } - -#ifdef OS_LINUX - virtual size_t GetUniqueId(char* id, size_t max_size) const override { - return GetUniqueIdFromFile(fd_, id, max_size); - } -#endif - - virtual void Hint(AccessPattern pattern) override { - switch(pattern) { - case NORMAL: - Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL); - break; - case RANDOM: - Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM); - break; - case SEQUENTIAL: - Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); - break; - case WILLNEED: - Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED); - break; - case DONTNEED: - Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); - break; - default: - assert(false); - break; - } - } - - virtual Status InvalidateCache(size_t offset, size_t length) override { -#ifndef OS_LINUX - return Status::OK(); -#else - // free OS pages - int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); - if (ret == 0) { - return Status::OK(); - } - return IOError(filename_, errno); -#endif - } -}; - -// mmap() based random-access -class PosixMmapReadableFile: public RandomAccessFile { - private: - int fd_; - std::string filename_; - void* mmapped_region_; - size_t length_; - - public: - // base[0,length-1] contains the mmapped contents of the file. 
- PosixMmapReadableFile(const int fd, const std::string& fname, - void* base, size_t length, - const EnvOptions& options) - : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { - fd_ = fd_ + 0; // suppress the warning for used variables - assert(options.use_mmap_reads); - assert(options.use_os_buffer); - } - virtual ~PosixMmapReadableFile() { - int ret = munmap(mmapped_region_, length_); - if (ret != 0) { - fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n", - mmapped_region_, length_); - } - } - - virtual Status Read(uint64_t offset, size_t n, Slice* result, - char* scratch) const override { - Status s; - if (offset > length_) { - *result = Slice(); - return IOError(filename_, EINVAL); - } else if (offset + n > length_) { - n = static_cast(length_ - offset); - } - *result = Slice(reinterpret_cast(mmapped_region_) + offset, n); - return s; - } - virtual Status InvalidateCache(size_t offset, size_t length) override { -#ifndef OS_LINUX - return Status::OK(); -#else - // free OS pages - int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); - if (ret == 0) { - return Status::OK(); - } - return IOError(filename_, errno); -#endif - } -}; - -// We preallocate up to an extra megabyte and use memcpy to append new -// data to the file. This is safe since we either properly close the -// file before reading from it, or for log files, the reading code -// knows enough to skip zero suffixes. 
-class PosixMmapFile : public WritableFile { - private: - std::string filename_; - int fd_; - size_t page_size_; - size_t map_size_; // How much extra memory to map at a time - char* base_; // The mapped region - char* limit_; // Limit of the mapped region - char* dst_; // Where to write next (in range [base_,limit_]) - char* last_sync_; // Where have we synced up to - uint64_t file_offset_; // Offset of base_ in file -#ifdef ROCKSDB_FALLOCATE_PRESENT - bool allow_fallocate_; // If false, fallocate calls are bypassed - bool fallocate_with_keep_size_; -#endif - - // Roundup x to a multiple of y - static size_t Roundup(size_t x, size_t y) { - return ((x + y - 1) / y) * y; - } - - size_t TruncateToPageBoundary(size_t s) { - s -= (s & (page_size_ - 1)); - assert((s % page_size_) == 0); - return s; - } - - Status UnmapCurrentRegion() { - TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); - if (base_ != nullptr) { - int munmap_status = munmap(base_, limit_ - base_); - if (munmap_status != 0) { - return IOError(filename_, munmap_status); - } - file_offset_ += limit_ - base_; - base_ = nullptr; - limit_ = nullptr; - last_sync_ = nullptr; - dst_ = nullptr; - - // Increase the amount we map the next time, but capped at 1MB - if (map_size_ < (1<<20)) { - map_size_ *= 2; - } - } - return Status::OK(); - } - - Status MapNewRegion() { -#ifdef ROCKSDB_FALLOCATE_PRESENT - assert(base_ == nullptr); - - TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); - // we can't fallocate with FALLOC_FL_KEEP_SIZE here - if (allow_fallocate_) { - IOSTATS_TIMER_GUARD(allocate_nanos); - int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); - if (alloc_status != 0) { - // fallback to posix_fallocate - alloc_status = posix_fallocate(fd_, file_offset_, map_size_); - } - if (alloc_status != 0) { - return Status::IOError("Error allocating space to file : " + filename_ + - "Error : " + strerror(alloc_status)); - } - } - - 
TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds); - void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, - fd_, file_offset_); - if (ptr == MAP_FAILED) { - return Status::IOError("MMap failed on " + filename_); - } - TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds); - - base_ = reinterpret_cast(ptr); - limit_ = base_ + map_size_; - dst_ = base_; - last_sync_ = base_; - return Status::OK(); -#else - return Status::NotSupported("This platform doesn't support fallocate()"); -#endif - } - - Status Msync() { - if (dst_ == last_sync_) { - return Status::OK(); - } - // Find the beginnings of the pages that contain the first and last - // bytes to be synced. - size_t p1 = TruncateToPageBoundary(last_sync_ - base_); - size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); - last_sync_ = dst_; - TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds); - if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { - return IOError(filename_, errno); - } - return Status::OK(); - } - - public: - PosixMmapFile(const std::string& fname, int fd, size_t page_size, - const EnvOptions& options) - : filename_(fname), - fd_(fd), - page_size_(page_size), - map_size_(Roundup(65536, page_size)), - base_(nullptr), - limit_(nullptr), - dst_(nullptr), - last_sync_(nullptr), - file_offset_(0) { -#ifdef ROCKSDB_FALLOCATE_PRESENT - allow_fallocate_ = options.allow_fallocate; - fallocate_with_keep_size_ = options.fallocate_with_keep_size; -#endif - assert((page_size & (page_size - 1)) == 0); - assert(options.use_mmap_writes); - } - - - ~PosixMmapFile() { - if (fd_ >= 0) { - PosixMmapFile::Close(); - } - } - - virtual Status Append(const Slice& data) override { - const char* src = data.data(); - size_t left = data.size(); - while (left > 0) { - assert(base_ <= dst_); - assert(dst_ <= limit_); - size_t avail = limit_ - dst_; - if (avail == 0) { - Status s = UnmapCurrentRegion(); - if (!s.ok()) { - return s; - } - s = MapNewRegion(); - if 
(!s.ok()) { - return s; - } - TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds); - } - - size_t n = (left <= avail) ? left : avail; - memcpy(dst_, src, n); - dst_ += n; - src += n; - left -= n; - } - return Status::OK(); - } - - // Means Close() will properly take care of truncate - // and it does not need any additional information - virtual Status Truncate(uint64_t size) override { - return Status::OK(); - } - - virtual Status Close() override { - Status s; - size_t unused = limit_ - dst_; - - s = UnmapCurrentRegion(); - if (!s.ok()) { - s = IOError(filename_, errno); - } else if (unused > 0) { - // Trim the extra space at the end of the file - if (ftruncate(fd_, file_offset_ - unused) < 0) { - s = IOError(filename_, errno); - } - } - - if (close(fd_) < 0) { - if (s.ok()) { - s = IOError(filename_, errno); - } - } - - fd_ = -1; - base_ = nullptr; - limit_ = nullptr; - return s; - } - - virtual Status Flush() override { - return Status::OK(); - } - - virtual Status Sync() override { - if (fdatasync(fd_) < 0) { - return IOError(filename_, errno); - } - - return Msync(); - } - - /** - * Flush data as well as metadata to stable storage. - */ - virtual Status Fsync() override { - if (fsync(fd_) < 0) { - return IOError(filename_, errno); - } - - return Msync(); - } - - /** - * Get the size of valid data in the file. This will not match the - * size that is returned from the filesystem because we use mmap - * to extend file by map_size every time. 
- */ - virtual uint64_t GetFileSize() override { - size_t used = dst_ - base_; - return file_offset_ + used; - } - - virtual Status InvalidateCache(size_t offset, size_t length) override { -#ifndef OS_LINUX - return Status::OK(); -#else - // free OS pages - int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); - if (ret == 0) { - return Status::OK(); - } - return IOError(filename_, errno); -#endif - } - -#ifdef ROCKSDB_FALLOCATE_PRESENT - virtual Status Allocate(off_t offset, off_t len) override { - TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); - int alloc_status = 0; - if (allow_fallocate_) { - alloc_status = - fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, - offset, len); - } - if (alloc_status == 0) { - return Status::OK(); - } else { - return IOError(filename_, errno); - } - } -#endif -}; - -// Use posix write to write data to a file. -class PosixWritableFile : public WritableFile { - private: - const std::string filename_; - int fd_; - uint64_t filesize_; -#ifdef ROCKSDB_FALLOCATE_PRESENT - bool allow_fallocate_; - bool fallocate_with_keep_size_; -#endif - - public: - PosixWritableFile(const std::string& fname, int fd, const EnvOptions& options) - : filename_(fname), fd_(fd), filesize_(0) { -#ifdef ROCKSDB_FALLOCATE_PRESENT - allow_fallocate_ = options.allow_fallocate; - fallocate_with_keep_size_ = options.fallocate_with_keep_size; -#endif - assert(!options.use_mmap_writes); - } - - ~PosixWritableFile() { - if (fd_ >= 0) { - PosixWritableFile::Close(); - } - } - - virtual Status Append(const Slice& data) override { - const char* src = data.data(); - size_t left = data.size(); - while (left != 0) { - ssize_t done = write(fd_, src, left); - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError(filename_, errno); - } - left -= done; - src += done; - } - filesize_ += data.size(); - return Status::OK(); - } - - // Means Close() will properly take care of truncate - // and it does not need any 
additional information - virtual Status Truncate(uint64_t size) override { - return Status::OK(); - } - - virtual Status Close() override { - Status s; - - size_t block_size; - size_t last_allocated_block; - GetPreallocationStatus(&block_size, &last_allocated_block); - if (last_allocated_block > 0) { - // trim the extra space preallocated at the end of the file - // NOTE(ljin): we probably don't want to surface failure as an IOError, - // but it will be nice to log these errors. - int dummy __attribute__((unused)); - dummy = ftruncate(fd_, filesize_); -#ifdef ROCKSDB_FALLOCATE_PRESENT - // in some file systems, ftruncate only trims trailing space if the - // new file size is smaller than the current size. Calling fallocate - // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused - // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following - // filesystems: - // XFS (since Linux 2.6.38) - // ext4 (since Linux 3.0) - // Btrfs (since Linux 3.7) - // tmpfs (since Linux 3.5) - // We ignore error since failure of this operation does not affect - // correctness. 
- IOSTATS_TIMER_GUARD(allocate_nanos); - if (allow_fallocate_) { - fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, - block_size * last_allocated_block - filesize_); - } -#endif - } - - if (close(fd_) < 0) { - s = IOError(filename_, errno); - } - fd_ = -1; - return s; - } - - // write out the cached data to the OS cache - virtual Status Flush() override { - return Status::OK(); - } - - virtual Status Sync() override { - if (fdatasync(fd_) < 0) { - return IOError(filename_, errno); - } - return Status::OK(); - } - - virtual Status Fsync() override { - if (fsync(fd_) < 0) { - return IOError(filename_, errno); - } - return Status::OK(); - } - - virtual bool IsSyncThreadSafe() const override { - return true; - } - - virtual uint64_t GetFileSize() override { return filesize_; } - - virtual Status InvalidateCache(size_t offset, size_t length) override { -#ifndef OS_LINUX - return Status::OK(); -#else - // free OS pages - int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); - if (ret == 0) { - return Status::OK(); - } - return IOError(filename_, errno); -#endif - } - -#ifdef ROCKSDB_FALLOCATE_PRESENT - virtual Status Allocate(off_t offset, off_t len) override { - TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); - IOSTATS_TIMER_GUARD(allocate_nanos); - int alloc_status = 0; - if (allow_fallocate_) { - alloc_status = - fallocate(fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, - offset, len); - } - if (alloc_status == 0) { - return Status::OK(); - } else { - return IOError(filename_, errno); - } - } - - virtual Status RangeSync(off_t offset, off_t nbytes) override { - if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) { - return Status::OK(); - } else { - return IOError(filename_, errno); - } - } - virtual size_t GetUniqueId(char* id, size_t max_size) const override { - return GetUniqueIdFromFile(fd_, id, max_size); - } -#endif -}; - -class PosixDirectory : public Directory { - public: - explicit PosixDirectory(int fd) : fd_(fd) {} - ~PosixDirectory() { - close(fd_); - } - - virtual Status Fsync() override { - if (fsync(fd_) == -1) { - return IOError("directory", errno); - } - return Status::OK(); - } - - private: - int fd_; -}; - static int LockOrUnlock(const std::string& fname, int fd, bool lock) { mutex_lockedFiles.Lock(); if (lock) { @@ -806,12 +78,6 @@ static int LockOrUnlock(const std::string& fname, int fd, bool lock) { return value; } -class PosixFileLock : public FileLock { - public: - int fd_; - std::string filename; -}; - void PthreadCall(const char* label, int result) { if (result != 0) { fprintf(stderr, "pthread %s: %s\n", label, strerror(result)); @@ -819,6 +85,12 @@ void PthreadCall(const char* label, int result) { } } +class PosixFileLock : public FileLock { + public: + int fd_; + std::string filename; +}; + class PosixEnv : public Env { public: PosixEnv(); diff --git a/util/io_posix.cc b/util/io_posix.cc new file mode 100644 index 000000000..4c33fee85 --- /dev/null +++ b/util/io_posix.cc @@ -0,0 +1,614 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#ifdef ROCKSDB_LIB_IO_POSIX + +#include "posix/io_posix.h" + +namespace rocksdb { + +// A wrapper for fadvise, if the platform doesn't support fadvise, +// it will simply return Status::NotSupport. +int Fadvise(int fd, off_t offset, size_t len, int advice) { +#ifdef OS_LINUX + return posix_fadvise(fd, offset, len, advice); +#else + return 0; // simply do nothing. +#endif +} + +/* + * PosixSequentialFile + */ +PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* f, + const EnvOptions& options) + : filename_(fname), + file_(f), + fd_(fileno(f)), + use_os_buffer_(options.use_os_buffer) {} + +PosixSequentialFile::~PosixSequentialFile() { fclose(file_); } + +Status PosixSequentialFile::Read(size_t n, Slice* result, char* scratch) { + Status s; + size_t r = 0; + do { + r = fread_unlocked(scratch, 1, n, file_); + } while (r == 0 && ferror(file_) && errno == EINTR); + *result = Slice(scratch, r); + if (r < n) { + if (feof(file_)) { + // We leave status as ok if we hit the end of the file + // We also clear the error so that the reads can continue + // if a new data is written to the file + clearerr(file_); + } else { + // A partial read with an error: return a non-ok status + s = IOError(filename_, errno); + } + } + if (!use_os_buffer_) { + // we need to fadvise away the entire range of pages because + // we do not want readahead pages to be cached. 
+ Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages + } + return s; +} + +Status PosixSequentialFile::Skip(uint64_t n) { + if (fseek(file_, static_cast(n), SEEK_CUR)) { + return IOError(filename_, errno); + } + return Status::OK(); +} + +Status PosixSequentialFile::InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif +} + +#if defined(OS_LINUX) +namespace { +static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) { + if (max_size < kMaxVarint64Length * 3) { + return 0; + } + + struct stat buf; + int result = fstat(fd, &buf); + if (result == -1) { + return 0; + } + + long version = 0; + result = ioctl(fd, FS_IOC_GETVERSION, &version); + if (result == -1) { + return 0; + } + uint64_t uversion = (uint64_t)version; + + char* rid = id; + rid = EncodeVarint64(rid, buf.st_dev); + rid = EncodeVarint64(rid, buf.st_ino); + rid = EncodeVarint64(rid, uversion); + assert(rid >= id); + return static_cast(rid - id); +} +} +#endif + +/* + * PosixRandomAccessFile + * + * pread() based random-access + */ +PosixRandomAccessFile::PosixRandomAccessFile(const std::string& fname, int fd, + const EnvOptions& options) + : filename_(fname), fd_(fd), use_os_buffer_(options.use_os_buffer) { + assert(!options.use_mmap_reads || sizeof(void*) < 8); +} + +PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); } + +Status PosixRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + ssize_t r = -1; + size_t left = n; + char* ptr = scratch; + while (left > 0) { + r = pread(fd_, ptr, left, static_cast(offset)); + + if (r <= 0) { + if (errno == EINTR) { + continue; + } + break; + } + ptr += r; + offset += r; + left -= r; + } + + *result = Slice(scratch, (r < 0) ? 
0 : n - left); + if (r < 0) { + // An error: return a non-ok status + s = IOError(filename_, errno); + } + if (!use_os_buffer_) { + // we need to fadvise away the entire range of pages because + // we do not want readahead pages to be cached. + Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); // free OS pages + } + return s; +} + +#ifdef OS_LINUX +size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(fd_, id, max_size); +} +#endif + +void PosixRandomAccessFile::Hint(AccessPattern pattern) { + switch (pattern) { + case NORMAL: + Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL); + break; + case RANDOM: + Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM); + break; + case SEQUENTIAL: + Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); + break; + case WILLNEED: + Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED); + break; + case DONTNEED: + Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); + break; + default: + assert(false); + break; + } +} + +Status PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif +} + +/* + * PosixMmapReadableFile + * + * mmap() based random-access + */ +// base[0,length-1] contains the mmapped contents of the file. 
+PosixMmapReadableFile::PosixMmapReadableFile(const int fd, + const std::string& fname, + void* base, size_t length, + const EnvOptions& options) + : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { + fd_ = fd_ + 0; // suppress the warning for used variables + assert(options.use_mmap_reads); + assert(options.use_os_buffer); +} + +PosixMmapReadableFile::~PosixMmapReadableFile() { + int ret = munmap(mmapped_region_, length_); + if (ret != 0) { + fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n", + mmapped_region_, length_); + } +} + +Status PosixMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + if (offset > length_) { + *result = Slice(); + return IOError(filename_, EINVAL); + } else if (offset + n > length_) { + n = static_cast(length_ - offset); + } + *result = Slice(reinterpret_cast(mmapped_region_) + offset, n); + return s; +} + +Status PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif +} + +/* + * PosixMmapFile + * + * We preallocate up to an extra megabyte and use memcpy to append new + * data to the file. This is safe since we either properly close the + * file before reading from it, or for log files, the reading code + * knows enough to skip zero suffixes. 
+ */ +Status PosixMmapFile::UnmapCurrentRegion() { + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); + if (base_ != nullptr) { + int munmap_status = munmap(base_, limit_ - base_); + if (munmap_status != 0) { + return IOError(filename_, munmap_status); + } + file_offset_ += limit_ - base_; + base_ = nullptr; + limit_ = nullptr; + last_sync_ = nullptr; + dst_ = nullptr; + + // Increase the amount we map the next time, but capped at 1MB + if (map_size_ < (1 << 20)) { + map_size_ *= 2; + } + } + return Status::OK(); +} + +Status PosixMmapFile::MapNewRegion() { +#ifdef ROCKSDB_FALLOCATE_PRESENT + assert(base_ == nullptr); + + TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); + // we can't fallocate with FALLOC_FL_KEEP_SIZE here + if (allow_fallocate_) { + IOSTATS_TIMER_GUARD(allocate_nanos); + int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); + if (alloc_status != 0) { + // fallback to posix_fallocate + alloc_status = posix_fallocate(fd_, file_offset_, map_size_); + } + if (alloc_status != 0) { + return Status::IOError("Error allocating space to file : " + filename_ + + "Error : " + strerror(alloc_status)); + } + } + + TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds); + void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, + file_offset_); + if (ptr == MAP_FAILED) { + return Status::IOError("MMap failed on " + filename_); + } + TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds); + + base_ = reinterpret_cast(ptr); + limit_ = base_ + map_size_; + dst_ = base_; + last_sync_ = base_; + return Status::OK(); +#else + return Status::NotSupported("This platform doesn't support fallocate()"); +#endif +} + +Status PosixMmapFile::Msync() { + if (dst_ == last_sync_) { + return Status::OK(); + } + // Find the beginnings of the pages that contain the first and last + // bytes to be synced. 
+ size_t p1 = TruncateToPageBoundary(last_sync_ - base_); + size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); + last_sync_ = dst_; + TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds); + if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { + return IOError(filename_, errno); + } + return Status::OK(); +} + +PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size, + const EnvOptions& options) + : filename_(fname), + fd_(fd), + page_size_(page_size), + map_size_(Roundup(65536, page_size)), + base_(nullptr), + limit_(nullptr), + dst_(nullptr), + last_sync_(nullptr), + file_offset_(0) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + allow_fallocate_ = options.allow_fallocate; + fallocate_with_keep_size_ = options.fallocate_with_keep_size; +#endif + assert((page_size & (page_size - 1)) == 0); + assert(options.use_mmap_writes); +} + +PosixMmapFile::~PosixMmapFile() { + if (fd_ >= 0) { + PosixMmapFile::Close(); + } +} + +Status PosixMmapFile::Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + while (left > 0) { + assert(base_ <= dst_); + assert(dst_ <= limit_); + size_t avail = limit_ - dst_; + if (avail == 0) { + Status s = UnmapCurrentRegion(); + if (!s.ok()) { + return s; + } + s = MapNewRegion(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds); + } + + size_t n = (left <= avail) ? 
left : avail; + memcpy(dst_, src, n); + dst_ += n; + src += n; + left -= n; + } + return Status::OK(); +} + +Status PosixMmapFile::Close() { + Status s; + size_t unused = limit_ - dst_; + + s = UnmapCurrentRegion(); + if (!s.ok()) { + s = IOError(filename_, errno); + } else if (unused > 0) { + // Trim the extra space at the end of the file + if (ftruncate(fd_, file_offset_ - unused) < 0) { + s = IOError(filename_, errno); + } + } + + if (close(fd_) < 0) { + if (s.ok()) { + s = IOError(filename_, errno); + } + } + + fd_ = -1; + base_ = nullptr; + limit_ = nullptr; + return s; +} + +Status PosixMmapFile::Flush() { return Status::OK(); } + +Status PosixMmapFile::Sync() { + if (fdatasync(fd_) < 0) { + return IOError(filename_, errno); + } + + return Msync(); +} + +/** + * Flush data as well as metadata to stable storage. + */ +Status PosixMmapFile::Fsync() { + if (fsync(fd_) < 0) { + return IOError(filename_, errno); + } + + return Msync(); +} + +/** + * Get the size of valid data in the file. This will not match the + * size that is returned from the filesystem because we use mmap + * to extend file by map_size every time. + */ +uint64_t PosixMmapFile::GetFileSize() { + size_t used = dst_ - base_; + return file_offset_ + used; +} + +Status PosixMmapFile::InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif +} + +#ifdef ROCKSDB_FALLOCATE_PRESENT +Status PosixMmapFile::Allocate(off_t offset, off_t len) { + TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); + int alloc_status = 0; + if (allow_fallocate_) { + alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, offset, len); + } + if (alloc_status == 0) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } +} +#endif + +/* + * PosixWritableFile + * + * Use posix write to write data to a file. + */ +PosixWritableFile::PosixWritableFile(const std::string& fname, int fd, + const EnvOptions& options) + : filename_(fname), fd_(fd), filesize_(0) { +#ifdef ROCKSDB_FALLOCATE_PRESENT + allow_fallocate_ = options.allow_fallocate; + fallocate_with_keep_size_ = options.fallocate_with_keep_size; +#endif + assert(!options.use_mmap_writes); +} + +PosixWritableFile::~PosixWritableFile() { + if (fd_ >= 0) { + PosixWritableFile::Close(); + } +} + +Status PosixWritableFile::Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + while (left != 0) { + ssize_t done = write(fd_, src, left); + if (done < 0) { + if (errno == EINTR) { + continue; + } + return IOError(filename_, errno); + } + left -= done; + src += done; + } + filesize_ += data.size(); + return Status::OK(); +} + +Status PosixWritableFile::Close() { + Status s; + + size_t block_size; + size_t last_allocated_block; + GetPreallocationStatus(&block_size, &last_allocated_block); + if (last_allocated_block > 0) { + // trim the extra space preallocated at the end of the file + // NOTE(ljin): we probably don't want to surface failure as an IOError, + // but it will be nice to log these errors. + int dummy __attribute__((unused)); + dummy = ftruncate(fd_, filesize_); +#ifdef ROCKSDB_FALLOCATE_PRESENT + // in some file systems, ftruncate only trims trailing space if the + // new file size is smaller than the current size. Calling fallocate + // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused + // blocks. 
FALLOC_FL_PUNCH_HOLE is supported on at least the following + // filesystems: + // XFS (since Linux 2.6.38) + // ext4 (since Linux 3.0) + // Btrfs (since Linux 3.7) + // tmpfs (since Linux 3.5) + // We ignore error since failure of this operation does not affect + // correctness. + IOSTATS_TIMER_GUARD(allocate_nanos); + if (allow_fallocate_) { + fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, + block_size * last_allocated_block - filesize_); + } +#endif + } + + if (close(fd_) < 0) { + s = IOError(filename_, errno); + } + fd_ = -1; + return s; +} + +// write out the cached data to the OS cache +Status PosixWritableFile::Flush() { return Status::OK(); } + +Status PosixWritableFile::Sync() { + if (fdatasync(fd_) < 0) { + return IOError(filename_, errno); + } + return Status::OK(); +} + +Status PosixWritableFile::Fsync() { + if (fsync(fd_) < 0) { + return IOError(filename_, errno); + } + return Status::OK(); +} + +bool PosixWritableFile::IsSyncThreadSafe() const { return true; } + +uint64_t PosixWritableFile::GetFileSize() { return filesize_; } + +Status PosixWritableFile::InvalidateCache(size_t offset, size_t length) { +#ifndef OS_LINUX + return Status::OK(); +#else + // free OS pages + int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); + if (ret == 0) { + return Status::OK(); + } + return IOError(filename_, errno); +#endif +} + +#ifdef ROCKSDB_FALLOCATE_PRESENT +Status PosixWritableFile::Allocate(off_t offset, off_t len) { + TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); + IOSTATS_TIMER_GUARD(allocate_nanos); + int alloc_status = 0; + if (allow_fallocate_) { + alloc_status = fallocate( + fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, offset, len); + } + if (alloc_status == 0) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } +} + +Status PosixWritableFile::RangeSync(off_t offset, off_t nbytes) { + if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) { + return Status::OK(); + } else { + return IOError(filename_, errno); + } +} + +size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const { + return GetUniqueIdFromFile(fd_, id, max_size); +} +#endif +} // namespace rocksdb +#endif From 3c750b59aeb7ab826bfdcc14a9d758fb9d1b37ec Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Thu, 22 Oct 2015 15:15:37 -0700 Subject: [PATCH 23/26] No need to #ifdef test only code on windows --- db/column_family_test.cc | 7 ------- db/compaction_job_stats_test.cc | 2 +- db/db_compaction_filter_test.cc | 4 ---- db/db_compaction_test.cc | 6 +++--- db/db_dynamic_level_test.cc | 6 +++--- db/db_iter_test.cc | 3 --- db/db_log_iter_test.cc | 6 +++--- db/db_table_properties_test.cc | 4 ---- db/db_tailing_iter_test.cc | 6 +++--- db/db_test.cc | 7 ------- db/db_test_util.cc | 3 --- db/db_universal_compaction_test.cc | 6 +++--- db/db_wal_test.cc | 8 -------- db/fault_injection_test.cc | 8 -------- utilities/checkpoint/checkpoint_test.cc | 8 -------- 15 files changed, 16 insertions(+), 68 deletions(-) diff --git a/db/column_family_test.cc b/db/column_family_test.cc index a258b83df..938c4121a 100644 --- a/db/column_family_test.cc +++ b/db/column_family_test.cc @@ -23,8 +23,6 @@ #include "util/sync_point.h" #include "utilities/merge_operators.h" -#if !(defined NDEBUG) || !defined(OS_WIN) - namespace rocksdb { namespace { @@ -1262,13 +1260,8 @@ TEST_F(ColumnFamilyTest, FlushAndDropRaceCondition) { } } // namespace rocksdb -#endif int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/db/compaction_job_stats_test.cc 
b/db/compaction_job_stats_test.cc index d6a82e18d..c7d8eb821 100644 --- a/db/compaction_job_stats_test.cc +++ b/db/compaction_job_stats_test.cc @@ -64,7 +64,7 @@ #include "util/xfunc.h" #include "utilities/merge_operators.h" -#if !defined(IOS_CROSS_COMPILE) && (!defined(NDEBUG) || !defined(OS_WIN)) +#if !defined(IOS_CROSS_COMPILE) #ifndef ROCKSDB_LITE namespace rocksdb { diff --git a/db/db_compaction_filter_test.cc b/db/db_compaction_filter_test.cc index 7535ea53a..e99db81f7 100644 --- a/db/db_compaction_filter_test.cc +++ b/db/db_compaction_filter_test.cc @@ -625,11 +625,7 @@ TEST_F(DBTestCompactionFilter, CompactionFilterSnapshot) { } // namespace rocksdb int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/db/db_compaction_test.cc b/db/db_compaction_test.cc index be53fdfde..1ff946966 100644 --- a/db/db_compaction_test.cc +++ b/db/db_compaction_test.cc @@ -14,7 +14,7 @@ namespace rocksdb { // SYNC_POINT is not supported in released Windows mode. 
-#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) class DBCompactionTest : public DBTestBase { public: @@ -1852,11 +1852,11 @@ TEST_P(DBCompactionTestWithParam, ForceBottommostLevelCompaction) { INSTANTIATE_TEST_CASE_P(DBCompactionTestWithParam, DBCompactionTestWithParam, ::testing::Values(1, 4)); -#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#endif // !defined(ROCKSDB_LITE) } // namespace rocksdb int main(int argc, char** argv) { -#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/db/db_dynamic_level_test.cc b/db/db_dynamic_level_test.cc index 03e632a77..f29985e05 100644 --- a/db/db_dynamic_level_test.cc +++ b/db/db_dynamic_level_test.cc @@ -10,7 +10,7 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. 
// which is a pity, it is a good test -#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" #include "port/stack_trace.h" @@ -484,10 +484,10 @@ TEST_F(DBTestDynamicLevel, MigrateToDynamicLevelMaxBytesBase) { } } // namespace rocksdb -#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/db/db_iter_test.cc b/db/db_iter_test.cc index ed5c28bae..f1e3324d8 100644 --- a/db/db_iter_test.cc +++ b/db/db_iter_test.cc @@ -1943,8 +1943,6 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIterator2) { ASSERT_EQ(db_iter_->value().ToString(), "4"); } -#if !(defined NDEBUG) || !defined(OS_WIN) - TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace1) { // Test Prev() when one child iterator is at its end but more rows // are added. @@ -2295,7 +2293,6 @@ TEST_F(DBIterWithMergeIterTest, InnerMergeIteratorDataRace8) { rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } -#endif // #if !(defined NDEBUG) || !defined(OS_WIN) } // namespace rocksdb int main(int argc, char** argv) { diff --git a/db/db_log_iter_test.cc b/db/db_log_iter_test.cc index cb5ccdc26..64a74e02e 100644 --- a/db/db_log_iter_test.cc +++ b/db/db_log_iter_test.cc @@ -10,7 +10,7 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. 
// which is a pity, it is a good test -#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" #include "port/stack_trace.h" @@ -277,10 +277,10 @@ TEST_F(DBTestXactLogIterator, TransactionLogIteratorBlobs) { } } // namespace rocksdb -#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/db/db_table_properties_test.cc b/db/db_table_properties_test.cc index f1f4558c5..29afa0d9e 100644 --- a/db/db_table_properties_test.cc +++ b/db/db_table_properties_test.cc @@ -207,11 +207,7 @@ TEST_F(DBTablePropertiesTest, GetPropertiesOfTablesInRange) { #endif // ROCKSDB_LITE int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/db/db_tailing_iter_test.cc b/db/db_tailing_iter_test.cc index 87e4f1cab..3be5e953c 100644 --- a/db/db_tailing_iter_test.cc +++ b/db/db_tailing_iter_test.cc @@ -10,7 +10,7 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. 
// which is a pity, it is a good test -#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) #include "db/db_test_util.h" #include "db/forward_iterator.h" @@ -646,10 +646,10 @@ TEST_F(DBTestTailingIterator, ManagedTailingIteratorSeekToSame) { } // namespace rocksdb -#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/db/db_test.cc b/db/db_test.cc index 93f355185..a2f8ea587 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -10,8 +10,6 @@ // Introduction of SyncPoint effectively disabled building and running this test // in Release build. // which is a pity, it is a good test -#if !(defined NDEBUG) || !defined(OS_WIN) - #include #include #include @@ -10045,14 +10043,9 @@ INSTANTIATE_TEST_CASE_P(BloomStatsTestWithParam, BloomStatsTestWithParam, #endif // ROCKSDB_LITE } // namespace rocksdb -#endif int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/db/db_test_util.cc b/db/db_test_util.cc index ab0ab4d69..06e536576 100644 --- a/db/db_test_util.cc +++ b/db/db_test_util.cc @@ -58,12 +58,9 @@ DBTestBase::DBTestBase(const std::string path) } DBTestBase::~DBTestBase() { -// SyncPoint is not supported in Released Windows Mode. 
-#if !(defined NDEBUG) || !defined(OS_WIN) rocksdb::SyncPoint::GetInstance()->DisableProcessing(); rocksdb::SyncPoint::GetInstance()->LoadDependency({}); rocksdb::SyncPoint::GetInstance()->ClearAllCallBacks(); -#endif // !(defined NDEBUG) || !defined(OS_WIN) Close(); Options options; options.db_paths.emplace_back(dbname_, 0); diff --git a/db/db_universal_compaction_test.cc b/db/db_universal_compaction_test.cc index 3323afe31..ec4ec32b8 100644 --- a/db/db_universal_compaction_test.cc +++ b/db/db_universal_compaction_test.cc @@ -9,7 +9,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" -#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) #include "util/sync_point.h" namespace rocksdb { @@ -1210,10 +1210,10 @@ INSTANTIATE_TEST_CASE_P(DBTestUniversalManualCompactionOutputPathId, } // namespace rocksdb -#endif // (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#endif // !defined(ROCKSDB_LITE) int main(int argc, char** argv) { -#if (!(defined NDEBUG) || !defined(OS_WIN)) && !defined(ROCKSDB_LITE) +#if !defined(ROCKSDB_LITE) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/db/db_wal_test.cc b/db/db_wal_test.cc index 7cbaacd61..9e8a19dce 100644 --- a/db/db_wal_test.cc +++ b/db/db_wal_test.cc @@ -9,9 +9,7 @@ #include "db/db_test_util.h" #include "port/stack_trace.h" -#if !(defined NDEBUG) || !defined(OS_WIN) #include "util/sync_point.h" -#endif namespace rocksdb { class DBWALTest : public DBTestBase { @@ -70,7 +68,6 @@ TEST_F(DBWALTest, RollLog) { } while (ChangeOptions()); } -#if !(defined NDEBUG) || !defined(OS_WIN) TEST_F(DBWALTest, SyncWALNotBlockWrite) { Options options = CurrentOptions(); options.max_write_buffer_number = 4; @@ -130,15 +127,10 @@ TEST_F(DBWALTest, SyncWALNotWaitWrite) { ASSERT_EQ(Get("foo2"), "bar2"); rocksdb::SyncPoint::GetInstance()->DisableProcessing(); } -#endif } // namespace rocksdb int main(int 
argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 84a6e9a52..03d829b20 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -11,8 +11,6 @@ // the last "sync". It then checks for data loss errors by purposely dropping // file data (or entire files) not protected by a "sync". -#if !(defined NDEBUG) || !defined(OS_WIN) - #include #include #include "db/db_impl.h" @@ -902,13 +900,7 @@ INSTANTIATE_TEST_CASE_P(FaultTest, FaultInjectionTest, ::testing::Bool()); } // namespace rocksdb -#endif // #if !(defined NDEBUG) || !defined(OS_WIN) - int main(int argc, char** argv) { -#if !(defined NDEBUG) || !defined(OS_WIN) ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } diff --git a/utilities/checkpoint/checkpoint_test.cc b/utilities/checkpoint/checkpoint_test.cc index 5cd72ea64..27c1beb5f 100644 --- a/utilities/checkpoint/checkpoint_test.cc +++ b/utilities/checkpoint/checkpoint_test.cc @@ -10,8 +10,6 @@ // Syncpoint prevents us building and running tests in release #ifndef ROCKSDB_LITE -#if !defined(NDEBUG) || !defined(OS_WIN) - #ifndef OS_WIN #include #endif @@ -350,16 +348,10 @@ TEST_F(DBTest, CheckpointCF) { } // namespace rocksdb -#endif - int main(int argc, char** argv) { -#if !defined(NDEBUG) || !defined(OS_WIN) rocksdb::port::InstallStackTraceHandler(); ::testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); -#else - return 0; -#endif } #else From d6911111464b3ddc35405200c4f9d77045f9110a Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 23 Oct 2015 07:36:22 -0700 Subject: [PATCH 24/26] include/posix/io_posix.h should have a once declaration Summary: include/posix/io_posix.h does not prevent multiple includes. Need to fix it. It is also breaking unity build. 
Test Plan: Run unity build and see error go away. Reviewers: rven, igor, IslamAbdelRahman, kradhakrishnan, anthony Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D49281 --- include/posix/io_posix.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/posix/io_posix.h b/include/posix/io_posix.h index aaa6b4048..65645b0ff 100644 --- a/include/posix/io_posix.h +++ b/include/posix/io_posix.h @@ -6,6 +6,7 @@ // Copyright (c) 2011 The LevelDB Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once #include #include #include From 581f20fd8b99fe9d31d20e6094316143c47b8adf Mon Sep 17 00:00:00 2001 From: Islam AbdelRahman Date: Mon, 26 Oct 2015 11:50:29 -0700 Subject: [PATCH 25/26] Add LITE tests to Legocastle Summary: Update rocksdb-lego-determinator to include running make check under ROCKSDB_LITE Test Plan: will be tested after landing in fbcode Reviewers: sdong, yhchiang, igor, kradhakrishnan Reviewed By: kradhakrishnan Subscribers: dhruba Differential Revision: https://reviews.facebook.net/D49065 --- build_tools/rocksdb-lego-determinator | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/build_tools/rocksdb-lego-determinator b/build_tools/rocksdb-lego-determinator index b48cb65b7..a02b62cb1 100755 --- a/build_tools/rocksdb-lego-determinator +++ b/build_tools/rocksdb-lego-determinator @@ -263,6 +263,26 @@ LITE_BUILD_COMMANDS="[ } ]" +# +# RocksDB lite tests +# +LITE_UNIT_TEST_COMMANDS="[ + { + 'name':'Rocksdb Lite Unit Test', + 'oncall':'$ONCALL', + 'steps': [ + $CLEANUP_ENV, + { + 'name':'Build RocksDB debug version', + 'shell':'$SHM $LITE $DEBUG make J=1 check', + 'user':'root', + $PARSER + }, + ], + $REPORT + } +]" + # # RocksDB stress/crash test # @@ -575,6 +595,9 @@ case $1 in lite) echo $LITE_BUILD_COMMANDS ;; + lite_test) + echo $LITE_UNIT_TEST_COMMANDS + ;; 
stress_crash) echo $STRESS_CRASH_TEST_COMMANDS ;; From 44d4057d785a92ea94c03746535ad3a4fb824a0f Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 23 Oct 2015 09:16:46 -0700 Subject: [PATCH 26/26] Avoid some includes in io_posix.h Summary: IO Posix depends on too many .h files. Move most of them to .cc files. Test Plan: make all Reviewers: anthony, rven, IslamAbdelRahman, yhchiang, kradhakrishnan, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D49479 --- include/posix/io_posix.h | 35 ----------------------------------- util/env_posix.cc | 38 +++++++++++++++++++++++++++++++++++--- util/io_posix.cc | 23 +++++++++++++++++++++++ 3 files changed, 58 insertions(+), 38 deletions(-) diff --git a/include/posix/io_posix.h b/include/posix/io_posix.h index 65645b0ff..6e35e8939 100644 --- a/include/posix/io_posix.h +++ b/include/posix/io_posix.h @@ -7,43 +7,8 @@ // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. #pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#ifdef OS_LINUX -#include -#include -#endif -#if defined(OS_LINUX) -#include -#endif #include "rocksdb/env.h" -#include "util/sync_point.h" -#include "util/iostats_context_imp.h" -#include "util/coding.h" -#include "rocksdb/slice.h" -#include "port/port.h" -#include "util/logging.h" -#include "util/posix_logger.h" -#include "util/random.h" -#include "util/string_util.h" -#include "util/thread_status_updater.h" -#include "util/thread_status_util.h" // For non linux platform, the following macros are used only as place // holder. diff --git a/util/env_posix.cc b/util/env_posix.cc index 1e04ac63e..a434b8216 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -6,9 +6,28 @@ // Copyright (c) 2011 The LevelDB Authors. 
All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. See the AUTHORS file for names of contributors. - -#include "posix/io_posix.h" - +#include +#include +#include +#if defined(OS_LINUX) +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef OS_LINUX +#include +#include +#endif +#include +#include +#include +#include // Get nano time includes #if defined(OS_LINUX) || defined(OS_FREEBSD) #elif defined(__MACH__) @@ -17,6 +36,19 @@ #else #include #endif +#include +#include +#include "port/port.h" +#include "posix/io_posix.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "util/iostats_context_imp.h" +#include "util/logging.h" +#include "util/posix_logger.h" +#include "util/string_util.h" +#include "util/sync_point.h" +#include "util/thread_status_updater.h" +#include "util/thread_status_util.h" #if !defined(TMPFS_MAGIC) #define TMPFS_MAGIC 0x01021994 diff --git a/util/io_posix.cc b/util/io_posix.cc index 4c33fee85..a8144324a 100644 --- a/util/io_posix.cc +++ b/util/io_posix.cc @@ -10,6 +10,29 @@ #ifdef ROCKSDB_LIB_IO_POSIX #include "posix/io_posix.h" +#include +#include +#if defined(OS_LINUX) +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#ifdef OS_LINUX +#include +#include +#endif +#include "port/port.h" +#include "rocksdb/slice.h" +#include "util/coding.h" +#include "util/iostats_context_imp.h" +#include "util/posix_logger.h" +#include "util/string_util.h" +#include "util/sync_point.h" namespace rocksdb {