From 6e9fbeb27c38329f33ae541302c44c8db8374f8c Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 17 Jul 2015 16:16:11 -0700 Subject: [PATCH] Move rate_limiter, write buffering, most perf context instrumentation and most random kill out of Env Summary: We want to keep Env a think layer for better portability. Less platform dependent codes should be moved out of Env. In this patch, I create a wrapper of file readers and writers, and put rate limiting, write buffering, as well as most perf context instrumentation and random kill out of Env. It will make it easier to maintain multiple Env in the future. Test Plan: Run all existing unit tests. Reviewers: anthony, kradhakrishnan, IslamAbdelRahman, yhchiang, igor Reviewed By: igor Subscribers: leveldb, dhruba Differential Revision: https://reviews.facebook.net/D42321 --- db/builder.cc | 38 ++-- db/builder.h | 4 +- db/compaction_job.cc | 22 +- db/compaction_job_test.cc | 5 +- db/db_impl.cc | 84 +++++--- db/db_test.cc | 13 +- db/fault_injection_test.cc | 4 +- db/filename.cc | 9 +- db/filename.h | 4 +- db/flush_job_test.cc | 5 +- db/log_reader.cc | 3 +- db/log_reader.h | 8 +- db/log_test.cc | 52 +++-- db/log_writer.cc | 6 +- db/log_writer.h | 10 +- db/plain_table_db_test.cc | 8 +- db/repair.cc | 15 +- db/table_cache.cc | 5 +- db/table_properties_collector_test.cc | 42 ++-- db/transaction_log_impl.cc | 19 +- db/transaction_log_impl.h | 3 +- db/version_set.cc | 88 +++++--- db/wal_manager.cc | 5 +- db/wal_manager_test.cc | 9 +- include/rocksdb/env.h | 23 +- include/rocksdb/table.h | 8 +- port/port_posix.cc | 12 +- port/port_posix.h | 3 +- port/win/env_win.cc | 39 +--- src.mk | 1 + table/adaptive_table_factory.cc | 7 +- table/adaptive_table_factory.h | 13 +- table/block_based_table_builder.cc | 8 +- table/block_based_table_builder.h | 2 +- table/block_based_table_factory.cc | 4 +- table/block_based_table_factory.h | 8 +- table/block_based_table_reader.cc | 20 +- table/block_based_table_reader.h | 5 +- table/cuckoo_table_builder.cc | 3 +- table/cuckoo_table_builder.h | 15 +- table/cuckoo_table_builder_test.cc | 114 ++++++---- table/cuckoo_table_factory.cc | 9 +- table/cuckoo_table_factory.h | 13 +- table/cuckoo_table_reader.cc | 3 +- table/cuckoo_table_reader.h | 14 +- table/cuckoo_table_reader_test.cc | 66 +++--- table/format.cc | 11 +- table/format.h | 5 +- table/meta_blocks.cc | 12 +- table/meta_blocks.h | 13 +- table/mock_table.cc | 9 +- table/mock_table.h | 13 +- table/plain_table_builder.cc | 10 +- table/plain_table_builder.h | 11 +- table/plain_table_factory.cc | 13 +- table/plain_table_factory.h | 13 +- table/plain_table_key_coding.cc | 6 +- table/plain_table_key_coding.h | 2 +- table/plain_table_reader.cc | 4 +- table/plain_table_reader.h | 9 +- table/table_reader_bench.cc | 15 +- table/table_test.cc | 48 +++-- util/db_test_util.h | 3 + util/env_posix.cc | 238 ++------------------- util/env_test.cc | 7 +- util/file_reader_writer.cc | 225 +++++++++++++++++++ util/file_reader_writer.h | 109 ++++++++++ util/file_util.cc | 14 +- util/ldb_cmd.cc | 14 +- util/sst_dump_test.cc | 8 +- util/sst_dump_tool.cc | 18 +- util/sst_dump_tool_imp.h | 7 +- util/sync_point.cc | 15 ++ util/sync_point.h | 28 +++ util/testutil.cc | 16 ++ util/testutil.h | 8 + utilities/backupable/backupable_db.cc | 32 ++- utilities/backupable/backupable_db_test.cc | 6 +- 78 files changed, 1079 insertions(+), 714 deletions(-) create mode 100644 util/file_reader_writer.cc create mode 100644 util/file_reader_writer.h diff --git a/db/builder.cc b/db/builder.cc index a04075ba6..2c3d6842d 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -21,6 +21,7 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block_based_table_builder.h" +#include "util/file_reader_writer.h" #include "util/iostats_context_imp.h" #include "util/thread_status_util.h" #include "util/stop_watch.h" @@ -34,7 +35,7 @@ TableBuilder* NewTableBuilder( const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, const CompressionType compression_type, + WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, const bool skip_filters) { return ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, internal_comparator, @@ -72,16 +73,22 @@ Status BuildTable( std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); if (iter->Valid()) { - unique_ptr file; - s = env->NewWritableFile(fname, &file, env_options); - if (!s.ok()) { - return s; - } - file->SetIOPriority(io_priority); + TableBuilder* builder; + unique_ptr file_writer; + { + unique_ptr file; + s = env->NewWritableFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + file->SetIOPriority(io_priority); - TableBuilder* builder = NewTableBuilder( - ioptions, internal_comparator, int_tbl_prop_collector_factories, - file.get(), compression, compression_opts); + file_writer.reset(new WritableFileWriter(std::move(file), env_options)); + + builder = NewTableBuilder( + ioptions, internal_comparator, int_tbl_prop_collector_factories, + file_writer.get(), compression, compression_opts); + } { // the first key is the smallest key @@ -232,16 +239,11 @@ Status BuildTable( // Finish and check for file errors if (s.ok() && !ioptions.disable_data_sync) { - if (ioptions.use_fsync) { - StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); - s = file->Fsync(); - } else { - StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); - s = file->Sync(); - } + StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); + file_writer->Sync(ioptions.use_fsync); } if (s.ok()) { - s = file->Close(); + s = file_writer->Close(); } if (s.ok()) { diff --git a/db/builder.h b/db/builder.h index 9d2888dee..e1f625c8b 100644 --- a/db/builder.h +++ b/db/builder.h @@ -29,14 +29,14 @@ class Iterator; class TableCache; class VersionEdit; class TableBuilder; -class WritableFile; +class WritableFileWriter; TableBuilder* NewTableBuilder( const ImmutableCFOptions& options, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, const CompressionType compression_type, + WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, const bool skip_filters = false); diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 0edf984d5..1f924e9bb 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -44,6 +44,7 @@ #include "table/table_builder.h" #include "table/two_level_iterator.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/logging.h" #include "util/log_buffer.h" #include "util/mutexlock.h" @@ -71,7 +72,7 @@ struct CompactionJob::CompactionState { std::vector outputs; // State kept for output being generated - std::unique_ptr outfile; + std::unique_ptr outfile; std::unique_ptr builder; uint64_t total_bytes; @@ -662,13 +663,8 @@ Status CompactionJob::FinishCompactionOutputFile(const Status& input_status) { // Finish and check for file errors if (s.ok() && !db_options_.disableDataSync) { - if (db_options_.use_fsync) { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - s = compact_->outfile->Fsync(); - } else { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - s = compact_->outfile->Sync(); - } + StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + s = compact_->outfile->Sync(db_options_.use_fsync); } if (s.ok()) { s = compact_->outfile->Close(); @@ -799,10 +795,10 @@ Status CompactionJob::OpenCompactionOutputFile() { // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); // Make the output file + unique_ptr writable_file; std::string fname = TableFileName(db_options_.db_paths, file_number, compact_->compaction->output_path_id()); - Status s = env_->NewWritableFile(fname, &compact_->outfile, env_options_); - + Status s = env_->NewWritableFile(fname, &writable_file, env_options_); if (!s.ok()) { Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64 @@ -820,9 +816,11 @@ Status CompactionJob::OpenCompactionOutputFile() { out.smallest_seqno = out.largest_seqno = 0; compact_->outputs.push_back(out); - compact_->outfile->SetIOPriority(Env::IO_LOW); - compact_->outfile->SetPreallocationBlockSize( + writable_file->SetIOPriority(Env::IO_LOW); + writable_file->SetPreallocationBlockSize( static_cast(compact_->compaction->OutputFilePreallocationSize())); + compact_->outfile.reset( + new WritableFileWriter(std::move(writable_file), env_options_)); ColumnFamilyData* cfd = compact_->compaction->column_family_data(); bool skip_filters = false; diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 40ac27e22..ab716c068 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -13,6 +13,7 @@ #include "rocksdb/cache.h" #include "rocksdb/options.h" #include "rocksdb/db.h" +#include "util/file_reader_writer.h" #include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" @@ -166,8 +167,10 @@ class CompactionJobTest : public testing::Test { Status s = env_->NewWritableFile( manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), EnvOptions())); { - log::Writer log(std::move(file)); + log::Writer log(std::move(file_writer)); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); diff --git a/db/db_impl.cc b/db/db_impl.cc index b681514d8..030f182d1 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -72,6 +72,7 @@ #include "util/compression.h" #include "util/crc32c.h" #include "util/db_info_dumper.h" +#include "util/file_reader_writer.h" #include "util/file_util.h" #include "util/hash_skiplist_rep.h" #include "util/hash_linklist_rep.h" @@ -384,18 +385,22 @@ Status DBImpl::NewDB() { new_db.SetNextFile(2); new_db.SetLastSequence(0); + Status s; + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); - unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); - if (!s.ok()) { - return s; - } - file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); { - log::Writer log(std::move(file)); + unique_ptr file; + EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_); + s = env_->NewWritableFile(manifest, &file, env_options); + if (!s.ok()) { + return s; + } + file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); + log::Writer log(std::move(file_writer)); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); @@ -1013,17 +1018,21 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, versions_->MarkFileNumberUsedDuringRecovery(log_number); // Open the log file std::string fname = LogFileName(db_options_.wal_dir, log_number); - unique_ptr file; - status = env_->NewSequentialFile(fname, &file, env_options_); - if (!status.ok()) { - MaybeIgnoreError(&status); + unique_ptr file_reader; + { + unique_ptr file; + status = env_->NewSequentialFile(fname, &file, env_options_); if (!status.ok()) { - return status; - } else { - // Fail with one log file, but that's ok. - // Try next one. - continue; + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Fail with one log file, but that's ok. + // Try next one. + continue; + } } + file_reader.reset(new SequentialFileReader(std::move(file))); } // Create the log reader. @@ -1042,7 +1051,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // paranoid_checks==false so that corruptions cause entire commits // to be skipped instead of propagating bad information (like overly // large sequence numbers). - log::Reader reader(std::move(file), &reporter, true /*checksum*/, + log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, 0 /*initial_offset*/); Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number, @@ -3490,11 +3499,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (status.ok() && write_options.sync) { RecordTick(stats_, WAL_FILE_SYNCED); StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); - if (db_options_.use_fsync) { - status = log_->file()->Fsync(); - } else { - status = log_->file()->Sync(); - } + status = log_->file()->Sync(db_options_.use_fsync); if (status.ok() && !log_dir_synced_) { // We only sync WAL directory the first time WAL syncing is // requested, so that in case users never turn on WAL sync, @@ -3624,15 +3629,19 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { Status s; { if (creating_new_log) { + EnvOptions opt_env_opt = + env_->OptimizeForLogWrite(env_options_, db_options_); s = env_->NewWritableFile( LogFileName(db_options_.wal_dir, new_log_number), &lfile, - env_->OptimizeForLogWrite(env_options_, db_options_)); + opt_env_opt); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. lfile->SetPreallocationBlockSize( 1.1 * mutable_cf_options.write_buffer_size); - new_log = new log::Writer(std::move(lfile)); + unique_ptr file_writer( + new WritableFileWriter(std::move(lfile), opt_env_opt)); + new_log = new log::Writer(std::move(file_writer)); log_dir_synced_ = false; } } @@ -4031,12 +4040,18 @@ Status DBImpl::CheckConsistency() { Status DBImpl::GetDbIdentity(std::string& identity) const { std::string idfilename = IdentityFileName(dbname_); - unique_ptr idfile; const EnvOptions soptions; - Status s = env_->NewSequentialFile(idfilename, &idfile, soptions); - if (!s.ok()) { - return s; + unique_ptr id_file_reader; + Status s; + { + unique_ptr idfile; + s = env_->NewSequentialFile(idfilename, &idfile, soptions); + if (!s.ok()) { + return s; + } + id_file_reader.reset(new SequentialFileReader(std::move(idfile))); } + uint64_t file_size; s = env_->GetFileSize(idfilename, &file_size); if (!s.ok()) { @@ -4044,7 +4059,7 @@ Status DBImpl::GetDbIdentity(std::string& identity) const { } char* buffer = reinterpret_cast(alloca(file_size)); Slice id; - s = idfile->Read(static_cast(file_size), &id, buffer); + s = id_file_reader->Read(static_cast(file_size), &id, buffer); if (!s.ok()) { return s; } @@ -4176,14 +4191,17 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; EnvOptions soptions(db_options); + EnvOptions opt_env_options = + impl->db_options_.env->OptimizeForLogWrite(soptions, impl->db_options_); s = impl->db_options_.env->NewWritableFile( LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile, - impl->db_options_.env->OptimizeForLogWrite(soptions, - impl->db_options_)); + opt_env_options); if (s.ok()) { lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); impl->logfile_number_ = new_log_number; - impl->log_.reset(new log::Writer(std::move(lfile))); + unique_ptr file_writer( + new WritableFileWriter(std::move(lfile), opt_env_options)); + impl->log_.reset(new log::Writer(std::move(file_writer))); // set column family handles for (auto cf : column_families) { diff --git a/db/db_test.cc b/db/db_test.cc index c732c9fb8..82f393b85 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -52,6 +52,7 @@ #include "table/mock_table.h" #include "table/plain_table_factory.h" #include "util/db_test_util.h" +#include "util/file_reader_writer.h" #include "util/hash.h" #include "util/hash_linklist_rep.h" #include "utilities/merge_operators.h" @@ -6008,7 +6009,9 @@ class RecoveryTestHelper { std::string fname = LogFileName(test->dbname_, current_log_number); unique_ptr file; ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); - current_log_writer.reset(new log::Writer(std::move(file))); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); + current_log_writer.reset(new log::Writer(std::move(file_writer))); for (int i = 0; i < kKeysPerWALFile; i++) { std::string key = "key" + ToString(count++); @@ -7231,8 +7234,7 @@ TEST_F(DBTest, RateLimitingTest) { } elapsed = env_->NowMicros() - start; Close(); - ASSERT_TRUE(options.rate_limiter->GetTotalBytesThrough() == - env_->bytes_written_); + ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio); ASSERT_TRUE(ratio < 0.8); @@ -7251,11 +7253,10 @@ TEST_F(DBTest, RateLimitingTest) { } elapsed = env_->NowMicros() - start; Close(); - ASSERT_TRUE(options.rate_limiter->GetTotalBytesThrough() == - env_->bytes_written_); + ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio); - ASSERT_TRUE(ratio < 0.6); + ASSERT_LT(ratio, 0.6); } namespace { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 845dff8b9..88366d055 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -130,7 +130,7 @@ struct FileState { } // anonymous namespace -// A wrapper around WritableFile which informs another Env whenever this file +// A wrapper around WritableFileWriter* file // is written to or sync'ed. class TestWritableFile : public WritableFile { public: @@ -197,7 +197,7 @@ class FaultInjectionTestEnv : public EnvWrapper { Status s = target()->NewWritableFile(fname, result, soptions); if (s.ok()) { result->reset(new TestWritableFile(fname, std::move(*result), this)); - // WritableFile doesn't append to files, so if the same file is opened + // WritableFileWriter* file is opened // again then it will be truncated - so forget our saved state. UntrackFile(fname); MutexLock l(&mutex_); diff --git a/db/filename.cc b/db/filename.cc index c639bee20..f445712e1 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -18,6 +18,7 @@ #include #include "db/dbformat.h" #include "rocksdb/env.h" +#include "util/file_reader_writer.h" #include "util/logging.h" #include "util/stop_watch.h" @@ -328,15 +329,13 @@ Status SetIdentityFile(Env* env, const std::string& dbname) { return s; } -Status SyncManifest(Env* env, const DBOptions* db_options, WritableFile* file) { +Status SyncManifest(Env* env, const DBOptions* db_options, + WritableFileWriter* file) { if (db_options->disableDataSync) { return Status::OK(); - } else if (db_options->use_fsync) { - StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); - return file->Fsync(); } else { StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); - return file->Sync(); + return file->Sync(db_options->use_fsync); } } diff --git a/db/filename.h b/db/filename.h index 36562de93..6425eb955 100644 --- a/db/filename.h +++ b/db/filename.h @@ -25,7 +25,7 @@ namespace rocksdb { class Env; class Directory; -class WritableFile; +class WritableFileWriter; enum FileType { kLogFile, @@ -140,6 +140,6 @@ extern Status SetIdentityFile(Env* env, const std::string& dbname); // Sync manifest file `file`. extern Status SyncManifest(Env* env, const DBOptions* db_options, - WritableFile* file); + WritableFileWriter* file); } // namespace rocksdb diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 75590ec65..7e8fb8f51 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -11,6 +11,7 @@ #include "db/version_set.h" #include "db/writebuffer.h" #include "rocksdb/cache.h" +#include "util/file_reader_writer.h" #include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" @@ -56,8 +57,10 @@ class FlushJobTest : public testing::Test { Status s = env_->NewWritableFile( manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), EnvOptions())); { - log::Writer log(std::move(file)); + log::Writer log(std::move(file_writer)); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); diff --git a/db/log_reader.cc b/db/log_reader.cc index f3fdc18f3..296f1d50c 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -13,6 +13,7 @@ #include "rocksdb/env.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" namespace rocksdb { namespace log { @@ -20,7 +21,7 @@ namespace log { Reader::Reporter::~Reporter() { } -Reader::Reader(unique_ptr&& _file, Reporter* reporter, +Reader::Reader(unique_ptr&& _file, Reporter* reporter, bool checksum, uint64_t initial_offset) : file_(std::move(_file)), reporter_(reporter), diff --git a/db/log_reader.h b/db/log_reader.h index e6cbf47ac..390696b85 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -17,7 +17,7 @@ namespace rocksdb { -class SequentialFile; +class SequentialFileReader; using std::unique_ptr; namespace log { @@ -51,7 +51,7 @@ class Reader { // // The Reader will start reading at the first record located at physical // position >= initial_offset within the file. - Reader(unique_ptr&& file, Reporter* reporter, + Reader(unique_ptr&& file, Reporter* reporter, bool checksum, uint64_t initial_offset); ~Reader(); @@ -81,10 +81,10 @@ class Reader { // block that was partially read. void UnmarkEOF(); - SequentialFile* file() { return file_.get(); } + SequentialFileReader* file() { return file_.get(); } private: - const unique_ptr file_; + const unique_ptr file_; Reporter* const reporter_; bool const checksum_; char* const backing_store_; diff --git a/db/log_test.cc b/db/log_test.cc index 74715acde..168936e98 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -12,8 +12,10 @@ #include "rocksdb/env.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" #include "util/random.h" #include "util/testharness.h" +#include "util/testutil.h" namespace rocksdb { namespace log { @@ -163,26 +165,27 @@ class LogTest : public testing::Test { }; std::string& dest_contents() { - auto dest = dynamic_cast(writer_.file()); + auto dest = dynamic_cast(writer_.file()->writable_file()); assert(dest); return dest->contents_; } const std::string& dest_contents() const { - auto dest = dynamic_cast(writer_.file()); + auto dest = + dynamic_cast(writer_.file()->writable_file()); assert(dest); return dest->contents_; } void reset_source_contents() { - auto src = dynamic_cast(reader_.file()); + auto src = dynamic_cast(reader_.file()->file()); assert(src); src->contents_ = dest_contents(); } Slice reader_contents_; - unique_ptr dest_holder_; - unique_ptr source_holder_; + unique_ptr dest_holder_; + unique_ptr source_holder_; ReportCollector report_; Writer writer_; Reader reader_; @@ -192,13 +195,15 @@ class LogTest : public testing::Test { static uint64_t initial_offset_last_record_offsets_[]; public: - LogTest() : reader_contents_(), - dest_holder_(new StringDest(reader_contents_)), - source_holder_(new StringSource(reader_contents_)), - writer_(std::move(dest_holder_)), - reader_(std::move(source_holder_), &report_, true/*checksum*/, - 0/*initial_offset*/) { - } + LogTest() + : reader_contents_(), + dest_holder_( + test::GetWritableFileWriter(new StringDest(reader_contents_))), + source_holder_( + test::GetSequentialFileReader(new StringSource(reader_contents_))), + writer_(std::move(dest_holder_)), + reader_(std::move(source_holder_), &report_, true /*checksum*/, + 0 /*initial_offset*/) {} void Write(const std::string& msg) { writer_.AddRecord(Slice(msg)); @@ -227,7 +232,7 @@ class LogTest : public testing::Test { } void ShrinkSize(int bytes) { - auto dest = dynamic_cast(writer_.file()); + auto dest = dynamic_cast(writer_.file()->writable_file()); assert(dest); dest->Drop(bytes); } @@ -240,7 +245,7 @@ class LogTest : public testing::Test { } void ForceError(size_t position = 0) { - auto src = dynamic_cast(reader_.file()); + auto src = dynamic_cast(reader_.file()->file()); src->force_error_ = true; src->force_error_position_ = position; } @@ -254,13 +259,13 @@ class LogTest : public testing::Test { } void ForceEOF(size_t position = 0) { - auto src = dynamic_cast(reader_.file()); + auto src = dynamic_cast(reader_.file()->file()); src->force_eof_ = true; src->force_eof_position_ = position; } void UnmarkEOF() { - auto src = dynamic_cast(reader_.file()); + auto src = dynamic_cast(reader_.file()->file()); src->returned_partial_ = false; reader_.UnmarkEOF(); } @@ -288,10 +293,11 @@ class LogTest : public testing::Test { void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { WriteInitialOffsetLog(); - unique_ptr source(new StringSource(reader_contents_)); + unique_ptr file_reader( + test::GetSequentialFileReader(new StringSource(reader_contents_))); unique_ptr offset_reader( - new Reader(std::move(source), &report_, true/*checksum*/, - WrittenBytes() + offset_past_end)); + new Reader(std::move(file_reader), &report_, true /*checksum*/, + WrittenBytes() + offset_past_end)); Slice record; std::string scratch; ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); @@ -300,10 +306,10 @@ class LogTest : public testing::Test { void CheckInitialOffsetRecord(uint64_t initial_offset, int expected_record_offset) { WriteInitialOffsetLog(); - unique_ptr source(new StringSource(reader_contents_)); - unique_ptr offset_reader( - new Reader(std::move(source), &report_, true/*checksum*/, - initial_offset)); + unique_ptr file_reader( + test::GetSequentialFileReader(new StringSource(reader_contents_))); + unique_ptr offset_reader(new Reader( + std::move(file_reader), &report_, true /*checksum*/, initial_offset)); Slice record; std::string scratch; ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); diff --git a/db/log_writer.cc b/db/log_writer.cc index d78de5e7b..32d4afdc9 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -13,13 +13,13 @@ #include "rocksdb/env.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" namespace rocksdb { namespace log { -Writer::Writer(unique_ptr&& dest) - : dest_(std::move(dest)), - block_offset_(0) { +Writer::Writer(unique_ptr&& dest) + : dest_(std::move(dest)), block_offset_(0) { for (int i = 0; i <= kMaxRecordType; i++) { char t = static_cast(i); type_crc_[i] = crc32c::Value(&t, 1); diff --git a/db/log_writer.h b/db/log_writer.h index 46226ec27..8f0728c69 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -16,7 +16,7 @@ namespace rocksdb { -class WritableFile; +class WritableFileWriter; using std::unique_ptr; @@ -61,16 +61,16 @@ class Writer { // Create a writer that will append data to "*dest". // "*dest" must be initially empty. // "*dest" must remain live while this Writer is in use. - explicit Writer(unique_ptr&& dest); + explicit Writer(unique_ptr&& dest); ~Writer(); Status AddRecord(const Slice& slice); - WritableFile* file() { return dest_.get(); } - const WritableFile* file() const { return dest_.get(); } + WritableFileWriter* file() { return dest_.get(); } + const WritableFileWriter* file() const { return dest_.get(); } private: - unique_ptr dest_; + unique_ptr dest_; int block_offset_; // Current offset in block // crc32c values for all supported record types. These are diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 9a97e801c..fa6afa916 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -198,10 +198,9 @@ class TestPlainTableReader : public PlainTableReader { int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, const TableProperties* table_properties, - unique_ptr&& file, + unique_ptr&& file, const ImmutableCFOptions& ioptions, - bool* expect_bloom_not_match, - bool store_index_in_file) + bool* expect_bloom_not_match, bool store_index_in_file) : PlainTableReader(ioptions, std::move(file), env_options, icomparator, encoding_type, file_size, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { @@ -257,7 +256,8 @@ class TestPlainTableFactory : public PlainTableFactory { Status NewTableReader(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, + uint64_t file_size, unique_ptr* table) const override { TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, diff --git a/db/repair.cc b/db/repair.cc index 87788c5b5..a0322b683 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -81,6 +81,7 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/immutable_options.h" +#include "util/file_reader_writer.h" #include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -236,6 +237,8 @@ class Repairer { if (!status.ok()) { return status; } + unique_ptr lfile_reader( + new SequentialFileReader(std::move(lfile))); // Create the log reader. LogReporter reporter; @@ -246,8 +249,8 @@ class Repairer { // corruptions cause entire commits to be skipped instead of // propagating bad information (like overly large sequence // numbers). - log::Reader reader(std::move(lfile), &reporter, true /*enable checksum*/, - 0/*initial_offset*/); + log::Reader reader(std::move(lfile_reader), &reporter, + true /*enable checksum*/, 0 /*initial_offset*/); // Read all the records and add to a memtable std::string scratch; @@ -378,8 +381,8 @@ class Repairer { Status WriteDescriptor() { std::string tmp = TempFileName(dbname_, 1); unique_ptr file; - Status status = env_->NewWritableFile( - tmp, &file, env_->OptimizeForManifestWrite(env_options_)); + EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_); + Status status = env_->NewWritableFile(tmp, &file, env_options); if (!status.ok()) { return status; } @@ -407,7 +410,9 @@ class Repairer { //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); { - log::Writer log(std::move(file)); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); + log::Writer log(std::move(file_writer)); std::string record; edit_->EncodeTo(&record); status = log.AddRecord(record); diff --git a/db/table_cache.cc b/db/table_cache.cc index f74cfb6f1..3e0fdc3eb 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -18,6 +18,7 @@ #include "table/table_reader.h" #include "table/get_context.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" @@ -99,8 +100,10 @@ Status TableCache::FindTable(const EnvOptions& env_options, file->Hint(RandomAccessFile::RANDOM); } StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); + std::unique_ptr file_reader( + new RandomAccessFileReader(std::move(file))); s = ioptions_.table_factory->NewTableReader( - ioptions_, env_options, internal_comparator, std::move(file), + ioptions_, env_options, internal_comparator, std::move(file_reader), fd.GetFileSize(), &table_reader); } diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 363b31bb9..34af49748 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -18,6 +18,7 @@ #include "table/plain_table_factory.h" #include "table/table_builder.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/testharness.h" #include "util/testutil.h" @@ -33,7 +34,7 @@ class TablePropertiesTest : public testing::Test, // TODO(kailiu) the following classes should be moved to some more general // places, so that other tests can also make use of them. -// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system +// `FakeWritableFileWriter* file system // and therefore enable us to quickly setup the tests. class FakeWritableFile : public WritableFile { public: @@ -96,9 +97,11 @@ void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - std::unique_ptr* writable, + std::unique_ptr* writable, std::unique_ptr* builder) { - writable->reset(new FakeWritableFile); + unique_ptr wf(new FakeWritableFile); + writable->reset(new WritableFileWriter(std::move(wf), EnvOptions())); + builder->reset(NewTableBuilder( ioptions, internal_comparator, int_tbl_prop_collector_factories, writable->get(), options.compression, options.compression_opts)); @@ -289,7 +292,7 @@ void TestCustomizedTablePropertiesCollector( // -- Step 1: build table std::unique_ptr builder; - std::unique_ptr writable; + std::unique_ptr writer; const ImmutableCFOptions ioptions(options); std::vector> int_tbl_prop_collector_factories; @@ -300,7 +303,7 @@ void TestCustomizedTablePropertiesCollector( GetIntTblPropCollectorFactory(options, &int_tbl_prop_collector_factories); } MakeBuilder(options, ioptions, internal_comparator, - &int_tbl_prop_collector_factories, &writable, &builder); + &int_tbl_prop_collector_factories, &writer, &builder); SequenceNumber seqNum = 0U; for (const auto& kv : kvs) { @@ -310,18 +313,17 @@ void TestCustomizedTablePropertiesCollector( builder->Add(ikey.Encode(), kv.second); } ASSERT_OK(builder->Finish()); + writer->Flush(); // -- Step 2: Read properties - FakeRandomeAccessFile readable(writable->contents()); + FakeWritableFile* fwf = + static_cast(writer->writable_file()); + std::unique_ptr fake_file_reader( + test::GetRandomAccessFileReader( + new FakeRandomeAccessFile(fwf->contents()))); TableProperties* props; - Status s = ReadTableProperties( - &readable, - writable->contents().size(), - magic_number, - Env::Default(), - nullptr, - &props - ); + Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), + magic_number, Env::Default(), nullptr, &props); std::unique_ptr props_guard(props); ASSERT_OK(s); @@ -414,7 +416,7 @@ void TestInternalKeyPropertiesCollector( }; std::unique_ptr builder; - std::unique_ptr writable; + std::unique_ptr writable; Options options; test::PlainInternalKeyComparator pikc(options.comparator); @@ -449,12 +451,16 @@ void TestInternalKeyPropertiesCollector( } ASSERT_OK(builder->Finish()); + writable->Flush(); - FakeRandomeAccessFile readable(writable->contents()); + FakeWritableFile* fwf = + static_cast(writable->writable_file()); + unique_ptr reader(test::GetRandomAccessFileReader( + new FakeRandomeAccessFile(fwf->contents()))); TableProperties* props; Status s = - ReadTableProperties(&readable, writable->contents().size(), - magic_number, Env::Default(), nullptr, &props); + ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, + Env::Default(), nullptr, &props); ASSERT_OK(s); std::unique_ptr props_guard(props); diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index b0bf6e4e9..23bd6672b 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -11,6 +11,7 @@ #include #include "db/transaction_log_impl.h" #include "db/write_batch_internal.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -40,23 +41,27 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( } Status TransactionLogIteratorImpl::OpenLogFile( - const LogFile* logFile, - unique_ptr* file) { + const LogFile* logFile, unique_ptr* file_reader) { Env* env = options_->env; + unique_ptr file; + Status s; if (logFile->Type() == kArchivedLogFile) { std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber()); - return env->NewSequentialFile(fname, file, soptions_); + s = env->NewSequentialFile(fname, &file, soptions_); } else { std::string fname = LogFileName(dir_, logFile->LogNumber()); - Status s = env->NewSequentialFile(fname, file, soptions_); + s = env->NewSequentialFile(fname, &file, soptions_); if (!s.ok()) { // If cannot open file in DB directory. // Try the archive dir, as it could have moved in the meanwhile. fname = ArchivedLogFileName(dir_, logFile->LogNumber()); - s = env->NewSequentialFile(fname, file, soptions_); + s = env->NewSequentialFile(fname, &file, soptions_); } - return s; } + if (s.ok()) { + file_reader->reset(new SequentialFileReader(std::move(file))); + } + return s; } BatchResult TransactionLogIteratorImpl::GetBatch() { @@ -251,7 +256,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { } Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) { - unique_ptr file; + unique_ptr file; Status s = OpenLogFile(logFile, &file); if (!s.ok()) { return s; diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index c730d0f61..f89cc3207 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -84,7 +84,8 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { size_t currentFileIndex_; std::unique_ptr currentBatch_; unique_ptr currentLogReader_; - Status OpenLogFile(const LogFile* logFile, unique_ptr* file); + Status OpenLogFile(const LogFile* logFile, + unique_ptr* file); struct LogReporter : public log::Reader::Reporter { Env* env; diff --git a/db/version_set.cc b/db/version_set.cc index 72874492f..c3f090ba1 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -42,6 +42,7 @@ #include "table/meta_blocks.h" #include "table/get_context.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/logging.h" #include "util/stop_watch.h" #include "util/sync_point.h" @@ -567,10 +568,12 @@ Status Version::GetTableProperties(std::shared_ptr* tp, TableProperties* raw_table_properties; // By setting the magic number to kInvalidTableMagicNumber, we can by // pass the magic number check in the footer. + std::unique_ptr file_reader( + new RandomAccessFileReader(std::move(file))); s = ReadTableProperties( - file.get(), file_meta->fd.GetFileSize(), - Footer::kInvalidTableMagicNumber /* table's magic number */, - vset_->env_, ioptions->info_log, &raw_table_properties); + file_reader.get(), file_meta->fd.GetFileSize(), + Footer::kInvalidTableMagicNumber /* table's magic number */, vset_->env_, + ioptions->info_log, &raw_table_properties); if (!s.ok()) { return s; } @@ -1912,13 +1915,17 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); unique_ptr descriptor_file; + EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_); s = env_->NewWritableFile( DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, env_->OptimizeForManifestWrite(env_options_)); + &descriptor_file, opt_env_opts); if (s.ok()) { descriptor_file->SetPreallocationBlockSize( db_options_->manifest_preallocation_size); - descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); + + unique_ptr file_writer( + new WritableFileWriter(std::move(descriptor_file), opt_env_opts)); + descriptor_log_.reset(new log::Writer(std::move(file_writer))); s = WriteSnapshot(descriptor_log_.get()); } } @@ -2132,11 +2139,16 @@ Status VersionSet::Recover( manifest_filename.c_str()); manifest_filename = dbname_ + "/" + manifest_filename; - unique_ptr manifest_file; - s = env_->NewSequentialFile(manifest_filename, &manifest_file, - env_options_); - if (!s.ok()) { - return s; + unique_ptr manifest_file_reader; + { + unique_ptr manifest_file; + s = env_->NewSequentialFile(manifest_filename, &manifest_file, + env_options_); + if (!s.ok()) { + return s; + } + manifest_file_reader.reset( + new SequentialFileReader(std::move(manifest_file))); } uint64_t current_manifest_file_size; s = env_->GetFileSize(manifest_filename, ¤t_manifest_file_size); @@ -2170,8 +2182,8 @@ Status VersionSet::Recover( { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/, - 0 /*initial_offset*/); + log::Reader reader(std::move(manifest_file_reader), &reporter, + true /*checksum*/, 0 /*initial_offset*/); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -2405,18 +2417,23 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, current.resize(current.size() - 1); std::string dscname = dbname + "/" + current; + + unique_ptr file_reader; + { unique_ptr file; s = env->NewSequentialFile(dscname, &file, soptions); if (!s.ok()) { return s; } + file_reader.reset(new SequentialFileReader(std::move(file))); + } std::map column_family_names; // default column family is always implicitly there column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file), &reporter, true /*checksum*/, + log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, 0 /*initial_offset*/); Slice record; std::string scratch; @@ -2542,10 +2559,15 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex, bool json) { // Open the specified manifest file. - unique_ptr file; - Status s = options.env->NewSequentialFile(dscname, &file, env_options_); - if (!s.ok()) { - return s; + unique_ptr file_reader; + Status s; + { + unique_ptr file; + s = options.env->NewSequentialFile(dscname, &file, env_options_); + if (!s.ok()) { + return s; + } + file_reader.reset(new SequentialFileReader(std::move(file))); } bool have_prev_log_number = false; @@ -2569,8 +2591,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file), &reporter, true/*checksum*/, - 0/*initial_offset*/); + log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, + 0 /*initial_offset*/); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -2664,7 +2686,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } } } - file.reset(); + file_reader.reset(); if (s.ok()) { if (!have_next_file) { @@ -2806,17 +2828,23 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_num, std::string fname = DescriptorFileName(dbname_, manifest_file_num); Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); - unique_ptr file; - Status s = env_->NewSequentialFile(fname, &file, env_options_); - if (!s.ok()) { - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: %s\n", s.ToString().c_str()); - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: is unable to reopen the manifest file %s", - fname.c_str()); - return false; + + unique_ptr file_reader; + Status s; + { + unique_ptr file; + s = env_->NewSequentialFile(fname, &file, env_options_); + if (!s.ok()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "ManifestContains: %s\n", s.ToString().c_str()); + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "ManifestContains: is unable to reopen the manifest file %s", + fname.c_str()); + return false; + } + file_reader.reset(new SequentialFileReader(std::move(file))); } - log::Reader reader(std::move(file), nullptr, true/*checksum*/, 0); + log::Reader reader(std::move(file_reader), nullptr, true /*checksum*/, 0); Slice r; std::string scratch; bool result = false; diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 5651bae3a..dceaf6643 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -28,6 +28,7 @@ #include "rocksdb/options.h" #include "rocksdb/write_batch.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/sync_point.h" @@ -430,6 +431,8 @@ Status WalManager::ReadFirstLine(const std::string& fname, std::unique_ptr file; Status status = env_->NewSequentialFile(fname, &file, env_options_); + unique_ptr file_reader( + new SequentialFileReader(std::move(file))); if (!status.ok()) { return status; @@ -441,7 +444,7 @@ Status WalManager::ReadFirstLine(const std::string& fname, reporter.fname = fname.c_str(); reporter.status = &status; reporter.ignore_error = !db_options_.paranoid_checks; - log::Reader reader(std::move(file), &reporter, true /*checksum*/, + log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, 0 /*initial_offset*/); std::string scratch; Slice record; diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 325f0d94c..b72c377a8 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -14,6 +14,7 @@ #include "db/column_family.h" #include "db/version_set.h" #include "db/writebuffer.h" +#include "util/file_reader_writer.h" #include "util/mock_env.h" #include "util/string_util.h" #include "util/testharness.h" @@ -72,7 +73,9 @@ class WalManagerTest : public testing::Test { std::string fname = ArchivedLogFileName(dbname_, current_log_number_); unique_ptr file; ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); - current_log_writer_.reset(new log::Writer(std::move(file))); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options_)); + current_log_writer_.reset(new log::Writer(std::move(file_writer))); } void CreateArchiveLogs(int num_logs, int entries_per_log) { @@ -120,7 +123,9 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) { ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); ASSERT_EQ(s, 0U); - log::Writer writer(std::move(file)); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), EnvOptions())); + log::Writer writer(std::move(file_writer)); WriteBatch batch; batch.Put("foo", "bar"); WriteBatchInternal::SetSequence(&batch, 10); diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 684448e2a..acdc66e48 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -465,6 +465,8 @@ class WritableFile { io_priority_ = pri; } + virtual Env::IOPriority GetIOPriority() { return io_priority_; } + /* * Get the size of valid data in the file. */ @@ -501,7 +503,14 @@ class WritableFile { return Status::NotSupported("InvalidateCache not supported."); } - protected: + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. + virtual Status RangeSync(off_t offset, off_t nbytes) { return Status::OK(); } + // PrepareWrite performs any necessary preparation for a write // before the write actually occurs. This allows for pre-allocation // of space on devices where it can result in less file @@ -526,6 +535,7 @@ class WritableFile { } } + protected: /* * Pre-allocate space for a file. */ @@ -533,16 +543,6 @@ class WritableFile { return Status::OK(); } - // Sync a file range with disk. - // offset is the starting byte of the file range to be synchronized. - // nbytes specifies the length of the range to be synchronized. - // This asks the OS to initiate flushing the cached data to disk, - // without waiting for completion. - // Default implementation does nothing. - virtual Status RangeSync(off_t offset, off_t nbytes) { - return Status::OK(); - } - size_t preallocation_block_size() { return preallocation_block_size_; } private: @@ -893,6 +893,7 @@ class WritableFileWrapper : public WritableFile { void SetIOPriority(Env::IOPriority pri) override { target_->SetIOPriority(pri); } + Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); } uint64_t GetFileSize() override { return target_->GetFileSize(); } void GetPreallocationStatus(size_t* block_size, size_t* last_allocated_block) override { diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index b84363a94..f3a22dc31 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -34,7 +34,7 @@ class RandomAccessFile; struct TableBuilderOptions; class TableBuilder; class TableReader; -class WritableFile; +class WritableFileWriter; struct EnvOptions; struct Options; @@ -315,6 +315,8 @@ extern TableFactory* NewCuckooTableFactory( #endif // ROCKSDB_LITE +class RandomAccessFileReader; + // A base class for table factories. class TableFactory { public: @@ -348,7 +350,7 @@ class TableFactory { virtual Status NewTableReader( const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const = 0; // Return a table builder to write to a file for this table type. @@ -372,7 +374,7 @@ class TableFactory { // to use in this table. virtual TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const = 0; + WritableFileWriter* file) const = 0; // Sanitizes the specified DB Options and ColumnFamilyOptions. // diff --git a/port/port_posix.cc b/port/port_posix.cc index a8cffcc7e..0e1cb4db1 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -9,11 +9,13 @@ #include "port/port_posix.h" -#include #include #include -#include +#include +#include #include +#include +#include #include #include "util/logging.h" @@ -133,5 +135,11 @@ void InitOnce(OnceType* once, void (*initializer)()) { PthreadCall("once", pthread_once(once, initializer)); } +void Crash(const std::string& srcfile, int srcline) { + fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); + fflush(stdout); + kill(getpid(), SIGTERM); +} + } // namespace port } // namespace rocksdb diff --git a/port/port_posix.h b/port/port_posix.h index ff8d4af4b..0dd824a5d 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -53,7 +53,7 @@ #if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\ defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\ - defined(OS_ANDROID) || defined(OS_CYGWIN) + defined(OS_ANDROID) || defined(CYGWIN) // Use fread/fwrite/fflush on platforms without _unlocked variants #define fread_unlocked fread #define fwrite_unlocked fwrite @@ -150,6 +150,7 @@ extern void InitOnce(OnceType* once, void (*initializer)()); #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) +extern void Crash(const std::string& srcfile, int srcline); } // namespace port } // namespace rocksdb diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 31ce7b5bf..ce9dcd4e0 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -29,6 +29,7 @@ #include "util/random.h" #include "util/iostats_context_imp.h" #include "util/rate_limiter.h" +#include "util/sync_point.h" #include "util/thread_status_updater.h" #include "util/thread_status_util.h" @@ -36,10 +37,6 @@ #include // For UUID generation #include -// This is only set from db_stress.cc and for testing only. -// If non-zero, kill at various points in source code with probability 1/this -int rocksdb_kill_odds = 0; - namespace rocksdb { std::string GetWindowsErrSz(DWORD err) { @@ -90,40 +87,6 @@ inline void PrintThreadInfo(size_t thread_id, size_t terminatingId) { // returns the ID of the current process inline int current_process_id() { return _getpid(); } -#ifdef NDEBUG -// empty in release build -#define TEST_KILL_RANDOM(rocksdb_kill_odds) -#else - -// Kill the process with probablity 1/odds for testing. -void TestKillRandom(int odds, const std::string& srcfile, int srcline) { - time_t curtime = time(nullptr); - Random r((uint32_t)curtime); - - assert(odds > 0); - bool crash = r.OneIn(odds); - if (crash) { - fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); - fflush(stdout); - std::string* p_str = nullptr; - p_str->c_str(); - } -} - -// To avoid crashing always at some frequently executed codepaths (during -// kill random test), use this factor to reduce odds -#define REDUCE_ODDS 2 -#define REDUCE_ODDS2 4 - -#define TEST_KILL_RANDOM(rocksdb_kill_odds) \ - { \ - if (rocksdb_kill_odds > 0) { \ - TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__); \ - } \ - } - -#endif - // RAII helpers for HANDLEs const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); }; typedef std::unique_ptr UniqueCloseHandlePtr; diff --git a/src.mk b/src.mk index ae7a932bb..62b4ef721 100644 --- a/src.mk +++ b/src.mk @@ -88,6 +88,7 @@ LIB_SOURCES = \ util/env_hdfs.cc \ util/env_posix.cc \ util/file_util.cc \ + util/file_reader_writer.cc \ util/filter_policy.cc \ util/hash.cc \ util/hash_cuckoo_rep.cc \ diff --git a/table/adaptive_table_factory.cc b/table/adaptive_table_factory.cc index e8831f757..f642fa7bd 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive_table_factory.cc @@ -41,8 +41,9 @@ extern const uint64_t kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - const InternalKeyComparator& icomp, unique_ptr&& file, - uint64_t file_size, unique_ptr* table) const { + const InternalKeyComparator& icomp, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const { Footer footer; auto s = ReadFooterFromFile(file.get(), file_size, &footer); if (!s.ok()) { @@ -66,7 +67,7 @@ Status AdaptiveTableFactory::NewTableReader( TableBuilder* AdaptiveTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const { + WritableFileWriter* file) const { return table_factory_to_write_->NewTableBuilder(table_builder_options, file); } diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index aa0f82708..47a17de2c 100644 --- a/table/adaptive_table_factory.h +++ b/table/adaptive_table_factory.h @@ -33,15 +33,16 @@ class AdaptiveTableFactory : public TableFactory { const char* Name() const override { return "AdaptiveTableFactory"; } - Status NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const override; + WritableFileWriter* file) const override; // Sanitizes the specified DB Options. Status SanitizeOptions(const DBOptions& db_opts, diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 588f2475b..d0563b133 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -437,7 +437,7 @@ struct BlockBasedTableBuilder::Rep { const ImmutableCFOptions ioptions; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; - WritableFile* file; + WritableFileWriter* file; uint64_t offset = 0; Status status; BlockBuilder data_block; @@ -467,7 +467,7 @@ struct BlockBasedTableBuilder::Rep { const InternalKeyComparator& icomparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* f, const CompressionType _compression_type, + WritableFileWriter* f, const CompressionType _compression_type, const CompressionOptions& _compression_opts, const bool skip_filters) : ioptions(_ioptions), table_options(table_opt), @@ -502,7 +502,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, const CompressionType compression_type, + WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, const bool skip_filters) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && @@ -524,7 +524,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( } if (table_options.block_cache_compressed.get() != nullptr) { BlockBasedTable::GenerateCachePrefix( - table_options.block_cache_compressed.get(), file, + table_options.block_cache_compressed.get(), file->writable_file(), &rep_->compressed_cache_key_prefix[0], &rep_->compressed_cache_key_prefix_size); } diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 97f31277c..ce868207a 100644 --- a/table/block_based_table_builder.h +++ b/table/block_based_table_builder.h @@ -40,7 +40,7 @@ class BlockBasedTableBuilder : public TableBuilder { const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, const CompressionType compression_type, + WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, const bool skip_filters); // REQUIRES: Either Finish() or Abandon() has been called. diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 6c1e0a21c..7f56d72d9 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -44,7 +44,7 @@ BlockBasedTableFactory::BlockBasedTableFactory( Status BlockBasedTableFactory::NewTableReader( const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, const bool prefetch_enabled) const { return BlockBasedTable::Open(ioptions, soptions, table_options_, internal_comparator, std::move(file), file_size, @@ -53,7 +53,7 @@ Status BlockBasedTableFactory::NewTableReader( TableBuilder* BlockBasedTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const { + WritableFileWriter* file) const { auto table_builder = new BlockBasedTableBuilder( table_builder_options.ioptions, table_options_, table_builder_options.internal_comparator, diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 639492659..fc6cdc55a 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -36,7 +36,8 @@ class BlockBasedTableFactory : public TableFactory { Status NewTableReader(const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, + uint64_t file_size, unique_ptr* table_reader) const override { return NewTableReader(ioptions, soptions, internal_comparator, std::move(file), file_size, table_reader, @@ -48,13 +49,14 @@ class BlockBasedTableFactory : public TableFactory { Status NewTableReader(const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, + uint64_t file_size, unique_ptr* table_reader, bool prefetch_index_and_filter) const; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const override; + WritableFileWriter* file) const override; // Sanitizes the specified DB Options. Status SanitizeOptions(const DBOptions& db_opts, diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 79c7d0edd..bc9efd68a 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -37,6 +37,7 @@ #include "table/get_context.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -62,7 +63,7 @@ const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) = // The only relevant option is options.verify_checksums for now. // On failure return non-OK. // On success fill *result and return OK - caller owns *result -Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer, +Status ReadBlockFromFile(RandomAccessFileReader* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, std::unique_ptr* result, Env* env, bool do_uncompress = true) { @@ -167,7 +168,7 @@ class BinarySearchIndexReader : public IndexReader { // `BinarySearchIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified. - static Status Create(RandomAccessFile* file, const Footer& footer, + static Status Create(RandomAccessFileReader* file, const Footer& footer, const BlockHandle& index_handle, Env* env, const Comparator* comparator, IndexReader** index_reader) { @@ -212,8 +213,8 @@ class BinarySearchIndexReader : public IndexReader { class HashIndexReader : public IndexReader { public: static Status Create(const SliceTransform* hash_key_extractor, - const Footer& footer, RandomAccessFile* file, Env* env, - const Comparator* comparator, + const Footer& footer, RandomAccessFileReader* file, + Env* env, const Comparator* comparator, const BlockHandle& index_handle, Iterator* meta_index_iter, IndexReader** index_reader, bool hash_index_allow_collision) { @@ -347,7 +348,7 @@ struct BlockBasedTable::Rep { const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; Status status; - unique_ptr file; + unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; size_t cache_key_prefix_size = 0; char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; @@ -405,13 +406,12 @@ void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { rep->cache_key_prefix_size = 0; rep->compressed_cache_key_prefix_size = 0; if (rep->table_options.block_cache != nullptr) { - GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file.get(), - &rep->cache_key_prefix[0], - &rep->cache_key_prefix_size); + GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), + &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); } if (rep->table_options.block_cache_compressed != nullptr) { GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), - rep->file.get(), &rep->compressed_cache_key_prefix[0], + rep->file->file(), &rep->compressed_cache_key_prefix[0], &rep->compressed_cache_key_prefix_size); } } @@ -469,7 +469,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, const bool prefetch_index_and_filter) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 727a0d632..6b5a63a23 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -21,6 +21,7 @@ #include "table/table_reader.h" #include "table/table_properties_internal.h" #include "util/coding.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -69,8 +70,8 @@ class BlockBasedTable : public TableReader { const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + unique_ptr&& file, + uint64_t file_size, unique_ptr* table_reader, bool prefetch_index_and_filter = true); bool PrefixMayMatch(const Slice& internal_key); diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 1aa1e0707..946a8b5fb 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -20,6 +20,7 @@ #include "table/format.h" #include "table/meta_blocks.h" #include "util/autovector.h" +#include "util/file_reader_writer.h" #include "util/random.h" #include "util/string_util.h" @@ -47,7 +48,7 @@ const std::string CuckooTablePropertyNames::kUserKeyLength = extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; CuckooTableBuilder::CuckooTableBuilder( - WritableFile* file, double max_hash_table_ratio, + WritableFileWriter* file, double max_hash_table_ratio, uint32_t max_num_hash_table, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, bool use_module_hash, bool identity_as_first_hash, diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index 913d50503..093e1c245 100644 --- a/table/cuckoo_table_builder.h +++ b/table/cuckoo_table_builder.h @@ -21,12 +21,13 @@ namespace rocksdb { class CuckooTableBuilder: public TableBuilder { public: - CuckooTableBuilder( - WritableFile* file, double max_hash_table_ratio, - uint32_t max_num_hash_func, uint32_t max_search_depth, - const Comparator* user_comparator, uint32_t cuckoo_block_size, - bool use_module_hash, bool identity_as_first_hash, - uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); + CuckooTableBuilder(WritableFileWriter* file, double max_hash_table_ratio, + uint32_t max_num_hash_func, uint32_t max_search_depth, + const Comparator* user_comparator, + uint32_t cuckoo_block_size, bool use_module_hash, + bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, + uint64_t)); // REQUIRES: Either Finish() or Abandon() has been called. ~CuckooTableBuilder() {} @@ -82,7 +83,7 @@ class CuckooTableBuilder: public TableBuilder { inline Slice GetValue(uint64_t idx) const; uint32_t num_hash_func_; - WritableFile* file_; + WritableFileWriter* file_; const double max_hash_table_ratio_; const uint32_t max_num_hash_func_; const uint32_t max_search_depth_; diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index bd9283659..69a8245e6 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -10,6 +10,7 @@ #include "table/meta_blocks.h" #include "table/cuckoo_table_builder.h" +#include "util/file_reader_writer.h" #include "util/testharness.h" #include "util/testutil.h" @@ -48,8 +49,11 @@ class CuckooBuilderTest : public testing::Test { // Assert Table Properties. TableProperties* props = nullptr; - ASSERT_OK(ReadTableProperties(read_file.get(), read_file_size, - kCuckooTableMagicNumber, env_, nullptr, &props)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); + ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, + kCuckooTableMagicNumber, env_, nullptr, + &props)); // Check unused bucket. std::string unused_key = props->user_collected_properties[ CuckooTablePropertyNames::kEmptyKey]; @@ -90,8 +94,8 @@ class CuckooBuilderTest : public testing::Test { size_t bucket_size = expected_unused_bucket.size(); for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) { Slice read_slice; - ASSERT_OK(read_file->Read(i*bucket_size, bucket_size, - &read_slice, nullptr)); + ASSERT_OK(file_reader->Read(i * bucket_size, bucket_size, &read_slice, + nullptr)); size_t key_idx = std::find(expected_locations.begin(), expected_locations.end(), i) - expected_locations.begin(); @@ -104,7 +108,7 @@ class CuckooBuilderTest : public testing::Test { } } for (auto key_found : keys_found) { - // Check that all keys were found. + // Check that all keys wereReader found. ASSERT_TRUE(key_found); } } @@ -133,12 +137,15 @@ TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { unique_ptr writable_file; fname = test::TmpDir() + "/EmptyFile"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, + BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); CheckFileContents({}, {}, {}, "", 2, 2, false); } @@ -165,8 +172,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -176,7 +186,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -209,8 +219,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -220,7 +233,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -254,9 +267,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { uint32_t cuckoo_block_size = 2; fname = test::TmpDir() + "/WithCollisionFullKey2"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, - false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), cuckoo_block_size, + false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -266,7 +281,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -304,8 +319,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -315,7 +333,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -350,8 +368,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 2, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -361,7 +382,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -389,8 +410,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -400,7 +424,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -429,8 +453,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -440,7 +467,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -471,8 +498,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 2, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -482,7 +512,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -512,8 +542,11 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 2, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); @@ -521,7 +554,7 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { ASSERT_OK(builder.status()); } ASSERT_TRUE(builder.Finish().IsNotSupported()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); } TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { @@ -536,8 +569,11 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { unique_ptr writable_file; fname = test::TmpDir() + "/FailWhenSameKeyInserted"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); @@ -548,7 +584,7 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { ASSERT_OK(builder.status()); ASSERT_TRUE(builder.Finish().IsNotSupported()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); } } // namespace rocksdb diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index 17aa1d78e..1899f6c28 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo_table_factory.cc @@ -12,9 +12,10 @@ namespace rocksdb { -Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, const InternalKeyComparator& icomp, - std::unique_ptr&& file, uint64_t file_size, +Status CuckooTableFactory::NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& icomp, + std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table) const { std::unique_ptr new_reader(new CuckooTableReader(ioptions, std::move(file), file_size, icomp.user_comparator(), nullptr)); @@ -27,7 +28,7 @@ Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, TableBuilder* CuckooTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const { + WritableFileWriter* file) const { // Ignore the skipFIlters flag. Does not apply to this file format // diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index c78fe6931..9f2a67765 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -55,15 +55,16 @@ class CuckooTableFactory : public TableFactory { const char* Name() const override { return "CuckooTable"; } - Status NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const override; + WritableFileWriter* file) const override; // Sanitizes the specified DB Options. Status SanitizeOptions(const DBOptions& db_opts, diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index 7f017ec7c..51ffc6ffa 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -33,8 +33,7 @@ extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( const ImmutableCFOptions& ioptions, - std::unique_ptr&& file, - uint64_t file_size, + std::unique_ptr&& file, uint64_t file_size, const Comparator* comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) : file_(std::move(file)), diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 4f00a9e41..6643be025 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -18,6 +18,7 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "table/table_reader.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -26,12 +27,11 @@ class TableReader; class CuckooTableReader: public TableReader { public: - CuckooTableReader( - const ImmutableCFOptions& ioptions, - std::unique_ptr&& file, - uint64_t file_size, - const Comparator* user_comparator, - uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); + CuckooTableReader(const ImmutableCFOptions& ioptions, + std::unique_ptr&& file, + uint64_t file_size, const Comparator* user_comparator, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, + uint64_t)); ~CuckooTableReader() {} std::shared_ptr GetTableProperties() const override { @@ -57,7 +57,7 @@ class CuckooTableReader: public TableReader { private: friend class CuckooTableIterator; void LoadAllKeys(std::vector>* key_to_bucket_id); - std::unique_ptr file_; + std::unique_ptr file_; Slice file_data_; bool is_last_level_; bool identity_as_first_hash_; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 660261ab3..1f5b4de0d 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -62,7 +62,6 @@ uint64_t GetSliceHash(const Slice& s, uint32_t index, uint64_t max_num_buckets) { return hash_map[s.ToString()][index]; } - } // namespace class CuckooReaderTest : public testing::Test { @@ -94,9 +93,11 @@ class CuckooReaderTest : public testing::Test { const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - CuckooTableBuilder builder( - writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, - false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), env_options)); + + CuckooTableBuilder builder(file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, + 2, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); @@ -106,18 +107,16 @@ class CuckooReaderTest : public testing::Test { ASSERT_OK(builder.Finish()); ASSERT_EQ(num_items, builder.NumEntries()); file_size = builder.FileSize(); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); // Check reader now. std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); const ImmutableCFOptions ioptions(options); - CuckooTableReader reader( - ioptions, - std::move(read_file), - file_size, - ucomp, - GetSliceHash); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, + GetSliceHash); ASSERT_OK(reader.status()); // Assume no merge/deletion for (uint32_t i = 0; i < num_items; ++i) { @@ -141,13 +140,11 @@ class CuckooReaderTest : public testing::Test { void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); const ImmutableCFOptions ioptions(options); - CuckooTableReader reader( - ioptions, - std::move(read_file), - file_size, - ucomp, - GetSliceHash); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, + GetSliceHash); ASSERT_OK(reader.status()); Iterator* it = reader.NewIterator(ReadOptions(), nullptr); ASSERT_OK(it->status()); @@ -321,13 +318,11 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { CreateCuckooFileAndCheckReader(); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); const ImmutableCFOptions ioptions(options); - CuckooTableReader reader( - ioptions, - std::move(read_file), - file_size, - ucmp, - GetSliceHash); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp, + GetSliceHash); ASSERT_OK(reader.status()); // Search for a key with colliding hash values. std::string not_found_user_key = "key" + NumToStr(num_items); @@ -406,10 +401,11 @@ void WriteFile(const std::vector& keys, std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - CuckooTableBuilder builder( - writable_file.get(), hash_ratio, - 64, 1000, test::Uint64Comparator(), 5, - false, FLAGS_identity_as_first_hash, nullptr); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), env_options)); + CuckooTableBuilder builder(file_writer.get(), hash_ratio, 64, 1000, + test::Uint64Comparator(), 5, false, + FLAGS_identity_as_first_hash, nullptr); ASSERT_OK(builder.status()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { // Value is just a part of key. @@ -419,17 +415,18 @@ void WriteFile(const std::vector& keys, } ASSERT_OK(builder.Finish()); ASSERT_EQ(num, builder.NumEntries()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); uint64_t file_size; env->GetFileSize(fname, &file_size); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); const ImmutableCFOptions ioptions(options); - CuckooTableReader reader( - ioptions, std::move(read_file), file_size, - test::Uint64Comparator(), nullptr); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, + test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); ReadOptions r_options; std::string value; @@ -455,11 +452,12 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { env->GetFileSize(fname, &file_size); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); const ImmutableCFOptions ioptions(options); - CuckooTableReader reader( - ioptions, std::move(read_file), file_size, test::Uint64Comparator(), - nullptr); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, + test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); const UserCollectedProperties user_props = reader.GetTableProperties()->user_collected_properties; diff --git a/table/format.cc b/table/format.cc index ccc345f8e..956c5c47c 100644 --- a/table/format.cc +++ b/table/format.cc @@ -17,6 +17,7 @@ #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" #include "util/perf_context_imp.h" #include "util/string_util.h" #include "util/xxhash.h" @@ -210,7 +211,7 @@ std::string Footer::ToString() const { return result; } -Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, +Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number) { if (file_size < Footer::kMinEncodedLength) { return Status::Corruption("file is too short to be an sstable"); @@ -249,9 +250,9 @@ namespace { // Read a block and check its CRC // contents is the result of reading. // According to the implementation of file->Read, contents may not point to buf -Status ReadBlock(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - Slice* contents, /* result of reading */ char* buf) { +Status ReadBlock(RandomAccessFileReader* file, const Footer& footer, + const ReadOptions& options, const BlockHandle& handle, + Slice* contents, /* result of reading */ char* buf) { size_t n = static_cast(handle.size()); Status s; @@ -299,7 +300,7 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, } // namespace -Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, +Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, BlockContents* contents, Env* env, bool decompression_requested) { diff --git a/table/format.h b/table/format.h index 5f1a3dc96..74ec808c6 100644 --- a/table/format.h +++ b/table/format.h @@ -166,7 +166,7 @@ class Footer { // Read the footer from file // If enforce_table_magic_number != 0, ReadFooterFromFile() will return // corruption if table_magic number is not equal to enforce_table_magic_number -Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, +Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number = 0); @@ -205,7 +205,8 @@ struct BlockContents { // Read the block identified by "handle" from "file". On failure // return non-OK. On success fill *result and return OK. -extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, +extern Status ReadBlockContents(RandomAccessFileReader* file, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, BlockContents* contents, Env* env, diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 6fad80825..ad04b670f 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -129,9 +129,9 @@ bool NotifyCollectTableCollectorsOnFinish( return all_succeeded; } -Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, - const Footer &footer, Env *env, Logger *logger, - TableProperties **table_properties) { +Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, + const Footer& footer, Env* env, Logger* logger, + TableProperties** table_properties) { assert(table_properties); Slice v = handle_value; @@ -217,7 +217,7 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, return s; } -Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, +Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, Logger* info_log, TableProperties** properties) { // -- Read metaindex block @@ -271,7 +271,7 @@ Status FindMetaBlock(Iterator* meta_index_iter, } } -Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, +Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockHandle* block_handle) { @@ -298,7 +298,7 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } -Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, +Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockContents* contents) { diff --git a/table/meta_blocks.h b/table/meta_blocks.h index 7ac3cb063..167ba5900 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -107,26 +107,25 @@ bool NotifyCollectTableCollectorsOnFinish( // @returns a status to indicate if the operation succeeded. On success, // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. -Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, - const Footer &footer, Env *env, Logger *logger, - TableProperties **table_properties); +Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, + const Footer& footer, Env* env, Logger* logger, + TableProperties** table_properties); // Directly read the properties from the properties block of a plain table. // @returns a status to indicate if the operation succeeded. On success, // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. -Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, +Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, Logger* info_log, TableProperties** properties); - // Find the meta block from the meta index block. Status FindMetaBlock(Iterator* meta_index_iter, const std::string& meta_block_name, BlockHandle* block_handle); // Find the meta block -Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, +Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockHandle* block_handle); @@ -134,7 +133,7 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, // Read the specified meta block with name meta_block_name // from `file` and initialize `contents` with contents of this block. // Return Status::OK in case of success. -Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, +Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockContents* contents); diff --git a/table/mock_table.cc b/table/mock_table.cc index 90e2079dd..925b75cd7 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -11,6 +11,7 @@ #include "db/dbformat.h" #include "port/port.h" #include "util/coding.h" +#include "util/file_reader_writer.h" namespace rocksdb { namespace mock { @@ -45,7 +46,7 @@ MockTableFactory::MockTableFactory() : next_id_(1) {} Status MockTableFactory::NewTableReader( const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_key, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { uint32_t id = GetIDFromFile(file.get()); @@ -63,8 +64,8 @@ Status MockTableFactory::NewTableReader( TableBuilder* MockTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const { - uint32_t id = GetAndWriteNextID(file); + WritableFileWriter* file) const { + uint32_t id = GetAndWriteNextID(file->writable_file()); return new MockTableBuilder(id, &file_system_); } @@ -90,7 +91,7 @@ uint32_t MockTableFactory::GetAndWriteNextID(WritableFile* file) const { return next_id; } -uint32_t MockTableFactory::GetIDFromFile(RandomAccessFile* file) const { +uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const { char buf[4]; Slice result; file->Read(0, 4, &result, buf); diff --git a/table/mock_table.h b/table/mock_table.h index ef38575cc..fd542c965 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -140,13 +140,14 @@ class MockTableFactory : public TableFactory { MockTableFactory(); const char* Name() const override { return "MockTable"; } Status NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const InternalKeyComparator& internal_key, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const override; + const EnvOptions& env_options, + const InternalKeyComparator& internal_key, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table_reader) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const override; + WritableFileWriter* file) const override; // This function will directly create mock table instead of going through // MockTableBuilder. MockFileContents has to have a format of next_id_; diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 25e1b85bb..e16224a9d 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -26,6 +26,7 @@ #include "table/meta_blocks.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" #include "util/stop_watch.h" namespace rocksdb { @@ -35,11 +36,8 @@ namespace { // a utility that helps writing block content to the file // @offset will advance if @block_contents was successfully written. // @block_handle the block handle this particular block. -Status WriteBlock( - const Slice& block_contents, - WritableFile* file, - uint64_t* offset, - BlockHandle* block_handle) { +Status WriteBlock(const Slice& block_contents, WritableFileWriter* file, + uint64_t* offset, BlockHandle* block_handle) { block_handle->set_offset(*offset); block_handle->set_size(block_contents.size()); Status s = file->Append(block_contents); @@ -62,7 +60,7 @@ PlainTableBuilder::PlainTableBuilder( const ImmutableCFOptions& ioptions, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, uint32_t user_key_len, EncodingType encoding_type, + WritableFileWriter* file, uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, bool store_index_in_file) diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index f542d2f60..75ec3facd 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -34,10 +34,11 @@ class PlainTableBuilder: public TableBuilder { const ImmutableCFOptions& ioptions, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, uint32_t user_key_size, EncodingType encoding_type, - size_t index_sparseness, uint32_t bloom_bits_per_key, - uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, - double hash_table_ratio = 0, bool store_index_in_file = false); + WritableFileWriter* file, uint32_t user_key_size, + EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, uint32_t num_probes = 6, + size_t huge_page_tlb_size = 0, double hash_table_ratio = 0, + bool store_index_in_file = false); // REQUIRES: Either Finish() or Abandon() has been called. ~PlainTableBuilder(); @@ -82,7 +83,7 @@ class PlainTableBuilder: public TableBuilder { BloomBlockBuilder bloom_block_; std::unique_ptr index_builder_; - WritableFile* file_; + WritableFileWriter* file_; uint64_t offset_ = 0; uint32_t bloom_bits_per_key_; size_t huge_page_tlb_size_; diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index ee492fa56..b7ce3b752 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -14,12 +14,11 @@ namespace rocksdb { -Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const InternalKeyComparator& icomp, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table) const { +Status PlainTableFactory::NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& icomp, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const { return PlainTableReader::Open(ioptions, env_options, icomp, std::move(file), file_size, table, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, @@ -28,7 +27,7 @@ Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, TableBuilder* PlainTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const { + WritableFileWriter* file) const { // Ignore the skip_filters flag. PlainTable format is optimized for small // in-memory dbs. The skip_filters optimization is not useful for plain // tables diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 730e13468..e7e72bea4 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -153,14 +153,15 @@ class PlainTableFactory : public TableFactory { full_scan_mode_(options.full_scan_mode), store_index_in_file_(options.store_index_in_file) {} const char* Name() const override { return "PlainTable"; } - Status NewTableReader( - const ImmutableCFOptions& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; + Status NewTableReader(const ImmutableCFOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const override; + WritableFileWriter* file) const override; std::string GetPrintableTableOptions() const override; diff --git a/table/plain_table_key_coding.cc b/table/plain_table_key_coding.cc index 4f09b507e..5aa0fa2a5 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain_table_key_coding.cc @@ -6,8 +6,9 @@ #ifndef ROCKSDB_LITE #include "table/plain_table_key_coding.h" -#include "table/plain_table_factory.h" #include "db/dbformat.h" +#include "table/plain_table_factory.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -64,7 +65,8 @@ const char* DecodeSize(const char* offset, const char* limit, } } // namespace -Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, +Status PlainTableKeyEncoder::AppendKey(const Slice& key, + WritableFileWriter* file, uint64_t* offset, char* meta_bytes_buf, size_t* meta_bytes_buf_size) { ParsedInternalKey parsed_key; diff --git a/table/plain_table_key_coding.h b/table/plain_table_key_coding.h index 9047087ae..e1bc5291b 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain_table_key_coding.h @@ -34,7 +34,7 @@ class PlainTableKeyEncoder { // meta_bytes_buf: buffer for extra meta bytes // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated // if meta_bytes_buf is updated. - Status AppendKey(const Slice& key, WritableFile* file, uint64_t* offset, + Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset, char* meta_bytes_buf, size_t* meta_bytes_buf_size); // Return actual encoding type to be picked diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index c409204aa..529642bba 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -90,7 +90,7 @@ class PlainTableIterator : public Iterator { extern const uint64_t kPlainTableMagicNumber; PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, - unique_ptr&& file, + unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, @@ -115,7 +115,7 @@ PlainTableReader::~PlainTableReader() { Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, const int bloom_bits_per_key, diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index b4f68a0fd..64b5317aa 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -22,6 +22,7 @@ #include "table/plain_table_index.h" #include "util/arena.h" #include "util/dynamic_bloom.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -56,8 +57,8 @@ class PlainTableReader: public TableReader { static Status Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table, + unique_ptr&& file, + uint64_t file_size, unique_ptr* table, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode); @@ -83,7 +84,7 @@ class PlainTableReader: public TableReader { } PlainTableReader(const ImmutableCFOptions& ioptions, - unique_ptr&& file, + unique_ptr&& file, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, EncodingType encoding_type, uint64_t file_size, @@ -134,7 +135,7 @@ class PlainTableReader: public TableReader { Arena arena_; const ImmutableCFOptions& ioptions_; - unique_ptr file_; + unique_ptr file_; uint64_t file_size_; std::shared_ptr table_properties_; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index b4039aa74..e8636f57b 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -22,6 +22,7 @@ int main() { #include "table/plain_table_factory.h" #include "table/table_builder.h" #include "table/get_context.h" +#include "util/file_reader_writer.h" #include "util/histogram.h" #include "util/testharness.h" #include "util/testutil.h" @@ -90,11 +91,14 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::vector > int_tbl_prop_collector_factories; + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); + tb = opts.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories, CompressionType::kNoCompression, CompressionOptions(), false), - file.get()); + file_writer.get()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -119,13 +123,16 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } unique_ptr table_reader; - unique_ptr raf; if (!through_db) { + unique_ptr raf; s = env->NewRandomAccessFile(file_name, &raf, env_options); uint64_t file_size; env->GetFileSize(file_name, &file_size); - s = opts.table_factory->NewTableReader( - ioptions, env_options, ikc, std::move(raf), file_size, &table_reader); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(raf))); + s = opts.table_factory->NewTableReader(ioptions, env_options, ikc, + std::move(file_reader), file_size, + &table_reader); } Random rnd(301); diff --git a/table/table_test.cc b/table/table_test.cc index 3165a9ae4..bda6db7f9 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -347,7 +347,7 @@ class TableConstructor: public Constructor { const InternalKeyComparator& internal_comparator, const KVMap& kv_map) override { Reset(); - sink_.reset(new StringSink()); + file_writer_.reset(test::GetWritableFileWriter(new StringSink())); unique_ptr builder; std::vector> int_tbl_prop_collector_factories; @@ -355,7 +355,7 @@ class TableConstructor: public Constructor { TableBuilderOptions(ioptions, internal_comparator, &int_tbl_prop_collector_factories, options.compression, CompressionOptions(), false), - sink_.get())); + file_writer_.get())); for (const auto kv : kv_map) { if (convert_to_internal_key_) { @@ -369,17 +369,18 @@ class TableConstructor: public Constructor { EXPECT_TRUE(builder->status().ok()); } Status s = builder->Finish(); + file_writer_->Flush(); EXPECT_TRUE(s.ok()) << s.ToString(); - EXPECT_EQ(sink_->contents().size(), builder->FileSize()); + EXPECT_EQ(GetSink()->contents().size(), builder->FileSize()); // Open the table uniq_id_ = cur_uniq_id_++; - source_.reset(new StringSource(sink_->contents(), uniq_id_, - ioptions.allow_mmap_reads)); + file_reader_.reset(test::GetRandomAccessFileReader(new StringSource( + GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); return ioptions.table_factory->NewTableReader( - ioptions, soptions, internal_comparator, std::move(source_), - sink_->contents().size(), &table_reader_); + ioptions, soptions, internal_comparator, std::move(file_reader_), + GetSink()->contents().size(), &table_reader_); } virtual Iterator* NewIterator() const override { @@ -397,12 +398,11 @@ class TableConstructor: public Constructor { } virtual Status Reopen(const ImmutableCFOptions& ioptions) { - source_.reset( - new StringSource(sink_->contents(), uniq_id_, - ioptions.allow_mmap_reads)); + file_reader_.reset(test::GetRandomAccessFileReader(new StringSource( + GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); return ioptions.table_factory->NewTableReader( - ioptions, soptions, *last_internal_key_, std::move(source_), - sink_->contents().size(), &table_reader_); + ioptions, soptions, *last_internal_key_, std::move(file_reader_), + GetSink()->contents().size(), &table_reader_); } virtual TableReader* GetTableReader() { @@ -417,13 +417,17 @@ class TableConstructor: public Constructor { void Reset() { uniq_id_ = 0; table_reader_.reset(); - sink_.reset(); - source_.reset(); + file_writer_.reset(); + file_reader_.reset(); + } + + StringSink* GetSink() { + return static_cast(file_writer_->writable_file()); } uint64_t uniq_id_; - unique_ptr sink_; - unique_ptr source_; + unique_ptr file_writer_; + unique_ptr file_reader_; unique_ptr table_reader_; bool convert_to_internal_key_; @@ -1766,6 +1770,8 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(plain_table_options); StringSink sink; + unique_ptr file_writer( + test::GetWritableFileWriter(new StringSink())); Options options; const ImmutableCFOptions ioptions(options); InternalKeyComparator ikc(options.comparator); @@ -1774,7 +1780,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { std::unique_ptr builder(factory.NewTableBuilder( TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, CompressionOptions(), false), - &sink)); + file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { std::string key(8, c); @@ -1783,11 +1789,15 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { builder->Add(key, value); } ASSERT_OK(builder->Finish()); + file_writer->Flush(); - StringSource source(sink.contents(), 72242, true); + StringSink* ss = static_cast(file_writer->writable_file()); + unique_ptr file_reader( + test::GetRandomAccessFileReader( + new StringSource(ss->contents(), 72242, true))); TableProperties* props = nullptr; - auto s = ReadTableProperties(&source, sink.contents().size(), + auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), kPlainTableMagicNumber, Env::Default(), nullptr, &props); std::unique_ptr props_guard(props); diff --git a/util/db_test_util.h b/util/db_test_util.h index 8a1066c51..b55a5ee54 100644 --- a/util/db_test_util.h +++ b/util/db_test_util.h @@ -170,6 +170,9 @@ class SpecialEnv : public EnvWrapper { void SetIOPriority(Env::IOPriority pri) override { base_->SetIOPriority(pri); } + Env::IOPriority GetIOPriority() override { + return base_->GetIOPriority(); + } }; class ManifestFile : public WritableFile { public: diff --git a/util/env_posix.cc b/util/env_posix.cc index 371dcf15b..978d3d505 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -40,7 +40,6 @@ #include "util/posix_logger.h" #include "util/random.h" #include "util/iostats_context_imp.h" -#include "util/rate_limiter.h" #include "util/sync_point.h" #include "util/thread_status_updater.h" #include "util/thread_status_util.h" @@ -74,9 +73,6 @@ #define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */ #endif -// This is only set from db_stress.cc and for testing only. -// If non-zero, kill at various points in source code with probability 1/this -int rocksdb_kill_odds = 0; namespace rocksdb { @@ -104,39 +100,6 @@ static Status IOError(const std::string& context, int err_number) { return Status::IOError(context, strerror(err_number)); } -#ifdef NDEBUG -// empty in release build -#define TEST_KILL_RANDOM(rocksdb_kill_odds) -#else - -// Kill the process with probablity 1/odds for testing. -static void TestKillRandom(int odds, const std::string& srcfile, - int srcline) { - time_t curtime = time(nullptr); - Random r((uint32_t)curtime); - - assert(odds > 0); - bool crash = r.OneIn(odds); - if (crash) { - fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); - fflush(stdout); - kill(getpid(), SIGTERM); - } -} - -// To avoid crashing always at some frequently executed codepaths (during -// kill random test), use this factor to reduce odds -#define REDUCE_ODDS 2 -#define REDUCE_ODDS2 4 - -#define TEST_KILL_RANDOM(rocksdb_kill_odds) { \ - if (rocksdb_kill_odds > 0) { \ - TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__); \ - } \ -} - -#endif - #if defined(OS_LINUX) namespace { static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) { @@ -188,7 +151,6 @@ class PosixSequentialFile: public SequentialFile { do { r = fread_unlocked(scratch, 1, n, file_); } while (r == 0 && ferror(file_) && errno == EINTR); - IOSTATS_ADD(bytes_read, r); *result = Slice(scratch, r); if (r < n) { if (feof(file_)) { @@ -252,10 +214,7 @@ class PosixRandomAccessFile: public RandomAccessFile { size_t left = n; char* ptr = scratch; while (left > 0) { - { - IOSTATS_TIMER_GUARD(read_nanos); - r = pread(fd_, ptr, left, static_cast(offset)); - } + r = pread(fd_, ptr, left, static_cast(offset)); if (r <= 0) { if (errno == EINTR) { @@ -268,7 +227,6 @@ class PosixRandomAccessFile: public RandomAccessFile { left -= r; } - IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left); *result = Slice(scratch, (r < 0) ? 0 : n - left); if (r < 0) { // An error: return a non-ok status @@ -458,7 +416,6 @@ class PosixMmapFile : public WritableFile { if (ptr == MAP_FAILED) { return Status::IOError("MMap failed on " + filename_); } - TEST_KILL_RANDOM(rocksdb_kill_odds); base_ = reinterpret_cast(ptr); @@ -482,8 +439,7 @@ class PosixMmapFile : public WritableFile { limit_(nullptr), dst_(nullptr), last_sync_(nullptr), - file_offset_(0), - pending_sync_(false) { + file_offset_(0) { #ifdef ROCKSDB_FALLOCATE_PRESENT fallocate_with_keep_size_ = options.fallocate_with_keep_size; #endif @@ -501,8 +457,6 @@ class PosixMmapFile : public WritableFile { virtual Status Append(const Slice& data) override { const char* src = data.data(); size_t left = data.size(); - TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS); - PrepareWrite(static_cast(GetFileSize()), left); while (left > 0) { assert(base_ <= dst_); assert(dst_ <= limit_); @@ -521,12 +475,10 @@ class PosixMmapFile : public WritableFile { size_t n = (left <= avail) ? left : avail; memcpy(dst_, src, n); - IOSTATS_ADD(bytes_written, n); dst_ += n; src += n; left -= n; } - TEST_KILL_RANDOM(rocksdb_kill_odds); return Status::OK(); } @@ -534,8 +486,6 @@ class PosixMmapFile : public WritableFile { Status s; size_t unused = limit_ - dst_; - TEST_KILL_RANDOM(rocksdb_kill_odds); - s = UnmapCurrentRegion(); if (!s.ok()) { s = IOError(filename_, errno); @@ -546,8 +496,6 @@ class PosixMmapFile : public WritableFile { } } - TEST_KILL_RANDOM(rocksdb_kill_odds); - if (close(fd_) < 0) { if (s.ok()) { s = IOError(filename_, errno); @@ -561,22 +509,15 @@ class PosixMmapFile : public WritableFile { } virtual Status Flush() override { - TEST_KILL_RANDOM(rocksdb_kill_odds); return Status::OK(); } virtual Status Sync() override { Status s; - if (pending_sync_) { - // Some unmapped data was not synced - TEST_KILL_RANDOM(rocksdb_kill_odds); - pending_sync_ = false; if (fdatasync(fd_) < 0) { s = IOError(filename_, errno); } - TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS); - } if (dst_ > last_sync_) { // Find the beginnings of the pages that contain the first and last @@ -588,7 +529,6 @@ class PosixMmapFile : public WritableFile { if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { s = IOError(filename_, errno); } - TEST_KILL_RANDOM(rocksdb_kill_odds); } return s; @@ -598,15 +538,10 @@ class PosixMmapFile : public WritableFile { * Flush data as well as metadata to stable storage. */ virtual Status Fsync() override { - if (pending_sync_) { // Some unmapped data was not synced - TEST_KILL_RANDOM(rocksdb_kill_odds); - pending_sync_ = false; if (fsync(fd_) < 0) { return IOError(filename_, errno); } - TEST_KILL_RANDOM(rocksdb_kill_odds); - } // This invocation to Sync will not issue the call to // fdatasync because pending_sync_ has already been cleared. return Sync(); @@ -638,7 +573,6 @@ class PosixMmapFile : public WritableFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) override { TEST_KILL_RANDOM(rocksdb_kill_odds); - IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = fallocate( fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); if (alloc_status == 0) { @@ -655,33 +589,14 @@ class PosixWritableFile : public WritableFile { private: const std::string filename_; int fd_; - size_t cursize_; // current size of cached data in buf_ - size_t capacity_; // max size of buf_ - unique_ptr buf_; // a buffer to cache writes uint64_t filesize_; - bool pending_sync_; - bool pending_fsync_; - uint64_t last_sync_size_; - uint64_t bytes_per_sync_; #ifdef ROCKSDB_FALLOCATE_PRESENT bool fallocate_with_keep_size_; #endif - RateLimiter* rate_limiter_; public: - PosixWritableFile(const std::string& fname, int fd, size_t capacity, - const EnvOptions& options) - : filename_(fname), - fd_(fd), - cursize_(0), - capacity_(capacity), - buf_(new char[capacity]), - filesize_(0), - pending_sync_(false), - pending_fsync_(false), - last_sync_size_(0), - bytes_per_sync_(options.bytes_per_sync), - rate_limiter_(options.rate_limiter) { + PosixWritableFile(const std::string& fname, int fd, const EnvOptions& options) + : filename_(fname), fd_(fd), filesize_(0) { #ifdef ROCKSDB_FALLOCATE_PRESENT fallocate_with_keep_size_ = options.fallocate_with_keep_size; #endif @@ -698,64 +613,23 @@ class PosixWritableFile : public WritableFile { const char* src = data.data(); size_t left = data.size(); Status s; - pending_sync_ = true; - pending_fsync_ = true; - - TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); - - PrepareWrite(static_cast(GetFileSize()), left); - // if there is no space in the cache, then flush - if (cursize_ + left > capacity_) { - s = Flush(); - if (!s.ok()) { - return s; - } - // Increase the buffer size, but capped at 1MB - if (capacity_ < (1<<20)) { - capacity_ *= 2; - buf_.reset(new char[capacity_]); - } - assert(cursize_ == 0); - } - - // if the write fits into the cache, then write to cache - // otherwise do a write() syscall to write to OS buffers. - if (cursize_ + left <= capacity_) { - memcpy(buf_.get()+cursize_, src, left); - cursize_ += left; - } else { while (left != 0) { - ssize_t done; - size_t size = RequestToken(left); - { - IOSTATS_TIMER_GUARD(write_nanos); - done = write(fd_, src, size); - } + ssize_t done = write(fd_, src, left); if (done < 0) { if (errno == EINTR) { continue; } return IOError(filename_, errno); } - IOSTATS_ADD(bytes_written, done); - TEST_KILL_RANDOM(rocksdb_kill_odds); - left -= done; src += done; } - } - filesize_ += data.size(); + filesize_ += data.size(); return Status::OK(); } virtual Status Close() override { Status s; - s = Flush(); // flush cache to OS - if (!s.ok()) { - return s; - } - - TEST_KILL_RANDOM(rocksdb_kill_odds); size_t block_size; size_t last_allocated_block; @@ -793,68 +667,20 @@ class PosixWritableFile : public WritableFile { // write out the cached data to the OS cache virtual Status Flush() override { - TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); - size_t left = cursize_; - char* src = buf_.get(); - while (left != 0) { - ssize_t done; - size_t size = RequestToken(left); - { - IOSTATS_TIMER_GUARD(write_nanos); - done = write(fd_, src, size); - } - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError(filename_, errno); - } - IOSTATS_ADD(bytes_written, done); - TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); - left -= done; - src += done; - } - cursize_ = 0; - - // sync OS cache to disk for every bytes_per_sync_ - // TODO: give log file and sst file different options (log - // files could be potentially cached in OS for their whole - // life time, thus we might not want to flush at all). - if (bytes_per_sync_ && - filesize_ - last_sync_size_ >= bytes_per_sync_) { - RangeSync(last_sync_size_, filesize_ - last_sync_size_); - last_sync_size_ = filesize_; - } - return Status::OK(); } virtual Status Sync() override { - Status s = Flush(); - if (!s.ok()) { - return s; - } - TEST_KILL_RANDOM(rocksdb_kill_odds); - if (pending_sync_ && fdatasync(fd_) < 0) { + if (fdatasync(fd_) < 0) { return IOError(filename_, errno); } - TEST_KILL_RANDOM(rocksdb_kill_odds); - pending_sync_ = false; return Status::OK(); } virtual Status Fsync() override { - Status s = Flush(); - if (!s.ok()) { - return s; - } - TEST_KILL_RANDOM(rocksdb_kill_odds); - if (pending_fsync_ && fsync(fd_) < 0) { + if (fsync(fd_) < 0) { return IOError(filename_, errno); } - TEST_KILL_RANDOM(rocksdb_kill_odds); - pending_fsync_ = false; - pending_sync_ = false; return Status::OK(); } @@ -876,8 +702,8 @@ class PosixWritableFile : public WritableFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) override { TEST_KILL_RANDOM(rocksdb_kill_odds); - int alloc_status; IOSTATS_TIMER_GUARD(allocate_nanos); + int alloc_status; alloc_status = fallocate( fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); if (alloc_status == 0) { @@ -888,7 +714,6 @@ class PosixWritableFile : public WritableFile { } virtual Status RangeSync(off_t offset, off_t nbytes) override { - IOSTATS_TIMER_GUARD(range_sync_nanos); if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) { return Status::OK(); } else { @@ -899,34 +724,19 @@ class PosixWritableFile : public WritableFile { return GetUniqueIdFromFile(fd_, id, max_size); } #endif - - private: - inline size_t RequestToken(size_t bytes) { - if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) { - bytes = std::min(bytes, - static_cast(rate_limiter_->GetSingleBurstBytes())); - rate_limiter_->Request(bytes, io_priority_); - } - return bytes; - } }; class PosixRandomRWFile : public RandomRWFile { private: const std::string filename_; int fd_; - bool pending_sync_; - bool pending_fsync_; #ifdef ROCKSDB_FALLOCATE_PRESENT bool fallocate_with_keep_size_; #endif public: PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options) - : filename_(fname), - fd_(fd), - pending_sync_(false), - pending_fsync_(false) { + : filename_(fname), fd_(fd) { #ifdef ROCKSDB_FALLOCATE_PRESENT fallocate_with_keep_size_ = options.fallocate_with_keep_size; #endif @@ -943,22 +753,15 @@ class PosixRandomRWFile : public RandomRWFile { const char* src = data.data(); size_t left = data.size(); Status s; - pending_sync_ = true; - pending_fsync_ = true; while (left != 0) { - ssize_t done; - { - IOSTATS_TIMER_GUARD(write_nanos); - done = pwrite(fd_, src, left, offset); - } + ssize_t done = pwrite(fd_, src, left, offset); if (done < 0) { if (errno == EINTR) { continue; } return IOError(filename_, errno); } - IOSTATS_ADD(bytes_written, done); left -= done; src += done; @@ -975,11 +778,7 @@ class PosixRandomRWFile : public RandomRWFile { size_t left = n; char* ptr = scratch; while (left > 0) { - { - IOSTATS_TIMER_GUARD(read_nanos); - r = pread(fd_, ptr, left, static_cast(offset)); - } - + r = pread(fd_, ptr, left, static_cast(offset)); if (r <= 0) { if (errno == EINTR) { continue; @@ -990,7 +789,6 @@ class PosixRandomRWFile : public RandomRWFile { offset += r; left -= r; } - IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left); *result = Slice(scratch, (r < 0) ? 0 : n - left); if (r < 0) { s = IOError(filename_, errno); @@ -1008,25 +806,21 @@ class PosixRandomRWFile : public RandomRWFile { } virtual Status Sync() override { - if (pending_sync_ && fdatasync(fd_) < 0) { + if (fdatasync(fd_) < 0) { return IOError(filename_, errno); } - pending_sync_ = false; return Status::OK(); } virtual Status Fsync() override { - if (pending_fsync_ && fsync(fd_) < 0) { + if (fsync(fd_) < 0) { return IOError(filename_, errno); } - pending_fsync_ = false; - pending_sync_ = false; return Status::OK(); } #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) override { - TEST_KILL_RANDOM(rocksdb_kill_odds); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = fallocate( fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, offset, len); @@ -1216,9 +1010,7 @@ class PosixEnv : public Env { EnvOptions no_mmap_writes_options = options; no_mmap_writes_options.use_mmap_writes = false; - result->reset( - new PosixWritableFile(fname, fd, 65536, no_mmap_writes_options) - ); + result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options)); } } return s; diff --git a/util/env_test.cc b/util/env_test.cc index baeab1df0..1ae50fdbc 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -664,6 +664,7 @@ TEST_F(EnvPosixTest, AllocateTest) { size_t kPageSize = 4096; std::string data(1024 * 1024, 'a'); wfile->SetPreallocationBlockSize(kPreallocateSize); + wfile->PrepareWrite(wfile->GetFileSize(), data.size()); ASSERT_OK(wfile->Append(Slice(data))); ASSERT_OK(wfile->Flush()); @@ -974,18 +975,22 @@ TEST_F(EnvPosixTest, Preallocation) { ASSERT_EQ(last_allocated_block, 0UL); // Small write should preallocate one block - srcfile->Append("test"); + std::string str = "test"; + srcfile->PrepareWrite(srcfile->GetFileSize(), str.size()); + srcfile->Append(str); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); ASSERT_EQ(last_allocated_block, 1UL); // Write an entire preallocation block, make sure we increased by two. std::string buf(block_size, ' '); + srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size()); srcfile->Append(buf); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); ASSERT_EQ(last_allocated_block, 2UL); // Write five more blocks at once, ensure we're where we need to be. buf = std::string(block_size * 5, ' '); + srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size()); srcfile->Append(buf); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); ASSERT_EQ(last_allocated_block, 7UL); diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc new file mode 100644 index 000000000..ebd54c7eb --- /dev/null +++ b/util/file_reader_writer.cc @@ -0,0 +1,225 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/file_reader_writer.h" + +#include +#include "port/port.h" +#include "util/iostats_context_imp.h" +#include "util/random.h" +#include "util/rate_limiter.h" +#include "util/sync_point.h" + +namespace rocksdb { +Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { + Status s = file_->Read(n, result, scratch); + IOSTATS_ADD(bytes_read, result->size()); + return s; +} + +Status SequentialFileReader::Skip(uint64_t n) { return file_->Skip(n); } + +Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + IOSTATS_TIMER_GUARD(read_nanos); + Status s = file_->Read(offset, n, result, scratch); + IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size()); + return s; +} + +Status WritableFileWriter::Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + Status s; + pending_sync_ = true; + pending_fsync_ = true; + + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + + writable_file_->PrepareWrite(static_cast(GetFileSize()), left); + // if there is no space in the cache, then flush + if (cursize_ + left > capacity_) { + s = Flush(); + if (!s.ok()) { + return s; + } + // Increase the buffer size, but capped at 1MB + if (capacity_ < (1 << 20)) { + capacity_ *= 2; + buf_.reset(new char[capacity_]); + } + assert(cursize_ == 0); + } + + // if the write fits into the cache, then write to cache + // otherwise do a write() syscall to write to OS buffers. + if (cursize_ + left <= capacity_) { + memcpy(buf_.get() + cursize_, src, left); + cursize_ += left; + } else { + while (left != 0) { + size_t size = RequestToken(left); + { + IOSTATS_TIMER_GUARD(write_nanos); + s = writable_file_->Append(Slice(src, size)); + if (!s.ok()) { + return s; + } + } + IOSTATS_ADD(bytes_written, size); + TEST_KILL_RANDOM(rocksdb_kill_odds); + + left -= size; + src += size; + } + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + filesize_ += data.size(); + return Status::OK(); +} + +Status WritableFileWriter::Close() { + Status s; + s = Flush(); // flush cache to OS + if (!s.ok()) { + return s; + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + return writable_file_->Close(); +} + +// write out the cached data to the OS cache +Status WritableFileWriter::Flush() { + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + size_t left = cursize_; + char* src = buf_.get(); + while (left != 0) { + size_t size = RequestToken(left); + { + IOSTATS_TIMER_GUARD(write_nanos); + Status s = writable_file_->Append(Slice(src, size)); + if (!s.ok()) { + return s; + } + } + IOSTATS_ADD(bytes_written, size); + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + left -= size; + src += size; + } + cursize_ = 0; + + writable_file_->Flush(); + + // sync OS cache to disk for every bytes_per_sync_ + // TODO: give log file and sst file different options (log + // files could be potentially cached in OS for their whole + // life time, thus we might not want to flush at all). + if (bytes_per_sync_ && filesize_ - last_sync_size_ >= bytes_per_sync_) { + writable_file_->RangeSync(last_sync_size_, filesize_ - last_sync_size_); + last_sync_size_ = filesize_; + } + + return Status::OK(); +} + +Status WritableFileWriter::Sync(bool use_fsync) { + Status s = Flush(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (pending_sync_) { + if (use_fsync) { + s = writable_file_->Fsync(); + } else { + s = writable_file_->Sync(); + } + if (!s.ok()) { + return s; + } + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_sync_ = false; + if (use_fsync) { + pending_fsync_ = false; + } + return Status::OK(); +} + +Status WritableFileWriter::RangeSync(off_t offset, off_t nbytes) { + IOSTATS_TIMER_GUARD(range_sync_nanos); + return writable_file_->RangeSync(offset, nbytes); +} + +size_t WritableFileWriter::RequestToken(size_t bytes) { + Env::IOPriority io_priority; + if (rate_limiter_&&(io_priority = writable_file_->GetIOPriority()) < + Env::IO_TOTAL) { + bytes = std::min(bytes, + static_cast(rate_limiter_->GetSingleBurstBytes())); + rate_limiter_->Request(bytes, io_priority); + } + return bytes; +} + +Status RandomRWFileAccessor::Write(uint64_t offset, const Slice& data) { + Status s; + pending_sync_ = true; + pending_fsync_ = true; + + { + IOSTATS_TIMER_GUARD(write_nanos); + s = random_rw_file_->Write(offset, data); + if (!s.ok()) { + return s; + } + } + IOSTATS_ADD(bytes_written, data.size()); + + return s; +} + +Status RandomRWFileAccessor::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + { + IOSTATS_TIMER_GUARD(read_nanos); + s = random_rw_file_->Read(offset, n, result, scratch); + if (!s.ok()) { + return s; + } + } + IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size()); + return s; +} + +Status RandomRWFileAccessor::Close() { return random_rw_file_->Close(); } + +Status RandomRWFileAccessor::Sync(bool use_fsync) { + Status s; + if (pending_sync_) { + if (use_fsync) { + s = random_rw_file_->Fsync(); + } else { + s = random_rw_file_->Sync(); + } + if (!s.ok()) { + return s; + } + } + if (use_fsync) { + pending_fsync_ = false; + } + pending_sync_ = false; + + return s; +} +} // namespace rocksdb diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h new file mode 100644 index 000000000..396ad57ef --- /dev/null +++ b/util/file_reader_writer.h @@ -0,0 +1,109 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "rocksdb/env.h" + +namespace rocksdb { +class SequentialFileReader { + private: + std::unique_ptr file_; + + public: + explicit SequentialFileReader(std::unique_ptr&& _file) + : file_(std::move(_file)) {} + Status Read(size_t n, Slice* result, char* scratch); + + Status Skip(uint64_t n); + + SequentialFile* file() { return file_.get(); } +}; + +class RandomAccessFileReader : public RandomAccessFile { + private: + std::unique_ptr file_; + + public: + explicit RandomAccessFileReader(std::unique_ptr&& raf) + : file_(std::move(raf)) {} + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const; + + RandomAccessFile* file() { return file_.get(); } +}; + +// Use posix write to write data to a file. +class WritableFileWriter { + private: + std::unique_ptr writable_file_; + size_t cursize_; // current size of cached data in buf_ + size_t capacity_; // max size of buf_ + unique_ptr buf_; // a buffer to cache writes + uint64_t filesize_; + bool pending_sync_; + bool pending_fsync_; + uint64_t last_sync_size_; + uint64_t bytes_per_sync_; + RateLimiter* rate_limiter_; + + public: + explicit WritableFileWriter(std::unique_ptr&& file, + const EnvOptions& options) + : writable_file_(std::move(file)), + cursize_(0), + capacity_(65536), + buf_(new char[capacity_]), + filesize_(0), + pending_sync_(false), + pending_fsync_(false), + last_sync_size_(0), + bytes_per_sync_(options.bytes_per_sync), + rate_limiter_(options.rate_limiter) {} + + ~WritableFileWriter() { Flush(); } + Status Append(const Slice& data); + + Status Flush(); + + Status Close(); + + Status Sync(bool use_fsync); + + uint64_t GetFileSize() { return filesize_; } + + Status InvalidateCache(size_t offset, size_t length) { + return writable_file_->InvalidateCache(offset, length); + } + + WritableFile* writable_file() const { return writable_file_.get(); } + + private: + Status RangeSync(off_t offset, off_t nbytes); + size_t RequestToken(size_t bytes); +}; + +class RandomRWFileAccessor { + private: + std::unique_ptr random_rw_file_; + bool pending_sync_; + bool pending_fsync_; + + public: + explicit RandomRWFileAccessor(std::unique_ptr&& f) + : random_rw_file_(std::move(f)), + pending_sync_(false), + pending_fsync_(false) {} + Status Write(uint64_t offset, const Slice& data); + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const; + + Status Close(); + + Status Sync(bool use_fsync); +}; +} // namespace rocksdb diff --git a/util/file_util.cc b/util/file_util.cc index c75d59c5f..a4d528f70 100644 --- a/util/file_util.cc +++ b/util/file_util.cc @@ -8,6 +8,7 @@ #include "util/file_util.h" #include "rocksdb/env.h" #include "db/filename.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -15,8 +16,12 @@ namespace rocksdb { Status CopyFile(Env* env, const std::string& source, const std::string& destination, uint64_t size) { const EnvOptions soptions; - unique_ptr srcfile; Status s; + unique_ptr src_reader; + unique_ptr dest_writer; + + { + unique_ptr srcfile; s = env->NewSequentialFile(source, &srcfile, soptions); unique_ptr destfile; if (s.ok()) { @@ -33,6 +38,9 @@ Status CopyFile(Env* env, const std::string& source, return s; } } + src_reader.reset(new SequentialFileReader(std::move(srcfile))); + dest_writer.reset(new WritableFileWriter(std::move(destfile), soptions)); + } char buffer[4096]; Slice slice; @@ -40,13 +48,13 @@ Status CopyFile(Env* env, const std::string& source, uint64_t bytes_to_read = std::min(static_cast(sizeof(buffer)), size); if (s.ok()) { - s = srcfile->Read(bytes_to_read, &slice, buffer); + s = src_reader->Read(bytes_to_read, &slice, buffer); } if (s.ok()) { if (slice.size() == 0) { return Status::Corruption("file too small"); } - s = destfile->Append(slice); + s = dest_writer->Append(slice); } if (!s.ok()) { return s; diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index bbb738f58..406b6b999 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1408,10 +1408,18 @@ class InMemoryHandler : public WriteBatch::Handler { void DumpWalFile(std::string wal_file, bool print_header, bool print_values, LDBCommandExecuteResult* exec_state) { - unique_ptr file; Env* env_ = Env::Default(); EnvOptions soptions; - Status status = env_->NewSequentialFile(wal_file, &file, soptions); + unique_ptr wal_file_reader; + + Status status; + { + unique_ptr file; + status = env_->NewSequentialFile(wal_file, &file, soptions); + if (status.ok()) { + wal_file_reader.reset(new SequentialFileReader(std::move(file))); + } + } if (!status.ok()) { if (exec_state) { *exec_state = LDBCommandExecuteResult::Failed("Failed to open WAL file " + @@ -1422,7 +1430,7 @@ void DumpWalFile(std::string wal_file, bool print_header, bool print_values, } } else { StdErrReporter reporter; - log::Reader reader(move(file), &reporter, true, 0); + log::Reader reader(move(wal_file_reader), &reporter, true, 0); string scratch; WriteBatch batch; Slice record; diff --git a/util/sst_dump_test.cc b/util/sst_dump_test.cc index 03d7299a3..fb9bde00c 100644 --- a/util/sst_dump_test.cc +++ b/util/sst_dump_test.cc @@ -13,6 +13,7 @@ #include "rocksdb/filter_policy.h" #include "table/block_based_table_factory.h" #include "table/table_builder.h" +#include "util/file_reader_writer.h" #include "util/testharness.h" #include "util/testutil.h" @@ -53,12 +54,13 @@ void createSST(const std::string& file_name, opts.table_factory = tf; std::vector > int_tbl_prop_collector_factories; - + unique_ptr file_writer( + new WritableFileWriter(std::move(file), EnvOptions())); tb.reset(opts.table_factory->NewTableBuilder( TableBuilderOptions(imoptions, ikc, &int_tbl_prop_collector_factories, CompressionType::kNoCompression, CompressionOptions(), false), - file.get())); + file_writer.get())); // Populate slightly more than 1K keys uint32_t num_keys = 1024; @@ -66,7 +68,7 @@ void createSST(const std::string& file_name, tb->Add(MakeKey(i), MakeValue(i)); } tb->Finish(); - file->Close(); + file_writer->Close(); } void cleanup(const std::string& file_name) { diff --git a/util/sst_dump_tool.cc b/util/sst_dump_tool.cc index 04486da83..00bbed28a 100644 --- a/util/sst_dump_tool.cc +++ b/util/sst_dump_tool.cc @@ -24,7 +24,6 @@ SstFileReader::SstFileReader(const std::string& file_path, output_hex_(output_hex), ioptions_(options_), internal_comparator_(BytewiseComparator()) { fprintf(stdout, "Process %s\n", file_path.c_str()); - init_result_ = GetTableReader(file_name_); } @@ -41,10 +40,13 @@ Status SstFileReader::GetTableReader(const std::string& file_path) { unique_ptr file; uint64_t file_size; - Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_); + Status s = options_.env->NewRandomAccessFile(file_path, &file, soptions_); if (s.ok()) { s = options_.env->GetFileSize(file_path, &file_size); } + + file_.reset(new RandomAccessFileReader(std::move(file))); + if (s.ok()) { s = ReadFooterFromFile(file_.get(), file_size, &footer); } @@ -56,7 +58,8 @@ Status SstFileReader::GetTableReader(const std::string& file_path) { if (magic_number == kPlainTableMagicNumber || magic_number == kLegacyPlainTableMagicNumber) { soptions_.use_mmap_reads = true; - options_.env->NewRandomAccessFile(file_path, &file_, soptions_); + options_.env->NewRandomAccessFile(file_path, &file, soptions_); + file_.reset(new RandomAccessFileReader(std::move(file))); } options_.comparator = &internal_comparator_; // For old sst format, ReadTableProperties might fail but file can be read @@ -68,16 +71,15 @@ Status SstFileReader::GetTableReader(const std::string& file_path) { } if (s.ok()) { - s = NewTableReader(ioptions_, soptions_, internal_comparator_, - std::move(file_), file_size, &table_reader_); + s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size, + &table_reader_); } return s; } Status SstFileReader::NewTableReader( const ImmutableCFOptions& ioptions, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + const InternalKeyComparator& internal_comparator, uint64_t file_size, unique_ptr* table_reader) { // We need to turn off pre-fetching of index and filter nodes for // BlockBasedTable @@ -108,7 +110,7 @@ Status SstFileReader::DumpTable(const std::string& out_filename) { } Status SstFileReader::ReadTableProperties(uint64_t table_magic_number, - RandomAccessFile* file, + RandomAccessFileReader* file, uint64_t file_size) { TableProperties* table_properties = nullptr; Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number, diff --git a/util/sst_dump_tool_imp.h b/util/sst_dump_tool_imp.h index a5f22679e..b8b866057 100644 --- a/util/sst_dump_tool_imp.h +++ b/util/sst_dump_tool_imp.h @@ -28,6 +28,7 @@ #include "table/format.h" #include "table/meta_blocks.h" #include "table/plain_table_factory.h" +#include "util/file_reader_writer.h" #include "util/ldb_cmd.h" #include "util/random.h" #include "util/testharness.h" @@ -56,7 +57,7 @@ class SstFileReader { // Get the TableReader implementation for the sst file Status GetTableReader(const std::string& file_path); Status ReadTableProperties(uint64_t table_magic_number, - RandomAccessFile* file, uint64_t file_size); + RandomAccessFileReader* file, uint64_t file_size); Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); Status SetOldTableOptions(); @@ -65,7 +66,7 @@ class SstFileReader { Status NewTableReader(const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + uint64_t file_size, unique_ptr* table_reader); std::string file_name_; @@ -76,7 +77,7 @@ class SstFileReader { Status init_result_; unique_ptr table_reader_; - unique_ptr file_; + unique_ptr file_; // options_ and internal_comparator_ will also be used in // ReadSequential internally (specifically, seek-related operations) Options options_; diff --git a/util/sync_point.cc b/util/sync_point.cc index 3c224bfac..7051b5103 100644 --- a/util/sync_point.cc +++ b/util/sync_point.cc @@ -4,10 +4,25 @@ // of patent rights can be found in the PATENTS file in the same directory. #include "util/sync_point.h" +#include "port/port.h" +#include "util/random.h" + +int rocksdb_kill_odds = 0; #ifndef NDEBUG namespace rocksdb { +void TestKillRandom(int odds, const std::string& srcfile, int srcline) { + time_t curtime = time(nullptr); + Random r((uint32_t)curtime); + + assert(odds > 0); + bool crash = r.OneIn(odds); + if (crash) { + port::Crash(srcfile, srcline); + } +} + SyncPoint* SyncPoint::GetInstance() { static SyncPoint sync_point; return &sync_point; diff --git a/util/sync_point.h b/util/sync_point.h index 7827d286f..6a4629cb3 100644 --- a/util/sync_point.h +++ b/util/sync_point.h @@ -4,6 +4,7 @@ // of patent rights can be found in the PATENTS file in the same directory. #pragma once +#include #include #include #include @@ -11,6 +12,33 @@ #include #include +// This is only set from db_stress.cc and for testing only. +// If non-zero, kill at various points in source code with probability 1/this +extern int rocksdb_kill_odds; + +#ifdef NDEBUG +// empty in release build +#define TEST_KILL_RANDOM(rocksdb_kill_odds) +#else + +namespace rocksdb { +// Kill the process with probablity 1/odds for testing. +extern void TestKillRandom(int odds, const std::string& srcfile, int srcline); + +// To avoid crashing always at some frequently executed codepaths (during +// kill random test), use this factor to reduce odds +#define REDUCE_ODDS 2 +#define REDUCE_ODDS2 4 + +#define TEST_KILL_RANDOM(rocksdb_kill_odds) \ + { \ + if (rocksdb_kill_odds > 0) { \ + TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__); \ + } \ + } +} // namespace rocksdb +#endif + #ifdef NDEBUG #define TEST_SYNC_POINT(x) #define TEST_SYNC_POINT_CALLBACK(x, y) diff --git a/util/testutil.cc b/util/testutil.cc index 20f22c2dc..ebe7a308b 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -10,6 +10,7 @@ #include "util/testutil.h" #include "port/port.h" +#include "util/file_reader_writer.h" #include "util/random.h" namespace rocksdb { @@ -107,5 +108,20 @@ const Comparator* Uint64Comparator() { return uint64comp; } +WritableFileWriter* GetWritableFileWriter(WritableFile* wf) { + unique_ptr file(wf); + return new WritableFileWriter(std::move(file), EnvOptions()); +} + +RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) { + unique_ptr file(raf); + return new RandomAccessFileReader(std::move(file)); +} + +SequentialFileReader* GetSequentialFileReader(SequentialFile* se) { + unique_ptr file(se); + return new SequentialFileReader(std::move(file)); +} + } // namespace test } // namespace rocksdb diff --git a/util/testutil.h b/util/testutil.h index 240a468ae..24eceddfd 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -19,6 +19,9 @@ #include "util/random.h" namespace rocksdb { +class SequentialFile; +class SequentialFileReader; + namespace test { // Store in *dst a random string of length "len" and return a Slice that @@ -159,6 +162,11 @@ class VectorIterator : public Iterator { std::vector values_; size_t current_; }; +extern WritableFileWriter* GetWritableFileWriter(WritableFile* wf); + +extern RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf); + +extern SequentialFileReader* GetSequentialFileReader(SequentialFile* se); } // namespace test } // namespace rocksdb diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 6b8515159..c25cec936 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -14,6 +14,7 @@ #include "util/channel.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" #include "util/logging.h" #include "util/string_util.h" #include "rocksdb/transaction_log.h" @@ -1105,7 +1106,10 @@ Status BackupEngineImpl::GetLatestBackupFileContents(uint32_t* latest_backup) { char buf[11]; Slice data; - s = file->Read(10, &data, buf); + unique_ptr file_reader( + new SequentialFileReader(std::move(file))); + + s = file_reader->Read(10, &data, buf); if (!s.ok() || data.size() == 0) { return s.ok() ? Status::Corruption("Latest backup file corrupted") : s; } @@ -1137,14 +1141,16 @@ Status BackupEngineImpl::PutLatestBackupFileContents(uint32_t latest_backup) { return s; } + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); char file_contents[10]; int len = sprintf(file_contents, "%u\n", latest_backup); - s = file->Append(Slice(file_contents, len)); + s = file_writer->Append(Slice(file_contents, len)); if (s.ok() && options_.sync) { - file->Sync(); + file_writer->Sync(false); } if (s.ok()) { - s = file->Close(); + s = file_writer->Close(); } if (s.ok()) { // atomically replace real file with new tmp @@ -1187,6 +1193,10 @@ Status BackupEngineImpl::CopyFile( return s; } + unique_ptr dest_writer( + new WritableFileWriter(std::move(dst_file), env_options)); + unique_ptr src_reader( + new SequentialFileReader(std::move(src_file))); unique_ptr buf(new char[copy_file_buffer_size_]); Slice data; @@ -1196,7 +1206,7 @@ Status BackupEngineImpl::CopyFile( } size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? copy_file_buffer_size_ : size_limit; - s = src_file->Read(buffer_to_read, &data, buf.get()); + s = src_reader->Read(buffer_to_read, &data, buf.get()); size_limit -= data.size(); if (!s.ok()) { @@ -1210,14 +1220,14 @@ Status BackupEngineImpl::CopyFile( *checksum_value = crc32c::Extend(*checksum_value, data.data(), data.size()); } - s = dst_file->Append(data); + s = dest_writer->Append(data); if (rate_limiter != nullptr) { rate_limiter->ReportAndWait(data.size()); } } while (s.ok() && data.size() > 0 && size_limit > 0); if (s.ok() && sync) { - s = dst_file->Sync(); + s = dest_writer->Sync(false); } return s; @@ -1358,6 +1368,8 @@ Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env, return s; } + unique_ptr src_reader( + new SequentialFileReader(std::move(src_file))); std::unique_ptr buf(new char[copy_file_buffer_size_]); Slice data; @@ -1367,7 +1379,7 @@ Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env, } size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? copy_file_buffer_size_ : size_limit; - s = src_file->Read(buffer_to_read, &data, buf.get()); + s = src_reader->Read(buffer_to_read, &data, buf.get()); if (!s.ok()) { return s; @@ -1522,9 +1534,11 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( return s; } + unique_ptr backup_meta_reader( + new SequentialFileReader(std::move(backup_meta_file))); unique_ptr buf(new char[max_backup_meta_file_size_ + 1]); Slice data; - s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get()); + s = backup_meta_reader->Read(max_backup_meta_file_size_, &data, buf.get()); if (!s.ok() || data.size() == max_backup_meta_file_size_) { return s.ok() ? Status::Corruption("File size too big") : s; diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 985f1f472..89d1b6208 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -16,6 +16,7 @@ #include "rocksdb/types.h" #include "rocksdb/transaction_log.h" #include "rocksdb/utilities/backupable_db.h" +#include "util/file_reader_writer.h" #include "util/testharness.h" #include "util/random.h" #include "util/mutexlock.h" @@ -292,11 +293,12 @@ class FileManager : public EnvWrapper { if (!s.ok()) { return s; } - + RandomRWFileAccessor accessor(std::move(file)); for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) { std::string tmp; // write one random byte to a random position - s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp)); + s = accessor.Write(rnd_.Next() % size, + test::RandomString(&rnd_, 1, &tmp)); } return s; }