diff --git a/db/builder.cc b/db/builder.cc index a04075ba6..2c3d6842d 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -21,6 +21,7 @@ #include "rocksdb/options.h" #include "rocksdb/table.h" #include "table/block_based_table_builder.h" +#include "util/file_reader_writer.h" #include "util/iostats_context_imp.h" #include "util/thread_status_util.h" #include "util/stop_watch.h" @@ -34,7 +35,7 @@ TableBuilder* NewTableBuilder( const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, const CompressionType compression_type, + WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, const bool skip_filters) { return ioptions.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, internal_comparator, @@ -72,16 +73,22 @@ Status BuildTable( std::string fname = TableFileName(ioptions.db_paths, meta->fd.GetNumber(), meta->fd.GetPathId()); if (iter->Valid()) { - unique_ptr file; - s = env->NewWritableFile(fname, &file, env_options); - if (!s.ok()) { - return s; - } - file->SetIOPriority(io_priority); + TableBuilder* builder; + unique_ptr file_writer; + { + unique_ptr file; + s = env->NewWritableFile(fname, &file, env_options); + if (!s.ok()) { + return s; + } + file->SetIOPriority(io_priority); - TableBuilder* builder = NewTableBuilder( - ioptions, internal_comparator, int_tbl_prop_collector_factories, - file.get(), compression, compression_opts); + file_writer.reset(new WritableFileWriter(std::move(file), env_options)); + + builder = NewTableBuilder( + ioptions, internal_comparator, int_tbl_prop_collector_factories, + file_writer.get(), compression, compression_opts); + } { // the first key is the smallest key @@ -232,16 +239,11 @@ Status BuildTable( // Finish and check for file errors if (s.ok() && !ioptions.disable_data_sync) { - if (ioptions.use_fsync) { - StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); - s = file->Fsync(); - } else { - 
StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); - s = file->Sync(); - } + StopWatch sw(env, ioptions.statistics, TABLE_SYNC_MICROS); + file_writer->Sync(ioptions.use_fsync); } if (s.ok()) { - s = file->Close(); + s = file_writer->Close(); } if (s.ok()) { diff --git a/db/builder.h b/db/builder.h index 9d2888dee..e1f625c8b 100644 --- a/db/builder.h +++ b/db/builder.h @@ -29,14 +29,14 @@ class Iterator; class TableCache; class VersionEdit; class TableBuilder; -class WritableFile; +class WritableFileWriter; TableBuilder* NewTableBuilder( const ImmutableCFOptions& options, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, const CompressionType compression_type, + WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, const bool skip_filters = false); diff --git a/db/compaction_job.cc b/db/compaction_job.cc index 0edf984d5..1f924e9bb 100644 --- a/db/compaction_job.cc +++ b/db/compaction_job.cc @@ -44,6 +44,7 @@ #include "table/table_builder.h" #include "table/two_level_iterator.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/logging.h" #include "util/log_buffer.h" #include "util/mutexlock.h" @@ -71,7 +72,7 @@ struct CompactionJob::CompactionState { std::vector outputs; // State kept for output being generated - std::unique_ptr outfile; + std::unique_ptr outfile; std::unique_ptr builder; uint64_t total_bytes; @@ -662,13 +663,8 @@ Status CompactionJob::FinishCompactionOutputFile(const Status& input_status) { // Finish and check for file errors if (s.ok() && !db_options_.disableDataSync) { - if (db_options_.use_fsync) { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - s = compact_->outfile->Fsync(); - } else { - StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); - s = compact_->outfile->Sync(); - } + StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS); + s = 
compact_->outfile->Sync(db_options_.use_fsync); } if (s.ok()) { s = compact_->outfile->Close(); @@ -799,10 +795,10 @@ Status CompactionJob::OpenCompactionOutputFile() { // no need to lock because VersionSet::next_file_number_ is atomic uint64_t file_number = versions_->NewFileNumber(); // Make the output file + unique_ptr writable_file; std::string fname = TableFileName(db_options_.db_paths, file_number, compact_->compaction->output_path_id()); - Status s = env_->NewWritableFile(fname, &compact_->outfile, env_options_); - + Status s = env_->NewWritableFile(fname, &writable_file, env_options_); if (!s.ok()) { Log(InfoLogLevel::ERROR_LEVEL, db_options_.info_log, "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64 @@ -820,9 +816,11 @@ Status CompactionJob::OpenCompactionOutputFile() { out.smallest_seqno = out.largest_seqno = 0; compact_->outputs.push_back(out); - compact_->outfile->SetIOPriority(Env::IO_LOW); - compact_->outfile->SetPreallocationBlockSize( + writable_file->SetIOPriority(Env::IO_LOW); + writable_file->SetPreallocationBlockSize( static_cast(compact_->compaction->OutputFilePreallocationSize())); + compact_->outfile.reset( + new WritableFileWriter(std::move(writable_file), env_options_)); ColumnFamilyData* cfd = compact_->compaction->column_family_data(); bool skip_filters = false; diff --git a/db/compaction_job_test.cc b/db/compaction_job_test.cc index 40ac27e22..ab716c068 100644 --- a/db/compaction_job_test.cc +++ b/db/compaction_job_test.cc @@ -13,6 +13,7 @@ #include "rocksdb/cache.h" #include "rocksdb/options.h" #include "rocksdb/db.h" +#include "util/file_reader_writer.h" #include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" @@ -166,8 +167,10 @@ class CompactionJobTest : public testing::Test { Status s = env_->NewWritableFile( manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), EnvOptions())); { - log::Writer 
log(std::move(file)); + log::Writer log(std::move(file_writer)); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); diff --git a/db/db_impl.cc b/db/db_impl.cc index b681514d8..030f182d1 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -72,6 +72,7 @@ #include "util/compression.h" #include "util/crc32c.h" #include "util/db_info_dumper.h" +#include "util/file_reader_writer.h" #include "util/file_util.h" #include "util/hash_skiplist_rep.h" #include "util/hash_linklist_rep.h" @@ -384,18 +385,22 @@ Status DBImpl::NewDB() { new_db.SetNextFile(2); new_db.SetLastSequence(0); + Status s; + Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Creating manifest 1 \n"); const std::string manifest = DescriptorFileName(dbname_, 1); - unique_ptr file; - Status s = env_->NewWritableFile( - manifest, &file, env_->OptimizeForManifestWrite(env_options_)); - if (!s.ok()) { - return s; - } - file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); { - log::Writer log(std::move(file)); + unique_ptr file; + EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_); + s = env_->NewWritableFile(manifest, &file, env_options); + if (!s.ok()) { + return s; + } + file->SetPreallocationBlockSize(db_options_.manifest_preallocation_size); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); + log::Writer log(std::move(file_writer)); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); @@ -1013,17 +1018,21 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, versions_->MarkFileNumberUsedDuringRecovery(log_number); // Open the log file std::string fname = LogFileName(db_options_.wal_dir, log_number); - unique_ptr file; - status = env_->NewSequentialFile(fname, &file, env_options_); - if (!status.ok()) { - MaybeIgnoreError(&status); + unique_ptr file_reader; + { + unique_ptr file; + status = env_->NewSequentialFile(fname, &file, env_options_); if (!status.ok()) { - return status; - } 
else { - // Fail with one log file, but that's ok. - // Try next one. - continue; + MaybeIgnoreError(&status); + if (!status.ok()) { + return status; + } else { + // Fail with one log file, but that's ok. + // Try next one. + continue; + } } + file_reader.reset(new SequentialFileReader(std::move(file))); } // Create the log reader. @@ -1042,7 +1051,7 @@ Status DBImpl::RecoverLogFiles(const std::vector& log_numbers, // paranoid_checks==false so that corruptions cause entire commits // to be skipped instead of propagating bad information (like overly // large sequence numbers). - log::Reader reader(std::move(file), &reporter, true /*checksum*/, + log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, 0 /*initial_offset*/); Log(InfoLogLevel::INFO_LEVEL, db_options_.info_log, "Recovering log #%" PRIu64 " mode %d skip-recovery %d", log_number, @@ -3490,11 +3499,7 @@ Status DBImpl::WriteImpl(const WriteOptions& write_options, if (status.ok() && write_options.sync) { RecordTick(stats_, WAL_FILE_SYNCED); StopWatch sw(env_, stats_, WAL_FILE_SYNC_MICROS); - if (db_options_.use_fsync) { - status = log_->file()->Fsync(); - } else { - status = log_->file()->Sync(); - } + status = log_->file()->Sync(db_options_.use_fsync); if (status.ok() && !log_dir_synced_) { // We only sync WAL directory the first time WAL syncing is // requested, so that in case users never turn on WAL sync, @@ -3624,15 +3629,19 @@ Status DBImpl::SwitchMemtable(ColumnFamilyData* cfd, WriteContext* context) { Status s; { if (creating_new_log) { + EnvOptions opt_env_opt = + env_->OptimizeForLogWrite(env_options_, db_options_); s = env_->NewWritableFile( LogFileName(db_options_.wal_dir, new_log_number), &lfile, - env_->OptimizeForLogWrite(env_options_, db_options_)); + opt_env_opt); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
lfile->SetPreallocationBlockSize( 1.1 * mutable_cf_options.write_buffer_size); - new_log = new log::Writer(std::move(lfile)); + unique_ptr file_writer( + new WritableFileWriter(std::move(lfile), opt_env_opt)); + new_log = new log::Writer(std::move(file_writer)); log_dir_synced_ = false; } } @@ -4031,12 +4040,18 @@ Status DBImpl::CheckConsistency() { Status DBImpl::GetDbIdentity(std::string& identity) const { std::string idfilename = IdentityFileName(dbname_); - unique_ptr idfile; const EnvOptions soptions; - Status s = env_->NewSequentialFile(idfilename, &idfile, soptions); - if (!s.ok()) { - return s; + unique_ptr id_file_reader; + Status s; + { + unique_ptr idfile; + s = env_->NewSequentialFile(idfilename, &idfile, soptions); + if (!s.ok()) { + return s; + } + id_file_reader.reset(new SequentialFileReader(std::move(idfile))); } + uint64_t file_size; s = env_->GetFileSize(idfilename, &file_size); if (!s.ok()) { @@ -4044,7 +4059,7 @@ Status DBImpl::GetDbIdentity(std::string& identity) const { } char* buffer = reinterpret_cast(alloca(file_size)); Slice id; - s = idfile->Read(static_cast(file_size), &id, buffer); + s = id_file_reader->Read(static_cast(file_size), &id, buffer); if (!s.ok()) { return s; } @@ -4176,14 +4191,17 @@ Status DB::Open(const DBOptions& db_options, const std::string& dbname, uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; EnvOptions soptions(db_options); + EnvOptions opt_env_options = + impl->db_options_.env->OptimizeForLogWrite(soptions, impl->db_options_); s = impl->db_options_.env->NewWritableFile( LogFileName(impl->db_options_.wal_dir, new_log_number), &lfile, - impl->db_options_.env->OptimizeForLogWrite(soptions, - impl->db_options_)); + opt_env_options); if (s.ok()) { lfile->SetPreallocationBlockSize(1.1 * max_write_buffer_size); impl->logfile_number_ = new_log_number; - impl->log_.reset(new log::Writer(std::move(lfile))); + unique_ptr file_writer( + new WritableFileWriter(std::move(lfile), opt_env_options)); 
+ impl->log_.reset(new log::Writer(std::move(file_writer))); // set column family handles for (auto cf : column_families) { diff --git a/db/db_test.cc b/db/db_test.cc index c732c9fb8..82f393b85 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -52,6 +52,7 @@ #include "table/mock_table.h" #include "table/plain_table_factory.h" #include "util/db_test_util.h" +#include "util/file_reader_writer.h" #include "util/hash.h" #include "util/hash_linklist_rep.h" #include "utilities/merge_operators.h" @@ -6008,7 +6009,9 @@ class RecoveryTestHelper { std::string fname = LogFileName(test->dbname_, current_log_number); unique_ptr file; ASSERT_OK(db_options.env->NewWritableFile(fname, &file, env_options)); - current_log_writer.reset(new log::Writer(std::move(file))); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); + current_log_writer.reset(new log::Writer(std::move(file_writer))); for (int i = 0; i < kKeysPerWALFile; i++) { std::string key = "key" + ToString(count++); @@ -7231,8 +7234,7 @@ TEST_F(DBTest, RateLimitingTest) { } elapsed = env_->NowMicros() - start; Close(); - ASSERT_TRUE(options.rate_limiter->GetTotalBytesThrough() == - env_->bytes_written_); + ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); double ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; fprintf(stderr, "write rate ratio = %.2lf, expected 0.7\n", ratio); ASSERT_TRUE(ratio < 0.8); @@ -7251,11 +7253,10 @@ TEST_F(DBTest, RateLimitingTest) { } elapsed = env_->NowMicros() - start; Close(); - ASSERT_TRUE(options.rate_limiter->GetTotalBytesThrough() == - env_->bytes_written_); + ASSERT_EQ(options.rate_limiter->GetTotalBytesThrough(), env_->bytes_written_); ratio = env_->bytes_written_ * 1000000 / elapsed / raw_rate; fprintf(stderr, "write rate ratio = %.2lf, expected 0.5\n", ratio); - ASSERT_TRUE(ratio < 0.6); + ASSERT_LT(ratio, 0.6); } namespace { diff --git a/db/fault_injection_test.cc b/db/fault_injection_test.cc index 
845dff8b9..88366d055 100644 --- a/db/fault_injection_test.cc +++ b/db/fault_injection_test.cc @@ -130,7 +130,7 @@ struct FileState { } // anonymous namespace -// A wrapper around WritableFile which informs another Env whenever this file +// A wrapper around WritableFileWriter* file // is written to or sync'ed. class TestWritableFile : public WritableFile { public: @@ -197,7 +197,7 @@ class FaultInjectionTestEnv : public EnvWrapper { Status s = target()->NewWritableFile(fname, result, soptions); if (s.ok()) { result->reset(new TestWritableFile(fname, std::move(*result), this)); - // WritableFile doesn't append to files, so if the same file is opened + // WritableFileWriter* file is opened // again then it will be truncated - so forget our saved state. UntrackFile(fname); MutexLock l(&mutex_); diff --git a/db/filename.cc b/db/filename.cc index c639bee20..f445712e1 100644 --- a/db/filename.cc +++ b/db/filename.cc @@ -18,6 +18,7 @@ #include #include "db/dbformat.h" #include "rocksdb/env.h" +#include "util/file_reader_writer.h" #include "util/logging.h" #include "util/stop_watch.h" @@ -328,15 +329,13 @@ Status SetIdentityFile(Env* env, const std::string& dbname) { return s; } -Status SyncManifest(Env* env, const DBOptions* db_options, WritableFile* file) { +Status SyncManifest(Env* env, const DBOptions* db_options, + WritableFileWriter* file) { if (db_options->disableDataSync) { return Status::OK(); - } else if (db_options->use_fsync) { - StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); - return file->Fsync(); } else { StopWatch sw(env, db_options->statistics.get(), MANIFEST_FILE_SYNC_MICROS); - return file->Sync(); + return file->Sync(db_options->use_fsync); } } diff --git a/db/filename.h b/db/filename.h index 36562de93..6425eb955 100644 --- a/db/filename.h +++ b/db/filename.h @@ -25,7 +25,7 @@ namespace rocksdb { class Env; class Directory; -class WritableFile; +class WritableFileWriter; enum FileType { kLogFile, @@ -140,6 +140,6 @@ extern 
Status SetIdentityFile(Env* env, const std::string& dbname); // Sync manifest file `file`. extern Status SyncManifest(Env* env, const DBOptions* db_options, - WritableFile* file); + WritableFileWriter* file); } // namespace rocksdb diff --git a/db/flush_job_test.cc b/db/flush_job_test.cc index 75590ec65..7e8fb8f51 100644 --- a/db/flush_job_test.cc +++ b/db/flush_job_test.cc @@ -11,6 +11,7 @@ #include "db/version_set.h" #include "db/writebuffer.h" #include "rocksdb/cache.h" +#include "util/file_reader_writer.h" #include "util/string_util.h" #include "util/testharness.h" #include "util/testutil.h" @@ -56,8 +57,10 @@ class FlushJobTest : public testing::Test { Status s = env_->NewWritableFile( manifest, &file, env_->OptimizeForManifestWrite(env_options_)); ASSERT_OK(s); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), EnvOptions())); { - log::Writer log(std::move(file)); + log::Writer log(std::move(file_writer)); std::string record; new_db.EncodeTo(&record); s = log.AddRecord(record); diff --git a/db/log_reader.cc b/db/log_reader.cc index f3fdc18f3..296f1d50c 100644 --- a/db/log_reader.cc +++ b/db/log_reader.cc @@ -13,6 +13,7 @@ #include "rocksdb/env.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" namespace rocksdb { namespace log { @@ -20,7 +21,7 @@ namespace log { Reader::Reporter::~Reporter() { } -Reader::Reader(unique_ptr&& _file, Reporter* reporter, +Reader::Reader(unique_ptr&& _file, Reporter* reporter, bool checksum, uint64_t initial_offset) : file_(std::move(_file)), reporter_(reporter), diff --git a/db/log_reader.h b/db/log_reader.h index e6cbf47ac..390696b85 100644 --- a/db/log_reader.h +++ b/db/log_reader.h @@ -17,7 +17,7 @@ namespace rocksdb { -class SequentialFile; +class SequentialFileReader; using std::unique_ptr; namespace log { @@ -51,7 +51,7 @@ class Reader { // // The Reader will start reading at the first record located at physical // position >= initial_offset within the file. 
- Reader(unique_ptr&& file, Reporter* reporter, + Reader(unique_ptr&& file, Reporter* reporter, bool checksum, uint64_t initial_offset); ~Reader(); @@ -81,10 +81,10 @@ class Reader { // block that was partially read. void UnmarkEOF(); - SequentialFile* file() { return file_.get(); } + SequentialFileReader* file() { return file_.get(); } private: - const unique_ptr file_; + const unique_ptr file_; Reporter* const reporter_; bool const checksum_; char* const backing_store_; diff --git a/db/log_test.cc b/db/log_test.cc index 74715acde..168936e98 100644 --- a/db/log_test.cc +++ b/db/log_test.cc @@ -12,8 +12,10 @@ #include "rocksdb/env.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" #include "util/random.h" #include "util/testharness.h" +#include "util/testutil.h" namespace rocksdb { namespace log { @@ -163,26 +165,27 @@ class LogTest : public testing::Test { }; std::string& dest_contents() { - auto dest = dynamic_cast(writer_.file()); + auto dest = dynamic_cast(writer_.file()->writable_file()); assert(dest); return dest->contents_; } const std::string& dest_contents() const { - auto dest = dynamic_cast(writer_.file()); + auto dest = + dynamic_cast(writer_.file()->writable_file()); assert(dest); return dest->contents_; } void reset_source_contents() { - auto src = dynamic_cast(reader_.file()); + auto src = dynamic_cast(reader_.file()->file()); assert(src); src->contents_ = dest_contents(); } Slice reader_contents_; - unique_ptr dest_holder_; - unique_ptr source_holder_; + unique_ptr dest_holder_; + unique_ptr source_holder_; ReportCollector report_; Writer writer_; Reader reader_; @@ -192,13 +195,15 @@ class LogTest : public testing::Test { static uint64_t initial_offset_last_record_offsets_[]; public: - LogTest() : reader_contents_(), - dest_holder_(new StringDest(reader_contents_)), - source_holder_(new StringSource(reader_contents_)), - writer_(std::move(dest_holder_)), - reader_(std::move(source_holder_), &report_, 
true/*checksum*/, - 0/*initial_offset*/) { - } + LogTest() + : reader_contents_(), + dest_holder_( + test::GetWritableFileWriter(new StringDest(reader_contents_))), + source_holder_( + test::GetSequentialFileReader(new StringSource(reader_contents_))), + writer_(std::move(dest_holder_)), + reader_(std::move(source_holder_), &report_, true /*checksum*/, + 0 /*initial_offset*/) {} void Write(const std::string& msg) { writer_.AddRecord(Slice(msg)); @@ -227,7 +232,7 @@ class LogTest : public testing::Test { } void ShrinkSize(int bytes) { - auto dest = dynamic_cast(writer_.file()); + auto dest = dynamic_cast(writer_.file()->writable_file()); assert(dest); dest->Drop(bytes); } @@ -240,7 +245,7 @@ class LogTest : public testing::Test { } void ForceError(size_t position = 0) { - auto src = dynamic_cast(reader_.file()); + auto src = dynamic_cast(reader_.file()->file()); src->force_error_ = true; src->force_error_position_ = position; } @@ -254,13 +259,13 @@ class LogTest : public testing::Test { } void ForceEOF(size_t position = 0) { - auto src = dynamic_cast(reader_.file()); + auto src = dynamic_cast(reader_.file()->file()); src->force_eof_ = true; src->force_eof_position_ = position; } void UnmarkEOF() { - auto src = dynamic_cast(reader_.file()); + auto src = dynamic_cast(reader_.file()->file()); src->returned_partial_ = false; reader_.UnmarkEOF(); } @@ -288,10 +293,11 @@ class LogTest : public testing::Test { void CheckOffsetPastEndReturnsNoRecords(uint64_t offset_past_end) { WriteInitialOffsetLog(); - unique_ptr source(new StringSource(reader_contents_)); + unique_ptr file_reader( + test::GetSequentialFileReader(new StringSource(reader_contents_))); unique_ptr offset_reader( - new Reader(std::move(source), &report_, true/*checksum*/, - WrittenBytes() + offset_past_end)); + new Reader(std::move(file_reader), &report_, true /*checksum*/, + WrittenBytes() + offset_past_end)); Slice record; std::string scratch; ASSERT_TRUE(!offset_reader->ReadRecord(&record, &scratch)); @@ 
-300,10 +306,10 @@ class LogTest : public testing::Test { void CheckInitialOffsetRecord(uint64_t initial_offset, int expected_record_offset) { WriteInitialOffsetLog(); - unique_ptr source(new StringSource(reader_contents_)); - unique_ptr offset_reader( - new Reader(std::move(source), &report_, true/*checksum*/, - initial_offset)); + unique_ptr file_reader( + test::GetSequentialFileReader(new StringSource(reader_contents_))); + unique_ptr offset_reader(new Reader( + std::move(file_reader), &report_, true /*checksum*/, initial_offset)); Slice record; std::string scratch; ASSERT_TRUE(offset_reader->ReadRecord(&record, &scratch)); diff --git a/db/log_writer.cc b/db/log_writer.cc index d78de5e7b..32d4afdc9 100644 --- a/db/log_writer.cc +++ b/db/log_writer.cc @@ -13,13 +13,13 @@ #include "rocksdb/env.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" namespace rocksdb { namespace log { -Writer::Writer(unique_ptr&& dest) - : dest_(std::move(dest)), - block_offset_(0) { +Writer::Writer(unique_ptr&& dest) + : dest_(std::move(dest)), block_offset_(0) { for (int i = 0; i <= kMaxRecordType; i++) { char t = static_cast(i); type_crc_[i] = crc32c::Value(&t, 1); diff --git a/db/log_writer.h b/db/log_writer.h index 46226ec27..8f0728c69 100644 --- a/db/log_writer.h +++ b/db/log_writer.h @@ -16,7 +16,7 @@ namespace rocksdb { -class WritableFile; +class WritableFileWriter; using std::unique_ptr; @@ -61,16 +61,16 @@ class Writer { // Create a writer that will append data to "*dest". // "*dest" must be initially empty. // "*dest" must remain live while this Writer is in use. 
- explicit Writer(unique_ptr&& dest); + explicit Writer(unique_ptr&& dest); ~Writer(); Status AddRecord(const Slice& slice); - WritableFile* file() { return dest_.get(); } - const WritableFile* file() const { return dest_.get(); } + WritableFileWriter* file() { return dest_.get(); } + const WritableFileWriter* file() const { return dest_.get(); } private: - unique_ptr dest_; + unique_ptr dest_; int block_offset_; // Current offset in block // crc32c values for all supported record types. These are diff --git a/db/plain_table_db_test.cc b/db/plain_table_db_test.cc index 9a97e801c..fa6afa916 100644 --- a/db/plain_table_db_test.cc +++ b/db/plain_table_db_test.cc @@ -198,10 +198,9 @@ class TestPlainTableReader : public PlainTableReader { int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, const TableProperties* table_properties, - unique_ptr&& file, + unique_ptr&& file, const ImmutableCFOptions& ioptions, - bool* expect_bloom_not_match, - bool store_index_in_file) + bool* expect_bloom_not_match, bool store_index_in_file) : PlainTableReader(ioptions, std::move(file), env_options, icomparator, encoding_type, file_size, table_properties), expect_bloom_not_match_(expect_bloom_not_match) { @@ -257,7 +256,8 @@ class TestPlainTableFactory : public PlainTableFactory { Status NewTableReader(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, + uint64_t file_size, unique_ptr* table) const override { TableProperties* props = nullptr; auto s = ReadTableProperties(file.get(), file_size, kPlainTableMagicNumber, diff --git a/db/repair.cc b/db/repair.cc index 87788c5b5..a0322b683 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -81,6 +81,7 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "rocksdb/immutable_options.h" +#include "util/file_reader_writer.h" #include "util/scoped_arena_iterator.h" namespace rocksdb { @@ -236,6 
+237,8 @@ class Repairer { if (!status.ok()) { return status; } + unique_ptr lfile_reader( + new SequentialFileReader(std::move(lfile))); // Create the log reader. LogReporter reporter; @@ -246,8 +249,8 @@ class Repairer { // corruptions cause entire commits to be skipped instead of // propagating bad information (like overly large sequence // numbers). - log::Reader reader(std::move(lfile), &reporter, true /*enable checksum*/, - 0/*initial_offset*/); + log::Reader reader(std::move(lfile_reader), &reporter, + true /*enable checksum*/, 0 /*initial_offset*/); // Read all the records and add to a memtable std::string scratch; @@ -378,8 +381,8 @@ class Repairer { Status WriteDescriptor() { std::string tmp = TempFileName(dbname_, 1); unique_ptr file; - Status status = env_->NewWritableFile( - tmp, &file, env_->OptimizeForManifestWrite(env_options_)); + EnvOptions env_options = env_->OptimizeForManifestWrite(env_options_); + Status status = env_->NewWritableFile(tmp, &file, env_options); if (!status.ok()) { return status; } @@ -407,7 +410,9 @@ class Repairer { //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); { - log::Writer log(std::move(file)); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); + log::Writer log(std::move(file_writer)); std::string record; edit_->EncodeTo(&record); status = log.AddRecord(record); diff --git a/db/table_cache.cc b/db/table_cache.cc index f74cfb6f1..3e0fdc3eb 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -18,6 +18,7 @@ #include "table/table_reader.h" #include "table/get_context.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" @@ -99,8 +100,10 @@ Status TableCache::FindTable(const EnvOptions& env_options, file->Hint(RandomAccessFile::RANDOM); } StopWatch sw(ioptions_.env, ioptions_.statistics, TABLE_OPEN_IO_MICROS); + std::unique_ptr file_reader( + new RandomAccessFileReader(std::move(file))); s 
= ioptions_.table_factory->NewTableReader( - ioptions_, env_options, internal_comparator, std::move(file), + ioptions_, env_options, internal_comparator, std::move(file_reader), fd.GetFileSize(), &table_reader); } diff --git a/db/table_properties_collector_test.cc b/db/table_properties_collector_test.cc index 363b31bb9..34af49748 100644 --- a/db/table_properties_collector_test.cc +++ b/db/table_properties_collector_test.cc @@ -18,6 +18,7 @@ #include "table/plain_table_factory.h" #include "table/table_builder.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/testharness.h" #include "util/testutil.h" @@ -33,7 +34,7 @@ class TablePropertiesTest : public testing::Test, // TODO(kailiu) the following classes should be moved to some more general // places, so that other tests can also make use of them. -// `FakeWritableFile` and `FakeRandomeAccessFile` bypass the real file system +// `FakeWritableFileWriter* file system // and therefore enable us to quickly setup the tests. 
class FakeWritableFile : public WritableFile { public: @@ -96,9 +97,11 @@ void MakeBuilder(const Options& options, const ImmutableCFOptions& ioptions, const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - std::unique_ptr* writable, + std::unique_ptr* writable, std::unique_ptr* builder) { - writable->reset(new FakeWritableFile); + unique_ptr wf(new FakeWritableFile); + writable->reset(new WritableFileWriter(std::move(wf), EnvOptions())); + builder->reset(NewTableBuilder( ioptions, internal_comparator, int_tbl_prop_collector_factories, writable->get(), options.compression, options.compression_opts)); @@ -289,7 +292,7 @@ void TestCustomizedTablePropertiesCollector( // -- Step 1: build table std::unique_ptr builder; - std::unique_ptr writable; + std::unique_ptr writer; const ImmutableCFOptions ioptions(options); std::vector> int_tbl_prop_collector_factories; @@ -300,7 +303,7 @@ void TestCustomizedTablePropertiesCollector( GetIntTblPropCollectorFactory(options, &int_tbl_prop_collector_factories); } MakeBuilder(options, ioptions, internal_comparator, - &int_tbl_prop_collector_factories, &writable, &builder); + &int_tbl_prop_collector_factories, &writer, &builder); SequenceNumber seqNum = 0U; for (const auto& kv : kvs) { @@ -310,18 +313,17 @@ void TestCustomizedTablePropertiesCollector( builder->Add(ikey.Encode(), kv.second); } ASSERT_OK(builder->Finish()); + writer->Flush(); // -- Step 2: Read properties - FakeRandomeAccessFile readable(writable->contents()); + FakeWritableFile* fwf = + static_cast(writer->writable_file()); + std::unique_ptr fake_file_reader( + test::GetRandomAccessFileReader( + new FakeRandomeAccessFile(fwf->contents()))); TableProperties* props; - Status s = ReadTableProperties( - &readable, - writable->contents().size(), - magic_number, - Env::Default(), - nullptr, - &props - ); + Status s = ReadTableProperties(fake_file_reader.get(), fwf->contents().size(), + magic_number, Env::Default(), nullptr, 
&props); std::unique_ptr props_guard(props); ASSERT_OK(s); @@ -414,7 +416,7 @@ void TestInternalKeyPropertiesCollector( }; std::unique_ptr builder; - std::unique_ptr writable; + std::unique_ptr writable; Options options; test::PlainInternalKeyComparator pikc(options.comparator); @@ -449,12 +451,16 @@ void TestInternalKeyPropertiesCollector( } ASSERT_OK(builder->Finish()); + writable->Flush(); - FakeRandomeAccessFile readable(writable->contents()); + FakeWritableFile* fwf = + static_cast(writable->writable_file()); + unique_ptr reader(test::GetRandomAccessFileReader( + new FakeRandomeAccessFile(fwf->contents()))); TableProperties* props; Status s = - ReadTableProperties(&readable, writable->contents().size(), - magic_number, Env::Default(), nullptr, &props); + ReadTableProperties(reader.get(), fwf->contents().size(), magic_number, + Env::Default(), nullptr, &props); ASSERT_OK(s); std::unique_ptr props_guard(props); diff --git a/db/transaction_log_impl.cc b/db/transaction_log_impl.cc index b0bf6e4e9..23bd6672b 100644 --- a/db/transaction_log_impl.cc +++ b/db/transaction_log_impl.cc @@ -11,6 +11,7 @@ #include #include "db/transaction_log_impl.h" #include "db/write_batch_internal.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -40,23 +41,27 @@ TransactionLogIteratorImpl::TransactionLogIteratorImpl( } Status TransactionLogIteratorImpl::OpenLogFile( - const LogFile* logFile, - unique_ptr* file) { + const LogFile* logFile, unique_ptr* file_reader) { Env* env = options_->env; + unique_ptr file; + Status s; if (logFile->Type() == kArchivedLogFile) { std::string fname = ArchivedLogFileName(dir_, logFile->LogNumber()); - return env->NewSequentialFile(fname, file, soptions_); + s = env->NewSequentialFile(fname, &file, soptions_); } else { std::string fname = LogFileName(dir_, logFile->LogNumber()); - Status s = env->NewSequentialFile(fname, file, soptions_); + s = env->NewSequentialFile(fname, &file, soptions_); if (!s.ok()) { // If cannot open file in DB 
directory. // Try the archive dir, as it could have moved in the meanwhile. fname = ArchivedLogFileName(dir_, logFile->LogNumber()); - s = env->NewSequentialFile(fname, file, soptions_); + s = env->NewSequentialFile(fname, &file, soptions_); } - return s; } + if (s.ok()) { + file_reader->reset(new SequentialFileReader(std::move(file))); + } + return s; } BatchResult TransactionLogIteratorImpl::GetBatch() { @@ -251,7 +256,7 @@ void TransactionLogIteratorImpl::UpdateCurrentWriteBatch(const Slice& record) { } Status TransactionLogIteratorImpl::OpenLogReader(const LogFile* logFile) { - unique_ptr file; + unique_ptr file; Status s = OpenLogFile(logFile, &file); if (!s.ok()) { return s; diff --git a/db/transaction_log_impl.h b/db/transaction_log_impl.h index c730d0f61..f89cc3207 100644 --- a/db/transaction_log_impl.h +++ b/db/transaction_log_impl.h @@ -84,7 +84,8 @@ class TransactionLogIteratorImpl : public TransactionLogIterator { size_t currentFileIndex_; std::unique_ptr currentBatch_; unique_ptr currentLogReader_; - Status OpenLogFile(const LogFile* logFile, unique_ptr* file); + Status OpenLogFile(const LogFile* logFile, + unique_ptr* file); struct LogReporter : public log::Reader::Reporter { Env* env; diff --git a/db/version_set.cc b/db/version_set.cc index 72874492f..c3f090ba1 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -42,6 +42,7 @@ #include "table/meta_blocks.h" #include "table/get_context.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/logging.h" #include "util/stop_watch.h" #include "util/sync_point.h" @@ -567,10 +568,12 @@ Status Version::GetTableProperties(std::shared_ptr* tp, TableProperties* raw_table_properties; // By setting the magic number to kInvalidTableMagicNumber, we can by // pass the magic number check in the footer. 
+ std::unique_ptr file_reader( + new RandomAccessFileReader(std::move(file))); s = ReadTableProperties( - file.get(), file_meta->fd.GetFileSize(), - Footer::kInvalidTableMagicNumber /* table's magic number */, - vset_->env_, ioptions->info_log, &raw_table_properties); + file_reader.get(), file_meta->fd.GetFileSize(), + Footer::kInvalidTableMagicNumber /* table's magic number */, vset_->env_, + ioptions->info_log, &raw_table_properties); if (!s.ok()) { return s; } @@ -1912,13 +1915,17 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "Creating manifest %" PRIu64 "\n", pending_manifest_file_number_); unique_ptr descriptor_file; + EnvOptions opt_env_opts = env_->OptimizeForManifestWrite(env_options_); s = env_->NewWritableFile( DescriptorFileName(dbname_, pending_manifest_file_number_), - &descriptor_file, env_->OptimizeForManifestWrite(env_options_)); + &descriptor_file, opt_env_opts); if (s.ok()) { descriptor_file->SetPreallocationBlockSize( db_options_->manifest_preallocation_size); - descriptor_log_.reset(new log::Writer(std::move(descriptor_file))); + + unique_ptr file_writer( + new WritableFileWriter(std::move(descriptor_file), opt_env_opts)); + descriptor_log_.reset(new log::Writer(std::move(file_writer))); s = WriteSnapshot(descriptor_log_.get()); } } @@ -2132,11 +2139,16 @@ Status VersionSet::Recover( manifest_filename.c_str()); manifest_filename = dbname_ + "/" + manifest_filename; - unique_ptr manifest_file; - s = env_->NewSequentialFile(manifest_filename, &manifest_file, - env_options_); - if (!s.ok()) { - return s; + unique_ptr manifest_file_reader; + { + unique_ptr manifest_file; + s = env_->NewSequentialFile(manifest_filename, &manifest_file, + env_options_); + if (!s.ok()) { + return s; + } + manifest_file_reader.reset( + new SequentialFileReader(std::move(manifest_file))); } uint64_t current_manifest_file_size; s = env_->GetFileSize(manifest_filename, ¤t_manifest_file_size); @@ 
-2170,8 +2182,8 @@ Status VersionSet::Recover( { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(manifest_file), &reporter, true /*checksum*/, - 0 /*initial_offset*/); + log::Reader reader(std::move(manifest_file_reader), &reporter, + true /*checksum*/, 0 /*initial_offset*/); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -2405,18 +2417,23 @@ Status VersionSet::ListColumnFamilies(std::vector* column_families, current.resize(current.size() - 1); std::string dscname = dbname + "/" + current; + + unique_ptr file_reader; + { unique_ptr file; s = env->NewSequentialFile(dscname, &file, soptions); if (!s.ok()) { return s; } + file_reader.reset(new SequentialFileReader(std::move(file))); + } std::map column_family_names; // default column family is always implicitly there column_family_names.insert({0, kDefaultColumnFamilyName}); VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file), &reporter, true /*checksum*/, + log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, 0 /*initial_offset*/); Slice record; std::string scratch; @@ -2542,10 +2559,15 @@ Status VersionSet::ReduceNumberOfLevels(const std::string& dbname, Status VersionSet::DumpManifest(Options& options, std::string& dscname, bool verbose, bool hex, bool json) { // Open the specified manifest file. 
- unique_ptr file; - Status s = options.env->NewSequentialFile(dscname, &file, env_options_); - if (!s.ok()) { - return s; + unique_ptr file_reader; + Status s; + { + unique_ptr file; + s = options.env->NewSequentialFile(dscname, &file, env_options_); + if (!s.ok()) { + return s; + } + file_reader.reset(new SequentialFileReader(std::move(file))); } bool have_prev_log_number = false; @@ -2569,8 +2591,8 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, { VersionSet::LogReporter reporter; reporter.status = &s; - log::Reader reader(std::move(file), &reporter, true/*checksum*/, - 0/*initial_offset*/); + log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, + 0 /*initial_offset*/); Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { @@ -2664,7 +2686,7 @@ Status VersionSet::DumpManifest(Options& options, std::string& dscname, } } } - file.reset(); + file_reader.reset(); if (s.ok()) { if (!have_next_file) { @@ -2806,17 +2828,23 @@ bool VersionSet::ManifestContains(uint64_t manifest_file_num, std::string fname = DescriptorFileName(dbname_, manifest_file_num); Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, "ManifestContains: checking %s\n", fname.c_str()); - unique_ptr file; - Status s = env_->NewSequentialFile(fname, &file, env_options_); - if (!s.ok()) { - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: %s\n", s.ToString().c_str()); - Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, - "ManifestContains: is unable to reopen the manifest file %s", - fname.c_str()); - return false; + + unique_ptr file_reader; + Status s; + { + unique_ptr file; + s = env_->NewSequentialFile(fname, &file, env_options_); + if (!s.ok()) { + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "ManifestContains: %s\n", s.ToString().c_str()); + Log(InfoLogLevel::INFO_LEVEL, db_options_->info_log, + "ManifestContains: is unable to reopen the manifest file %s", + fname.c_str()); + 
return false; + } + file_reader.reset(new SequentialFileReader(std::move(file))); } - log::Reader reader(std::move(file), nullptr, true/*checksum*/, 0); + log::Reader reader(std::move(file_reader), nullptr, true /*checksum*/, 0); Slice r; std::string scratch; bool result = false; diff --git a/db/wal_manager.cc b/db/wal_manager.cc index 5651bae3a..dceaf6643 100644 --- a/db/wal_manager.cc +++ b/db/wal_manager.cc @@ -28,6 +28,7 @@ #include "rocksdb/options.h" #include "rocksdb/write_batch.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/logging.h" #include "util/mutexlock.h" #include "util/sync_point.h" @@ -430,6 +431,8 @@ Status WalManager::ReadFirstLine(const std::string& fname, std::unique_ptr file; Status status = env_->NewSequentialFile(fname, &file, env_options_); + unique_ptr file_reader( + new SequentialFileReader(std::move(file))); if (!status.ok()) { return status; @@ -441,7 +444,7 @@ Status WalManager::ReadFirstLine(const std::string& fname, reporter.fname = fname.c_str(); reporter.status = &status; reporter.ignore_error = !db_options_.paranoid_checks; - log::Reader reader(std::move(file), &reporter, true /*checksum*/, + log::Reader reader(std::move(file_reader), &reporter, true /*checksum*/, 0 /*initial_offset*/); std::string scratch; Slice record; diff --git a/db/wal_manager_test.cc b/db/wal_manager_test.cc index 325f0d94c..b72c377a8 100644 --- a/db/wal_manager_test.cc +++ b/db/wal_manager_test.cc @@ -14,6 +14,7 @@ #include "db/column_family.h" #include "db/version_set.h" #include "db/writebuffer.h" +#include "util/file_reader_writer.h" #include "util/mock_env.h" #include "util/string_util.h" #include "util/testharness.h" @@ -72,7 +73,9 @@ class WalManagerTest : public testing::Test { std::string fname = ArchivedLogFileName(dbname_, current_log_number_); unique_ptr file; ASSERT_OK(env_->NewWritableFile(fname, &file, env_options_)); - current_log_writer_.reset(new log::Writer(std::move(file))); + unique_ptr file_writer( + 
new WritableFileWriter(std::move(file), env_options_)); + current_log_writer_.reset(new log::Writer(std::move(file_writer))); } void CreateArchiveLogs(int num_logs, int entries_per_log) { @@ -120,7 +123,9 @@ TEST_F(WalManagerTest, ReadFirstRecordCache) { ASSERT_OK(wal_manager_->TEST_ReadFirstRecord(kAliveLogFile, 1, &s)); ASSERT_EQ(s, 0U); - log::Writer writer(std::move(file)); + unique_ptr file_writer( + new WritableFileWriter(std::move(file), EnvOptions())); + log::Writer writer(std::move(file_writer)); WriteBatch batch; batch.Put("foo", "bar"); WriteBatchInternal::SetSequence(&batch, 10); diff --git a/include/rocksdb/env.h b/include/rocksdb/env.h index 684448e2a..acdc66e48 100644 --- a/include/rocksdb/env.h +++ b/include/rocksdb/env.h @@ -465,6 +465,8 @@ class WritableFile { io_priority_ = pri; } + virtual Env::IOPriority GetIOPriority() { return io_priority_; } + /* * Get the size of valid data in the file. */ @@ -501,7 +503,14 @@ class WritableFile { return Status::NotSupported("InvalidateCache not supported."); } - protected: + // Sync a file range with disk. + // offset is the starting byte of the file range to be synchronized. + // nbytes specifies the length of the range to be synchronized. + // This asks the OS to initiate flushing the cached data to disk, + // without waiting for completion. + // Default implementation does nothing. + virtual Status RangeSync(off_t offset, off_t nbytes) { return Status::OK(); } + // PrepareWrite performs any necessary preparation for a write // before the write actually occurs. This allows for pre-allocation // of space on devices where it can result in less file @@ -526,6 +535,7 @@ class WritableFile { } } + protected: /* * Pre-allocate space for a file. */ @@ -533,16 +543,6 @@ class WritableFile { return Status::OK(); } - // Sync a file range with disk. - // offset is the starting byte of the file range to be synchronized. - // nbytes specifies the length of the range to be synchronized. 
- // This asks the OS to initiate flushing the cached data to disk, - // without waiting for completion. - // Default implementation does nothing. - virtual Status RangeSync(off_t offset, off_t nbytes) { - return Status::OK(); - } - size_t preallocation_block_size() { return preallocation_block_size_; } private: @@ -893,6 +893,7 @@ class WritableFileWrapper : public WritableFile { void SetIOPriority(Env::IOPriority pri) override { target_->SetIOPriority(pri); } + Env::IOPriority GetIOPriority() override { return target_->GetIOPriority(); } uint64_t GetFileSize() override { return target_->GetFileSize(); } void GetPreallocationStatus(size_t* block_size, size_t* last_allocated_block) override { diff --git a/include/rocksdb/table.h b/include/rocksdb/table.h index b84363a94..f3a22dc31 100644 --- a/include/rocksdb/table.h +++ b/include/rocksdb/table.h @@ -34,7 +34,7 @@ class RandomAccessFile; struct TableBuilderOptions; class TableBuilder; class TableReader; -class WritableFile; +class WritableFileWriter; struct EnvOptions; struct Options; @@ -315,6 +315,8 @@ extern TableFactory* NewCuckooTableFactory( #endif // ROCKSDB_LITE +class RandomAccessFileReader; + // A base class for table factories. class TableFactory { public: @@ -348,7 +350,7 @@ class TableFactory { virtual Status NewTableReader( const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const = 0; // Return a table builder to write to a file for this table type. @@ -372,7 +374,7 @@ class TableFactory { // to use in this table. virtual TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const = 0; + WritableFileWriter* file) const = 0; // Sanitizes the specified DB Options and ColumnFamilyOptions. 
// diff --git a/port/port_posix.cc b/port/port_posix.cc index a8cffcc7e..0e1cb4db1 100644 --- a/port/port_posix.cc +++ b/port/port_posix.cc @@ -9,11 +9,13 @@ #include "port/port_posix.h" -#include #include #include -#include +#include +#include #include +#include +#include #include #include "util/logging.h" @@ -133,5 +135,11 @@ void InitOnce(OnceType* once, void (*initializer)()) { PthreadCall("once", pthread_once(once, initializer)); } +void Crash(const std::string& srcfile, int srcline) { + fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); + fflush(stdout); + kill(getpid(), SIGTERM); +} + } // namespace port } // namespace rocksdb diff --git a/port/port_posix.h b/port/port_posix.h index ff8d4af4b..0dd824a5d 100644 --- a/port/port_posix.h +++ b/port/port_posix.h @@ -53,7 +53,7 @@ #if defined(OS_MACOSX) || defined(OS_SOLARIS) || defined(OS_FREEBSD) ||\ defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLYBSD) ||\ - defined(OS_ANDROID) || defined(OS_CYGWIN) + defined(OS_ANDROID) || defined(CYGWIN) // Use fread/fwrite/fflush on platforms without _unlocked variants #define fread_unlocked fread #define fwrite_unlocked fwrite @@ -150,6 +150,7 @@ extern void InitOnce(OnceType* once, void (*initializer)()); #define PREFETCH(addr, rw, locality) __builtin_prefetch(addr, rw, locality) +extern void Crash(const std::string& srcfile, int srcline); } // namespace port } // namespace rocksdb diff --git a/port/win/env_win.cc b/port/win/env_win.cc index 31ce7b5bf..ce9dcd4e0 100644 --- a/port/win/env_win.cc +++ b/port/win/env_win.cc @@ -29,6 +29,7 @@ #include "util/random.h" #include "util/iostats_context_imp.h" #include "util/rate_limiter.h" +#include "util/sync_point.h" #include "util/thread_status_updater.h" #include "util/thread_status_util.h" @@ -36,10 +37,6 @@ #include // For UUID generation #include -// This is only set from db_stress.cc and for testing only. 
-// If non-zero, kill at various points in source code with probability 1/this -int rocksdb_kill_odds = 0; - namespace rocksdb { std::string GetWindowsErrSz(DWORD err) { @@ -90,40 +87,6 @@ inline void PrintThreadInfo(size_t thread_id, size_t terminatingId) { // returns the ID of the current process inline int current_process_id() { return _getpid(); } -#ifdef NDEBUG -// empty in release build -#define TEST_KILL_RANDOM(rocksdb_kill_odds) -#else - -// Kill the process with probablity 1/odds for testing. -void TestKillRandom(int odds, const std::string& srcfile, int srcline) { - time_t curtime = time(nullptr); - Random r((uint32_t)curtime); - - assert(odds > 0); - bool crash = r.OneIn(odds); - if (crash) { - fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); - fflush(stdout); - std::string* p_str = nullptr; - p_str->c_str(); - } -} - -// To avoid crashing always at some frequently executed codepaths (during -// kill random test), use this factor to reduce odds -#define REDUCE_ODDS 2 -#define REDUCE_ODDS2 4 - -#define TEST_KILL_RANDOM(rocksdb_kill_odds) \ - { \ - if (rocksdb_kill_odds > 0) { \ - TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__); \ - } \ - } - -#endif - // RAII helpers for HANDLEs const auto CloseHandleFunc = [](HANDLE h) { ::CloseHandle(h); }; typedef std::unique_ptr UniqueCloseHandlePtr; diff --git a/src.mk b/src.mk index ae7a932bb..62b4ef721 100644 --- a/src.mk +++ b/src.mk @@ -88,6 +88,7 @@ LIB_SOURCES = \ util/env_hdfs.cc \ util/env_posix.cc \ util/file_util.cc \ + util/file_reader_writer.cc \ util/filter_policy.cc \ util/hash.cc \ util/hash_cuckoo_rep.cc \ diff --git a/table/adaptive_table_factory.cc b/table/adaptive_table_factory.cc index e8831f757..f642fa7bd 100644 --- a/table/adaptive_table_factory.cc +++ b/table/adaptive_table_factory.cc @@ -41,8 +41,9 @@ extern const uint64_t kCuckooTableMagicNumber; Status AdaptiveTableFactory::NewTableReader( const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - const 
InternalKeyComparator& icomp, unique_ptr&& file, - uint64_t file_size, unique_ptr* table) const { + const InternalKeyComparator& icomp, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const { Footer footer; auto s = ReadFooterFromFile(file.get(), file_size, &footer); if (!s.ok()) { @@ -66,7 +67,7 @@ Status AdaptiveTableFactory::NewTableReader( TableBuilder* AdaptiveTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const { + WritableFileWriter* file) const { return table_factory_to_write_->NewTableBuilder(table_builder_options, file); } diff --git a/table/adaptive_table_factory.h b/table/adaptive_table_factory.h index aa0f82708..47a17de2c 100644 --- a/table/adaptive_table_factory.h +++ b/table/adaptive_table_factory.h @@ -33,15 +33,16 @@ class AdaptiveTableFactory : public TableFactory { const char* Name() const override { return "AdaptiveTableFactory"; } - Status NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const override; + WritableFileWriter* file) const override; // Sanitizes the specified DB Options. 
Status SanitizeOptions(const DBOptions& db_opts, diff --git a/table/block_based_table_builder.cc b/table/block_based_table_builder.cc index 588f2475b..d0563b133 100644 --- a/table/block_based_table_builder.cc +++ b/table/block_based_table_builder.cc @@ -437,7 +437,7 @@ struct BlockBasedTableBuilder::Rep { const ImmutableCFOptions ioptions; const BlockBasedTableOptions table_options; const InternalKeyComparator& internal_comparator; - WritableFile* file; + WritableFileWriter* file; uint64_t offset = 0; Status status; BlockBuilder data_block; @@ -467,7 +467,7 @@ struct BlockBasedTableBuilder::Rep { const InternalKeyComparator& icomparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* f, const CompressionType _compression_type, + WritableFileWriter* f, const CompressionType _compression_type, const CompressionOptions& _compression_opts, const bool skip_filters) : ioptions(_ioptions), table_options(table_opt), @@ -502,7 +502,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, const CompressionType compression_type, + WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, const bool skip_filters) { BlockBasedTableOptions sanitized_table_options(table_options); if (sanitized_table_options.format_version == 0 && @@ -524,7 +524,7 @@ BlockBasedTableBuilder::BlockBasedTableBuilder( } if (table_options.block_cache_compressed.get() != nullptr) { BlockBasedTable::GenerateCachePrefix( - table_options.block_cache_compressed.get(), file, + table_options.block_cache_compressed.get(), file->writable_file(), &rep_->compressed_cache_key_prefix[0], &rep_->compressed_cache_key_prefix_size); } diff --git a/table/block_based_table_builder.h b/table/block_based_table_builder.h index 97f31277c..ce868207a 100644 --- a/table/block_based_table_builder.h +++ 
b/table/block_based_table_builder.h @@ -40,7 +40,7 @@ class BlockBasedTableBuilder : public TableBuilder { const InternalKeyComparator& internal_comparator, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, const CompressionType compression_type, + WritableFileWriter* file, const CompressionType compression_type, const CompressionOptions& compression_opts, const bool skip_filters); // REQUIRES: Either Finish() or Abandon() has been called. diff --git a/table/block_based_table_factory.cc b/table/block_based_table_factory.cc index 6c1e0a21c..7f56d72d9 100644 --- a/table/block_based_table_factory.cc +++ b/table/block_based_table_factory.cc @@ -44,7 +44,7 @@ BlockBasedTableFactory::BlockBasedTableFactory( Status BlockBasedTableFactory::NewTableReader( const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, const bool prefetch_enabled) const { return BlockBasedTable::Open(ioptions, soptions, table_options_, internal_comparator, std::move(file), file_size, @@ -53,7 +53,7 @@ Status BlockBasedTableFactory::NewTableReader( TableBuilder* BlockBasedTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const { + WritableFileWriter* file) const { auto table_builder = new BlockBasedTableBuilder( table_builder_options.ioptions, table_options_, table_builder_options.internal_comparator, diff --git a/table/block_based_table_factory.h b/table/block_based_table_factory.h index 639492659..fc6cdc55a 100644 --- a/table/block_based_table_factory.h +++ b/table/block_based_table_factory.h @@ -36,7 +36,8 @@ class BlockBasedTableFactory : public TableFactory { Status NewTableReader(const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, 
+ uint64_t file_size, unique_ptr* table_reader) const override { return NewTableReader(ioptions, soptions, internal_comparator, std::move(file), file_size, table_reader, @@ -48,13 +49,14 @@ class BlockBasedTableFactory : public TableFactory { Status NewTableReader(const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, + uint64_t file_size, unique_ptr* table_reader, bool prefetch_index_and_filter) const; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const override; + WritableFileWriter* file) const override; // Sanitizes the specified DB Options. Status SanitizeOptions(const DBOptions& db_opts, diff --git a/table/block_based_table_reader.cc b/table/block_based_table_reader.cc index 79c7d0edd..bc9efd68a 100644 --- a/table/block_based_table_reader.cc +++ b/table/block_based_table_reader.cc @@ -37,6 +37,7 @@ #include "table/get_context.h" #include "util/coding.h" +#include "util/file_reader_writer.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" #include "util/string_util.h" @@ -62,7 +63,7 @@ const size_t kMaxCacheKeyPrefixSize __attribute__((unused)) = // The only relevant option is options.verify_checksums for now. // On failure return non-OK. // On success fill *result and return OK - caller owns *result -Status ReadBlockFromFile(RandomAccessFile* file, const Footer& footer, +Status ReadBlockFromFile(RandomAccessFileReader* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, std::unique_ptr* result, Env* env, bool do_uncompress = true) { @@ -167,7 +168,7 @@ class BinarySearchIndexReader : public IndexReader { // `BinarySearchIndexReader`. // On success, index_reader will be populated; otherwise it will remain // unmodified. 
- static Status Create(RandomAccessFile* file, const Footer& footer, + static Status Create(RandomAccessFileReader* file, const Footer& footer, const BlockHandle& index_handle, Env* env, const Comparator* comparator, IndexReader** index_reader) { @@ -212,8 +213,8 @@ class BinarySearchIndexReader : public IndexReader { class HashIndexReader : public IndexReader { public: static Status Create(const SliceTransform* hash_key_extractor, - const Footer& footer, RandomAccessFile* file, Env* env, - const Comparator* comparator, + const Footer& footer, RandomAccessFileReader* file, + Env* env, const Comparator* comparator, const BlockHandle& index_handle, Iterator* meta_index_iter, IndexReader** index_reader, bool hash_index_allow_collision) { @@ -347,7 +348,7 @@ struct BlockBasedTable::Rep { const FilterPolicy* const filter_policy; const InternalKeyComparator& internal_comparator; Status status; - unique_ptr file; + unique_ptr file; char cache_key_prefix[kMaxCacheKeyPrefixSize]; size_t cache_key_prefix_size = 0; char compressed_cache_key_prefix[kMaxCacheKeyPrefixSize]; @@ -405,13 +406,12 @@ void BlockBasedTable::SetupCacheKeyPrefix(Rep* rep) { rep->cache_key_prefix_size = 0; rep->compressed_cache_key_prefix_size = 0; if (rep->table_options.block_cache != nullptr) { - GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file.get(), - &rep->cache_key_prefix[0], - &rep->cache_key_prefix_size); + GenerateCachePrefix(rep->table_options.block_cache.get(), rep->file->file(), + &rep->cache_key_prefix[0], &rep->cache_key_prefix_size); } if (rep->table_options.block_cache_compressed != nullptr) { GenerateCachePrefix(rep->table_options.block_cache_compressed.get(), - rep->file.get(), &rep->compressed_cache_key_prefix[0], + rep->file->file(), &rep->compressed_cache_key_prefix[0], &rep->compressed_cache_key_prefix_size); } } @@ -469,7 +469,7 @@ Status BlockBasedTable::Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const BlockBasedTableOptions& 
table_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, const bool prefetch_index_and_filter) { diff --git a/table/block_based_table_reader.h b/table/block_based_table_reader.h index 727a0d632..6b5a63a23 100644 --- a/table/block_based_table_reader.h +++ b/table/block_based_table_reader.h @@ -21,6 +21,7 @@ #include "table/table_reader.h" #include "table/table_properties_internal.h" #include "util/coding.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -69,8 +70,8 @@ class BlockBasedTable : public TableReader { const EnvOptions& env_options, const BlockBasedTableOptions& table_options, const InternalKeyComparator& internal_key_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader, + unique_ptr&& file, + uint64_t file_size, unique_ptr* table_reader, bool prefetch_index_and_filter = true); bool PrefixMayMatch(const Slice& internal_key); diff --git a/table/cuckoo_table_builder.cc b/table/cuckoo_table_builder.cc index 1aa1e0707..946a8b5fb 100644 --- a/table/cuckoo_table_builder.cc +++ b/table/cuckoo_table_builder.cc @@ -20,6 +20,7 @@ #include "table/format.h" #include "table/meta_blocks.h" #include "util/autovector.h" +#include "util/file_reader_writer.h" #include "util/random.h" #include "util/string_util.h" @@ -47,7 +48,7 @@ const std::string CuckooTablePropertyNames::kUserKeyLength = extern const uint64_t kCuckooTableMagicNumber = 0x926789d0c5f17873ull; CuckooTableBuilder::CuckooTableBuilder( - WritableFile* file, double max_hash_table_ratio, + WritableFileWriter* file, double max_hash_table_ratio, uint32_t max_num_hash_table, uint32_t max_search_depth, const Comparator* user_comparator, uint32_t cuckoo_block_size, bool use_module_hash, bool identity_as_first_hash, diff --git a/table/cuckoo_table_builder.h b/table/cuckoo_table_builder.h index 913d50503..093e1c245 100644 --- a/table/cuckoo_table_builder.h +++ 
b/table/cuckoo_table_builder.h @@ -21,12 +21,13 @@ namespace rocksdb { class CuckooTableBuilder: public TableBuilder { public: - CuckooTableBuilder( - WritableFile* file, double max_hash_table_ratio, - uint32_t max_num_hash_func, uint32_t max_search_depth, - const Comparator* user_comparator, uint32_t cuckoo_block_size, - bool use_module_hash, bool identity_as_first_hash, - uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); + CuckooTableBuilder(WritableFileWriter* file, double max_hash_table_ratio, + uint32_t max_num_hash_func, uint32_t max_search_depth, + const Comparator* user_comparator, + uint32_t cuckoo_block_size, bool use_module_hash, + bool identity_as_first_hash, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, + uint64_t)); // REQUIRES: Either Finish() or Abandon() has been called. ~CuckooTableBuilder() {} @@ -82,7 +83,7 @@ class CuckooTableBuilder: public TableBuilder { inline Slice GetValue(uint64_t idx) const; uint32_t num_hash_func_; - WritableFile* file_; + WritableFileWriter* file_; const double max_hash_table_ratio_; const uint32_t max_num_hash_func_; const uint32_t max_search_depth_; diff --git a/table/cuckoo_table_builder_test.cc b/table/cuckoo_table_builder_test.cc index bd9283659..69a8245e6 100644 --- a/table/cuckoo_table_builder_test.cc +++ b/table/cuckoo_table_builder_test.cc @@ -10,6 +10,7 @@ #include "table/meta_blocks.h" #include "table/cuckoo_table_builder.h" +#include "util/file_reader_writer.h" #include "util/testharness.h" #include "util/testutil.h" @@ -48,8 +49,11 @@ class CuckooBuilderTest : public testing::Test { // Assert Table Properties. TableProperties* props = nullptr; - ASSERT_OK(ReadTableProperties(read_file.get(), read_file_size, - kCuckooTableMagicNumber, env_, nullptr, &props)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); + ASSERT_OK(ReadTableProperties(file_reader.get(), read_file_size, + kCuckooTableMagicNumber, env_, nullptr, + &props)); // Check unused bucket. 
std::string unused_key = props->user_collected_properties[ CuckooTablePropertyNames::kEmptyKey]; @@ -90,8 +94,8 @@ class CuckooBuilderTest : public testing::Test { size_t bucket_size = expected_unused_bucket.size(); for (uint32_t i = 0; i < table_size + cuckoo_block_size - 1; ++i) { Slice read_slice; - ASSERT_OK(read_file->Read(i*bucket_size, bucket_size, - &read_slice, nullptr)); + ASSERT_OK(file_reader->Read(i * bucket_size, bucket_size, &read_slice, + nullptr)); size_t key_idx = std::find(expected_locations.begin(), expected_locations.end(), i) - expected_locations.begin(); @@ -104,7 +108,7 @@ class CuckooBuilderTest : public testing::Test { } } for (auto key_found : keys_found) { - // Check that all keys were found. + // Check that all keys were found. ASSERT_TRUE(key_found); } } @@ -133,12 +137,15 @@ TEST_F(CuckooBuilderTest, SuccessWithEmptyFile) { unique_ptr writable_file; fname = test::TmpDir() + "/EmptyFile"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - 4, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, 4, 100, + BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); ASSERT_EQ(0UL, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); CheckFileContents({}, {}, {}, "", 2, 2, false); } @@ -165,8 +172,8 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new 
WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -176,7 +186,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -209,8 +219,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -220,7 +233,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionFullKey) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -254,9 +267,11 @@ TEST_F(CuckooBuilderTest, 
WriteSuccessWithCollisionAndCuckooBlock) { uint32_t cuckoo_block_size = 2; fname = test::TmpDir() + "/WithCollisionFullKey2"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), cuckoo_block_size, - false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), cuckoo_block_size, + false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -266,7 +281,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionAndCuckooBlock) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -304,8 +319,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -315,7 +333,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKey) { size_t 
bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -350,8 +368,11 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathFullKeyAndCuckooBlock"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 2, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 2, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(keys[i]), Slice(values[i])); @@ -361,7 +382,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathFullKeyAndCuckooBlock) { size_t bucket_size = keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = GetInternalKey("key00", true); @@ -389,8 +410,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { unique_ptr writable_file; fname = test::TmpDir() + "/NoCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new 
WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -400,7 +424,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessNoCollisionUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -429,8 +453,11 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -440,7 +467,7 @@ TEST_F(CuckooBuilderTest, WriteSuccessWithCollisionUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -471,8 +498,11 @@ 
TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 2, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], true)), Slice(values[i])); @@ -482,7 +512,7 @@ TEST_F(CuckooBuilderTest, WithCollisionPathUserKey) { size_t bucket_size = user_keys[0].size() + values[0].size(); ASSERT_EQ(expected_table_size * bucket_size - 1, builder.FileSize()); ASSERT_OK(builder.Finish()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); ASSERT_LE(expected_table_size * bucket_size, builder.FileSize()); std::string expected_unused_bucket = "key00"; @@ -512,8 +542,11 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { unique_ptr writable_file; fname = test::TmpDir() + "/WithCollisionPathUserKey"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 2, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 2, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t i = 0; i < user_keys.size(); i++) { builder.Add(Slice(GetInternalKey(user_keys[i], false)), Slice("value")); @@ -521,7 +554,7 @@ TEST_F(CuckooBuilderTest, FailWhenCollisionPathTooLong) { 
ASSERT_OK(builder.status()); } ASSERT_TRUE(builder.Finish().IsNotSupported()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); } TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { @@ -536,8 +569,11 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { unique_ptr writable_file; fname = test::TmpDir() + "/FailWhenSameKeyInserted"; ASSERT_OK(env_->NewWritableFile(fname, &writable_file, env_options_)); - CuckooTableBuilder builder(writable_file.get(), kHashTableRatio, - num_hash_fun, 100, BytewiseComparator(), 1, false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), EnvOptions())); + CuckooTableBuilder builder(file_writer.get(), kHashTableRatio, num_hash_fun, + 100, BytewiseComparator(), 1, false, false, + GetSliceHash); ASSERT_OK(builder.status()); builder.Add(Slice(GetInternalKey(user_key, false)), Slice("value1")); @@ -548,7 +584,7 @@ TEST_F(CuckooBuilderTest, FailWhenSameKeyInserted) { ASSERT_OK(builder.status()); ASSERT_TRUE(builder.Finish().IsNotSupported()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); } } // namespace rocksdb diff --git a/table/cuckoo_table_factory.cc b/table/cuckoo_table_factory.cc index 17aa1d78e..1899f6c28 100644 --- a/table/cuckoo_table_factory.cc +++ b/table/cuckoo_table_factory.cc @@ -12,9 +12,10 @@ namespace rocksdb { -Status CuckooTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, const InternalKeyComparator& icomp, - std::unique_ptr&& file, uint64_t file_size, +Status CuckooTableFactory::NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& icomp, + std::unique_ptr&& file, uint64_t file_size, std::unique_ptr* table) const { std::unique_ptr new_reader(new CuckooTableReader(ioptions, std::move(file), file_size, icomp.user_comparator(), nullptr)); @@ -27,7 +28,7 @@ Status CuckooTableFactory::NewTableReader(const 
ImmutableCFOptions& ioptions, TableBuilder* CuckooTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const { + WritableFileWriter* file) const { // Ignore the skipFIlters flag. Does not apply to this file format // diff --git a/table/cuckoo_table_factory.h b/table/cuckoo_table_factory.h index c78fe6931..9f2a67765 100644 --- a/table/cuckoo_table_factory.h +++ b/table/cuckoo_table_factory.h @@ -55,15 +55,16 @@ class CuckooTableFactory : public TableFactory { const char* Name() const override { return "CuckooTable"; } - Status NewTableReader( - const ImmutableCFOptions& ioptions, const EnvOptions& env_options, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; + Status NewTableReader(const ImmutableCFOptions& ioptions, + const EnvOptions& env_options, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const override; + WritableFileWriter* file) const override; // Sanitizes the specified DB Options. 
Status SanitizeOptions(const DBOptions& db_opts, diff --git a/table/cuckoo_table_reader.cc b/table/cuckoo_table_reader.cc index 7f017ec7c..51ffc6ffa 100644 --- a/table/cuckoo_table_reader.cc +++ b/table/cuckoo_table_reader.cc @@ -33,8 +33,7 @@ extern const uint64_t kCuckooTableMagicNumber; CuckooTableReader::CuckooTableReader( const ImmutableCFOptions& ioptions, - std::unique_ptr&& file, - uint64_t file_size, + std::unique_ptr&& file, uint64_t file_size, const Comparator* comparator, uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) : file_(std::move(file)), diff --git a/table/cuckoo_table_reader.h b/table/cuckoo_table_reader.h index 4f00a9e41..6643be025 100644 --- a/table/cuckoo_table_reader.h +++ b/table/cuckoo_table_reader.h @@ -18,6 +18,7 @@ #include "rocksdb/env.h" #include "rocksdb/options.h" #include "table/table_reader.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -26,12 +27,11 @@ class TableReader; class CuckooTableReader: public TableReader { public: - CuckooTableReader( - const ImmutableCFOptions& ioptions, - std::unique_ptr&& file, - uint64_t file_size, - const Comparator* user_comparator, - uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)); + CuckooTableReader(const ImmutableCFOptions& ioptions, + std::unique_ptr&& file, + uint64_t file_size, const Comparator* user_comparator, + uint64_t (*get_slice_hash)(const Slice&, uint32_t, + uint64_t)); ~CuckooTableReader() {} std::shared_ptr GetTableProperties() const override { @@ -57,7 +57,7 @@ class CuckooTableReader: public TableReader { private: friend class CuckooTableIterator; void LoadAllKeys(std::vector>* key_to_bucket_id); - std::unique_ptr file_; + std::unique_ptr file_; Slice file_data_; bool is_last_level_; bool identity_as_first_hash_; diff --git a/table/cuckoo_table_reader_test.cc b/table/cuckoo_table_reader_test.cc index 660261ab3..1f5b4de0d 100644 --- a/table/cuckoo_table_reader_test.cc +++ b/table/cuckoo_table_reader_test.cc @@ -62,7 +62,6 @@ uint64_t 
GetSliceHash(const Slice& s, uint32_t index, uint64_t max_num_buckets) { return hash_map[s.ToString()][index]; } - } // namespace class CuckooReaderTest : public testing::Test { @@ -94,9 +93,11 @@ class CuckooReaderTest : public testing::Test { const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - CuckooTableBuilder builder( - writable_file.get(), 0.9, kNumHashFunc, 100, ucomp, 2, - false, false, GetSliceHash); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), env_options)); + + CuckooTableBuilder builder(file_writer.get(), 0.9, kNumHashFunc, 100, ucomp, + 2, false, false, GetSliceHash); ASSERT_OK(builder.status()); for (uint32_t key_idx = 0; key_idx < num_items; ++key_idx) { builder.Add(Slice(keys[key_idx]), Slice(values[key_idx])); @@ -106,18 +107,16 @@ class CuckooReaderTest : public testing::Test { ASSERT_OK(builder.Finish()); ASSERT_EQ(num_items, builder.NumEntries()); file_size = builder.FileSize(); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); // Check reader now. 
std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); const ImmutableCFOptions ioptions(options); - CuckooTableReader reader( - ioptions, - std::move(read_file), - file_size, - ucomp, - GetSliceHash); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, + GetSliceHash); ASSERT_OK(reader.status()); // Assume no merge/deletion for (uint32_t i = 0; i < num_items; ++i) { @@ -141,13 +140,11 @@ class CuckooReaderTest : public testing::Test { void CheckIterator(const Comparator* ucomp = BytewiseComparator()) { std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); const ImmutableCFOptions ioptions(options); - CuckooTableReader reader( - ioptions, - std::move(read_file), - file_size, - ucomp, - GetSliceHash); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucomp, + GetSliceHash); ASSERT_OK(reader.status()); Iterator* it = reader.NewIterator(ReadOptions(), nullptr); ASSERT_OK(it->status()); @@ -321,13 +318,11 @@ TEST_F(CuckooReaderTest, WhenKeyNotFound) { CreateCuckooFileAndCheckReader(); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); const ImmutableCFOptions ioptions(options); - CuckooTableReader reader( - ioptions, - std::move(read_file), - file_size, - ucmp, - GetSliceHash); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, ucmp, + GetSliceHash); ASSERT_OK(reader.status()); // Search for a key with colliding hash values. 
std::string not_found_user_key = "key" + NumToStr(num_items); @@ -406,10 +401,11 @@ void WriteFile(const std::vector& keys, std::unique_ptr writable_file; ASSERT_OK(env->NewWritableFile(fname, &writable_file, env_options)); - CuckooTableBuilder builder( - writable_file.get(), hash_ratio, - 64, 1000, test::Uint64Comparator(), 5, - false, FLAGS_identity_as_first_hash, nullptr); + unique_ptr file_writer( + new WritableFileWriter(std::move(writable_file), env_options)); + CuckooTableBuilder builder(file_writer.get(), hash_ratio, 64, 1000, + test::Uint64Comparator(), 5, false, + FLAGS_identity_as_first_hash, nullptr); ASSERT_OK(builder.status()); for (uint64_t key_idx = 0; key_idx < num; ++key_idx) { // Value is just a part of key. @@ -419,17 +415,18 @@ void WriteFile(const std::vector& keys, } ASSERT_OK(builder.Finish()); ASSERT_EQ(num, builder.NumEntries()); - ASSERT_OK(writable_file->Close()); + ASSERT_OK(file_writer->Close()); uint64_t file_size; env->GetFileSize(fname, &file_size); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); const ImmutableCFOptions ioptions(options); - CuckooTableReader reader( - ioptions, std::move(read_file), file_size, - test::Uint64Comparator(), nullptr); + CuckooTableReader reader(ioptions, std::move(file_reader), file_size, + test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); ReadOptions r_options; std::string value; @@ -455,11 +452,12 @@ void ReadKeys(uint64_t num, uint32_t batch_size) { env->GetFileSize(fname, &file_size); std::unique_ptr read_file; ASSERT_OK(env->NewRandomAccessFile(fname, &read_file, env_options)); + unique_ptr file_reader( + new RandomAccessFileReader(std::move(read_file))); const ImmutableCFOptions ioptions(options); - CuckooTableReader reader( - ioptions, std::move(read_file), file_size, test::Uint64Comparator(), - nullptr); + CuckooTableReader reader(ioptions, 
std::move(file_reader), file_size, + test::Uint64Comparator(), nullptr); ASSERT_OK(reader.status()); const UserCollectedProperties user_props = reader.GetTableProperties()->user_collected_properties; diff --git a/table/format.cc b/table/format.cc index ccc345f8e..956c5c47c 100644 --- a/table/format.cc +++ b/table/format.cc @@ -17,6 +17,7 @@ #include "util/coding.h" #include "util/compression.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" #include "util/perf_context_imp.h" #include "util/string_util.h" #include "util/xxhash.h" @@ -210,7 +211,7 @@ std::string Footer::ToString() const { return result; } -Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, +Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number) { if (file_size < Footer::kMinEncodedLength) { return Status::Corruption("file is too short to be an sstable"); @@ -249,9 +250,9 @@ namespace { // Read a block and check its CRC // contents is the result of reading. 
// According to the implementation of file->Read, contents may not point to buf -Status ReadBlock(RandomAccessFile* file, const Footer& footer, - const ReadOptions& options, const BlockHandle& handle, - Slice* contents, /* result of reading */ char* buf) { +Status ReadBlock(RandomAccessFileReader* file, const Footer& footer, + const ReadOptions& options, const BlockHandle& handle, + Slice* contents, /* result of reading */ char* buf) { size_t n = static_cast(handle.size()); Status s; @@ -299,7 +300,7 @@ Status ReadBlock(RandomAccessFile* file, const Footer& footer, } // namespace -Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, +Status ReadBlockContents(RandomAccessFileReader* file, const Footer& footer, const ReadOptions& options, const BlockHandle& handle, BlockContents* contents, Env* env, bool decompression_requested) { diff --git a/table/format.h b/table/format.h index 5f1a3dc96..74ec808c6 100644 --- a/table/format.h +++ b/table/format.h @@ -166,7 +166,7 @@ class Footer { // Read the footer from file // If enforce_table_magic_number != 0, ReadFooterFromFile() will return // corruption if table_magic number is not equal to enforce_table_magic_number -Status ReadFooterFromFile(RandomAccessFile* file, uint64_t file_size, +Status ReadFooterFromFile(RandomAccessFileReader* file, uint64_t file_size, Footer* footer, uint64_t enforce_table_magic_number = 0); @@ -205,7 +205,8 @@ struct BlockContents { // Read the block identified by "handle" from "file". On failure // return non-OK. On success fill *result and return OK. 
-extern Status ReadBlockContents(RandomAccessFile* file, const Footer& footer, +extern Status ReadBlockContents(RandomAccessFileReader* file, + const Footer& footer, const ReadOptions& options, const BlockHandle& handle, BlockContents* contents, Env* env, diff --git a/table/meta_blocks.cc b/table/meta_blocks.cc index 6fad80825..ad04b670f 100644 --- a/table/meta_blocks.cc +++ b/table/meta_blocks.cc @@ -129,9 +129,9 @@ bool NotifyCollectTableCollectorsOnFinish( return all_succeeded; } -Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, - const Footer &footer, Env *env, Logger *logger, - TableProperties **table_properties) { +Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, + const Footer& footer, Env* env, Logger* logger, + TableProperties** table_properties) { assert(table_properties); Slice v = handle_value; @@ -217,7 +217,7 @@ Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, return s; } -Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, +Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, Logger* info_log, TableProperties** properties) { // -- Read metaindex block @@ -271,7 +271,7 @@ Status FindMetaBlock(Iterator* meta_index_iter, } } -Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, +Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockHandle* block_handle) { @@ -298,7 +298,7 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, return FindMetaBlock(meta_iter.get(), meta_block_name, block_handle); } -Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, +Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockContents* contents) { diff --git a/table/meta_blocks.h 
b/table/meta_blocks.h index 7ac3cb063..167ba5900 100644 --- a/table/meta_blocks.h +++ b/table/meta_blocks.h @@ -107,26 +107,25 @@ bool NotifyCollectTableCollectorsOnFinish( // @returns a status to indicate if the operation succeeded. On success, // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. -Status ReadProperties(const Slice &handle_value, RandomAccessFile *file, - const Footer &footer, Env *env, Logger *logger, - TableProperties **table_properties); +Status ReadProperties(const Slice& handle_value, RandomAccessFileReader* file, + const Footer& footer, Env* env, Logger* logger, + TableProperties** table_properties); // Directly read the properties from the properties block of a plain table. // @returns a status to indicate if the operation succeeded. On success, // *table_properties will point to a heap-allocated TableProperties // object, otherwise value of `table_properties` will not be modified. -Status ReadTableProperties(RandomAccessFile* file, uint64_t file_size, +Status ReadTableProperties(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, Logger* info_log, TableProperties** properties); - // Find the meta block from the meta index block. Status FindMetaBlock(Iterator* meta_index_iter, const std::string& meta_block_name, BlockHandle* block_handle); // Find the meta block -Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, +Status FindMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockHandle* block_handle); @@ -134,7 +133,7 @@ Status FindMetaBlock(RandomAccessFile* file, uint64_t file_size, // Read the specified meta block with name meta_block_name // from `file` and initialize `contents` with contents of this block. // Return Status::OK in case of success. 
-Status ReadMetaBlock(RandomAccessFile* file, uint64_t file_size, +Status ReadMetaBlock(RandomAccessFileReader* file, uint64_t file_size, uint64_t table_magic_number, Env* env, const std::string& meta_block_name, BlockContents* contents); diff --git a/table/mock_table.cc b/table/mock_table.cc index 90e2079dd..925b75cd7 100644 --- a/table/mock_table.cc +++ b/table/mock_table.cc @@ -11,6 +11,7 @@ #include "db/dbformat.h" #include "port/port.h" #include "util/coding.h" +#include "util/file_reader_writer.h" namespace rocksdb { namespace mock { @@ -45,7 +46,7 @@ MockTableFactory::MockTableFactory() : next_id_(1) {} Status MockTableFactory::NewTableReader( const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_key, - unique_ptr&& file, uint64_t file_size, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader) const { uint32_t id = GetIDFromFile(file.get()); @@ -63,8 +64,8 @@ Status MockTableFactory::NewTableReader( TableBuilder* MockTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const { - uint32_t id = GetAndWriteNextID(file); + WritableFileWriter* file) const { + uint32_t id = GetAndWriteNextID(file->writable_file()); return new MockTableBuilder(id, &file_system_); } @@ -90,7 +91,7 @@ uint32_t MockTableFactory::GetAndWriteNextID(WritableFile* file) const { return next_id; } -uint32_t MockTableFactory::GetIDFromFile(RandomAccessFile* file) const { +uint32_t MockTableFactory::GetIDFromFile(RandomAccessFileReader* file) const { char buf[4]; Slice result; file->Read(0, 4, &result, buf); diff --git a/table/mock_table.h b/table/mock_table.h index ef38575cc..fd542c965 100644 --- a/table/mock_table.h +++ b/table/mock_table.h @@ -140,13 +140,14 @@ class MockTableFactory : public TableFactory { MockTableFactory(); const char* Name() const override { return "MockTable"; } Status NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - 
const InternalKeyComparator& internal_key, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table_reader) const override; + const EnvOptions& env_options, + const InternalKeyComparator& internal_key, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table_reader) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const override; + WritableFileWriter* file) const override; // This function will directly create mock table instead of going through // MockTableBuilder. MockFileContents has to have a format of next_id_; diff --git a/table/plain_table_builder.cc b/table/plain_table_builder.cc index 25e1b85bb..e16224a9d 100644 --- a/table/plain_table_builder.cc +++ b/table/plain_table_builder.cc @@ -26,6 +26,7 @@ #include "table/meta_blocks.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" #include "util/stop_watch.h" namespace rocksdb { @@ -35,11 +36,8 @@ namespace { // a utility that helps writing block content to the file // @offset will advance if @block_contents was successfully written. // @block_handle the block handle this particular block. 
-Status WriteBlock( - const Slice& block_contents, - WritableFile* file, - uint64_t* offset, - BlockHandle* block_handle) { +Status WriteBlock(const Slice& block_contents, WritableFileWriter* file, + uint64_t* offset, BlockHandle* block_handle) { block_handle->set_offset(*offset); block_handle->set_size(block_contents.size()); Status s = file->Append(block_contents); @@ -62,7 +60,7 @@ PlainTableBuilder::PlainTableBuilder( const ImmutableCFOptions& ioptions, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, uint32_t user_key_len, EncodingType encoding_type, + WritableFileWriter* file, uint32_t user_key_len, EncodingType encoding_type, size_t index_sparseness, uint32_t bloom_bits_per_key, uint32_t num_probes, size_t huge_page_tlb_size, double hash_table_ratio, bool store_index_in_file) diff --git a/table/plain_table_builder.h b/table/plain_table_builder.h index f542d2f60..75ec3facd 100644 --- a/table/plain_table_builder.h +++ b/table/plain_table_builder.h @@ -34,10 +34,11 @@ class PlainTableBuilder: public TableBuilder { const ImmutableCFOptions& ioptions, const std::vector>* int_tbl_prop_collector_factories, - WritableFile* file, uint32_t user_key_size, EncodingType encoding_type, - size_t index_sparseness, uint32_t bloom_bits_per_key, - uint32_t num_probes = 6, size_t huge_page_tlb_size = 0, - double hash_table_ratio = 0, bool store_index_in_file = false); + WritableFileWriter* file, uint32_t user_key_size, + EncodingType encoding_type, size_t index_sparseness, + uint32_t bloom_bits_per_key, uint32_t num_probes = 6, + size_t huge_page_tlb_size = 0, double hash_table_ratio = 0, + bool store_index_in_file = false); // REQUIRES: Either Finish() or Abandon() has been called. 
~PlainTableBuilder(); @@ -82,7 +83,7 @@ class PlainTableBuilder: public TableBuilder { BloomBlockBuilder bloom_block_; std::unique_ptr index_builder_; - WritableFile* file_; + WritableFileWriter* file_; uint64_t offset_ = 0; uint32_t bloom_bits_per_key_; size_t huge_page_tlb_size_; diff --git a/table/plain_table_factory.cc b/table/plain_table_factory.cc index ee492fa56..b7ce3b752 100644 --- a/table/plain_table_factory.cc +++ b/table/plain_table_factory.cc @@ -14,12 +14,11 @@ namespace rocksdb { -Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, - const EnvOptions& env_options, - const InternalKeyComparator& icomp, - unique_ptr&& file, - uint64_t file_size, - unique_ptr* table) const { +Status PlainTableFactory::NewTableReader( + const ImmutableCFOptions& ioptions, const EnvOptions& env_options, + const InternalKeyComparator& icomp, + unique_ptr&& file, uint64_t file_size, + unique_ptr* table) const { return PlainTableReader::Open(ioptions, env_options, icomp, std::move(file), file_size, table, bloom_bits_per_key_, hash_table_ratio_, index_sparseness_, @@ -28,7 +27,7 @@ Status PlainTableFactory::NewTableReader(const ImmutableCFOptions& ioptions, TableBuilder* PlainTableFactory::NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const { + WritableFileWriter* file) const { // Ignore the skip_filters flag. PlainTable format is optimized for small // in-memory dbs. 
The skip_filters optimization is not useful for plain // tables diff --git a/table/plain_table_factory.h b/table/plain_table_factory.h index 730e13468..e7e72bea4 100644 --- a/table/plain_table_factory.h +++ b/table/plain_table_factory.h @@ -153,14 +153,15 @@ class PlainTableFactory : public TableFactory { full_scan_mode_(options.full_scan_mode), store_index_in_file_(options.store_index_in_file) {} const char* Name() const override { return "PlainTable"; } - Status NewTableReader( - const ImmutableCFOptions& options, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table) const override; + Status NewTableReader(const ImmutableCFOptions& options, + const EnvOptions& soptions, + const InternalKeyComparator& internal_comparator, + unique_ptr&& file, + uint64_t file_size, + unique_ptr* table) const override; TableBuilder* NewTableBuilder( const TableBuilderOptions& table_builder_options, - WritableFile* file) const override; + WritableFileWriter* file) const override; std::string GetPrintableTableOptions() const override; diff --git a/table/plain_table_key_coding.cc b/table/plain_table_key_coding.cc index 4f09b507e..5aa0fa2a5 100644 --- a/table/plain_table_key_coding.cc +++ b/table/plain_table_key_coding.cc @@ -6,8 +6,9 @@ #ifndef ROCKSDB_LITE #include "table/plain_table_key_coding.h" -#include "table/plain_table_factory.h" #include "db/dbformat.h" +#include "table/plain_table_factory.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -64,7 +65,8 @@ const char* DecodeSize(const char* offset, const char* limit, } } // namespace -Status PlainTableKeyEncoder::AppendKey(const Slice& key, WritableFile* file, +Status PlainTableKeyEncoder::AppendKey(const Slice& key, + WritableFileWriter* file, uint64_t* offset, char* meta_bytes_buf, size_t* meta_bytes_buf_size) { ParsedInternalKey parsed_key; diff --git a/table/plain_table_key_coding.h b/table/plain_table_key_coding.h index 
9047087ae..e1bc5291b 100644 --- a/table/plain_table_key_coding.h +++ b/table/plain_table_key_coding.h @@ -34,7 +34,7 @@ class PlainTableKeyEncoder { // meta_bytes_buf: buffer for extra meta bytes // meta_bytes_buf_size: offset to append extra meta bytes. Will be updated // if meta_bytes_buf is updated. - Status AppendKey(const Slice& key, WritableFile* file, uint64_t* offset, + Status AppendKey(const Slice& key, WritableFileWriter* file, uint64_t* offset, char* meta_bytes_buf, size_t* meta_bytes_buf_size); // Return actual encoding type to be picked diff --git a/table/plain_table_reader.cc b/table/plain_table_reader.cc index c409204aa..529642bba 100644 --- a/table/plain_table_reader.cc +++ b/table/plain_table_reader.cc @@ -90,7 +90,7 @@ class PlainTableIterator : public Iterator { extern const uint64_t kPlainTableMagicNumber; PlainTableReader::PlainTableReader(const ImmutableCFOptions& ioptions, - unique_ptr&& file, + unique_ptr&& file, const EnvOptions& storage_options, const InternalKeyComparator& icomparator, EncodingType encoding_type, @@ -115,7 +115,7 @@ PlainTableReader::~PlainTableReader() { Status PlainTableReader::Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, + unique_ptr&& file, uint64_t file_size, unique_ptr* table_reader, const int bloom_bits_per_key, diff --git a/table/plain_table_reader.h b/table/plain_table_reader.h index b4f68a0fd..64b5317aa 100644 --- a/table/plain_table_reader.h +++ b/table/plain_table_reader.h @@ -22,6 +22,7 @@ #include "table/plain_table_index.h" #include "util/arena.h" #include "util/dynamic_bloom.h" +#include "util/file_reader_writer.h" namespace rocksdb { @@ -56,8 +57,8 @@ class PlainTableReader: public TableReader { static Status Open(const ImmutableCFOptions& ioptions, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, - unique_ptr* table, + unique_ptr&& 
file, + uint64_t file_size, unique_ptr* table, const int bloom_bits_per_key, double hash_table_ratio, size_t index_sparseness, size_t huge_page_tlb_size, bool full_scan_mode); @@ -83,7 +84,7 @@ class PlainTableReader: public TableReader { } PlainTableReader(const ImmutableCFOptions& ioptions, - unique_ptr&& file, + unique_ptr&& file, const EnvOptions& env_options, const InternalKeyComparator& internal_comparator, EncodingType encoding_type, uint64_t file_size, @@ -134,7 +135,7 @@ class PlainTableReader: public TableReader { Arena arena_; const ImmutableCFOptions& ioptions_; - unique_ptr file_; + unique_ptr file_; uint64_t file_size_; std::shared_ptr table_properties_; diff --git a/table/table_reader_bench.cc b/table/table_reader_bench.cc index b4039aa74..e8636f57b 100644 --- a/table/table_reader_bench.cc +++ b/table/table_reader_bench.cc @@ -22,6 +22,7 @@ int main() { #include "table/plain_table_factory.h" #include "table/table_builder.h" #include "table/get_context.h" +#include "util/file_reader_writer.h" #include "util/histogram.h" #include "util/testharness.h" #include "util/testutil.h" @@ -90,11 +91,14 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, std::vector > int_tbl_prop_collector_factories; + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); + tb = opts.table_factory->NewTableBuilder( TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories, CompressionType::kNoCompression, CompressionOptions(), false), - file.get()); + file_writer.get()); } else { s = DB::Open(opts, dbname, &db); ASSERT_OK(s); @@ -119,13 +123,16 @@ void TableReaderBenchmark(Options& opts, EnvOptions& env_options, } unique_ptr table_reader; - unique_ptr raf; if (!through_db) { + unique_ptr raf; s = env->NewRandomAccessFile(file_name, &raf, env_options); uint64_t file_size; env->GetFileSize(file_name, &file_size); - s = opts.table_factory->NewTableReader( - ioptions, env_options, ikc, std::move(raf), file_size, &table_reader); 
+ unique_ptr file_reader( + new RandomAccessFileReader(std::move(raf))); + s = opts.table_factory->NewTableReader(ioptions, env_options, ikc, + std::move(file_reader), file_size, + &table_reader); } Random rnd(301); diff --git a/table/table_test.cc b/table/table_test.cc index 3165a9ae4..bda6db7f9 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -347,7 +347,7 @@ class TableConstructor: public Constructor { const InternalKeyComparator& internal_comparator, const KVMap& kv_map) override { Reset(); - sink_.reset(new StringSink()); + file_writer_.reset(test::GetWritableFileWriter(new StringSink())); unique_ptr builder; std::vector> int_tbl_prop_collector_factories; @@ -355,7 +355,7 @@ class TableConstructor: public Constructor { TableBuilderOptions(ioptions, internal_comparator, &int_tbl_prop_collector_factories, options.compression, CompressionOptions(), false), - sink_.get())); + file_writer_.get())); for (const auto kv : kv_map) { if (convert_to_internal_key_) { @@ -369,17 +369,18 @@ class TableConstructor: public Constructor { EXPECT_TRUE(builder->status().ok()); } Status s = builder->Finish(); + file_writer_->Flush(); EXPECT_TRUE(s.ok()) << s.ToString(); - EXPECT_EQ(sink_->contents().size(), builder->FileSize()); + EXPECT_EQ(GetSink()->contents().size(), builder->FileSize()); // Open the table uniq_id_ = cur_uniq_id_++; - source_.reset(new StringSource(sink_->contents(), uniq_id_, - ioptions.allow_mmap_reads)); + file_reader_.reset(test::GetRandomAccessFileReader(new StringSource( + GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); return ioptions.table_factory->NewTableReader( - ioptions, soptions, internal_comparator, std::move(source_), - sink_->contents().size(), &table_reader_); + ioptions, soptions, internal_comparator, std::move(file_reader_), + GetSink()->contents().size(), &table_reader_); } virtual Iterator* NewIterator() const override { @@ -397,12 +398,11 @@ class TableConstructor: public Constructor { } virtual Status 
Reopen(const ImmutableCFOptions& ioptions) { - source_.reset( - new StringSource(sink_->contents(), uniq_id_, - ioptions.allow_mmap_reads)); + file_reader_.reset(test::GetRandomAccessFileReader(new StringSource( + GetSink()->contents(), uniq_id_, ioptions.allow_mmap_reads))); return ioptions.table_factory->NewTableReader( - ioptions, soptions, *last_internal_key_, std::move(source_), - sink_->contents().size(), &table_reader_); + ioptions, soptions, *last_internal_key_, std::move(file_reader_), + GetSink()->contents().size(), &table_reader_); } virtual TableReader* GetTableReader() { @@ -417,13 +417,17 @@ class TableConstructor: public Constructor { void Reset() { uniq_id_ = 0; table_reader_.reset(); - sink_.reset(); - source_.reset(); + file_writer_.reset(); + file_reader_.reset(); + } + + StringSink* GetSink() { + return static_cast(file_writer_->writable_file()); } uint64_t uniq_id_; - unique_ptr sink_; - unique_ptr source_; + unique_ptr file_writer_; + unique_ptr file_reader_; unique_ptr table_reader_; bool convert_to_internal_key_; @@ -1766,6 +1770,8 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { PlainTableFactory factory(plain_table_options); StringSink sink; + unique_ptr file_writer( + test::GetWritableFileWriter(new StringSink())); Options options; const ImmutableCFOptions ioptions(options); InternalKeyComparator ikc(options.comparator); @@ -1774,7 +1780,7 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { std::unique_ptr builder(factory.NewTableBuilder( TableBuilderOptions(ioptions, ikc, &int_tbl_prop_collector_factories, kNoCompression, CompressionOptions(), false), - &sink)); + file_writer.get())); for (char c = 'a'; c <= 'z'; ++c) { std::string key(8, c); @@ -1783,11 +1789,15 @@ TEST_F(PlainTableTest, BasicPlainTableProperties) { builder->Add(key, value); } ASSERT_OK(builder->Finish()); + file_writer->Flush(); - StringSource source(sink.contents(), 72242, true); + StringSink* ss = static_cast(file_writer->writable_file()); + unique_ptr 
file_reader( + test::GetRandomAccessFileReader( + new StringSource(ss->contents(), 72242, true))); TableProperties* props = nullptr; - auto s = ReadTableProperties(&source, sink.contents().size(), + auto s = ReadTableProperties(file_reader.get(), ss->contents().size(), kPlainTableMagicNumber, Env::Default(), nullptr, &props); std::unique_ptr props_guard(props); diff --git a/util/db_test_util.h b/util/db_test_util.h index 8a1066c51..b55a5ee54 100644 --- a/util/db_test_util.h +++ b/util/db_test_util.h @@ -170,6 +170,9 @@ class SpecialEnv : public EnvWrapper { void SetIOPriority(Env::IOPriority pri) override { base_->SetIOPriority(pri); } + Env::IOPriority GetIOPriority() override { + return base_->GetIOPriority(); + } }; class ManifestFile : public WritableFile { public: diff --git a/util/env_posix.cc b/util/env_posix.cc index 371dcf15b..978d3d505 100644 --- a/util/env_posix.cc +++ b/util/env_posix.cc @@ -40,7 +40,6 @@ #include "util/posix_logger.h" #include "util/random.h" #include "util/iostats_context_imp.h" -#include "util/rate_limiter.h" #include "util/sync_point.h" #include "util/thread_status_updater.h" #include "util/thread_status_util.h" @@ -74,9 +73,6 @@ #define POSIX_FADV_DONTNEED 4 /* [MC1] dont need these pages */ #endif -// This is only set from db_stress.cc and for testing only. -// If non-zero, kill at various points in source code with probability 1/this -int rocksdb_kill_odds = 0; namespace rocksdb { @@ -104,39 +100,6 @@ static Status IOError(const std::string& context, int err_number) { return Status::IOError(context, strerror(err_number)); } -#ifdef NDEBUG -// empty in release build -#define TEST_KILL_RANDOM(rocksdb_kill_odds) -#else - -// Kill the process with probablity 1/odds for testing. 
-static void TestKillRandom(int odds, const std::string& srcfile, - int srcline) { - time_t curtime = time(nullptr); - Random r((uint32_t)curtime); - - assert(odds > 0); - bool crash = r.OneIn(odds); - if (crash) { - fprintf(stdout, "Crashing at %s:%d\n", srcfile.c_str(), srcline); - fflush(stdout); - kill(getpid(), SIGTERM); - } -} - -// To avoid crashing always at some frequently executed codepaths (during -// kill random test), use this factor to reduce odds -#define REDUCE_ODDS 2 -#define REDUCE_ODDS2 4 - -#define TEST_KILL_RANDOM(rocksdb_kill_odds) { \ - if (rocksdb_kill_odds > 0) { \ - TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__); \ - } \ -} - -#endif - #if defined(OS_LINUX) namespace { static size_t GetUniqueIdFromFile(int fd, char* id, size_t max_size) { @@ -188,7 +151,6 @@ class PosixSequentialFile: public SequentialFile { do { r = fread_unlocked(scratch, 1, n, file_); } while (r == 0 && ferror(file_) && errno == EINTR); - IOSTATS_ADD(bytes_read, r); *result = Slice(scratch, r); if (r < n) { if (feof(file_)) { @@ -252,10 +214,7 @@ class PosixRandomAccessFile: public RandomAccessFile { size_t left = n; char* ptr = scratch; while (left > 0) { - { - IOSTATS_TIMER_GUARD(read_nanos); - r = pread(fd_, ptr, left, static_cast(offset)); - } + r = pread(fd_, ptr, left, static_cast(offset)); if (r <= 0) { if (errno == EINTR) { @@ -268,7 +227,6 @@ class PosixRandomAccessFile: public RandomAccessFile { left -= r; } - IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left); *result = Slice(scratch, (r < 0) ? 
0 : n - left); if (r < 0) { // An error: return a non-ok status @@ -458,7 +416,6 @@ class PosixMmapFile : public WritableFile { if (ptr == MAP_FAILED) { return Status::IOError("MMap failed on " + filename_); } - TEST_KILL_RANDOM(rocksdb_kill_odds); base_ = reinterpret_cast(ptr); @@ -482,8 +439,7 @@ class PosixMmapFile : public WritableFile { limit_(nullptr), dst_(nullptr), last_sync_(nullptr), - file_offset_(0), - pending_sync_(false) { + file_offset_(0) { #ifdef ROCKSDB_FALLOCATE_PRESENT fallocate_with_keep_size_ = options.fallocate_with_keep_size; #endif @@ -501,8 +457,6 @@ class PosixMmapFile : public WritableFile { virtual Status Append(const Slice& data) override { const char* src = data.data(); size_t left = data.size(); - TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS); - PrepareWrite(static_cast(GetFileSize()), left); while (left > 0) { assert(base_ <= dst_); assert(dst_ <= limit_); @@ -521,12 +475,10 @@ class PosixMmapFile : public WritableFile { size_t n = (left <= avail) ? left : avail; memcpy(dst_, src, n); - IOSTATS_ADD(bytes_written, n); dst_ += n; src += n; left -= n; } - TEST_KILL_RANDOM(rocksdb_kill_odds); return Status::OK(); } @@ -534,8 +486,6 @@ class PosixMmapFile : public WritableFile { Status s; size_t unused = limit_ - dst_; - TEST_KILL_RANDOM(rocksdb_kill_odds); - s = UnmapCurrentRegion(); if (!s.ok()) { s = IOError(filename_, errno); @@ -546,8 +496,6 @@ class PosixMmapFile : public WritableFile { } } - TEST_KILL_RANDOM(rocksdb_kill_odds); - if (close(fd_) < 0) { if (s.ok()) { s = IOError(filename_, errno); @@ -561,22 +509,15 @@ class PosixMmapFile : public WritableFile { } virtual Status Flush() override { - TEST_KILL_RANDOM(rocksdb_kill_odds); return Status::OK(); } virtual Status Sync() override { Status s; - if (pending_sync_) { - // Some unmapped data was not synced - TEST_KILL_RANDOM(rocksdb_kill_odds); - pending_sync_ = false; if (fdatasync(fd_) < 0) { s = IOError(filename_, errno); } - TEST_KILL_RANDOM(rocksdb_kill_odds * 
REDUCE_ODDS); - } if (dst_ > last_sync_) { // Find the beginnings of the pages that contain the first and last @@ -588,7 +529,6 @@ class PosixMmapFile : public WritableFile { if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { s = IOError(filename_, errno); } - TEST_KILL_RANDOM(rocksdb_kill_odds); } return s; @@ -598,15 +538,10 @@ class PosixMmapFile : public WritableFile { * Flush data as well as metadata to stable storage. */ virtual Status Fsync() override { - if (pending_sync_) { // Some unmapped data was not synced - TEST_KILL_RANDOM(rocksdb_kill_odds); - pending_sync_ = false; if (fsync(fd_) < 0) { return IOError(filename_, errno); } - TEST_KILL_RANDOM(rocksdb_kill_odds); - } // This invocation to Sync will not issue the call to // fdatasync because pending_sync_ has already been cleared. return Sync(); @@ -638,7 +573,6 @@ class PosixMmapFile : public WritableFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) override { TEST_KILL_RANDOM(rocksdb_kill_odds); - IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = fallocate( fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, offset, len); if (alloc_status == 0) { @@ -655,33 +589,14 @@ class PosixWritableFile : public WritableFile { private: const std::string filename_; int fd_; - size_t cursize_; // current size of cached data in buf_ - size_t capacity_; // max size of buf_ - unique_ptr buf_; // a buffer to cache writes uint64_t filesize_; - bool pending_sync_; - bool pending_fsync_; - uint64_t last_sync_size_; - uint64_t bytes_per_sync_; #ifdef ROCKSDB_FALLOCATE_PRESENT bool fallocate_with_keep_size_; #endif - RateLimiter* rate_limiter_; public: - PosixWritableFile(const std::string& fname, int fd, size_t capacity, - const EnvOptions& options) - : filename_(fname), - fd_(fd), - cursize_(0), - capacity_(capacity), - buf_(new char[capacity]), - filesize_(0), - pending_sync_(false), - pending_fsync_(false), - last_sync_size_(0), - bytes_per_sync_(options.bytes_per_sync), - rate_limiter_(options.rate_limiter) { + PosixWritableFile(const std::string& fname, int fd, const EnvOptions& options) + : filename_(fname), fd_(fd), filesize_(0) { #ifdef ROCKSDB_FALLOCATE_PRESENT fallocate_with_keep_size_ = options.fallocate_with_keep_size; #endif @@ -698,64 +613,23 @@ class PosixWritableFile : public WritableFile { const char* src = data.data(); size_t left = data.size(); Status s; - pending_sync_ = true; - pending_fsync_ = true; - - TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); - - PrepareWrite(static_cast(GetFileSize()), left); - // if there is no space in the cache, then flush - if (cursize_ + left > capacity_) { - s = Flush(); - if (!s.ok()) { - return s; - } - // Increase the buffer size, but capped at 1MB - if (capacity_ < (1<<20)) { - capacity_ *= 2; - buf_.reset(new char[capacity_]); - } - assert(cursize_ == 0); - } - - // if the write fits into the cache, then write to cache - // otherwise do a write() syscall to write to OS buffers. 
- if (cursize_ + left <= capacity_) { - memcpy(buf_.get()+cursize_, src, left); - cursize_ += left; - } else { while (left != 0) { - ssize_t done; - size_t size = RequestToken(left); - { - IOSTATS_TIMER_GUARD(write_nanos); - done = write(fd_, src, size); - } + ssize_t done = write(fd_, src, left); if (done < 0) { if (errno == EINTR) { continue; } return IOError(filename_, errno); } - IOSTATS_ADD(bytes_written, done); - TEST_KILL_RANDOM(rocksdb_kill_odds); - left -= done; src += done; } - } - filesize_ += data.size(); + filesize_ += data.size(); return Status::OK(); } virtual Status Close() override { Status s; - s = Flush(); // flush cache to OS - if (!s.ok()) { - return s; - } - - TEST_KILL_RANDOM(rocksdb_kill_odds); size_t block_size; size_t last_allocated_block; @@ -793,68 +667,20 @@ class PosixWritableFile : public WritableFile { // write out the cached data to the OS cache virtual Status Flush() override { - TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); - size_t left = cursize_; - char* src = buf_.get(); - while (left != 0) { - ssize_t done; - size_t size = RequestToken(left); - { - IOSTATS_TIMER_GUARD(write_nanos); - done = write(fd_, src, size); - } - if (done < 0) { - if (errno == EINTR) { - continue; - } - return IOError(filename_, errno); - } - IOSTATS_ADD(bytes_written, done); - TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); - left -= done; - src += done; - } - cursize_ = 0; - - // sync OS cache to disk for every bytes_per_sync_ - // TODO: give log file and sst file different options (log - // files could be potentially cached in OS for their whole - // life time, thus we might not want to flush at all). 
- if (bytes_per_sync_ && - filesize_ - last_sync_size_ >= bytes_per_sync_) { - RangeSync(last_sync_size_, filesize_ - last_sync_size_); - last_sync_size_ = filesize_; - } - return Status::OK(); } virtual Status Sync() override { - Status s = Flush(); - if (!s.ok()) { - return s; - } - TEST_KILL_RANDOM(rocksdb_kill_odds); - if (pending_sync_ && fdatasync(fd_) < 0) { + if (fdatasync(fd_) < 0) { return IOError(filename_, errno); } - TEST_KILL_RANDOM(rocksdb_kill_odds); - pending_sync_ = false; return Status::OK(); } virtual Status Fsync() override { - Status s = Flush(); - if (!s.ok()) { - return s; - } - TEST_KILL_RANDOM(rocksdb_kill_odds); - if (pending_fsync_ && fsync(fd_) < 0) { + if (fsync(fd_) < 0) { return IOError(filename_, errno); } - TEST_KILL_RANDOM(rocksdb_kill_odds); - pending_fsync_ = false; - pending_sync_ = false; return Status::OK(); } @@ -876,8 +702,8 @@ class PosixWritableFile : public WritableFile { #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) override { TEST_KILL_RANDOM(rocksdb_kill_odds); - int alloc_status; IOSTATS_TIMER_GUARD(allocate_nanos); + int alloc_status; alloc_status = fallocate( fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, offset, len); if (alloc_status == 0) { @@ -888,7 +714,6 @@ class PosixWritableFile : public WritableFile { } virtual Status RangeSync(off_t offset, off_t nbytes) override { - IOSTATS_TIMER_GUARD(range_sync_nanos); if (sync_file_range(fd_, offset, nbytes, SYNC_FILE_RANGE_WRITE) == 0) { return Status::OK(); } else { @@ -899,34 +724,19 @@ class PosixWritableFile : public WritableFile { return GetUniqueIdFromFile(fd_, id, max_size); } #endif - - private: - inline size_t RequestToken(size_t bytes) { - if (rate_limiter_ && io_priority_ < Env::IO_TOTAL) { - bytes = std::min(bytes, - static_cast(rate_limiter_->GetSingleBurstBytes())); - rate_limiter_->Request(bytes, io_priority_); - } - return bytes; - } }; class PosixRandomRWFile : public RandomRWFile { private: const std::string filename_; int fd_; - bool pending_sync_; - bool pending_fsync_; #ifdef ROCKSDB_FALLOCATE_PRESENT bool fallocate_with_keep_size_; #endif public: PosixRandomRWFile(const std::string& fname, int fd, const EnvOptions& options) - : filename_(fname), - fd_(fd), - pending_sync_(false), - pending_fsync_(false) { + : filename_(fname), fd_(fd) { #ifdef ROCKSDB_FALLOCATE_PRESENT fallocate_with_keep_size_ = options.fallocate_with_keep_size; #endif @@ -943,22 +753,15 @@ class PosixRandomRWFile : public RandomRWFile { const char* src = data.data(); size_t left = data.size(); Status s; - pending_sync_ = true; - pending_fsync_ = true; while (left != 0) { - ssize_t done; - { - IOSTATS_TIMER_GUARD(write_nanos); - done = pwrite(fd_, src, left, offset); - } + ssize_t done = pwrite(fd_, src, left, offset); if (done < 0) { if (errno == EINTR) { continue; } return IOError(filename_, errno); } - IOSTATS_ADD(bytes_written, done); left -= done; src += done; @@ -975,11 +778,7 @@ class PosixRandomRWFile : public RandomRWFile { size_t left = n; char* ptr = scratch; while (left > 0) { - { - IOSTATS_TIMER_GUARD(read_nanos); - r = pread(fd_, ptr, left, static_cast(offset)); - } - + r = pread(fd_, ptr, 
left, static_cast(offset)); if (r <= 0) { if (errno == EINTR) { continue; @@ -990,7 +789,6 @@ class PosixRandomRWFile : public RandomRWFile { offset += r; left -= r; } - IOSTATS_ADD_IF_POSITIVE(bytes_read, n - left); *result = Slice(scratch, (r < 0) ? 0 : n - left); if (r < 0) { s = IOError(filename_, errno); @@ -1008,25 +806,21 @@ class PosixRandomRWFile : public RandomRWFile { } virtual Status Sync() override { - if (pending_sync_ && fdatasync(fd_) < 0) { + if (fdatasync(fd_) < 0) { return IOError(filename_, errno); } - pending_sync_ = false; return Status::OK(); } virtual Status Fsync() override { - if (pending_fsync_ && fsync(fd_) < 0) { + if (fsync(fd_) < 0) { return IOError(filename_, errno); } - pending_fsync_ = false; - pending_sync_ = false; return Status::OK(); } #ifdef ROCKSDB_FALLOCATE_PRESENT virtual Status Allocate(off_t offset, off_t len) override { - TEST_KILL_RANDOM(rocksdb_kill_odds); IOSTATS_TIMER_GUARD(allocate_nanos); int alloc_status = fallocate( fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, offset, len); @@ -1216,9 +1010,7 @@ class PosixEnv : public Env { EnvOptions no_mmap_writes_options = options; no_mmap_writes_options.use_mmap_writes = false; - result->reset( - new PosixWritableFile(fname, fd, 65536, no_mmap_writes_options) - ); + result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options)); } } return s; diff --git a/util/env_test.cc b/util/env_test.cc index baeab1df0..1ae50fdbc 100644 --- a/util/env_test.cc +++ b/util/env_test.cc @@ -664,6 +664,7 @@ TEST_F(EnvPosixTest, AllocateTest) { size_t kPageSize = 4096; std::string data(1024 * 1024, 'a'); wfile->SetPreallocationBlockSize(kPreallocateSize); + wfile->PrepareWrite(wfile->GetFileSize(), data.size()); ASSERT_OK(wfile->Append(Slice(data))); ASSERT_OK(wfile->Flush()); @@ -974,18 +975,22 @@ TEST_F(EnvPosixTest, Preallocation) { ASSERT_EQ(last_allocated_block, 0UL); // Small write should preallocate one block - srcfile->Append("test"); + std::string str = "test"; + srcfile->PrepareWrite(srcfile->GetFileSize(), str.size()); + srcfile->Append(str); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); ASSERT_EQ(last_allocated_block, 1UL); // Write an entire preallocation block, make sure we increased by two. std::string buf(block_size, ' '); + srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size()); srcfile->Append(buf); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); ASSERT_EQ(last_allocated_block, 2UL); // Write five more blocks at once, ensure we're where we need to be. buf = std::string(block_size * 5, ' '); + srcfile->PrepareWrite(srcfile->GetFileSize(), buf.size()); srcfile->Append(buf); srcfile->GetPreallocationStatus(&block_size, &last_allocated_block); ASSERT_EQ(last_allocated_block, 7UL); diff --git a/util/file_reader_writer.cc b/util/file_reader_writer.cc new file mode 100644 index 000000000..ebd54c7eb --- /dev/null +++ b/util/file_reader_writer.cc @@ -0,0 +1,225 @@ +// Copyright (c) 2013, Facebook, Inc. 
All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "util/file_reader_writer.h" + +#include +#include "port/port.h" +#include "util/iostats_context_imp.h" +#include "util/random.h" +#include "util/rate_limiter.h" +#include "util/sync_point.h" + +namespace rocksdb { +Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) { + Status s = file_->Read(n, result, scratch); + IOSTATS_ADD(bytes_read, result->size()); + return s; +} + +Status SequentialFileReader::Skip(uint64_t n) { return file_->Skip(n); } + +Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + IOSTATS_TIMER_GUARD(read_nanos); + Status s = file_->Read(offset, n, result, scratch); + IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size()); + return s; +} + +Status WritableFileWriter::Append(const Slice& data) { + const char* src = data.data(); + size_t left = data.size(); + Status s; + pending_sync_ = true; + pending_fsync_ = true; + + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + + writable_file_->PrepareWrite(static_cast(GetFileSize()), left); + // if there is no space in the cache, then flush + if (cursize_ + left > capacity_) { + s = Flush(); + if (!s.ok()) { + return s; + } + // Increase the buffer size, but capped at 1MB + if (capacity_ < (1 << 20)) { + capacity_ *= 2; + buf_.reset(new char[capacity_]); + } + assert(cursize_ == 0); + } + + // if the write fits into the cache, then write to cache + // otherwise do a write() syscall to write to OS buffers. 
+ if (cursize_ + left <= capacity_) { + memcpy(buf_.get() + cursize_, src, left); + cursize_ += left; + } else { + while (left != 0) { + size_t size = RequestToken(left); + { + IOSTATS_TIMER_GUARD(write_nanos); + s = writable_file_->Append(Slice(src, size)); + if (!s.ok()) { + return s; + } + } + IOSTATS_ADD(bytes_written, size); + TEST_KILL_RANDOM(rocksdb_kill_odds); + + left -= size; + src += size; + } + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + filesize_ += data.size(); + return Status::OK(); +} + +Status WritableFileWriter::Close() { + Status s; + s = Flush(); // flush cache to OS + if (!s.ok()) { + return s; + } + + TEST_KILL_RANDOM(rocksdb_kill_odds); + return writable_file_->Close(); +} + +// write out the cached data to the OS cache +Status WritableFileWriter::Flush() { + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + size_t left = cursize_; + char* src = buf_.get(); + while (left != 0) { + size_t size = RequestToken(left); + { + IOSTATS_TIMER_GUARD(write_nanos); + Status s = writable_file_->Append(Slice(src, size)); + if (!s.ok()) { + return s; + } + } + IOSTATS_ADD(bytes_written, size); + TEST_KILL_RANDOM(rocksdb_kill_odds * REDUCE_ODDS2); + left -= size; + src += size; + } + cursize_ = 0; + + writable_file_->Flush(); + + // sync OS cache to disk for every bytes_per_sync_ + // TODO: give log file and sst file different options (log + // files could be potentially cached in OS for their whole + // life time, thus we might not want to flush at all). 
+ if (bytes_per_sync_ && filesize_ - last_sync_size_ >= bytes_per_sync_) { + writable_file_->RangeSync(last_sync_size_, filesize_ - last_sync_size_); + last_sync_size_ = filesize_; + } + + return Status::OK(); +} + +Status WritableFileWriter::Sync(bool use_fsync) { + Status s = Flush(); + if (!s.ok()) { + return s; + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + if (pending_sync_) { + if (use_fsync) { + s = writable_file_->Fsync(); + } else { + s = writable_file_->Sync(); + } + if (!s.ok()) { + return s; + } + } + TEST_KILL_RANDOM(rocksdb_kill_odds); + pending_sync_ = false; + if (use_fsync) { + pending_fsync_ = false; + } + return Status::OK(); +} + +Status WritableFileWriter::RangeSync(off_t offset, off_t nbytes) { + IOSTATS_TIMER_GUARD(range_sync_nanos); + return writable_file_->RangeSync(offset, nbytes); +} + +size_t WritableFileWriter::RequestToken(size_t bytes) { + Env::IOPriority io_priority; + if (rate_limiter_&&(io_priority = writable_file_->GetIOPriority()) < + Env::IO_TOTAL) { + bytes = std::min(bytes, + static_cast(rate_limiter_->GetSingleBurstBytes())); + rate_limiter_->Request(bytes, io_priority); + } + return bytes; +} + +Status RandomRWFileAccessor::Write(uint64_t offset, const Slice& data) { + Status s; + pending_sync_ = true; + pending_fsync_ = true; + + { + IOSTATS_TIMER_GUARD(write_nanos); + s = random_rw_file_->Write(offset, data); + if (!s.ok()) { + return s; + } + } + IOSTATS_ADD(bytes_written, data.size()); + + return s; +} + +Status RandomRWFileAccessor::Read(uint64_t offset, size_t n, Slice* result, + char* scratch) const { + Status s; + { + IOSTATS_TIMER_GUARD(read_nanos); + s = random_rw_file_->Read(offset, n, result, scratch); + if (!s.ok()) { + return s; + } + } + IOSTATS_ADD_IF_POSITIVE(bytes_read, result->size()); + return s; +} + +Status RandomRWFileAccessor::Close() { return random_rw_file_->Close(); } + +Status RandomRWFileAccessor::Sync(bool use_fsync) { + Status s; + if (pending_sync_) { + if (use_fsync) { + s = 
random_rw_file_->Fsync(); + } else { + s = random_rw_file_->Sync(); + } + if (!s.ok()) { + return s; + } + } + if (use_fsync) { + pending_fsync_ = false; + } + pending_sync_ = false; + + return s; +} +} // namespace rocksdb diff --git a/util/file_reader_writer.h b/util/file_reader_writer.h new file mode 100644 index 000000000..396ad57ef --- /dev/null +++ b/util/file_reader_writer.h @@ -0,0 +1,109 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. +#pragma once +#include "rocksdb/env.h" + +namespace rocksdb { +class SequentialFileReader { + private: + std::unique_ptr file_; + + public: + explicit SequentialFileReader(std::unique_ptr&& _file) + : file_(std::move(_file)) {} + Status Read(size_t n, Slice* result, char* scratch); + + Status Skip(uint64_t n); + + SequentialFile* file() { return file_.get(); } +}; + +class RandomAccessFileReader : public RandomAccessFile { + private: + std::unique_ptr file_; + + public: + explicit RandomAccessFileReader(std::unique_ptr&& raf) + : file_(std::move(raf)) {} + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const; + + RandomAccessFile* file() { return file_.get(); } +}; + +// Use posix write to write data to a file. 
+class WritableFileWriter { + private: + std::unique_ptr writable_file_; + size_t cursize_; // current size of cached data in buf_ + size_t capacity_; // max size of buf_ + unique_ptr buf_; // a buffer to cache writes + uint64_t filesize_; + bool pending_sync_; + bool pending_fsync_; + uint64_t last_sync_size_; + uint64_t bytes_per_sync_; + RateLimiter* rate_limiter_; + + public: + explicit WritableFileWriter(std::unique_ptr&& file, + const EnvOptions& options) + : writable_file_(std::move(file)), + cursize_(0), + capacity_(65536), + buf_(new char[capacity_]), + filesize_(0), + pending_sync_(false), + pending_fsync_(false), + last_sync_size_(0), + bytes_per_sync_(options.bytes_per_sync), + rate_limiter_(options.rate_limiter) {} + + ~WritableFileWriter() { Flush(); } + Status Append(const Slice& data); + + Status Flush(); + + Status Close(); + + Status Sync(bool use_fsync); + + uint64_t GetFileSize() { return filesize_; } + + Status InvalidateCache(size_t offset, size_t length) { + return writable_file_->InvalidateCache(offset, length); + } + + WritableFile* writable_file() const { return writable_file_.get(); } + + private: + Status RangeSync(off_t offset, off_t nbytes); + size_t RequestToken(size_t bytes); +}; + +class RandomRWFileAccessor { + private: + std::unique_ptr random_rw_file_; + bool pending_sync_; + bool pending_fsync_; + + public: + explicit RandomRWFileAccessor(std::unique_ptr&& f) + : random_rw_file_(std::move(f)), + pending_sync_(false), + pending_fsync_(false) {} + Status Write(uint64_t offset, const Slice& data); + + Status Read(uint64_t offset, size_t n, Slice* result, char* scratch) const; + + Status Close(); + + Status Sync(bool use_fsync); +}; +} // namespace rocksdb diff --git a/util/file_util.cc b/util/file_util.cc index c75d59c5f..a4d528f70 100644 --- a/util/file_util.cc +++ b/util/file_util.cc @@ -8,6 +8,7 @@ #include "util/file_util.h" #include "rocksdb/env.h" #include "db/filename.h" +#include "util/file_reader_writer.h" namespace 
rocksdb { @@ -15,8 +16,12 @@ namespace rocksdb { Status CopyFile(Env* env, const std::string& source, const std::string& destination, uint64_t size) { const EnvOptions soptions; - unique_ptr srcfile; Status s; + unique_ptr src_reader; + unique_ptr dest_writer; + + { + unique_ptr srcfile; s = env->NewSequentialFile(source, &srcfile, soptions); unique_ptr destfile; if (s.ok()) { @@ -33,6 +38,9 @@ Status CopyFile(Env* env, const std::string& source, return s; } } + src_reader.reset(new SequentialFileReader(std::move(srcfile))); + dest_writer.reset(new WritableFileWriter(std::move(destfile), soptions)); + } char buffer[4096]; Slice slice; @@ -40,13 +48,13 @@ Status CopyFile(Env* env, const std::string& source, uint64_t bytes_to_read = std::min(static_cast(sizeof(buffer)), size); if (s.ok()) { - s = srcfile->Read(bytes_to_read, &slice, buffer); + s = src_reader->Read(bytes_to_read, &slice, buffer); } if (s.ok()) { if (slice.size() == 0) { return Status::Corruption("file too small"); } - s = destfile->Append(slice); + s = dest_writer->Append(slice); } if (!s.ok()) { return s; diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index bbb738f58..406b6b999 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1408,10 +1408,18 @@ class InMemoryHandler : public WriteBatch::Handler { void DumpWalFile(std::string wal_file, bool print_header, bool print_values, LDBCommandExecuteResult* exec_state) { - unique_ptr file; Env* env_ = Env::Default(); EnvOptions soptions; - Status status = env_->NewSequentialFile(wal_file, &file, soptions); + unique_ptr wal_file_reader; + + Status status; + { + unique_ptr file; + status = env_->NewSequentialFile(wal_file, &file, soptions); + if (status.ok()) { + wal_file_reader.reset(new SequentialFileReader(std::move(file))); + } + } if (!status.ok()) { if (exec_state) { *exec_state = LDBCommandExecuteResult::Failed("Failed to open WAL file " + @@ -1422,7 +1430,7 @@ void DumpWalFile(std::string wal_file, bool print_header, bool print_values, } } else { 
StdErrReporter reporter; - log::Reader reader(move(file), &reporter, true, 0); + log::Reader reader(move(wal_file_reader), &reporter, true, 0); string scratch; WriteBatch batch; Slice record; diff --git a/util/sst_dump_test.cc b/util/sst_dump_test.cc index 03d7299a3..fb9bde00c 100644 --- a/util/sst_dump_test.cc +++ b/util/sst_dump_test.cc @@ -13,6 +13,7 @@ #include "rocksdb/filter_policy.h" #include "table/block_based_table_factory.h" #include "table/table_builder.h" +#include "util/file_reader_writer.h" #include "util/testharness.h" #include "util/testutil.h" @@ -53,12 +54,13 @@ void createSST(const std::string& file_name, opts.table_factory = tf; std::vector > int_tbl_prop_collector_factories; - + unique_ptr file_writer( + new WritableFileWriter(std::move(file), EnvOptions())); tb.reset(opts.table_factory->NewTableBuilder( TableBuilderOptions(imoptions, ikc, &int_tbl_prop_collector_factories, CompressionType::kNoCompression, CompressionOptions(), false), - file.get())); + file_writer.get())); // Populate slightly more than 1K keys uint32_t num_keys = 1024; @@ -66,7 +68,7 @@ void createSST(const std::string& file_name, tb->Add(MakeKey(i), MakeValue(i)); } tb->Finish(); - file->Close(); + file_writer->Close(); } void cleanup(const std::string& file_name) { diff --git a/util/sst_dump_tool.cc b/util/sst_dump_tool.cc index 04486da83..00bbed28a 100644 --- a/util/sst_dump_tool.cc +++ b/util/sst_dump_tool.cc @@ -24,7 +24,6 @@ SstFileReader::SstFileReader(const std::string& file_path, output_hex_(output_hex), ioptions_(options_), internal_comparator_(BytewiseComparator()) { fprintf(stdout, "Process %s\n", file_path.c_str()); - init_result_ = GetTableReader(file_name_); } @@ -41,10 +40,13 @@ Status SstFileReader::GetTableReader(const std::string& file_path) { unique_ptr file; uint64_t file_size; - Status s = options_.env->NewRandomAccessFile(file_path, &file_, soptions_); + Status s = options_.env->NewRandomAccessFile(file_path, &file, soptions_); if (s.ok()) { s = 
options_.env->GetFileSize(file_path, &file_size); } + + file_.reset(new RandomAccessFileReader(std::move(file))); + if (s.ok()) { s = ReadFooterFromFile(file_.get(), file_size, &footer); } @@ -56,7 +58,8 @@ Status SstFileReader::GetTableReader(const std::string& file_path) { if (magic_number == kPlainTableMagicNumber || magic_number == kLegacyPlainTableMagicNumber) { soptions_.use_mmap_reads = true; - options_.env->NewRandomAccessFile(file_path, &file_, soptions_); + options_.env->NewRandomAccessFile(file_path, &file, soptions_); + file_.reset(new RandomAccessFileReader(std::move(file))); } options_.comparator = &internal_comparator_; // For old sst format, ReadTableProperties might fail but file can be read @@ -68,16 +71,15 @@ Status SstFileReader::GetTableReader(const std::string& file_path) { } if (s.ok()) { - s = NewTableReader(ioptions_, soptions_, internal_comparator_, - std::move(file_), file_size, &table_reader_); + s = NewTableReader(ioptions_, soptions_, internal_comparator_, file_size, + &table_reader_); } return s; } Status SstFileReader::NewTableReader( const ImmutableCFOptions& ioptions, const EnvOptions& soptions, - const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + const InternalKeyComparator& internal_comparator, uint64_t file_size, unique_ptr* table_reader) { // We need to turn off pre-fetching of index and filter nodes for // BlockBasedTable @@ -108,7 +110,7 @@ Status SstFileReader::DumpTable(const std::string& out_filename) { } Status SstFileReader::ReadTableProperties(uint64_t table_magic_number, - RandomAccessFile* file, + RandomAccessFileReader* file, uint64_t file_size) { TableProperties* table_properties = nullptr; Status s = rocksdb::ReadTableProperties(file, file_size, table_magic_number, diff --git a/util/sst_dump_tool_imp.h b/util/sst_dump_tool_imp.h index a5f22679e..b8b866057 100644 --- a/util/sst_dump_tool_imp.h +++ b/util/sst_dump_tool_imp.h @@ -28,6 +28,7 @@ #include "table/format.h" 
#include "table/meta_blocks.h" #include "table/plain_table_factory.h" +#include "util/file_reader_writer.h" #include "util/ldb_cmd.h" #include "util/random.h" #include "util/testharness.h" @@ -56,7 +57,7 @@ class SstFileReader { // Get the TableReader implementation for the sst file Status GetTableReader(const std::string& file_path); Status ReadTableProperties(uint64_t table_magic_number, - RandomAccessFile* file, uint64_t file_size); + RandomAccessFileReader* file, uint64_t file_size); Status SetTableOptionsByMagicNumber(uint64_t table_magic_number); Status SetOldTableOptions(); @@ -65,7 +66,7 @@ class SstFileReader { Status NewTableReader(const ImmutableCFOptions& ioptions, const EnvOptions& soptions, const InternalKeyComparator& internal_comparator, - unique_ptr&& file, uint64_t file_size, + uint64_t file_size, unique_ptr* table_reader); std::string file_name_; @@ -76,7 +77,7 @@ class SstFileReader { Status init_result_; unique_ptr table_reader_; - unique_ptr file_; + unique_ptr file_; // options_ and internal_comparator_ will also be used in // ReadSequential internally (specifically, seek-related operations) Options options_; diff --git a/util/sync_point.cc b/util/sync_point.cc index 3c224bfac..7051b5103 100644 --- a/util/sync_point.cc +++ b/util/sync_point.cc @@ -4,10 +4,25 @@ // of patent rights can be found in the PATENTS file in the same directory. 
#include "util/sync_point.h" +#include "port/port.h" +#include "util/random.h" + +int rocksdb_kill_odds = 0; #ifndef NDEBUG namespace rocksdb { +void TestKillRandom(int odds, const std::string& srcfile, int srcline) { + time_t curtime = time(nullptr); + Random r((uint32_t)curtime); + + assert(odds > 0); + bool crash = r.OneIn(odds); + if (crash) { + port::Crash(srcfile, srcline); + } +} + SyncPoint* SyncPoint::GetInstance() { static SyncPoint sync_point; return &sync_point; diff --git a/util/sync_point.h b/util/sync_point.h index 7827d286f..6a4629cb3 100644 --- a/util/sync_point.h +++ b/util/sync_point.h @@ -4,6 +4,7 @@ // of patent rights can be found in the PATENTS file in the same directory. #pragma once +#include #include #include #include @@ -11,6 +12,33 @@ #include #include +// This is only set from db_stress.cc and for testing only. +// If non-zero, kill at various points in source code with probability 1/this +extern int rocksdb_kill_odds; + +#ifdef NDEBUG +// empty in release build +#define TEST_KILL_RANDOM(rocksdb_kill_odds) +#else + +namespace rocksdb { +// Kill the process with probability 1/odds for testing.
+extern void TestKillRandom(int odds, const std::string& srcfile, int srcline); + +// To avoid crashing always at some frequently executed codepaths (during +// kill random test), use this factor to reduce odds +#define REDUCE_ODDS 2 +#define REDUCE_ODDS2 4 + +#define TEST_KILL_RANDOM(rocksdb_kill_odds) \ + { \ + if (rocksdb_kill_odds > 0) { \ + TestKillRandom(rocksdb_kill_odds, __FILE__, __LINE__); \ + } \ + } +} // namespace rocksdb +#endif + #ifdef NDEBUG #define TEST_SYNC_POINT(x) #define TEST_SYNC_POINT_CALLBACK(x, y) diff --git a/util/testutil.cc b/util/testutil.cc index 20f22c2dc..ebe7a308b 100644 --- a/util/testutil.cc +++ b/util/testutil.cc @@ -10,6 +10,7 @@ #include "util/testutil.h" #include "port/port.h" +#include "util/file_reader_writer.h" #include "util/random.h" namespace rocksdb { @@ -107,5 +108,20 @@ const Comparator* Uint64Comparator() { return uint64comp; } +WritableFileWriter* GetWritableFileWriter(WritableFile* wf) { + unique_ptr file(wf); + return new WritableFileWriter(std::move(file), EnvOptions()); +} + +RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf) { + unique_ptr file(raf); + return new RandomAccessFileReader(std::move(file)); +} + +SequentialFileReader* GetSequentialFileReader(SequentialFile* se) { + unique_ptr file(se); + return new SequentialFileReader(std::move(file)); +} + } // namespace test } // namespace rocksdb diff --git a/util/testutil.h b/util/testutil.h index 240a468ae..24eceddfd 100644 --- a/util/testutil.h +++ b/util/testutil.h @@ -19,6 +19,9 @@ #include "util/random.h" namespace rocksdb { +class SequentialFile; +class SequentialFileReader; + namespace test { // Store in *dst a random string of length "len" and return a Slice that @@ -159,6 +162,11 @@ class VectorIterator : public Iterator { std::vector values_; size_t current_; }; +extern WritableFileWriter* GetWritableFileWriter(WritableFile* wf); + +extern RandomAccessFileReader* GetRandomAccessFileReader(RandomAccessFile* raf); + +extern 
SequentialFileReader* GetSequentialFileReader(SequentialFile* se); } // namespace test } // namespace rocksdb diff --git a/utilities/backupable/backupable_db.cc b/utilities/backupable/backupable_db.cc index 6b8515159..c25cec936 100644 --- a/utilities/backupable/backupable_db.cc +++ b/utilities/backupable/backupable_db.cc @@ -14,6 +14,7 @@ #include "util/channel.h" #include "util/coding.h" #include "util/crc32c.h" +#include "util/file_reader_writer.h" #include "util/logging.h" #include "util/string_util.h" #include "rocksdb/transaction_log.h" @@ -1105,7 +1106,10 @@ Status BackupEngineImpl::GetLatestBackupFileContents(uint32_t* latest_backup) { char buf[11]; Slice data; - s = file->Read(10, &data, buf); + unique_ptr file_reader( + new SequentialFileReader(std::move(file))); + + s = file_reader->Read(10, &data, buf); if (!s.ok() || data.size() == 0) { return s.ok() ? Status::Corruption("Latest backup file corrupted") : s; } @@ -1137,14 +1141,16 @@ Status BackupEngineImpl::PutLatestBackupFileContents(uint32_t latest_backup) { return s; } + unique_ptr file_writer( + new WritableFileWriter(std::move(file), env_options)); char file_contents[10]; int len = sprintf(file_contents, "%u\n", latest_backup); - s = file->Append(Slice(file_contents, len)); + s = file_writer->Append(Slice(file_contents, len)); if (s.ok() && options_.sync) { - file->Sync(); + file_writer->Sync(false); } if (s.ok()) { - s = file->Close(); + s = file_writer->Close(); } if (s.ok()) { // atomically replace real file with new tmp @@ -1187,6 +1193,10 @@ Status BackupEngineImpl::CopyFile( return s; } + unique_ptr dest_writer( + new WritableFileWriter(std::move(dst_file), env_options)); + unique_ptr src_reader( + new SequentialFileReader(std::move(src_file))); unique_ptr buf(new char[copy_file_buffer_size_]); Slice data; @@ -1196,7 +1206,7 @@ Status BackupEngineImpl::CopyFile( } size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? 
copy_file_buffer_size_ : size_limit; - s = src_file->Read(buffer_to_read, &data, buf.get()); + s = src_reader->Read(buffer_to_read, &data, buf.get()); size_limit -= data.size(); if (!s.ok()) { @@ -1210,14 +1220,14 @@ Status BackupEngineImpl::CopyFile( *checksum_value = crc32c::Extend(*checksum_value, data.data(), data.size()); } - s = dst_file->Append(data); + s = dest_writer->Append(data); if (rate_limiter != nullptr) { rate_limiter->ReportAndWait(data.size()); } } while (s.ok() && data.size() > 0 && size_limit > 0); if (s.ok() && sync) { - s = dst_file->Sync(); + s = dest_writer->Sync(false); } return s; @@ -1358,6 +1368,8 @@ Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env, return s; } + unique_ptr src_reader( + new SequentialFileReader(std::move(src_file))); std::unique_ptr buf(new char[copy_file_buffer_size_]); Slice data; @@ -1367,7 +1379,7 @@ Status BackupEngineImpl::CalculateChecksum(const std::string& src, Env* src_env, } size_t buffer_to_read = (copy_file_buffer_size_ < size_limit) ? copy_file_buffer_size_ : size_limit; - s = src_file->Read(buffer_to_read, &data, buf.get()); + s = src_reader->Read(buffer_to_read, &data, buf.get()); if (!s.ok()) { return s; @@ -1522,9 +1534,11 @@ Status BackupEngineImpl::BackupMeta::LoadFromFile( return s; } + unique_ptr backup_meta_reader( + new SequentialFileReader(std::move(backup_meta_file))); unique_ptr buf(new char[max_backup_meta_file_size_ + 1]); Slice data; - s = backup_meta_file->Read(max_backup_meta_file_size_, &data, buf.get()); + s = backup_meta_reader->Read(max_backup_meta_file_size_, &data, buf.get()); if (!s.ok() || data.size() == max_backup_meta_file_size_) { return s.ok() ? 
Status::Corruption("File size too big") : s; diff --git a/utilities/backupable/backupable_db_test.cc b/utilities/backupable/backupable_db_test.cc index 985f1f472..89d1b6208 100644 --- a/utilities/backupable/backupable_db_test.cc +++ b/utilities/backupable/backupable_db_test.cc @@ -16,6 +16,7 @@ #include "rocksdb/types.h" #include "rocksdb/transaction_log.h" #include "rocksdb/utilities/backupable_db.h" +#include "util/file_reader_writer.h" #include "util/testharness.h" #include "util/random.h" #include "util/mutexlock.h" @@ -292,11 +293,12 @@ class FileManager : public EnvWrapper { if (!s.ok()) { return s; } - + RandomRWFileAccessor accessor(std::move(file)); for (uint64_t i = 0; s.ok() && i < bytes_to_corrupt; ++i) { std::string tmp; // write one random byte to a random position - s = file->Write(rnd_.Next() % size, test::RandomString(&rnd_, 1, &tmp)); + s = accessor.Write(rnd_.Next() % size, + test::RandomString(&rnd_, 1, &tmp)); } return s; }