From cadc1adffa447f02eb65bd848cf26c13142a74bb Mon Sep 17 00:00:00 2001 From: sdong Date: Fri, 13 Jun 2014 15:54:19 -0700 Subject: [PATCH] Refactor: group metadata needed to open an SST file to a separate copyable struct Summary: We added multiple fields to FileMetaData recently and are planning to add more. This refactoring separates the minimum information for accessing the file. This object is copyable (FileMetaData is not copyable since the ref counter). I hope this refactoring can enable further improvements: (1) use it to design a more efficient data structure to speed up read queries. (2) in the future, when we add information of storage level, we can easily do the encoding, instead of enlarging this structure, which might expand memory work set for file meta data. The definition is the same as current EncodedFileMetaData used in two level iterator, so now the logic in two level iterator is easier to understand. Test Plan: make all check Reviewers: haobo, igor, ljin Reviewed By: ljin Subscribers: leveldb, dhruba, yhchiang Differential Revision: https://reviews.facebook.net/D18933 --- db/builder.cc | 12 ++-- db/builder.h | 2 +- db/compaction.cc | 14 ++--- db/compaction_picker.cc | 88 +++++++++++++++-------------- db/db_impl.cc | 92 +++++++++++++++--------------- db/forward_iterator.cc | 4 +- db/repair.cc | 34 +++++------- db/table_cache.cc | 39 ++++++------- db/table_cache.h | 12 ++-- db/version_edit.cc | 33 ++++++----- db/version_edit.h | 35 ++++++++---- db/version_set.cc | 120 +++++++++++++++++----------------- db/version_set_test.cc | 2 +- 13 files changed, 242 insertions(+), 245 deletions(-) diff --git a/db/builder.cc b/db/builder.cc index ce85ae589..61890b5b6 100644 --- a/db/builder.cc +++ b/db/builder.cc @@ -42,7 +42,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, const SequenceNumber earliest_seqno_in_memtable, const CompressionType compression) { Status s; - meta->file_size = 0; + meta->fd.file_size = 0; 
meta->smallest_seqno = meta->largest_seqno = 0; iter->SeekToFirst(); @@ -54,7 +54,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, purge = false; } - std::string fname = TableFileName(dbname, meta->number); + std::string fname = TableFileName(dbname, meta->fd.GetNumber()); if (iter->Valid()) { unique_ptr file; s = env->NewWritableFile(fname, &file, soptions); @@ -177,8 +177,8 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, if (s.ok()) { s = builder->Finish(); if (s.ok()) { - meta->file_size = builder->FileSize(); - assert(meta->file_size > 0); + meta->fd.file_size = builder->FileSize(); + assert(meta->fd.GetFileSize() > 0); } } else { builder->Abandon(); @@ -202,7 +202,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, if (s.ok()) { // Verify that the table is usable Iterator* it = table_cache->NewIterator(ReadOptions(), soptions, - internal_comparator, *meta); + internal_comparator, meta->fd); s = it->status(); delete it; } @@ -213,7 +213,7 @@ Status BuildTable(const std::string& dbname, Env* env, const Options& options, s = iter->status(); } - if (s.ok() && meta->file_size > 0) { + if (s.ok() && meta->fd.GetFileSize() > 0) { // Keep it } else { env->DeleteFile(fname); diff --git a/db/builder.h b/db/builder.h index 630162968..68eb3fc6f 100644 --- a/db/builder.h +++ b/db/builder.h @@ -29,7 +29,7 @@ extern TableBuilder* NewTableBuilder( WritableFile* file, CompressionType compression_type); // Build a Table file from the contents of *iter. The generated file -// will be named according to meta->number. On success, the rest of +// will be named according to number specified in meta. On success, the rest of // *meta will be filled with metadata about the generated table. // If no data is present in *iter, meta->file_size will be set to // zero, and no Table file will be produced. 
diff --git a/db/compaction.cc b/db/compaction.cc index 0c758cc39..5d22d4484 100644 --- a/db/compaction.cc +++ b/db/compaction.cc @@ -21,7 +21,7 @@ namespace rocksdb { static uint64_t TotalFileSize(const std::vector& files) { uint64_t sum = 0; for (size_t i = 0; i < files.size() && files[i]; i++) { - sum += files[i]->file_size; + sum += files[i]->fd.GetFileSize(); } return sum; } @@ -90,7 +90,7 @@ bool Compaction::IsDeletionCompaction() const { return deletion_compaction_; } void Compaction::AddInputDeletions(VersionEdit* edit) { for (int which = 0; which < 2; which++) { for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); + edit->DeleteFile(level_ + which, inputs_[which][i]->fd.GetNumber()); } } } @@ -127,7 +127,7 @@ bool Compaction::ShouldStopBefore(const Slice& internal_key) { icmp->Compare(internal_key, grandparents_[grandparent_index_]->largest.Encode()) > 0) { if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + overlapped_bytes_ += grandparents_[grandparent_index_]->fd.GetFileSize(); } assert(grandparent_index_ + 1 >= grandparents_.size() || icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), @@ -212,9 +212,9 @@ int InputSummary(const std::vector& files, char* output, int sz = len - write; int ret; char sztxt[16]; - AppendHumanBytes(files.at(i)->file_size, sztxt, 16); - ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", files.at(i)->number, - sztxt); + AppendHumanBytes(files.at(i)->fd.GetFileSize(), sztxt, 16); + ret = snprintf(output + write, sz, "%" PRIu64 "(%s) ", + files.at(i)->fd.GetNumber(), sztxt); if (ret < 0 || ret >= sz) break; write += ret; } @@ -258,7 +258,7 @@ uint64_t Compaction::OutputFilePreallocationSize() { cfd_->compaction_picker()->MaxFileSizeForLevel(output_level()); } else { for (const auto& f : inputs_[0]) { - preallocation_size += f->file_size; + preallocation_size += f->fd.GetFileSize(); } } // Over-estimate slightly so 
we don't end up just barely crossing diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc index 3416a0bac..2ec42b49a 100644 --- a/db/compaction_picker.cc +++ b/db/compaction_picker.cc @@ -22,7 +22,7 @@ namespace { uint64_t TotalFileSize(const std::vector& files) { uint64_t sum = 0; for (size_t i = 0; i < files.size() && files[i]; i++) { - sum += files[i]->file_size; + sum += files[i]->fd.GetFileSize(); } return sum; } @@ -80,7 +80,7 @@ void CompactionPicker::SizeBeingCompacted(std::vector& sizes) { for (auto c : compactions_in_progress_[level]) { assert(c->level() == level); for (int i = 0; i < c->num_input_files(0); i++) { - total += c->input(0,i)->file_size; + total += c->input(0, i)->fd.GetFileSize(); } } sizes[level] = total; @@ -335,7 +335,7 @@ Compaction* CompactionPicker::CompactRange(Version* version, int input_level, MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; uint64_t total = 0; for (size_t i = 0; i + 1 < inputs.size(); ++i) { - uint64_t s = inputs[i]->file_size; + uint64_t s = inputs[i]->fd.GetFileSize(); total += s; if (total >= limit) { **compaction_end = inputs[i + 1]->smallest; @@ -508,10 +508,11 @@ Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, FileMetaData* f = c->input_version_->files_[level][index]; // check to verify files are arranged in descending size - assert((i == file_size.size() - 1) || - (i >= Version::number_of_files_to_sort_ - 1) || - (f->file_size >= - c->input_version_->files_[level][file_size[i + 1]]->file_size)); + assert( + (i == file_size.size() - 1) || + (i >= Version::number_of_files_to_sort_ - 1) || + (f->fd.GetFileSize() >= + c->input_version_->files_[level][file_size[i + 1]]->fd.GetFileSize())); // do not pick a file to compact if it is being compacted // from n-1 level. 
@@ -680,19 +681,21 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( candidate_count = 1; break; } - LogToBuffer( - log_buffer, "[%s] Universal: file %lu[%d] being compacted, skipping", - version->cfd_->GetName().c_str(), (unsigned long)f->number, loop); + LogToBuffer(log_buffer, + "[%s] Universal: file %lu[%d] being compacted, skipping", + version->cfd_->GetName().c_str(), + (unsigned long)f->fd.GetNumber(), loop); f = nullptr; } // This file is not being compacted. Consider it as the // first candidate to be compacted. - uint64_t candidate_size = f != nullptr? f->file_size : 0; + uint64_t candidate_size = f != nullptr ? f->fd.GetFileSize() : 0; if (f != nullptr) { - LogToBuffer( - log_buffer, "[%s] Universal: Possible candidate file %lu[%d].", - version->cfd_->GetName().c_str(), (unsigned long)f->number, loop); + LogToBuffer(log_buffer, + "[%s] Universal: Possible candidate file %lu[%d].", + version->cfd_->GetName().c_str(), + (unsigned long)f->fd.GetNumber(), loop); } // Check if the suceeding files need compaction. @@ -711,13 +714,13 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // kCompactionStopStyleSimilarSize, it's simply the size of the last // picked file. uint64_t sz = (candidate_size * (100L + ratio)) /100; - if (sz < f->file_size) { + if (sz < f->fd.GetFileSize()) { break; } if (options_->compaction_options_universal.stop_style == kCompactionStopStyleSimilarSize) { // Similar-size stopping rule: also check the last picked file isn't // far larger than the next candidate file. - sz = (f->file_size * (100L + ratio)) / 100; + sz = (f->fd.GetFileSize() * (100L + ratio)) / 100; if (sz < candidate_size) { // If the small file we've encountered begins a run of similar-size // files, we'll pick them up on a future iteration of the outer @@ -725,9 +728,9 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( // by the last-resort read amp strategy which disregards size ratios. 
break; } - candidate_size = f->file_size; + candidate_size = f->fd.GetFileSize(); } else { // default kCompactionStopStyleTotalSize - candidate_size += f->file_size; + candidate_size += f->fd.GetFileSize(); } candidate_count++; } @@ -744,8 +747,9 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( FileMetaData* f = version->files_[level][index]; LogToBuffer(log_buffer, "[%s] Universal: Skipping file %lu[%d] with size %lu %d\n", - version->cfd_->GetName().c_str(), (unsigned long)f->number, - i, (unsigned long)f->file_size, f->being_compacted); + version->cfd_->GetName().c_str(), + (unsigned long)f->fd.GetNumber(), i, + (unsigned long)f->fd.GetFileSize(), f->being_compacted); } } } @@ -763,7 +767,8 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( uint64_t older_file_size = 0; for (unsigned int i = file_by_time.size() - 1; i >= first_index_after; i--) { - older_file_size += version->files_[level][file_by_time[i]]->file_size; + older_file_size += + version->files_[level][file_by_time[i]]->fd.GetFileSize(); if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { enable_compression = false; break; @@ -779,10 +784,10 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( int index = file_by_time[i]; FileMetaData* f = c->input_version_->files_[level][index]; c->inputs_[0].push_back(f); - LogToBuffer(log_buffer, - "[%s] Universal: Picking file %lu[%d] with size %lu\n", - version->cfd_->GetName().c_str(), (unsigned long)f->number, i, - (unsigned long)f->file_size); + LogToBuffer( + log_buffer, "[%s] Universal: Picking file %lu[%d] with size %lu\n", + version->cfd_->GetName().c_str(), (unsigned long)f->fd.GetNumber(), i, + (unsigned long)f->fd.GetFileSize()); } return c; } @@ -818,10 +823,10 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( start_index = loop; // Consider this as the first candidate. 
break; } - LogToBuffer(log_buffer, - "[%s] Universal: skipping file %lu[%d] compacted %s", - version->cfd_->GetName().c_str(), (unsigned long)f->number, - loop, " cannot be a candidate to reduce size amp.\n"); + LogToBuffer( + log_buffer, "[%s] Universal: skipping file %lu[%d] compacted %s", + version->cfd_->GetName().c_str(), (unsigned long)f->fd.GetNumber(), + loop, " cannot be a candidate to reduce size amp.\n"); f = nullptr; } if (f == nullptr) { @@ -829,8 +834,9 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( } LogToBuffer(log_buffer, "[%s] Universal: First candidate file %lu[%d] %s", - version->cfd_->GetName().c_str(), (unsigned long)f->number, - start_index, " to reduce size amp.\n"); + version->cfd_->GetName().c_str(), + (unsigned long)f->fd.GetNumber(), start_index, + " to reduce size amp.\n"); // keep adding up all the remaining files for (unsigned int loop = start_index; loop < file_by_time.size() - 1; @@ -840,11 +846,12 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( if (f->being_compacted) { LogToBuffer( log_buffer, "[%s] Universal: Possible candidate file %lu[%d] %s.", - version->cfd_->GetName().c_str(), (unsigned long)f->number, loop, + version->cfd_->GetName().c_str(), (unsigned long)f->fd.GetNumber(), + loop, " is already being compacted. 
No size amp reduction possible.\n"); return nullptr; } - candidate_size += f->file_size; + candidate_size += f->fd.GetFileSize(); candidate_count++; } if (candidate_count == 0) { @@ -853,7 +860,7 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( // size of earliest file int index = file_by_time[file_by_time.size() - 1]; - uint64_t earliest_file_size = version->files_[level][index]->file_size; + uint64_t earliest_file_size = version->files_[level][index]->fd.GetFileSize(); // size amplification = percentage of additional size if (candidate_size * 100 < ratio * earliest_file_size) { @@ -885,8 +892,9 @@ Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( c->inputs_[0].push_back(f); LogToBuffer(log_buffer, "[%s] Universal: size amp picking file %lu[%d] with size %lu", - version->cfd_->GetName().c_str(), (unsigned long)f->number, - index, (unsigned long)f->file_size); + version->cfd_->GetName().c_str(), + (unsigned long)f->fd.GetNumber(), index, + (unsigned long)f->fd.GetFileSize()); } return c; } @@ -896,7 +904,7 @@ Compaction* FIFOCompactionPicker::PickCompaction(Version* version, assert(version->NumberLevels() == 1); uint64_t total_size = 0; for (const auto& file : version->files_[0]) { - total_size += file->file_size; + total_size += file->fd.GetFileSize(); } if (total_size <= options_->compaction_options_fifo.max_table_files_size || @@ -924,13 +932,13 @@ Compaction* FIFOCompactionPicker::PickCompaction(Version* version, for (auto ritr = version->files_[0].rbegin(); ritr != version->files_[0].rend(); ++ritr) { auto f = *ritr; - total_size -= f->file_size; + total_size -= f->fd.GetFileSize(); c->inputs_[0].push_back(f); char tmp_fsize[16]; - AppendHumanBytes(f->file_size, tmp_fsize, sizeof(tmp_fsize)); + AppendHumanBytes(f->fd.GetFileSize(), tmp_fsize, sizeof(tmp_fsize)); LogToBuffer(log_buffer, "[%s] FIFO compaction: picking file %" PRIu64 " with size %s for deletion", - version->cfd_->GetName().c_str(), f->number, 
tmp_fsize); + version->cfd_->GetName().c_str(), f->fd.GetNumber(), tmp_fsize); if (total_size <= options_->compaction_options_fifo.max_table_files_size) { break; } diff --git a/db/db_impl.cc b/db/db_impl.cc index ec109a582..d2a9d0da8 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -644,8 +644,7 @@ void DBImpl::PurgeObsoleteFiles(DeletionState& state) { const char* kDumbDbName = ""; for (auto file : state.sst_delete_files) { candidate_files.push_back( - TableFileName(kDumbDbName, file->number).substr(1) - ); + TableFileName(kDumbDbName, file->fd.GetNumber()).substr(1)); delete file; } @@ -1370,14 +1369,14 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; - meta.number = versions_->NewFileNumber(); - pending_outputs_.insert(meta.number); + meta.fd.number = versions_->NewFileNumber(); + pending_outputs_.insert(meta.fd.GetNumber()); Iterator* iter = mem->NewIterator(ReadOptions(), true); const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = mem->GetFirstSequenceNumber(); Log(options_.info_log, "[%s] Level-0 table #%lu: started", - cfd->GetName().c_str(), (unsigned long)meta.number); + cfd->GetName().c_str(), (unsigned long)meta.fd.GetNumber()); Status s; { @@ -1391,27 +1390,28 @@ Status DBImpl::WriteLevel0TableForRecovery(ColumnFamilyData* cfd, MemTable* mem, } Log(options_.info_log, "[%s] Level-0 table #%lu: %lu bytes %s", - cfd->GetName().c_str(), (unsigned long)meta.number, - (unsigned long)meta.file_size, s.ToString().c_str()); + cfd->GetName().c_str(), (unsigned long)meta.fd.GetNumber(), + (unsigned long)meta.fd.GetFileSize(), s.ToString().c_str()); delete iter; - pending_outputs_.erase(meta.number); + pending_outputs_.erase(meta.fd.GetNumber()); // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. 
int level = 0; - if (s.ok() && meta.file_size > 0) { - edit->AddFile(level, meta.number, meta.file_size, - meta.smallest, meta.largest, - meta.smallest_seqno, meta.largest_seqno); + if (s.ok() && meta.fd.GetFileSize() > 0) { + edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetFileSize(), + meta.smallest, meta.largest, meta.smallest_seqno, + meta.largest_seqno); } InternalStats::CompactionStats stats; stats.micros = env_->NowMicros() - start_micros; - stats.bytes_written = meta.file_size; + stats.bytes_written = meta.fd.GetFileSize(); stats.files_out_levelnp1 = 1; cfd->internal_stats()->AddCompactionStats(level, stats); - RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, + meta.fd.GetFileSize()); return s; } @@ -1421,9 +1421,9 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, mutex_.AssertHeld(); const uint64_t start_micros = env_->NowMicros(); FileMetaData meta; - meta.number = versions_->NewFileNumber(); - *filenumber = meta.number; - pending_outputs_.insert(meta.number); + meta.fd.number = versions_->NewFileNumber(); + *filenumber = meta.fd.GetNumber(); + pending_outputs_.insert(meta.fd.GetNumber()); const SequenceNumber newest_snapshot = snapshots_.GetNewest(); const SequenceNumber earliest_seqno_in_memtable = @@ -1443,7 +1443,7 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, Iterator* iter = NewMergingIterator(&cfd->internal_comparator(), &memtables[0], memtables.size()); Log(options_.info_log, "[%s] Level-0 flush table #%lu: started", - cfd->GetName().c_str(), (unsigned long)meta.number); + cfd->GetName().c_str(), (unsigned long)meta.fd.GetNumber()); s = BuildTable(dbname_, env_, *cfd->options(), storage_options_, cfd->table_cache(), iter, &meta, cfd->internal_comparator(), @@ -1452,8 +1452,8 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, LogFlush(options_.info_log); delete iter; Log(options_.info_log, "[%s] Level-0 flush table #%lu: %lu bytes 
%s", - cfd->GetName().c_str(), (unsigned long)meta.number, - (unsigned long)meta.file_size, s.ToString().c_str()); + cfd->GetName().c_str(), (unsigned long)meta.fd.GetNumber(), + (unsigned long)meta.fd.GetFileSize(), s.ToString().c_str()); if (!options_.disableDataSync) { db_directory_->Fsync(); @@ -1477,7 +1477,7 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, // Note that if file_size is zero, the file has been deleted and // should not be added to the manifest. int level = 0; - if (s.ok() && meta.file_size > 0) { + if (s.ok() && meta.fd.GetFileSize() > 0) { const Slice min_user_key = meta.smallest.user_key(); const Slice max_user_key = meta.largest.user_key(); // if we have more than 1 background thread, then we cannot @@ -1488,16 +1488,17 @@ Status DBImpl::WriteLevel0Table(ColumnFamilyData* cfd, cfd->options()->compaction_style == kCompactionStyleLevel) { level = base->PickLevelForMemTableOutput(min_user_key, max_user_key); } - edit->AddFile(level, meta.number, meta.file_size, - meta.smallest, meta.largest, - meta.smallest_seqno, meta.largest_seqno); + edit->AddFile(level, meta.fd.GetNumber(), meta.fd.GetFileSize(), + meta.smallest, meta.largest, meta.smallest_seqno, + meta.largest_seqno); } InternalStats::CompactionStats stats; stats.micros = env_->NowMicros() - start_micros; - stats.bytes_written = meta.file_size; + stats.bytes_written = meta.fd.GetFileSize(); cfd->internal_stats()->AddCompactionStats(level, stats); - RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, meta.file_size); + RecordTick(options_.statistics.get(), COMPACT_WRITE_BYTES, + meta.fd.GetFileSize()); return s; } @@ -1688,9 +1689,10 @@ Status DBImpl::ReFitLevel(ColumnFamilyData* cfd, int level, int target_level) { VersionEdit edit; edit.SetColumnFamily(cfd->GetID()); for (const auto& f : cfd->current()->files_[level]) { - edit.DeleteFile(level, f->number); - edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno); + 
edit.DeleteFile(level, f->fd.GetNumber()); + edit.AddFile(to_level, f->fd.GetNumber(), f->fd.GetFileSize(), + f->smallest, f->largest, f->smallest_seqno, + f->largest_seqno); } Log(options_.info_log, "[%s] Apply version edit:\n%s", cfd->GetName().c_str(), edit.DebugString().data()); @@ -2172,7 +2174,7 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, assert(c->column_family_data()->options()->compaction_style == kCompactionStyleFIFO); for (const auto& f : *c->inputs(0)) { - c->edit()->DeleteFile(c->level(), f->number); + c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); } status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, db_directory_.get()); @@ -2186,21 +2188,21 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // Move file to next level assert(c->num_input_files(0) == 1); FileMetaData* f = c->input(0, 0); - c->edit()->DeleteFile(c->level(), f->number); - c->edit()->AddFile(c->level() + 1, f->number, f->file_size, - f->smallest, f->largest, - f->smallest_seqno, f->largest_seqno); + c->edit()->DeleteFile(c->level(), f->fd.GetNumber()); + c->edit()->AddFile(c->level() + 1, f->fd.GetNumber(), f->fd.GetFileSize(), + f->smallest, f->largest, f->smallest_seqno, + f->largest_seqno); status = versions_->LogAndApply(c->column_family_data(), c->edit(), &mutex_, db_directory_.get()); InstallSuperVersion(c->column_family_data(), deletion_state); Version::LevelSummaryStorage tmp; - LogToBuffer(log_buffer, "[%s] Moved #%lld to level-%d %lld bytes %s: %s\n", - c->column_family_data()->GetName().c_str(), - static_cast(f->number), c->level() + 1, - static_cast(f->file_size), - status.ToString().c_str(), - c->input_version()->LevelSummary(&tmp)); + LogToBuffer( + log_buffer, "[%s] Moved #%lld to level-%d %lld bytes %s: %s\n", + c->column_family_data()->GetName().c_str(), + static_cast(f->fd.GetNumber()), c->level() + 1, + static_cast(f->fd.GetFileSize()), + status.ToString().c_str(), c->input_version()->LevelSummary(&tmp)); 
c->ReleaseCompactionFiles(status); *madeProgress = true; } else { @@ -2394,7 +2396,7 @@ Status DBImpl::FinishCompactionOutputFile(CompactionState* compact, if (s.ok() && current_entries > 0) { // Verify that the table is usable ColumnFamilyData* cfd = compact->compaction->column_family_data(); - FileMetaData meta(output_number, current_bytes); + FileDescriptor meta(output_number, current_bytes); Iterator* iter = cfd->table_cache()->NewIterator( ReadOptions(), storage_options_, cfd->internal_comparator(), meta); s = iter->status(); @@ -3094,15 +3096,15 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, stats.files_out_levelnp1 = num_output_files; for (int i = 0; i < compact->compaction->num_input_files(0); i++) { - stats.bytes_readn += compact->compaction->input(0, i)->file_size; + stats.bytes_readn += compact->compaction->input(0, i)->fd.GetFileSize(); RecordTick(options_.statistics.get(), COMPACT_READ_BYTES, - compact->compaction->input(0, i)->file_size); + compact->compaction->input(0, i)->fd.GetFileSize()); } for (int i = 0; i < compact->compaction->num_input_files(1); i++) { - stats.bytes_readnp1 += compact->compaction->input(1, i)->file_size; + stats.bytes_readnp1 += compact->compaction->input(1, i)->fd.GetFileSize(); RecordTick(options_.statistics.get(), COMPACT_READ_BYTES, - compact->compaction->input(1, i)->file_size); + compact->compaction->input(1, i)->fd.GetFileSize()); } for (int i = 0; i < num_output_files; i++) { diff --git a/db/forward_iterator.cc b/db/forward_iterator.cc index 8a8c347a5..4a8bbbbd9 100644 --- a/db/forward_iterator.cc +++ b/db/forward_iterator.cc @@ -39,7 +39,7 @@ class LevelIterator : public Iterator { file_index_ = file_index; file_iter_.reset(cfd_->table_cache()->NewIterator( read_options_, *(cfd_->soptions()), cfd_->internal_comparator(), - *(files_[file_index_]), nullptr /* table_reader_ptr */, false)); + files_[file_index_]->fd, nullptr /* table_reader_ptr */, false)); } valid_ = false; } @@ -293,7 +293,7 @@ void 
ForwardIterator::RebuildIterators() { l0_iters_.reserve(l0_files.size()); for (const auto* l0 : l0_files) { l0_iters_.push_back(cfd_->table_cache()->NewIterator( - read_options_, *cfd_->soptions(), cfd_->internal_comparator(), *l0)); + read_options_, *cfd_->soptions(), cfd_->internal_comparator(), l0->fd)); } level_iters_.reserve(sv_->current->NumberLevels() - 1); for (int32_t level = 1; level < sv_->current->NumberLevels(); ++level) { diff --git a/db/repair.cc b/db/repair.cc index 03571a829..fe21e67d6 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -84,7 +84,7 @@ class Repairer { if (status.ok()) { unsigned long long bytes = 0; for (size_t i = 0; i < tables_.size(); i++) { - bytes += tables_[i].meta.file_size; + bytes += tables_[i].meta.fd.GetFileSize(); } Log(options_.info_log, "**** Repaired rocksdb %s; " @@ -230,7 +230,7 @@ class Repairer { // Do not record a version edit for this conversion to a Table // since ExtractMetaData() will also generate edits. FileMetaData meta; - meta.number = next_file_number_++; + meta.fd.number = next_file_number_++; ReadOptions ro; Iterator* iter = mem->NewIterator(ro, true /* enforce_total_order */); status = BuildTable(dbname_, env_, options_, storage_options_, table_cache_, @@ -240,22 +240,20 @@ class Repairer { delete cf_mems_default; mem = nullptr; if (status.ok()) { - if (meta.file_size > 0) { - table_numbers_.push_back(meta.number); + if (meta.fd.GetFileSize() > 0) { + table_numbers_.push_back(meta.fd.GetNumber()); } } Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", - (unsigned long long) log, - counter, - (unsigned long long) meta.number, - status.ToString().c_str()); + (unsigned long long)log, counter, + (unsigned long long)meta.fd.GetNumber(), status.ToString().c_str()); return status; } void ExtractMetaData() { for (size_t i = 0; i < table_numbers_.size(); i++) { TableInfo t; - t.meta.number = table_numbers_[i]; + t.meta.fd.number = table_numbers_[i]; Status status = ScanTable(&t); if 
(!status.ok()) { std::string fname = TableFileName(dbname_, table_numbers_[i]); @@ -270,13 +268,12 @@ class Repairer { } Status ScanTable(TableInfo* t) { - std::string fname = TableFileName(dbname_, t->meta.number); + std::string fname = TableFileName(dbname_, t->meta.fd.GetNumber()); int counter = 0; - Status status = env_->GetFileSize(fname, &t->meta.file_size); + Status status = env_->GetFileSize(fname, &t->meta.fd.file_size); if (status.ok()) { - FileMetaData dummy_meta(t->meta.number, t->meta.file_size); Iterator* iter = table_cache_->NewIterator( - ReadOptions(), storage_options_, icmp_, dummy_meta); + ReadOptions(), storage_options_, icmp_, t->meta.fd); bool empty = true; ParsedInternalKey parsed; t->min_sequence = 0; @@ -285,7 +282,7 @@ class Repairer { Slice key = iter->key(); if (!ParseInternalKey(key, &parsed)) { Log(options_.info_log, "Table #%llu: unparsable key %s", - (unsigned long long) t->meta.number, + (unsigned long long)t->meta.fd.GetNumber(), EscapeString(key).c_str()); continue; } @@ -309,8 +306,7 @@ class Repairer { delete iter; } Log(options_.info_log, "Table #%llu: %d entries %s", - (unsigned long long) t->meta.number, - counter, + (unsigned long long)t->meta.fd.GetNumber(), counter, status.ToString().c_str()); return status; } @@ -339,9 +335,9 @@ class Repairer { for (size_t i = 0; i < tables_.size(); i++) { // TODO(opt): separate out into multiple levels const TableInfo& t = tables_[i]; - edit_->AddFile(0, t.meta.number, t.meta.file_size, - t.meta.smallest, t.meta.largest, - t.min_sequence, t.max_sequence); + edit_->AddFile(0, t.meta.fd.GetNumber(), t.meta.fd.GetFileSize(), + t.meta.smallest, t.meta.largest, t.min_sequence, + t.max_sequence); } //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); diff --git a/db/table_cache.cc b/db/table_cache.cc index f4757cbfe..9993e5a08 100644 --- a/db/table_cache.cc +++ b/db/table_cache.cc @@ -31,7 +31,7 @@ static void UnrefEntry(void* arg1, void* arg2) { cache->Release(h); } 
-static Slice GetSliceForFileNumber(uint64_t* file_number) { +static Slice GetSliceForFileNumber(const uint64_t* file_number) { return Slice(reinterpret_cast(file_number), sizeof(*file_number)); } @@ -57,11 +57,10 @@ void TableCache::ReleaseHandle(Cache::Handle* handle) { Status TableCache::FindTable(const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, - uint64_t file_number, uint64_t file_size, - Cache::Handle** handle, bool* table_io, - const bool no_io) { + const FileDescriptor& fd, Cache::Handle** handle, + bool* table_io, const bool no_io) { Status s; - Slice key = GetSliceForFileNumber(&file_number); + Slice key = GetSliceForFileNumber(&fd.number); *handle = cache_->Lookup(key); if (*handle == nullptr) { if (no_io) { // Dont do IO and return a not-found status @@ -70,7 +69,7 @@ Status TableCache::FindTable(const EnvOptions& toptions, if (table_io != nullptr) { *table_io = true; // we had to do IO from storage } - std::string fname = TableFileName(dbname_, file_number); + std::string fname = TableFileName(dbname_, fd.GetNumber()); unique_ptr file; unique_ptr table_reader; s = env_->NewRandomAccessFile(fname, &file, toptions); @@ -81,8 +80,8 @@ Status TableCache::FindTable(const EnvOptions& toptions, } StopWatch sw(env_, options_->statistics.get(), TABLE_OPEN_IO_MICROS); s = options_->table_factory->NewTableReader( - *options_, toptions, internal_comparator, std::move(file), file_size, - &table_reader); + *options_, toptions, internal_comparator, std::move(file), + fd.GetFileSize(), &table_reader); } if (!s.ok()) { @@ -101,18 +100,18 @@ Status TableCache::FindTable(const EnvOptions& toptions, Iterator* TableCache::NewIterator(const ReadOptions& options, const EnvOptions& toptions, const InternalKeyComparator& icomparator, - const FileMetaData& file_meta, + const FileDescriptor& fd, TableReader** table_reader_ptr, bool for_compaction, Arena* arena) { if (table_reader_ptr != nullptr) { *table_reader_ptr = nullptr; } - TableReader* 
table_reader = file_meta.table_reader; + TableReader* table_reader = fd.table_reader; Cache::Handle* handle = nullptr; Status s; if (table_reader == nullptr) { - s = FindTable(toptions, icomparator, file_meta.number, file_meta.file_size, - &handle, nullptr, options.read_tier == kBlockCacheTier); + s = FindTable(toptions, icomparator, fd, &handle, nullptr, + options.read_tier == kBlockCacheTier); if (!s.ok()) { return NewErrorIterator(s, arena); } @@ -136,16 +135,15 @@ Iterator* TableCache::NewIterator(const ReadOptions& options, Status TableCache::Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, void* arg, + const FileDescriptor& fd, const Slice& k, void* arg, bool (*saver)(void*, const ParsedInternalKey&, const Slice&, bool), bool* table_io, void (*mark_key_may_exist)(void*)) { - TableReader* t = file_meta.table_reader; + TableReader* t = fd.table_reader; Status s; Cache::Handle* handle = nullptr; if (!t) { - s = FindTable(storage_options_, internal_comparator, file_meta.number, - file_meta.file_size, &handle, table_io, + s = FindTable(storage_options_, internal_comparator, fd, &handle, table_io, options.read_tier == kBlockCacheTier); if (s.ok()) { t = GetTableReaderFromHandle(handle); @@ -165,11 +163,10 @@ Status TableCache::Get(const ReadOptions& options, } Status TableCache::GetTableProperties( const EnvOptions& toptions, - const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const InternalKeyComparator& internal_comparator, const FileDescriptor& fd, std::shared_ptr* properties, bool no_io) { Status s; - auto table_reader = file_meta.table_reader; + auto table_reader = fd.table_reader; // table already been pre-loaded? 
if (table_reader) { *properties = table_reader->GetTableProperties(); @@ -179,8 +176,8 @@ Status TableCache::GetTableProperties( bool table_io; Cache::Handle* table_handle = nullptr; - s = FindTable(toptions, internal_comparator, file_meta.number, - file_meta.file_size, &table_handle, &table_io, no_io); + s = FindTable(toptions, internal_comparator, fd, &table_handle, &table_io, + no_io); if (!s.ok()) { return s; } diff --git a/db/table_cache.h b/db/table_cache.h index 1aa61db01..4311a390d 100644 --- a/db/table_cache.h +++ b/db/table_cache.h @@ -24,10 +24,8 @@ namespace rocksdb { class Env; class Arena; -struct FileMetaData; +struct FileDescriptor; -// TODO(sdong): try to come up with a better API to pass the file information -// other than simply passing FileMetaData. class TableCache { public: TableCache(const std::string& dbname, const Options* options, @@ -43,7 +41,7 @@ class TableCache { // returned iterator is live. Iterator* NewIterator(const ReadOptions& options, const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileDescriptor& file_fd, TableReader** table_reader_ptr = nullptr, bool for_compaction = false, Arena* arena = nullptr); @@ -52,7 +50,7 @@ class TableCache { // it returns false. Status Get(const ReadOptions& options, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, const Slice& k, void* arg, + const FileDescriptor& file_fd, const Slice& k, void* arg, bool (*handle_result)(void*, const ParsedInternalKey&, const Slice&, bool), bool* table_io, void (*mark_key_may_exist)(void*) = nullptr); @@ -63,7 +61,7 @@ class TableCache { // Find table reader Status FindTable(const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, - uint64_t file_number, uint64_t file_size, Cache::Handle**, + const FileDescriptor& file_fd, Cache::Handle**, bool* table_io = nullptr, const bool no_io = false); // Get TableReader from a cache handle. 
@@ -77,7 +75,7 @@ class TableCache { // we set `no_io` to be true. Status GetTableProperties(const EnvOptions& toptions, const InternalKeyComparator& internal_comparator, - const FileMetaData& file_meta, + const FileDescriptor& file_meta, std::shared_ptr* properties, bool no_io = false); diff --git a/db/version_edit.cc b/db/version_edit.cc index 2ac35c58c..c2b4928e0 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -95,8 +95,8 @@ void VersionEdit::EncodeTo(std::string* dst) const { const FileMetaData& f = new_files_[i].second; PutVarint32(dst, kNewFile2); PutVarint32(dst, new_files_[i].first); // level - PutVarint64(dst, f.number); - PutVarint64(dst, f.file_size); + PutVarint64(dst, f.fd.GetNumber()); + PutVarint64(dst, f.fd.GetFileSize()); PutLengthPrefixedSlice(dst, f.smallest.Encode()); PutLengthPrefixedSlice(dst, f.largest.Encode()); PutVarint64(dst, f.smallest_seqno); @@ -230,12 +230,14 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } break; - case kNewFile: - if (GetLevel(&input, &level, &msg) && - GetVarint64(&input, &f.number) && - GetVarint64(&input, &f.file_size) && + case kNewFile: { + uint64_t number; + uint64_t file_size; + if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && + GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest)) { + f.fd = FileDescriptor(number, file_size); new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { @@ -243,15 +245,17 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } } break; - - case kNewFile2: - if (GetLevel(&input, &level, &msg) && - GetVarint64(&input, &f.number) && - GetVarint64(&input, &f.file_size) && + } + case kNewFile2: { + uint64_t number; + uint64_t file_size; + if (GetLevel(&input, &level, &msg) && GetVarint64(&input, &number) && + GetVarint64(&input, &file_size) && GetInternalKey(&input, &f.smallest) && GetInternalKey(&input, &f.largest) && GetVarint64(&input, &f.smallest_seqno) && - 
GetVarint64(&input, &f.largest_seqno) ) { + GetVarint64(&input, &f.largest_seqno)) { + f.fd = FileDescriptor(number, file_size); new_files_.push_back(std::make_pair(level, f)); } else { if (!msg) { @@ -259,6 +263,7 @@ Status VersionEdit::DecodeFrom(const Slice& src) { } } break; + } case kColumnFamily: if (!GetVarint32(&input, &column_family_)) { @@ -336,9 +341,9 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append("\n AddFile: "); AppendNumberTo(&r, new_files_[i].first); r.append(" "); - AppendNumberTo(&r, f.number); + AppendNumberTo(&r, f.fd.GetNumber()); r.append(" "); - AppendNumberTo(&r, f.file_size); + AppendNumberTo(&r, f.fd.GetFileSize()); r.append(" "); r.append(f.smallest.DebugString(hex_key)); r.append(" .. "); diff --git a/db/version_edit.h b/db/version_edit.h index acaec8a4f..7d4d28b03 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -19,11 +19,28 @@ namespace rocksdb { class VersionSet; +// A copyable structure containing information needed to read data from an SST +// file. It can contain a pointer to a table reader opened for the file, or +// file number and size, which can be used to create a new table reader for it. +// The behavior is undefined when a copy of the structure is used when the +// file is not in any live version any more.
+struct FileDescriptor { + uint64_t number; + uint64_t file_size; // File size in bytes + // Table reader in table_reader_handle + TableReader* table_reader; + + FileDescriptor(uint64_t number, uint64_t file_size) + : number(number), file_size(file_size), table_reader(nullptr) {} + + uint64_t GetNumber() const { return number; } + uint64_t GetFileSize() const { return file_size; } +}; + struct FileMetaData { int refs; + FileDescriptor fd; int allowed_seeks; // Seeks allowed until compaction - uint64_t number; - uint64_t file_size; // File size in bytes InternalKey smallest; // Smallest internal key served by table InternalKey largest; // Largest internal key served by table bool being_compacted; // Is this file undergoing compaction? @@ -32,18 +49,13 @@ struct FileMetaData { // Needs to be disposed when refs becomes 0. Cache::Handle* table_reader_handle; - // Table reader in table_reader_handle - TableReader* table_reader; - FileMetaData(uint64_t number, uint64_t file_size) + FileMetaData() : refs(0), + fd(0, 0), allowed_seeks(1 << 30), - number(number), - file_size(file_size), being_compacted(false), - table_reader_handle(nullptr), - table_reader(nullptr) {} - FileMetaData() : FileMetaData(0, 0) {} + table_reader_handle(nullptr) {} }; class VersionEdit { @@ -89,8 +101,7 @@ class VersionEdit { const SequenceNumber& largest_seqno) { assert(smallest_seqno <= largest_seqno); FileMetaData f; - f.number = file; - f.file_size = file_size; + f.fd = FileDescriptor(file, file_size); f.smallest = smallest; f.largest = largest; f.smallest_seqno = smallest_seqno; diff --git a/db/version_set.cc b/db/version_set.cc index b9243196c..a455fa16b 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -42,7 +42,7 @@ namespace rocksdb { static uint64_t TotalFileSize(const std::vector& files) { uint64_t sum = 0; for (size_t i = 0; i < files.size() && files[i]; i++) { - sum += files[i]->file_size; + sum += files[i]->fd.GetFileSize(); } return sum; } @@ -150,18 +150,6 @@ bool 
SomeFileOverlapsRange( return !BeforeFile(ucmp, largest_user_key, files[index]); } -namespace { -// Used for LevelFileNumIterator to pass "block handle" value, -// which actually means file information in this iterator. -// It contains subset of fields of FileMetaData, that is sufficient -// for table cache to use. -struct EncodedFileMetaData { - uint64_t number; // file number - uint64_t file_size; // file size - TableReader* table_reader; // cached table reader -}; -} // namespace - // An internal iterator. For a given version/level pair, yields // information about the files in the level. For a given entry, key() // is the largest key that occurs in the file, and value() is an @@ -173,7 +161,8 @@ class Version::LevelFileNumIterator : public Iterator { const std::vector* flist) : icmp_(icmp), flist_(flist), - index_(flist->size()) { // Marks as invalid + index_(flist->size()), + current_value_(0, 0) { // Marks as invalid } virtual bool Valid() const { return index_ < flist_->size(); @@ -204,18 +193,16 @@ class Version::LevelFileNumIterator : public Iterator { Slice value() const { assert(Valid()); auto* file_meta = (*flist_)[index_]; - current_value_.number = file_meta->number; - current_value_.file_size = file_meta->file_size; - current_value_.table_reader = file_meta->table_reader; + current_value_ = file_meta->fd; return Slice(reinterpret_cast(¤t_value_), - sizeof(EncodedFileMetaData)); + sizeof(FileDescriptor)); } virtual Status status() const { return Status::OK(); } private: const InternalKeyComparator icmp_; const std::vector* const flist_; uint32_t index_; - mutable EncodedFileMetaData current_value_; + mutable FileDescriptor current_value_; }; class Version::LevelFileIteratorState : public TwoLevelIteratorState { @@ -230,17 +217,15 @@ class Version::LevelFileIteratorState : public TwoLevelIteratorState { for_compaction_(for_compaction) {} Iterator* NewSecondaryIterator(const Slice& meta_handle) override { - if (meta_handle.size() != 
sizeof(EncodedFileMetaData)) { + if (meta_handle.size() != sizeof(FileDescriptor)) { return NewErrorIterator( Status::Corruption("FileReader invoked with unexpected value")); } else { - const EncodedFileMetaData* encoded_meta = - reinterpret_cast(meta_handle.data()); - FileMetaData meta(encoded_meta->number, encoded_meta->file_size); - meta.table_reader = encoded_meta->table_reader; - return table_cache_->NewIterator(read_options_, env_options_, - icomparator_, meta, nullptr /* don't need reference to table*/, - for_compaction_); + const FileDescriptor* fd = + reinterpret_cast(meta_handle.data()); + return table_cache_->NewIterator( + read_options_, env_options_, icomparator_, *fd, + nullptr /* don't need reference to table*/, for_compaction_); } } @@ -261,12 +246,12 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { auto options = cfd_->options(); for (int level = 0; level < num_levels_; level++) { for (const auto& file_meta : files_[level]) { - auto fname = TableFileName(vset_->dbname_, file_meta->number); + auto fname = TableFileName(vset_->dbname_, file_meta->fd.GetNumber()); // 1. If the table is already present in table cache, load table // properties from there. std::shared_ptr table_properties; Status s = table_cache->GetTableProperties( - vset_->storage_options_, cfd_->internal_comparator(), *file_meta, + vset_->storage_options_, cfd_->internal_comparator(), file_meta->fd, &table_properties, true /* no io */); if (s.ok()) { props->insert({fname, table_properties}); @@ -292,7 +277,7 @@ Status Version::GetPropertiesOfAllTables(TablePropertiesCollection* props) { // By setting the magic number to kInvalidTableMagicNumber, we can by // pass the magic number check in the footer. 
s = ReadTableProperties( - file.get(), file_meta->file_size, + file.get(), file_meta->fd.GetFileSize(), Footer::kInvalidTableMagicNumber /* table's magic number */, vset_->env_, options->info_log.get(), &raw_table_properties); if (!s.ok()) { @@ -315,7 +300,7 @@ void Version::AddIterators(const ReadOptions& read_options, // Merge all level zero files together since they may overlap for (const FileMetaData* file : files_[0]) { iters->push_back(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), *file)); + read_options, soptions, cfd_->internal_comparator(), file->fd)); } // For levels > 0, we can use a concatenating iterator that sequentially @@ -338,7 +323,7 @@ void Version::AddIterators(const ReadOptions& read_options, // Merge all level zero files together since they may overlap for (const FileMetaData* file : files_[0]) { merge_iter_builder->AddIterator(cfd_->table_cache()->NewIterator( - read_options, soptions, cfd_->internal_comparator(), *file, nullptr, + read_options, soptions, cfd_->internal_comparator(), file->fd, nullptr, false, merge_iter_builder->GetArena())); } @@ -461,7 +446,7 @@ static bool SaveValue(void* arg, const ParsedInternalKey& parsed_key, namespace { bool NewestFirst(FileMetaData* a, FileMetaData* b) { - return a->number > b->number; + return a->fd.GetNumber() > b->fd.GetNumber(); } bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { if (a->smallest_seqno != b->smallest_seqno) { @@ -480,7 +465,7 @@ bool BySmallestKey(FileMetaData* a, FileMetaData* b, return (r < 0); } // Break ties by file number - return (a->number < b->number); + return (a->fd.GetNumber() < b->fd.GetNumber()); } } // anonymous namespace @@ -571,7 +556,7 @@ void Version::Get(const ReadOptions& options, // Prefetch table data to avoid cache miss if possible if (level == 0) { for (int i = 0; i < num_files; ++i) { - auto* r = files_[0][i]->table_reader; + auto* r = files_[0][i]->fd.table_reader; if (r) { r->Prepare(ikey); } @@ -680,7 
+665,7 @@ void Version::Get(const ReadOptions& options, prev_file = f; #endif bool tableIO = false; - *status = table_cache_->Get(options, *internal_comparator_, *f, ikey, + *status = table_cache_->Get(options, *internal_comparator_, f->fd, ikey, &saver, SaveValue, &tableIO, MarkKeyMayExist); // TODO: examine the behavior for corrupted key if (!status->ok()) { @@ -793,7 +778,7 @@ void Version::ComputeCompactionScore( uint64_t total_size = 0; for (unsigned int i = 0; i < files_[level].size(); i++) { if (!files_[level][i]->being_compacted) { - total_size += files_[level][i]->file_size; + total_size += files_[level][i]->fd.GetFileSize(); numfiles++; } } @@ -850,7 +835,7 @@ namespace { // In normal mode: descending size bool CompareSizeDescending(const Version::Fsize& first, const Version::Fsize& second) { - return (first.file->file_size > second.file->file_size); + return (first.file->fd.GetFileSize() > second.file->fd.GetFileSize()); } // A static compator used to sort files based on their seqno // In universal style : descending seqno @@ -1245,10 +1230,10 @@ const char* Version::LevelFileSummary(FileSummaryStorage* scratch, for (const auto& f : files_[level]) { int sz = sizeof(scratch->buffer) - len; char sztxt[16]; - AppendHumanBytes(f->file_size, sztxt, 16); + AppendHumanBytes(f->fd.GetFileSize(), sztxt, 16); int ret = snprintf(scratch->buffer + len, sz, - "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ", f->number, - f->smallest_seqno, sztxt, + "#%" PRIu64 "(seq=%" PRIu64 ",sz=%s,%d) ", + f->fd.GetNumber(), f->smallest_seqno, sztxt, static_cast(f->being_compacted)); if (ret < 0 || ret >= sz) break; @@ -1281,7 +1266,7 @@ void Version::AddLiveFiles(std::set* live) { for (int level = 0; level < NumberLevels(); level++) { const std::vector& files = files_[level]; for (const auto& file : files) { - live->insert(file->number); + live->insert(file->fd.GetNumber()); } } } @@ -1301,9 +1286,9 @@ std::string Version::DebugString(bool hex) const { const std::vector& files = 
files_[level]; for (size_t i = 0; i < files.size(); i++) { r.push_back(' '); - AppendNumberTo(&r, files[i]->number); + AppendNumberTo(&r, files[i]->fd.GetNumber()); r.push_back(':'); - AppendNumberTo(&r, files[i]->file_size); + AppendNumberTo(&r, files[i]->fd.GetFileSize()); r.append("["); r.append(files[i]->smallest.DebugString(hex)); r.append(" .. "); @@ -1452,7 +1437,7 @@ class VersionSet::Builder { const std::vector& base_files = base_->files_[l]; for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; - if (f->number == number) { + if (f->fd.GetNumber() == number) { found = true; break; } @@ -1466,7 +1451,7 @@ class VersionSet::Builder { for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { FileMetaData* f = *added_iter; - if (f->number == number) { + if (f->fd.GetNumber() == number) { found = true; break; } @@ -1479,7 +1464,7 @@ class VersionSet::Builder { for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { FileMetaData* f = *added_iter; - if (f->number == number) { + if (f->fd.GetNumber() == number) { found = true; break; } @@ -1521,10 +1506,10 @@ class VersionSet::Builder { // same as the compaction of 40KB of data. We are a little // conservative and allow approximately one seek for every 16KB // of data before triggering a compaction. 
- f->allowed_seeks = (f->file_size / 16384); + f->allowed_seeks = (f->fd.GetFileSize() / 16384); if (f->allowed_seeks < 100) f->allowed_seeks = 100; - levels_[level].deleted_files.erase(f->number); + levels_[level].deleted_files.erase(f->fd.GetNumber()); levels_[level].added_files->insert(f); } } @@ -1573,11 +1558,10 @@ class VersionSet::Builder { bool table_io; cfd_->table_cache()->FindTable( base_->vset_->storage_options_, cfd_->internal_comparator(), - file_meta->number, file_meta->file_size, - &file_meta->table_reader_handle, &table_io, false); + file_meta->fd, &file_meta->table_reader_handle, &table_io, false); if (file_meta->table_reader_handle != nullptr) { // Load table_reader - file_meta->table_reader = + file_meta->fd.table_reader = cfd_->table_cache()->GetTableReaderFromHandle( file_meta->table_reader_handle); } @@ -1586,7 +1570,7 @@ class VersionSet::Builder { } void MaybeAddFile(Version* v, int level, FileMetaData* f) { - if (levels_[level].deleted_files.count(f->number) > 0) { + if (levels_[level].deleted_files.count(f->fd.GetNumber()) > 0) { // File is deleted: do nothing } else { auto* files = &v->files_[level]; @@ -2592,12 +2576,8 @@ Status VersionSet::WriteSnapshot(log::Writer* log) { for (int level = 0; level < cfd->NumberLevels(); level++) { for (const auto& f : cfd->current()->files_[level]) { - edit.AddFile(level, - f->number, - f->file_size, - f->smallest, - f->largest, - f->smallest_seqno, + edit.AddFile(level, f->fd.GetNumber(), f->fd.GetFileSize(), + f->smallest, f->largest, f->smallest_seqno, f->largest_seqno); } } @@ -2653,7 +2633,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { if (v->cfd_->internal_comparator().Compare(files[i]->largest, ikey) <= 0) { // Entire file is before "ikey", so just add the file size - result += files[i]->file_size; + result += files[i]->fd.GetFileSize(); } else if (v->cfd_->internal_comparator().Compare(files[i]->smallest, ikey) > 0) { // Entire file is after "ikey", so 
ignore @@ -2669,7 +2649,7 @@ uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { TableReader* table_reader_ptr; Iterator* iter = v->cfd_->table_cache()->NewIterator( ReadOptions(), storage_options_, v->cfd_->internal_comparator(), - *(files[i]), &table_reader_ptr); + files[i]->fd, &table_reader_ptr); if (table_reader_ptr != nullptr) { result += table_reader_ptr->ApproximateOffsetOf(ikey.Encode()); } @@ -2702,7 +2682,7 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { v = v->next_) { for (int level = 0; level < v->NumberLevels(); level++) { for (const auto& f : v->files_[level]) { - live_list->push_back(f->number); + live_list->push_back(f->fd.GetNumber()); } } } @@ -2728,7 +2708,7 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { for (const auto& file : *c->inputs(which)) { list[num++] = cfd->table_cache()->NewIterator( read_options, storage_options_compactions_, - cfd->internal_comparator(), *file, nullptr, + cfd->internal_comparator(), file->fd, nullptr, true /* for compaction */); } } else { @@ -2763,13 +2743,13 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { // verify files in level int level = c->level(); for (int i = 0; i < c->num_input_files(0); i++) { - uint64_t number = c->input(0,i)->number; + uint64_t number = c->input(0, i)->fd.GetNumber(); // look for this file in the current version bool found = false; for (unsigned int j = 0; j < version->files_[level].size(); j++) { FileMetaData* f = version->files_[level][j]; - if (f->number == number) { + if (f->fd.GetNumber() == number) { found = true; break; } @@ -2781,13 +2761,13 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { // verify level+1 files level++; for (int i = 0; i < c->num_input_files(1); i++) { - uint64_t number = c->input(1,i)->number; + uint64_t number = c->input(1, i)->fd.GetNumber(); // look for this file in the current version bool found = false; for (unsigned int j = 0; j < version->files_[level].size(); 
j++) { FileMetaData* f = version->files_[level][j]; - if (f->number == number) { + if (f->fd.GetNumber() == number) { found = true; break; } @@ -2807,7 +2787,7 @@ Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, Version* version = cfd_iter->current(); for (int level = 0; level < version->NumberLevels(); level++) { for (const auto& file : version->files_[level]) { - if (file->number == number) { + if (file->fd.GetNumber() == number) { *meta = file; *filelevel = level; *cfd = cfd_iter; @@ -2825,9 +2805,9 @@ void VersionSet::GetLiveFilesMetaData(std::vector* metadata) { for (const auto& file : cfd->current()->files_[level]) { LiveFileMetaData filemetadata; filemetadata.column_family_name = cfd->GetName(); - filemetadata.name = TableFileName("", file->number); + filemetadata.name = TableFileName("", file->fd.GetNumber()); filemetadata.level = level; - filemetadata.size = file->file_size; + filemetadata.size = file->fd.GetFileSize(); filemetadata.smallestkey = file->smallest.user_key().ToString(); filemetadata.largestkey = file->largest.user_key().ToString(); filemetadata.smallest_seqno = file->smallest_seqno; diff --git a/db/version_set_test.cc b/db/version_set_test.cc index 1af95dd3f..ef48bf927 100644 --- a/db/version_set_test.cc +++ b/db/version_set_test.cc @@ -31,7 +31,7 @@ class FindFileTest { SequenceNumber smallest_seq = 100, SequenceNumber largest_seq = 100) { FileMetaData* f = new FileMetaData; - f->number = files_.size() + 1; + f->fd = FileDescriptor(files_.size() + 1, 0); f->smallest = InternalKey(smallest, smallest_seq, kTypeValue); f->largest = InternalKey(largest, largest_seq, kTypeValue); files_.push_back(f);