fork of https://github.com/oxigraph/rocksdb and https://github.com/facebook/rocksdb for nextgraph and oxigraph
				
			
			
		
			You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							1480 lines
						
					
					
						
							56 KiB
						
					
					
				
			
		
		
	
	
							1480 lines
						
					
					
						
							56 KiB
						
					
					
				| //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
 | |
| //  This source code is licensed under both the GPLv2 (found in the
 | |
| //  COPYING file in the root directory) and Apache 2.0 License
 | |
| //  (found in the LICENSE.Apache file in the root directory).
 | |
| //
 | |
| // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
 | |
| // Use of this source code is governed by a BSD-style license that can be
 | |
| // found in the LICENSE file. See the AUTHORS file for names of contributors.
 | |
| 
 | |
| #include "db/compaction_job.h"
 | |
| 
 | |
| #ifndef __STDC_FORMAT_MACROS
 | |
| #define __STDC_FORMAT_MACROS
 | |
| #endif
 | |
| 
 | |
| #include <inttypes.h>
 | |
| #include <algorithm>
 | |
| #include <functional>
 | |
| #include <list>
 | |
| #include <memory>
 | |
| #include <random>
 | |
| #include <set>
 | |
| #include <thread>
 | |
| #include <utility>
 | |
| #include <vector>
 | |
| 
 | |
| #include "db/builder.h"
 | |
| #include "db/db_iter.h"
 | |
| #include "db/dbformat.h"
 | |
| #include "db/event_helpers.h"
 | |
| #include "db/log_reader.h"
 | |
| #include "db/log_writer.h"
 | |
| #include "db/memtable.h"
 | |
| #include "db/memtable_list.h"
 | |
| #include "db/merge_context.h"
 | |
| #include "db/merge_helper.h"
 | |
| #include "db/version_set.h"
 | |
| #include "monitoring/iostats_context_imp.h"
 | |
| #include "monitoring/perf_context_imp.h"
 | |
| #include "monitoring/thread_status_util.h"
 | |
| #include "port/likely.h"
 | |
| #include "port/port.h"
 | |
| #include "rocksdb/db.h"
 | |
| #include "rocksdb/env.h"
 | |
| #include "rocksdb/statistics.h"
 | |
| #include "rocksdb/status.h"
 | |
| #include "rocksdb/table.h"
 | |
| #include "table/block.h"
 | |
| #include "table/block_based_table_factory.h"
 | |
| #include "table/merging_iterator.h"
 | |
| #include "table/table_builder.h"
 | |
| #include "util/coding.h"
 | |
| #include "util/file_reader_writer.h"
 | |
| #include "util/filename.h"
 | |
| #include "util/log_buffer.h"
 | |
| #include "util/logging.h"
 | |
| #include "util/mutexlock.h"
 | |
| #include "util/random.h"
 | |
| #include "util/sst_file_manager_impl.h"
 | |
| #include "util/stop_watch.h"
 | |
| #include "util/string_util.h"
 | |
| #include "util/sync_point.h"
 | |
| 
 | |
| namespace rocksdb {
 | |
| 
 | |
| // Maintains state for each sub-compaction
 | |
| struct CompactionJob::SubcompactionState {
 | |
|   const Compaction* compaction;
 | |
|   std::unique_ptr<CompactionIterator> c_iter;
 | |
| 
 | |
|   // The boundaries of the key-range this compaction is interested in. No two
 | |
|   // subcompactions may have overlapping key-ranges.
 | |
|   // 'start' is inclusive, 'end' is exclusive, and nullptr means unbounded
 | |
|   Slice *start, *end;
 | |
| 
 | |
|   // The return status of this subcompaction
 | |
|   Status status;
 | |
| 
 | |
|   // Files produced by this subcompaction
 | |
|   struct Output {
 | |
|     FileMetaData meta;
 | |
|     bool finished;
 | |
|     std::shared_ptr<const TableProperties> table_properties;
 | |
|   };
 | |
| 
 | |
|   // State kept for output being generated
 | |
|   std::vector<Output> outputs;
 | |
|   std::unique_ptr<WritableFileWriter> outfile;
 | |
|   std::unique_ptr<TableBuilder> builder;
 | |
|   Output* current_output() {
 | |
|     if (outputs.empty()) {
 | |
|       // This subcompaction's outptut could be empty if compaction was aborted
 | |
|       // before this subcompaction had a chance to generate any output files.
 | |
|       // When subcompactions are executed sequentially this is more likely and
 | |
|       // will be particulalry likely for the later subcompactions to be empty.
 | |
|       // Once they are run in parallel however it should be much rarer.
 | |
|       return nullptr;
 | |
|     } else {
 | |
|       return &outputs.back();
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   uint64_t current_output_file_size;
 | |
| 
 | |
|   // State during the subcompaction
 | |
|   uint64_t total_bytes;
 | |
|   uint64_t num_input_records;
 | |
|   uint64_t num_output_records;
 | |
|   CompactionJobStats compaction_job_stats;
 | |
|   uint64_t approx_size;
 | |
|   // An index that used to speed up ShouldStopBefore().
 | |
|   size_t grandparent_index = 0;
 | |
|   // The number of bytes overlapping between the current output and
 | |
|   // grandparent files used in ShouldStopBefore().
 | |
|   uint64_t overlapped_bytes = 0;
 | |
|   // A flag determine whether the key has been seen in ShouldStopBefore()
 | |
|   bool seen_key = false;
 | |
|   std::string compression_dict;
 | |
| 
 | |
|   SubcompactionState(Compaction* c, Slice* _start, Slice* _end,
 | |
|                      uint64_t size = 0)
 | |
|       : compaction(c),
 | |
|         start(_start),
 | |
|         end(_end),
 | |
|         outfile(nullptr),
 | |
|         builder(nullptr),
 | |
|         current_output_file_size(0),
 | |
|         total_bytes(0),
 | |
|         num_input_records(0),
 | |
|         num_output_records(0),
 | |
|         approx_size(size),
 | |
|         grandparent_index(0),
 | |
|         overlapped_bytes(0),
 | |
|         seen_key(false),
 | |
|         compression_dict() {
 | |
|     assert(compaction != nullptr);
 | |
|   }
 | |
| 
 | |
|   SubcompactionState(SubcompactionState&& o) { *this = std::move(o); }
 | |
| 
 | |
|   SubcompactionState& operator=(SubcompactionState&& o) {
 | |
|     compaction = std::move(o.compaction);
 | |
|     start = std::move(o.start);
 | |
|     end = std::move(o.end);
 | |
|     status = std::move(o.status);
 | |
|     outputs = std::move(o.outputs);
 | |
|     outfile = std::move(o.outfile);
 | |
|     builder = std::move(o.builder);
 | |
|     current_output_file_size = std::move(o.current_output_file_size);
 | |
|     total_bytes = std::move(o.total_bytes);
 | |
|     num_input_records = std::move(o.num_input_records);
 | |
|     num_output_records = std::move(o.num_output_records);
 | |
|     compaction_job_stats = std::move(o.compaction_job_stats);
 | |
|     approx_size = std::move(o.approx_size);
 | |
|     grandparent_index = std::move(o.grandparent_index);
 | |
|     overlapped_bytes = std::move(o.overlapped_bytes);
 | |
|     seen_key = std::move(o.seen_key);
 | |
|     compression_dict = std::move(o.compression_dict);
 | |
|     return *this;
 | |
|   }
 | |
| 
 | |
|   // Because member unique_ptrs do not have these.
 | |
|   SubcompactionState(const SubcompactionState&) = delete;
 | |
| 
 | |
|   SubcompactionState& operator=(const SubcompactionState&) = delete;
 | |
| 
 | |
|   // Returns true iff we should stop building the current output
 | |
|   // before processing "internal_key".
 | |
|   bool ShouldStopBefore(const Slice& internal_key, uint64_t curr_file_size) {
 | |
|     const InternalKeyComparator* icmp =
 | |
|         &compaction->column_family_data()->internal_comparator();
 | |
|     const std::vector<FileMetaData*>& grandparents = compaction->grandparents();
 | |
| 
 | |
|     // Scan to find earliest grandparent file that contains key.
 | |
|     while (grandparent_index < grandparents.size() &&
 | |
|            icmp->Compare(internal_key,
 | |
|                          grandparents[grandparent_index]->largest.Encode()) >
 | |
|                0) {
 | |
|       if (seen_key) {
 | |
|         overlapped_bytes += grandparents[grandparent_index]->fd.GetFileSize();
 | |
|       }
 | |
|       assert(grandparent_index + 1 >= grandparents.size() ||
 | |
|              icmp->Compare(
 | |
|                  grandparents[grandparent_index]->largest.Encode(),
 | |
|                  grandparents[grandparent_index + 1]->smallest.Encode()) <= 0);
 | |
|       grandparent_index++;
 | |
|     }
 | |
|     seen_key = true;
 | |
| 
 | |
|     if (overlapped_bytes + curr_file_size >
 | |
|         compaction->max_compaction_bytes()) {
 | |
|       // Too much overlap for current output; start new output
 | |
|       overlapped_bytes = 0;
 | |
|       return true;
 | |
|     }
 | |
| 
 | |
|     return false;
 | |
|   }
 | |
| };
 | |
| 
 | |
| // Maintains state for the entire compaction
 | |
| struct CompactionJob::CompactionState {
 | |
|   Compaction* const compaction;
 | |
| 
 | |
|   // REQUIRED: subcompaction states are stored in order of increasing
 | |
|   // key-range
 | |
|   std::vector<CompactionJob::SubcompactionState> sub_compact_states;
 | |
|   Status status;
 | |
| 
 | |
|   uint64_t total_bytes;
 | |
|   uint64_t num_input_records;
 | |
|   uint64_t num_output_records;
 | |
| 
 | |
|   explicit CompactionState(Compaction* c)
 | |
|       : compaction(c),
 | |
|         total_bytes(0),
 | |
|         num_input_records(0),
 | |
|         num_output_records(0) {}
 | |
| 
 | |
|   size_t NumOutputFiles() {
 | |
|     size_t total = 0;
 | |
|     for (auto& s : sub_compact_states) {
 | |
|       total += s.outputs.size();
 | |
|     }
 | |
|     return total;
 | |
|   }
 | |
| 
 | |
|   Slice SmallestUserKey() {
 | |
|     for (const auto& sub_compact_state : sub_compact_states) {
 | |
|       if (!sub_compact_state.outputs.empty() &&
 | |
|           sub_compact_state.outputs[0].finished) {
 | |
|         return sub_compact_state.outputs[0].meta.smallest.user_key();
 | |
|       }
 | |
|     }
 | |
|     // If there is no finished output, return an empty slice.
 | |
|     return Slice(nullptr, 0);
 | |
|   }
 | |
| 
 | |
|   Slice LargestUserKey() {
 | |
|     for (auto it = sub_compact_states.rbegin(); it < sub_compact_states.rend();
 | |
|          ++it) {
 | |
|       if (!it->outputs.empty() && it->current_output()->finished) {
 | |
|         assert(it->current_output() != nullptr);
 | |
|         return it->current_output()->meta.largest.user_key();
 | |
|       }
 | |
|     }
 | |
|     // If there is no finished output, return an empty slice.
 | |
|     return Slice(nullptr, 0);
 | |
|   }
 | |
| };
 | |
| 
 | |
| void CompactionJob::AggregateStatistics() {
 | |
|   for (SubcompactionState& sc : compact_->sub_compact_states) {
 | |
|     compact_->total_bytes += sc.total_bytes;
 | |
|     compact_->num_input_records += sc.num_input_records;
 | |
|     compact_->num_output_records += sc.num_output_records;
 | |
|   }
 | |
|   if (compaction_job_stats_) {
 | |
|     for (SubcompactionState& sc : compact_->sub_compact_states) {
 | |
|       compaction_job_stats_->Add(sc.compaction_job_stats);
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| CompactionJob::CompactionJob(
 | |
|     int job_id, Compaction* compaction, const ImmutableDBOptions& db_options,
 | |
|     const EnvOptions& env_options, VersionSet* versions,
 | |
|     const std::atomic<bool>* shutting_down, LogBuffer* log_buffer,
 | |
|     Directory* db_directory, Directory* output_directory, Statistics* stats,
 | |
|     InstrumentedMutex* db_mutex, Status* db_bg_error,
 | |
|     std::vector<SequenceNumber> existing_snapshots,
 | |
|     SequenceNumber earliest_write_conflict_snapshot,
 | |
|     std::shared_ptr<Cache> table_cache, EventLogger* event_logger,
 | |
|     bool paranoid_file_checks, bool measure_io_stats, const std::string& dbname,
 | |
|     CompactionJobStats* compaction_job_stats)
 | |
|     : job_id_(job_id),
 | |
|       compact_(new CompactionState(compaction)),
 | |
|       compaction_job_stats_(compaction_job_stats),
 | |
|       compaction_stats_(1),
 | |
|       dbname_(dbname),
 | |
|       db_options_(db_options),
 | |
|       env_options_(env_options),
 | |
|       env_(db_options.env),
 | |
|       versions_(versions),
 | |
|       shutting_down_(shutting_down),
 | |
|       log_buffer_(log_buffer),
 | |
|       db_directory_(db_directory),
 | |
|       output_directory_(output_directory),
 | |
|       stats_(stats),
 | |
|       db_mutex_(db_mutex),
 | |
|       db_bg_error_(db_bg_error),
 | |
|       existing_snapshots_(std::move(existing_snapshots)),
 | |
|       earliest_write_conflict_snapshot_(earliest_write_conflict_snapshot),
 | |
|       table_cache_(std::move(table_cache)),
 | |
|       event_logger_(event_logger),
 | |
|       paranoid_file_checks_(paranoid_file_checks),
 | |
|       measure_io_stats_(measure_io_stats) {
 | |
|   assert(log_buffer_ != nullptr);
 | |
|   const auto* cfd = compact_->compaction->column_family_data();
 | |
|   ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
 | |
|                                     db_options_.enable_thread_tracking);
 | |
|   ThreadStatusUtil::SetThreadOperation(ThreadStatus::OP_COMPACTION);
 | |
|   ReportStartedCompaction(compaction);
 | |
| }
 | |
| 
 | |
| CompactionJob::~CompactionJob() {
 | |
|   assert(compact_ == nullptr);
 | |
|   ThreadStatusUtil::ResetThreadStatus();
 | |
| }
 | |
| 
 | |
| void CompactionJob::ReportStartedCompaction(
 | |
|     Compaction* compaction) {
 | |
|   const auto* cfd = compact_->compaction->column_family_data();
 | |
|   ThreadStatusUtil::SetColumnFamily(cfd, cfd->ioptions()->env,
 | |
|                                     db_options_.enable_thread_tracking);
 | |
| 
 | |
|   ThreadStatusUtil::SetThreadOperationProperty(
 | |
|       ThreadStatus::COMPACTION_JOB_ID,
 | |
|       job_id_);
 | |
| 
 | |
|   ThreadStatusUtil::SetThreadOperationProperty(
 | |
|       ThreadStatus::COMPACTION_INPUT_OUTPUT_LEVEL,
 | |
|       (static_cast<uint64_t>(compact_->compaction->start_level()) << 32) +
 | |
|           compact_->compaction->output_level());
 | |
| 
 | |
|   // In the current design, a CompactionJob is always created
 | |
|   // for non-trivial compaction.
 | |
|   assert(compaction->IsTrivialMove() == false ||
 | |
|          compaction->is_manual_compaction() == true);
 | |
| 
 | |
|   ThreadStatusUtil::SetThreadOperationProperty(
 | |
|       ThreadStatus::COMPACTION_PROP_FLAGS,
 | |
|       compaction->is_manual_compaction() +
 | |
|           (compaction->deletion_compaction() << 1));
 | |
| 
 | |
|   ThreadStatusUtil::SetThreadOperationProperty(
 | |
|       ThreadStatus::COMPACTION_TOTAL_INPUT_BYTES,
 | |
|       compaction->CalculateTotalInputSize());
 | |
| 
 | |
|   IOSTATS_RESET(bytes_written);
 | |
|   IOSTATS_RESET(bytes_read);
 | |
|   ThreadStatusUtil::SetThreadOperationProperty(
 | |
|       ThreadStatus::COMPACTION_BYTES_WRITTEN, 0);
 | |
|   ThreadStatusUtil::SetThreadOperationProperty(
 | |
|       ThreadStatus::COMPACTION_BYTES_READ, 0);
 | |
| 
 | |
|   // Set the thread operation after operation properties
 | |
|   // to ensure GetThreadList() can always show them all together.
 | |
|   ThreadStatusUtil::SetThreadOperation(
 | |
|       ThreadStatus::OP_COMPACTION);
 | |
| 
 | |
|   if (compaction_job_stats_) {
 | |
|     compaction_job_stats_->is_manual_compaction =
 | |
|         compaction->is_manual_compaction();
 | |
|   }
 | |
| }
 | |
| 
 | |
| void CompactionJob::Prepare() {
 | |
|   AutoThreadOperationStageUpdater stage_updater(
 | |
|       ThreadStatus::STAGE_COMPACTION_PREPARE);
 | |
| 
 | |
|   // Generate file_levels_ for compaction berfore making Iterator
 | |
|   auto* c = compact_->compaction;
 | |
|   assert(c->column_family_data() != nullptr);
 | |
|   assert(c->column_family_data()->current()->storage_info()
 | |
|       ->NumLevelFiles(compact_->compaction->level()) > 0);
 | |
| 
 | |
|   // Is this compaction producing files at the bottommost level?
 | |
|   bottommost_level_ = c->bottommost_level();
 | |
| 
 | |
|   if (c->ShouldFormSubcompactions()) {
 | |
|     const uint64_t start_micros = env_->NowMicros();
 | |
|     GenSubcompactionBoundaries();
 | |
|     MeasureTime(stats_, SUBCOMPACTION_SETUP_TIME,
 | |
|                 env_->NowMicros() - start_micros);
 | |
| 
 | |
|     assert(sizes_.size() == boundaries_.size() + 1);
 | |
| 
 | |
|     for (size_t i = 0; i <= boundaries_.size(); i++) {
 | |
|       Slice* start = i == 0 ? nullptr : &boundaries_[i - 1];
 | |
|       Slice* end = i == boundaries_.size() ? nullptr : &boundaries_[i];
 | |
|       compact_->sub_compact_states.emplace_back(c, start, end, sizes_[i]);
 | |
|     }
 | |
|     MeasureTime(stats_, NUM_SUBCOMPACTIONS_SCHEDULED,
 | |
|                 compact_->sub_compact_states.size());
 | |
|   } else {
 | |
|     compact_->sub_compact_states.emplace_back(c, nullptr, nullptr);
 | |
|   }
 | |
| }
 | |
| 
 | |
| struct RangeWithSize {
 | |
|   Range range;
 | |
|   uint64_t size;
 | |
| 
 | |
|   RangeWithSize(const Slice& a, const Slice& b, uint64_t s = 0)
 | |
|       : range(a, b), size(s) {}
 | |
| };
 | |
| 
 | |
| // Generates a histogram representing potential divisions of key ranges from
 | |
| // the input. It adds the starting and/or ending keys of certain input files
 | |
| // to the working set and then finds the approximate size of data in between
 | |
| // each consecutive pair of slices. Then it divides these ranges into
 | |
| // consecutive groups such that each group has a similar size.
 | |
| void CompactionJob::GenSubcompactionBoundaries() {
 | |
|   auto* c = compact_->compaction;
 | |
|   auto* cfd = c->column_family_data();
 | |
|   const Comparator* cfd_comparator = cfd->user_comparator();
 | |
|   std::vector<Slice> bounds;
 | |
|   int start_lvl = c->start_level();
 | |
|   int out_lvl = c->output_level();
 | |
| 
 | |
|   // Add the starting and/or ending key of certain input files as a potential
 | |
|   // boundary
 | |
|   for (size_t lvl_idx = 0; lvl_idx < c->num_input_levels(); lvl_idx++) {
 | |
|     int lvl = c->level(lvl_idx);
 | |
|     if (lvl >= start_lvl && lvl <= out_lvl) {
 | |
|       const LevelFilesBrief* flevel = c->input_levels(lvl_idx);
 | |
|       size_t num_files = flevel->num_files;
 | |
| 
 | |
|       if (num_files == 0) {
 | |
|         continue;
 | |
|       }
 | |
| 
 | |
|       if (lvl == 0) {
 | |
|         // For level 0 add the starting and ending key of each file since the
 | |
|         // files may have greatly differing key ranges (not range-partitioned)
 | |
|         for (size_t i = 0; i < num_files; i++) {
 | |
|           bounds.emplace_back(flevel->files[i].smallest_key);
 | |
|           bounds.emplace_back(flevel->files[i].largest_key);
 | |
|         }
 | |
|       } else {
 | |
|         // For all other levels add the smallest/largest key in the level to
 | |
|         // encompass the range covered by that level
 | |
|         bounds.emplace_back(flevel->files[0].smallest_key);
 | |
|         bounds.emplace_back(flevel->files[num_files - 1].largest_key);
 | |
|         if (lvl == out_lvl) {
 | |
|           // For the last level include the starting keys of all files since
 | |
|           // the last level is the largest and probably has the widest key
 | |
|           // range. Since it's range partitioned, the ending key of one file
 | |
|           // and the starting key of the next are very close (or identical).
 | |
|           for (size_t i = 1; i < num_files; i++) {
 | |
|             bounds.emplace_back(flevel->files[i].smallest_key);
 | |
|           }
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   std::sort(bounds.begin(), bounds.end(),
 | |
|     [cfd_comparator] (const Slice& a, const Slice& b) -> bool {
 | |
|       return cfd_comparator->Compare(ExtractUserKey(a), ExtractUserKey(b)) < 0;
 | |
|     });
 | |
|   // Remove duplicated entries from bounds
 | |
|   bounds.erase(std::unique(bounds.begin(), bounds.end(),
 | |
|     [cfd_comparator] (const Slice& a, const Slice& b) -> bool {
 | |
|       return cfd_comparator->Compare(ExtractUserKey(a), ExtractUserKey(b)) == 0;
 | |
|     }), bounds.end());
 | |
| 
 | |
|   // Combine consecutive pairs of boundaries into ranges with an approximate
 | |
|   // size of data covered by keys in that range
 | |
|   uint64_t sum = 0;
 | |
|   std::vector<RangeWithSize> ranges;
 | |
|   auto* v = cfd->current();
 | |
|   for (auto it = bounds.begin();;) {
 | |
|     const Slice a = *it;
 | |
|     it++;
 | |
| 
 | |
|     if (it == bounds.end()) {
 | |
|       break;
 | |
|     }
 | |
| 
 | |
|     const Slice b = *it;
 | |
|     uint64_t size = versions_->ApproximateSize(v, a, b, start_lvl, out_lvl + 1);
 | |
|     ranges.emplace_back(a, b, size);
 | |
|     sum += size;
 | |
|   }
 | |
| 
 | |
|   // Group the ranges into subcompactions
 | |
|   const double min_file_fill_percent = 4.0 / 5;
 | |
|   uint64_t max_output_files = static_cast<uint64_t>(
 | |
|       std::ceil(sum / min_file_fill_percent /
 | |
|                 c->mutable_cf_options()->MaxFileSizeForLevel(out_lvl)));
 | |
|   uint64_t subcompactions =
 | |
|       std::min({static_cast<uint64_t>(ranges.size()),
 | |
|                 static_cast<uint64_t>(db_options_.max_subcompactions),
 | |
|                 max_output_files});
 | |
| 
 | |
|   if (subcompactions > 1) {
 | |
|     double mean = sum * 1.0 / subcompactions;
 | |
|     // Greedily add ranges to the subcompaction until the sum of the ranges'
 | |
|     // sizes becomes >= the expected mean size of a subcompaction
 | |
|     sum = 0;
 | |
|     for (size_t i = 0; i < ranges.size() - 1; i++) {
 | |
|       sum += ranges[i].size;
 | |
|       if (subcompactions == 1) {
 | |
|         // If there's only one left to schedule then it goes to the end so no
 | |
|         // need to put an end boundary
 | |
|         continue;
 | |
|       }
 | |
|       if (sum >= mean) {
 | |
|         boundaries_.emplace_back(ExtractUserKey(ranges[i].range.limit));
 | |
|         sizes_.emplace_back(sum);
 | |
|         subcompactions--;
 | |
|         sum = 0;
 | |
|       }
 | |
|     }
 | |
|     sizes_.emplace_back(sum + ranges.back().size);
 | |
|   } else {
 | |
|     // Only one range so its size is the total sum of sizes computed above
 | |
|     sizes_.emplace_back(sum);
 | |
|   }
 | |
| }
 | |
| 
 | |
| Status CompactionJob::Run() {
 | |
|   AutoThreadOperationStageUpdater stage_updater(
 | |
|       ThreadStatus::STAGE_COMPACTION_RUN);
 | |
|   TEST_SYNC_POINT("CompactionJob::Run():Start");
 | |
|   log_buffer_->FlushBufferToLog();
 | |
|   LogCompaction();
 | |
| 
 | |
|   const size_t num_threads = compact_->sub_compact_states.size();
 | |
|   assert(num_threads > 0);
 | |
|   const uint64_t start_micros = env_->NowMicros();
 | |
| 
 | |
|   // Launch a thread for each of subcompactions 1...num_threads-1
 | |
|   std::vector<port::Thread> thread_pool;
 | |
|   thread_pool.reserve(num_threads - 1);
 | |
|   for (size_t i = 1; i < compact_->sub_compact_states.size(); i++) {
 | |
|     thread_pool.emplace_back(&CompactionJob::ProcessKeyValueCompaction, this,
 | |
|                              &compact_->sub_compact_states[i]);
 | |
|   }
 | |
| 
 | |
|   // Always schedule the first subcompaction (whether or not there are also
 | |
|   // others) in the current thread to be efficient with resources
 | |
|   ProcessKeyValueCompaction(&compact_->sub_compact_states[0]);
 | |
| 
 | |
|   // Wait for all other threads (if there are any) to finish execution
 | |
|   for (auto& thread : thread_pool) {
 | |
|     thread.join();
 | |
|   }
 | |
| 
 | |
|   if (output_directory_) {
 | |
|     output_directory_->Fsync();
 | |
|   }
 | |
| 
 | |
|   compaction_stats_.micros = env_->NowMicros() - start_micros;
 | |
|   MeasureTime(stats_, COMPACTION_TIME, compaction_stats_.micros);
 | |
| 
 | |
|   // Check if any thread encountered an error during execution
 | |
|   Status status;
 | |
|   for (const auto& state : compact_->sub_compact_states) {
 | |
|     if (!state.status.ok()) {
 | |
|       status = state.status;
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   TablePropertiesCollection tp;
 | |
|   for (const auto& state : compact_->sub_compact_states) {
 | |
|     for (const auto& output : state.outputs) {
 | |
|       auto fn = TableFileName(db_options_.db_paths, output.meta.fd.GetNumber(),
 | |
|                               output.meta.fd.GetPathId());
 | |
|       tp[fn] = output.table_properties;
 | |
|     }
 | |
|   }
 | |
|   compact_->compaction->SetOutputTableProperties(std::move(tp));
 | |
| 
 | |
|   // Finish up all book-keeping to unify the subcompaction results
 | |
|   AggregateStatistics();
 | |
|   UpdateCompactionStats();
 | |
|   RecordCompactionIOStats();
 | |
|   LogFlush(db_options_.info_log);
 | |
|   TEST_SYNC_POINT("CompactionJob::Run():End");
 | |
| 
 | |
|   compact_->status = status;
 | |
|   return status;
 | |
| }
 | |
| 
 | |
| Status CompactionJob::Install(const MutableCFOptions& mutable_cf_options) {
 | |
|   AutoThreadOperationStageUpdater stage_updater(
 | |
|       ThreadStatus::STAGE_COMPACTION_INSTALL);
 | |
|   db_mutex_->AssertHeld();
 | |
|   Status status = compact_->status;
 | |
|   ColumnFamilyData* cfd = compact_->compaction->column_family_data();
 | |
|   cfd->internal_stats()->AddCompactionStats(
 | |
|       compact_->compaction->output_level(), compaction_stats_);
 | |
| 
 | |
|   if (status.ok()) {
 | |
|     status = InstallCompactionResults(mutable_cf_options);
 | |
|   }
 | |
|   VersionStorageInfo::LevelSummaryStorage tmp;
 | |
|   auto vstorage = cfd->current()->storage_info();
 | |
|   const auto& stats = compaction_stats_;
 | |
| 
 | |
|   double read_write_amp = 0.0;
 | |
|   double write_amp = 0.0;
 | |
|   double bytes_read_per_sec = 0;
 | |
|   double bytes_written_per_sec = 0;
 | |
| 
 | |
|   if (stats.bytes_read_non_output_levels > 0) {
 | |
|     read_write_amp = (stats.bytes_written + stats.bytes_read_output_level +
 | |
|                       stats.bytes_read_non_output_levels) /
 | |
|                      static_cast<double>(stats.bytes_read_non_output_levels);
 | |
|     write_amp = stats.bytes_written /
 | |
|                 static_cast<double>(stats.bytes_read_non_output_levels);
 | |
|   }
 | |
|   if (stats.micros > 0) {
 | |
|     bytes_read_per_sec =
 | |
|         (stats.bytes_read_non_output_levels + stats.bytes_read_output_level) /
 | |
|         static_cast<double>(stats.micros);
 | |
|     bytes_written_per_sec =
 | |
|         stats.bytes_written / static_cast<double>(stats.micros);
 | |
|   }
 | |
| 
 | |
|   ROCKS_LOG_BUFFER(
 | |
|       log_buffer_,
 | |
|       "[%s] compacted to: %s, MB/sec: %.1f rd, %.1f wr, level %d, "
 | |
|       "files in(%d, %d) out(%d) "
 | |
|       "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
 | |
|       "write-amplify(%.1f) %s, records in: %d, records dropped: %d\n",
 | |
|       cfd->GetName().c_str(), vstorage->LevelSummary(&tmp), bytes_read_per_sec,
 | |
|       bytes_written_per_sec, compact_->compaction->output_level(),
 | |
|       stats.num_input_files_in_non_output_levels,
 | |
|       stats.num_input_files_in_output_level, stats.num_output_files,
 | |
|       stats.bytes_read_non_output_levels / 1048576.0,
 | |
|       stats.bytes_read_output_level / 1048576.0,
 | |
|       stats.bytes_written / 1048576.0, read_write_amp, write_amp,
 | |
|       status.ToString().c_str(), stats.num_input_records,
 | |
|       stats.num_dropped_records);
 | |
| 
 | |
|   UpdateCompactionJobStats(stats);
 | |
| 
 | |
|   auto stream = event_logger_->LogToBuffer(log_buffer_);
 | |
|   stream << "job" << job_id_
 | |
|          << "event" << "compaction_finished"
 | |
|          << "compaction_time_micros" << compaction_stats_.micros
 | |
|          << "output_level" << compact_->compaction->output_level()
 | |
|          << "num_output_files" << compact_->NumOutputFiles()
 | |
|          << "total_output_size" << compact_->total_bytes
 | |
|          << "num_input_records" << compact_->num_input_records
 | |
|          << "num_output_records" << compact_->num_output_records
 | |
|          << "num_subcompactions" << compact_->sub_compact_states.size();
 | |
| 
 | |
|   if (compaction_job_stats_ != nullptr) {
 | |
|     stream << "num_single_delete_mismatches"
 | |
|            << compaction_job_stats_->num_single_del_mismatch;
 | |
|     stream << "num_single_delete_fallthrough"
 | |
|            << compaction_job_stats_->num_single_del_fallthru;
 | |
|   }
 | |
| 
 | |
|   if (measure_io_stats_ && compaction_job_stats_ != nullptr) {
 | |
|     stream << "file_write_nanos" << compaction_job_stats_->file_write_nanos;
 | |
|     stream << "file_range_sync_nanos"
 | |
|            << compaction_job_stats_->file_range_sync_nanos;
 | |
|     stream << "file_fsync_nanos" << compaction_job_stats_->file_fsync_nanos;
 | |
|     stream << "file_prepare_write_nanos"
 | |
|            << compaction_job_stats_->file_prepare_write_nanos;
 | |
|   }
 | |
| 
 | |
|   stream << "lsm_state";
 | |
|   stream.StartArray();
 | |
|   for (int level = 0; level < vstorage->num_levels(); ++level) {
 | |
|     stream << vstorage->NumLevelFiles(level);
 | |
|   }
 | |
|   stream.EndArray();
 | |
| 
 | |
|   CleanupCompaction();
 | |
|   return status;
 | |
| }
 | |
| 
 | |
| void CompactionJob::ProcessKeyValueCompaction(SubcompactionState* sub_compact) {
 | |
|   assert(sub_compact != nullptr);
 | |
|   ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
 | |
|   std::unique_ptr<RangeDelAggregator> range_del_agg(
 | |
|       new RangeDelAggregator(cfd->internal_comparator(), existing_snapshots_));
 | |
|   std::unique_ptr<InternalIterator> input(versions_->MakeInputIterator(
 | |
|       sub_compact->compaction, range_del_agg.get()));
 | |
| 
 | |
|   AutoThreadOperationStageUpdater stage_updater(
 | |
|       ThreadStatus::STAGE_COMPACTION_PROCESS_KV);
 | |
| 
 | |
|   // I/O measurement variables
 | |
|   PerfLevel prev_perf_level = PerfLevel::kEnableTime;
 | |
|   const uint64_t kRecordStatsEvery = 1000;
 | |
|   uint64_t prev_write_nanos = 0;
 | |
|   uint64_t prev_fsync_nanos = 0;
 | |
|   uint64_t prev_range_sync_nanos = 0;
 | |
|   uint64_t prev_prepare_write_nanos = 0;
 | |
|   if (measure_io_stats_) {
 | |
|     prev_perf_level = GetPerfLevel();
 | |
|     SetPerfLevel(PerfLevel::kEnableTime);
 | |
|     prev_write_nanos = IOSTATS(write_nanos);
 | |
|     prev_fsync_nanos = IOSTATS(fsync_nanos);
 | |
|     prev_range_sync_nanos = IOSTATS(range_sync_nanos);
 | |
|     prev_prepare_write_nanos = IOSTATS(prepare_write_nanos);
 | |
|   }
 | |
| 
 | |
|   const MutableCFOptions* mutable_cf_options =
 | |
|       sub_compact->compaction->mutable_cf_options();
 | |
| 
 | |
|   // To build compression dictionary, we sample the first output file, assuming
 | |
|   // it'll reach the maximum length, and then use the dictionary for compressing
 | |
|   // subsequent output files. The dictionary may be less than max_dict_bytes if
 | |
|   // the first output file's length is less than the maximum.
 | |
|   const int kSampleLenShift = 6;  // 2^6 = 64-byte samples
 | |
|   std::set<size_t> sample_begin_offsets;
 | |
|   if (bottommost_level_ &&
 | |
|       cfd->ioptions()->compression_opts.max_dict_bytes > 0) {
 | |
|     const size_t kMaxSamples =
 | |
|         cfd->ioptions()->compression_opts.max_dict_bytes >> kSampleLenShift;
 | |
|     const size_t kOutFileLen = mutable_cf_options->MaxFileSizeForLevel(
 | |
|         compact_->compaction->output_level());
 | |
|     if (kOutFileLen != port::kMaxSizet) {
 | |
|       const size_t kOutFileNumSamples = kOutFileLen >> kSampleLenShift;
 | |
|       Random64 generator{versions_->NewFileNumber()};
 | |
|       for (size_t i = 0; i < kMaxSamples; ++i) {
 | |
|         sample_begin_offsets.insert(generator.Uniform(kOutFileNumSamples)
 | |
|                                     << kSampleLenShift);
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   auto compaction_filter = cfd->ioptions()->compaction_filter;
 | |
|   std::unique_ptr<CompactionFilter> compaction_filter_from_factory = nullptr;
 | |
|   if (compaction_filter == nullptr) {
 | |
|     compaction_filter_from_factory =
 | |
|         sub_compact->compaction->CreateCompactionFilter();
 | |
|     compaction_filter = compaction_filter_from_factory.get();
 | |
|   }
 | |
|   MergeHelper merge(
 | |
|       env_, cfd->user_comparator(), cfd->ioptions()->merge_operator,
 | |
|       compaction_filter, db_options_.info_log.get(),
 | |
|       false /* internal key corruption is expected */,
 | |
|       existing_snapshots_.empty() ? 0 : existing_snapshots_.back(),
 | |
|       compact_->compaction->level(), db_options_.statistics.get(),
 | |
|       shutting_down_);
 | |
| 
 | |
|   TEST_SYNC_POINT("CompactionJob::Run():Inprogress");
 | |
| 
 | |
|   Slice* start = sub_compact->start;
 | |
|   Slice* end = sub_compact->end;
 | |
|   if (start != nullptr) {
 | |
|     IterKey start_iter;
 | |
|     start_iter.SetInternalKey(*start, kMaxSequenceNumber, kValueTypeForSeek);
 | |
|     input->Seek(start_iter.GetInternalKey());
 | |
|   } else {
 | |
|     input->SeekToFirst();
 | |
|   }
 | |
| 
 | |
|   // we allow only 1 compaction event listener. Used by blob storage
 | |
|   CompactionEventListener* comp_event_listener = nullptr;
 | |
| #ifndef ROCKSDB_LITE
 | |
|   for (auto& celitr : cfd->ioptions()->listeners) {
 | |
|     comp_event_listener = celitr->GetCompactionEventListener();
 | |
|     if (comp_event_listener != nullptr) {
 | |
|       break;
 | |
|     }
 | |
|   }
 | |
| #endif  // ROCKSDB_LITE
 | |
| 
 | |
|   Status status;
 | |
|   sub_compact->c_iter.reset(new CompactionIterator(
 | |
|       input.get(), cfd->user_comparator(), &merge, versions_->LastSequence(),
 | |
|       &existing_snapshots_, earliest_write_conflict_snapshot_, env_, false,
 | |
|       range_del_agg.get(), sub_compact->compaction, compaction_filter,
 | |
|       comp_event_listener, shutting_down_));
 | |
|   auto c_iter = sub_compact->c_iter.get();
 | |
|   c_iter->SeekToFirst();
 | |
|   if (c_iter->Valid() &&
 | |
|       sub_compact->compaction->output_level() != 0) {
 | |
|     // ShouldStopBefore() maintains state based on keys processed so far. The
 | |
|     // compaction loop always calls it on the "next" key, thus won't tell it the
 | |
|     // first key. So we do that here.
 | |
|     sub_compact->ShouldStopBefore(
 | |
|       c_iter->key(), sub_compact->current_output_file_size);
 | |
|   }
 | |
|   const auto& c_iter_stats = c_iter->iter_stats();
 | |
|   auto sample_begin_offset_iter = sample_begin_offsets.cbegin();
 | |
|   // data_begin_offset and compression_dict are only valid while generating
 | |
|   // dictionary from the first output file.
 | |
|   size_t data_begin_offset = 0;
 | |
|   std::string compression_dict;
 | |
|   compression_dict.reserve(cfd->ioptions()->compression_opts.max_dict_bytes);
 | |
| 
 | |
|   while (status.ok() && !cfd->IsDropped() && c_iter->Valid()) {
 | |
|     // Invariant: c_iter.status() is guaranteed to be OK if c_iter->Valid()
 | |
|     // returns true.
 | |
|     const Slice& key = c_iter->key();
 | |
|     const Slice& value = c_iter->value();
 | |
| 
 | |
|     // If an end key (exclusive) is specified, check if the current key is
 | |
|     // >= than it and exit if it is because the iterator is out of its range
 | |
|     if (end != nullptr &&
 | |
|         cfd->user_comparator()->Compare(c_iter->user_key(), *end) >= 0) {
 | |
|       break;
 | |
|     }
 | |
|     if (c_iter_stats.num_input_records % kRecordStatsEvery ==
 | |
|         kRecordStatsEvery - 1) {
 | |
|       RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
 | |
|       c_iter->ResetRecordCounts();
 | |
|       RecordCompactionIOStats();
 | |
|     }
 | |
| 
 | |
|     // Open output file if necessary
 | |
|     if (sub_compact->builder == nullptr) {
 | |
|       status = OpenCompactionOutputFile(sub_compact);
 | |
|       if (!status.ok()) {
 | |
|         break;
 | |
|       }
 | |
|     }
 | |
|     assert(sub_compact->builder != nullptr);
 | |
|     assert(sub_compact->current_output() != nullptr);
 | |
|     sub_compact->builder->Add(key, value);
 | |
|     sub_compact->current_output_file_size = sub_compact->builder->FileSize();
 | |
|     sub_compact->current_output()->meta.UpdateBoundaries(
 | |
|         key, c_iter->ikey().sequence);
 | |
|     sub_compact->num_output_records++;
 | |
| 
 | |
|     if (sub_compact->outputs.size() == 1) {  // first output file
 | |
|       // Check if this key/value overlaps any sample intervals; if so, appends
 | |
|       // overlapping portions to the dictionary.
 | |
|       for (const auto& data_elmt : {key, value}) {
 | |
|         size_t data_end_offset = data_begin_offset + data_elmt.size();
 | |
|         while (sample_begin_offset_iter != sample_begin_offsets.cend() &&
 | |
|                *sample_begin_offset_iter < data_end_offset) {
 | |
|           size_t sample_end_offset =
 | |
|               *sample_begin_offset_iter + (1 << kSampleLenShift);
 | |
|           // Invariant: Because we advance sample iterator while processing the
 | |
|           // data_elmt containing the sample's last byte, the current sample
 | |
|           // cannot end before the current data_elmt.
 | |
|           assert(data_begin_offset < sample_end_offset);
 | |
| 
 | |
|           size_t data_elmt_copy_offset, data_elmt_copy_len;
 | |
|           if (*sample_begin_offset_iter <= data_begin_offset) {
 | |
|             // The sample starts before data_elmt starts, so take bytes starting
 | |
|             // at the beginning of data_elmt.
 | |
|             data_elmt_copy_offset = 0;
 | |
|           } else {
 | |
|             // data_elmt starts before the sample starts, so take bytes starting
 | |
|             // at the below offset into data_elmt.
 | |
|             data_elmt_copy_offset =
 | |
|                 *sample_begin_offset_iter - data_begin_offset;
 | |
|           }
 | |
|           if (sample_end_offset <= data_end_offset) {
 | |
|             // The sample ends before data_elmt ends, so take as many bytes as
 | |
|             // needed.
 | |
|             data_elmt_copy_len =
 | |
|                 sample_end_offset - (data_begin_offset + data_elmt_copy_offset);
 | |
|           } else {
 | |
|             // data_elmt ends before the sample ends, so take all remaining
 | |
|             // bytes in data_elmt.
 | |
|             data_elmt_copy_len =
 | |
|                 data_end_offset - (data_begin_offset + data_elmt_copy_offset);
 | |
|           }
 | |
|           compression_dict.append(&data_elmt.data()[data_elmt_copy_offset],
 | |
|                                   data_elmt_copy_len);
 | |
|           if (sample_end_offset > data_end_offset) {
 | |
|             // Didn't finish sample. Try to finish it with the next data_elmt.
 | |
|             break;
 | |
|           }
 | |
|           // Next sample may require bytes from same data_elmt.
 | |
|           sample_begin_offset_iter++;
 | |
|         }
 | |
|         data_begin_offset = data_end_offset;
 | |
|       }
 | |
|     }
 | |
| 
 | |
|     // Close output file if it is big enough. Two possibilities determine it's
 | |
|     // time to close it: (1) the current key should be this file's last key, (2)
 | |
|     // the next key should not be in this file.
 | |
|     //
 | |
|     // TODO(aekmekji): determine if file should be closed earlier than this
 | |
|     // during subcompactions (i.e. if output size, estimated by input size, is
 | |
|     // going to be 1.2MB and max_output_file_size = 1MB, prefer to have 0.6MB
 | |
|     // and 0.6MB instead of 1MB and 0.2MB)
 | |
|     bool output_file_ended = false;
 | |
|     Status input_status;
 | |
|     if (sub_compact->compaction->output_level() != 0 &&
 | |
|         sub_compact->current_output_file_size >=
 | |
|             sub_compact->compaction->max_output_file_size()) {
 | |
|       // (1) this key terminates the file. For historical reasons, the iterator
 | |
|       // status before advancing will be given to FinishCompactionOutputFile().
 | |
|       input_status = input->status();
 | |
|       output_file_ended = true;
 | |
|     }
 | |
|     c_iter->Next();
 | |
|     if (!output_file_ended && c_iter->Valid() &&
 | |
|         sub_compact->compaction->output_level() != 0 &&
 | |
|         sub_compact->ShouldStopBefore(
 | |
|           c_iter->key(), sub_compact->current_output_file_size) &&
 | |
|         sub_compact->builder != nullptr) {
 | |
|       // (2) this key belongs to the next file. For historical reasons, the
 | |
|       // iterator status after advancing will be given to
 | |
|       // FinishCompactionOutputFile().
 | |
|       input_status = input->status();
 | |
|       output_file_ended = true;
 | |
|     }
 | |
|     if (output_file_ended) {
 | |
|       const Slice* next_key = nullptr;
 | |
|       if (c_iter->Valid()) {
 | |
|         next_key = &c_iter->key();
 | |
|       }
 | |
|       CompactionIterationStats range_del_out_stats;
 | |
|       status = FinishCompactionOutputFile(input_status, sub_compact,
 | |
|                                           range_del_agg.get(),
 | |
|                                           &range_del_out_stats, next_key);
 | |
|       RecordDroppedKeys(range_del_out_stats,
 | |
|                         &sub_compact->compaction_job_stats);
 | |
|       if (sub_compact->outputs.size() == 1) {
 | |
|         // Use dictionary from first output file for compression of subsequent
 | |
|         // files.
 | |
|         sub_compact->compression_dict = std::move(compression_dict);
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   sub_compact->num_input_records = c_iter_stats.num_input_records;
 | |
|   sub_compact->compaction_job_stats.num_input_deletion_records =
 | |
|       c_iter_stats.num_input_deletion_records;
 | |
|   sub_compact->compaction_job_stats.num_corrupt_keys =
 | |
|       c_iter_stats.num_input_corrupt_records;
 | |
|   sub_compact->compaction_job_stats.num_single_del_fallthru =
 | |
|       c_iter_stats.num_single_del_fallthru;
 | |
|   sub_compact->compaction_job_stats.num_single_del_mismatch =
 | |
|       c_iter_stats.num_single_del_mismatch;
 | |
|   sub_compact->compaction_job_stats.total_input_raw_key_bytes +=
 | |
|       c_iter_stats.total_input_raw_key_bytes;
 | |
|   sub_compact->compaction_job_stats.total_input_raw_value_bytes +=
 | |
|       c_iter_stats.total_input_raw_value_bytes;
 | |
| 
 | |
|   RecordTick(stats_, FILTER_OPERATION_TOTAL_TIME,
 | |
|              c_iter_stats.total_filter_time);
 | |
|   RecordDroppedKeys(c_iter_stats, &sub_compact->compaction_job_stats);
 | |
|   RecordCompactionIOStats();
 | |
| 
 | |
|   if (status.ok() && (shutting_down_->load(std::memory_order_relaxed) ||
 | |
|                       cfd->IsDropped())) {
 | |
|     status = Status::ShutdownInProgress(
 | |
|         "Database shutdown or Column family drop during compaction");
 | |
|   }
 | |
|   if (status.ok()) {
 | |
|     status = input->status();
 | |
|   }
 | |
|   if (status.ok()) {
 | |
|     status = c_iter->status();
 | |
|   }
 | |
| 
 | |
|   if (status.ok() && sub_compact->builder == nullptr &&
 | |
|       sub_compact->outputs.size() == 0 &&
 | |
|       range_del_agg->ShouldAddTombstones(bottommost_level_)) {
 | |
|     // handle subcompaction containing only range deletions
 | |
|     status = OpenCompactionOutputFile(sub_compact);
 | |
|   }
 | |
| 
 | |
|   // Call FinishCompactionOutputFile() even if status is not ok: it needs to
 | |
|   // close the output file.
 | |
|   if (sub_compact->builder != nullptr) {
 | |
|     CompactionIterationStats range_del_out_stats;
 | |
|     Status s = FinishCompactionOutputFile(
 | |
|         status, sub_compact, range_del_agg.get(), &range_del_out_stats);
 | |
|     if (status.ok()) {
 | |
|       status = s;
 | |
|     }
 | |
|     RecordDroppedKeys(range_del_out_stats, &sub_compact->compaction_job_stats);
 | |
|   }
 | |
| 
 | |
|   if (measure_io_stats_) {
 | |
|     sub_compact->compaction_job_stats.file_write_nanos +=
 | |
|         IOSTATS(write_nanos) - prev_write_nanos;
 | |
|     sub_compact->compaction_job_stats.file_fsync_nanos +=
 | |
|         IOSTATS(fsync_nanos) - prev_fsync_nanos;
 | |
|     sub_compact->compaction_job_stats.file_range_sync_nanos +=
 | |
|         IOSTATS(range_sync_nanos) - prev_range_sync_nanos;
 | |
|     sub_compact->compaction_job_stats.file_prepare_write_nanos +=
 | |
|         IOSTATS(prepare_write_nanos) - prev_prepare_write_nanos;
 | |
|     if (prev_perf_level != PerfLevel::kEnableTime) {
 | |
|       SetPerfLevel(prev_perf_level);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   sub_compact->c_iter.reset();
 | |
|   input.reset();
 | |
|   sub_compact->status = status;
 | |
| }
 | |
| 
 | |
| void CompactionJob::RecordDroppedKeys(
 | |
|     const CompactionIterationStats& c_iter_stats,
 | |
|     CompactionJobStats* compaction_job_stats) {
 | |
|   if (c_iter_stats.num_record_drop_user > 0) {
 | |
|     RecordTick(stats_, COMPACTION_KEY_DROP_USER,
 | |
|                c_iter_stats.num_record_drop_user);
 | |
|   }
 | |
|   if (c_iter_stats.num_record_drop_hidden > 0) {
 | |
|     RecordTick(stats_, COMPACTION_KEY_DROP_NEWER_ENTRY,
 | |
|                c_iter_stats.num_record_drop_hidden);
 | |
|     if (compaction_job_stats) {
 | |
|       compaction_job_stats->num_records_replaced +=
 | |
|           c_iter_stats.num_record_drop_hidden;
 | |
|     }
 | |
|   }
 | |
|   if (c_iter_stats.num_record_drop_obsolete > 0) {
 | |
|     RecordTick(stats_, COMPACTION_KEY_DROP_OBSOLETE,
 | |
|                c_iter_stats.num_record_drop_obsolete);
 | |
|     if (compaction_job_stats) {
 | |
|       compaction_job_stats->num_expired_deletion_records +=
 | |
|           c_iter_stats.num_record_drop_obsolete;
 | |
|     }
 | |
|   }
 | |
|   if (c_iter_stats.num_record_drop_range_del > 0) {
 | |
|     RecordTick(stats_, COMPACTION_KEY_DROP_RANGE_DEL,
 | |
|                c_iter_stats.num_record_drop_range_del);
 | |
|   }
 | |
|   if (c_iter_stats.num_range_del_drop_obsolete > 0) {
 | |
|     RecordTick(stats_, COMPACTION_RANGE_DEL_DROP_OBSOLETE,
 | |
|                c_iter_stats.num_range_del_drop_obsolete);
 | |
|   }
 | |
|   if (c_iter_stats.num_optimized_del_drop_obsolete > 0) {
 | |
|     RecordTick(stats_, COMPACTION_OPTIMIZED_DEL_DROP_OBSOLETE,
 | |
|                c_iter_stats.num_optimized_del_drop_obsolete);
 | |
|   }
 | |
| }
 | |
| 
 | |
| Status CompactionJob::FinishCompactionOutputFile(
 | |
|     const Status& input_status, SubcompactionState* sub_compact,
 | |
|     RangeDelAggregator* range_del_agg,
 | |
|     CompactionIterationStats* range_del_out_stats,
 | |
|     const Slice* next_table_min_key /* = nullptr */) {
 | |
|   AutoThreadOperationStageUpdater stage_updater(
 | |
|       ThreadStatus::STAGE_COMPACTION_SYNC_FILE);
 | |
|   assert(sub_compact != nullptr);
 | |
|   assert(sub_compact->outfile);
 | |
|   assert(sub_compact->builder != nullptr);
 | |
|   assert(sub_compact->current_output() != nullptr);
 | |
| 
 | |
|   uint64_t output_number = sub_compact->current_output()->meta.fd.GetNumber();
 | |
|   assert(output_number != 0);
 | |
| 
 | |
|   // Check for iterator errors
 | |
|   Status s = input_status;
 | |
|   auto meta = &sub_compact->current_output()->meta;
 | |
|   if (s.ok()) {
 | |
|     Slice lower_bound_guard, upper_bound_guard;
 | |
|     const Slice *lower_bound, *upper_bound;
 | |
|     if (sub_compact->outputs.size() == 1) {
 | |
|       // For the first output table, include range tombstones before the min key
 | |
|       // but after the subcompaction boundary.
 | |
|       lower_bound = sub_compact->start;
 | |
|     } else if (meta->smallest.size() > 0) {
 | |
|       // For subsequent output tables, only include range tombstones from min
 | |
|       // key onwards since the previous file was extended to contain range
 | |
|       // tombstones falling before min key.
 | |
|       lower_bound_guard = meta->smallest.user_key();
 | |
|       lower_bound = &lower_bound_guard;
 | |
|     } else {
 | |
|       lower_bound = nullptr;
 | |
|     }
 | |
|     if (next_table_min_key != nullptr) {
 | |
|       // This isn't the last file in the subcompaction, so extend until the next
 | |
|       // file starts.
 | |
|       upper_bound_guard = ExtractUserKey(*next_table_min_key);
 | |
|       upper_bound = &upper_bound_guard;
 | |
|     } else {
 | |
|       // This is the last file in the subcompaction, so extend until the
 | |
|       // subcompaction ends.
 | |
|       upper_bound = sub_compact->end;
 | |
|     }
 | |
|     range_del_agg->AddToBuilder(sub_compact->builder.get(), lower_bound,
 | |
|                                 upper_bound, meta, range_del_out_stats,
 | |
|                                 bottommost_level_);
 | |
|     meta->marked_for_compaction = sub_compact->builder->NeedCompact();
 | |
|   }
 | |
|   const uint64_t current_entries = sub_compact->builder->NumEntries();
 | |
|   if (s.ok()) {
 | |
|     s = sub_compact->builder->Finish();
 | |
|   } else {
 | |
|     sub_compact->builder->Abandon();
 | |
|   }
 | |
|   const uint64_t current_bytes = sub_compact->builder->FileSize();
 | |
|   if (s.ok()) {
 | |
|     meta->fd.file_size = current_bytes;
 | |
|   }
 | |
|   sub_compact->current_output()->finished = true;
 | |
|   sub_compact->total_bytes += current_bytes;
 | |
| 
 | |
|   // Finish and check for file errors
 | |
|   if (s.ok()) {
 | |
|     StopWatch sw(env_, stats_, COMPACTION_OUTFILE_SYNC_MICROS);
 | |
|     s = sub_compact->outfile->Sync(db_options_.use_fsync);
 | |
|   }
 | |
|   if (s.ok()) {
 | |
|     s = sub_compact->outfile->Close();
 | |
|   }
 | |
|   sub_compact->outfile.reset();
 | |
| 
 | |
|   if (s.ok() && current_entries == 0) {
 | |
|     // If there is nothing to output, no necessary to generate a sst file.
 | |
|     // This happens when the output level is bottom level, at the same time
 | |
|     // the sub_compact output nothing.
 | |
|     std::string fname = TableFileName(
 | |
|         db_options_.db_paths, meta->fd.GetNumber(), meta->fd.GetPathId());
 | |
|     env_->DeleteFile(fname);
 | |
| 
 | |
|     // Also need to remove the file from outputs, or it will be added to the
 | |
|     // VersionEdit.
 | |
|     assert(!sub_compact->outputs.empty());
 | |
|     sub_compact->outputs.pop_back();
 | |
|     sub_compact->builder.reset();
 | |
|     sub_compact->current_output_file_size = 0;
 | |
|     return s;
 | |
|   }
 | |
| 
 | |
|   ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
 | |
|   TableProperties tp;
 | |
|   if (s.ok() && current_entries > 0) {
 | |
|     // Verify that the table is usable
 | |
|     // We set for_compaction to false and don't OptimizeForCompactionTableRead
 | |
|     // here because this is a special case after we finish the table building
 | |
|     // No matter whether use_direct_io_for_flush_and_compaction is true,
 | |
|     // we will regrad this verification as user reads since the goal is
 | |
|     // to cache it here for further user reads
 | |
|     InternalIterator* iter = cfd->table_cache()->NewIterator(
 | |
|         ReadOptions(), env_options_, cfd->internal_comparator(), meta->fd,
 | |
|         nullptr /* range_del_agg */, nullptr,
 | |
|         cfd->internal_stats()->GetFileReadHist(
 | |
|             compact_->compaction->output_level()),
 | |
|         false);
 | |
|     s = iter->status();
 | |
| 
 | |
|     if (s.ok() && paranoid_file_checks_) {
 | |
|       for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {}
 | |
|       s = iter->status();
 | |
|     }
 | |
| 
 | |
|     delete iter;
 | |
| 
 | |
|     // Output to event logger and fire events.
 | |
|     if (s.ok()) {
 | |
|       tp = sub_compact->builder->GetTableProperties();
 | |
|       sub_compact->current_output()->table_properties =
 | |
|           std::make_shared<TableProperties>(tp);
 | |
|       ROCKS_LOG_INFO(db_options_.info_log,
 | |
|                      "[%s] [JOB %d] Generated table #%" PRIu64 ": %" PRIu64
 | |
|                      " keys, %" PRIu64 " bytes%s",
 | |
|                      cfd->GetName().c_str(), job_id_, output_number,
 | |
|                      current_entries, current_bytes,
 | |
|                      meta->marked_for_compaction ? " (need compaction)" : "");
 | |
|     }
 | |
|   }
 | |
|   std::string fname;
 | |
|   FileDescriptor output_fd;
 | |
|   if (meta != nullptr) {
 | |
|     fname = TableFileName(db_options_.db_paths, meta->fd.GetNumber(),
 | |
|                           meta->fd.GetPathId());
 | |
|     output_fd = meta->fd;
 | |
|   } else {
 | |
|     fname = "(nil)";
 | |
|   }
 | |
|   EventHelpers::LogAndNotifyTableFileCreationFinished(
 | |
|       event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname,
 | |
|       job_id_, output_fd, tp, TableFileCreationReason::kCompaction, s);
 | |
| 
 | |
| #ifndef ROCKSDB_LITE
 | |
|   // Report new file to SstFileManagerImpl
 | |
|   auto sfm =
 | |
|       static_cast<SstFileManagerImpl*>(db_options_.sst_file_manager.get());
 | |
|   if (sfm && meta != nullptr && meta->fd.GetPathId() == 0) {
 | |
|     auto fn = TableFileName(cfd->ioptions()->db_paths, meta->fd.GetNumber(),
 | |
|                             meta->fd.GetPathId());
 | |
|     sfm->OnAddFile(fn);
 | |
|     if (sfm->IsMaxAllowedSpaceReached()) {
 | |
|       // TODO(ajkr): should we return OK() if max space was reached by the final
 | |
|       // compaction output file (similarly to how flush works when full)?
 | |
|       s = Status::IOError("Max allowed space was reached");
 | |
|       TEST_SYNC_POINT(
 | |
|           "CompactionJob::FinishCompactionOutputFile:"
 | |
|           "MaxAllowedSpaceReached");
 | |
|       InstrumentedMutexLock l(db_mutex_);
 | |
|       if (db_bg_error_->ok()) {
 | |
|         Status new_bg_error = s;
 | |
|         // may temporarily unlock and lock the mutex.
 | |
|         EventHelpers::NotifyOnBackgroundError(
 | |
|             cfd->ioptions()->listeners, BackgroundErrorReason::kCompaction,
 | |
|             &new_bg_error, db_mutex_);
 | |
|         if (!new_bg_error.ok()) {
 | |
|           *db_bg_error_ = new_bg_error;
 | |
|         }
 | |
|       }
 | |
|     }
 | |
|   }
 | |
| #endif
 | |
| 
 | |
|   sub_compact->builder.reset();
 | |
|   sub_compact->current_output_file_size = 0;
 | |
|   return s;
 | |
| }
 | |
| 
 | |
| Status CompactionJob::InstallCompactionResults(
 | |
|     const MutableCFOptions& mutable_cf_options) {
 | |
|   db_mutex_->AssertHeld();
 | |
| 
 | |
|   auto* compaction = compact_->compaction;
 | |
|   // paranoia: verify that the files that we started with
 | |
|   // still exist in the current version and in the same original level.
 | |
|   // This ensures that a concurrent compaction did not erroneously
 | |
|   // pick the same files to compact_.
 | |
|   if (!versions_->VerifyCompactionFileConsistency(compaction)) {
 | |
|     Compaction::InputLevelSummaryBuffer inputs_summary;
 | |
| 
 | |
|     ROCKS_LOG_ERROR(db_options_.info_log, "[%s] [JOB %d] Compaction %s aborted",
 | |
|                     compaction->column_family_data()->GetName().c_str(),
 | |
|                     job_id_, compaction->InputLevelSummary(&inputs_summary));
 | |
|     return Status::Corruption("Compaction input files inconsistent");
 | |
|   }
 | |
| 
 | |
|   {
 | |
|     Compaction::InputLevelSummaryBuffer inputs_summary;
 | |
|     ROCKS_LOG_INFO(
 | |
|         db_options_.info_log, "[%s] [JOB %d] Compacted %s => %" PRIu64 " bytes",
 | |
|         compaction->column_family_data()->GetName().c_str(), job_id_,
 | |
|         compaction->InputLevelSummary(&inputs_summary), compact_->total_bytes);
 | |
|   }
 | |
| 
 | |
|   // Add compaction outputs
 | |
|   compaction->AddInputDeletions(compact_->compaction->edit());
 | |
| 
 | |
|   for (const auto& sub_compact : compact_->sub_compact_states) {
 | |
|     for (const auto& out : sub_compact.outputs) {
 | |
|       compaction->edit()->AddFile(compaction->output_level(), out.meta);
 | |
|     }
 | |
|   }
 | |
|   return versions_->LogAndApply(compaction->column_family_data(),
 | |
|                                 mutable_cf_options, compaction->edit(),
 | |
|                                 db_mutex_, db_directory_);
 | |
| }
 | |
| 
 | |
| void CompactionJob::RecordCompactionIOStats() {
 | |
|   RecordTick(stats_, COMPACT_READ_BYTES, IOSTATS(bytes_read));
 | |
|   ThreadStatusUtil::IncreaseThreadOperationProperty(
 | |
|       ThreadStatus::COMPACTION_BYTES_READ, IOSTATS(bytes_read));
 | |
|   IOSTATS_RESET(bytes_read);
 | |
|   RecordTick(stats_, COMPACT_WRITE_BYTES, IOSTATS(bytes_written));
 | |
|   ThreadStatusUtil::IncreaseThreadOperationProperty(
 | |
|       ThreadStatus::COMPACTION_BYTES_WRITTEN, IOSTATS(bytes_written));
 | |
|   IOSTATS_RESET(bytes_written);
 | |
| }
 | |
| 
 | |
| Status CompactionJob::OpenCompactionOutputFile(
 | |
|     SubcompactionState* sub_compact) {
 | |
|   assert(sub_compact != nullptr);
 | |
|   assert(sub_compact->builder == nullptr);
 | |
|   // no need to lock because VersionSet::next_file_number_ is atomic
 | |
|   uint64_t file_number = versions_->NewFileNumber();
 | |
|   std::string fname = TableFileName(db_options_.db_paths, file_number,
 | |
|                                     sub_compact->compaction->output_path_id());
 | |
|   // Fire events.
 | |
|   ColumnFamilyData* cfd = sub_compact->compaction->column_family_data();
 | |
| #ifndef ROCKSDB_LITE
 | |
|   EventHelpers::NotifyTableFileCreationStarted(
 | |
|       cfd->ioptions()->listeners, dbname_, cfd->GetName(), fname, job_id_,
 | |
|       TableFileCreationReason::kCompaction);
 | |
| #endif  // !ROCKSDB_LITE
 | |
|   // Make the output file
 | |
|   unique_ptr<WritableFile> writable_file;
 | |
|   EnvOptions opt_env_opts =
 | |
|       env_->OptimizeForCompactionTableWrite(env_options_, db_options_);
 | |
|   TEST_SYNC_POINT_CALLBACK("CompactionJob::OpenCompactionOutputFile",
 | |
|                            &opt_env_opts.use_direct_writes);
 | |
|   Status s = NewWritableFile(env_, fname, &writable_file, opt_env_opts);
 | |
|   if (!s.ok()) {
 | |
|     ROCKS_LOG_ERROR(
 | |
|         db_options_.info_log,
 | |
|         "[%s] [JOB %d] OpenCompactionOutputFiles for table #%" PRIu64
 | |
|         " fails at NewWritableFile with status %s",
 | |
|         sub_compact->compaction->column_family_data()->GetName().c_str(),
 | |
|         job_id_, file_number, s.ToString().c_str());
 | |
|     LogFlush(db_options_.info_log);
 | |
|     EventHelpers::LogAndNotifyTableFileCreationFinished(
 | |
|         event_logger_, cfd->ioptions()->listeners, dbname_, cfd->GetName(),
 | |
|         fname, job_id_, FileDescriptor(), TableProperties(),
 | |
|         TableFileCreationReason::kCompaction, s);
 | |
|     return s;
 | |
|   }
 | |
| 
 | |
|   SubcompactionState::Output out;
 | |
|   out.meta.fd =
 | |
|       FileDescriptor(file_number, sub_compact->compaction->output_path_id(), 0);
 | |
|   out.finished = false;
 | |
| 
 | |
|   sub_compact->outputs.push_back(out);
 | |
|   writable_file->SetIOPriority(Env::IO_LOW);
 | |
|   writable_file->SetPreallocationBlockSize(static_cast<size_t>(
 | |
|       sub_compact->compaction->OutputFilePreallocationSize()));
 | |
|   sub_compact->outfile.reset(new WritableFileWriter(
 | |
|       std::move(writable_file), env_options_, db_options_.statistics.get()));
 | |
| 
 | |
|   // If the Column family flag is to only optimize filters for hits,
 | |
|   // we can skip creating filters if this is the bottommost_level where
 | |
|   // data is going to be found
 | |
|   bool skip_filters =
 | |
|       cfd->ioptions()->optimize_filters_for_hits && bottommost_level_;
 | |
| 
 | |
|   uint64_t output_file_creation_time =
 | |
|       sub_compact->compaction->MaxInputFileCreationTime();
 | |
|   if (output_file_creation_time == 0) {
 | |
|     int64_t _current_time = 0;
 | |
|     db_options_.env->GetCurrentTime(&_current_time);  // ignore error
 | |
|     output_file_creation_time = static_cast<uint64_t>(_current_time);
 | |
|   }
 | |
| 
 | |
|   sub_compact->builder.reset(NewTableBuilder(
 | |
|       *cfd->ioptions(), cfd->internal_comparator(),
 | |
|       cfd->int_tbl_prop_collector_factories(), cfd->GetID(), cfd->GetName(),
 | |
|       sub_compact->outfile.get(), sub_compact->compaction->output_compression(),
 | |
|       cfd->ioptions()->compression_opts,
 | |
|       sub_compact->compaction->output_level(), &sub_compact->compression_dict,
 | |
|       skip_filters, output_file_creation_time));
 | |
|   LogFlush(db_options_.info_log);
 | |
|   return s;
 | |
| }
 | |
| 
 | |
| void CompactionJob::CleanupCompaction() {
 | |
|   for (SubcompactionState& sub_compact : compact_->sub_compact_states) {
 | |
|     const auto& sub_status = sub_compact.status;
 | |
| 
 | |
|     if (sub_compact.builder != nullptr) {
 | |
|       // May happen if we get a shutdown call in the middle of compaction
 | |
|       sub_compact.builder->Abandon();
 | |
|       sub_compact.builder.reset();
 | |
|     } else {
 | |
|       assert(!sub_status.ok() || sub_compact.outfile == nullptr);
 | |
|     }
 | |
|     for (const auto& out : sub_compact.outputs) {
 | |
|       // If this file was inserted into the table cache then remove
 | |
|       // them here because this compaction was not committed.
 | |
|       if (!sub_status.ok()) {
 | |
|         TableCache::Evict(table_cache_.get(), out.meta.fd.GetNumber());
 | |
|       }
 | |
|     }
 | |
|   }
 | |
|   delete compact_;
 | |
|   compact_ = nullptr;
 | |
| }
 | |
| 
 | |
| #ifndef ROCKSDB_LITE
 | |
| namespace {
 | |
| void CopyPrefix(
 | |
|     const Slice& src, size_t prefix_length, std::string* dst) {
 | |
|   assert(prefix_length > 0);
 | |
|   size_t length = src.size() > prefix_length ? prefix_length : src.size();
 | |
|   dst->assign(src.data(), length);
 | |
| }
 | |
| }  // namespace
 | |
| 
 | |
| #endif  // !ROCKSDB_LITE
 | |
| 
 | |
| void CompactionJob::UpdateCompactionStats() {
 | |
|   Compaction* compaction = compact_->compaction;
 | |
|   compaction_stats_.num_input_files_in_non_output_levels = 0;
 | |
|   compaction_stats_.num_input_files_in_output_level = 0;
 | |
|   for (int input_level = 0;
 | |
|        input_level < static_cast<int>(compaction->num_input_levels());
 | |
|        ++input_level) {
 | |
|     if (compaction->level(input_level) != compaction->output_level()) {
 | |
|       UpdateCompactionInputStatsHelper(
 | |
|           &compaction_stats_.num_input_files_in_non_output_levels,
 | |
|           &compaction_stats_.bytes_read_non_output_levels,
 | |
|           input_level);
 | |
|     } else {
 | |
|       UpdateCompactionInputStatsHelper(
 | |
|           &compaction_stats_.num_input_files_in_output_level,
 | |
|           &compaction_stats_.bytes_read_output_level,
 | |
|           input_level);
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   for (const auto& sub_compact : compact_->sub_compact_states) {
 | |
|     size_t num_output_files = sub_compact.outputs.size();
 | |
|     if (sub_compact.builder != nullptr) {
 | |
|       // An error occurred so ignore the last output.
 | |
|       assert(num_output_files > 0);
 | |
|       --num_output_files;
 | |
|     }
 | |
|     compaction_stats_.num_output_files += static_cast<int>(num_output_files);
 | |
| 
 | |
|     for (const auto& out : sub_compact.outputs) {
 | |
|       compaction_stats_.bytes_written += out.meta.fd.file_size;
 | |
|     }
 | |
|     if (sub_compact.num_input_records > sub_compact.num_output_records) {
 | |
|       compaction_stats_.num_dropped_records +=
 | |
|           sub_compact.num_input_records - sub_compact.num_output_records;
 | |
|     }
 | |
|   }
 | |
| }
 | |
| 
 | |
| void CompactionJob::UpdateCompactionInputStatsHelper(
 | |
|     int* num_files, uint64_t* bytes_read, int input_level) {
 | |
|   const Compaction* compaction = compact_->compaction;
 | |
|   auto num_input_files = compaction->num_input_files(input_level);
 | |
|   *num_files += static_cast<int>(num_input_files);
 | |
| 
 | |
|   for (size_t i = 0; i < num_input_files; ++i) {
 | |
|     const auto* file_meta = compaction->input(input_level, i);
 | |
|     *bytes_read += file_meta->fd.GetFileSize();
 | |
|     compaction_stats_.num_input_records +=
 | |
|         static_cast<uint64_t>(file_meta->num_entries);
 | |
|   }
 | |
| }
 | |
| 
 | |
| void CompactionJob::UpdateCompactionJobStats(
 | |
|     const InternalStats::CompactionStats& stats) const {
 | |
| #ifndef ROCKSDB_LITE
 | |
|   if (compaction_job_stats_) {
 | |
|     compaction_job_stats_->elapsed_micros = stats.micros;
 | |
| 
 | |
|     // input information
 | |
|     compaction_job_stats_->total_input_bytes =
 | |
|         stats.bytes_read_non_output_levels +
 | |
|         stats.bytes_read_output_level;
 | |
|     compaction_job_stats_->num_input_records =
 | |
|         compact_->num_input_records;
 | |
|     compaction_job_stats_->num_input_files =
 | |
|         stats.num_input_files_in_non_output_levels +
 | |
|         stats.num_input_files_in_output_level;
 | |
|     compaction_job_stats_->num_input_files_at_output_level =
 | |
|         stats.num_input_files_in_output_level;
 | |
| 
 | |
|     // output information
 | |
|     compaction_job_stats_->total_output_bytes = stats.bytes_written;
 | |
|     compaction_job_stats_->num_output_records =
 | |
|         compact_->num_output_records;
 | |
|     compaction_job_stats_->num_output_files = stats.num_output_files;
 | |
| 
 | |
|     if (compact_->NumOutputFiles() > 0U) {
 | |
|       CopyPrefix(
 | |
|           compact_->SmallestUserKey(),
 | |
|           CompactionJobStats::kMaxPrefixLength,
 | |
|           &compaction_job_stats_->smallest_output_key_prefix);
 | |
|       CopyPrefix(
 | |
|           compact_->LargestUserKey(),
 | |
|           CompactionJobStats::kMaxPrefixLength,
 | |
|           &compaction_job_stats_->largest_output_key_prefix);
 | |
|     }
 | |
|   }
 | |
| #endif  // !ROCKSDB_LITE
 | |
| }
 | |
| 
 | |
| void CompactionJob::LogCompaction() {
 | |
|   Compaction* compaction = compact_->compaction;
 | |
|   ColumnFamilyData* cfd = compaction->column_family_data();
 | |
| 
 | |
|   // Let's check if anything will get logged. Don't prepare all the info if
 | |
|   // we're not logging
 | |
|   if (db_options_.info_log_level <= InfoLogLevel::INFO_LEVEL) {
 | |
|     Compaction::InputLevelSummaryBuffer inputs_summary;
 | |
|     ROCKS_LOG_INFO(
 | |
|         db_options_.info_log, "[%s] [JOB %d] Compacting %s, score %.2f",
 | |
|         cfd->GetName().c_str(), job_id_,
 | |
|         compaction->InputLevelSummary(&inputs_summary), compaction->score());
 | |
|     char scratch[2345];
 | |
|     compaction->Summary(scratch, sizeof(scratch));
 | |
|     ROCKS_LOG_INFO(db_options_.info_log, "[%s] Compaction start summary: %s\n",
 | |
|                    cfd->GetName().c_str(), scratch);
 | |
|     // build event logger report
 | |
|     auto stream = event_logger_->Log();
 | |
|     stream << "job" << job_id_ << "event"
 | |
|            << "compaction_started";
 | |
|     for (size_t i = 0; i < compaction->num_input_levels(); ++i) {
 | |
|       stream << ("files_L" + ToString(compaction->level(i)));
 | |
|       stream.StartArray();
 | |
|       for (auto f : *compaction->inputs(i)) {
 | |
|         stream << f->fd.GetNumber();
 | |
|       }
 | |
|       stream.EndArray();
 | |
|     }
 | |
|     stream << "score" << compaction->score() << "input_data_size"
 | |
|            << compaction->CalculateTotalInputSize();
 | |
|   }
 | |
| }
 | |
| 
 | |
| }  // namespace rocksdb
 | |
| 
 |