diff --git a/db/compaction.cc b/db/compaction.cc new file mode 100644 index 000000000..703e7aeae --- /dev/null +++ b/db/compaction.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction.h" + +namespace rocksdb { + +static uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) + : level_(level), + out_level_(out_level), + max_output_file_size_(target_file_size), + maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), + seek_compaction_(seek_compaction), + enable_compression_(enable_compression), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + is_full_compaction_(false), + level_ptrs_(std::vector(number_levels_)) { + + input_version_->Ref(); + edit_ = new VersionEdit(); + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + delete edit_; + if (input_version_ != nullptr) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If level_== out_level_, the purpose is to force compaction filter to be + // applied to that level, and thus cannot be a trivia move. + return (level_ != out_level_ && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + return bottommost_level_; + } + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const Slice& internal_key) { + // Scan to find earliest grandparent file that contains key. + const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + assert(grandparent_index_ + 1 >= grandparents_.size() || + icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), + grandparents_[grandparent_index_+1]->smallest.Encode()) + < 0); + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool value) { + for (int i = 0; i < 2; i++) { + std::vector v = inputs_[i]; + for (unsigned int j = 0; j < inputs_[i].size(); j++) { + assert(value ? !inputs_[i][j]->being_compacted : + inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = value; + } + } +} + +// Is this compaction producing files at the bottommost level? +void Compaction::SetupBottomMostLevel(bool isManual) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + // If universal compaction style is used and manual + // compaction is occuring, then we are guaranteed that + // all files will be picked in a single compaction + // run. We can safely set bottommost_level_ = true. + // If it is not manual compaction, then bottommost_level_ + // is already set when the Compaction was created. + if (isManual) { + bottommost_level_ = true; + } + return; + } + bottommost_level_ = true; + int num_levels = input_version_->vset_->NumberLevels(); + for (int i = output_level() + 1; i < num_levels; i++) { + if (input_version_->NumLevelFiles(i) > 0) { + bottommost_level_ = false; + break; + } + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != nullptr) { + input_version_->Unref(); + input_version_ = nullptr; + } +} + +void Compaction::ResetNextCompactionIndex() { + input_version_->ResetNextCompactionIndex(level_); +} + +static void InputSummary(std::vector& files, char* output, + int len) { + int write = 0; + for (unsigned int i = 0; i < files.size(); i++) { + int sz = len - write; + int ret = snprintf(output + write, sz, "%lu(%lu) ", + (unsigned long)files.at(i)->number, + (unsigned long)files.at(i)->file_size); + if (ret < 0 || ret >= sz) + break; + write += ret; + } +} + +void Compaction::Summary(char* output, int len) { + int write = snprintf(output, len, + "Base version %lu Base level %d, seek compaction:%d, inputs:", + (unsigned long)input_version_->GetVersionNumber(), + level_, + seek_compaction_); + if (write < 0 || write > len) { + return; + } + + char level_low_summary[100]; + InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); + char level_up_summary[100]; + if (inputs_[1].size()) { + InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); + } else { + level_up_summary[0] = '\0'; + } + + snprintf(output + write, len - write, "[%s],[%s]", + level_low_summary, level_up_summary); +} + +} // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h new file mode 100644 index 000000000..4cc0197da --- /dev/null +++ b/db/compaction.h @@ -0,0 +1,131 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" + +namespace rocksdb { + +class Version; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // will be merged. + int level() const { return level_; } + + // Outputs will go to this level + int output_level() const { return out_level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Whether compression will be enabled for compaction outputs + bool enable_compression() const { return enable_compression_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". + bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool BottomMostLevel() { return bottommost_level_; } + + // Does this compaction include all sst files? + bool IsFullCompaction() { return is_full_compaction_; } + + private: + friend class Version; + friend class VersionSet; + + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); + + int level_; + int out_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t maxGrandParentOverlapBytes_; + Version* input_version_; + VersionEdit* edit_; + int number_levels_; + + bool seek_compaction_; + bool enable_compression_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + uint64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + int base_index_; // index of the file in files_[level_] + int parent_index_; // index of some file with same range in files_[level_+1] + double score_; // score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + // Does this compaction include all sst files? + bool is_full_compaction_; + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). + std::vector level_ptrs_; + + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool); + + // Initialize whether compaction producing files at the bottommost level + void SetupBottomMostLevel(bool isManual); + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); +}; + +} // namespace rocksdb diff --git a/db/version_set.cc b/db/version_set.cc index eb20650ba..05e7c7053 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -18,6 +18,7 @@ #include "db/memtable.h" #include "db/merge_context.h" #include "db/table_cache.h" +#include "db/compaction.h" #include "rocksdb/env.h" #include "rocksdb/merge_operator.h" #include "rocksdb/table.h" @@ -2953,196 +2954,4 @@ Compaction* VersionSet::CompactRange(int input_level, return c; } -Compaction::Compaction(Version* input_version, int level, int out_level, - uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, - bool seek_compaction, bool enable_compression) - : level_(level), - out_level_(out_level), - max_output_file_size_(target_file_size), - maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), - input_version_(input_version), - number_levels_(input_version_->NumberLevels()), - seek_compaction_(seek_compaction), - enable_compression_(enable_compression), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0), - base_index_(-1), - parent_index_(-1), - score_(0), - bottommost_level_(false), - is_full_compaction_(false), - level_ptrs_(std::vector(number_levels_)) { - - input_version_->Ref(); - edit_ = new VersionEdit(); - for (int i = 0; i < number_levels_; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - delete edit_; - if (input_version_ != nullptr) { - input_version_->Unref(); - } -} - -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. - // If level_== out_level_, the purpose is to force compaction filter to be - // applied to that level, and thus cannot be a trivia move. - return (level_ != out_level_ && - num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - return bottommost_level_; - } - // Maybe use binary search to find right entry instead of linear search? - const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const Slice& internal_key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(internal_key, - grandparents_[grandparent_index_]->largest.Encode()) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - assert(grandparent_index_ + 1 >= grandparents_.size() || - icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), - grandparents_[grandparent_index_+1]->smallest.Encode()) - < 0); - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -// Mark (or clear) each file that is being compacted -void Compaction::MarkFilesBeingCompacted(bool value) { - for (int i = 0; i < 2; i++) { - std::vector v = inputs_[i]; - for (unsigned int j = 0; j < inputs_[i].size(); j++) { - assert(value ? !inputs_[i][j]->being_compacted : - inputs_[i][j]->being_compacted); - inputs_[i][j]->being_compacted = value; - } - } -} - -// Is this compaction producing files at the bottommost level? -void Compaction::SetupBottomMostLevel(bool isManual) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - // If universal compaction style is used and manual - // compaction is occuring, then we are guaranteed that - // all files will be picked in a single compaction - // run. We can safely set bottommost_level_ = true. - // If it is not manual compaction, then bottommost_level_ - // is already set when the Compaction was created. - if (isManual) { - bottommost_level_ = true; - } - return; - } - bottommost_level_ = true; - int num_levels = input_version_->vset_->NumberLevels(); - for (int i = output_level() + 1; i < num_levels; i++) { - if (input_version_->NumLevelFiles(i) > 0) { - bottommost_level_ = false; - break; - } - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != nullptr) { - input_version_->Unref(); - input_version_ = nullptr; - } -} - -void Compaction::ResetNextCompactionIndex() { - input_version_->ResetNextCompactionIndex(level_); -} - -static void InputSummary(std::vector& files, char* output, - int len) { - int write = 0; - for (unsigned int i = 0; i < files.size(); i++) { - int sz = len - write; - int ret = snprintf(output + write, sz, "%lu(%lu) ", - (unsigned long)files.at(i)->number, - (unsigned long)files.at(i)->file_size); - if (ret < 0 || ret >= sz) - break; - write += ret; - } -} - -void Compaction::Summary(char* output, int len) { - int write = snprintf(output, len, - "Base version %lu Base level %d, seek compaction:%d, inputs:", - (unsigned long)input_version_->GetVersionNumber(), - level_, - seek_compaction_); - if (write < 0 || write > len) { - return; - } - - char level_low_summary[100]; - InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); - char level_up_summary[100]; - if (inputs_[1].size()) { - InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); - } else { - level_up_summary[0] = '\0'; - } - - snprintf(output + write, len - write, "[%s],[%s]", - level_low_summary, level_up_summary); -} - } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index 51f6d9b6c..319067d1a 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -27,6 +27,7 @@ #include "db/version_edit.h" #include "port/port.h" #include "db/table_cache.h" +#include "db/compaction.h" namespace rocksdb { @@ -546,118 +547,4 @@ class VersionSet { VersionEdit* edit, port::Mutex* mu); }; -// A Compaction encapsulates information about a compaction. -class Compaction { - public: - ~Compaction(); - - // Return the level that is being compacted. Inputs from "level" - // will be merged. - int level() const { return level_; } - - // Outputs will go to this level - int output_level() const { return out_level_; } - - // Return the object that holds the edits to the descriptor done - // by this compaction. - VersionEdit* edit() { return edit_; } - - // "which" must be either 0 or 1 - int num_input_files(int which) const { return inputs_[which].size(); } - - // Return the ith input file at "level()+which" ("which" must be 0 or 1). - FileMetaData* input(int which, int i) const { return inputs_[which][i]; } - - // Maximum size of files to build during this compaction. - uint64_t MaxOutputFileSize() const { return max_output_file_size_; } - - // Whether compression will be enabled for compaction outputs - bool enable_compression() const { return enable_compression_; } - - // Is this a trivial compaction that can be implemented by just - // moving a single input file to the next level (no merging or splitting) - bool IsTrivialMove() const; - - // Add all inputs to this compaction as delete operations to *edit. - void AddInputDeletions(VersionEdit* edit); - - // Returns true if the information we have available guarantees that - // the compaction is producing data in "level+1" for which no data exists - // in levels greater than "level+1". - bool IsBaseLevelForKey(const Slice& user_key); - - // Returns true iff we should stop building the current output - // before processing "internal_key". - bool ShouldStopBefore(const Slice& internal_key); - - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - - void Summary(char* output, int len); - - // Return the score that was used to pick this compaction run. - double score() const { return score_; } - - // Is this compaction creating a file in the bottom most level? - bool BottomMostLevel() { return bottommost_level_; } - - // Does this compaction include all sst files? - bool IsFullCompaction() { return is_full_compaction_; } - - private: - friend class Version; - friend class VersionSet; - - Compaction(Version* input_version, int level, int out_level, - uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, - bool seek_compaction = false, bool enable_compression = true); - - int level_; - int out_level_; // levels to which output files are stored - uint64_t max_output_file_size_; - uint64_t maxGrandParentOverlapBytes_; - Version* input_version_; - VersionEdit* edit_; - int number_levels_; - - bool seek_compaction_; - bool enable_compression_; - - // Each compaction reads inputs from "level_" and "level_+1" - std::vector inputs_[2]; // The two sets of inputs - - // State used to check for number of of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) - std::vector grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - uint64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files - int base_index_; // index of the file in files_[level_] - int parent_index_; // index of some file with same range in files_[level_+1] - double score_; // score that was used to pick this compaction. - - // Is this compaction creating a file in the bottom most level? - bool bottommost_level_; - // Does this compaction include all sst files? - bool is_full_compaction_; - - // level_ptrs_ holds indices into input_version_->levels_: our state - // is that we are positioned at one of the file ranges for each - // higher level than the ones involved in this compaction (i.e. for - // all L >= level_ + 2). - std::vector level_ptrs_; - - // mark (or clear) all files that are being compacted - void MarkFilesBeingCompacted(bool); - - // Initialize whether compaction producing files at the bottommost level - void SetupBottomMostLevel(bool isManual); - - // In case of compaction error, reset the nextIndex that is used - // to pick up the next file to be compacted from files_by_size_ - void ResetNextCompactionIndex(); -}; - } // namespace rocksdb