diff --git a/.clang-format b/.clang-format index a1e9a48e4..7c279811a 100644 --- a/.clang-format +++ b/.clang-format @@ -2,46 +2,4 @@ # http://clang.llvm.org/docs/ClangFormatStyleOptions.html --- BasedOnStyle: Google -AccessModifierOffset: -1 -ConstructorInitializerIndentWidth: 4 -AlignEscapedNewlinesLeft: true -AlignTrailingComments: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortIfStatementsOnASingleLine: false -AllowShortLoopsOnASingleLine: false -AlwaysBreakTemplateDeclarations: true -AlwaysBreakBeforeMultilineStrings: true -BreakBeforeBinaryOperators: false -BreakConstructorInitializersBeforeComma: false -BinPackParameters: false -ColumnLimit: 80 -ConstructorInitializerAllOnOneLineOrOnePerLine: true -DerivePointerBinding: true -ExperimentalAutoDetectBinPacking: true -IndentCaseLabels: false -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCSpaceBeforeProtocolList: false -PenaltyBreakBeforeFirstCallParameter: 10 -PenaltyBreakComment: 60 -PenaltyBreakString: 1000 -PenaltyBreakFirstLessLess: 20 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 200 -PointerBindsToType: true -SpacesBeforeTrailingComments: 2 -Cpp11BracedListStyle: true -Standard: Cpp11 -IndentWidth: 2 -TabWidth: 8 -UseTab: Never -BreakBeforeBraces: Attach -IndentFunctionDeclarationAfterType: false -SpacesInParentheses: false -SpacesInAngles: false -SpaceInEmptyParentheses: false -SpacesInCStyleCastParentheses: false -SpaceAfterControlStatementKeyword: true -SpaceBeforeAssignmentOperators: true -ContinuationIndentWidth: 4 ... 
diff --git a/Makefile b/Makefile index 6a5995b28..ab13ac0d5 100644 --- a/Makefile +++ b/Makefile @@ -128,19 +128,21 @@ $(SHARED2): $(SHARED3) ln -fs $(SHARED3) $(SHARED2) endif -$(SHARED3): - $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $@ $(LDFLAGS) +$(SHARED3): $(LIBOBJECTS) + $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES)-o $@ endif # PLATFORM_SHARED_EXT all: $(LIBRARY) $(PROGRAMS) .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ - release tags valgrind_check whitebox_crash_test + release tags valgrind_check whitebox_crash_test format +# Will also generate shared libraries. release: $(MAKE) clean - OPT=-DNDEBUG $(MAKE) -j32 + OPT=-DNDEBUG $(MAKE) all -j32 + OPT=-DNDEBUG $(MAKE) $(SHARED) -j32 coverage: $(MAKE) clean @@ -197,6 +199,9 @@ tags: ctags * -R cscope -b `find . -name '*.cc'` `find . -name '*.h'` +format: + build_tools/format-diff.sh + # --------------------------------------------------------------------------- # Unit tests and tools # --------------------------------------------------------------------------- @@ -415,6 +420,12 @@ DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d)) depend: $(DEPFILES) +# if the make goal is either "clean" or "format", we shouldn't +# try to import the *.d files. +# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly +# working solution. 
ifneq ($(MAKECMDGOALS),clean) +ifneq ($(MAKECMDGOALS),format) -include $(DEPFILES) endif +endif diff --git a/build_tools/build_detect_platform b/build_tools/build_detect_platform index 87c4c871d..8e83ae497 100755 --- a/build_tools/build_detect_platform +++ b/build_tools/build_detect_platform @@ -81,9 +81,9 @@ PLATFORM_CCFLAGS= PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS" PLATFORM_SHARED_EXT="so" -PLATFORM_SHARED_LDFLAGS="${EXEC_LDFLAGS_SHARED} -shared -Wl,-soname -Wl," +PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl," PLATFORM_SHARED_CFLAGS="-fPIC" -PLATFORM_SHARED_VERSIONED=true +PLATFORM_SHARED_VERSIONED=false # generic port files (working on all platform by #ifdef) go directly in /port GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` diff --git a/build_tools/fbcode.gcc481.sh b/build_tools/fbcode.gcc481.sh index ae2bb57da..e8c9f090b 100644 --- a/build_tools/fbcode.gcc481.sh +++ b/build_tools/fbcode.gcc481.sh @@ -60,7 +60,7 @@ AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" -CFLAGS+=" -nostdlib $LIBGCC_INCLUDE $GLIBC_INCLUDE" +CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE" CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2" diff --git a/build_tools/format-diff.sh b/build_tools/format-diff.sh new file mode 100755 index 000000000..ceae38192 --- /dev/null +++ b/build_tools/format-diff.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# If clang_format_diff.py command is not specfied, we assume we are able to +# access directly without any path. +if [ -z $CLANG_FORMAT_DIFF ] +then +CLANG_FORMAT_DIFF="clang-format-diff.py" +fi + +# Check clang-format-diff.py +if ! 
which $CLANG_FORMAT_DIFF &> /dev/null +then + echo "You didn't have clang-format-diff.py available in your computer!" + echo "You can download it by running: " + echo " curl http://goo.gl/iUW1u2" + exit 128 +fi + +# Check argparse, a library that clang-format-diff.py requires. +python 2>/dev/null << EOF +import argparse +EOF + +if [ "$?" != 0 ] +then + echo "To run clang-format-diff.py, we'll need the library "argparse" to be" + echo "installed. You can try either of the follow ways to install it:" + echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse" + echo " 2. easy_install argparse (if you have easy_install)" + echo " 3. pip install argparse (if you have pip)" + exit 129 +fi + +# TODO(kailiu) following work is not complete since we still need to figure +# out how to add the modified files done pre-commit hook to git's commit index. +# +# Check if this script has already been added to pre-commit hook. +# Will suggest user to add this script to pre-commit hook if their pre-commit +# is empty. +# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit" +# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null +# then +# echo "Would you like to add this script to pre-commit hook, which will do " +# echo -n "the format check for all the affected lines before you check in (y/n):" +# read add_to_hook +# if [ "$add_to_hook" == "y" ] +# then +# ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH +# fi +# fi + +set -e + +uncommitted_code=`git diff HEAD` + +# If there's no uncommitted changes, we assume user are doing post-commit +# format check, in which case we'll check the modified lines from latest commit. +# Otherwise, we'll check format of the uncommitted code only. 
+format_last_commit=0 +if [ -z "$uncommitted_code" ] +then + # Check the format of last commit + diffs=$(git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -p 1) +else + # Check the format of uncommitted lines. + diffs=$(git diff -U0 HEAD | $CLANG_FORMAT_DIFF -p 1) +fi + +if [ -z "$diffs" ] +then + echo "Nothing needs to be reformatted!" + exit 0 +fi + +# Highlight the insertion/deletion from the clang-format-diff.py's output +COLOR_END="\033[0m" +COLOR_RED="\033[0;31m" +COLOR_GREEN="\033[0;32m" + +echo -e "Detect lines that doesn't follow the format rules:\r" +# Add the color to the diff. lines added will be green; lines removed will be red. +echo "$diffs" | + sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" | + sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/" +echo -e "Would you like to fix the format automatically (y/n): \c" + +# Make sure under any mode, we can read user input. +exec < /dev/tty +read to_fix + +if [ "$to_fix" != "y" ] +then + exit 1 +fi + +# Do in-place format adjustment. +git diff -U0 HEAD^ | $CLANG_FORMAT_DIFF -i -p 1 +echo "Files reformatted!" + +# Amend to last commit if the user does the post-commit format check +if [ -z "$uncommitted_code" ]; then + echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c" + read to_amend + + if [ "$to_amend" == "y" ] + then + git commit -a --amend --reuse-message HEAD + echo "Amended to last commit" + fi +fi diff --git a/db/compaction.cc b/db/compaction.cc new file mode 100644 index 000000000..703e7aeae --- /dev/null +++ b/db/compaction.cc @@ -0,0 +1,214 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#include "db/compaction.h" + +namespace rocksdb { + +static uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +Compaction::Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, + uint64_t max_grandparent_overlap_bytes, + bool seek_compaction, bool enable_compression) + : level_(level), + out_level_(out_level), + max_output_file_size_(target_file_size), + maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), + input_version_(input_version), + number_levels_(input_version_->NumberLevels()), + seek_compaction_(seek_compaction), + enable_compression_(enable_compression), + grandparent_index_(0), + seen_key_(false), + overlapped_bytes_(0), + base_index_(-1), + parent_index_(-1), + score_(0), + bottommost_level_(false), + is_full_compaction_(false), + level_ptrs_(std::vector(number_levels_)) { + + input_version_->Ref(); + edit_ = new VersionEdit(); + for (int i = 0; i < number_levels_; i++) { + level_ptrs_[i] = 0; + } +} + +Compaction::~Compaction() { + delete edit_; + if (input_version_ != nullptr) { + input_version_->Unref(); + } +} + +bool Compaction::IsTrivialMove() const { + // Avoid a move if there is lots of overlapping grandparent data. + // Otherwise, the move could create a parent file that will require + // a very expensive merge later on. + // If level_== out_level_, the purpose is to force compaction filter to be + // applied to that level, and thus cannot be a trivia move. 
+ return (level_ != out_level_ && + num_input_files(0) == 1 && + num_input_files(1) == 0 && + TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); +} + +void Compaction::AddInputDeletions(VersionEdit* edit) { + for (int which = 0; which < 2; which++) { + for (size_t i = 0; i < inputs_[which].size(); i++) { + edit->DeleteFile(level_ + which, inputs_[which][i]->number); + } + } +} + +bool Compaction::IsBaseLevelForKey(const Slice& user_key) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + return bottommost_level_; + } + // Maybe use binary search to find right entry instead of linear search? + const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); + for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { + const std::vector& files = input_version_->files_[lvl]; + for (; level_ptrs_[lvl] < files.size(); ) { + FileMetaData* f = files[level_ptrs_[lvl]]; + if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { + // We've advanced far enough + if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { + // Key falls in this file's range, so definitely not base level + return false; + } + break; + } + level_ptrs_[lvl]++; + } + } + return true; +} + +bool Compaction::ShouldStopBefore(const Slice& internal_key) { + // Scan to find earliest grandparent file that contains key. 
+ const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; + while (grandparent_index_ < grandparents_.size() && + icmp->Compare(internal_key, + grandparents_[grandparent_index_]->largest.Encode()) > 0) { + if (seen_key_) { + overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; + } + assert(grandparent_index_ + 1 >= grandparents_.size() || + icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), + grandparents_[grandparent_index_+1]->smallest.Encode()) + < 0); + grandparent_index_++; + } + seen_key_ = true; + + if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { + // Too much overlap for current output; start new output + overlapped_bytes_ = 0; + return true; + } else { + return false; + } +} + +// Mark (or clear) each file that is being compacted +void Compaction::MarkFilesBeingCompacted(bool value) { + for (int i = 0; i < 2; i++) { + std::vector v = inputs_[i]; + for (unsigned int j = 0; j < inputs_[i].size(); j++) { + assert(value ? !inputs_[i][j]->being_compacted : + inputs_[i][j]->being_compacted); + inputs_[i][j]->being_compacted = value; + } + } +} + +// Is this compaction producing files at the bottommost level? +void Compaction::SetupBottomMostLevel(bool isManual) { + if (input_version_->vset_->options_->compaction_style == + kCompactionStyleUniversal) { + // If universal compaction style is used and manual + // compaction is occuring, then we are guaranteed that + // all files will be picked in a single compaction + // run. We can safely set bottommost_level_ = true. + // If it is not manual compaction, then bottommost_level_ + // is already set when the Compaction was created. 
+ if (isManual) { + bottommost_level_ = true; + } + return; + } + bottommost_level_ = true; + int num_levels = input_version_->vset_->NumberLevels(); + for (int i = output_level() + 1; i < num_levels; i++) { + if (input_version_->NumLevelFiles(i) > 0) { + bottommost_level_ = false; + break; + } + } +} + +void Compaction::ReleaseInputs() { + if (input_version_ != nullptr) { + input_version_->Unref(); + input_version_ = nullptr; + } +} + +void Compaction::ResetNextCompactionIndex() { + input_version_->ResetNextCompactionIndex(level_); +} + +static void InputSummary(std::vector& files, char* output, + int len) { + int write = 0; + for (unsigned int i = 0; i < files.size(); i++) { + int sz = len - write; + int ret = snprintf(output + write, sz, "%lu(%lu) ", + (unsigned long)files.at(i)->number, + (unsigned long)files.at(i)->file_size); + if (ret < 0 || ret >= sz) + break; + write += ret; + } +} + +void Compaction::Summary(char* output, int len) { + int write = snprintf(output, len, + "Base version %lu Base level %d, seek compaction:%d, inputs:", + (unsigned long)input_version_->GetVersionNumber(), + level_, + seek_compaction_); + if (write < 0 || write > len) { + return; + } + + char level_low_summary[100]; + InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); + char level_up_summary[100]; + if (inputs_[1].size()) { + InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); + } else { + level_up_summary[0] = '\0'; + } + + snprintf(output + write, len - write, "[%s],[%s]", + level_low_summary, level_up_summary); +} + +} // namespace rocksdb diff --git a/db/compaction.h b/db/compaction.h new file mode 100644 index 000000000..5e696a053 --- /dev/null +++ b/db/compaction.h @@ -0,0 +1,134 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. 
An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" + +namespace rocksdb { + +class Version; + +// A Compaction encapsulates information about a compaction. +class Compaction { + public: + ~Compaction(); + + // Return the level that is being compacted. Inputs from "level" + // will be merged. + int level() const { return level_; } + + // Outputs will go to this level + int output_level() const { return out_level_; } + + // Return the object that holds the edits to the descriptor done + // by this compaction. + VersionEdit* edit() { return edit_; } + + // "which" must be either 0 or 1 + int num_input_files(int which) const { return inputs_[which].size(); } + + // Return the ith input file at "level()+which" ("which" must be 0 or 1). + FileMetaData* input(int which, int i) const { return inputs_[which][i]; } + + // Maximum size of files to build during this compaction. + uint64_t MaxOutputFileSize() const { return max_output_file_size_; } + + // Whether compression will be enabled for compaction outputs + bool enable_compression() const { return enable_compression_; } + + // Is this a trivial compaction that can be implemented by just + // moving a single input file to the next level (no merging or splitting) + bool IsTrivialMove() const; + + // Add all inputs to this compaction as delete operations to *edit. + void AddInputDeletions(VersionEdit* edit); + + // Returns true if the information we have available guarantees that + // the compaction is producing data in "level+1" for which no data exists + // in levels greater than "level+1". 
+ bool IsBaseLevelForKey(const Slice& user_key); + + // Returns true iff we should stop building the current output + // before processing "internal_key". + bool ShouldStopBefore(const Slice& internal_key); + + // Release the input version for the compaction, once the compaction + // is successful. + void ReleaseInputs(); + + void Summary(char* output, int len); + + // Return the score that was used to pick this compaction run. + double score() const { return score_; } + + // Is this compaction creating a file in the bottom most level? + bool BottomMostLevel() { return bottommost_level_; } + + // Does this compaction include all sst files? + bool IsFullCompaction() { return is_full_compaction_; } + + private: + friend class Version; + friend class VersionSet; + friend class CompactionPicker; + friend class UniversalCompactionPicker; + friend class LevelCompactionPicker; + + Compaction(Version* input_version, int level, int out_level, + uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes, + bool seek_compaction = false, bool enable_compression = true); + + int level_; + int out_level_; // levels to which output files are stored + uint64_t max_output_file_size_; + uint64_t maxGrandParentOverlapBytes_; + Version* input_version_; + VersionEdit* edit_; + int number_levels_; + + bool seek_compaction_; + bool enable_compression_; + + // Each compaction reads inputs from "level_" and "level_+1" + std::vector inputs_[2]; // The two sets of inputs + + // State used to check for number of of overlapping grandparent files + // (parent == level_ + 1, grandparent == level_ + 2) + std::vector grandparents_; + size_t grandparent_index_; // Index in grandparent_starts_ + bool seen_key_; // Some output key has been seen + uint64_t overlapped_bytes_; // Bytes of overlap between current output + // and grandparent files + int base_index_; // index of the file in files_[level_] + int parent_index_; // index of some file with same range in files_[level_+1] + double score_; 
// score that was used to pick this compaction. + + // Is this compaction creating a file in the bottom most level? + bool bottommost_level_; + // Does this compaction include all sst files? + bool is_full_compaction_; + + // level_ptrs_ holds indices into input_version_->levels_: our state + // is that we are positioned at one of the file ranges for each + // higher level than the ones involved in this compaction (i.e. for + // all L >= level_ + 2). + std::vector level_ptrs_; + + // mark (or clear) all files that are being compacted + void MarkFilesBeingCompacted(bool); + + // Initialize whether compaction producing files at the bottommost level + void SetupBottomMostLevel(bool isManual); + + // In case of compaction error, reset the nextIndex that is used + // to pick up the next file to be compacted from files_by_size_ + void ResetNextCompactionIndex(); +}; + +} // namespace rocksdb diff --git a/db/compaction_picker.cc b/db/compaction_picker.cc new file mode 100644 index 000000000..cfa3770d7 --- /dev/null +++ b/db/compaction_picker.cc @@ -0,0 +1,847 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. +// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. 
+ +#include "db/compaction_picker.h" +#include "util/statistics.h" + +namespace rocksdb { + +namespace { + +uint64_t TotalFileSize(const std::vector& files) { + uint64_t sum = 0; + for (size_t i = 0; i < files.size() && files[i]; i++) { + sum += files[i]->file_size; + } + return sum; +} + +} // anonymous namespace + +CompactionPicker::CompactionPicker(const Options* options, + const InternalKeyComparator* icmp) + : compactions_in_progress_(options->num_levels), + options_(options), + num_levels_(options->num_levels), + icmp_(icmp) { + Init(); +} + +void CompactionPicker::ReduceNumberOfLevels(int new_levels) { + num_levels_ = new_levels; + Init(); +} + +void CompactionPicker::Init() { + max_file_size_.reset(new uint64_t[NumberLevels()]); + level_max_bytes_.reset(new uint64_t[NumberLevels()]); + int target_file_size_multiplier = options_->target_file_size_multiplier; + int max_bytes_multiplier = options_->max_bytes_for_level_multiplier; + for (int i = 0; i < NumberLevels(); i++) { + if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) { + max_file_size_[i] = ULLONG_MAX; + level_max_bytes_[i] = options_->max_bytes_for_level_base; + } else if (i > 1) { + max_file_size_[i] = max_file_size_[i - 1] * target_file_size_multiplier; + level_max_bytes_[i] = + level_max_bytes_[i - 1] * max_bytes_multiplier * + options_->max_bytes_for_level_multiplier_additional[i - 1]; + } else { + max_file_size_[i] = options_->target_file_size_base; + level_max_bytes_[i] = options_->max_bytes_for_level_base; + } + } +} + +CompactionPicker::~CompactionPicker() {} + +void CompactionPicker::SizeBeingCompacted(std::vector& sizes) { + for (int level = 0; level < NumberLevels() - 1; level++) { + uint64_t total = 0; + for (auto c : compactions_in_progress_[level]) { + assert(c->level() == level); + for (int i = 0; i < c->num_input_files(0); i++) { + total += c->input(0,i)->file_size; + } + } + sizes[level] = total; + } +} + +// Clear all files to indicate that they are not being 
compacted +// Delete this compaction from the list of running compactions. +void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) { + c->MarkFilesBeingCompacted(false); + compactions_in_progress_[c->level()].erase(c); + if (!status.ok()) { + c->ResetNextCompactionIndex(); + } +} + +uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return max_file_size_[level]; +} + +uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) { + uint64_t result = MaxFileSizeForLevel(level); + result *= options_->max_grandparent_overlap_factor; + return result; +} + +double CompactionPicker::MaxBytesForLevel(int level) { + // Note: the result for level zero is not really used since we set + // the level-0 compaction threshold based on number of files. + assert(level >= 0); + assert(level < NumberLevels()); + return level_max_bytes_[level]; +} + +void CompactionPicker::GetRange(const std::vector& inputs, + InternalKey* smallest, InternalKey* largest) { + assert(!inputs.empty()); + smallest->Clear(); + largest->Clear(); + for (size_t i = 0; i < inputs.size(); i++) { + FileMetaData* f = inputs[i]; + if (i == 0) { + *smallest = f->smallest; + *largest = f->largest; + } else { + if (icmp_->Compare(f->smallest, *smallest) < 0) { + *smallest = f->smallest; + } + if (icmp_->Compare(f->largest, *largest) > 0) { + *largest = f->largest; + } + } + } +} + +void CompactionPicker::GetRange(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, InternalKey* largest) { + std::vector all = inputs1; + all.insert(all.end(), inputs2.begin(), inputs2.end()); + GetRange(all, smallest, largest); +} + +bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) { + // If inputs are empty then there is nothing to expand. + if (!c || c->inputs_[0].empty()) { + return true; + } + + // GetOverlappingInputs will always do the right thing for level-0. 
+ // So we don't need to do any expansion if level == 0. + if (c->level() == 0) { + return true; + } + + const int level = c->level(); + InternalKey smallest, largest; + + // Keep expanding c->inputs_[0] until we are sure that there is a + // "clean cut" boundary between the files in input and the surrounding files. + // This will ensure that no parts of a key are lost during compaction. + int hint_index = -1; + size_t old_size; + do { + old_size = c->inputs_[0].size(); + GetRange(c->inputs_[0], &smallest, &largest); + c->inputs_[0].clear(); + c->input_version_->GetOverlappingInputs( + level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); + } while(c->inputs_[0].size() > old_size); + + // Get the new range + GetRange(c->inputs_[0], &smallest, &largest); + + // If, after the expansion, there are files that are already under + // compaction, then we must drop/cancel this compaction. + int parent_index = -1; + if (FilesInCompaction(c->inputs_[0]) || + (c->level() != c->output_level() && + ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + &parent_index))) { + c->inputs_[0].clear(); + c->inputs_[1].clear(); + return false; + } + return true; +} + +uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) { + uint64_t result = MaxFileSizeForLevel(level); + result *= options_->expanded_compaction_factor; + return result; +} + +// Returns true if any one of specified files are being compacted +bool CompactionPicker::FilesInCompaction(std::vector& files) { + for (unsigned int i = 0; i < files.size(); i++) { + if (files[i]->being_compacted) { + return true; + } + } + return false; +} + +// Returns true if any one of the parent files are being compacted +bool CompactionPicker::ParentRangeInCompaction(Version* version, + const InternalKey* smallest, + const InternalKey* largest, + int level, int* parent_index) { + std::vector inputs; + assert(level + 1 < NumberLevels()); + + version->GetOverlappingInputs(level + 1, smallest, 
largest, &inputs, + *parent_index, parent_index); + return FilesInCompaction(inputs); +} + +// Populates the set of inputs from "level+1" that overlap with "level". +// Will also attempt to expand "level" if that doesn't expand "level+1" +// or cause "level" to include a file for compaction that has an overlapping +// user-key with another file. +void CompactionPicker::SetupOtherInputs(Compaction* c) { + // If inputs are empty, then there is nothing to expand. + // If both input and output levels are the same, no need to consider + // files at level "level+1" + if (c->inputs_[0].empty() || c->level() == c->output_level()) { + return; + } + + const int level = c->level(); + InternalKey smallest, largest; + + // Get the range one last time. + GetRange(c->inputs_[0], &smallest, &largest); + + // Populate the set of next-level files (inputs_[1]) to include in compaction + c->input_version_->GetOverlappingInputs(level + 1, &smallest, &largest, + &c->inputs_[1], c->parent_index_, + &c->parent_index_); + + // Get entire range covered by compaction + InternalKey all_start, all_limit; + GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + + // See if we can further grow the number of inputs in "level" without + // changing the number of "level+1" files we pick up. We also choose NOT + // to expand if this would cause "level" to include some entries for some + // user key, while excluding other entries for the same user key. This + // can happen when one user key spans multiple files. 
+ if (!c->inputs_[1].empty()) { + std::vector expanded0; + c->input_version_->GetOverlappingInputs( + level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); + const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); + const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); + const uint64_t expanded0_size = TotalFileSize(expanded0); + uint64_t limit = ExpandedCompactionByteSizeLimit(level); + if (expanded0.size() > c->inputs_[0].size() && + inputs1_size + expanded0_size < limit && + !FilesInCompaction(expanded0) && + !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { + InternalKey new_start, new_limit; + GetRange(expanded0, &new_start, &new_limit); + std::vector expanded1; + c->input_version_->GetOverlappingInputs(level + 1, &new_start, &new_limit, + &expanded1, c->parent_index_, + &c->parent_index_); + if (expanded1.size() == c->inputs_[1].size() && + !FilesInCompaction(expanded1)) { + Log(options_->info_log, + "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" + "\n", + (unsigned long)level, + (unsigned long)(c->inputs_[0].size()), + (unsigned long)(c->inputs_[1].size()), + (unsigned long)inputs0_size, + (unsigned long)inputs1_size, + (unsigned long)(expanded0.size()), + (unsigned long)(expanded1.size()), + (unsigned long)expanded0_size, + (unsigned long)inputs1_size); + smallest = new_start; + largest = new_limit; + c->inputs_[0] = expanded0; + c->inputs_[1] = expanded1; + GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); + } + } + } + + // Compute the set of grandparent files that overlap this compaction + // (parent == level+1; grandparent == level+2) + if (level + 2 < NumberLevels()) { + c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit, + &c->grandparents_); + } +} + + +Compaction* CompactionPicker::CompactRange(Version* version, int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { + std::vector 
inputs; + bool covering_the_whole_range = true; + + // All files are 'overlapping' in universal style compaction. + // We have to compact the entire range in one shot. + if (options_->compaction_style == kCompactionStyleUniversal) { + begin = nullptr; + end = nullptr; + } + version->GetOverlappingInputs(input_level, begin, end, &inputs); + if (inputs.empty()) { + return nullptr; + } + + // Avoid compacting too much in one shot in case the range is large. + // But we cannot do this for level-0 since level-0 files can overlap + // and we must not pick one file and drop another older file if the + // two files overlap. + if (input_level > 0) { + const uint64_t limit = + MaxFileSizeForLevel(input_level) * options_->source_compaction_factor; + uint64_t total = 0; + for (size_t i = 0; i + 1 < inputs.size(); ++i) { + uint64_t s = inputs[i]->file_size; + total += s; + if (total >= limit) { + **compaction_end = inputs[i + 1]->smallest; + covering_the_whole_range = false; + inputs.resize(i + 1); + break; + } + } + } + Compaction* c = new Compaction(version, input_level, output_level, + MaxFileSizeForLevel(output_level), + MaxGrandParentOverlapBytes(input_level)); + + c->inputs_[0] = inputs; + if (ExpandWhileOverlapping(c) == false) { + delete c; + Log(options_->info_log, "Could not compact due to expansion failure.\n"); + return nullptr; + } + + SetupOtherInputs(c); + + if (covering_the_whole_range) { + *compaction_end = nullptr; + } + + // These files that are to be manaully compacted do not trample + // upon other files because manual compactions are processed when + // the system has a max of 1 background compaction thread. + c->MarkFilesBeingCompacted(true); + + // Is this compaction creating a file at the bottommost level + c->SetupBottomMostLevel(true); + return c; +} + +Compaction* LevelCompactionPicker::PickCompaction(Version* version) { + Compaction* c = nullptr; + int level = -1; + + // Compute the compactions needed. 
It is better to do it here + // and also in LogAndApply(), otherwise the values could be stale. + std::vector size_being_compacted(NumberLevels() - 1); + SizeBeingCompacted(size_being_compacted); + version->Finalize(size_being_compacted); + + // We prefer compactions triggered by too much data in a level over + // the compactions triggered by seeks. + // + // Find the compactions by size on all levels. + for (int i = 0; i < NumberLevels() - 1; i++) { + assert(i == 0 || + version->compaction_score_[i] <= version->compaction_score_[i - 1]); + level = version->compaction_level_[i]; + if ((version->compaction_score_[i] >= 1)) { + c = PickCompactionBySize(version, level, version->compaction_score_[i]); + if (ExpandWhileOverlapping(c) == false) { + delete c; + c = nullptr; + } else { + break; + } + } + } + + // Find compactions needed by seeks + FileMetaData* f = version->file_to_compact_; + if (c == nullptr && f != nullptr && !f->being_compacted) { + + level = version->file_to_compact_level_; + int parent_index = -1; + + // Only allow one level 0 compaction at a time. + // Do not pick this file if its parents at level+1 are being compacted. + if (level != 0 || compactions_in_progress_[0].empty()) { + if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level, + &parent_index)) { + c = new Compaction(version, level, level + 1, + MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level), true); + c->inputs_[0].push_back(f); + c->parent_index_ = parent_index; + c->input_version_->file_to_compact_ = nullptr; + if (ExpandWhileOverlapping(c) == false) { + return nullptr; + } + } + } + } + + if (c == nullptr) { + return nullptr; + } + + // Two level 0 compaction won't run at the same time, so don't need to worry + // about files on level 0 being compacted. 
+ if (level == 0) { + assert(compactions_in_progress_[0].empty()); + InternalKey smallest, largest; + GetRange(c->inputs_[0], &smallest, &largest); + // Note that the next call will discard the file we placed in + // c->inputs_[0] earlier and replace it with an overlapping set + // which will include the picked file. + c->inputs_[0].clear(); + c->input_version_->GetOverlappingInputs(0, &smallest, &largest, + &c->inputs_[0]); + + // If we include more L0 files in the same compaction run it can + // cause the 'smallest' and 'largest' key to get extended to a + // larger range. So, re-invoke GetRange to get the new key range + GetRange(c->inputs_[0], &smallest, &largest); + if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level, + &c->parent_index_)) { + delete c; + return nullptr; + } + assert(!c->inputs_[0].empty()); + } + + // Setup "level+1" files (inputs_[1]) + SetupOtherInputs(c); + + // mark all the files that are being compacted + c->MarkFilesBeingCompacted(true); + + // Is this compaction creating a file at the bottommost level + c->SetupBottomMostLevel(false); + + // remember this currently undergoing compaction + compactions_in_progress_[level].insert(c); + + return c; +} + +Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version, + int level, + double score) { + Compaction* c = nullptr; + + // level 0 files are overlapping. So we cannot pick more + // than one concurrent compactions at this level. This + // could be made better by looking at key-ranges that are + // being compacted at level 0. 
+ if (level == 0 && compactions_in_progress_[level].size() == 1) { + return nullptr; + } + + assert(level >= 0); + assert(level + 1 < NumberLevels()); + c = new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1), + MaxGrandParentOverlapBytes(level)); + c->score_ = score; + + // Pick the largest file in this level that is not already + // being compacted + std::vector& file_size = c->input_version_->files_by_size_[level]; + + // record the first file that is not yet compacted + int nextIndex = -1; + + for (unsigned int i = c->input_version_->next_file_to_compact_by_size_[level]; + i < file_size.size(); i++) { + int index = file_size[i]; + FileMetaData* f = c->input_version_->files_[level][index]; + + // check to verify files are arranged in descending size + assert((i == file_size.size() - 1) || + (i >= Version::number_of_files_to_sort_ - 1) || + (f->file_size >= + c->input_version_->files_[level][file_size[i + 1]]->file_size)); + + // do not pick a file to compact if it is being compacted + // from n-1 level. + if (f->being_compacted) { + continue; + } + + // remember the startIndex for the next call to PickCompaction + if (nextIndex == -1) { + nextIndex = i; + } + + //if (i > Version::number_of_files_to_sort_) { + // Log(options_->info_log, "XXX Looking at index %d", i); + //} + + // Do not pick this file if its parents at level+1 are being compacted. + // Maybe we can avoid redoing this work in SetupOtherInputs + int parent_index = -1; + if (ParentRangeInCompaction(c->input_version_, &f->smallest, &f->largest, + level, &parent_index)) { + continue; + } + c->inputs_[0].push_back(f); + c->base_index_ = index; + c->parent_index_ = parent_index; + break; + } + + if (c->inputs_[0].empty()) { + delete c; + c = nullptr; + } + + // store where to start the iteration in the next call to PickCompaction + version->next_file_to_compact_by_size_[level] = nextIndex; + + return c; +} + +// Universal style of compaction. 
Pick files that are contiguous in +// time-range to compact. +// +Compaction* UniversalCompactionPicker::PickCompaction(Version* version) { + int level = 0; + double score = version->compaction_score_[0]; + + if ((version->files_[level].size() < + (unsigned int)options_->level0_file_num_compaction_trigger)) { + Log(options_->info_log, "Universal: nothing to do\n"); + return nullptr; + } + Version::FileSummaryStorage tmp; + Log(options_->info_log, "Universal: candidate files(%lu): %s\n", + version->files_[level].size(), + version->LevelFileSummary(&tmp, 0)); + + // Check for size amplification first. + Compaction* c = PickCompactionUniversalSizeAmp(version, score); + if (c == nullptr) { + + // Size amplification is within limits. Try reducing read + // amplification while maintaining file size ratios. + unsigned int ratio = options_->compaction_options_universal.size_ratio; + c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX); + + // Size amplification and file size ratios are within configured limits. + // If max read amplification is exceeding configured limits, then force + // compaction without looking at filesize ratios and try to reduce + // the number of files to fewer than level0_file_num_compaction_trigger. + if (c == nullptr) { + unsigned int num_files = version->files_[level].size() - + options_->level0_file_num_compaction_trigger; + c = PickCompactionUniversalReadAmp(version, score, UINT_MAX, num_files); + } + } + if (c == nullptr) { + return nullptr; + } + assert(c->inputs_[0].size() > 1); + + // validate that all the chosen files are non overlapping in time + FileMetaData* newerfile __attribute__((unused)) = nullptr; + for (unsigned int i = 0; i < c->inputs_[0].size(); i++) { + FileMetaData* f = c->inputs_[0][i]; + assert (f->smallest_seqno <= f->largest_seqno); + assert(newerfile == nullptr || + newerfile->smallest_seqno > f->largest_seqno); + newerfile = f; + } + + // The files are sorted from newest first to oldest last. 
+ std::vector& file_by_time = c->input_version_->files_by_size_[level]; + + // Is the earliest file part of this compaction? + int last_index = file_by_time[file_by_time.size()-1]; + FileMetaData* last_file = c->input_version_->files_[level][last_index]; + if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { + c->bottommost_level_ = true; + } + + // update statistics + MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION, + c->inputs_[0].size()); + + // mark all the files that are being compacted + c->MarkFilesBeingCompacted(true); + + // remember this currently undergoing compaction + compactions_in_progress_[level].insert(c); + + // Record whether this compaction includes all sst files. + // For now, it is only relevant in universal compaction mode. + c->is_full_compaction_ = + (c->inputs_[0].size() == c->input_version_->files_[0].size()); + + return c; +} + +// +// Consider compaction files based on their size differences with +// the next file in time order. +// +Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp( + Version* version, double score, unsigned int ratio, + unsigned int max_number_of_files_to_compact) { + int level = 0; + + unsigned int min_merge_width = + options_->compaction_options_universal.min_merge_width; + unsigned int max_merge_width = + options_->compaction_options_universal.max_merge_width; + + // The files are sorted from newest first to oldest last. + std::vector& file_by_time = version->files_by_size_[level]; + FileMetaData* f = nullptr; + bool done = false; + int start_index = 0; + unsigned int candidate_count; + assert(file_by_time.size() == version->files_[level].size()); + + unsigned int max_files_to_compact = std::min(max_merge_width, + max_number_of_files_to_compact); + min_merge_width = std::max(min_merge_width, 2U); + + // Considers a candidate file only if it is smaller than the + // total size accumulated so far. 
+ for (unsigned int loop = 0; loop < file_by_time.size(); loop++) { + + candidate_count = 0; + + // Skip files that are already being compacted + for (f = nullptr; loop < file_by_time.size(); loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + + if (!f->being_compacted) { + candidate_count = 1; + break; + } + Log(options_->info_log, + "Universal: file %lu[%d] being compacted, skipping", + (unsigned long)f->number, loop); + f = nullptr; + } + + // This file is not being compacted. Consider it as the + // first candidate to be compacted. + uint64_t candidate_size = f != nullptr? f->file_size : 0; + if (f != nullptr) { + Log(options_->info_log, "Universal: Possible candidate file %lu[%d].", + (unsigned long)f->number, loop); + } + + // Check if the suceeding files need compaction. + for (unsigned int i = loop+1; + candidate_count < max_files_to_compact && i < file_by_time.size(); + i++) { + int index = file_by_time[i]; + FileMetaData* f = version->files_[level][index]; + if (f->being_compacted) { + break; + } + // pick files if the total candidate file size (increased by the + // specified ratio) is still larger than the next candidate file. + uint64_t sz = (candidate_size * (100L + ratio)) /100; + if (sz < f->file_size) { + break; + } + candidate_count++; + candidate_size += f->file_size; + } + + // Found a series of consecutive files that need compaction. 
+ if (candidate_count >= (unsigned int)min_merge_width) { + start_index = loop; + done = true; + break; + } else { + for (unsigned int i = loop; + i < loop + candidate_count && i < file_by_time.size(); i++) { + int index = file_by_time[i]; + FileMetaData* f = version->files_[level][index]; + Log(options_->info_log, + "Universal: Skipping file %lu[%d] with size %lu %d\n", + (unsigned long)f->number, + i, + (unsigned long)f->file_size, + f->being_compacted); + } + } + } + if (!done || candidate_count <= 1) { + return nullptr; + } + unsigned int first_index_after = start_index + candidate_count; + // Compression is enabled if files compacted earlier already reached + // size ratio of compression. + bool enable_compression = true; + int ratio_to_compress = + options_->compaction_options_universal.compression_size_percent; + if (ratio_to_compress >= 0) { + uint64_t total_size = version->NumLevelBytes(level); + uint64_t older_file_size = 0; + for (unsigned int i = file_by_time.size() - 1; i >= first_index_after; + i--) { + older_file_size += version->files_[level][file_by_time[i]]->file_size; + if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { + enable_compression = false; + break; + } + } + } + Compaction* c = + new Compaction(version, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, enable_compression); + c->score_ = score; + + for (unsigned int i = start_index; i < first_index_after; i++) { + int index = file_by_time[i]; + FileMetaData* f = c->input_version_->files_[level][index]; + c->inputs_[0].push_back(f); + Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n", + (unsigned long)f->number, + i, + (unsigned long)f->file_size); + } + return c; +} + +// Look at overall size amplification. 
If size amplification +// exceeeds the configured value, then do a compaction +// of the candidate files all the way upto the earliest +// base file (overrides configured values of file-size ratios, +// min_merge_width and max_merge_width). +// +Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp( + Version* version, double score) { + int level = 0; + + // percentage flexibilty while reducing size amplification + uint64_t ratio = options_->compaction_options_universal. + max_size_amplification_percent; + + // The files are sorted from newest first to oldest last. + std::vector& file_by_time = version->files_by_size_[level]; + assert(file_by_time.size() == version->files_[level].size()); + + unsigned int candidate_count = 0; + uint64_t candidate_size = 0; + unsigned int start_index = 0; + FileMetaData* f = nullptr; + + // Skip files that are already being compacted + for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + if (!f->being_compacted) { + start_index = loop; // Consider this as the first candidate. + break; + } + Log(options_->info_log, "Universal: skipping file %lu[%d] compacted %s", + (unsigned long)f->number, + loop, + " cannot be a candidate to reduce size amp.\n"); + f = nullptr; + } + if (f == nullptr) { + return nullptr; // no candidate files + } + + Log(options_->info_log, "Universal: First candidate file %lu[%d] %s", + (unsigned long)f->number, + start_index, + " to reduce size amp.\n"); + + // keep adding up all the remaining files + for (unsigned int loop = start_index; loop < file_by_time.size() - 1; + loop++) { + int index = file_by_time[loop]; + f = version->files_[level][index]; + if (f->being_compacted) { + Log(options_->info_log, + "Universal: Possible candidate file %lu[%d] %s.", + (unsigned long)f->number, + loop, + " is already being compacted. 
No size amp reduction possible.\n"); + return nullptr; + } + candidate_size += f->file_size; + candidate_count++; + } + if (candidate_count == 0) { + return nullptr; + } + + // size of earliest file + int index = file_by_time[file_by_time.size() - 1]; + uint64_t earliest_file_size = version->files_[level][index]->file_size; + + // size amplification = percentage of additional size + if (candidate_size * 100 < ratio * earliest_file_size) { + Log(options_->info_log, + "Universal: size amp not needed. newer-files-total-size %lu " + "earliest-file-size %lu", + (unsigned long)candidate_size, + (unsigned long)earliest_file_size); + return nullptr; + } else { + Log(options_->info_log, + "Universal: size amp needed. newer-files-total-size %lu " + "earliest-file-size %lu", + (unsigned long)candidate_size, + (unsigned long)earliest_file_size); + } + assert(start_index >= 0 && start_index < file_by_time.size() - 1); + + // create a compaction request + // We always compact all the files, so always compress. + Compaction* c = + new Compaction(version, level, level, MaxFileSizeForLevel(level), + LLONG_MAX, false, true); + c->score_ = score; + for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { + int index = file_by_time[loop]; + f = c->input_version_->files_[level][index]; + c->inputs_[0].push_back(f); + Log(options_->info_log, + "Universal: size amp picking file %lu[%d] with size %lu", + (unsigned long)f->number, + index, + (unsigned long)f->file_size); + } + return c; +} + +} // namespace rocksdb diff --git a/db/compaction_picker.h b/db/compaction_picker.h new file mode 100644 index 000000000..0fe086a18 --- /dev/null +++ b/db/compaction_picker.h @@ -0,0 +1,162 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +// Copyright (c) 2011 The LevelDB Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. See the AUTHORS file for names of contributors. + +#pragma once +#include "db/version_set.h" +#include "db/compaction.h" +#include "rocksdb/status.h" +#include "rocksdb/options.h" + +#include +#include +#include + +namespace rocksdb { + +class Compaction; +class Version; + +class CompactionPicker { + public: + CompactionPicker(const Options* options, const InternalKeyComparator* icmp); + virtual ~CompactionPicker(); + + // See VersionSet::ReduceNumberOfLevels() + void ReduceNumberOfLevels(int new_levels); + + // Pick level and inputs for a new compaction. + // Returns nullptr if there is no compaction to be done. + // Otherwise returns a pointer to a heap-allocated object that + // describes the compaction. Caller should delete the result. + virtual Compaction* PickCompaction(Version* version) = 0; + + // Return a compaction object for compacting the range [begin,end] in + // the specified level. Returns nullptr if there is nothing in that + // level that overlaps the specified range. Caller should delete + // the result. + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! 
+ Compaction* CompactRange(Version* version, int input_level, int output_level, + const InternalKey* begin, const InternalKey* end, + InternalKey** compaction_end); + + // Free up the files that participated in a compaction + void ReleaseCompactionFiles(Compaction* c, Status status); + + // Return the total amount of data that is undergoing + // compactions per level + void SizeBeingCompacted(std::vector& sizes); + + // Returns maximum total overlap bytes with grandparent + // level (i.e., level+2) before we stop building a single + // file in level->level+1 compaction. + uint64_t MaxGrandParentOverlapBytes(int level); + + // Returns maximum total bytes of data on a given level. + double MaxBytesForLevel(int level); + + // Get the max file size in a given level. + uint64_t MaxFileSizeForLevel(int level) const; + + protected: + int NumberLevels() const { return num_levels_; } + + // Stores the minimal range that covers all entries in inputs in + // *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const std::vector& inputs, InternalKey* smallest, + InternalKey* largest); + + // Stores the minimal range that covers all entries in inputs1 and inputs2 + // in *smallest, *largest. + // REQUIRES: inputs is not empty + void GetRange(const std::vector& inputs1, + const std::vector& inputs2, + InternalKey* smallest, InternalKey* largest); + + // Add more files to the inputs on "level" to make sure that + // no newer version of a key is compacted to "level+1" while leaving an older + // version in a "level". Otherwise, any Get() will search "level" first, + // and will likely return an old/stale value for the key, since it always + // searches in increasing order of level to find the value. This could + // also scramble the order of merge operands. This function should be + // called any time a new Compaction is created, and its inputs_[0] are + // populated. + // + // Will return false if it is impossible to apply this compaction. 
+ bool ExpandWhileOverlapping(Compaction* c); + + uint64_t ExpandedCompactionByteSizeLimit(int level); + + // Returns true if any one of the specified files are being compacted + bool FilesInCompaction(std::vector& files); + + // Returns true if any one of the parent files are being compacted + bool ParentRangeInCompaction(Version* version, const InternalKey* smallest, + const InternalKey* largest, int level, + int* index); + + void SetupOtherInputs(Compaction* c); + + // record all the ongoing compactions for all levels + std::vector> compactions_in_progress_; + + // Per-level target file size. + std::unique_ptr max_file_size_; + + // Per-level max bytes + std::unique_ptr level_max_bytes_; + + const Options* const options_; + private: + void Init(); + + int num_levels_; + + const InternalKeyComparator* const icmp_; +}; + +class UniversalCompactionPicker : public CompactionPicker { + public: + UniversalCompactionPicker(const Options* options, + const InternalKeyComparator* icmp) + : CompactionPicker(options, icmp) {} + virtual Compaction* PickCompaction(Version* version) override; + + private: + // Pick Universal compaction to limit read amplification + Compaction* PickCompactionUniversalReadAmp(Version* version, double score, + unsigned int ratio, + unsigned int num_files); + + // Pick Universal compaction to limit space amplification. + Compaction* PickCompactionUniversalSizeAmp(Version* version, double score); +}; + +class LevelCompactionPicker : public CompactionPicker { + public: + LevelCompactionPicker(const Options* options, + const InternalKeyComparator* icmp) + : CompactionPicker(options, icmp) {} + virtual Compaction* PickCompaction(Version* version) override; + + private: + // For the specfied level, pick a compaction. + // Returns nullptr if there is no compaction to be done. + // If level is 0 and there is already a compaction on that level, this + // function will return nullptr. 
+ Compaction* PickCompactionBySize(Version* version, int level, double score); +}; + +} // namespace rocksdb diff --git a/db/db_bench.cc b/db/db_bench.cc index e0ba58281..e41a31cf3 100644 --- a/db/db_bench.cc +++ b/db/db_bench.cc @@ -14,7 +14,7 @@ #include #include "db/db_impl.h" #include "db/version_set.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" #include "rocksdb/options.h" #include "rocksdb/cache.h" #include "rocksdb/db.h" @@ -30,6 +30,7 @@ #include "util/random.h" #include "util/stack_trace.h" #include "util/string_util.h" +#include "util/statistics.h" #include "util/testutil.h" #include "hdfs/env_hdfs.h" #include "utilities/merge_operators.h" @@ -355,9 +356,9 @@ static bool ValidateCompressionLevel(const char* flagname, int32_t value) { return true; } -static const bool FLAGS_compression_level_dummy = - google::RegisterFlagValidator(&FLAGS_compression_level, - &ValidateCompressionLevel); +static const bool FLAGS_compression_level_dummy __attribute__((unused)) = + google::RegisterFlagValidator(&FLAGS_compression_level, + &ValidateCompressionLevel); DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts" " from this level. 
Levels with number < min_level_to_compress are" diff --git a/db/db_filesnapshot.cc b/db/db_filesnapshot.cc index a7232246a..04d6d0e17 100644 --- a/db/db_filesnapshot.cc +++ b/db/db_filesnapshot.cc @@ -74,7 +74,7 @@ Status DBImpl::GetLiveFiles(std::vector& ret, // Make a set of all of the live *.sst files std::set live; - versions_->AddLiveFilesCurrentVersion(&live); + versions_->current()->AddLiveFiles(&live); ret.clear(); ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST diff --git a/db/db_impl.cc b/db/db_impl.cc index d07868d21..cb23c979e 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -57,6 +57,7 @@ #include "util/mutexlock.h" #include "util/perf_context_imp.h" #include "util/stop_watch.h" +#include "util/autovector.h" namespace rocksdb { @@ -254,8 +255,8 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) : env_(options.env), dbname_(dbname), internal_comparator_(options.comparator), - options_(SanitizeOptions( - dbname, &internal_comparator_, &internal_filter_policy_, options)), + options_(SanitizeOptions(dbname, &internal_comparator_, + &internal_filter_policy_, options)), internal_filter_policy_(options.filter_policy), owns_info_log_(options_.info_log != options.info_log), db_lock_(nullptr), @@ -263,8 +264,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname) shutting_down_(nullptr), bg_cv_(&mutex_), mem_rep_factory_(options_.memtable_factory.get()), - mem_(new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_)), + mem_(new MemTable(internal_comparator_, options_)), logfile_number_(0), super_version_(nullptr), tmp_batch_(), @@ -410,7 +410,7 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() { } Status DBImpl::NewDB() { - VersionEdit new_db(NumberLevels()); + VersionEdit new_db; new_db.SetComparatorName(user_comparator()->Name()); new_db.SetLogNumber(0); new_db.SetNextFile(2); @@ -1048,8 +1048,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number, WriteBatchInternal::SetContents(&batch, 
record); if (mem == nullptr) { - mem = new MemTable(internal_comparator_, mem_rep_factory_, - NumberLevels(), options_); + mem = new MemTable(internal_comparator_, options_); mem->Ref(); } status = WriteBatchInternal::InsertInto(&batch, mem, &options_); @@ -1300,6 +1299,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress, void DBImpl::CompactRange(const ColumnFamilyHandle& column_family, const Slice* begin, const Slice* end, bool reduce_level, int target_level) { + FlushMemTable(FlushOptions()); int max_level_with_files = 1; { MutexLock l(&mutex_); @@ -1310,9 +1310,15 @@ void DBImpl::CompactRange(const ColumnFamilyHandle& column_family, } } } - TEST_FlushMemTable(); // TODO(sanjay): Skip if memtable does not overlap - for (int level = 0; level < max_level_with_files; level++) { - TEST_CompactRange(level, begin, end); + for (int level = 0; level <= max_level_with_files; level++) { + // in case the compaction is unversal or if we're compacting the + // bottom-most level, the output level will be the same as input one + if (options_.compaction_style == kCompactionStyleUniversal || + level == max_level_with_files) { + RunManualCompaction(level, level, begin, end); + } else { + RunManualCompaction(level, level + 1, begin, end); + } } if (reduce_level) { @@ -1324,13 +1330,13 @@ void DBImpl::CompactRange(const ColumnFamilyHandle& column_family, // return the same level if it cannot be moved int DBImpl::FindMinimumEmptyLevelFitting(int level) { mutex_.AssertHeld(); + Version* current = versions_->current(); int minimum_level = level; for (int i = level - 1; i > 0; --i) { // stop if level i is not empty - if (versions_->NumLevelFiles(i) > 0) break; - + if (current->NumLevelFiles(i) > 0) break; // stop if level i is too small (cannot fit the level files) - if (versions_->MaxBytesForLevel(i) < versions_->NumLevelBytes(level)) break; + if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break; minimum_level = i; } @@ -1376,7 +1382,7 @@ void 
DBImpl::ReFitLevel(int level, int target_level) { Log(options_.info_log, "Before refitting:\n%s", versions_->current()->DebugString().data()); - VersionEdit edit(NumberLevels()); + VersionEdit edit; for (const auto& f : versions_->current()->files_[level]) { edit.DeleteFile(level, f->number); edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, @@ -1612,13 +1618,17 @@ Status DBImpl::AppendSortedWalsOfType(const std::string& path, return status; } -void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { - assert(level >= 0); +void DBImpl::RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end) { + assert(input_level >= 0); InternalKey begin_storage, end_storage; ManualCompaction manual; - manual.level = level; + manual.input_level = input_level; + manual.output_level = output_level; manual.done = false; manual.in_progress = false; // For universal compaction, we enforce every manual compaction to compact @@ -1646,11 +1656,11 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { // can compact any range of keys/files. // // bg_manual_only_ is non-zero when at least one thread is inside - // TEST_CompactRange(), i.e. during that time no other compaction will + // RunManualCompaction(), i.e. during that time no other compaction will // get scheduled (see MaybeScheduleFlushOrCompaction). // // Note that the following loop doesn't stop more that one thread calling - // TEST_CompactRange() from getting to the second while loop below. + // RunManualCompaction() from getting to the second while loop below. // However, only one of them will actually schedule compaction, while // others will wait on a condition variable until it completes. 
@@ -1680,6 +1690,15 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { --bg_manual_only_; } +void DBImpl::TEST_CompactRange(int level, + const Slice* begin, + const Slice* end) { + int output_level = (options_.compaction_style == kCompactionStyleUniversal) + ? level + : level + 1; + RunManualCompaction(level, output_level, begin, end); +} + Status DBImpl::FlushMemTable(const FlushOptions& options) { // nullptr batch means just wait for earlier writes to be done Status s = Write(WriteOptions(), nullptr); @@ -1825,6 +1844,11 @@ void DBImpl::TEST_PurgeObsoleteteWAL() { PurgeObsoleteWALFiles(); } +uint64_t DBImpl::TEST_GetLevel0TotalSize() { + MutexLock l(&mutex_); + return versions_->current()->NumLevelBytes(0); +} + void DBImpl::BackgroundCallCompaction() { bool madeProgress = false; DeletionState deletion_state(options_.max_write_buffer_number, true); @@ -1899,23 +1923,27 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, unique_ptr c; bool is_manual = (manual_compaction_ != nullptr) && (manual_compaction_->in_progress == false); - InternalKey manual_end; + InternalKey manual_end_storage; + InternalKey* manual_end = &manual_end_storage; if (is_manual) { ManualCompaction* m = manual_compaction_; assert(!m->in_progress); m->in_progress = true; // another thread cannot pick up the same work - c.reset(versions_->CompactRange(m->level, m->begin, m->end)); - if (c) { - manual_end = c->input(0, c->num_input_files(0) - 1)->largest; - } else { + c.reset(versions_->CompactRange( + m->input_level, m->output_level, m->begin, m->end, &manual_end)); + if (!c) { m->done = true; } Log(options_.info_log, - "Manual compaction at level-%d from %s .. %s; will stop at %s\n", - m->level, + "Manual compaction from level-%d to level-%d from %s .. %s; will stop " + "at %s\n", + m->input_level, + m->output_level, (m->begin ? m->begin->DebugString().c_str() : "(begin)"), (m->end ? m->end->DebugString().c_str() : "(end)"), - (m->done ? 
"(end)" : manual_end.DebugString().c_str())); + ((m->done || manual_end == nullptr) + ? "(end)" + : manual_end->DebugString().c_str())); } else if (!options_.disable_auto_compactions) { c.reset(versions_->PickCompaction()); } @@ -1934,13 +1962,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, f->smallest_seqno, f->largest_seqno); status = versions_->LogAndApply(c->edit(), &mutex_); InstallSuperVersion(deletion_state); - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", - static_cast(f->number), - c->level() + 1, + static_cast(f->number), c->level() + 1, static_cast(f->file_size), - status.ToString().c_str(), - versions_->LevelSummary(&tmp)); + status.ToString().c_str(), versions_->current()->LevelSummary(&tmp)); versions_->ReleaseCompactionFiles(c.get(), status); *madeProgress = true; } else { @@ -1980,13 +2006,19 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress, // Also note that, if we don't stop here, then the current compaction // writes a new file back to level 0, which will be used in successive // compaction. Hence the manual compaction will never finish. - if (options_.compaction_style == kCompactionStyleUniversal) { + // + // Stop the compaction if manual_end points to nullptr -- this means + // that we compacted the whole range. manual_end should always point + // to nullptr in case of universal compaction + if (manual_end == nullptr) { m->done = true; } if (!m->done) { // We only compacted part of the requested range. Update *m // to the range that is left to be compacted. 
- m->tmp_storage = manual_end; + // Universal compaction should always compact the whole range + assert(options_.compaction_style != kCompactionStyleUniversal); + m->tmp_storage = *manual_end; m->begin = &m->tmp_storage; } m->in_progress = false; // not being processed anymore @@ -2018,14 +2050,14 @@ void DBImpl::CleanupCompaction(CompactionState* compact, Status status) { } // Allocate the file numbers for the output file. We allocate as -// many output file numbers as there are files in level+1. +// many output file numbers as there are files in level+1 (at least one) // Insert them into pending_outputs so that they do not get deleted. void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { mutex_.AssertHeld(); assert(compact != nullptr); assert(compact->builder == nullptr); int filesNeeded = compact->compaction->num_input_files(1); - for (int i = 0; i < filesNeeded; i++) { + for (int i = 0; i < std::max(filesNeeded, 1); i++) { uint64_t file_number = versions_->NewFileNumber(); pending_outputs_.insert(file_number); compact->allocated_file_numbers.push_back(file_number); @@ -2169,14 +2201,11 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) { // Add compaction outputs compact->compaction->AddInputDeletions(compact->compaction->edit()); - const int level = compact->compaction->level(); for (size_t i = 0; i < compact->outputs.size(); i++) { const CompactionState::Output& out = compact->outputs[i]; compact->compaction->edit()->AddFile( - (options_.compaction_style == kCompactionStyleUniversal) ? 
- level : level + 1, - out.number, out.file_size, out.smallest, out.largest, - out.smallest_seqno, out.largest_seqno); + compact->compaction->output_level(), out.number, out.file_size, + out.smallest, out.largest, out.smallest_seqno, out.largest_seqno); } return versions_->LogAndApply(compact->compaction->edit(), &mutex_); } @@ -2218,14 +2247,14 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, compact->compaction->num_input_files(0), compact->compaction->level(), compact->compaction->num_input_files(1), - compact->compaction->level() + 1, + compact->compaction->output_level(), compact->compaction->score(), options_.max_background_compactions - bg_compaction_scheduled_); char scratch[256]; compact->compaction->Summary(scratch, sizeof(scratch)); Log(options_.info_log, "Compaction start summary: %s\n", scratch); - assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); + assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0); assert(compact->builder == nullptr); assert(!compact->outfile); @@ -2553,9 +2582,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, CompactionStats stats; stats.micros = env_->NowMicros() - start_micros - imm_micros; - if (options_.statistics.get()) { - options_.statistics.get()->measureTime(COMPACTION_TIME, stats.micros); - } + MeasureTime(options_.statistics.get(), COMPACTION_TIME, stats.micros); stats.files_in_leveln = compact->compaction->num_input_files(0); stats.files_in_levelnp1 = compact->compaction->num_input_files(1); @@ -2597,22 +2624,21 @@ Status DBImpl::DoCompactionWork(CompactionState* compact, status = InstallCompactionResults(compact); InstallSuperVersion(deletion_state); } - VersionSet::LevelSummaryStorage tmp; + Version::LevelSummaryStorage tmp; Log(options_.info_log, "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "write-amplify(%.1f) %s\n", - versions_->LevelSummary(&tmp), + 
versions_->current()->LevelSummary(&tmp), (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / - (double) stats.micros, - compact->compaction->output_level(), - stats.files_in_leveln, stats.files_in_levelnp1, stats.files_out_levelnp1, - stats.bytes_readn / 1048576.0, - stats.bytes_readnp1 / 1048576.0, + (double)stats.micros, + compact->compaction->output_level(), stats.files_in_leveln, + stats.files_in_levelnp1, stats.files_out_levelnp1, + stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0, stats.bytes_written / 1048576.0, (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / - (double) stats.bytes_readn, - stats.bytes_written / (double) stats.bytes_readn, + (double)stats.bytes_readn, + stats.bytes_written / (double)stats.bytes_readn, status.ToString().c_str()); return status; @@ -2649,38 +2675,40 @@ static void CleanupIteratorState(void* arg1, void* arg2) { Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, SequenceNumber* latest_snapshot) { IterState* cleanup = new IterState; - mutex_.Lock(); - *latest_snapshot = versions_->LastSequence(); + MemTable* mutable_mem; + std::vector immutables; + Version* version; // Collect together all needed child iterators for mem - std::vector list; + mutex_.Lock(); + *latest_snapshot = versions_->LastSequence(); mem_->Ref(); - list.push_back(mem_->NewIterator(options)); - - cleanup->mem.push_back(mem_); - + mutable_mem = mem_; // Collect together all needed child iterators for imm_ - std::vector immutables; imm_.GetMemTables(&immutables); for (unsigned int i = 0; i < immutables.size(); i++) { - MemTable* m = immutables[i]; - m->Ref(); + immutables[i]->Ref(); + } + // Collect iterators for files in L0 - Ln + versions_->current()->Ref(); + version = versions_->current(); + mutex_.Unlock(); + + std::vector list; + list.push_back(mutable_mem->NewIterator(options)); + cleanup->mem.push_back(mutable_mem); + for (MemTable* m : immutables) { list.push_back(m->NewIterator(options)); 
cleanup->mem.push_back(m); } - - // Collect iterators for files in L0 - Ln - versions_->current()->AddIterators(options, storage_options_, &list); + version->AddIterators(options, storage_options_, &list); Iterator* internal_iter = NewMergingIterator(&internal_comparator_, &list[0], list.size()); - versions_->current()->Ref(); - + cleanup->version = version; cleanup->mu = &mutex_; cleanup->db = this; - cleanup->version = versions_->current(); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); - mutex_.Unlock(); return internal_iter; } @@ -2691,7 +2719,7 @@ Iterator* DBImpl::TEST_NewInternalIterator() { int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { MutexLock l(&mutex_); - return versions_->MaxNextLevelOverlappingBytes(); + return versions_->current()->MaxNextLevelOverlappingBytes(); } Status DBImpl::Get(const ReadOptions& options, @@ -2898,7 +2926,7 @@ std::vector DBImpl::MultiGet( Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, const std::string& column_family_name, ColumnFamilyHandle* handle) { - VersionEdit edit(0); + VersionEdit edit; edit.AddColumnFamily(column_family_name); MutexLock l(&mutex_); ++versions_->max_column_family_; @@ -2920,7 +2948,7 @@ Status DBImpl::DropColumnFamily(const ColumnFamilyHandle& column_family) { if (column_family.id == 0) { return Status::InvalidArgument("Can't drop default column family"); } - VersionEdit edit(0); + VersionEdit edit; edit.DropColumnFamily(); edit.SetColumnFamily(column_family.id); MutexLock l(&mutex_); @@ -3045,12 +3073,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { uint64_t last_sequence = versions_->LastSequence(); Writer* last_writer = &w; if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions - // TODO: BuildBatchGroup physically concatenate/copy all write batches into - // a new one. Mem copy is done with the lock held. 
Ideally, we only need - // the lock to obtain the last_writer and the references to all batches. - // Creation (copy) of the merged batch could have been done outside of the - // lock protected region. - WriteBatch* updates = BuildBatchGroup(&last_writer); + autovector write_batch_group; + BuildBatchGroup(&last_writer, &write_batch_group); // Add to log and apply to memtable. We can release the lock // during this phase since &w is currently responsible for logging @@ -3058,6 +3082,16 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // into mem_. { mutex_.Unlock(); + WriteBatch* updates = nullptr; + if (write_batch_group.size() == 1) { + updates = write_batch_group[0]; + } else { + updates = &tmp_batch_; + for (size_t i = 0; i < write_batch_group.size(); ++i) { + WriteBatchInternal::Append(updates, write_batch_group[i]); + } + } + const SequenceNumber current_sequence = last_sequence + 1; WriteBatchInternal::SetSequence(updates, current_sequence); int my_batch_count = WriteBatchInternal::Count(updates); @@ -3100,15 +3134,15 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // have succeeded in memtable but Status reports error for all writes. 
throw std::runtime_error("In memory WriteBatch corruption!"); } - SetTickerCount(options_.statistics.get(), - SEQUENCE_NUMBER, last_sequence); + SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER, + last_sequence); } + if (updates == &tmp_batch_) tmp_batch_.Clear(); mutex_.Lock(); if (status.ok()) { versions_->SetLastSequence(last_sequence); } } - if (updates == &tmp_batch_) tmp_batch_.Clear(); } if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) { bg_error_ = status; // stop compaction & fail any further writes @@ -3136,13 +3170,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) { // REQUIRES: Writer list must be non-empty // REQUIRES: First writer must have a non-nullptr batch -WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { +void DBImpl::BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group) { assert(!writers_.empty()); Writer* first = writers_.front(); - WriteBatch* result = first->batch; - assert(result != nullptr); + assert(first->batch != nullptr); size_t size = WriteBatchInternal::ByteSize(first->batch); + write_batch_group->push_back(first->batch); // Allow the group to grow up to a maximum size, but if the // original write is small, limit the growth so we do not slow @@ -3175,18 +3210,10 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { break; } - // Append to *reuslt - if (result == first->batch) { - // Switch to temporary batch instead of disturbing caller's batch - result = &tmp_batch_; - assert(WriteBatchInternal::Count(result) == 0); - WriteBatchInternal::Append(result, first->batch); - } - WriteBatchInternal::Append(result, w->batch); + write_batch_group->push_back(w->batch); } *last_writer = w; } - return result; } // This function computes the amount of time in microseconds by which a write @@ -3200,7 +3227,7 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { // The goal of this formula is to gradually increase the rate at which writes // are 
slowed. We also tried linear delay (r * 1000), but it seemed to do // slightly worse. There is no other particular reason for choosing quadratic. -uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) { +uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) { uint64_t delay; if (n >= top) { delay = 1000; @@ -3212,10 +3239,10 @@ uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) { // If we are here, we know that: // level0_start_slowdown <= n < level0_slowdown // since the previous two conditions are false. - float how_much = - (float) (n - bottom) / + double how_much = + (double) (n - bottom) / (top - bottom); - delay = how_much * how_much * 1000; + delay = std::max(how_much * how_much * 1000, 100.0); } assert(delay <= 1000); return delay; @@ -3240,25 +3267,22 @@ Status DBImpl::MakeRoomForWrite(bool force, // Yield previous error s = bg_error_; break; - } else if ( - allow_delay && - versions_->NumLevelFiles(0) >= - options_.level0_slowdown_writes_trigger) { + } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) { // We are getting close to hitting a hard limit on the number of // L0 files. Rather than delaying a single write by several // seconds when we hit the hard limit, start delaying each // individual write by 0-1ms to reduce latency variance. Also, // this delay hands over some CPU to the compaction thread in // case it is sharing the same core as the writer. 
+ uint64_t slowdown = + SlowdownAmount(versions_->current()->NumLevelFiles(0), + options_.level0_slowdown_writes_trigger, + options_.level0_stop_writes_trigger); mutex_.Unlock(); uint64_t delayed; { StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); - env_->SleepForMicroseconds( - SlowdownAmount(versions_->NumLevelFiles(0), - options_.level0_slowdown_writes_trigger, - options_.level0_stop_writes_trigger) - ); + env_->SleepForMicroseconds(slowdown); delayed = sw.ElapsedMicros(); } RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed); @@ -3290,7 +3314,7 @@ Status DBImpl::MakeRoomForWrite(bool force, STALL_MEMTABLE_COMPACTION_MICROS, stall); stall_memtable_compaction_ += stall; stall_memtable_compaction_count_++; - } else if (versions_->NumLevelFiles(0) >= + } else if (versions_->current()->NumLevelFiles(0) >= options_.level0_stop_writes_trigger) { // There are too many level-0 files. DelayLoggingAndReset(); @@ -3366,17 +3390,13 @@ Status DBImpl::MakeRoomForWrite(bool force, EnvOptions soptions(storage_options_); soptions.use_mmap_writes = false; DelayLoggingAndReset(); - s = env_->NewWritableFile( - LogFileName(options_.wal_dir, new_log_number), - &lfile, - soptions - ); + s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number), + &lfile, soptions); if (s.ok()) { // Our final size should be less than write_buffer_size // (compression, etc) but err on the side of caution. 
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); - memtmp = new MemTable( - internal_comparator_, mem_rep_factory_, NumberLevels(), options_); + memtmp = new MemTable(internal_comparator_, options_); new_superversion = new SuperVersion(options_.max_write_buffer_number); } } @@ -3426,6 +3446,7 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family, value->clear(); MutexLock l(&mutex_); + Version* current = versions_->current(); Slice in = property; Slice prefix("rocksdb."); if (!in.starts_with(prefix)) return false; @@ -3440,7 +3461,7 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family, } else { char buf[100]; snprintf(buf, sizeof(buf), "%d", - versions_->NumLevelFiles(static_cast(level))); + current->NumLevelFiles(static_cast(level))); *value = buf; return true; } @@ -3455,8 +3476,8 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family, snprintf(buf, sizeof(buf), "%3d %8d %8.0f\n", level, - versions_->NumLevelFiles(level), - versions_->NumLevelBytes(level) / 1048576.0); + current->NumLevelFiles(level), + current->NumLevelBytes(level) / 1048576.0); value->append(buf); } return true; @@ -3499,8 +3520,8 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family, "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n" ); value->append(buf); - for (int level = 0; level < NumberLevels(); level++) { - int files = versions_->NumLevelFiles(level); + for (int level = 0; level < current->NumberLevels(); level++) { + int files = current->NumLevelFiles(level); if (stats_[level].micros > 0 || files > 0) { int64_t bytes_read = stats_[level].bytes_readn + stats_[level].bytes_readnp1; @@ -3521,8 +3542,8 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family, "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f %10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %9.1f %9lu\n", 
level, files, - versions_->NumLevelBytes(level) / 1048576.0, - versions_->NumLevelBytes(level) / + current->NumLevelBytes(level) / 1048576.0, + current->NumLevelBytes(level) / versions_->MaxBytesForLevel(level), stats_[level].micros / 1e6, bytes_read / 1048576.0, @@ -3758,7 +3779,7 @@ Status DBImpl::DeleteFile(std::string name) { int level; FileMetaData metadata; int maxlevel = NumberLevels(); - VersionEdit edit(maxlevel); + VersionEdit edit; DeletionState deletion_state(0, true); { MutexLock l(&mutex_); @@ -3781,7 +3802,7 @@ Status DBImpl::DeleteFile(std::string name) { // This is to make sure that any deletion tombstones are not // lost. Check that the level passed is the last level. for (int i = level + 1; i < maxlevel; i++) { - if (versions_->NumLevelFiles(i) != 0) { + if (versions_->current()->NumLevelFiles(i) != 0) { Log(options_.info_log, "DeleteFile %s FAILED. File not in last level\n", name.c_str()); return Status::InvalidArgument("File not in last level"); @@ -3836,7 +3857,10 @@ Status DBImpl::GetDbIdentity(std::string& identity) { // can call if they wish Status DB::Put(const WriteOptions& opt, const ColumnFamilyHandle& column_family, const Slice& key, const Slice& value) { - WriteBatch batch; + // Pre-allocate size of write batch conservatively. + // 8 bytes are taken by header, 4 bytes for count, 1 byte for type, + // and we allocate 11 extra bytes for key length, as well as value length. 
+ WriteBatch batch(key.size() + value.size() + 24); batch.Put(column_family.id, key, value); return Write(opt, &batch); } @@ -3915,20 +3939,20 @@ Status DB::OpenWithColumnFamilies( return s; } impl->mutex_.Lock(); - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; // Handles create_if_missing, error_if_exists s = impl->Recover(&edit, column_families); if (s.ok()) { uint64_t new_log_number = impl->versions_->NewFileNumber(); unique_ptr lfile; soptions.use_mmap_writes = false; - s = options.env->NewWritableFile( + s = impl->options_.env->NewWritableFile( LogFileName(impl->options_.wal_dir, new_log_number), &lfile, soptions ); if (s.ok()) { - lfile->SetPreallocationBlockSize(1.1 * options.write_buffer_size); + lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size); edit.SetLogNumber(new_log_number); impl->logfile_number_ = new_log_number; impl->log_.reset(new log::Writer(std::move(lfile))); @@ -3949,12 +3973,11 @@ Status DB::OpenWithColumnFamilies( impl->MaybeScheduleLogDBDeployStats(); } } - impl->mutex_.Unlock(); - if (s.ok() && options.compaction_style == kCompactionStyleUniversal) { - int num_files; + if (s.ok() && impl->options_.compaction_style == kCompactionStyleUniversal) { + Version* current = impl->versions_->current(); for (int i = 1; i < impl->NumberLevels(); i++) { - num_files = impl->versions_->NumLevelFiles(i); + int num_files = current->NumLevelFiles(i); if (num_files > 0) { s = Status::InvalidArgument("Not all files are at level 0. 
Cannot " "open with universal compaction style."); @@ -3963,6 +3986,8 @@ Status DB::OpenWithColumnFamilies( } } + impl->mutex_.Unlock(); + if (s.ok()) { *dbptr = impl; } else { diff --git a/db/db_impl.h b/db/db_impl.h index 9baea728f..9146df7bd 100644 --- a/db/db_impl.h +++ b/db/db_impl.h @@ -22,6 +22,7 @@ #include "port/port.h" #include "util/stats_logger.h" #include "memtablelist.h" +#include "util/autovector.h" namespace rocksdb { @@ -125,10 +126,17 @@ class DBImpl : public DB { virtual Status GetDbIdentity(std::string& identity); + void RunManualCompaction(int input_level, + int output_level, + const Slice* begin, + const Slice* end); + // Extra methods (for testing) that are not in the public DB interface // Compact any files in the named level that overlap [*begin, *end] - void TEST_CompactRange(int level, const Slice* begin, const Slice* end); + void TEST_CompactRange(int level, + const Slice* begin, + const Slice* end); // Force current memtable contents to be flushed. Status TEST_FlushMemTable(); @@ -158,7 +166,7 @@ class DBImpl : public DB { void TEST_PurgeObsoleteteWAL(); // get total level0 file size. Only for testing. - uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);} + uint64_t TEST_GetLevel0TotalSize(); void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) { @@ -324,13 +332,14 @@ class DBImpl : public DB { Status WriteLevel0Table(std::vector &mems, VersionEdit* edit, uint64_t* filenumber); - uint64_t SlowdownAmount(int n, int top, int bottom); + uint64_t SlowdownAmount(int n, double bottom, double top); // MakeRoomForWrite will return superversion_to_free through an arugment, // which the caller needs to delete. We do it because caller can delete // the superversion outside of mutex Status MakeRoomForWrite(bool force /* compact even if there is room? 
*/, SuperVersion** superversion_to_free); - WriteBatch* BuildBatchGroup(Writer** last_writer); + void BuildBatchGroup(Writer** last_writer, + autovector* write_batch_group); // Force current memtable contents to be flushed. Status FlushMemTable(const FlushOptions& options); @@ -443,7 +452,8 @@ class DBImpl : public DB { // Information for a manual compaction struct ManualCompaction { - int level; + int input_level; + int output_level; bool done; bool in_progress; // compaction request being processed? const InternalKey* begin; // nullptr means beginning of key range diff --git a/db/db_impl_readonly.cc b/db/db_impl_readonly.cc index dee484951..ad3395778 100644 --- a/db/db_impl_readonly.cc +++ b/db/db_impl_readonly.cc @@ -85,7 +85,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname, DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); impl->mutex_.Lock(); - VersionEdit edit(impl->NumberLevels()); + VersionEdit edit; DBOptions db_options(options); ColumnFamilyOptions cf_options(options); std::vector column_families; diff --git a/db/db_statistics.cc b/db/db_statistics.cc deleted file mode 100644 index f0cfd6740..000000000 --- a/db/db_statistics.cc +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. - -#include "db/db_statistics.h" - -namespace rocksdb { - -std::shared_ptr CreateDBStatistics() { - return std::make_shared(); -} - -} // namespace rocksdb diff --git a/db/db_statistics.h b/db/db_statistics.h deleted file mode 100644 index ec71e1688..000000000 --- a/db/db_statistics.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. 
-// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. -// -// Copyright (c) 2011 The LevelDB Authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. See the AUTHORS file for names of contributors. - -#pragma once -#include -#include -#include -#include - -#include "rocksdb/statistics.h" -#include "util/histogram.h" -#include "port/port.h" -#include "util/mutexlock.h" - - -namespace rocksdb { - -class DBStatistics: public Statistics { - public: - DBStatistics() : allTickers_(TICKER_ENUM_MAX), - allHistograms_(HISTOGRAM_ENUM_MAX) { } - - virtual ~DBStatistics() {} - - virtual long getTickerCount(Tickers tickerType) { - assert(tickerType < TICKER_ENUM_MAX); - return allTickers_[tickerType].getCount(); - } - - virtual void setTickerCount(Tickers tickerType, uint64_t count) { - assert(tickerType < TICKER_ENUM_MAX); - allTickers_[tickerType].setTickerCount(count); - } - - virtual void recordTick(Tickers tickerType, uint64_t count) { - assert(tickerType < TICKER_ENUM_MAX); - allTickers_[tickerType].recordTick(count); - } - - virtual void measureTime(Histograms histogramType, uint64_t value) { - assert(histogramType < HISTOGRAM_ENUM_MAX); - allHistograms_[histogramType].Add(value); - } - - virtual void histogramData(Histograms histogramType, - HistogramData * const data) { - assert(histogramType < HISTOGRAM_ENUM_MAX); - allHistograms_[histogramType].Data(data); - } - - std::vector allTickers_; - std::vector allHistograms_; -}; - -std::shared_ptr CreateDBStatistics(); - -} // namespace rocksdb diff --git a/db/db_stats_logger.cc b/db/db_stats_logger.cc index 91810abe3..db86865ca 100644 --- a/db/db_stats_logger.cc +++ b/db/db_stats_logger.cc @@ -65,13 +65,14 @@ void DBImpl::LogDBDeployStats() { uint64_t file_total_size = 0; 
uint32_t file_total_num = 0; - for (int i = 0; i < versions_->NumberLevels(); i++) { - file_total_num += versions_->NumLevelFiles(i); - file_total_size += versions_->NumLevelBytes(i); + Version* current = versions_->current(); + for (int i = 0; i < current->NumberLevels(); i++) { + file_total_num += current->NumLevelFiles(i); + file_total_size += current->NumLevelBytes(i); } - VersionSet::LevelSummaryStorage scratch; - const char* file_num_summary = versions_->LevelSummary(&scratch); + Version::LevelSummaryStorage scratch; + const char* file_num_summary = current->LevelSummary(&scratch); std::string file_num_per_level(file_num_summary); std::string data_size_per_level(file_num_summary); diff --git a/db/db_test.cc b/db/db_test.cc index 3659e8d84..44ce16d60 100644 --- a/db/db_test.cc +++ b/db/db_test.cc @@ -17,7 +17,6 @@ #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" -#include "db/db_statistics.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" @@ -27,6 +26,7 @@ #include "util/mutexlock.h" #include "util/testharness.h" #include "util/testutil.h" +#include "util/statistics.h" #include "utilities/merge_operators.h" namespace rocksdb { @@ -680,6 +680,10 @@ static std::string Key(int i) { return std::string(buf); } +static long TestGetTickerCount(const Options& options, Tickers ticker_type) { + return options.statistics->getTickerCount(ticker_type); +} + TEST(DBTest, Empty) { do { ASSERT_TRUE(db_ != nullptr); @@ -713,14 +717,11 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { dbfull()->Flush(FlushOptions()); // index/filter blocks added to block cache right after table creation. 
- ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_INDEX_MISS)); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(2, /* only index/filter were added */ - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); - ASSERT_EQ(0, - options.statistics.get()->getTickerCount(BLOCK_CACHE_DATA_MISS)); + TestGetTickerCount(options, BLOCK_CACHE_ADD)); + ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS)); // Make sure filter block is in cache. std::string value; @@ -728,31 +729,24 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) { db_->KeyMayExist(ReadOptions(), "key", &value); // Miss count should remain the same. - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); db_->KeyMayExist(ReadOptions(), "key", &value); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); - ASSERT_EQ(2, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); // Make sure index block is in cache. 
- auto index_block_hit = - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT); + auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT); value = Get("key"); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(index_block_hit + 1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); value = Get("key"); - ASSERT_EQ(1, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); + ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(index_block_hit + 2, - options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); + TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT)); } TEST(DBTest, LevelLimitReopen) { @@ -768,10 +762,9 @@ TEST(DBTest, LevelLimitReopen) { options.num_levels = 1; options.max_bytes_for_level_multiplier_additional.resize(1, 1); Status s = TryReopen(&options); - ASSERT_EQ(s.IsCorruption(), true); + ASSERT_EQ(s.IsInvalidArgument(), true); ASSERT_EQ(s.ToString(), - "Corruption: VersionEdit: column family already has " - "more levels than specified"); + "Invalid argument: db has more levels than options.num_levels"); options.num_levels = 10; options.max_bytes_for_level_multiplier_additional.resize(10, 1); @@ -968,47 +961,39 @@ TEST(DBTest, KeyMayExist) { dbfull()->Flush(FlushOptions()); value.clear(); - long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - long cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + long numopen = TestGetTickerCount(options, NO_FILE_OPENS); + long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); ASSERT_TRUE(!value_found); // assert that no new files were opened and no new blocks were // read into block cache. 
- ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(db_->Delete(WriteOptions(), "a")); - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); dbfull()->Flush(FlushOptions()); dbfull()->CompactRange(nullptr, nullptr); - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); ASSERT_OK(db_->Delete(WriteOptions(), "c")); - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); 
ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value)); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete options.filter_policy; } while (ChangeOptions()); @@ -1041,9 +1026,8 @@ TEST(DBTest, NonBlockingIteration) { // verify that a non-blocking iterator does not find any // kvs. Neither does it do any IOs to storage. - long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - long cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + long numopen = TestGetTickerCount(options, NO_FILE_OPENS); + long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); iter = db_->NewIterator(non_blocking_opts); count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -1051,18 +1035,16 @@ TEST(DBTest, NonBlockingIteration) { } ASSERT_EQ(count, 0); ASSERT_TRUE(iter->status().IsIncomplete()); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete iter; // read in the specified block via a regular get ASSERT_EQ(Get("a"), "b"); // verify that we can find it via a non-blocking scan - numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); - cache_added = - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD); + numopen = TestGetTickerCount(options, NO_FILE_OPENS); + cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD); iter = db_->NewIterator(non_blocking_opts); count = 0; for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { @@ -1070,9 +1052,8 @@ TEST(DBTest, NonBlockingIteration) { count++; } 
ASSERT_EQ(count, 1); - ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); - ASSERT_EQ(cache_added, - options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); + ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS)); + ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD)); delete iter; } while (ChangeOptions()); @@ -1277,12 +1258,10 @@ TEST(DBTest, IterReseek) { ASSERT_OK(Put("b", "bone")); Iterator* iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "a->two"); iter->Next(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "b->bone"); delete iter; @@ -1293,8 +1272,7 @@ TEST(DBTest, IterReseek) { iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->three"); iter->Next(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); ASSERT_EQ(IterStatus(iter), "b->bone"); delete iter; @@ -1304,30 +1282,28 @@ TEST(DBTest, IterReseek) { iter = db_->NewIterator(ReadOptions()); iter->SeekToFirst(); ASSERT_EQ(IterStatus(iter), "a->four"); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 0); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0); iter->Next(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), 1); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1); ASSERT_EQ(IterStatus(iter), "b->bone"); delete iter; // Testing reverse iterator // At this point, we have three versions of "a" and one version of "b". 
// The reseek statistics is already at 1. - int num_reseeks = (int)options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION); + int num_reseeks = + (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION); // Insert another version of b and assert that reseek is not invoked ASSERT_OK(Put("b", "btwo")); iter = db_->NewIterator(ReadOptions()); iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), "b->btwo"); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks); iter->Prev(); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 1); ASSERT_EQ(IterStatus(iter), "a->four"); delete iter; @@ -1338,13 +1314,13 @@ TEST(DBTest, IterReseek) { iter = db_->NewIterator(ReadOptions()); iter->SeekToLast(); ASSERT_EQ(IterStatus(iter), "b->bfour"); - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 2); iter->Prev(); // the previous Prev call should have invoked reseek - ASSERT_EQ(options.statistics.get()->getTickerCount( - NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3); + ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), + num_reseeks + 3); ASSERT_EQ(IterStatus(iter), "a->four"); delete iter; } @@ -2107,24 +2083,18 @@ TEST(DBTest, CompressedCache) { switch (iter) { case 0: // only uncompressed block cache - ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), - 0); - ASSERT_EQ(options.statistics.get()->getTickerCount - (BLOCK_CACHE_COMPRESSED_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); break; case 1: // no 
block cache, only compressed cache - ASSERT_EQ(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), - 0); - ASSERT_GT(options.statistics.get()->getTickerCount - (BLOCK_CACHE_COMPRESSED_MISS), 0); + ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); break; case 2: // both compressed and uncompressed block cache - ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), - 0); - ASSERT_GT(options.statistics.get()->getTickerCount - (BLOCK_CACHE_COMPRESSED_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0); + ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0); break; default: ASSERT_TRUE(false); @@ -3313,34 +3283,46 @@ TEST(DBTest, ManualCompaction) { ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) << "Need to update this test to match kMaxMemCompactLevel"; - MakeTables(3, "p", "q"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // iter - 0 with 7 levels + // iter - 1 with 3 levels + for (int iter = 0; iter < 2; ++iter) { + MakeTables(3, "p", "q"); + ASSERT_EQ("1,1,1", FilesPerLevel()); + + // Compaction range falls before files + Compact("", "c"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range falls before files - Compact("", "c"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // Compaction range falls after files + Compact("r", "z"); + ASSERT_EQ("1,1,1", FilesPerLevel()); - // Compaction range falls after files - Compact("r", "z"); - ASSERT_EQ("1,1,1", FilesPerLevel()); + // Compaction range overlaps files + Compact("p1", "p9"); + ASSERT_EQ("0,0,1", FilesPerLevel()); - // Compaction range overlaps files - Compact("p1", "p9"); - ASSERT_EQ("0,0,1", FilesPerLevel()); + // Populate a different range + MakeTables(3, "c", "e"); + ASSERT_EQ("1,1,2", FilesPerLevel()); - // Populate a different range - MakeTables(3, "c", "e"); - ASSERT_EQ("1,1,2", FilesPerLevel()); + // Compact just the new range + Compact("b", "f"); + 
ASSERT_EQ("0,0,2", FilesPerLevel()); - // Compact just the new range - Compact("b", "f"); - ASSERT_EQ("0,0,2", FilesPerLevel()); + // Compact all + MakeTables(1, "a", "z"); + ASSERT_EQ("0,1,2", FilesPerLevel()); + db_->CompactRange(nullptr, nullptr); + ASSERT_EQ("0,0,1", FilesPerLevel()); + + if (iter == 0) { + Options options = CurrentOptions(); + options.num_levels = 3; + options.create_if_missing = true; + DestroyAndReopen(&options); + } + } - // Compact all - MakeTables(1, "a", "z"); - ASSERT_EQ("0,1,2", FilesPerLevel()); - db_->CompactRange(nullptr, nullptr); - ASSERT_EQ("0,0,1", FilesPerLevel()); } TEST(DBTest, DBOpen_Options) { @@ -3401,7 +3383,7 @@ TEST(DBTest, DBOpen_Change_NumLevels) { opts.create_if_missing = false; opts.num_levels = 2; s = DB::Open(opts, dbname, &db); - ASSERT_TRUE(strstr(s.ToString().c_str(), "Corruption") != nullptr); + ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr); ASSERT_TRUE(db == nullptr); } @@ -4336,6 +4318,70 @@ TEST(DBTest, MultiThreaded) { } while (ChangeOptions()); } +// Group commit test: +namespace { + +static const int kGCNumThreads = 4; +static const int kGCNumKeys = 1000; + +struct GCThread { + DB* db; + int id; + std::atomic done; +}; + +static void GCThreadBody(void* arg) { + GCThread* t = reinterpret_cast(arg); + int id = t->id; + DB* db = t->db; + WriteOptions wo; + + for (int i = 0; i < kGCNumKeys; ++i) { + std::string kv(std::to_string(i + id * kGCNumKeys)); + ASSERT_OK(db->Put(wo, kv, kv)); + } + t->done = true; +} + +} // namespace + +TEST(DBTest, GroupCommitTest) { + do { + // Start threads + GCThread thread[kGCNumThreads]; + for (int id = 0; id < kGCNumThreads; id++) { + thread[id].id = id; + thread[id].db = db_; + thread[id].done = false; + env_->StartThread(GCThreadBody, &thread[id]); + } + + for (int id = 0; id < kGCNumThreads; id++) { + while (thread[id].done == false) { + env_->SleepForMicroseconds(100000); + } + } + + std::vector expected_db; + for (int i = 0; i < kGCNumThreads 
* kGCNumKeys; ++i) { + expected_db.push_back(std::to_string(i)); + } + sort(expected_db.begin(), expected_db.end()); + + Iterator* itr = db_->NewIterator(ReadOptions()); + itr->SeekToFirst(); + for (auto x : expected_db) { + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ(itr->key().ToString(), x); + ASSERT_EQ(itr->value().ToString(), x); + itr->Next(); + } + ASSERT_TRUE(!itr->Valid()); + delete itr; + + } while (ChangeOptions()); +} + namespace { typedef std::map KVMap; } @@ -4903,7 +4949,7 @@ void BM_LogAndApply(int iters, int num_base_files) { EnvOptions sopt; VersionSet vset(dbname, &options, sopt, nullptr, &cmp); ASSERT_OK(vset.Recover()); - VersionEdit vbase(vset.NumberLevels()); + VersionEdit vbase; uint64_t fnum = 1; for (int i = 0; i < num_base_files; i++) { InternalKey start(MakeKey(2*fnum), 1, kTypeValue); @@ -4915,7 +4961,7 @@ void BM_LogAndApply(int iters, int num_base_files) { uint64_t start_micros = env->NowMicros(); for (int i = 0; i < iters; i++) { - VersionEdit vedit(vset.NumberLevels()); + VersionEdit vedit; vedit.DeleteFile(2, fnum); InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); diff --git a/db/memtable.cc b/db/memtable.cc index 796ba1b3a..bf2dfa64b 100644 --- a/db/memtable.cc +++ b/db/memtable.cc @@ -20,7 +20,7 @@ #include "util/coding.h" #include "util/mutexlock.h" #include "util/murmurhash.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" namespace std { template <> @@ -33,24 +33,20 @@ struct hash { namespace rocksdb { -MemTable::MemTable(const InternalKeyComparator& cmp, - MemTableRepFactory* table_factory, - int numlevel, - const Options& options) +MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options) : comparator_(cmp), refs_(0), arena_impl_(options.arena_block_size), - table_(table_factory->CreateMemTableRep(comparator_, &arena_impl_)), + table_(options.memtable_factory->CreateMemTableRep(comparator_, + &arena_impl_)), flush_in_progress_(false), 
flush_completed_(false), file_number_(0), - edit_(numlevel), first_seqno_(0), mem_next_logfile_number_(0), mem_logfile_number_(0), - locks_(options.inplace_update_support - ? options.inplace_update_num_locks - : 0) { } + locks_(options.inplace_update_support ? options.inplace_update_num_locks + : 0) {} MemTable::~MemTable() { assert(refs_ == 0); @@ -58,7 +54,7 @@ MemTable::~MemTable() { size_t MemTable::ApproximateMemoryUsage() { return arena_impl_.ApproximateMemoryUsage() + - table_->ApproximateMemoryUsage(); + table_->ApproximateMemoryUsage(); } int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) @@ -89,11 +85,11 @@ class MemTableIterator: public Iterator { MemTableIterator(MemTableRep* table, const ReadOptions& options) : iter_() { if (options.prefix) { - iter_ = table->GetPrefixIterator(*options.prefix); + iter_.reset(table->GetPrefixIterator(*options.prefix)); } else if (options.prefix_seek) { - iter_ = table->GetDynamicPrefixIterator(); + iter_.reset(table->GetDynamicPrefixIterator()); } else { - iter_ = table->GetIterator(); + iter_.reset(table->GetIterator()); } } @@ -114,7 +110,7 @@ class MemTableIterator: public Iterator { virtual Status status() const { return Status::OK(); } private: - std::shared_ptr iter_; + std::unique_ptr iter_; std::string tmp_; // For passing to EncodeKey // No copying allowed @@ -165,8 +161,8 @@ void MemTable::Add(SequenceNumber s, ValueType type, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, MergeContext& merge_context, const Options& options) { Slice memkey = key.memtable_key(); - std::shared_ptr iter( - table_->GetIterator(key.user_key())); + std::unique_ptr iter( + table_->GetIterator(key.user_key())); iter->Seek(memkey.data()); bool merge_in_progress = s->IsMergeInProgress(); @@ -274,8 +270,8 @@ bool MemTable::Update(SequenceNumber seq, ValueType type, LookupKey lkey(key, seq); Slice memkey = lkey.memtable_key(); - std::shared_ptr iter( - 
table_->GetIterator(lkey.user_key())); + std::unique_ptr iter( + table_->GetIterator(lkey.user_key())); iter->Seek(memkey.data()); if (iter->Valid()) { @@ -336,8 +332,8 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) { // A total ordered iterator is costly for some memtablerep (prefix aware // reps). By passing in the user key, we allow efficient iterator creation. // The iterator only needs to be ordered within the same user key. - std::shared_ptr iter( - table_->GetIterator(key.user_key())); + std::unique_ptr iter( + table_->GetIterator(key.user_key())); iter->Seek(memkey.data()); size_t num_successive_merges = 0; diff --git a/db/memtable.h b/db/memtable.h index 12ccf3d37..1b9005800 100644 --- a/db/memtable.h +++ b/db/memtable.h @@ -34,11 +34,8 @@ class MemTable { // MemTables are reference counted. The initial reference count // is zero and the caller must call Ref() at least once. - explicit MemTable( - const InternalKeyComparator& comparator, - MemTableRepFactory* table_factory, - int numlevel = 7, - const Options& options = Options()); + explicit MemTable(const InternalKeyComparator& comparator, + const Options& options = Options()); ~MemTable(); @@ -146,7 +143,7 @@ class MemTable { KeyComparator comparator_; int refs_; ArenaImpl arena_impl_; - shared_ptr table_; + unique_ptr table_; // These are used to manage memtable flushes to storage bool flush_in_progress_; // started the flush diff --git a/db/merge_helper.cc b/db/merge_helper.cc index a7e2df0a3..e3f3adb1f 100644 --- a/db/merge_helper.cc +++ b/db/merge_helper.cc @@ -8,7 +8,7 @@ #include "rocksdb/comparator.h" #include "rocksdb/db.h" #include "rocksdb/merge_operator.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" #include #include diff --git a/db/repair.cc b/db/repair.cc index 6db90c865..29524233f 100644 --- a/db/repair.cc +++ b/db/repair.cc @@ -58,7 +58,7 @@ class Repairer { next_file_number_(1) { // TableCache can be small since we expect each table to be opened 
once. table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); - edit_ = new VersionEdit(options.num_levels); + edit_ = new VersionEdit(); } ~Repairer() { @@ -196,8 +196,7 @@ class Repairer { std::string scratch; Slice record; WriteBatch batch; - MemTable* mem = new MemTable(icmp_, options_.memtable_factory.get(), - options_.num_levels); + MemTable* mem = new MemTable(icmp_, options_); mem->Ref(); int counter = 0; while (reader.ReadRecord(&record, &scratch)) { diff --git a/db/simple_table_db_test.cc b/db/simple_table_db_test.cc index 555d31893..0f3b89d9b 100644 --- a/db/simple_table_db_test.cc +++ b/db/simple_table_db_test.cc @@ -17,7 +17,7 @@ #include "db/filename.h" #include "db/version_set.h" #include "db/write_batch_internal.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" #include "rocksdb/cache.h" #include "rocksdb/compaction_filter.h" #include "rocksdb/env.h" diff --git a/db/version_edit.cc b/db/version_edit.cc index 2fc6fbb65..5de96b887 100644 --- a/db/version_edit.cc +++ b/db/version_edit.cc @@ -38,6 +38,7 @@ enum Tag { void VersionEdit::Clear() { comparator_.clear(); + max_level_ = 0; log_number_ = 0; prev_log_number_ = 0; last_sequence_ = 0; @@ -77,12 +78,6 @@ void VersionEdit::EncodeTo(std::string* dst) const { PutVarint64(dst, last_sequence_); } - for (size_t i = 0; i < compact_pointers_.size(); i++) { - PutVarint32(dst, kCompactPointer); - PutVarint32(dst, compact_pointers_[i].first); // level - PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode()); - } - for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); iter != deleted_files_.end(); ++iter) { @@ -131,14 +126,13 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) { bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { uint32_t v; - if (GetVarint32(input, &v) && - (int)v < number_levels_) { + if (GetVarint32(input, &v)) { *level = v; + if (max_level_ < *level) { + max_level_ = *level; + } return true; } else { - 
if ((int)v >= number_levels_) { - *msg = "column family already has more levels than specified"; - } return false; } } @@ -202,7 +196,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) { case kCompactPointer: if (GetLevel(&input, &level, &msg) && GetInternalKey(&input, &key)) { - compact_pointers_.push_back(std::make_pair(level, key)); + // we don't use compact pointers anymore, + // but we should not fail if they are still + // in manifest } else { if (!msg) { msg = "compaction pointer"; @@ -314,12 +310,6 @@ std::string VersionEdit::DebugString(bool hex_key) const { r.append("\n LastSeq: "); AppendNumberTo(&r, last_sequence_); } - for (size_t i = 0; i < compact_pointers_.size(); i++) { - r.append("\n CompactPointer: "); - AppendNumberTo(&r, compact_pointers_[i].first); - r.append(" "); - r.append(compact_pointers_[i].second.DebugString(hex_key)); - } for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); iter != deleted_files_.end(); ++iter) { diff --git a/db/version_edit.h b/db/version_edit.h index d79642e2c..b7dfa6d03 100644 --- a/db/version_edit.h +++ b/db/version_edit.h @@ -34,9 +34,7 @@ struct FileMetaData { class VersionEdit { public: - explicit VersionEdit(int number_levels) : number_levels_(number_levels) { - Clear(); - } + VersionEdit() { Clear(); } ~VersionEdit() { } void Clear(); @@ -61,9 +59,6 @@ class VersionEdit { has_last_sequence_ = true; last_sequence_ = seq; } - void SetCompactPointer(int level, const InternalKey& key) { - compact_pointers_.push_back(std::make_pair(level, key)); - } // Add the specified file at the specified number. 
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) @@ -128,7 +123,7 @@ class VersionEdit { bool GetLevel(Slice* input, int* level, const char** msg); - int number_levels_; + int max_level_; std::string comparator_; uint64_t log_number_; uint64_t prev_log_number_; @@ -140,7 +135,6 @@ class VersionEdit { bool has_next_file_number_; bool has_last_sequence_; - std::vector< std::pair > compact_pointers_; DeletedFileSet deleted_files_; std::vector< std::pair > new_files_; diff --git a/db/version_edit_test.cc b/db/version_edit_test.cc index 491fabb89..83d7fc9b3 100644 --- a/db/version_edit_test.cc +++ b/db/version_edit_test.cc @@ -15,7 +15,7 @@ namespace rocksdb { static void TestEncodeDecode(const VersionEdit& edit) { std::string encoded, encoded2; edit.EncodeTo(&encoded); - VersionEdit parsed(7); + VersionEdit parsed; Status s = parsed.DecodeFrom(encoded); ASSERT_TRUE(s.ok()) << s.ToString(); parsed.EncodeTo(&encoded2); @@ -27,7 +27,7 @@ class VersionEditTest { }; TEST(VersionEditTest, EncodeDecode) { static const uint64_t kBig = 1ull << 50; - VersionEdit edit(7); + VersionEdit edit; for (int i = 0; i < 4; i++) { TestEncodeDecode(edit); edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, @@ -36,7 +36,6 @@ TEST(VersionEditTest, EncodeDecode) { kBig + 500 + i, kBig + 600 + i); edit.DeleteFile(4, kBig + 700 + i); - edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue)); } edit.SetComparatorName("foo"); @@ -47,7 +46,7 @@ TEST(VersionEditTest, EncodeDecode) { } TEST(VersionEditTest, ColumnFamilyTest) { - VersionEdit edit(7); + VersionEdit edit; edit.SetColumnFamily(2); edit.AddColumnFamily("column_family"); TestEncodeDecode(edit); diff --git a/db/version_set.cc b/db/version_set.cc index ad1169189..f9d04bf37 100644 --- a/db/version_set.cc +++ b/db/version_set.cc @@ -18,6 +18,7 @@ #include "db/memtable.h" #include "db/merge_context.h" #include "db/table_cache.h" +#include "db/compaction.h" #include "rocksdb/env.h" #include 
"rocksdb/merge_operator.h" #include "rocksdb/table.h" @@ -45,7 +46,7 @@ Version::~Version() { next_->prev_ = prev_; // Drop references to files - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { for (size_t i = 0; i < files_[level].size(); i++) { FileMetaData* f = files_[level][i]; assert(f->refs > 0); @@ -265,7 +266,7 @@ void Version::AddIterators(const ReadOptions& options, // For levels > 0, we can use a concatenating iterator that sequentially // walks through the non-overlapping files in the level, opening them // lazily. - for (int level = 1; level < vset_->NumberLevels(); level++) { + for (int level = 1; level < num_levels_; level++) { if (!files_[level].empty()) { iters->push_back(NewConcatenatingIterator(options, soptions, level)); } @@ -407,16 +408,19 @@ static bool NewestFirstBySeqNo(FileMetaData* a, FileMetaData* b) { } Version::Version(VersionSet* vset, uint64_t version_number) - : vset_(vset), next_(this), prev_(this), refs_(0), - files_(new std::vector[vset->NumberLevels()]), - files_by_size_(vset->NumberLevels()), - next_file_to_compact_by_size_(vset->NumberLevels()), + : vset_(vset), + next_(this), + prev_(this), + refs_(0), + num_levels_(vset->num_levels_), + files_(new std::vector[num_levels_]), + files_by_size_(num_levels_), + next_file_to_compact_by_size_(num_levels_), file_to_compact_(nullptr), file_to_compact_level_(-1), - compaction_score_(vset->NumberLevels()), - compaction_level_(vset->NumberLevels()), - version_number_(version_number) { -} + compaction_score_(num_levels_), + compaction_level_(num_levels_), + version_number_(version_number) {} void Version::Get(const ReadOptions& options, const LookupKey& k, @@ -455,7 +459,7 @@ void Version::Get(const ReadOptions& options, // levels. Therefore we are guaranteed that if we find data // in an smaller level, later levels are irrelevant (unless we // are MergeInProgress). 
- for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { size_t num_files = files_[level].size(); if (num_files == 0) continue; @@ -589,6 +593,159 @@ bool Version::UpdateStats(const GetStats& stats) { return false; } +void Version::Finalize(std::vector& size_being_compacted) { + // Pre-sort level0 for Get() + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + std::sort(files_[0].begin(), files_[0].end(), NewestFirstBySeqNo); + } else { + std::sort(files_[0].begin(), files_[0].end(), NewestFirst); + } + + double max_score = 0; + int max_score_level = 0; + + int num_levels_to_check = + (vset_->options_->compaction_style != kCompactionStyleUniversal) + ? NumberLevels() - 1 + : 1; + + for (int level = 0; level < num_levels_to_check; level++) { + double score; + if (level == 0) { + // We treat level-0 specially by bounding the number of files + // instead of number of bytes for two reasons: + // + // (1) With larger write-buffer sizes, it is nice not to do too + // many level-0 compactions. + // + // (2) The files in level-0 are merged on every read and + // therefore we wish to avoid too many files when the individual + // file size is small (perhaps because of a small write-buffer + // setting, or very high compression ratios, or lots of + // overwrites/deletions). 
+ int numfiles = 0; + for (unsigned int i = 0; i < files_[level].size(); i++) { + if (!files_[level][i]->being_compacted) { + numfiles++; + } + } + + // If we are slowing down writes, then we better compact that first + if (numfiles >= vset_->options_->level0_stop_writes_trigger) { + score = 1000000; + // Log(options_->info_log, "XXX score l0 = 1000000000 max"); + } else if (numfiles >= vset_->options_->level0_slowdown_writes_trigger) { + score = 10000; + // Log(options_->info_log, "XXX score l0 = 1000000 medium"); + } else { + score = static_cast(numfiles) / + vset_->options_->level0_file_num_compaction_trigger; + if (score >= 1) { + // Log(options_->info_log, "XXX score l0 = %d least", (int)score); + } + } + } else { + // Compute the ratio of current size to size limit. + const uint64_t level_bytes = + TotalFileSize(files_[level]) - size_being_compacted[level]; + score = static_cast(level_bytes) / vset_->MaxBytesForLevel(level); + if (score > 1) { + // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); + } + if (max_score < score) { + max_score = score; + max_score_level = level; + } + } + compaction_level_[level] = level; + compaction_score_[level] = score; + } + + // update the max compaction score in levels 1 to n-1 + max_compaction_score_ = max_score; + max_compaction_score_level_ = max_score_level; + + // sort all the levels based on their score. Higher scores get listed + // first. Use bubble sort because the number of entries are small. 
+ for (int i = 0; i < NumberLevels() - 2; i++) { + for (int j = i + 1; j < NumberLevels() - 1; j++) { + if (compaction_score_[i] < compaction_score_[j]) { + double score = compaction_score_[i]; + int level = compaction_level_[i]; + compaction_score_[i] = compaction_score_[j]; + compaction_level_[i] = compaction_level_[j]; + compaction_score_[j] = score; + compaction_level_[j] = level; + } + } + } +} + +namespace { + +// Compator that is used to sort files based on their size +// In normal mode: descending size +bool CompareSizeDescending(const Version::Fsize& first, + const Version::Fsize& second) { + return (first.file->file_size > second.file->file_size); +} +// A static compator used to sort files based on their seqno +// In universal style : descending seqno +bool CompareSeqnoDescending(const Version::Fsize& first, + const Version::Fsize& second) { + if (first.file->smallest_seqno > second.file->smallest_seqno) { + assert(first.file->largest_seqno > second.file->largest_seqno); + return true; + } + assert(first.file->largest_seqno <= second.file->largest_seqno); + return false; +} + +} // anonymous namespace + +void Version::UpdateFilesBySize() { + // No need to sort the highest level because it is never compacted. + int max_level = + (vset_->options_->compaction_style == kCompactionStyleUniversal) + ? 
NumberLevels() + : NumberLevels() - 1; + + for (int level = 0; level < max_level; level++) { + const std::vector& files = files_[level]; + std::vector& files_by_size = files_by_size_[level]; + assert(files_by_size.size() == 0); + + // populate a temp vector for sorting based on size + std::vector temp(files.size()); + for (unsigned int i = 0; i < files.size(); i++) { + temp[i].index = i; + temp[i].file = files[i]; + } + + // sort the top number_of_files_to_sort_ based on file size + if (vset_->options_->compaction_style == kCompactionStyleUniversal) { + int num = temp.size(); + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSeqnoDescending); + } else { + int num = Version::number_of_files_to_sort_; + if (num > (int)temp.size()) { + num = temp.size(); + } + std::partial_sort(temp.begin(), temp.begin() + num, temp.end(), + CompareSizeDescending); + } + assert(temp.size() == files.size()); + + // initialize files_by_size_ + for (unsigned int i = 0; i < temp.size(); i++) { + files_by_size.push_back(temp[i].index); + } + next_file_to_compact_by_size_[level] = 0; + assert(files_[level].size() == files_by_size_[level].size()); + } +} + void Version::Ref() { ++refs_; } @@ -626,13 +783,13 @@ int Version::PickLevelForMemTableOutput( if (OverlapInLevel(level + 1, &smallest_user_key, &largest_user_key)) { break; } - if (level + 2 >= vset_->NumberLevels()) { + if (level + 2 >= num_levels_) { level++; break; } GetOverlappingInputs(level + 2, &start, &limit, &overlaps); const uint64_t sum = TotalFileSize(overlaps); - if (sum > vset_->MaxGrandParentOverlapBytes(level)) { + if (sum > vset_->compaction_picker_->MaxGrandParentOverlapBytes(level)) { break; } level++; @@ -858,9 +1015,70 @@ bool Version::HasOverlappingUserKey( return false; } +int64_t Version::NumLevelBytes(int level) const { + assert(level >= 0); + assert(level < NumberLevels()); + return TotalFileSize(files_[level]); +} + +const char* Version::LevelSummary(LevelSummaryStorage* scratch) const 
{ + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); + for (int i = 0; i < NumberLevels(); i++) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, "%d ", int(files_[i].size())); + if (ret < 0 || ret >= sz) break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +const char* Version::LevelFileSummary(FileSummaryStorage* scratch, + int level) const { + int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); + for (const auto& f : files_[level]) { + int sz = sizeof(scratch->buffer) - len; + int ret = snprintf(scratch->buffer + len, sz, + "#%lu(seq=%lu,sz=%lu,%lu) ", + (unsigned long)f->number, + (unsigned long)f->smallest_seqno, + (unsigned long)f->file_size, + (unsigned long)f->being_compacted); + if (ret < 0 || ret >= sz) + break; + len += ret; + } + snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); + return scratch->buffer; +} + +int64_t Version::MaxNextLevelOverlappingBytes() { + uint64_t result = 0; + std::vector overlaps; + for (int level = 1; level < NumberLevels() - 1; level++) { + for (const auto& f : files_[level]) { + GetOverlappingInputs(level + 1, &f->smallest, &f->largest, &overlaps); + const uint64_t sum = TotalFileSize(overlaps); + if (sum > result) { + result = sum; + } + } + } + return result; +} + +void Version::AddLiveFiles(std::set* live) { + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files = files_[level]; + for (const auto& file : files) { + live->insert(file->number); + } + } +} + std::string Version::DebugString(bool hex) const { std::string r; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < num_levels_; level++) { // E.g., // --- level 1 --- // 17:123['a' .. 
'd'] @@ -929,20 +1147,18 @@ class VersionSet::Builder { public: // Initialize a builder with the files from *base and other info from *vset - Builder(VersionSet* vset, Version* base) - : vset_(vset), - base_(base) { + Builder(VersionSet* vset, Version* base) : vset_(vset), base_(base) { base_->Ref(); - levels_ = new LevelState[vset_->NumberLevels()]; + levels_ = new LevelState[base->NumberLevels()]; BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base->NumberLevels(); level++) { levels_[level].added_files = new FileSet(cmp); } } ~Builder() { - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { const FileSet* added = levels_[level].added_files; std::vector to_unref; to_unref.reserve(added->size()); @@ -965,7 +1181,7 @@ class VersionSet::Builder { void CheckConsistency(Version* v) { #ifndef NDEBUG - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { // Make sure there is no overlap in levels > 0 if (level > 0) { for (uint32_t i = 1; i < v->files_[level].size(); i++) { @@ -983,14 +1199,12 @@ class VersionSet::Builder { #endif } - void CheckConsistencyForDeletes( - VersionEdit* edit, - unsigned int number, - int level) { + void CheckConsistencyForDeletes(VersionEdit* edit, unsigned int number, + int level) { #ifndef NDEBUG // a file to be deleted better exist in the previous version bool found = false; - for (int l = 0; !found && l < vset_->NumberLevels(); l++) { + for (int l = 0; !found && l < base_->NumberLevels(); l++) { const std::vector& base_files = base_->files_[l]; for (unsigned int i = 0; i < base_files.size(); i++) { FileMetaData* f = base_files[i]; @@ -1003,7 +1217,7 @@ class VersionSet::Builder { // if the file did not exist in the previous version, then it // is possibly moved from lower level to higher level 
in current // version - for (int l = level+1; !found && l < vset_->NumberLevels(); l++) { + for (int l = level+1; !found && l < base_->NumberLevels(); l++) { const FileSet* added = levels_[l].added_files; for (FileSet::const_iterator added_iter = added->begin(); added_iter != added->end(); ++added_iter) { @@ -1035,13 +1249,6 @@ class VersionSet::Builder { void Apply(VersionEdit* edit) { CheckConsistency(base_); - // Update compaction pointers - for (size_t i = 0; i < edit->compact_pointers_.size(); i++) { - const int level = edit->compact_pointers_[i].first; - vset_->compact_pointer_[level] = - edit->compact_pointers_[i].second.Encode().ToString(); - } - // Delete files const VersionEdit::DeletedFileSet& del = edit->deleted_files_; for (VersionEdit::DeletedFileSet::const_iterator iter = del.begin(); @@ -1086,7 +1293,7 @@ class VersionSet::Builder { CheckConsistency(v); BySmallestKey cmp; cmp.internal_comparator = &vset_->icmp_; - for (int level = 0; level < vset_->NumberLevels(); level++) { + for (int level = 0; level < base_->NumberLevels(); level++) { // Merge the set of added files with the set of pre-existing files. // Drop any deleted files. Store the result in *v. 
const std::vector& base_files = base_->files_[level]; @@ -1133,8 +1340,7 @@ class VersionSet::Builder { } }; -VersionSet::VersionSet(const std::string& dbname, - const Options* options, +VersionSet::VersionSet(const std::string& dbname, const Options* options, const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator* cmp) @@ -1149,13 +1355,16 @@ VersionSet::VersionSet(const std::string& dbname, log_number_(0), prev_log_number_(0), num_levels_(options_->num_levels), - compactions_in_progress_(options_->num_levels), + need_slowdown_for_num_level0_files_(false), current_version_number_(0), manifest_file_size_(0), storage_options_(storage_options), - storage_options_compactions_(storage_options_) { - compact_pointer_ = new std::string[options_->num_levels]; - Init(options_->num_levels); + storage_options_compactions_(storage_options_) { + if (options_->compaction_style == kCompactionStyleUniversal) { + compaction_picker_.reset(new UniversalCompactionPicker(options_, &icmp_)); + } else { + compaction_picker_.reset(new LevelCompactionPicker(options_, &icmp_)); + } } VersionSet::~VersionSet() { @@ -1169,29 +1378,6 @@ VersionSet::~VersionSet() { delete file; } obsolete_files_.clear(); - delete[] compact_pointer_; - delete[] max_file_size_; - delete[] level_max_bytes_; -} - -void VersionSet::Init(int num_levels) { - max_file_size_ = new uint64_t[num_levels]; - level_max_bytes_ = new uint64_t[num_levels]; - int target_file_size_multiplier = options_->target_file_size_multiplier; - int max_bytes_multiplier = options_->max_bytes_for_level_multiplier; - for (int i = 0; i < num_levels; i++) { - if (i == 0 && options_->compaction_style == kCompactionStyleUniversal) { - max_file_size_[i] = ULLONG_MAX; - level_max_bytes_[i] = options_->max_bytes_for_level_base; - } else if (i > 1) { - max_file_size_[i] = max_file_size_[i-1] * target_file_size_multiplier; - level_max_bytes_[i] = level_max_bytes_[i-1] * max_bytes_multiplier * - 
options_->max_bytes_for_level_multiplier_additional[i-1]; - } else { - max_file_size_[i] = options_->target_file_size_base; - level_max_bytes_[i] = options_->max_bytes_for_level_base; - } - } } void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, @@ -1204,6 +1390,9 @@ void VersionSet::AppendVersion(ColumnFamilyData* column_family_data, column_family_data->current->Unref(); } column_family_data->current = v; + need_slowdown_for_num_level0_files_ = + (options_->level0_slowdown_writes_trigger >= 0 && + v->NumLevelFiles(0) >= options_->level0_slowdown_writes_trigger); v->Ref(); // Append to linked list @@ -1269,8 +1458,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // because &w is ensuring that all new writes get queued. { // calculate the amount of data being compacted at every level - std::vector size_being_compacted(NumberLevels()-1); - SizeBeingCompacted(size_being_compacted); + std::vector size_being_compacted(v->NumberLevels() - 1); + compaction_picker_->SizeBeingCompacted(size_being_compacted); mu->Unlock(); @@ -1288,8 +1477,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, // The calls to Finalize and UpdateFilesBySize are cpu-heavy // and is best called outside the mutex. 
- Finalize(v, size_being_compacted); - UpdateFilesBySize(v); + v->Finalize(size_being_compacted); + v->UpdateFilesBySize(); // Write new record to MANIFEST log if (s.ok()) { @@ -1382,10 +1571,8 @@ Status VersionSet::LogAndApply(ColumnFamilyData* column_family_data, return s; } -void VersionSet::LogAndApplyHelper(Builder* builder, - Version* v, - VersionEdit* edit, - port::Mutex* mu) { +void VersionSet::LogAndApplyHelper(Builder* builder, Version* v, + VersionEdit* edit, port::Mutex* mu) { mu->AssertHeld(); if (edit->has_log_number_) { @@ -1450,7 +1637,7 @@ Status VersionSet::Recover() { std::unordered_map builders; // add default column family - VersionEdit default_cf_edit(0); + VersionEdit default_cf_edit; default_cf_edit.AddColumnFamily(default_column_family_name); default_cf_edit.SetColumnFamily(0); ColumnFamilyData* default_cfd = @@ -1465,20 +1652,18 @@ Status VersionSet::Recover() { Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); - if (s.ok()) { - if (edit.has_comparator_ && - edit.comparator_ != icmp_.user_comparator()->Name()) { - s = Status::InvalidArgument(icmp_.user_comparator()->Name(), - "does not match existing comparator " + - edit.comparator_); - } - } - if (!s.ok()) { break; } + if (edit.has_comparator_ && + edit.comparator_ != icmp_.user_comparator()->Name()) { + s = Status::InvalidArgument( + icmp_.user_comparator()->Name(), + "does not match existing comparator " + edit.comparator_); + break; + } if (edit.is_column_family_add_) { ColumnFamilyData* new_cfd = @@ -1492,6 +1677,14 @@ Status VersionSet::Recover() { builders.erase(builder); DropColumnFamily(&edit); } else { + auto cfd = column_family_data_.find(edit.column_family_); + assert(cfd != column_family_data_.end()); + if (edit.max_level_ >= cfd->second->current->NumberLevels()) { + s = Status::InvalidArgument( + "db has more levels than options.num_levels"); + 
break; + } + // if it isn't column family add or column family drop, // then it's a file add/delete, which should be forwarded // to builder @@ -1546,9 +1739,9 @@ Status VersionSet::Recover() { builders[cfd.first]->SaveTo(v); // Install recovered version - std::vector size_being_compacted(NumberLevels()-1); - SizeBeingCompacted(size_being_compacted); - Finalize(v, size_being_compacted); + std::vector size_being_compacted(v->NumberLevels() - 1); + compaction_picker_->SizeBeingCompacted(size_being_compacted); + v->Finalize(size_being_compacted); AppendVersion(cfd.second, v); } @@ -1578,10 +1771,8 @@ Status VersionSet::Recover() { return s; } -Status VersionSet::DumpManifest(Options& options, - std::string& dscname, - bool verbose, - bool hex) { +Status VersionSet::DumpManifest(Options& options, std::string& dscname, + bool verbose, bool hex) { struct LogReporter : public log::Reader::Reporter { Status* status; virtual void Corruption(size_t bytes, const Status& s) { @@ -1617,7 +1808,7 @@ Status VersionSet::DumpManifest(Options& options, Slice record; std::string scratch; while (reader.ReadRecord(&record, &scratch) && s.ok()) { - VersionEdit edit(NumberLevels()); + VersionEdit edit; s = edit.DecodeFrom(record); if (s.ok()) { if (edit.has_comparator_ && @@ -1711,285 +1902,64 @@ void VersionSet::MarkFileNumberUsed(uint64_t number) { } } -void VersionSet::Finalize(Version* v, - std::vector& size_being_compacted) { - // Pre-sort level0 for Get() - if (options_->compaction_style == kCompactionStyleUniversal) { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirstBySeqNo); - } else { - std::sort(v->files_[0].begin(), v->files_[0].end(), NewestFirst); - } - - double max_score = 0; - int max_score_level = 0; - - int num_levels_to_check = - (options_->compaction_style != kCompactionStyleUniversal) ? 
- NumberLevels() - 1 : 1; - - for (int level = 0; level < num_levels_to_check; level++) { +Status VersionSet::WriteSnapshot(log::Writer* log) { + // TODO: Break up into multiple records to reduce memory usage on recovery? - double score; - if (level == 0) { - // We treat level-0 specially by bounding the number of files - // instead of number of bytes for two reasons: - // - // (1) With larger write-buffer sizes, it is nice not to do too - // many level-0 compactions. - // - // (2) The files in level-0 are merged on every read and - // therefore we wish to avoid too many files when the individual - // file size is small (perhaps because of a small write-buffer - // setting, or very high compression ratios, or lots of - // overwrites/deletions). - int numfiles = 0; - for (unsigned int i = 0; i < v->files_[level].size(); i++) { - if (!v->files_[level][i]->being_compacted) { - numfiles++; + for (auto cfd : column_family_data_) { + { + // Store column family info + VersionEdit edit; + if (cfd.first != 0) { + // default column family is always there, + // no need to explicitly write it + edit.AddColumnFamily(cfd.second->name); + edit.SetColumnFamily(cfd.first); + std::string record; + edit.EncodeTo(&record); + Status s = log->AddRecord(record); + if (!s.ok()) { + return s; } } + } - // If we are slowing down writes, then we better compact that first - if (numfiles >= options_->level0_stop_writes_trigger) { - score = 1000000; - // Log(options_->info_log, "XXX score l0 = 1000000000 max"); - } else if (numfiles >= options_->level0_slowdown_writes_trigger) { - score = 10000; - // Log(options_->info_log, "XXX score l0 = 1000000 medium"); - } else { - score = numfiles / - static_cast(options_->level0_file_num_compaction_trigger); - if (score >= 1) { - // Log(options_->info_log, "XXX score l0 = %d least", (int)score); + { + // Save files + VersionEdit edit; + edit.SetColumnFamily(cfd.first); + + for (int level = 0; level < NumberLevels(); level++) { + const std::vector& files 
= + cfd.second->current->files_[level]; + for (size_t i = 0; i < files.size(); i++) { + const FileMetaData* f = files[i]; + edit.AddFile(level, + f->number, + f->file_size, + f->smallest, + f->largest, + f->smallest_seqno, + f->largest_seqno); } } - } else { - // Compute the ratio of current size to size limit. - const uint64_t level_bytes = TotalFileSize(v->files_[level]) - - size_being_compacted[level]; - score = static_cast(level_bytes) / MaxBytesForLevel(level); - if (score > 1) { - // Log(options_->info_log, "XXX score l%d = %d ", level, (int)score); - } - if (max_score < score) { - max_score = score; - max_score_level = level; - } - } - v->compaction_level_[level] = level; - v->compaction_score_[level] = score; - } - - // update the max compaction score in levels 1 to n-1 - v->max_compaction_score_ = max_score; - v->max_compaction_score_level_ = max_score_level; - - // sort all the levels based on their score. Higher scores get listed - // first. Use bubble sort because the number of entries are small. 
- for (int i = 0; i < NumberLevels()-2; i++) { - for (int j = i+1; j < NumberLevels()-1; j++) { - if (v->compaction_score_[i] < v->compaction_score_[j]) { - double score = v->compaction_score_[i]; - int level = v->compaction_level_[i]; - v->compaction_score_[i] = v->compaction_score_[j]; - v->compaction_level_[i] = v->compaction_level_[j]; - v->compaction_score_[j] = score; - v->compaction_level_[j] = level; - } - } - } -} - -// A static compator used to sort files based on their size -// In normal mode: descending size -static bool compareSizeDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - return (first.file->file_size > second.file->file_size); -} -// A static compator used to sort files based on their seqno -// In universal style : descending seqno -static bool compareSeqnoDescending(const VersionSet::Fsize& first, - const VersionSet::Fsize& second) { - if (first.file->smallest_seqno > second.file->smallest_seqno) { - assert(first.file->largest_seqno > second.file->largest_seqno); - return true; - } - assert(first.file->largest_seqno <= second.file->largest_seqno); - return false; -} - -// sort all files in level1 to level(n-1) based on file size -void VersionSet::UpdateFilesBySize(Version* v) { - - // No need to sort the highest level because it is never compacted. - int max_level = (options_->compaction_style == kCompactionStyleUniversal) ? 
- NumberLevels() : NumberLevels() - 1; - - for (int level = 0; level < max_level; level++) { - - const std::vector& files = v->files_[level]; - std::vector& files_by_size = v->files_by_size_[level]; - assert(files_by_size.size() == 0); - - // populate a temp vector for sorting based on size - std::vector temp(files.size()); - for (unsigned int i = 0; i < files.size(); i++) { - temp[i].index = i; - temp[i].file = files[i]; - } - - // sort the top number_of_files_to_sort_ based on file size - if (options_->compaction_style == kCompactionStyleUniversal) { - int num = temp.size(); - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSeqnoDescending); - } else { - int num = Version::number_of_files_to_sort_; - if (num > (int)temp.size()) { - num = temp.size(); - } - std::partial_sort(temp.begin(), temp.begin() + num, - temp.end(), compareSizeDescending); - } - assert(temp.size() == files.size()); - - // initialize files_by_size_ - for (unsigned int i = 0; i < temp.size(); i++) { - files_by_size.push_back(temp[i].index); - } - v->next_file_to_compact_by_size_[level] = 0; - assert(v->files_[level].size() == v->files_by_size_[level].size()); - } -} - -Status VersionSet::WriteSnapshot(log::Writer* log) { - // TODO: Break up into multiple records to reduce memory usage on recovery? 
- - for (auto cfd : column_family_data_) { - { - // Store column family info - VersionEdit edit(0); - if (cfd.first != 0) { - // default column family is always there, - // no need to explicitly write it - edit.AddColumnFamily(cfd.second->name); - edit.SetColumnFamily(cfd.first); - std::string record; - edit.EncodeTo(&record); - Status s = log->AddRecord(record); - if (!s.ok()) { - return s; - } - } - } - - { - // Save files - VersionEdit edit(NumberLevels()); - edit.SetColumnFamily(cfd.first); - - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = - cfd.second->current->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - const FileMetaData* f = files[i]; - edit.AddFile(level, - f->number, - f->file_size, - f->smallest, - f->largest, - f->smallest_seqno, - f->largest_seqno); - } - } - std::string record; - edit.EncodeTo(&record); - Status s = log->AddRecord(record); - if (!s.ok()) { - return s; + std::string record; + edit.EncodeTo(&record); + Status s = log->AddRecord(record); + if (!s.ok()) { + return s; } } } // Save metadata - VersionEdit edit(NumberLevels()); + VersionEdit edit; edit.SetComparatorName(icmp_.user_comparator()->Name()); - // Save compaction pointers - for (int level = 0; level < NumberLevels(); level++) { - if (!compact_pointer_[level].empty()) { - InternalKey key; - key.DecodeFrom(compact_pointer_[level]); - edit.SetCompactPointer(level, key); - } - } - std::string record; edit.EncodeTo(&record); return log->AddRecord(record); } -int VersionSet::NumLevelFiles(int level) const { - assert(level >= 0); - assert(level < NumberLevels()); - // TODO this only works for default column family now - assert(column_family_data_.find(0) != column_family_data_.end()); - Version* version = column_family_data_.find(0)->second->current; - return version->files_[level].size(); -} - -const char* VersionSet::LevelSummary(LevelSummaryStorage* scratch) const { - // TODO this only works for default column family now - 
Version* version = column_family_data_.find(0)->second->current; - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files["); - for (int i = 0; i < NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%d ", - int(version->files_[i].size())); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelDataSizeSummary( - LevelSummaryStorage* scratch) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (int i = 0; i < NumberLevels(); i++) { - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, "%lu ", - (unsigned long)NumLevelBytes(i)); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - -const char* VersionSet::LevelFileSummary(Version* v, - FileSummaryStorage* scratch, - int level) const { - int len = snprintf(scratch->buffer, sizeof(scratch->buffer), "files_size["); - for (unsigned int i = 0; i < v->files_[level].size(); i++) { - FileMetaData* f = v->files_[level][i]; - int sz = sizeof(scratch->buffer) - len; - int ret = snprintf(scratch->buffer + len, sz, - "#%lu(seq=%lu,sz=%lu,%lu) ", - (unsigned long)f->number, - (unsigned long)f->smallest_seqno, - (unsigned long)f->file_size, - (unsigned long)f->being_compacted); - if (ret < 0 || ret >= sz) - break; - len += ret; - } - snprintf(scratch->buffer + len, sizeof(scratch->buffer) - len, "]"); - return scratch->buffer; -} - // Opens the mainfest file and reads all records // till it finds the record we are looking for. 
bool VersionSet::ManifestContains(const std::string& record) const { @@ -2021,7 +1991,7 @@ bool VersionSet::ManifestContains(const std::string& record) const { uint64_t VersionSet::ApproximateOffsetOf(Version* v, const InternalKey& ikey) { uint64_t result = 0; - for (int level = 0; level < NumberLevels(); level++) { + for (int level = 0; level < v->NumberLevels(); level++) { const std::vector& files = v->files_[level]; for (size_t i = 0; i < files.size(); i++) { if (icmp_.Compare(files[i]->largest, ikey) <= 0) { @@ -2057,9 +2027,8 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { int64_t total_files = 0; for (auto cfd : column_family_data_) { for (Version* v = cfd.second->dummy_versions.next_; - v != &cfd.second->dummy_versions; - v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + v != &cfd.second->dummy_versions; v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { total_files += v->files_[level].size(); } } @@ -2070,9 +2039,8 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { for (auto cfd : column_family_data_) { for (Version* v = cfd.second->dummy_versions.next_; - v != &cfd.second->dummy_versions; - v = v->next_) { - for (int level = 0; level < NumberLevels(); level++) { + v != &cfd.second->dummy_versions; v = v->next_) { + for (int level = 0; level < v->NumberLevels(); level++) { for (const auto& f : v->files_[level]) { live_list->push_back(f->number); } @@ -2081,80 +2049,20 @@ void VersionSet::AddLiveFiles(std::vector* live_list) { } } -void VersionSet::AddLiveFilesCurrentVersion(std::set* live) { - // TODO this only works for default column family now - Version* v = column_family_data_.find(0)->second->current; - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = v->files_[level]; - for (size_t i = 0; i < files.size(); i++) { - live->insert(files[i]->number); - } - } -} - -int64_t VersionSet::NumLevelBytes(int level) const { +Compaction* 
VersionSet::PickCompaction() { // TODO this only works for default column family now Version* version = column_family_data_.find(0)->second->current; - assert(level >= 0); - assert(level < NumberLevels()); - assert(version); - return TotalFileSize(version->files_[level]); + return compaction_picker_->PickCompaction(version); } -int64_t VersionSet::MaxNextLevelOverlappingBytes() { +Compaction* VersionSet::CompactRange(int input_level, int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end) { // TODO this only works for default column family now Version* version = column_family_data_.find(0)->second->current; - uint64_t result = 0; - std::vector overlaps; - for (int level = 1; level < NumberLevels() - 1; level++) { - for (size_t i = 0; i < version->files_[level].size(); i++) { - const FileMetaData* f = version->files_[level][i]; - version->GetOverlappingInputs( - level + 1, &f->smallest, &f->largest, &overlaps); - const uint64_t sum = TotalFileSize(overlaps); - if (sum > result) { - result = sum; - } - } - } - return result; -} - -// Stores the minimal range that covers all entries in inputs in -// *smallest, *largest. -// REQUIRES: inputs is not empty -void VersionSet::GetRange(const std::vector& inputs, - InternalKey* smallest, - InternalKey* largest) { - assert(!inputs.empty()); - smallest->Clear(); - largest->Clear(); - for (size_t i = 0; i < inputs.size(); i++) { - FileMetaData* f = inputs[i]; - if (i == 0) { - *smallest = f->smallest; - *largest = f->largest; - } else { - if (icmp_.Compare(f->smallest, *smallest) < 0) { - *smallest = f->smallest; - } - if (icmp_.Compare(f->largest, *largest) > 0) { - *largest = f->largest; - } - } - } -} - -// Stores the minimal range that covers all entries in inputs1 and inputs2 -// in *smallest, *largest. 
-// REQUIRES: inputs is not empty -void VersionSet::GetRange2(const std::vector& inputs1, - const std::vector& inputs2, - InternalKey* smallest, - InternalKey* largest) { - std::vector all = inputs1; - all.insert(all.end(), inputs2.begin(), inputs2.end()); - GetRange(all, smallest, largest); + return compaction_picker_->CompactRange(version, input_level, output_level, + begin, end, compaction_end); } Iterator* VersionSet::MakeInputIterator(Compaction* c) { @@ -2194,29 +2102,11 @@ Iterator* VersionSet::MakeInputIterator(Compaction* c) { } double VersionSet::MaxBytesForLevel(int level) { - // Note: the result for level zero is not really used since we set - // the level-0 compaction threshold based on number of files. - assert(level >= 0); - assert(level < NumberLevels()); - return level_max_bytes_[level]; + return compaction_picker_->MaxBytesForLevel(level); } uint64_t VersionSet::MaxFileSizeForLevel(int level) { - assert(level >= 0); - assert(level < NumberLevels()); - return max_file_size_[level]; -} - -uint64_t VersionSet::ExpandedCompactionByteSizeLimit(int level) { - uint64_t result = MaxFileSizeForLevel(level); - result *= options_->expanded_compaction_factor; - return result; -} - -uint64_t VersionSet::MaxGrandParentOverlapBytes(int level) { - uint64_t result = MaxFileSizeForLevel(level); - result *= options_->max_grandparent_overlap_factor; - return result; + return compaction_picker_->MaxFileSizeForLevel(level); } // verify that the files listed in this compaction are present @@ -2269,737 +2159,16 @@ bool VersionSet::VerifyCompactionFileConsistency(Compaction* c) { return true; // everything good } -// Clear all files to indicate that they are not being compacted -// Delete this compaction from the list of running compactions. 
void VersionSet::ReleaseCompactionFiles(Compaction* c, Status status) { - c->MarkFilesBeingCompacted(false); - compactions_in_progress_[c->level()].erase(c); - if (!status.ok()) { - c->ResetNextCompactionIndex(); - } -} - -// The total size of files that are currently being compacted -// at at every level upto the penultimate level. -void VersionSet::SizeBeingCompacted(std::vector& sizes) { - for (int level = 0; level < NumberLevels()-1; level++) { - uint64_t total = 0; - for (std::set::iterator it = - compactions_in_progress_[level].begin(); - it != compactions_in_progress_[level].end(); - ++it) { - Compaction* c = (*it); - assert(c->level() == level); - for (int i = 0; i < c->num_input_files(0); i++) { - total += c->input(0,i)->file_size; - } - } - sizes[level] = total; - } -} - -// -// Look at overall size amplification. If size amplification -// exceeeds the configured value, then do a compaction -// of the candidate files all the way upto the earliest -// base file (overrides configured values of file-size ratios, -// min_merge_width and max_merge_width). -// -Compaction* VersionSet::PickCompactionUniversalSizeAmp(int level, - double score) { - assert (level == 0); - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - // percentage flexibilty while reducing size amplification - uint64_t ratio = options_->compaction_options_universal. - max_size_amplification_percent; - - // The files are sorted from newest first to oldest last. 
- std::vector& file_by_time = version->files_by_size_[level]; - assert(file_by_time.size() == version->files_[level].size()); - - unsigned int candidate_count = 0; - uint64_t candidate_size = 0; - unsigned int start_index = 0; - FileMetaData* f = nullptr; - - // Skip files that are already being compacted - for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) { - int index = file_by_time[loop]; - f = version->files_[level][index]; - if (!f->being_compacted) { - start_index = loop; // Consider this as the first candidate. - break; - } - Log(options_->info_log, "Universal: skipping file %lu[%d] compacted %s", - (unsigned long)f->number, - loop, - " cannot be a candidate to reduce size amp.\n"); - f = nullptr; - } - if (f == nullptr) { - return nullptr; // no candidate files - } - - Log(options_->info_log, "Universal: First candidate file %lu[%d] %s", - (unsigned long)f->number, - start_index, - " to reduce size amp.\n"); - - // keep adding up all the remaining files - for (unsigned int loop = start_index; loop < file_by_time.size() - 1; - loop++) { - int index = file_by_time[loop]; - f = version->files_[level][index]; - if (f->being_compacted) { - Log(options_->info_log, - "Universal: Possible candidate file %lu[%d] %s.", - (unsigned long)f->number, - loop, - " is already being compacted. No size amp reduction possible.\n"); - return nullptr; - } - candidate_size += f->file_size; - candidate_count++; - } - if (candidate_count == 0) { - return nullptr; - } - - // size of earliest file - int index = file_by_time[file_by_time.size() - 1]; - uint64_t earliest_file_size = version->files_[level][index]->file_size; - - // size amplification = percentage of additional size - if (candidate_size * 100 < ratio * earliest_file_size) { - Log(options_->info_log, - "Universal: size amp not needed. 
newer-files-total-size %lu " - "earliest-file-size %lu", - (unsigned long)candidate_size, - (unsigned long)earliest_file_size); - return nullptr; - } else { - Log(options_->info_log, - "Universal: size amp needed. newer-files-total-size %lu " - "earliest-file-size %lu", - (unsigned long)candidate_size, - (unsigned long)earliest_file_size); - } - assert(start_index >= 0 && start_index < file_by_time.size() - 1); - - // create a compaction request - // We always compact all the files, so always compress. - Compaction* c = new Compaction(level, - level, - MaxFileSizeForLevel(level), - LLONG_MAX, - NumberLevels(), - version, - false, - true); - c->score_ = score; - for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) { - int index = file_by_time[loop]; - f = version->files_[level][index]; - c->inputs_[0].push_back(f); - Log(options_->info_log, - "Universal: size amp picking file %lu[%d] with size %lu", - (unsigned long)f->number, - index, - (unsigned long)f->file_size); - } - return c; -} - -// -// Consider compaction files based on their size differences with -// the next file in time order. -// -Compaction* VersionSet::PickCompactionUniversalReadAmp( - int level, double score, unsigned int ratio, - unsigned int max_number_of_files_to_compact) { - - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - unsigned int min_merge_width = - options_->compaction_options_universal.min_merge_width; - unsigned int max_merge_width = - options_->compaction_options_universal.max_merge_width; - - // The files are sorted from newest first to oldest last. 
- std::vector& file_by_time = version->files_by_size_[level]; - FileMetaData* f = nullptr; - bool done = false; - int start_index = 0; - unsigned int candidate_count; - assert(file_by_time.size() == version->files_[level].size()); - - unsigned int max_files_to_compact = std::min(max_merge_width, - max_number_of_files_to_compact); - min_merge_width = std::max(min_merge_width, 2U); - - // Considers a candidate file only if it is smaller than the - // total size accumulated so far. - for (unsigned int loop = 0; loop < file_by_time.size(); loop++) { - - candidate_count = 0; - - // Skip files that are already being compacted - for (f = nullptr; loop < file_by_time.size(); loop++) { - int index = file_by_time[loop]; - f = version->files_[level][index]; - - if (!f->being_compacted) { - candidate_count = 1; - break; - } - Log(options_->info_log, - "Universal: file %lu[%d] being compacted, skipping", - (unsigned long)f->number, loop); - f = nullptr; - } - - // This file is not being compacted. Consider it as the - // first candidate to be compacted. - uint64_t candidate_size = f != nullptr? f->file_size : 0; - if (f != nullptr) { - Log(options_->info_log, "Universal: Possible candidate file %lu[%d].", - (unsigned long)f->number, loop); - } - - // Check if the suceeding files need compaction. - for (unsigned int i = loop+1; - candidate_count < max_files_to_compact && i < file_by_time.size(); - i++) { - int index = file_by_time[i]; - FileMetaData* f = version->files_[level][index]; - if (f->being_compacted) { - break; - } - // pick files if the total candidate file size (increased by the - // specified ratio) is still larger than the next candidate file. - uint64_t sz = (candidate_size * (100L + ratio)) /100; - if (sz < f->file_size) { - break; - } - candidate_count++; - candidate_size += f->file_size; - } - - // Found a series of consecutive files that need compaction. 
- if (candidate_count >= (unsigned int)min_merge_width) { - start_index = loop; - done = true; - break; - } else { - for (unsigned int i = loop; - i < loop + candidate_count && i < file_by_time.size(); i++) { - int index = file_by_time[i]; - FileMetaData* f = version->files_[level][index]; - Log(options_->info_log, - "Universal: Skipping file %lu[%d] with size %lu %d\n", - (unsigned long)f->number, - i, - (unsigned long)f->file_size, - f->being_compacted); - } - } - } - if (!done || candidate_count <= 1) { - return nullptr; - } - unsigned int first_index_after = start_index + candidate_count; - // Compression is enabled if files compacted earlier already reached - // size ratio of compression. - bool enable_compression = true; - int ratio_to_compress = - options_->compaction_options_universal.compression_size_percent; - if (ratio_to_compress >= 0) { - uint64_t total_size = TotalFileSize(version->files_[level]); - uint64_t older_file_size = 0; - for (unsigned int i = file_by_time.size() - 1; i >= first_index_after; - i--) { - older_file_size += version->files_[level][file_by_time[i]]->file_size; - if (older_file_size * 100L >= total_size * (long) ratio_to_compress) { - enable_compression = false; - break; - } - } - } - Compaction* c = new Compaction(level, - level, - MaxFileSizeForLevel(level), - LLONG_MAX, - NumberLevels(), - version, - false, - enable_compression); - c->score_ = score; - - for (unsigned int i = start_index; i < first_index_after; i++) { - int index = file_by_time[i]; - FileMetaData* f = version->files_[level][index]; - c->inputs_[0].push_back(f); - Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n", - (unsigned long)f->number, - i, - (unsigned long)f->file_size); - } - return c; -} - -// -// Universal style of compaction. Pick files that are contiguous in -// time-range to compact. 
-// -Compaction* VersionSet::PickCompactionUniversal(int level, double score) { - assert (level == 0); - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - if ((version->files_[level].size() < - (unsigned int)options_->level0_file_num_compaction_trigger)) { - Log(options_->info_log, "Universal: nothing to do\n"); - return nullptr; - } - VersionSet::FileSummaryStorage tmp; - Log(options_->info_log, - "Universal: candidate files(%lu): %s\n", - version->files_[level].size(), - LevelFileSummary(version, &tmp, 0)); - - // Check for size amplification first. - Compaction* c = PickCompactionUniversalSizeAmp(level, score); - if (c == nullptr) { - - // Size amplification is within limits. Try reducing read - // amplification while maintaining file size ratios. - unsigned int ratio = options_->compaction_options_universal.size_ratio; - c = PickCompactionUniversalReadAmp(level, score, ratio, UINT_MAX); - - // Size amplification and file size ratios are within configured limits. - // If max read amplification is exceeding configured limits, then force - // compaction without looking at filesize ratios and try to reduce - // the number of files to fewer than level0_file_num_compaction_trigger. 
- if (c == nullptr) { - unsigned int num_files = version->files_[level].size() - - options_->level0_file_num_compaction_trigger; - c = PickCompactionUniversalReadAmp(level, score, UINT_MAX, num_files); - } - } - if (c == nullptr) { - return nullptr; - } - assert(c->inputs_[0].size() > 1); - - // validate that all the chosen files are non overlapping in time - FileMetaData* newerfile __attribute__((unused)) = nullptr; - for (unsigned int i = 0; i < c->inputs_[0].size(); i++) { - FileMetaData* f = c->inputs_[0][i]; - assert (f->smallest_seqno <= f->largest_seqno); - assert(newerfile == nullptr || - newerfile->smallest_seqno > f->largest_seqno); - newerfile = f; - } - - // The files are sorted from newest first to oldest last. - std::vector& file_by_time = version->files_by_size_[level]; - - // Is the earliest file part of this compaction? - int last_index = file_by_time[file_by_time.size()-1]; - FileMetaData* last_file = version->files_[level][last_index]; - if (c->inputs_[0][c->inputs_[0].size()-1] == last_file) { - c->bottommost_level_ = true; - } - - // update statistics - if (options_->statistics != nullptr) { - options_->statistics->measureTime(NUM_FILES_IN_SINGLE_COMPACTION, - c->inputs_[0].size()); - } - - // mark all the files that are being compacted - c->MarkFilesBeingCompacted(true); - - // remember this currently undergoing compaction - compactions_in_progress_[level].insert(c); - - // Record whether this compaction includes all sst files. - // For now, it is only relevant in universal compaction mode. - c->is_full_compaction_ = (c->inputs_[0].size() == version->files_[0].size()); - - return c; -} - -Compaction* VersionSet::PickCompactionBySize(int level, double score) { - Compaction* c = nullptr; - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - // level 0 files are overlapping. So we cannot pick more - // than one concurrent compactions at this level. 
This - // could be made better by looking at key-ranges that are - // being compacted at level 0. - if (level == 0 && compactions_in_progress_[level].size() == 1) { - return nullptr; - } - - assert(level >= 0); - assert(level+1 < NumberLevels()); - c = new Compaction(level, - level + 1, - MaxFileSizeForLevel(level + 1), - MaxGrandParentOverlapBytes(level), - NumberLevels(), - version); - c->score_ = score; - - // Pick the largest file in this level that is not already - // being compacted - std::vector& file_size = version->files_by_size_[level]; - - // record the first file that is not yet compacted - int nextIndex = -1; - - for (unsigned int i = version->next_file_to_compact_by_size_[level]; - i < file_size.size(); i++) { - int index = file_size[i]; - FileMetaData* f = version->files_[level][index]; - - // check to verify files are arranged in descending size - assert((i == file_size.size() - 1) || - (i >= Version::number_of_files_to_sort_-1) || - (f->file_size >= version->files_[level][file_size[i+1]]->file_size)); - - // do not pick a file to compact if it is being compacted - // from n-1 level. - if (f->being_compacted) { - continue; - } - - // remember the startIndex for the next call to PickCompaction - if (nextIndex == -1) { - nextIndex = i; - } - - //if (i > Version::number_of_files_to_sort_) { - // Log(options_->info_log, "XXX Looking at index %d", i); - //} - - // Do not pick this file if its parents at level+1 are being compacted. 
- // Maybe we can avoid redoing this work in SetupOtherInputs - int parent_index = -1; - if (ParentRangeInCompaction(&f->smallest, &f->largest, level, - &parent_index)) { - continue; - } - c->inputs_[0].push_back(f); - c->base_index_ = index; - c->parent_index_ = parent_index; - break; - } - - if (c->inputs_[0].empty()) { - delete c; - c = nullptr; - } - - // store where to start the iteration in the next call to PickCompaction - version->next_file_to_compact_by_size_[level] = nextIndex; - - return c; -} - -Compaction* VersionSet::PickCompaction() { - Compaction* c = nullptr; - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - int level = -1; - - // Compute the compactions needed. It is better to do it here - // and also in LogAndApply(), otherwise the values could be stale. - std::vector size_being_compacted(NumberLevels()-1); - version->vset_->SizeBeingCompacted(size_being_compacted); - Finalize(version, size_being_compacted); - - // In universal style of compaction, compact L0 files back into L0. - if (options_->compaction_style == kCompactionStyleUniversal) { - int level = 0; - c = PickCompactionUniversal(level, version->compaction_score_[level]); - return c; - } - - // We prefer compactions triggered by too much data in a level over - // the compactions triggered by seeks. - // - // Find the compactions by size on all levels. 
- for (int i = 0; i < NumberLevels()-1; i++) { - assert(i == 0 || version->compaction_score_[i] <= - version->compaction_score_[i-1]); - level = version->compaction_level_[i]; - if ((version->compaction_score_[i] >= 1)) { - c = PickCompactionBySize(level, version->compaction_score_[i]); - ExpandWhileOverlapping(c); - if (c != nullptr) { - break; - } - } - } - - // Find compactions needed by seeks - FileMetaData* f = version->file_to_compact_; - if (c == nullptr && f != nullptr && !f->being_compacted) { - - level = version->file_to_compact_level_; - int parent_index = -1; - - // Only allow one level 0 compaction at a time. - // Do not pick this file if its parents at level+1 are being compacted. - if (level != 0 || compactions_in_progress_[0].empty()) { - if(!ParentRangeInCompaction(&f->smallest, &f->largest, level, - &parent_index)) { - c = new Compaction(level, - level + 1, - MaxFileSizeForLevel(level + 1), - MaxGrandParentOverlapBytes(level), - NumberLevels(), - version, - true); - c->inputs_[0].push_back(f); - c->parent_index_ = parent_index; - version->file_to_compact_ = nullptr; - ExpandWhileOverlapping(c); - } - } - } - - if (c == nullptr) { - return nullptr; - } - - // Two level 0 compaction won't run at the same time, so don't need to worry - // about files on level 0 being compacted. - if (level == 0) { - assert(compactions_in_progress_[0].empty()); - InternalKey smallest, largest; - GetRange(c->inputs_[0], &smallest, &largest); - // Note that the next call will discard the file we placed in - // c->inputs_[0] earlier and replace it with an overlapping set - // which will include the picked file. - c->inputs_[0].clear(); - version->GetOverlappingInputs(0, &smallest, &largest, &c->inputs_[0]); - - // If we include more L0 files in the same compaction run it can - // cause the 'smallest' and 'largest' key to get extended to a - // larger range. 
So, re-invoke GetRange to get the new key range - GetRange(c->inputs_[0], &smallest, &largest); - if (ParentRangeInCompaction(&smallest, &largest, - level, &c->parent_index_)) { - delete c; - return nullptr; - } - assert(!c->inputs_[0].empty()); - } - - // Setup "level+1" files (inputs_[1]) - SetupOtherInputs(c); - - // mark all the files that are being compacted - c->MarkFilesBeingCompacted(true); - - // Is this compaction creating a file at the bottommost level - c->SetupBottomMostLevel(false); - - // remember this currently undergoing compaction - compactions_in_progress_[level].insert(c); - - return c; -} - -// Returns true if any one of the parent files are being compacted -bool VersionSet::ParentRangeInCompaction(const InternalKey* smallest, - const InternalKey* largest, int level, int* parent_index) { - std::vector inputs; - // TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - version->GetOverlappingInputs( - level + 1, smallest, largest, &inputs, *parent_index, parent_index); - return FilesInCompaction(inputs); -} - -// Returns true if any one of specified files are being compacted -bool VersionSet::FilesInCompaction(std::vector& files) { - for (unsigned int i = 0; i < files.size(); i++) { - if (files[i]->being_compacted) { - return true; - } - } - return false; -} - -// Add more files to the inputs on "level" to make sure that -// no newer version of a key is compacted to "level+1" while leaving an older -// version in a "level". Otherwise, any Get() will search "level" first, -// and will likely return an old/stale value for the key, since it always -// searches in increasing order of level to find the value. This could -// also scramble the order of merge operands. This function should be -// called any time a new Compaction is created, and its inputs_[0] are -// populated. -// -// Will set c to nullptr if it is impossible to apply this compaction. 
-void VersionSet::ExpandWhileOverlapping(Compaction* c) { - // If inputs are empty then there is nothing to expand. - if (!c || c->inputs_[0].empty()) { - return; - } - - // GetOverlappingInputs will always do the right thing for level-0. - // So we don't need to do any expansion if level == 0. - if (c->level() == 0) { - return; - } - - const int level = c->level(); - InternalKey smallest, largest; - - // Keep expanding c->inputs_[0] until we are sure that there is a - // "clean cut" boundary between the files in input and the surrounding files. - // This will ensure that no parts of a key are lost during compaction. - int hint_index = -1; - size_t old_size; - do { - old_size = c->inputs_[0].size(); - GetRange(c->inputs_[0], &smallest, &largest); - c->inputs_[0].clear(); - c->input_version_->GetOverlappingInputs( - level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index); - } while(c->inputs_[0].size() > old_size); - - // Get the new range - GetRange(c->inputs_[0], &smallest, &largest); - - // If, after the expansion, there are files that are already under - // compaction, then we must drop/cancel this compaction. - int parent_index = -1; - if (FilesInCompaction(c->inputs_[0]) || - ParentRangeInCompaction(&smallest, &largest, level, &parent_index)) { - c->inputs_[0].clear(); - c->inputs_[1].clear(); - delete c; - c = nullptr; - } -} - -// Populates the set of inputs from "level+1" that overlap with "level". -// Will also attempt to expand "level" if that doesn't expand "level+1" -// or cause "level" to include a file for compaction that has an overlapping -// user-key with another file. -void VersionSet::SetupOtherInputs(Compaction* c) { - // If inputs are empty, then there is nothing to expand. - if (c->inputs_[0].empty()) { - return; - } - - const int level = c->level(); - InternalKey smallest, largest; - - // Get the range one last time. 
- GetRange(c->inputs_[0], &smallest, &largest); - - // Populate the set of next-level files (inputs_[1]) to include in compaction - c->input_version_->GetOverlappingInputs(level + 1, - &smallest, - &largest, - &c->inputs_[1], - c->parent_index_, - &c->parent_index_); - - // Get entire range covered by compaction - InternalKey all_start, all_limit; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - - // See if we can further grow the number of inputs in "level" without - // changing the number of "level+1" files we pick up. We also choose NOT - // to expand if this would cause "level" to include some entries for some - // user key, while excluding other entries for the same user key. This - // can happen when one user key spans multiple files. - if (!c->inputs_[1].empty()) { - std::vector expanded0; - c->input_version_->GetOverlappingInputs( - level, &all_start, &all_limit, &expanded0, c->base_index_, nullptr); - const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]); - const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]); - const uint64_t expanded0_size = TotalFileSize(expanded0); - uint64_t limit = ExpandedCompactionByteSizeLimit(level); - if (expanded0.size() > c->inputs_[0].size() && - inputs1_size + expanded0_size < limit && - !FilesInCompaction(expanded0) && - !c->input_version_->HasOverlappingUserKey(&expanded0, level)) { - InternalKey new_start, new_limit; - GetRange(expanded0, &new_start, &new_limit); - std::vector expanded1; - c->input_version_->GetOverlappingInputs(level + 1, - &new_start, - &new_limit, - &expanded1, - c->parent_index_, - &c->parent_index_); - if (expanded1.size() == c->inputs_[1].size() && - !FilesInCompaction(expanded1)) { - Log(options_->info_log, - "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)" - "\n", - (unsigned long)level, - (unsigned long)(c->inputs_[0].size()), - (unsigned long)(c->inputs_[1].size()), - (unsigned long)inputs0_size, - (unsigned long)inputs1_size, - (unsigned 
long)(expanded0.size()), - (unsigned long)(expanded1.size()), - (unsigned long)expanded0_size, - (unsigned long)inputs1_size); - smallest = new_start; - largest = new_limit; - c->inputs_[0] = expanded0; - c->inputs_[1] = expanded1; - GetRange2(c->inputs_[0], c->inputs_[1], &all_start, &all_limit); - } - } - } - - // Compute the set of grandparent files that overlap this compaction - // (parent == level+1; grandparent == level+2) - if (level + 2 < NumberLevels()) { - c->input_version_->GetOverlappingInputs( - level + 2, &all_start, &all_limit, &c->grandparents_); - } - - if (false) { - Log(options_->info_log, "Compacting %d '%s' .. '%s'", - level, - smallest.DebugString().c_str(), - largest.DebugString().c_str()); - } - - // Update the place where we will do the next compaction for this level. - // We update this immediately instead of waiting for the VersionEdit - // to be applied so that if the compaction fails, we will try a different - // key range next time. - compact_pointer_[level] = largest.Encode().ToString(); - c->edit_->SetCompactPointer(level, largest); + compaction_picker_->ReleaseCompactionFiles(c, status); } -Status VersionSet::GetMetadataForFile( - uint64_t number, - int *filelevel, - FileMetaData *meta) { +Status VersionSet::GetMetadataForFile(uint64_t number, int* filelevel, + FileMetaData* meta) { for (auto cfd : column_family_data_) { - for (int level = 0; level < NumberLevels(); level++) { - const std::vector& files = - cfd.second->current->files_[level]; + Version* version = cfd.second->current; + for (int level = 0; level < version->NumberLevels(); level++) { + const std::vector& files = version->files_[level]; for (size_t i = 0; i < files.size(); i++) { if (files[i]->number == number) { *meta = *files[i]; @@ -3064,261 +2233,4 @@ void VersionSet::DropColumnFamily(VersionEdit* edit) { column_family_data_.erase(cfd); } -Compaction* VersionSet::CompactRange(int level, - const InternalKey* begin, - const InternalKey* end) { - std::vector inputs; - 
// TODO this only works for default column family now - Version* version = column_family_data_.find(0)->second->current; - - // All files are 'overlapping' in universal style compaction. - // We have to compact the entire range in one shot. - if (options_->compaction_style == kCompactionStyleUniversal) { - begin = nullptr; - end = nullptr; - } - version->GetOverlappingInputs(level, begin, end, &inputs); - if (inputs.empty()) { - return nullptr; - } - - // Avoid compacting too much in one shot in case the range is large. - // But we cannot do this for level-0 since level-0 files can overlap - // and we must not pick one file and drop another older file if the - // two files overlap. - if (level > 0) { - const uint64_t limit = MaxFileSizeForLevel(level) * - options_->source_compaction_factor; - uint64_t total = 0; - for (size_t i = 0; i < inputs.size(); ++i) { - uint64_t s = inputs[i]->file_size; - total += s; - if (total >= limit) { - inputs.resize(i + 1); - break; - } - } - } - int out_level = (options_->compaction_style == kCompactionStyleUniversal) ? - level : level+1; - - Compaction* c = new Compaction(level, - out_level, - MaxFileSizeForLevel(out_level), - MaxGrandParentOverlapBytes(level), - NumberLevels(), - version); - - c->inputs_[0] = inputs; - ExpandWhileOverlapping(c); - if (c == nullptr) { - Log(options_->info_log, "Could not compact due to expansion failure.\n"); - return nullptr; - } - - SetupOtherInputs(c); - - // These files that are to be manaully compacted do not trample - // upon other files because manual compactions are processed when - // the system has a max of 1 background compaction thread. 
- c->MarkFilesBeingCompacted(true); - - // Is this compaction creating a file at the bottommost level - c->SetupBottomMostLevel(true); - return c; -} - -Compaction::Compaction(int level, - int out_level, - uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, - int number_levels, - Version* input_version, - bool seek_compaction, - bool enable_compression) - : level_(level), - out_level_(out_level), - max_output_file_size_(target_file_size), - maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes), - input_version_(input_version), - number_levels_(number_levels), - seek_compaction_(seek_compaction), - enable_compression_(enable_compression), - grandparent_index_(0), - seen_key_(false), - overlapped_bytes_(0), - base_index_(-1), - parent_index_(-1), - score_(0), - bottommost_level_(false), - is_full_compaction_(false), - level_ptrs_(std::vector(number_levels)) { - input_version_->Ref(); - edit_ = new VersionEdit(number_levels_); - for (int i = 0; i < number_levels_; i++) { - level_ptrs_[i] = 0; - } -} - -Compaction::~Compaction() { - delete edit_; - if (input_version_ != nullptr) { - input_version_->Unref(); - } -} - -bool Compaction::IsTrivialMove() const { - // Avoid a move if there is lots of overlapping grandparent data. - // Otherwise, the move could create a parent file that will require - // a very expensive merge later on. - return (num_input_files(0) == 1 && - num_input_files(1) == 0 && - TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_); -} - -void Compaction::AddInputDeletions(VersionEdit* edit) { - for (int which = 0; which < 2; which++) { - for (size_t i = 0; i < inputs_[which].size(); i++) { - edit->DeleteFile(level_ + which, inputs_[which][i]->number); - } - } -} - -bool Compaction::IsBaseLevelForKey(const Slice& user_key) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - return bottommost_level_; - } - // Maybe use binary search to find right entry instead of linear search? 
- const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator(); - for (int lvl = level_ + 2; lvl < number_levels_; lvl++) { - const std::vector& files = input_version_->files_[lvl]; - for (; level_ptrs_[lvl] < files.size(); ) { - FileMetaData* f = files[level_ptrs_[lvl]]; - if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) { - // We've advanced far enough - if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) { - // Key falls in this file's range, so definitely not base level - return false; - } - break; - } - level_ptrs_[lvl]++; - } - } - return true; -} - -bool Compaction::ShouldStopBefore(const Slice& internal_key) { - // Scan to find earliest grandparent file that contains key. - const InternalKeyComparator* icmp = &input_version_->vset_->icmp_; - while (grandparent_index_ < grandparents_.size() && - icmp->Compare(internal_key, - grandparents_[grandparent_index_]->largest.Encode()) > 0) { - if (seen_key_) { - overlapped_bytes_ += grandparents_[grandparent_index_]->file_size; - } - assert(grandparent_index_ + 1 >= grandparents_.size() || - icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(), - grandparents_[grandparent_index_+1]->smallest.Encode()) - < 0); - grandparent_index_++; - } - seen_key_ = true; - - if (overlapped_bytes_ > maxGrandParentOverlapBytes_) { - // Too much overlap for current output; start new output - overlapped_bytes_ = 0; - return true; - } else { - return false; - } -} - -// Mark (or clear) each file that is being compacted -void Compaction::MarkFilesBeingCompacted(bool value) { - for (int i = 0; i < 2; i++) { - std::vector v = inputs_[i]; - for (unsigned int j = 0; j < inputs_[i].size(); j++) { - assert(value ? !inputs_[i][j]->being_compacted : - inputs_[i][j]->being_compacted); - inputs_[i][j]->being_compacted = value; - } - } -} - -// Is this compaction producing files at the bottommost level? 
-void Compaction::SetupBottomMostLevel(bool isManual) { - if (input_version_->vset_->options_->compaction_style == - kCompactionStyleUniversal) { - // If universal compaction style is used and manual - // compaction is occuring, then we are guaranteed that - // all files will be picked in a single compaction - // run. We can safely set bottommost_level_ = true. - // If it is not manual compaction, then bottommost_level_ - // is already set when the Compaction was created. - if (isManual) { - bottommost_level_ = true; - } - return; - } - bottommost_level_ = true; - int num_levels = input_version_->vset_->NumberLevels(); - for (int i = level() + 2; i < num_levels; i++) { - if (input_version_->vset_->NumLevelFiles(i) > 0) { - bottommost_level_ = false; - break; - } - } -} - -void Compaction::ReleaseInputs() { - if (input_version_ != nullptr) { - input_version_->Unref(); - input_version_ = nullptr; - } -} - -void Compaction::ResetNextCompactionIndex() { - input_version_->ResetNextCompactionIndex(level_); -} - -static void InputSummary(std::vector& files, - char* output, - int len) { - int write = 0; - for (unsigned int i = 0; i < files.size(); i++) { - int sz = len - write; - int ret = snprintf(output + write, sz, "%lu(%lu) ", - (unsigned long)files.at(i)->number, - (unsigned long)files.at(i)->file_size); - if (ret < 0 || ret >= sz) - break; - write += ret; - } -} - -void Compaction::Summary(char* output, int len) { - int write = snprintf(output, len, - "Base version %lu Base level %d, seek compaction:%d, inputs:", - (unsigned long)input_version_->GetVersionNumber(), - level_, - seek_compaction_); - if (write < 0 || write > len) { - return; - } - - char level_low_summary[100]; - InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary)); - char level_up_summary[100]; - if (inputs_[1].size()) { - InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary)); - } else { - level_up_summary[0] = '\0'; - } - - snprintf(output + write, len - write, 
"[%s],[%s]", - level_low_summary, level_up_summary); -} - } // namespace rocksdb diff --git a/db/version_set.h b/db/version_set.h index b89083613..65a1406aa 100644 --- a/db/version_set.h +++ b/db/version_set.h @@ -27,12 +27,15 @@ #include "db/version_edit.h" #include "port/port.h" #include "db/table_cache.h" +#include "db/compaction.h" +#include "db/compaction_picker.h" namespace rocksdb { namespace log { class Writer; } class Compaction; +class CompactionPicker; class Iterator; class MemTable; class TableCache; @@ -86,6 +89,11 @@ class Version { // REQUIRES: lock is held bool UpdateStats(const GetStats& stats); + // Updates internal structures that keep track of compaction scores + // We use compaction scores to figure out which compaction to do next + // Also pre-sorts level0 files for Get() + void Finalize(std::vector& size_being_compacted); + // Reference count management (so Versions do not disappear out from // under live iterators) void Ref(); @@ -135,21 +143,54 @@ class Version { int PickLevelForMemTableOutput(const Slice& smallest_user_key, const Slice& largest_user_key); - int NumFiles(int level) const { return files_[level].size(); } + int NumberLevels() const { return num_levels_; } + + // REQUIRES: lock is held + int NumLevelFiles(int level) const { return files_[level].size(); } + + // Return the combined file size of all files at the specified level. + int64_t NumLevelBytes(int level) const; + + // Return a human-readable short (single-line) summary of the number + // of files per level. Uses *scratch as backing store. + struct LevelSummaryStorage { + char buffer[100]; + }; + struct FileSummaryStorage { + char buffer[1000]; + }; + const char* LevelSummary(LevelSummaryStorage* scratch) const; + // Return a human-readable short (single-line) summary of files + // in a specified level. Uses *scratch as backing store. 
+ const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const; + + // Return the maximum overlapping data (in bytes) at next level for any + // file at a level >= 1. + int64_t MaxNextLevelOverlappingBytes(); + + // Add all files listed in the current version to *live. + void AddLiveFiles(std::set* live); // Return a human readable string that describes this version's contents. std::string DebugString(bool hex = false) const; // Returns the version nuber of this version - uint64_t GetVersionNumber() { - return version_number_; - } + uint64_t GetVersionNumber() const { return version_number_; } + + // used to sort files by size + struct Fsize { + int index; + FileMetaData* file; + }; private: friend class Compaction; friend class VersionSet; friend class DBImpl; friend struct ColumnFamilyData; + friend class CompactionPicker; + friend class LevelCompactionPicker; + friend class UniversalCompactionPicker; class LevelFileNumIterator; Iterator* NewConcatenatingIterator(const ReadOptions&, @@ -158,10 +199,15 @@ class Version { bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions, const Slice& internal_prefix, Iterator* level_iter) const; + // Sort all files for this version based on their file size and + // record results in files_by_size_. The largest files are listed first. 
+ void UpdateFilesBySize(); + VersionSet* vset_; // VersionSet to which this Version belongs Version* next_; // Next version in linked list Version* prev_; // Previous version in linked list int refs_; // Number of live refs to this version + int num_levels_; // Number of levels // List of files per level, files in each level are arranged // in increasing order of keys @@ -251,10 +297,8 @@ struct ColumnFamilyData { class VersionSet { public: - VersionSet(const std::string& dbname, - const Options* options, - const EnvOptions& storage_options, - TableCache* table_cache, + VersionSet(const std::string& dbname, const Options* options, + const EnvOptions& storage_options, TableCache* table_cache, const InternalKeyComparator*); ~VersionSet(); @@ -292,6 +336,12 @@ class VersionSet { return column_family_data_.find(0)->second->current; } + // A Flag indicating whether write needs to slowdown because of there are + // too many number of level0 files. + bool NeedSlowdownForNumLevel0Files() const { + return need_slowdown_for_num_level0_files_; + } + // Return the current manifest file number uint64_t ManifestFileNumber() const { return manifest_file_number_; } @@ -307,12 +357,6 @@ class VersionSet { } } - // Return the number of Table files at the specified level. - int NumLevelFiles(int level) const; - - // Return the combined file size of all files at the specified level. - int64_t NumLevelBytes(int level) const; - // Return the last sequence number. uint64_t LastSequence() const { return last_sequence_.load(std::memory_order_acquire); @@ -346,14 +390,18 @@ class VersionSet { // the specified level. Returns nullptr if there is nothing in that // level that overlaps the specified range. Caller should delete // the result. - Compaction* CompactRange( - int level, - const InternalKey* begin, - const InternalKey* end); - - // Return the maximum overlapping data (in bytes) at next level for any - // file at a level >= 1. 
- int64_t MaxNextLevelOverlappingBytes(); + // + // The returned Compaction might not include the whole requested range. + // In that case, compaction_end will be set to the next key that needs + // compacting. In case the compaction will compact the whole range, + // compaction_end will be set to nullptr. + // Client is responsible for compaction_end storage -- when called, + // *compaction_end should point to valid InternalKey! + Compaction* CompactRange(int input_level, + int output_level, + const InternalKey* begin, + const InternalKey* end, + InternalKey** compaction_end); // Create an iterator that reads over the compaction inputs for "*c". // The caller should delete the iterator when no longer needed. @@ -405,58 +453,16 @@ class VersionSet { // Add all files listed in any live version to *live. void AddLiveFiles(std::vector* live_list); - // Add all files listed in the current version to *live. - void AddLiveFilesCurrentVersion(std::set* live); - // Return the approximate offset in the database of the data for // "key" as of version "v". uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); - // Return a human-readable short (single-line) summary of the number - // of files per level. Uses *scratch as backing store. - struct LevelSummaryStorage { - char buffer[100]; - }; - struct FileSummaryStorage { - char buffer[1000]; - }; - const char* LevelSummary(LevelSummaryStorage* scratch) const; - // printf contents (for debugging) Status DumpManifest(Options& options, std::string& manifestFileName, bool verbose, bool hex = false); - // Return a human-readable short (single-line) summary of the data size - // of files per level. Uses *scratch as backing store. - const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const; - - // Return a human-readable short (single-line) summary of files - // in a specified level. Uses *scratch as backing store. 
- const char* LevelFileSummary(Version* version, - FileSummaryStorage* scratch, - int level) const; - // Return the size of the current manifest file - const uint64_t ManifestFileSize() { return manifest_file_size_; } - - // For the specfied level, pick a compaction. - // Returns nullptr if there is no compaction to be done. - // If level is 0 and there is already a compaction on that level, this - // function will return nullptr. - Compaction* PickCompactionBySize(int level, double score); - - // Pick files to compact in Universal mode - Compaction* PickCompactionUniversal(int level, double score); - - // Pick Universal compaction to limit read amplification - Compaction* PickCompactionUniversalReadAmp(int level, double score, - unsigned int ratio, unsigned int num_files); - - // Pick Universal compaction to limit space amplification. - Compaction* PickCompactionUniversalSizeAmp(int level, double score); - - // Free up the files that were participated in a compaction - void ReleaseCompactionFiles(Compaction* c, Status status); + uint64_t ManifestFileSize() const { return manifest_file_size_; } // verify that the files that we started with for a compaction // still exist in the current version and in the same original level. @@ -464,20 +470,12 @@ class VersionSet { // pick the same files to compact. bool VerifyCompactionFileConsistency(Compaction* c); - // used to sort files by size - typedef struct fsize { - int index; - FileMetaData* file; - } Fsize; - - // Sort all files for this version based on their file size and - // record results in files_by_size_. The largest files are listed first. - void UpdateFilesBySize(Version *v); + double MaxBytesForLevel(int level); // Get the max file size in a given level. 
uint64_t MaxFileSizeForLevel(int level); - double MaxBytesForLevel(int level); + void ReleaseCompactionFiles(Compaction* c, Status status); Status GetMetadataForFile( uint64_t number, int *filelevel, FileMetaData *metadata); @@ -503,23 +501,6 @@ class VersionSet { friend class Compaction; friend class Version; - void Init(int num_levels); - - void Finalize(Version* v, std::vector&); - - void GetRange(const std::vector& inputs, - InternalKey* smallest, - InternalKey* largest); - - void GetRange2(const std::vector& inputs1, - const std::vector& inputs2, - InternalKey* smallest, - InternalKey* largest); - - void ExpandWhileOverlapping(Compaction* c); - - void SetupOtherInputs(Compaction* c); - // Save current contents to *log Status WriteSnapshot(log::Writer* log); @@ -527,10 +508,6 @@ class VersionSet { bool ManifestContains(const std::string& record) const; - uint64_t ExpandedCompactionByteSizeLimit(int level); - - uint64_t MaxGrandParentOverlapBytes(int level); - Env* const env_; const std::string dbname_; const Options* const options_; @@ -547,18 +524,13 @@ class VersionSet { // Opened lazily unique_ptr descriptor_log_; - // Per-level key at which the next compaction at that level should start. - // Either an empty string, or a valid InternalKey. - std::string* compact_pointer_; - - // Per-level target file size. 
- uint64_t* max_file_size_; + // A flag indicating whether we should delay writes because + // we have too many level 0 files + bool need_slowdown_for_num_level0_files_; - // Per-level max bytes - uint64_t* level_max_bytes_; - - // record all the ongoing compactions for all levels - std::vector > compactions_in_progress_; + // An object that keeps all the compaction stats + // and picks the next compaction + std::unique_ptr compaction_picker_; // generates a increasing version number for every new version uint64_t current_version_number_; @@ -566,7 +538,7 @@ class VersionSet { // Queue of writers to the manifest file std::deque manifest_writers_; - // size of manifest file + // Current size of manifest file uint64_t manifest_file_size_; std::vector obsolete_files_; @@ -582,138 +554,8 @@ class VersionSet { VersionSet(const VersionSet&); void operator=(const VersionSet&); - // Return the total amount of data that is undergoing - // compactions per level - void SizeBeingCompacted(std::vector&); - - // Returns true if any one of the parent files are being compacted - bool ParentRangeInCompaction(const InternalKey* smallest, - const InternalKey* largest, int level, int* index); - - // Returns true if any one of the specified files are being compacted - bool FilesInCompaction(std::vector& files); - void LogAndApplyHelper(Builder*b, Version* v, VersionEdit* edit, port::Mutex* mu); }; -// A Compaction encapsulates information about a compaction. -class Compaction { - public: - ~Compaction(); - - // Return the level that is being compacted. Inputs from "level" - // will be merged. - int level() const { return level_; } - - // Outputs will go to this level - int output_level() const { return out_level_; } - - // Return the object that holds the edits to the descriptor done - // by this compaction. 
- VersionEdit* edit() { return edit_; } - - // "which" must be either 0 or 1 - int num_input_files(int which) const { return inputs_[which].size(); } - - // Return the ith input file at "level()+which" ("which" must be 0 or 1). - FileMetaData* input(int which, int i) const { return inputs_[which][i]; } - - // Maximum size of files to build during this compaction. - uint64_t MaxOutputFileSize() const { return max_output_file_size_; } - - // Whether compression will be enabled for compaction outputs - bool enable_compression() const { return enable_compression_; } - - // Is this a trivial compaction that can be implemented by just - // moving a single input file to the next level (no merging or splitting) - bool IsTrivialMove() const; - - // Add all inputs to this compaction as delete operations to *edit. - void AddInputDeletions(VersionEdit* edit); - - // Returns true if the information we have available guarantees that - // the compaction is producing data in "level+1" for which no data exists - // in levels greater than "level+1". - bool IsBaseLevelForKey(const Slice& user_key); - - // Returns true iff we should stop building the current output - // before processing "internal_key". - bool ShouldStopBefore(const Slice& internal_key); - - // Release the input version for the compaction, once the compaction - // is successful. - void ReleaseInputs(); - - void Summary(char* output, int len); - - // Return the score that was used to pick this compaction run. - double score() const { return score_; } - - // Is this compaction creating a file in the bottom most level? - bool BottomMostLevel() { return bottommost_level_; } - - // Does this compaction include all sst files? 
- bool IsFullCompaction() { return is_full_compaction_; } - - private: - friend class Version; - friend class VersionSet; - - Compaction(int level, - int out_level, - uint64_t target_file_size, - uint64_t max_grandparent_overlap_bytes, - int number_levels, - Version* input_version, - bool seek_compaction = false, - bool enable_compression = true); - - int level_; - int out_level_; // levels to which output files are stored - uint64_t max_output_file_size_; - uint64_t maxGrandParentOverlapBytes_; - Version* input_version_; - VersionEdit* edit_; - int number_levels_; - - bool seek_compaction_; - bool enable_compression_; - - // Each compaction reads inputs from "level_" and "level_+1" - std::vector inputs_[2]; // The two sets of inputs - - // State used to check for number of of overlapping grandparent files - // (parent == level_ + 1, grandparent == level_ + 2) - std::vector grandparents_; - size_t grandparent_index_; // Index in grandparent_starts_ - bool seen_key_; // Some output key has been seen - uint64_t overlapped_bytes_; // Bytes of overlap between current output - // and grandparent files - int base_index_; // index of the file in files_[level_] - int parent_index_; // index of some file with same range in files_[level_+1] - double score_; // score that was used to pick this compaction. - - // Is this compaction creating a file in the bottom most level? - bool bottommost_level_; - // Does this compaction include all sst files? - bool is_full_compaction_; - - // level_ptrs_ holds indices into input_version_->levels_: our state - // is that we are positioned at one of the file ranges for each - // higher level than the ones involved in this compaction (i.e. for - // all L >= level_ + 2). 
- std::vector level_ptrs_; - - // mark (or clear) all files that are being compacted - void MarkFilesBeingCompacted(bool); - - // Initialize whether compaction producing files at the bottommost level - void SetupBottomMostLevel(bool isManual); - - // In case of compaction error, reset the nextIndex that is used - // to pick up the next file to be compacted from files_by_size_ - void ResetNextCompactionIndex(); -}; - } // namespace rocksdb diff --git a/db/version_set_reduce_num_levels.cc b/db/version_set_reduce_num_levels.cc index 89653ea7c..e22b82a5a 100644 --- a/db/version_set_reduce_num_levels.cc +++ b/db/version_set_reduce_num_levels.cc @@ -26,7 +26,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { // TODO this only works for default column family now Version* current_version = column_family_data_.find(0)->second->current; - int current_levels = NumberLevels(); + int current_levels = current_version->NumberLevels(); if (current_levels <= new_levels) { return Status::OK(); @@ -37,7 +37,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { int first_nonempty_level = -1; int first_nonempty_level_filenum = 0; for (int i = new_levels - 1; i < current_levels; i++) { - int file_num = NumLevelFiles(i); + int file_num = current_version->NumLevelFiles(i); if (file_num != 0) { if (first_nonempty_level < 0) { first_nonempty_level = i; @@ -66,15 +66,12 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) { delete[] current_version->files_; current_version->files_ = new_files_list; + current_version->num_levels_ = new_levels; - delete[] compact_pointer_; - delete[] max_file_size_; - delete[] level_max_bytes_; num_levels_ = new_levels; - compact_pointer_ = new std::string[new_levels]; - Init(new_levels); - VersionEdit ve(new_levels); - st = LogAndApply(&ve , mu, true); + compaction_picker_->ReduceNumberOfLevels(new_levels); + VersionEdit ve; + st = LogAndApply(&ve, mu, true); return st; } diff --git 
a/db/write_batch.cc b/db/write_batch.cc index eae0903c6..af4790ce5 100644 --- a/db/write_batch.cc +++ b/db/write_batch.cc @@ -31,7 +31,7 @@ #include "db/snapshot.h" #include "db/write_batch_internal.h" #include "util/coding.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" #include namespace rocksdb { @@ -39,7 +39,8 @@ namespace rocksdb { // WriteBatch header has an 8-byte sequence number followed by a 4-byte count. static const size_t kHeader = 12; -WriteBatch::WriteBatch() { +WriteBatch::WriteBatch(size_t reserved_bytes) { + rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader); Clear(); } diff --git a/db/write_batch_test.cc b/db/write_batch_test.cc index 490a4401f..396e3ea6e 100644 --- a/db/write_batch_test.cc +++ b/db/write_batch_test.cc @@ -22,10 +22,11 @@ namespace rocksdb { static std::string PrintContents(WriteBatch* b) { InternalKeyComparator cmp(BytewiseComparator()); auto factory = std::make_shared(); - MemTable* mem = new MemTable(cmp, factory.get()); + Options options; + options.memtable_factory = factory; + MemTable* mem = new MemTable(cmp, options); mem->Ref(); std::string state; - Options options; Status s = WriteBatchInternal::InsertInto(b, mem, &options); int count = 0; Iterator* iter = mem->NewIterator(); diff --git a/include/rocksdb/db.h b/include/rocksdb/db.h index 086a18014..b60c96cbe 100644 --- a/include/rocksdb/db.h +++ b/include/rocksdb/db.h @@ -291,6 +291,7 @@ class DB { } // Compact the underlying storage for the key range [*begin,*end]. + // The actual compaction interval might be superset of [*begin, *end]. // In particular, deleted and overwritten versions are discarded, // and the data is rearranged to reduce the cost of operations // needed to access the data. 
This operation should typically only diff --git a/include/rocksdb/memtablerep.h b/include/rocksdb/memtablerep.h index fcb782d41..2fca8d161 100644 --- a/include/rocksdb/memtablerep.h +++ b/include/rocksdb/memtablerep.h @@ -111,27 +111,23 @@ class MemTableRep { }; // Return an iterator over the keys in this representation. - virtual std::shared_ptr GetIterator() = 0; + virtual Iterator* GetIterator() = 0; // Return an iterator over at least the keys with the specified user key. The // iterator may also allow access to other keys, but doesn't have to. Default: // GetIterator(). - virtual std::shared_ptr GetIterator(const Slice& user_key) { - return GetIterator(); - } + virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); } // Return an iterator over at least the keys with the specified prefix. The // iterator may also allow access to other keys, but doesn't have to. Default: // GetIterator(). - virtual std::shared_ptr GetPrefixIterator(const Slice& prefix) { + virtual Iterator* GetPrefixIterator(const Slice& prefix) { return GetIterator(); } // Return an iterator that has a special Seek semantics. The result of // a Seek might only include keys with the same prefix as the target key. 
- virtual std::shared_ptr GetDynamicPrefixIterator() { - return GetIterator(); - } + virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); } protected: // When *key is an internal key concatenated with the value, returns the @@ -144,8 +140,8 @@ class MemTableRep { class MemTableRepFactory { public: virtual ~MemTableRepFactory() { }; - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) = 0; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) = 0; virtual const char* Name() const = 0; }; @@ -161,8 +157,8 @@ class VectorRepFactory : public MemTableRepFactory { const size_t count_; public: explicit VectorRepFactory(size_t count = 0) : count_(count) { } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "VectorRepFactory"; } @@ -171,8 +167,8 @@ public: // This uses a skip list to store keys. It is the default. class SkipListFactory : public MemTableRepFactory { public: - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator&, Arena*) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&, + Arena*) override; virtual const char* Name() const override { return "SkipListFactory"; } diff --git a/include/rocksdb/statistics.h b/include/rocksdb/statistics.h index 011e510f5..f5fbb5924 100644 --- a/include/rocksdb/statistics.h +++ b/include/rocksdb/statistics.h @@ -242,53 +242,10 @@ struct HistogramData { double standard_deviation; }; - -class Histogram { - public: - // clear's the histogram - virtual void Clear() = 0; - virtual ~Histogram(); - // Add a value to be recorded in the histogram. 
- virtual void Add(uint64_t value) = 0; - - virtual std::string ToString() const = 0; - - // Get statistics - virtual double Median() const = 0; - virtual double Percentile(double p) const = 0; - virtual double Average() const = 0; - virtual double StandardDeviation() const = 0; - virtual void Data(HistogramData * const data) const = 0; - -}; - -/** - * A dumb ticker which keeps incrementing through its life time. - * Thread safe. Locking managed by implementation of this interface. - */ -class Ticker { - public: - Ticker() : count_(0) { } - - inline void setTickerCount(uint64_t count) { - count_ = count; - } - - inline void recordTick(int count = 1) { - count_ += count; - } - - inline uint64_t getCount() { - return count_; - } - - private: - std::atomic_uint_fast64_t count_; -}; - // Analyze the performance of a db class Statistics { public: + virtual ~Statistics() {} virtual long getTickerCount(Tickers tickerType) = 0; virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0; diff --git a/include/rocksdb/write_batch.h b/include/rocksdb/write_batch.h index bc1d63ce4..a0072ce68 100644 --- a/include/rocksdb/write_batch.h +++ b/include/rocksdb/write_batch.h @@ -36,7 +36,7 @@ struct SliceParts; class WriteBatch { public: - WriteBatch(); + explicit WriteBatch(size_t reserved_bytes = 0); ~WriteBatch(); // Store the mapping "key->value" in the database. @@ -122,7 +122,10 @@ class WriteBatch { Status Iterate(Handler* handler) const; // Retrieve the serialized version of this batch. - std::string Data() { return rep_; } + const std::string& Data() const { return rep_; } + + // Retrieve data size of the batch. 
+ size_t GetDataSize() const { return rep_.size(); } // Returns the number of updates in the batch int Count() const; diff --git a/table/table_test.cc b/table/table_test.cc index 1f79fcdf9..9907550ce 100644 --- a/table/table_test.cc +++ b/table/table_test.cc @@ -12,7 +12,8 @@ #include #include "db/dbformat.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" +#include "util/statistics.h" #include "db/memtable.h" #include "db/write_batch_internal.h" #include "rocksdb/cache.h" @@ -370,7 +371,9 @@ class MemTableConstructor: public Constructor { : Constructor(cmp), internal_comparator_(cmp), table_factory_(new SkipListFactory) { - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options options; + options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, options); memtable_->Ref(); } ~MemTableConstructor() { @@ -378,7 +381,9 @@ class MemTableConstructor: public Constructor { } virtual Status FinishImpl(const Options& options, const KVMap& data) { delete memtable_->Unref(); - memtable_ = new MemTable(internal_comparator_, table_factory_.get()); + Options memtable_options; + memtable_options.memtable_factory = table_factory_; + memtable_ = new MemTable(internal_comparator_, memtable_options); memtable_->Ref(); int seq = 1; for (KVMap::const_iterator it = data.begin(); @@ -931,18 +936,12 @@ TEST(TableTest, NumBlockStat) { class BlockCacheProperties { public: explicit BlockCacheProperties(Statistics* statistics) { - block_cache_miss = - statistics->getTickerCount(BLOCK_CACHE_MISS); - block_cache_hit = - statistics->getTickerCount(BLOCK_CACHE_HIT); - index_block_cache_miss = - statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); - index_block_cache_hit = - statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); - data_block_cache_miss = - statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); - data_block_cache_hit = - statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); + block_cache_miss = 
statistics->getTickerCount(BLOCK_CACHE_MISS); + block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT); + index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); + index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT); + data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS); + data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT); } // Check if the fetched props matches the expected ones. @@ -1268,10 +1267,11 @@ class MemTableTest { }; TEST(MemTableTest, Simple) { InternalKeyComparator cmp(BytewiseComparator()); auto table_factory = std::make_shared(); - MemTable* memtable = new MemTable(cmp, table_factory.get()); + Options options; + options.memtable_factory = table_factory; + MemTable* memtable = new MemTable(cmp, options); memtable->Ref(); WriteBatch batch; - Options options; WriteBatchInternal::SetSequence(&batch, 100); batch.Put(std::string("k1"), std::string("v1")); batch.Put(std::string("k2"), std::string("v2")); diff --git a/tools/db_stress.cc b/tools/db_stress.cc index 966f007e8..8321c7eaf 100644 --- a/tools/db_stress.cc +++ b/tools/db_stress.cc @@ -26,7 +26,7 @@ #include #include "db/db_impl.h" #include "db/version_set.h" -#include "db/db_statistics.h" +#include "rocksdb/statistics.h" #include "rocksdb/cache.h" #include "utilities/utility_db.h" #include "rocksdb/env.h" diff --git a/util/hash_skiplist_rep.cc b/util/hash_skiplist_rep.cc index c669769e0..e9fe1573a 100644 --- a/util/hash_skiplist_rep.cc +++ b/util/hash_skiplist_rep.cc @@ -31,17 +31,15 @@ class HashSkipListRep : public MemTableRep { virtual ~HashSkipListRep(); - virtual std::shared_ptr GetIterator() override; + virtual MemTableRep::Iterator* GetIterator() override; - virtual std::shared_ptr GetIterator( - const Slice& slice) override; + virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override; - virtual std::shared_ptr GetPrefixIterator( - const Slice& prefix) override; - - virtual 
std::shared_ptr GetDynamicPrefixIterator() + virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix) override; + virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override; + private: friend class DynamicIterator; typedef SkipList Bucket; @@ -208,18 +206,15 @@ class HashSkipListRep : public MemTableRep { virtual void SeekToLast() { } private: }; - - std::shared_ptr empty_iterator_; }; HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare, - Arena* arena, const SliceTransform* transform, size_t bucket_size) - : bucket_size_(bucket_size), - transform_(transform), - compare_(compare), - arena_(arena), - empty_iterator_(std::make_shared()) { - + Arena* arena, const SliceTransform* transform, + size_t bucket_size) + : bucket_size_(bucket_size), + transform_(transform), + compare_(compare), + arena_(arena) { buckets_ = new port::AtomicPointer[bucket_size]; for (size_t i = 0; i < bucket_size_; ++i) { @@ -263,7 +258,7 @@ size_t HashSkipListRep::ApproximateMemoryUsage() { return sizeof(buckets_); } -std::shared_ptr HashSkipListRep::GetIterator() { +MemTableRep::Iterator* HashSkipListRep::GetIterator() { auto list = new Bucket(compare_, arena_); for (size_t i = 0; i < bucket_size_; ++i) { auto bucket = GetBucket(i); @@ -274,35 +269,30 @@ std::shared_ptr HashSkipListRep::GetIterator() { } } } - return std::make_shared(list); + return new Iterator(list); } -std::shared_ptr HashSkipListRep::GetPrefixIterator( - const Slice& prefix) { +MemTableRep::Iterator* HashSkipListRep::GetPrefixIterator(const Slice& prefix) { auto bucket = GetBucket(prefix); if (bucket == nullptr) { - return empty_iterator_; + return new EmptyIterator(); } - return std::make_shared(bucket, false); + return new Iterator(bucket, false); } -std::shared_ptr HashSkipListRep::GetIterator( - const Slice& slice) { +MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) { return GetPrefixIterator(transform_->Transform(slice)); } -std::shared_ptr - 
HashSkipListRep::GetDynamicPrefixIterator() { - return std::make_shared(*this); +MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() { + return new DynamicIterator(*this); } } // anon namespace -std::shared_ptr -HashSkipListRepFactory::CreateMemTableRep(MemTableRep::KeyComparator &compare, - Arena *arena) { - return std::make_shared(compare, arena, transform_, - bucket_count_); +MemTableRep* HashSkipListRepFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return new HashSkipListRep(compare, arena, transform_, bucket_count_); } MemTableRepFactory* NewHashSkipListRepFactory( diff --git a/util/hash_skiplist_rep.h b/util/hash_skiplist_rep.h index b946cf05e..7b8414c88 100644 --- a/util/hash_skiplist_rep.h +++ b/util/hash_skiplist_rep.h @@ -21,8 +21,8 @@ class HashSkipListRepFactory : public MemTableRepFactory { virtual ~HashSkipListRepFactory() { delete transform_; } - virtual std::shared_ptr CreateMemTableRep( - MemTableRep::KeyComparator& compare, Arena* arena) override; + virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare, + Arena* arena) override; virtual const char* Name() const override { return "HashSkipListRepFactory"; diff --git a/util/histogram.cc b/util/histogram.cc index e83998014..968769cef 100644 --- a/util/histogram.cc +++ b/util/histogram.cc @@ -16,27 +16,38 @@ namespace rocksdb { -HistogramBucketMapper::HistogramBucketMapper() : - // Add newer bucket index here. - // Should be alwyas added in sorted order. 
- bucketValues_({ - 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, - 50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, - 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, - 3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, - 16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, - 70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000, - 250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000, - 900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, - 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000, - 9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000, - 25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000, - 70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000, - 180000000, 200000000, 250000000, 300000000, 350000000, 400000000, - 450000000, 500000000, 600000000, 700000000, 800000000, 900000000, - 1000000000}), - maxBucketValue_(bucketValues_.back()), - minBucketValue_(bucketValues_.front()) { +HistogramBucketMapper::HistogramBucketMapper() + : + // Add newer bucket index here. + // Should be alwyas added in sorted order. 
+ // If you change this, you also need to change + // size of array buckets_ in HistogramImpl + bucketValues_( + {1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 12, 14, + 16, 18, 20, 25, 30, 35, + 40, 45, 50, 60, 70, 80, + 90, 100, 120, 140, 160, 180, + 200, 250, 300, 350, 400, 450, + 500, 600, 700, 800, 900, 1000, + 1200, 1400, 1600, 1800, 2000, 2500, + 3000, 3500, 4000, 4500, 5000, 6000, + 7000, 8000, 9000, 10000, 12000, 14000, + 16000, 18000, 20000, 25000, 30000, 35000, + 40000, 45000, 50000, 60000, 70000, 80000, + 90000, 100000, 120000, 140000, 160000, 180000, + 200000, 250000, 300000, 350000, 400000, 450000, + 500000, 600000, 700000, 800000, 900000, 1000000, + 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, + 3000000, 3500000, 4000000, 4500000, 5000000, 6000000, + 7000000, 8000000, 9000000, 10000000, 12000000, 14000000, + 16000000, 18000000, 20000000, 25000000, 30000000, 35000000, + 40000000, 45000000, 50000000, 60000000, 70000000, 80000000, + 90000000, 100000000, 120000000, 140000000, 160000000, 180000000, + 200000000, 250000000, 300000000, 350000000, 400000000, 450000000, + 500000000, 600000000, 700000000, 800000000, 900000000, 1000000000}), + maxBucketValue_(bucketValues_.back()), + minBucketValue_(bucketValues_.front()) { for (size_t i =0; i < bucketValues_.size(); ++i) { valueIndexMap_[bucketValues_[i]] = i; } @@ -62,24 +73,17 @@ namespace { const HistogramBucketMapper bucketMapper; } - -HistogramImpl::HistogramImpl() : - min_(bucketMapper.LastValue()), - max_(0), - num_(0), - sum_(0), - sum_squares_(0), - buckets_(std::vector(bucketMapper.BucketCount(), 0)) {} - void HistogramImpl::Clear() { min_ = bucketMapper.LastValue(); max_ = 0; num_ = 0; sum_ = 0; sum_squares_ = 0; - buckets_.resize(bucketMapper.BucketCount(), 0); + memset(buckets_, 0, sizeof buckets_); } +bool HistogramImpl::Empty() { return sum_squares_ == 0; } + void HistogramImpl::Add(uint64_t value) { const size_t index = bucketMapper.IndexForValue(value); buckets_[index] += 1; diff --git 
a/util/histogram.h b/util/histogram.h index c01594da7..d95588dc2 100644 --- a/util/histogram.h +++ b/util/histogram.h @@ -52,9 +52,8 @@ class HistogramBucketMapper { class HistogramImpl { public: - HistogramImpl(); - virtual ~HistogramImpl() {} virtual void Clear(); + virtual bool Empty(); virtual void Add(uint64_t value); void Merge(const HistogramImpl& other); @@ -67,13 +66,14 @@ class HistogramImpl { virtual void Data(HistogramData * const data) const; private: - double min_; - double max_; - double num_; - double sum_; - double sum_squares_; - std::vector buckets_; - + // To be able to use HistogramImpl as thread local variable, its constructor + // has to be static. That's why we're using manually values from BucketMapper + double min_ = 1000000000; // this is BucketMapper:LastValue() + double max_ = 0; + double num_ = 0; + double sum_ = 0; + double sum_squares_ = 0; + uint64_t buckets_[138] = {0}; // this is BucketMapper::BucketCount() }; } // namespace rocksdb diff --git a/util/ldb_cmd.cc b/util/ldb_cmd.cc index 58d81460e..65ecd61a2 100644 --- a/util/ldb_cmd.cc +++ b/util/ldb_cmd.cc @@ -1024,7 +1024,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt, } int max = -1; for (int i = 0; i < versions.NumberLevels(); i++) { - if (versions.NumLevelFiles(i)) { + if (versions.current()->NumLevelFiles(i)) { max = i; } } diff --git a/util/manual_compaction_test.cc b/util/manual_compaction_test.cc index ebe1339e5..dd615f057 100644 --- a/util/manual_compaction_test.cc +++ b/util/manual_compaction_test.cc @@ -9,9 +9,13 @@ #include #include "rocksdb/db.h" +#include "rocksdb/compaction_filter.h" +#include "rocksdb/slice.h" #include "rocksdb/write_batch.h" #include "util/testharness.h" +using namespace rocksdb; + namespace { const int kNumKeys = 1100000; @@ -26,12 +30,71 @@ std::string Key2(int i) { return Key1(i) + "_xxx"; } -class ManualCompactionTest { }; +class ManualCompactionTest { + public: + ManualCompactionTest() { + // Get rid of any state from an old 
run. + dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; + DestroyDB(dbname_, rocksdb::Options()); + } + + std::string dbname_; +}; + +class DestroyAllCompactionFilter : public CompactionFilter { + public: + DestroyAllCompactionFilter() {} + + virtual bool Filter(int level, + const Slice& key, + const Slice& existing_value, + std::string* new_value, + bool* value_changed) const { + return existing_value.ToString() == "destroy"; + } + + virtual const char* Name() const { + return "DestroyAllCompactionFilter"; + } +}; + +TEST(ManualCompactionTest, CompactTouchesAllKeys) { + for (int iter = 0; iter < 2; ++iter) { + DB* db; + Options options; + if (iter == 0) { // level compaction + options.num_levels = 3; + options.compaction_style = kCompactionStyleLevel; + } else { // universal compaction + options.compaction_style = kCompactionStyleUniversal; + } + options.create_if_missing = true; + options.compression = rocksdb::kNoCompression; + options.compaction_filter = new DestroyAllCompactionFilter(); + ASSERT_OK(DB::Open(options, dbname_, &db)); + + db->Put(WriteOptions(), Slice("key1"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key2"), Slice("destroy")); + db->Put(WriteOptions(), Slice("key3"), Slice("value3")); + db->Put(WriteOptions(), Slice("key4"), Slice("destroy")); + + Slice key4("key4"); + db->CompactRange(nullptr, &key4); + Iterator* itr = db->NewIterator(ReadOptions()); + itr->SeekToFirst(); + ASSERT_TRUE(itr->Valid()); + ASSERT_EQ("key3", itr->key().ToString()); + itr->Next(); + ASSERT_TRUE(!itr->Valid()); + delete itr; + + delete options.compaction_filter; + delete db; + DestroyDB(dbname_, options); + } +} TEST(ManualCompactionTest, Test) { - // Get rid of any state from an old run. - std::string dbpath = rocksdb::test::TmpDir() + "/rocksdb_cbug_test"; - DestroyDB(dbpath, rocksdb::Options()); // Open database. 
Disable compression since it affects the creation // of layers and the code below is trying to test against a very @@ -40,7 +103,7 @@ TEST(ManualCompactionTest, Test) { rocksdb::Options db_options; db_options.create_if_missing = true; db_options.compression = rocksdb::kNoCompression; - ASSERT_OK(rocksdb::DB::Open(db_options, dbpath, &db)); + ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db)); // create first key range rocksdb::WriteBatch batch; @@ -83,7 +146,7 @@ TEST(ManualCompactionTest, Test) { // close database delete db; - DestroyDB(dbpath, rocksdb::Options()); + DestroyDB(dbname_, rocksdb::Options()); } } // anonymous namespace diff --git a/util/skiplistrep.cc b/util/skiplistrep.cc index 955d754b1..a5b072ad1 100644 --- a/util/skiplistrep.cc +++ b/util/skiplistrep.cc @@ -90,15 +90,15 @@ public: // Unhide default implementations of GetIterator using MemTableRep::GetIterator; - virtual std::shared_ptr GetIterator() override { - return std::make_shared(&skip_list_); + virtual MemTableRep::Iterator* GetIterator() override { + return new SkipListRep::Iterator(&skip_list_); } }; } -std::shared_ptr SkipListFactory::CreateMemTableRep ( - MemTableRep::KeyComparator& compare, Arena* arena) { - return std::shared_ptr(new SkipListRep(compare, arena)); +MemTableRep* SkipListFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return new SkipListRep(compare, arena); } } // namespace rocksdb diff --git a/util/statistics.cc b/util/statistics.cc index 5f7a5ba46..f19a777c1 100644 --- a/util/statistics.cc +++ b/util/statistics.cc @@ -3,12 +3,48 @@ // LICENSE file in the root directory of this source tree. An additional grant // of patent rights can be found in the PATENTS file in the same directory. 
// +#include "util/statistics.h" #include "rocksdb/statistics.h" #include namespace rocksdb { +std::shared_ptr CreateDBStatistics() { + return std::make_shared(); +} + +StatisticsImpl::StatisticsImpl() {} + +StatisticsImpl::~StatisticsImpl() {} + +long StatisticsImpl::getTickerCount(Tickers tickerType) { + assert(tickerType < TICKER_ENUM_MAX); + return tickers_[tickerType]; +} + +void StatisticsImpl::setTickerCount(Tickers tickerType, uint64_t count) { + assert(tickerType < TICKER_ENUM_MAX); + tickers_[tickerType] = count; +} + +void StatisticsImpl::recordTick(Tickers tickerType, uint64_t count) { + assert(tickerType < TICKER_ENUM_MAX); + tickers_[tickerType] += count; +} + +void StatisticsImpl::measureTime(Histograms histogramType, uint64_t value) { + assert(histogramType < HISTOGRAM_ENUM_MAX); + histograms_[histogramType].Add(value); +} + +void StatisticsImpl::histogramData(Histograms histogramType, + HistogramData* const data) { + assert(histogramType < HISTOGRAM_ENUM_MAX); + histograms_[histogramType].Data(data); +} + namespace { + // a buffer size used for temp string buffers const int kBufferSize = 200; @@ -32,11 +68,8 @@ std::string HistogramToString ( return std::string(buffer); }; -std::string TickerToString ( - Statistics* dbstats, - const Tickers& ticker, - const std::string& name) { - +std::string TickerToString(Statistics* dbstats, const Tickers& ticker, + const std::string& name) { char buffer[kBufferSize]; snprintf(buffer, kBufferSize, "%s COUNT : %ld\n", name.c_str(), dbstats->getTickerCount(ticker)); diff --git a/util/statistics.h b/util/statistics.h new file mode 100644 index 000000000..36456dddc --- /dev/null +++ b/util/statistics.h @@ -0,0 +1,53 @@ +// Copyright (c) 2013, Facebook, Inc. All rights reserved. +// This source code is licensed under the BSD-style license found in the +// LICENSE file in the root directory of this source tree. An additional grant +// of patent rights can be found in the PATENTS file in the same directory. 
+// +#pragma once +#include "rocksdb/statistics.h" +#include "util/histogram.h" +#include "util/mutexlock.h" + +#define UNLIKELY(val) (__builtin_expect((val), 0)) + +namespace rocksdb { + +class StatisticsImpl : public Statistics { + public: + StatisticsImpl(); + virtual ~StatisticsImpl(); + + virtual long getTickerCount(Tickers tickerType); + virtual void setTickerCount(Tickers tickerType, uint64_t count); + virtual void recordTick(Tickers tickerType, uint64_t count); + virtual void measureTime(Histograms histogramType, uint64_t value); + virtual void histogramData(Histograms histogramType, + HistogramData* const data); + + private: + std::atomic_uint_fast64_t tickers_[TICKER_ENUM_MAX]; + HistogramImpl histograms_[HISTOGRAM_ENUM_MAX]; +}; + +// Utility functions +inline void MeasureTime(Statistics* statistics, Histograms histogramType, + uint64_t value) { + if (statistics) { + statistics->measureTime(histogramType, value); + } +} + +inline void RecordTick(Statistics* statistics, Tickers ticker, + uint64_t count = 1) { + if (statistics) { + statistics->recordTick(ticker, count); + } +} + +inline void SetTickerCount(Statistics* statistics, Tickers ticker, + uint64_t count) { + if (statistics) { + statistics->setTickerCount(ticker, count); + } +} +} diff --git a/util/statistics_imp.h b/util/statistics_imp.h deleted file mode 100644 index 0dc8884c1..000000000 --- a/util/statistics_imp.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) 2013, Facebook, Inc. All rights reserved. -// This source code is licensed under the BSD-style license found in the -// LICENSE file in the root directory of this source tree. An additional grant -// of patent rights can be found in the PATENTS file in the same directory. 
-// -#pragma once -#include "rocksdb/statistics.h" - -namespace rocksdb { - -// Utility functions -inline void RecordTick(Statistics* statistics, - Tickers ticker, - uint64_t count = 1) { - assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); - assert(TickersNameMap.size() == TICKER_ENUM_MAX); - if (statistics) { - statistics->recordTick(ticker, count); - } -} - -inline void SetTickerCount(Statistics* statistics, - Tickers ticker, - uint64_t count) { - assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX); - assert(TickersNameMap.size() == TICKER_ENUM_MAX); - if (statistics) { - statistics->setTickerCount(ticker, count); - } -} - -} diff --git a/util/stop_watch.h b/util/stop_watch.h index 6325a7440..48e1b01c2 100644 --- a/util/stop_watch.h +++ b/util/stop_watch.h @@ -5,7 +5,7 @@ // #pragma once #include "rocksdb/env.h" -#include "util/statistics_imp.h" +#include "util/statistics.h" namespace rocksdb { // Auto-scoped. @@ -28,11 +28,7 @@ class StopWatch { return env_->NowMicros() - start_time_; } - ~StopWatch() { - if (statistics_) { - statistics_->measureTime(histogram_name_, ElapsedMicros()); - } - } + ~StopWatch() { MeasureTime(statistics_, histogram_name_, ElapsedMicros()); } private: Env* const env_; diff --git a/util/vectorrep.cc b/util/vectorrep.cc index 8d3ccc9df..87fae4bc7 100644 --- a/util/vectorrep.cc +++ b/util/vectorrep.cc @@ -88,7 +88,7 @@ class VectorRep : public MemTableRep { using MemTableRep::GetIterator; // Return an iterator over the keys in this representation. - virtual std::shared_ptr GetIterator() override; + virtual MemTableRep::Iterator* GetIterator() override; private: friend class Iterator; @@ -228,22 +228,22 @@ void VectorRep::Iterator::SeekToLast() { } } -std::shared_ptr VectorRep::GetIterator() { +MemTableRep::Iterator* VectorRep::GetIterator() { ReadLock l(&rwlock_); // Do not sort here. The sorting would be done the first time // a Seek is performed on the iterator. 
if (immutable_) { - return std::make_shared(this, bucket_, compare_); + return new Iterator(this, bucket_, compare_); } else { std::shared_ptr tmp; tmp.reset(new Bucket(*bucket_)); // make a copy - return std::make_shared(nullptr, tmp, compare_); + return new Iterator(nullptr, tmp, compare_); } } } // anon namespace -std::shared_ptr VectorRepFactory::CreateMemTableRep( - MemTableRep::KeyComparator& compare, Arena* arena) { - return std::make_shared(compare, arena, count_); +MemTableRep* VectorRepFactory::CreateMemTableRep( + MemTableRep::KeyComparator& compare, Arena* arena) { + return new VectorRep(compare, arena, count_); } } // namespace rocksdb