Merge branch 'master' into columnfamilies

Conflicts:
	db/db_impl.cc
	db/db_impl_readonly.cc
	db/db_test.cc
	db/version_edit.cc
	db/version_edit.h
	db/version_set.cc
	db/version_set.h
	db/version_set_reduce_num_levels.cc
main
Igor Canadi 11 years ago
commit 23f6791c9e
  1. 42
      .clang-format
  2. 19
      Makefile
  3. 4
      build_tools/build_detect_platform
  4. 2
      build_tools/fbcode.gcc481.sh
  5. 109
      build_tools/format-diff.sh
  6. 214
      db/compaction.cc
  7. 134
      db/compaction.h
  8. 847
      db/compaction_picker.cc
  9. 162
      db/compaction_picker.h
  10. 9
      db/db_bench.cc
  11. 2
      db/db_filesnapshot.cc
  12. 293
      db/db_impl.cc
  13. 20
      db/db_impl.h
  14. 2
      db/db_impl_readonly.cc
  15. 14
      db/db_statistics.cc
  16. 63
      db/db_statistics.h
  17. 11
      db/db_stats_logger.cc
  18. 286
      db/db_test.cc
  19. 38
      db/memtable.cc
  20. 9
      db/memtable.h
  21. 2
      db/merge_helper.cc
  22. 5
      db/repair.cc
  23. 2
      db/simple_table_db_test.cc
  24. 26
      db/version_edit.cc
  25. 10
      db/version_edit.h
  26. 7
      db/version_edit_test.cc
  27. 1774
      db/version_set.cc
  28. 318
      db/version_set.h
  29. 15
      db/version_set_reduce_num_levels.cc
  30. 5
      db/write_batch.cc
  31. 5
      db/write_batch_test.cc
  32. 1
      include/rocksdb/db.h
  33. 24
      include/rocksdb/memtablerep.h
  34. 45
      include/rocksdb/statistics.h
  35. 7
      include/rocksdb/write_batch.h
  36. 34
      table/table_test.cc
  37. 2
      tools/db_stress.cc
  38. 54
      util/hash_skiplist_rep.cc
  39. 4
      util/hash_skiplist_rep.h
  40. 66
      util/histogram.cc
  41. 18
      util/histogram.h
  42. 2
      util/ldb_cmd.cc
  43. 75
      util/manual_compaction_test.cc
  44. 10
      util/skiplistrep.cc
  45. 43
      util/statistics.cc
  46. 53
      util/statistics.h
  47. 32
      util/statistics_imp.h
  48. 8
      util/stop_watch.h
  49. 14
      util/vectorrep.cc

@ -2,46 +2,4 @@
# http://clang.llvm.org/docs/ClangFormatStyleOptions.html # http://clang.llvm.org/docs/ClangFormatStyleOptions.html
--- ---
BasedOnStyle: Google BasedOnStyle: Google
AccessModifierOffset: -1
ConstructorInitializerIndentWidth: 4
AlignEscapedNewlinesLeft: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakTemplateDeclarations: true
AlwaysBreakBeforeMultilineStrings: true
BreakBeforeBinaryOperators: false
BreakConstructorInitializersBeforeComma: false
BinPackParameters: false
ColumnLimit: 80
ConstructorInitializerAllOnOneLineOrOnePerLine: true
DerivePointerBinding: true
ExperimentalAutoDetectBinPacking: true
IndentCaseLabels: false
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCSpaceBeforeProtocolList: false
PenaltyBreakBeforeFirstCallParameter: 10
PenaltyBreakComment: 60
PenaltyBreakString: 1000
PenaltyBreakFirstLessLess: 20
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerBindsToType: true
SpacesBeforeTrailingComments: 2
Cpp11BracedListStyle: true
Standard: Cpp11
IndentWidth: 2
TabWidth: 8
UseTab: Never
BreakBeforeBraces: Attach
IndentFunctionDeclarationAfterType: false
SpacesInParentheses: false
SpacesInAngles: false
SpaceInEmptyParentheses: false
SpacesInCStyleCastParentheses: false
SpaceAfterControlStatementKeyword: true
SpaceBeforeAssignmentOperators: true
ContinuationIndentWidth: 4
... ...

@ -128,19 +128,21 @@ $(SHARED2): $(SHARED3)
ln -fs $(SHARED3) $(SHARED2) ln -fs $(SHARED3) $(SHARED2)
endif endif
$(SHARED3): $(SHARED3): $(LIBOBJECTS)
$(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(COVERAGEFLAGS) $(PLATFORM_SHARED_CFLAGS) $(SOURCES) -o $@ $(LDFLAGS) $(CXX) $(PLATFORM_SHARED_LDFLAGS)$(SHARED2) $(CXXFLAGS) $(PLATFORM_SHARED_CFLAGS) $(LDFLAGS) $(SOURCES)-o $@
endif # PLATFORM_SHARED_EXT endif # PLATFORM_SHARED_EXT
all: $(LIBRARY) $(PROGRAMS) all: $(LIBRARY) $(PROGRAMS)
.PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \ .PHONY: blackbox_crash_test check clean coverage crash_test ldb_tests \
release tags valgrind_check whitebox_crash_test release tags valgrind_check whitebox_crash_test format
# Will also generate shared libraries.
release: release:
$(MAKE) clean $(MAKE) clean
OPT=-DNDEBUG $(MAKE) -j32 OPT=-DNDEBUG $(MAKE) all -j32
OPT=-DNDEBUG $(MAKE) $(SHARED) -j32
coverage: coverage:
$(MAKE) clean $(MAKE) clean
@ -197,6 +199,9 @@ tags:
ctags * -R ctags * -R
cscope -b `find . -name '*.cc'` `find . -name '*.h'` cscope -b `find . -name '*.cc'` `find . -name '*.h'`
format:
build_tools/format-diff.sh
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Unit tests and tools # Unit tests and tools
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -415,6 +420,12 @@ DEPFILES = $(filter-out util/build_version.d,$(SOURCES:.cc=.d))
depend: $(DEPFILES) depend: $(DEPFILES)
# if the make goal is either "clean" or "format", we shouldn't
# try to import the *.d files.
# TODO(kailiu) The unfamiliarity of Make's conditions leads to the ugly
# working solution.
ifneq ($(MAKECMDGOALS),clean) ifneq ($(MAKECMDGOALS),clean)
ifneq ($(MAKECMDGOALS),format)
-include $(DEPFILES) -include $(DEPFILES)
endif endif
endif

@ -81,9 +81,9 @@ PLATFORM_CCFLAGS=
PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}" PLATFORM_CXXFLAGS="$PLATFORM_CXXFLAGS ${CXXFLAGS}"
PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS" PLATFORM_LDFLAGS="$PLATFORM_LDFLAGS"
PLATFORM_SHARED_EXT="so" PLATFORM_SHARED_EXT="so"
PLATFORM_SHARED_LDFLAGS="${EXEC_LDFLAGS_SHARED} -shared -Wl,-soname -Wl," PLATFORM_SHARED_LDFLAGS="-shared -Wl,-soname -Wl,"
PLATFORM_SHARED_CFLAGS="-fPIC" PLATFORM_SHARED_CFLAGS="-fPIC"
PLATFORM_SHARED_VERSIONED=true PLATFORM_SHARED_VERSIONED=false
# generic port files (working on all platform by #ifdef) go directly in /port # generic port files (working on all platform by #ifdef) go directly in /port
GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "` GENERIC_PORT_FILES=`find $ROCKSDB_ROOT/port -name '*.cc' | tr "\n" " "`

@ -60,7 +60,7 @@ AR=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ar
RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib RANLIB=$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/ranlib
CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic" CFLAGS="-B$TOOLCHAIN_EXECUTABLES/binutils/binutils-2.21.1/da39a3e/bin/gold -m64 -mtune=generic"
CFLAGS+=" -nostdlib $LIBGCC_INCLUDE $GLIBC_INCLUDE" CFLAGS+=" $LIBGCC_INCLUDE $GLIBC_INCLUDE"
CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT" CFLAGS+=" -DROCKSDB_PLATFORM_POSIX -DROCKSDB_ATOMIC_PRESENT -DROCKSDB_FALLOCATE_PRESENT"
CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2" CFLAGS+=" -DSNAPPY -DGFLAGS -DZLIB -DBZIP2"

@ -0,0 +1,109 @@
#!/bin/bash
# Checks the clang-format style of the lines touched by the working tree (or,
# when the working tree is clean, by the latest commit) and optionally fixes
# them in place.
#
# Environment:
#   CLANG_FORMAT_DIFF  command used to run clang-format-diff.py
#                      (defaults to "clang-format-diff.py" found via PATH).
#
# Exit codes:
#   0    nothing to reformat, or the files were reformatted
#   1    user declined the automatic fix
#   128  clang-format-diff.py is not available
#   129  the python "argparse" module is not available

# If clang_format_diff.py command is not specified, we assume we are able to
# access directly without any path.
# (The variable is quoted so that an empty value or a value containing spaces
# does not break the test.)
if [ -z "$CLANG_FORMAT_DIFF" ]
then
  CLANG_FORMAT_DIFF="clang-format-diff.py"
fi

# Check clang-format-diff.py
if ! which $CLANG_FORMAT_DIFF &> /dev/null
then
  echo "You didn't have clang-format-diff.py available in your computer!"
  echo "You can download it by running: "
  echo " curl http://goo.gl/iUW1u2"
  exit 128
fi

# Check argparse, a library that clang-format-diff.py requires.
python 2>/dev/null << EOF
import argparse
EOF

if [ "$?" != 0 ]
then
  echo "To run clang-format-diff.py, we'll need the library \"argparse\" to be"
  echo "installed. You can try either of the follow ways to install it:"
  echo " 1. Manually download argparse: https://pypi.python.org/pypi/argparse"
  echo " 2. easy_install argparse (if you have easy_install)"
  echo " 3. pip install argparse (if you have pip)"
  exit 129
fi

# TODO(kailiu) following work is not complete since we still need to figure
# out how to add the modified files done pre-commit hook to git's commit index.
#
# Check if this script has already been added to pre-commit hook.
# Will suggest user to add this script to pre-commit hook if their pre-commit
# is empty.
# PRE_COMMIT_SCRIPT_PATH="`git rev-parse --show-toplevel`/.git/hooks/pre-commit"
# if ! ls $PRE_COMMIT_SCRIPT_PATH &> /dev/null
# then
#   echo "Would you like to add this script to pre-commit hook, which will do "
#   echo -n "the format check for all the affected lines before you check in (y/n):"
#   read add_to_hook
#   if [ "$add_to_hook" == "y" ]
#   then
#     ln -s `git rev-parse --show-toplevel`/build_tools/format-diff.sh $PRE_COMMIT_SCRIPT_PATH
#   fi
# fi

# From here on, any failing command aborts the script.
set -e

uncommitted_code=`git diff HEAD`

# If there are no uncommitted changes, we assume the user is doing a
# post-commit format check, in which case we check the lines modified by the
# latest commit. Otherwise, we check the format of the uncommitted code only.
# The chosen base is remembered so that the check and the in-place fix below
# always operate on the same set of lines.
if [ -z "$uncommitted_code" ]
then
  # Check the format of last commit
  diff_base="HEAD^"
else
  # Check the format of uncommitted lines
  diff_base="HEAD"
fi
diffs=$(git diff -U0 $diff_base | $CLANG_FORMAT_DIFF -p 1)

if [ -z "$diffs" ]
then
  echo "Nothing needs to be reformatted!"
  exit 0
fi

# Highlight the insertion/deletion from the clang-format-diff.py's output
COLOR_END="\033[0m"
COLOR_RED="\033[0;31m"
COLOR_GREEN="\033[0;32m"

echo -e "Detect lines that doesn't follow the format rules:\r"
# Add the color to the diff. lines added will be green; lines removed will be red.
echo "$diffs" |
  sed -e "s/\(^-.*$\)/`echo -e \"$COLOR_RED\1$COLOR_END\"`/" |
  sed -e "s/\(^+.*$\)/`echo -e \"$COLOR_GREEN\1$COLOR_END\"`/"

echo -e "Would you like to fix the format automatically (y/n): \c"

# Make sure under any mode, we can read user input.
exec < /dev/tty
read to_fix

if [ "$to_fix" != "y" ]
then
  exit 1
fi

# Do in-place format adjustment, against the same base that was checked above.
git diff -U0 $diff_base | $CLANG_FORMAT_DIFF -i -p 1
echo "Files reformatted!"

# Amend to last commit if user do the post-commit format check
if [ -z "$uncommitted_code" ]; then
  echo -e "Would you like to amend the changes to last commit (`git log HEAD --oneline | head -1`)? (y/n): \c"
  read to_amend

  if [ "$to_amend" == "y" ]
  then
    git commit -a --amend --reuse-message HEAD
    echo "Amended to last commit"
  fi
fi

@ -0,0 +1,214 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction.h"
namespace rocksdb {
// Adds up file_size over the leading run of non-null entries of "files".
// Accumulation stops at the first null pointer or at the end of the vector,
// whichever comes first.
static uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  uint64_t total_bytes = 0;
  size_t idx = 0;
  while (idx < files.size() && files[idx]) {
    total_bytes += files[idx]->file_size;
    ++idx;
  }
  return total_bytes;
}
// Builds a compaction from "level" into "out_level" on top of
// "input_version". Takes a reference on input_version (dropped again by
// ReleaseInputs() or the destructor) and allocates the VersionEdit that
// will record the result of the compaction.
Compaction::Compaction(Version* input_version, int level, int out_level,
                       uint64_t target_file_size,
                       uint64_t max_grandparent_overlap_bytes,
                       bool seek_compaction, bool enable_compression)
    : level_(level),
      out_level_(out_level),
      max_output_file_size_(target_file_size),
      maxGrandParentOverlapBytes_(max_grandparent_overlap_bytes),
      input_version_(input_version),
      number_levels_(input_version_->NumberLevels()),
      seek_compaction_(seek_compaction),
      enable_compression_(enable_compression),
      grandparent_index_(0),
      seen_key_(false),
      overlapped_bytes_(0),
      base_index_(-1),
      parent_index_(-1),
      score_(0),
      bottommost_level_(false),
      is_full_compaction_(false),
      level_ptrs_(number_levels_) {
  input_version_->Ref();
  edit_ = new VersionEdit();

  // Every per-level cursor starts at the first file of its level.
  for (size_t& cursor : level_ptrs_) {
    cursor = 0;
  }
}
// Frees the edit owned by this compaction and drops the reference on the
// input version unless ReleaseInputs() has already done so.
Compaction::~Compaction() {
  delete edit_;
  if (input_version_) {
    input_version_->Unref();
  }
}
bool Compaction::IsTrivialMove() const {
  // Compacting a level into itself exists to force the compaction filter
  // to run on that level, so it can never be a plain file move.
  if (level_ == out_level_) {
    return false;
  }
  // A move needs exactly one input file and nothing to merge it with.
  if (num_input_files(0) != 1 || num_input_files(1) != 0) {
    return false;
  }
  // Refuse the move when the file overlaps a lot of grandparent data:
  // it would become a parent file that is very expensive to merge later.
  return TotalFileSize(grandparents_) <= maxGrandParentOverlapBytes_;
}
// Records in *edit the deletion of every input file of this compaction:
// files of inputs_[0] at level_, files of inputs_[1] at level_ + 1.
void Compaction::AddInputDeletions(VersionEdit* edit) {
  for (int side = 0; side < 2; ++side) {
    const int file_level = level_ + side;
    for (FileMetaData* input_file : inputs_[side]) {
      edit->DeleteFile(file_level, input_file->number);
    }
  }
}
bool Compaction::IsBaseLevelForKey(const Slice& user_key) {
if (input_version_->vset_->options_->compaction_style ==
kCompactionStyleUniversal) {
return bottommost_level_;
}
// Maybe use binary search to find right entry instead of linear search?
const Comparator* user_cmp = input_version_->vset_->icmp_.user_comparator();
for (int lvl = level_ + 2; lvl < number_levels_; lvl++) {
const std::vector<FileMetaData*>& files = input_version_->files_[lvl];
for (; level_ptrs_[lvl] < files.size(); ) {
FileMetaData* f = files[level_ptrs_[lvl]];
if (user_cmp->Compare(user_key, f->largest.user_key()) <= 0) {
// We've advanced far enough
if (user_cmp->Compare(user_key, f->smallest.user_key()) >= 0) {
// Key falls in this file's range, so definitely not base level
return false;
}
break;
}
level_ptrs_[lvl]++;
}
}
return true;
}
bool Compaction::ShouldStopBefore(const Slice& internal_key) {
// Scan to find earliest grandparent file that contains key.
const InternalKeyComparator* icmp = &input_version_->vset_->icmp_;
while (grandparent_index_ < grandparents_.size() &&
icmp->Compare(internal_key,
grandparents_[grandparent_index_]->largest.Encode()) > 0) {
if (seen_key_) {
overlapped_bytes_ += grandparents_[grandparent_index_]->file_size;
}
assert(grandparent_index_ + 1 >= grandparents_.size() ||
icmp->Compare(grandparents_[grandparent_index_]->largest.Encode(),
grandparents_[grandparent_index_+1]->smallest.Encode())
< 0);
grandparent_index_++;
}
seen_key_ = true;
if (overlapped_bytes_ > maxGrandParentOverlapBytes_) {
// Too much overlap for current output; start new output
overlapped_bytes_ = 0;
return true;
} else {
return false;
}
}
// Mark (or clear) each file that is being compacted
void Compaction::MarkFilesBeingCompacted(bool value) {
for (int i = 0; i < 2; i++) {
std::vector<FileMetaData*> v = inputs_[i];
for (unsigned int j = 0; j < inputs_[i].size(); j++) {
assert(value ? !inputs_[i][j]->being_compacted :
inputs_[i][j]->being_compacted);
inputs_[i][j]->being_compacted = value;
}
}
}
// Decides whether this compaction writes to the bottommost level and stores
// the answer in bottommost_level_.
void Compaction::SetupBottomMostLevel(bool isManual) {
  if (input_version_->vset_->options_->compaction_style ==
      kCompactionStyleUniversal) {
    // A manual compaction under universal style is guaranteed to pick up
    // all files in a single run, so its output is the bottommost data.
    // For a non-manual compaction bottommost_level_ was already set when
    // the Compaction was created, so it is left untouched.
    if (isManual) {
      bottommost_level_ = true;
    }
    return;
  }

  // Level style: bottommost iff every level below the output level is empty.
  const int num_levels = input_version_->vset_->NumberLevels();
  bool deeper_files_exist = false;
  for (int lvl = output_level() + 1; lvl < num_levels && !deeper_files_exist;
       ++lvl) {
    deeper_files_exist = input_version_->NumLevelFiles(lvl) > 0;
  }
  bottommost_level_ = !deeper_files_exist;
}
// Drops the reference on the input version, if it is still held, and forgets
// the pointer so that the destructor does not release it a second time.
void Compaction::ReleaseInputs() {
  if (input_version_ == nullptr) {
    return;
  }
  input_version_->Unref();
  input_version_ = nullptr;
}
// Forwards to the input version, asking it to reset its next-compaction
// index for the level this compaction reads from.
void Compaction::ResetNextCompactionIndex() {
  const int compacted_level = level_;
  input_version_->ResetNextCompactionIndex(compacted_level);
}
// Writes a space-separated list of "number(file_size) " entries, one per file
// in "files", into "output" (a buffer of "len" bytes).
//
// The buffer is always left NUL-terminated when len > 0, including when
// "files" is empty, so callers can safely print it with "%s". Output stops at
// the first entry that does not fit completely.
static void InputSummary(std::vector<FileMetaData*>& files, char* output,
                         int len) {
  if (output == nullptr || len <= 0) {
    return;
  }
  // Guarantee a valid (empty) string even if the loop below writes nothing.
  output[0] = '\0';

  int write = 0;
  for (unsigned int i = 0; i < files.size(); i++) {
    int sz = len - write;
    int ret = snprintf(output + write, sz, "%lu(%lu) ",
                       (unsigned long)files.at(i)->number,
                       (unsigned long)files.at(i)->file_size);
    // ret >= sz means the entry was truncated; snprintf still terminated it.
    if (ret < 0 || ret >= sz)
      break;
    write += ret;
  }
}
// Formats a one-line description of this compaction (base version number,
// base level, seek flag and both input file lists) into "output", a buffer
// of "len" bytes.
void Compaction::Summary(char* output, int len) {
  int write = snprintf(output, len,
      "Base version %lu Base level %d, seek compaction:%d, inputs:",
      (unsigned long)input_version_->GetVersionNumber(),
      level_,
      seek_compaction_);
  // write >= len means the header alone was truncated: nothing more fits.
  if (write < 0 || write >= len) {
    return;
  }

  // Both buffers start out as empty strings so that an empty input set can
  // never cause uninitialized memory to be printed below.
  char level_low_summary[100] = "";
  InputSummary(inputs_[0], level_low_summary, sizeof(level_low_summary));

  char level_up_summary[100] = "";
  if (!inputs_[1].empty()) {
    InputSummary(inputs_[1], level_up_summary, sizeof(level_up_summary));
  }

  snprintf(output + write, len - write, "[%s],[%s]",
           level_low_summary, level_up_summary);
}
} // namespace rocksdb

@ -0,0 +1,134 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "db/version_set.h"
namespace rocksdb {
class Version;
// A Compaction encapsulates information about a compaction: the version it
// reads from, its input files, its output level and the VersionEdit that will
// describe its result. Instances are created only by the friend classes
// listed below (the constructor is private).
class Compaction {
 public:
  ~Compaction();

  // Return the level that is being compacted.  Inputs from "level"
  // will be merged.
  int level() const { return level_; }

  // Outputs will go to this level
  int output_level() const { return out_level_; }

  // Return the object that holds the edits to the descriptor done
  // by this compaction.
  VersionEdit* edit() { return edit_; }

  // "which" must be either 0 or 1
  int num_input_files(int which) const { return inputs_[which].size(); }

  // Return the ith input file at "level()+which" ("which" must be 0 or 1).
  FileMetaData* input(int which, int i) const { return inputs_[which][i]; }

  // Maximum size of files to build during this compaction.
  uint64_t MaxOutputFileSize() const { return max_output_file_size_; }

  // Whether compression will be enabled for compaction outputs
  bool enable_compression() const { return enable_compression_; }

  // Is this a trivial compaction that can be implemented by just
  // moving a single input file to the next level (no merging or splitting)
  bool IsTrivialMove() const;

  // Add all inputs to this compaction as delete operations to *edit.
  void AddInputDeletions(VersionEdit* edit);

  // Returns true if the information we have available guarantees that
  // the compaction is producing data in "level+1" for which no data exists
  // in levels greater than "level+1".
  bool IsBaseLevelForKey(const Slice& user_key);

  // Returns true iff we should stop building the current output
  // before processing "internal_key".
  bool ShouldStopBefore(const Slice& internal_key);

  // Release the input version for the compaction, once the compaction
  // is successful.
  void ReleaseInputs();

  // Write a human-readable description of this compaction into "output",
  // a buffer of "len" bytes.
  void Summary(char* output, int len);

  // Return the score that was used to pick this compaction run.
  double score() const { return score_; }

  // Is this compaction creating a file in the bottom most level?
  bool BottomMostLevel() const { return bottommost_level_; }

  // Does this compaction include all sst files?
  bool IsFullCompaction() const { return is_full_compaction_; }

 private:
  friend class Version;
  friend class VersionSet;
  friend class CompactionPicker;
  friend class UniversalCompactionPicker;
  friend class LevelCompactionPicker;

  Compaction(Version* input_version, int level, int out_level,
             uint64_t target_file_size, uint64_t max_grandparent_overlap_bytes,
             bool seek_compaction = false, bool enable_compression = true);

  // NOTE: the declaration order below is also the member initialization
  // order used by the constructor; do not reorder.
  int level_;
  int out_level_;  // level to which output files are stored
  uint64_t max_output_file_size_;
  uint64_t maxGrandParentOverlapBytes_;
  Version* input_version_;
  VersionEdit* edit_;
  int number_levels_;

  bool seek_compaction_;
  bool enable_compression_;

  // Each compaction reads inputs from "level_" and "level_+1"
  std::vector<FileMetaData*> inputs_[2];  // The two sets of inputs

  // State used to check for number of overlapping grandparent files
  // (parent == level_ + 1, grandparent == level_ + 2)
  std::vector<FileMetaData*> grandparents_;
  size_t grandparent_index_;   // Index into grandparents_
  bool seen_key_;              // Some output key has been seen
  uint64_t overlapped_bytes_;  // Bytes of overlap between current output
                               // and grandparent files
  int base_index_;    // index of the file in files_[level_]
  int parent_index_;  // index of some file with same range in files_[level_+1]
  double score_;      // score that was used to pick this compaction.

  // Is this compaction creating a file in the bottom most level?
  bool bottommost_level_;
  // Does this compaction include all sst files?
  bool is_full_compaction_;

  // level_ptrs_ holds indices into input_version_->files_: our state
  // is that we are positioned at one of the file ranges for each
  // higher level than the ones involved in this compaction (i.e. for
  // all L >= level_ + 2).
  std::vector<size_t> level_ptrs_;

  // mark (or clear) all files that are being compacted
  void MarkFilesBeingCompacted(bool);

  // Initialize whether compaction producing files at the bottommost level
  void SetupBottomMostLevel(bool isManual);

  // In case of compaction error, reset the nextIndex that is used
  // to pick up the next file to be compacted from files_by_size_
  void ResetNextCompactionIndex();
};
} // namespace rocksdb

@ -0,0 +1,847 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#include "db/compaction_picker.h"
#include "util/statistics.h"
namespace rocksdb {
namespace {
// Returns the combined file_size of the entries of "files" that precede the
// first null pointer (all entries when there is none).
uint64_t TotalFileSize(const std::vector<FileMetaData*>& files) {
  uint64_t bytes = 0;
  for (FileMetaData* meta : files) {
    if (meta == nullptr) {
      break;
    }
    bytes += meta->file_size;
  }
  return bytes;
}
} // anonymous namespace
// Creates a picker for options->num_levels levels, with one (initially empty)
// in-progress set per level, and computes the per-level size limits via
// Init().
CompactionPicker::CompactionPicker(const Options* options,
                                   const InternalKeyComparator* icmp)
    : compactions_in_progress_(options->num_levels),
      options_(options),
      num_levels_(options->num_levels),
      icmp_(icmp) {
  Init();
}
// Switches the picker to "new_levels" levels and rebuilds the per-level
// size tables for the new level count.
void CompactionPicker::ReduceNumberOfLevels(int new_levels) {
  num_levels_ = new_levels;
  Init();
}
// (Re)allocates and fills max_file_size_ and level_max_bytes_, the per-level
// limits on output file size and on total level size.
void CompactionPicker::Init() {
  max_file_size_.reset(new uint64_t[NumberLevels()]);
  level_max_bytes_.reset(new uint64_t[NumberLevels()]);

  const int file_size_multiplier = options_->target_file_size_multiplier;
  const int level_bytes_multiplier = options_->max_bytes_for_level_multiplier;

  for (int lvl = 0; lvl < NumberLevels(); ++lvl) {
    if (lvl > 1) {
      // Levels 2 and up grow geometrically from the level before them.
      max_file_size_[lvl] = max_file_size_[lvl - 1] * file_size_multiplier;
      level_max_bytes_[lvl] =
          level_max_bytes_[lvl - 1] * level_bytes_multiplier *
          options_->max_bytes_for_level_multiplier_additional[lvl - 1];
    } else if (lvl == 0 &&
               options_->compaction_style == kCompactionStyleUniversal) {
      // Universal style places no cap on the size of a level-0 file.
      max_file_size_[lvl] = ULLONG_MAX;
      level_max_bytes_[lvl] = options_->max_bytes_for_level_base;
    } else {
      // Levels 0 and 1 use the configured base values.
      max_file_size_[lvl] = options_->target_file_size_base;
      level_max_bytes_[lvl] = options_->max_bytes_for_level_base;
    }
  }
}
// Empty destructor body: no explicit cleanup is performed here.
CompactionPicker::~CompactionPicker() {}
// For every level except the last, stores in sizes[level] the total size of
// the level's own input files (input set 0) across all compactions that are
// currently in progress on that level. "sizes" must already have at least
// NumberLevels() - 1 elements.
void CompactionPicker::SizeBeingCompacted(std::vector<uint64_t>& sizes) {
  const int last_level = NumberLevels() - 1;
  for (int level = 0; level < last_level; ++level) {
    uint64_t bytes_in_flight = 0;
    for (auto running : compactions_in_progress_[level]) {
      assert(running->level() == level);
      const int file_count = running->num_input_files(0);
      for (int f = 0; f < file_count; ++f) {
        bytes_in_flight += running->input(0, f)->file_size;
      }
    }
    sizes[level] = bytes_in_flight;
  }
}
// Un-marks the input files of "c", removes "c" from the set of running
// compactions of its level and, when the compaction did not succeed, resets
// the next-compaction index for that level.
void CompactionPicker::ReleaseCompactionFiles(Compaction* c, Status status) {
  c->MarkFilesBeingCompacted(false);
  compactions_in_progress_[c->level()].erase(c);
  if (status.ok()) {
    return;
  }
  c->ResetNextCompactionIndex();
}
// Returns the file size limit computed by Init() for "level", which must be
// in [0, NumberLevels()).
uint64_t CompactionPicker::MaxFileSizeForLevel(int level) const {
  assert(level >= 0);
  assert(level < NumberLevels());
  const uint64_t size_limit = max_file_size_[level];
  return size_limit;
}
// Grandparent overlap limit for "level": the level's maximum file size scaled
// by options_->max_grandparent_overlap_factor.
uint64_t CompactionPicker::MaxGrandParentOverlapBytes(int level) {
  uint64_t overlap_limit = MaxFileSizeForLevel(level);
  overlap_limit *= options_->max_grandparent_overlap_factor;
  return overlap_limit;
}
// Returns the total-size limit computed by Init() for "level", which must be
// in [0, NumberLevels()).
// Note: the result for level zero is not really used since we set
// the level-0 compaction threshold based on number of files.
double CompactionPicker::MaxBytesForLevel(int level) {
  assert(level >= 0);
  assert(level < NumberLevels());
  const double byte_limit = level_max_bytes_[level];
  return byte_limit;
}
// Stores in *smallest / *largest the minimum smallest-key and the maximum
// largest-key (per icmp_) over all files in "inputs", which must not be
// empty.
void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs,
                                InternalKey* smallest, InternalKey* largest) {
  assert(!inputs.empty());
  smallest->Clear();
  largest->Clear();

  bool is_first = true;
  for (FileMetaData* f : inputs) {
    if (is_first) {
      // The first file seeds both bounds.
      *smallest = f->smallest;
      *largest = f->largest;
      is_first = false;
      continue;
    }
    if (icmp_->Compare(f->smallest, *smallest) < 0) {
      *smallest = f->smallest;
    }
    if (icmp_->Compare(f->largest, *largest) > 0) {
      *largest = f->largest;
    }
  }
}
// Same as the single-set GetRange(), computed over the concatenation of
// "inputs1" and "inputs2".
void CompactionPicker::GetRange(const std::vector<FileMetaData*>& inputs1,
                                const std::vector<FileMetaData*>& inputs2,
                                InternalKey* smallest, InternalKey* largest) {
  std::vector<FileMetaData*> combined;
  combined.reserve(inputs1.size() + inputs2.size());
  combined.insert(combined.end(), inputs1.begin(), inputs1.end());
  combined.insert(combined.end(), inputs2.begin(), inputs2.end());
  GetRange(combined, smallest, largest);
}
// Grows c->inputs_[0] until re-querying the level with the inputs' key range
// no longer adds files. Returns false (after clearing both input sets) when
// the expanded inputs, or the overlapping parent range, contain a file that
// is already being compacted; returns true otherwise, including when there is
// nothing to expand.
bool CompactionPicker::ExpandWhileOverlapping(Compaction* c) {
  // If inputs are empty then there is nothing to expand.
  if (c == nullptr || c->inputs_[0].empty()) {
    return true;
  }

  // GetOverlappingInputs will always do the right thing for level-0.
  // So we don't need to do any expansion if level == 0.
  const int level = c->level();
  if (level == 0) {
    return true;
  }

  InternalKey smallest, largest;

  // Keep expanding c->inputs_[0] until we are sure that there is a
  // "clean cut" boundary between the files in input and the surrounding files.
  // This will ensure that no parts of a key are lost during compaction.
  int hint_index = -1;
  for (;;) {
    const size_t size_before = c->inputs_[0].size();
    GetRange(c->inputs_[0], &smallest, &largest);
    c->inputs_[0].clear();
    c->input_version_->GetOverlappingInputs(
        level, &smallest, &largest, &c->inputs_[0], hint_index, &hint_index);
    if (c->inputs_[0].size() <= size_before) {
      break;
    }
  }

  // Get the new range
  GetRange(c->inputs_[0], &smallest, &largest);

  // If, after the expansion, there are files that are already under
  // compaction, then we must drop/cancel this compaction.
  int parent_index = -1;
  const bool conflicts =
      FilesInCompaction(c->inputs_[0]) ||
      (c->level() != c->output_level() &&
       ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
                               &parent_index));
  if (!conflicts) {
    return true;
  }
  c->inputs_[0].clear();
  c->inputs_[1].clear();
  return false;
}
// Byte limit for an expanded compaction at "level": the level's maximum file
// size scaled by options_->expanded_compaction_factor.
uint64_t CompactionPicker::ExpandedCompactionByteSizeLimit(int level) {
  uint64_t byte_limit = MaxFileSizeForLevel(level);
  byte_limit *= options_->expanded_compaction_factor;
  return byte_limit;
}
// Reports whether at least one file in "files" has its being_compacted flag
// set.
bool CompactionPicker::FilesInCompaction(std::vector<FileMetaData*>& files) {
  for (FileMetaData* f : files) {
    if (f->being_compacted) {
      return true;
    }
  }
  return false;
}
// Reports whether any file of level + 1 that overlaps [*smallest, *largest]
// is being compacted. *parent_index is passed to GetOverlappingInputs() both
// as the search hint and as the location for the updated index.
bool CompactionPicker::ParentRangeInCompaction(Version* version,
                                               const InternalKey* smallest,
                                               const InternalKey* largest,
                                               int level, int* parent_index) {
  assert(level + 1 < NumberLevels());
  const int parent_level = level + 1;

  std::vector<FileMetaData*> parent_files;
  version->GetOverlappingInputs(parent_level, smallest, largest, &parent_files,
                                *parent_index, parent_index);
  return FilesInCompaction(parent_files);
}
// Fills c->inputs_[1] with the "level+1" files overlapping c->inputs_[0],
// then tries to widen c->inputs_[0] as long as that does not pull in more
// "level+1" files, does not exceed the expanded-compaction byte limit, does
// not touch files already being compacted, and does not split the entries of
// one user key across the compaction boundary. Finally collects the
// overlapping "level+2" files into c->grandparents_.
void CompactionPicker::SetupOtherInputs(Compaction* c) {
  // Nothing to do without inputs, and a same-level compaction never looks
  // at the files of "level+1".
  if (c->inputs_[0].empty() || c->level() == c->output_level()) {
    return;
  }

  const int level = c->level();
  const int parent_level = level + 1;
  InternalKey smallest, largest;

  // Get the range one last time.
  GetRange(c->inputs_[0], &smallest, &largest);

  // Populate the set of next-level files (inputs_[1]) to include in compaction
  c->input_version_->GetOverlappingInputs(parent_level, &smallest, &largest,
                                          &c->inputs_[1], c->parent_index_,
                                          &c->parent_index_);

  // Get entire range covered by compaction
  InternalKey all_start, all_limit;
  GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);

  // Try to grow the "level" inputs without changing how many "level+1"
  // files get picked up. Expansion is also refused when it would include
  // some entries of a user key while leaving others out, which can happen
  // when one user key spans multiple files.
  if (!c->inputs_[1].empty()) {
    std::vector<FileMetaData*> wider_inputs0;
    c->input_version_->GetOverlappingInputs(
        level, &all_start, &all_limit, &wider_inputs0, c->base_index_, nullptr);

    const uint64_t inputs0_size = TotalFileSize(c->inputs_[0]);
    const uint64_t inputs1_size = TotalFileSize(c->inputs_[1]);
    const uint64_t wider_inputs0_size = TotalFileSize(wider_inputs0);
    const uint64_t byte_limit = ExpandedCompactionByteSizeLimit(level);

    const bool worth_trying =
        wider_inputs0.size() > c->inputs_[0].size() &&
        inputs1_size + wider_inputs0_size < byte_limit &&
        !FilesInCompaction(wider_inputs0) &&
        !c->input_version_->HasOverlappingUserKey(&wider_inputs0, level);

    if (worth_trying) {
      InternalKey new_start, new_limit;
      GetRange(wider_inputs0, &new_start, &new_limit);

      std::vector<FileMetaData*> wider_inputs1;
      c->input_version_->GetOverlappingInputs(parent_level, &new_start,
                                              &new_limit, &wider_inputs1,
                                              c->parent_index_,
                                              &c->parent_index_);

      // Accept only if the parent side did not grow and is free.
      if (wider_inputs1.size() == c->inputs_[1].size() &&
          !FilesInCompaction(wider_inputs1)) {
        Log(options_->info_log,
            "Expanding@%lu %lu+%lu (%lu+%lu bytes) to %lu+%lu (%lu+%lu bytes)"
            "\n",
            (unsigned long)level,
            (unsigned long)(c->inputs_[0].size()),
            (unsigned long)(c->inputs_[1].size()),
            (unsigned long)inputs0_size,
            (unsigned long)inputs1_size,
            (unsigned long)(wider_inputs0.size()),
            (unsigned long)(wider_inputs1.size()),
            (unsigned long)wider_inputs0_size,
            (unsigned long)inputs1_size);
        smallest = new_start;
        largest = new_limit;
        c->inputs_[0] = wider_inputs0;
        c->inputs_[1] = wider_inputs1;
        GetRange(c->inputs_[0], c->inputs_[1], &all_start, &all_limit);
      }
    }
  }

  // Compute the set of grandparent files that overlap this compaction
  // (parent == level+1; grandparent == level+2)
  if (level + 2 < NumberLevels()) {
    c->input_version_->GetOverlappingInputs(level + 2, &all_start, &all_limit,
                                            &c->grandparents_);
  }
}
// Builds a manual compaction of the files in "input_level" that overlap
// [begin, end], with output going to "output_level".
//
// Returns nullptr when no file overlaps the range or when the inputs could
// not be expanded to a clean boundary. When the selection is cut short by the
// size limit, **compaction_end is set to the smallest key of the first file
// left out; when the whole range is covered, *compaction_end is set to
// nullptr.
Compaction* CompactionPicker::CompactRange(Version* version, int input_level,
                                           int output_level,
                                           const InternalKey* begin,
                                           const InternalKey* end,
                                           InternalKey** compaction_end) {
  // All files are 'overlapping' in universal style compaction.
  // We have to compact the entire range in one shot.
  if (options_->compaction_style == kCompactionStyleUniversal) {
    begin = nullptr;
    end = nullptr;
  }

  std::vector<FileMetaData*> picked;
  version->GetOverlappingInputs(input_level, begin, end, &picked);
  if (picked.empty()) {
    return nullptr;
  }

  // Avoid compacting too much in one shot in case the range is large.
  // But we cannot do this for level-0 since level-0 files can overlap
  // and we must not pick one file and drop another older file if the
  // two files overlap.
  bool covers_whole_range = true;
  if (input_level > 0) {
    const uint64_t limit =
        MaxFileSizeForLevel(input_level) * options_->source_compaction_factor;
    uint64_t running_total = 0;
    for (size_t i = 0; i + 1 < picked.size(); ++i) {
      running_total += picked[i]->file_size;
      if (running_total >= limit) {
        // Stop here; the caller resumes from the next file's smallest key.
        **compaction_end = picked[i + 1]->smallest;
        covers_whole_range = false;
        picked.resize(i + 1);
        break;
      }
    }
  }

  Compaction* c = new Compaction(version, input_level, output_level,
                                 MaxFileSizeForLevel(output_level),
                                 MaxGrandParentOverlapBytes(input_level));
  c->inputs_[0] = picked;

  if (!ExpandWhileOverlapping(c)) {
    delete c;
    Log(options_->info_log, "Could not compact due to expansion failure.\n");
    return nullptr;
  }

  SetupOtherInputs(c);

  if (covers_whole_range) {
    *compaction_end = nullptr;
  }

  // These files that are to be manually compacted do not trample
  // upon other files because manual compactions are processed when
  // the system has a max of 1 background compaction thread.
  c->MarkFilesBeingCompacted(true);

  // Is this compaction creating a file at the bottommost level
  c->SetupBottomMostLevel(true);
  return c;
}
// Picks the next automatic compaction for level-style compaction.
//
// Size-triggered compactions (a level whose compaction score is >= 1) are
// preferred; if none can be set up, the seek-triggered candidate recorded in
// version->file_to_compact_ is tried. Returns nullptr when there is nothing
// to compact or when the candidate cannot be turned into a valid compaction.
// On success the input files are marked as being compacted and the
// compaction is recorded in compactions_in_progress_ for its input level.
Compaction* LevelCompactionPicker::PickCompaction(Version* version) {
  Compaction* c = nullptr;
  int level = -1;

  // Compute the compactions needed. It is better to do it here
  // and also in LogAndApply(), otherwise the values could be stale.
  std::vector<uint64_t> size_being_compacted(NumberLevels() - 1);
  SizeBeingCompacted(size_being_compacted);
  version->Finalize(size_being_compacted);

  // We prefer compactions triggered by too much data in a level over
  // the compactions triggered by seeks.
  //
  // Find the compactions by size on all levels.
  for (int i = 0; i < NumberLevels() - 1; i++) {
    assert(i == 0 ||
           version->compaction_score_[i] <= version->compaction_score_[i - 1]);
    level = version->compaction_level_[i];
    if ((version->compaction_score_[i] >= 1)) {
      c = PickCompactionBySize(version, level, version->compaction_score_[i]);
      // PickCompactionBySize() returns nullptr when no file of this level can
      // be picked. Treat that like an expansion failure and move on to the
      // next level instead of handing a null compaction to
      // ExpandWhileOverlapping() and stopping the search.
      if (c == nullptr || ExpandWhileOverlapping(c) == false) {
        delete c;  // deleting nullptr is a no-op
        c = nullptr;
      } else {
        break;
      }
    }
  }

  // Find compactions needed by seeks
  FileMetaData* f = version->file_to_compact_;
  if (c == nullptr && f != nullptr && !f->being_compacted) {
    level = version->file_to_compact_level_;
    int parent_index = -1;

    // Only allow one level 0 compaction at a time.
    // Do not pick this file if its parents at level+1 are being compacted.
    if (level != 0 || compactions_in_progress_[0].empty()) {
      if (!ParentRangeInCompaction(version, &f->smallest, &f->largest, level,
                                   &parent_index)) {
        c = new Compaction(version, level, level + 1,
                           MaxFileSizeForLevel(level + 1),
                           MaxGrandParentOverlapBytes(level), true);
        c->inputs_[0].push_back(f);
        c->parent_index_ = parent_index;
        c->input_version_->file_to_compact_ = nullptr;
        if (ExpandWhileOverlapping(c) == false) {
          // Release the compaction allocated above; every other failure path
          // in this function deletes it as well.
          delete c;
          return nullptr;
        }
      }
    }
  }

  if (c == nullptr) {
    return nullptr;
  }

  // Two level 0 compaction won't run at the same time, so don't need to worry
  // about files on level 0 being compacted.
  if (level == 0) {
    assert(compactions_in_progress_[0].empty());
    InternalKey smallest, largest;
    GetRange(c->inputs_[0], &smallest, &largest);
    // Note that the next call will discard the file we placed in
    // c->inputs_[0] earlier and replace it with an overlapping set
    // which will include the picked file.
    c->inputs_[0].clear();
    c->input_version_->GetOverlappingInputs(0, &smallest, &largest,
                                            &c->inputs_[0]);

    // If we include more L0 files in the same compaction run it can
    // cause the 'smallest' and 'largest' key to get extended to a
    // larger range. So, re-invoke GetRange to get the new key range
    GetRange(c->inputs_[0], &smallest, &largest);
    if (ParentRangeInCompaction(c->input_version_, &smallest, &largest, level,
                                &c->parent_index_)) {
      delete c;
      return nullptr;
    }
    assert(!c->inputs_[0].empty());
  }

  // Setup "level+1" files (inputs_[1])
  SetupOtherInputs(c);

  // mark all the files that are being compacted
  c->MarkFilesBeingCompacted(true);

  // Is this compaction creating a file at the bottommost level
  c->SetupBottomMostLevel(false);

  // remember this currently undergoing compaction
  compactions_in_progress_[level].insert(c);

  return c;
}
// Chooses a single input file on `level` for a size-triggered compaction
// into `level + 1`.
//
// Files are visited in the order given by files_by_size_[level], starting at
// next_file_to_compact_by_size_[level]. The first file that is not itself
// being compacted and whose level+1 parent range is not being compacted
// becomes inputs_[0]. Returns nullptr if no such file exists, or if `level`
// is 0 and a level-0 compaction is already recorded as in progress.
Compaction* LevelCompactionPicker::PickCompactionBySize(Version* version,
                                                        int level,
                                                        double score) {
  // level 0 files are overlapping. So we cannot pick more
  // than one concurrent compactions at this level. This
  // could be made better by looking at key-ranges that are
  // being compacted at level 0.
  if (level == 0 && compactions_in_progress_[level].size() == 1) {
    return nullptr;
  }

  assert(level >= 0);
  assert(level + 1 < NumberLevels());

  Compaction* compaction =
      new Compaction(version, level, level + 1, MaxFileSizeForLevel(level + 1),
                     MaxGrandParentOverlapBytes(level));
  compaction->score_ = score;

  // Indices into files_[level], ordered by file size.
  std::vector<int>& ranked = compaction->input_version_->files_by_size_[level];

  // Rank of the first file seen that is not yet being compacted; -1 if none.
  int resume_rank = -1;

  for (unsigned int rank =
           compaction->input_version_->next_file_to_compact_by_size_[level];
       rank < ranked.size(); rank++) {
    int file_idx = ranked[rank];
    FileMetaData* candidate =
        compaction->input_version_->files_[level][file_idx];

    // check to verify files are arranged in descending size
    assert((rank == ranked.size() - 1) ||
           (rank >= Version::number_of_files_to_sort_ - 1) ||
           (candidate->file_size >=
            compaction->input_version_->files_[level][ranked[rank + 1]]
                ->file_size));

    // do not pick a file to compact if it is being compacted
    // from n-1 level.
    if (candidate->being_compacted) {
      continue;
    }

    // remember the start rank for the next call to PickCompaction
    if (resume_rank == -1) {
      resume_rank = rank;
    }

    // Do not pick this file if its parents at level+1 are being compacted.
    // Maybe we can avoid redoing this work in SetupOtherInputs
    int parent_idx = -1;
    const bool parents_busy = ParentRangeInCompaction(
        compaction->input_version_, &candidate->smallest, &candidate->largest,
        level, &parent_idx);
    if (!parents_busy) {
      compaction->inputs_[0].push_back(candidate);
      compaction->base_index_ = file_idx;
      compaction->parent_index_ = parent_idx;
      break;
    }
  }

  const bool found_input = !compaction->inputs_[0].empty();
  if (!found_input) {
    delete compaction;
    compaction = nullptr;
  }

  // store where to start the iteration in the next call to PickCompaction
  version->next_file_to_compact_by_size_[level] = resume_rank;

  return compaction;
}
// Universal style of compaction. Pick files that are contiguous in
// time-range to compact.
//
// Returns nullptr when level 0 holds fewer files than
// level0_file_num_compaction_trigger, or when neither the size-amplification
// pass nor the two read-amplification passes produce a candidate. On success
// the chosen files are marked as being compacted and the compaction is
// recorded in compactions_in_progress_[0].
Compaction* UniversalCompactionPicker::PickCompaction(Version* version) {
  int level = 0;
  double score = version->compaction_score_[0];

  if ((version->files_[level].size() <
       (unsigned int)options_->level0_file_num_compaction_trigger)) {
    Log(options_->info_log, "Universal: nothing to do\n");
    return nullptr;
  }
  Version::FileSummaryStorage tmp;
  // size() yields size_t; cast it so the argument matches the %lu conversion
  // on every platform, as the other log calls in this file do.
  Log(options_->info_log, "Universal: candidate files(%lu): %s\n",
      (unsigned long)version->files_[level].size(),
      version->LevelFileSummary(&tmp, 0));

  // Check for size amplification first.
  Compaction* c = PickCompactionUniversalSizeAmp(version, score);
  if (c == nullptr) {
    // Size amplification is within limits. Try reducing read
    // amplification while maintaining file size ratios.
    unsigned int ratio = options_->compaction_options_universal.size_ratio;
    c = PickCompactionUniversalReadAmp(version, score, ratio, UINT_MAX);

    // Size amplification and file size ratios are within configured limits.
    // If max read amplification is exceeding configured limits, then force
    // compaction without looking at filesize ratios and try to reduce
    // the number of files to fewer than level0_file_num_compaction_trigger.
    if (c == nullptr) {
      // Cannot underflow: the early return above guarantees the file count
      // is at least the trigger.
      unsigned int num_files = version->files_[level].size() -
                               options_->level0_file_num_compaction_trigger;
      c = PickCompactionUniversalReadAmp(version, score, UINT_MAX, num_files);
    }
  }
  if (c == nullptr) {
    return nullptr;
  }
  assert(c->inputs_[0].size() > 1);

  // validate that all the chosen files are non overlapping in time
  FileMetaData* newerfile __attribute__((unused)) = nullptr;
  for (unsigned int i = 0; i < c->inputs_[0].size(); i++) {
    FileMetaData* f = c->inputs_[0][i];
    assert(f->smallest_seqno <= f->largest_seqno);
    assert(newerfile == nullptr ||
           newerfile->smallest_seqno > f->largest_seqno);
    newerfile = f;
  }

  // The files are sorted from newest first to oldest last.
  std::vector<int>& file_by_time = c->input_version_->files_by_size_[level];

  // Is the earliest file part of this compaction?
  int last_index = file_by_time[file_by_time.size() - 1];
  FileMetaData* last_file = c->input_version_->files_[level][last_index];
  if (c->inputs_[0][c->inputs_[0].size() - 1] == last_file) {
    c->bottommost_level_ = true;
  }

  // update statistics
  MeasureTime(options_->statistics.get(), NUM_FILES_IN_SINGLE_COMPACTION,
              c->inputs_[0].size());

  // mark all the files that are being compacted
  c->MarkFilesBeingCompacted(true);

  // remember this currently undergoing compaction
  compactions_in_progress_[level].insert(c);

  // Record whether this compaction includes all sst files.
  // For now, it is only relevant in universal compaction mode.
  c->is_full_compaction_ =
      (c->inputs_[0].size() == c->input_version_->files_[0].size());

  return c;
}
//
// Consider compaction files based on their size differences with
// the next file in time order.
//
// Walks the level-0 files from newest to oldest looking for a run of
// consecutive files, none already being compacted, in which each following
// file is no larger than the accumulated size of the run scaled by
// (100 + ratio) percent. The first run that reaches
// max(min_merge_width, 2) files is turned into a compaction; a run is capped
// at min(max_merge_width, max_number_of_files_to_compact) files.
// Returns nullptr when no such run exists.
//
Compaction* UniversalCompactionPicker::PickCompactionUniversalReadAmp(
    Version* version, double score, unsigned int ratio,
    unsigned int max_number_of_files_to_compact) {
  int level = 0;

  unsigned int min_merge_width =
      options_->compaction_options_universal.min_merge_width;
  unsigned int max_merge_width =
      options_->compaction_options_universal.max_merge_width;

  // The files are sorted from newest first to oldest last.
  std::vector<int>& file_by_time = version->files_by_size_[level];
  FileMetaData* f = nullptr;
  bool done = false;
  int start_index = 0;
  // Initialized so the check after the loop never reads an indeterminate
  // value when there are no files to scan.
  unsigned int candidate_count = 0;
  assert(file_by_time.size() == version->files_[level].size());

  unsigned int max_files_to_compact =
      std::min(max_merge_width, max_number_of_files_to_compact);
  min_merge_width = std::max(min_merge_width, 2U);

  // Considers a candidate file only if it is smaller than the
  // total size accumulated so far.
  for (unsigned int loop = 0; loop < file_by_time.size(); loop++) {
    candidate_count = 0;

    // Skip files that are already being compacted
    for (f = nullptr; loop < file_by_time.size(); loop++) {
      int index = file_by_time[loop];
      f = version->files_[level][index];

      if (!f->being_compacted) {
        candidate_count = 1;
        break;
      }
      Log(options_->info_log,
          "Universal: file %lu[%d] being compacted, skipping",
          (unsigned long)f->number, loop);
      f = nullptr;
    }

    // This file is not being compacted. Consider it as the
    // first candidate to be compacted.
    uint64_t candidate_size = f != nullptr ? f->file_size : 0;
    if (f != nullptr) {
      Log(options_->info_log, "Universal: Possible candidate file %lu[%d].",
          (unsigned long)f->number, loop);
    }

    // Check if the succeeding files need compaction.
    for (unsigned int i = loop + 1;
         candidate_count < max_files_to_compact && i < file_by_time.size();
         i++) {
      int index = file_by_time[i];
      FileMetaData* next = version->files_[level][index];
      if (next->being_compacted) {
        break;
      }
      // pick files if the total candidate file size (increased by the
      // specified ratio) is still larger than the next candidate file.
      //
      // Evaluated in floating point on purpose: the caller passes
      // ratio == UINT_MAX to ignore size ratios, and the integer product
      // candidate_size * (100 + ratio) wraps around once candidate_size
      // reaches 4 GiB (and 100L + ratio itself wraps where long is 32 bits),
      // which ended the run prematurely.
      double sz = candidate_size * (100.0 + ratio) / 100.0;
      if (sz < static_cast<double>(next->file_size)) {
        break;
      }
      candidate_count++;
      candidate_size += next->file_size;
    }

    // Found a series of consecutive files that need compaction.
    if (candidate_count >= (unsigned int)min_merge_width) {
      start_index = loop;
      done = true;
      break;
    } else {
      for (unsigned int i = loop;
           i < loop + candidate_count && i < file_by_time.size(); i++) {
        int index = file_by_time[i];
        FileMetaData* skipped = version->files_[level][index];
        Log(options_->info_log,
            "Universal: Skipping file %lu[%d] with size %lu %d\n",
            (unsigned long)skipped->number,
            i,
            (unsigned long)skipped->file_size,
            skipped->being_compacted);
      }
    }
  }
  if (!done || candidate_count <= 1) {
    return nullptr;
  }
  unsigned int first_index_after = start_index + candidate_count;

  // Compression is enabled if files compacted earlier already reached
  // size ratio of compression.
  bool enable_compression = true;
  int ratio_to_compress =
      options_->compaction_options_universal.compression_size_percent;
  if (ratio_to_compress >= 0) {
    uint64_t total_size = version->NumLevelBytes(level);
    uint64_t older_file_size = 0;
    // first_index_after is at least 2 here, so the unsigned counter cannot
    // wrap below zero.
    for (unsigned int i = file_by_time.size() - 1; i >= first_index_after;
         i--) {
      older_file_size += version->files_[level][file_by_time[i]]->file_size;
      if (older_file_size * 100L >= total_size * (long) ratio_to_compress) {
        enable_compression = false;
        break;
      }
    }
  }
  Compaction* c =
      new Compaction(version, level, level, MaxFileSizeForLevel(level),
                     LLONG_MAX, false, enable_compression);
  c->score_ = score;

  for (unsigned int i = start_index; i < first_index_after; i++) {
    int index = file_by_time[i];
    FileMetaData* picked = c->input_version_->files_[level][index];
    c->inputs_[0].push_back(picked);
    Log(options_->info_log, "Universal: Picking file %lu[%d] with size %lu\n",
        (unsigned long)picked->number,
        i,
        (unsigned long)picked->file_size);
  }
  return c;
}
// Look at overall size amplification. If size amplification
// exceeds the configured value, then do a compaction
// of the candidate files all the way up to the earliest
// base file (overrides configured values of file-size ratios,
// min_merge_width and max_merge_width).
//
// Returns nullptr when there are no files, when any file from the first
// candidate onwards (excluding the earliest file) is already being
// compacted, or when the total size of the newer files is below
// max_size_amplification_percent percent of the earliest file's size.
//
Compaction* UniversalCompactionPicker::PickCompactionUniversalSizeAmp(
    Version* version, double score) {
  int level = 0;

  // percentage flexibility while reducing size amplification
  uint64_t ratio = options_->compaction_options_universal.
                     max_size_amplification_percent;

  // The files are sorted from newest first to oldest last.
  std::vector<int>& file_by_time = version->files_by_size_[level];
  assert(file_by_time.size() == version->files_[level].size());

  // Without this guard, "size() - 1" below wraps around on an empty level
  // and the loops would index past the end of the vector.
  if (file_by_time.empty()) {
    return nullptr;
  }

  unsigned int candidate_count = 0;
  uint64_t candidate_size = 0;
  unsigned int start_index = 0;
  FileMetaData* f = nullptr;

  // Skip files that are already being compacted
  for (unsigned int loop = 0; loop < file_by_time.size() - 1; loop++) {
    int index = file_by_time[loop];
    f = version->files_[level][index];
    if (!f->being_compacted) {
      start_index = loop;  // Consider this as the first candidate.
      break;
    }
    Log(options_->info_log, "Universal: skipping file %lu[%d] compacted %s",
        (unsigned long)f->number,
        loop,
        " cannot be a candidate to reduce size amp.\n");
    f = nullptr;
  }
  if (f == nullptr) {
    return nullptr;  // no candidate files
  }

  Log(options_->info_log, "Universal: First candidate file %lu[%d] %s",
      (unsigned long)f->number,
      start_index,
      " to reduce size amp.\n");

  // keep adding up all the remaining files
  for (unsigned int loop = start_index; loop < file_by_time.size() - 1;
       loop++) {
    int index = file_by_time[loop];
    f = version->files_[level][index];
    if (f->being_compacted) {
      Log(options_->info_log,
          "Universal: Possible candidate file %lu[%d] %s.",
          (unsigned long)f->number,
          loop,
          " is already being compacted. No size amp reduction possible.\n");
      return nullptr;
    }
    candidate_size += f->file_size;
    candidate_count++;
  }
  if (candidate_count == 0) {
    return nullptr;
  }

  // size of earliest file
  int index = file_by_time[file_by_time.size() - 1];
  uint64_t earliest_file_size = version->files_[level][index]->file_size;

  // size amplification = percentage of additional size
  if (candidate_size * 100 < ratio * earliest_file_size) {
    Log(options_->info_log,
        "Universal: size amp not needed. newer-files-total-size %lu "
        "earliest-file-size %lu",
        (unsigned long)candidate_size,
        (unsigned long)earliest_file_size);
    return nullptr;
  } else {
    Log(options_->info_log,
        "Universal: size amp needed. newer-files-total-size %lu "
        "earliest-file-size %lu",
        (unsigned long)candidate_size,
        (unsigned long)earliest_file_size);
  }
  // start_index is unsigned, so only the upper bound needs checking.
  assert(start_index < file_by_time.size() - 1);

  // create a compaction request
  // We always compact all the files, so always compress.
  Compaction* c =
      new Compaction(version, level, level, MaxFileSizeForLevel(level),
                     LLONG_MAX, false, true);
  c->score_ = score;
  for (unsigned int loop = start_index; loop < file_by_time.size(); loop++) {
    int file_index = file_by_time[loop];
    f = c->input_version_->files_[level][file_index];
    c->inputs_[0].push_back(f);
    // Report the position in time order inside the brackets, like every
    // other "file N[pos]" message of the universal picker.
    Log(options_->info_log,
        "Universal: size amp picking file %lu[%d] with size %lu",
        (unsigned long)f->number,
        loop,
        (unsigned long)f->file_size);
  }
  return c;
}
} // namespace rocksdb

@ -0,0 +1,162 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include "db/version_set.h"
#include "db/compaction.h"
#include "rocksdb/status.h"
#include "rocksdb/options.h"
#include <vector>
#include <memory>
#include <set>
namespace rocksdb {
class Compaction;
class Version;
// Abstract base for the compaction-picking policies. It declares the
// policy-specific PickCompaction() hook plus the manual CompactRange() entry
// point, and provides the helpers and bookkeeping that the concrete pickers
// declared in this header share.
class CompactionPicker {
public:
// NOTE(review): presumably both pointers are retained in options_ / icmp_
// and must outlive the picker -- confirm against the constructor definition.
CompactionPicker(const Options* options, const InternalKeyComparator* icmp);
// Virtual because pickers are used through this base class.
virtual ~CompactionPicker();
// See VersionSet::ReduceNumberOfLevels()
void ReduceNumberOfLevels(int new_levels);
// Pick level and inputs for a new compaction.
// Returns nullptr if there is no compaction to be done.
// Otherwise returns a pointer to a heap-allocated object that
// describes the compaction. Caller should delete the result.
virtual Compaction* PickCompaction(Version* version) = 0;
// Return a compaction object for compacting the range [begin,end] in
// the specified level. Returns nullptr if there is nothing in that
// level that overlaps the specified range. Caller should delete
// the result.
//
// The returned Compaction might not include the whole requested range.
// In that case, compaction_end will be set to the next key that needs
// compacting. In case the compaction will compact the whole range,
// compaction_end will be set to nullptr.
// Client is responsible for compaction_end storage -- when called,
// *compaction_end should point to valid InternalKey!
Compaction* CompactRange(Version* version, int input_level, int output_level,
const InternalKey* begin, const InternalKey* end,
InternalKey** compaction_end);
// Free up the files that participated in a compaction
void ReleaseCompactionFiles(Compaction* c, Status status);
// Return the total amount of data that is undergoing
// compactions per level
// The result is written into `sizes`; LevelCompactionPicker::PickCompaction
// passes a vector with NumberLevels() - 1 entries.
void SizeBeingCompacted(std::vector<uint64_t>& sizes);
// Returns maximum total overlap bytes with grandparent
// level (i.e., level+2) before we stop building a single
// file in level->level+1 compaction.
uint64_t MaxGrandParentOverlapBytes(int level);
// Returns maximum total bytes of data on a given level.
double MaxBytesForLevel(int level);
// Get the max file size in a given level.
uint64_t MaxFileSizeForLevel(int level) const;
protected:
// Number of levels this picker currently works with.
int NumberLevels() const { return num_levels_; }
// Stores the minimal range that covers all entries in inputs in
// *smallest, *largest.
// REQUIRES: inputs is not empty
void GetRange(const std::vector<FileMetaData*>& inputs, InternalKey* smallest,
InternalKey* largest);
// Stores the minimal range that covers all entries in inputs1 and inputs2
// in *smallest, *largest.
// REQUIRES: inputs is not empty
void GetRange(const std::vector<FileMetaData*>& inputs1,
const std::vector<FileMetaData*>& inputs2,
InternalKey* smallest, InternalKey* largest);
// Add more files to the inputs on "level" to make sure that
// no newer version of a key is compacted to "level+1" while leaving an older
// version in a "level". Otherwise, any Get() will search "level" first,
// and will likely return an old/stale value for the key, since it always
// searches in increasing order of level to find the value. This could
// also scramble the order of merge operands. This function should be
// called any time a new Compaction is created, and its inputs_[0] are
// populated.
//
// Will return false if it is impossible to apply this compaction.
bool ExpandWhileOverlapping(Compaction* c);
// NOTE(review): undocumented; by its name this is the byte-size cap applied
// when a compaction's inputs are expanded -- confirm against the definition.
uint64_t ExpandedCompactionByteSizeLimit(int level);
// Returns true if any one of the specified files are being compacted
bool FilesInCompaction(std::vector<FileMetaData*>& files);
// Returns true if any one of the parent files are being compacted
bool ParentRangeInCompaction(Version* version, const InternalKey* smallest,
const InternalKey* largest, int level,
int* index);
// Sets up the remaining inputs of c once inputs_[0] is chosen: the "level+1"
// files (inputs_[1]) and the overlapping grandparent (level+2) files.
void SetupOtherInputs(Compaction* c);
// record all the ongoing compactions for all levels
// (indexed by level; the PickCompaction() implementations insert the
// compaction they return under its input level)
std::vector<std::set<Compaction*>> compactions_in_progress_;
// Per-level target file size.
std::unique_ptr<uint64_t[]> max_file_size_;
// Per-level max bytes
std::unique_ptr<uint64_t[]> level_max_bytes_;
// Options consulted by the pickers (compaction style, universal compaction
// settings, info_log, ...). Held by pointer, not copied.
const Options* const options_;
private:
// NOTE(review): presumably fills the per-level tables above for
// num_levels_ levels -- confirm against the definition.
void Init();
// Value reported by NumberLevels().
int num_levels_;
// Internal-key comparator supplied at construction. Held by pointer.
const InternalKeyComparator* const icmp_;
};
// Picker for universal-style compaction: works on level-0 files in time
// order and selects runs of files that are contiguous in time.
class UniversalCompactionPicker : public CompactionPicker {
public:
UniversalCompactionPicker(const Options* options,
const InternalKeyComparator* icmp)
: CompactionPicker(options, icmp) {}
// Tries a size-amplification compaction first; if none is needed, falls
// back to read-amplification compactions. Returns nullptr if there is
// nothing to do.
virtual Compaction* PickCompaction(Version* version) override;
private:
// Pick Universal compaction to limit read amplification
// `ratio` is the percentage by which the accumulated candidate size is
// scaled up before it is compared with the next file's size. `num_files`
// caps how many files one compaction may take (the definition names it
// max_number_of_files_to_compact and additionally applies max_merge_width).
Compaction* PickCompactionUniversalReadAmp(Version* version, double score,
unsigned int ratio,
unsigned int num_files);
// Pick Universal compaction to limit space amplification.
Compaction* PickCompactionUniversalSizeAmp(Version* version, double score);
};
// Picker for level-style compaction: compacts files from one level into the
// next, preferring size-triggered compactions over seek-triggered ones.
class LevelCompactionPicker : public CompactionPicker {
public:
LevelCompactionPicker(const Options* options,
const InternalKeyComparator* icmp)
: CompactionPicker(options, icmp) {}
// Returns the next compaction to run, or nullptr if there is none.
virtual Compaction* PickCompaction(Version* version) override;
private:
// For the specified level, pick a compaction.
// Returns nullptr if there is no compaction to be done.
// If level is 0 and there is already a compaction on that level, this
// function will return nullptr.
// `score` is stored as the score of the returned compaction.
Compaction* PickCompactionBySize(Version* version, int level, double score);
};
} // namespace rocksdb

@ -14,7 +14,7 @@
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "db/db_statistics.h" #include "rocksdb/statistics.h"
#include "rocksdb/options.h" #include "rocksdb/options.h"
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
@ -30,6 +30,7 @@
#include "util/random.h" #include "util/random.h"
#include "util/stack_trace.h" #include "util/stack_trace.h"
#include "util/string_util.h" #include "util/string_util.h"
#include "util/statistics.h"
#include "util/testutil.h" #include "util/testutil.h"
#include "hdfs/env_hdfs.h" #include "hdfs/env_hdfs.h"
#include "utilities/merge_operators.h" #include "utilities/merge_operators.h"
@ -355,9 +356,9 @@ static bool ValidateCompressionLevel(const char* flagname, int32_t value) {
return true; return true;
} }
static const bool FLAGS_compression_level_dummy = static const bool FLAGS_compression_level_dummy __attribute__((unused)) =
google::RegisterFlagValidator(&FLAGS_compression_level, google::RegisterFlagValidator(&FLAGS_compression_level,
&ValidateCompressionLevel); &ValidateCompressionLevel);
DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts" DEFINE_int32(min_level_to_compress, -1, "If non-negative, compression starts"
" from this level. Levels with number < min_level_to_compress are" " from this level. Levels with number < min_level_to_compress are"

@ -74,7 +74,7 @@ Status DBImpl::GetLiveFiles(std::vector<std::string>& ret,
// Make a set of all of the live *.sst files // Make a set of all of the live *.sst files
std::set<uint64_t> live; std::set<uint64_t> live;
versions_->AddLiveFilesCurrentVersion(&live); versions_->current()->AddLiveFiles(&live);
ret.clear(); ret.clear();
ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST ret.reserve(live.size() + 2); //*.sst + CURRENT + MANIFEST

@ -57,6 +57,7 @@
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/perf_context_imp.h" #include "util/perf_context_imp.h"
#include "util/stop_watch.h" #include "util/stop_watch.h"
#include "util/autovector.h"
namespace rocksdb { namespace rocksdb {
@ -254,8 +255,8 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
: env_(options.env), : env_(options.env),
dbname_(dbname), dbname_(dbname),
internal_comparator_(options.comparator), internal_comparator_(options.comparator),
options_(SanitizeOptions( options_(SanitizeOptions(dbname, &internal_comparator_,
dbname, &internal_comparator_, &internal_filter_policy_, options)), &internal_filter_policy_, options)),
internal_filter_policy_(options.filter_policy), internal_filter_policy_(options.filter_policy),
owns_info_log_(options_.info_log != options.info_log), owns_info_log_(options_.info_log != options.info_log),
db_lock_(nullptr), db_lock_(nullptr),
@ -263,8 +264,7 @@ DBImpl::DBImpl(const Options& options, const std::string& dbname)
shutting_down_(nullptr), shutting_down_(nullptr),
bg_cv_(&mutex_), bg_cv_(&mutex_),
mem_rep_factory_(options_.memtable_factory.get()), mem_rep_factory_(options_.memtable_factory.get()),
mem_(new MemTable(internal_comparator_, mem_rep_factory_, mem_(new MemTable(internal_comparator_, options_)),
NumberLevels(), options_)),
logfile_number_(0), logfile_number_(0),
super_version_(nullptr), super_version_(nullptr),
tmp_batch_(), tmp_batch_(),
@ -410,7 +410,7 @@ uint64_t DBImpl::TEST_Current_Manifest_FileNo() {
} }
Status DBImpl::NewDB() { Status DBImpl::NewDB() {
VersionEdit new_db(NumberLevels()); VersionEdit new_db;
new_db.SetComparatorName(user_comparator()->Name()); new_db.SetComparatorName(user_comparator()->Name());
new_db.SetLogNumber(0); new_db.SetLogNumber(0);
new_db.SetNextFile(2); new_db.SetNextFile(2);
@ -1048,8 +1048,7 @@ Status DBImpl::RecoverLogFile(uint64_t log_number,
WriteBatchInternal::SetContents(&batch, record); WriteBatchInternal::SetContents(&batch, record);
if (mem == nullptr) { if (mem == nullptr) {
mem = new MemTable(internal_comparator_, mem_rep_factory_, mem = new MemTable(internal_comparator_, options_);
NumberLevels(), options_);
mem->Ref(); mem->Ref();
} }
status = WriteBatchInternal::InsertInto(&batch, mem, &options_); status = WriteBatchInternal::InsertInto(&batch, mem, &options_);
@ -1300,6 +1299,7 @@ Status DBImpl::FlushMemTableToOutputFile(bool* madeProgress,
void DBImpl::CompactRange(const ColumnFamilyHandle& column_family, void DBImpl::CompactRange(const ColumnFamilyHandle& column_family,
const Slice* begin, const Slice* end, const Slice* begin, const Slice* end,
bool reduce_level, int target_level) { bool reduce_level, int target_level) {
FlushMemTable(FlushOptions());
int max_level_with_files = 1; int max_level_with_files = 1;
{ {
MutexLock l(&mutex_); MutexLock l(&mutex_);
@ -1310,9 +1310,15 @@ void DBImpl::CompactRange(const ColumnFamilyHandle& column_family,
} }
} }
} }
TEST_FlushMemTable(); // TODO(sanjay): Skip if memtable does not overlap for (int level = 0; level <= max_level_with_files; level++) {
for (int level = 0; level < max_level_with_files; level++) { // in case the compaction is unversal or if we're compacting the
TEST_CompactRange(level, begin, end); // bottom-most level, the output level will be the same as input one
if (options_.compaction_style == kCompactionStyleUniversal ||
level == max_level_with_files) {
RunManualCompaction(level, level, begin, end);
} else {
RunManualCompaction(level, level + 1, begin, end);
}
} }
if (reduce_level) { if (reduce_level) {
@ -1324,13 +1330,13 @@ void DBImpl::CompactRange(const ColumnFamilyHandle& column_family,
// return the same level if it cannot be moved // return the same level if it cannot be moved
int DBImpl::FindMinimumEmptyLevelFitting(int level) { int DBImpl::FindMinimumEmptyLevelFitting(int level) {
mutex_.AssertHeld(); mutex_.AssertHeld();
Version* current = versions_->current();
int minimum_level = level; int minimum_level = level;
for (int i = level - 1; i > 0; --i) { for (int i = level - 1; i > 0; --i) {
// stop if level i is not empty // stop if level i is not empty
if (versions_->NumLevelFiles(i) > 0) break; if (current->NumLevelFiles(i) > 0) break;
// stop if level i is too small (cannot fit the level files) // stop if level i is too small (cannot fit the level files)
if (versions_->MaxBytesForLevel(i) < versions_->NumLevelBytes(level)) break; if (versions_->MaxBytesForLevel(i) < current->NumLevelBytes(level)) break;
minimum_level = i; minimum_level = i;
} }
@ -1376,7 +1382,7 @@ void DBImpl::ReFitLevel(int level, int target_level) {
Log(options_.info_log, "Before refitting:\n%s", Log(options_.info_log, "Before refitting:\n%s",
versions_->current()->DebugString().data()); versions_->current()->DebugString().data());
VersionEdit edit(NumberLevels()); VersionEdit edit;
for (const auto& f : versions_->current()->files_[level]) { for (const auto& f : versions_->current()->files_[level]) {
edit.DeleteFile(level, f->number); edit.DeleteFile(level, f->number);
edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest, edit.AddFile(to_level, f->number, f->file_size, f->smallest, f->largest,
@ -1612,13 +1618,17 @@ Status DBImpl::AppendSortedWalsOfType(const std::string& path,
return status; return status;
} }
void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) { void DBImpl::RunManualCompaction(int input_level,
assert(level >= 0); int output_level,
const Slice* begin,
const Slice* end) {
assert(input_level >= 0);
InternalKey begin_storage, end_storage; InternalKey begin_storage, end_storage;
ManualCompaction manual; ManualCompaction manual;
manual.level = level; manual.input_level = input_level;
manual.output_level = output_level;
manual.done = false; manual.done = false;
manual.in_progress = false; manual.in_progress = false;
// For universal compaction, we enforce every manual compaction to compact // For universal compaction, we enforce every manual compaction to compact
@ -1646,11 +1656,11 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
// can compact any range of keys/files. // can compact any range of keys/files.
// //
// bg_manual_only_ is non-zero when at least one thread is inside // bg_manual_only_ is non-zero when at least one thread is inside
// TEST_CompactRange(), i.e. during that time no other compaction will // RunManualCompaction(), i.e. during that time no other compaction will
// get scheduled (see MaybeScheduleFlushOrCompaction). // get scheduled (see MaybeScheduleFlushOrCompaction).
// //
// Note that the following loop doesn't stop more that one thread calling // Note that the following loop doesn't stop more that one thread calling
// TEST_CompactRange() from getting to the second while loop below. // RunManualCompaction() from getting to the second while loop below.
// However, only one of them will actually schedule compaction, while // However, only one of them will actually schedule compaction, while
// others will wait on a condition variable until it completes. // others will wait on a condition variable until it completes.
@ -1680,6 +1690,15 @@ void DBImpl::TEST_CompactRange(int level, const Slice* begin,const Slice* end) {
--bg_manual_only_; --bg_manual_only_;
} }
void DBImpl::TEST_CompactRange(int level,
const Slice* begin,
const Slice* end) {
int output_level = (options_.compaction_style == kCompactionStyleUniversal)
? level
: level + 1;
RunManualCompaction(level, output_level, begin, end);
}
Status DBImpl::FlushMemTable(const FlushOptions& options) { Status DBImpl::FlushMemTable(const FlushOptions& options) {
// nullptr batch means just wait for earlier writes to be done // nullptr batch means just wait for earlier writes to be done
Status s = Write(WriteOptions(), nullptr); Status s = Write(WriteOptions(), nullptr);
@ -1825,6 +1844,11 @@ void DBImpl::TEST_PurgeObsoleteteWAL() {
PurgeObsoleteWALFiles(); PurgeObsoleteWALFiles();
} }
uint64_t DBImpl::TEST_GetLevel0TotalSize() {
MutexLock l(&mutex_);
return versions_->current()->NumLevelBytes(0);
}
void DBImpl::BackgroundCallCompaction() { void DBImpl::BackgroundCallCompaction() {
bool madeProgress = false; bool madeProgress = false;
DeletionState deletion_state(options_.max_write_buffer_number, true); DeletionState deletion_state(options_.max_write_buffer_number, true);
@ -1899,23 +1923,27 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
unique_ptr<Compaction> c; unique_ptr<Compaction> c;
bool is_manual = (manual_compaction_ != nullptr) && bool is_manual = (manual_compaction_ != nullptr) &&
(manual_compaction_->in_progress == false); (manual_compaction_->in_progress == false);
InternalKey manual_end; InternalKey manual_end_storage;
InternalKey* manual_end = &manual_end_storage;
if (is_manual) { if (is_manual) {
ManualCompaction* m = manual_compaction_; ManualCompaction* m = manual_compaction_;
assert(!m->in_progress); assert(!m->in_progress);
m->in_progress = true; // another thread cannot pick up the same work m->in_progress = true; // another thread cannot pick up the same work
c.reset(versions_->CompactRange(m->level, m->begin, m->end)); c.reset(versions_->CompactRange(
if (c) { m->input_level, m->output_level, m->begin, m->end, &manual_end));
manual_end = c->input(0, c->num_input_files(0) - 1)->largest; if (!c) {
} else {
m->done = true; m->done = true;
} }
Log(options_.info_log, Log(options_.info_log,
"Manual compaction at level-%d from %s .. %s; will stop at %s\n", "Manual compaction from level-%d to level-%d from %s .. %s; will stop "
m->level, "at %s\n",
m->input_level,
m->output_level,
(m->begin ? m->begin->DebugString().c_str() : "(begin)"), (m->begin ? m->begin->DebugString().c_str() : "(begin)"),
(m->end ? m->end->DebugString().c_str() : "(end)"), (m->end ? m->end->DebugString().c_str() : "(end)"),
(m->done ? "(end)" : manual_end.DebugString().c_str())); ((m->done || manual_end == nullptr)
? "(end)"
: manual_end->DebugString().c_str()));
} else if (!options_.disable_auto_compactions) { } else if (!options_.disable_auto_compactions) {
c.reset(versions_->PickCompaction()); c.reset(versions_->PickCompaction());
} }
@ -1934,13 +1962,11 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
f->smallest_seqno, f->largest_seqno); f->smallest_seqno, f->largest_seqno);
status = versions_->LogAndApply(c->edit(), &mutex_); status = versions_->LogAndApply(c->edit(), &mutex_);
InstallSuperVersion(deletion_state); InstallSuperVersion(deletion_state);
VersionSet::LevelSummaryStorage tmp; Version::LevelSummaryStorage tmp;
Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n", Log(options_.info_log, "Moved #%lld to level-%d %lld bytes %s: %s\n",
static_cast<unsigned long long>(f->number), static_cast<unsigned long long>(f->number), c->level() + 1,
c->level() + 1,
static_cast<unsigned long long>(f->file_size), static_cast<unsigned long long>(f->file_size),
status.ToString().c_str(), status.ToString().c_str(), versions_->current()->LevelSummary(&tmp));
versions_->LevelSummary(&tmp));
versions_->ReleaseCompactionFiles(c.get(), status); versions_->ReleaseCompactionFiles(c.get(), status);
*madeProgress = true; *madeProgress = true;
} else { } else {
@ -1980,13 +2006,19 @@ Status DBImpl::BackgroundCompaction(bool* madeProgress,
// Also note that, if we don't stop here, then the current compaction // Also note that, if we don't stop here, then the current compaction
// writes a new file back to level 0, which will be used in successive // writes a new file back to level 0, which will be used in successive
// compaction. Hence the manual compaction will never finish. // compaction. Hence the manual compaction will never finish.
if (options_.compaction_style == kCompactionStyleUniversal) { //
// Stop the compaction if manual_end points to nullptr -- this means
// that we compacted the whole range. manual_end should always point
// to nullptr in case of universal compaction
if (manual_end == nullptr) {
m->done = true; m->done = true;
} }
if (!m->done) { if (!m->done) {
// We only compacted part of the requested range. Update *m // We only compacted part of the requested range. Update *m
// to the range that is left to be compacted. // to the range that is left to be compacted.
m->tmp_storage = manual_end; // Universal compaction should always compact the whole range
assert(options_.compaction_style != kCompactionStyleUniversal);
m->tmp_storage = *manual_end;
m->begin = &m->tmp_storage; m->begin = &m->tmp_storage;
} }
m->in_progress = false; // not being processed anymore m->in_progress = false; // not being processed anymore
@ -2018,14 +2050,14 @@ void DBImpl::CleanupCompaction(CompactionState* compact, Status status) {
} }
// Allocate the file numbers for the output file. We allocate as // Allocate the file numbers for the output file. We allocate as
// many output file numbers as there are files in level+1. // many output file numbers as there are files in level+1 (at least one)
// Insert them into pending_outputs so that they do not get deleted. // Insert them into pending_outputs so that they do not get deleted.
void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) { void DBImpl::AllocateCompactionOutputFileNumbers(CompactionState* compact) {
mutex_.AssertHeld(); mutex_.AssertHeld();
assert(compact != nullptr); assert(compact != nullptr);
assert(compact->builder == nullptr); assert(compact->builder == nullptr);
int filesNeeded = compact->compaction->num_input_files(1); int filesNeeded = compact->compaction->num_input_files(1);
for (int i = 0; i < filesNeeded; i++) { for (int i = 0; i < std::max(filesNeeded, 1); i++) {
uint64_t file_number = versions_->NewFileNumber(); uint64_t file_number = versions_->NewFileNumber();
pending_outputs_.insert(file_number); pending_outputs_.insert(file_number);
compact->allocated_file_numbers.push_back(file_number); compact->allocated_file_numbers.push_back(file_number);
@ -2169,14 +2201,11 @@ Status DBImpl::InstallCompactionResults(CompactionState* compact) {
// Add compaction outputs // Add compaction outputs
compact->compaction->AddInputDeletions(compact->compaction->edit()); compact->compaction->AddInputDeletions(compact->compaction->edit());
const int level = compact->compaction->level();
for (size_t i = 0; i < compact->outputs.size(); i++) { for (size_t i = 0; i < compact->outputs.size(); i++) {
const CompactionState::Output& out = compact->outputs[i]; const CompactionState::Output& out = compact->outputs[i];
compact->compaction->edit()->AddFile( compact->compaction->edit()->AddFile(
(options_.compaction_style == kCompactionStyleUniversal) ? compact->compaction->output_level(), out.number, out.file_size,
level : level + 1, out.smallest, out.largest, out.smallest_seqno, out.largest_seqno);
out.number, out.file_size, out.smallest, out.largest,
out.smallest_seqno, out.largest_seqno);
} }
return versions_->LogAndApply(compact->compaction->edit(), &mutex_); return versions_->LogAndApply(compact->compaction->edit(), &mutex_);
} }
@ -2218,14 +2247,14 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
compact->compaction->num_input_files(0), compact->compaction->num_input_files(0),
compact->compaction->level(), compact->compaction->level(),
compact->compaction->num_input_files(1), compact->compaction->num_input_files(1),
compact->compaction->level() + 1, compact->compaction->output_level(),
compact->compaction->score(), compact->compaction->score(),
options_.max_background_compactions - bg_compaction_scheduled_); options_.max_background_compactions - bg_compaction_scheduled_);
char scratch[256]; char scratch[256];
compact->compaction->Summary(scratch, sizeof(scratch)); compact->compaction->Summary(scratch, sizeof(scratch));
Log(options_.info_log, "Compaction start summary: %s\n", scratch); Log(options_.info_log, "Compaction start summary: %s\n", scratch);
assert(versions_->NumLevelFiles(compact->compaction->level()) > 0); assert(versions_->current()->NumLevelFiles(compact->compaction->level()) > 0);
assert(compact->builder == nullptr); assert(compact->builder == nullptr);
assert(!compact->outfile); assert(!compact->outfile);
@ -2553,9 +2582,7 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
CompactionStats stats; CompactionStats stats;
stats.micros = env_->NowMicros() - start_micros - imm_micros; stats.micros = env_->NowMicros() - start_micros - imm_micros;
if (options_.statistics.get()) { MeasureTime(options_.statistics.get(), COMPACTION_TIME, stats.micros);
options_.statistics.get()->measureTime(COMPACTION_TIME, stats.micros);
}
stats.files_in_leveln = compact->compaction->num_input_files(0); stats.files_in_leveln = compact->compaction->num_input_files(0);
stats.files_in_levelnp1 = compact->compaction->num_input_files(1); stats.files_in_levelnp1 = compact->compaction->num_input_files(1);
@ -2597,22 +2624,21 @@ Status DBImpl::DoCompactionWork(CompactionState* compact,
status = InstallCompactionResults(compact); status = InstallCompactionResults(compact);
InstallSuperVersion(deletion_state); InstallSuperVersion(deletion_state);
} }
VersionSet::LevelSummaryStorage tmp; Version::LevelSummaryStorage tmp;
Log(options_.info_log, Log(options_.info_log,
"compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) " "compacted to: %s, %.1f MB/sec, level %d, files in(%d, %d) out(%d) "
"MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) " "MB in(%.1f, %.1f) out(%.1f), read-write-amplify(%.1f) "
"write-amplify(%.1f) %s\n", "write-amplify(%.1f) %s\n",
versions_->LevelSummary(&tmp), versions_->current()->LevelSummary(&tmp),
(stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) / (stats.bytes_readn + stats.bytes_readnp1 + stats.bytes_written) /
(double) stats.micros, (double)stats.micros,
compact->compaction->output_level(), compact->compaction->output_level(), stats.files_in_leveln,
stats.files_in_leveln, stats.files_in_levelnp1, stats.files_out_levelnp1, stats.files_in_levelnp1, stats.files_out_levelnp1,
stats.bytes_readn / 1048576.0, stats.bytes_readn / 1048576.0, stats.bytes_readnp1 / 1048576.0,
stats.bytes_readnp1 / 1048576.0,
stats.bytes_written / 1048576.0, stats.bytes_written / 1048576.0,
(stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) / (stats.bytes_written + stats.bytes_readnp1 + stats.bytes_readn) /
(double) stats.bytes_readn, (double)stats.bytes_readn,
stats.bytes_written / (double) stats.bytes_readn, stats.bytes_written / (double)stats.bytes_readn,
status.ToString().c_str()); status.ToString().c_str());
return status; return status;
@ -2649,38 +2675,40 @@ static void CleanupIteratorState(void* arg1, void* arg2) {
Iterator* DBImpl::NewInternalIterator(const ReadOptions& options, Iterator* DBImpl::NewInternalIterator(const ReadOptions& options,
SequenceNumber* latest_snapshot) { SequenceNumber* latest_snapshot) {
IterState* cleanup = new IterState; IterState* cleanup = new IterState;
mutex_.Lock(); MemTable* mutable_mem;
*latest_snapshot = versions_->LastSequence(); std::vector<MemTable*> immutables;
Version* version;
// Collect together all needed child iterators for mem // Collect together all needed child iterators for mem
std::vector<Iterator*> list; mutex_.Lock();
*latest_snapshot = versions_->LastSequence();
mem_->Ref(); mem_->Ref();
list.push_back(mem_->NewIterator(options)); mutable_mem = mem_;
cleanup->mem.push_back(mem_);
// Collect together all needed child iterators for imm_ // Collect together all needed child iterators for imm_
std::vector<MemTable*> immutables;
imm_.GetMemTables(&immutables); imm_.GetMemTables(&immutables);
for (unsigned int i = 0; i < immutables.size(); i++) { for (unsigned int i = 0; i < immutables.size(); i++) {
MemTable* m = immutables[i]; immutables[i]->Ref();
m->Ref(); }
// Collect iterators for files in L0 - Ln
versions_->current()->Ref();
version = versions_->current();
mutex_.Unlock();
std::vector<Iterator*> list;
list.push_back(mutable_mem->NewIterator(options));
cleanup->mem.push_back(mutable_mem);
for (MemTable* m : immutables) {
list.push_back(m->NewIterator(options)); list.push_back(m->NewIterator(options));
cleanup->mem.push_back(m); cleanup->mem.push_back(m);
} }
version->AddIterators(options, storage_options_, &list);
// Collect iterators for files in L0 - Ln
versions_->current()->AddIterators(options, storage_options_, &list);
Iterator* internal_iter = Iterator* internal_iter =
NewMergingIterator(&internal_comparator_, &list[0], list.size()); NewMergingIterator(&internal_comparator_, &list[0], list.size());
versions_->current()->Ref(); cleanup->version = version;
cleanup->mu = &mutex_; cleanup->mu = &mutex_;
cleanup->db = this; cleanup->db = this;
cleanup->version = versions_->current();
internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr); internal_iter->RegisterCleanup(CleanupIteratorState, cleanup, nullptr);
mutex_.Unlock();
return internal_iter; return internal_iter;
} }
@ -2691,7 +2719,7 @@ Iterator* DBImpl::TEST_NewInternalIterator() {
int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() { int64_t DBImpl::TEST_MaxNextLevelOverlappingBytes() {
MutexLock l(&mutex_); MutexLock l(&mutex_);
return versions_->MaxNextLevelOverlappingBytes(); return versions_->current()->MaxNextLevelOverlappingBytes();
} }
Status DBImpl::Get(const ReadOptions& options, Status DBImpl::Get(const ReadOptions& options,
@ -2898,7 +2926,7 @@ std::vector<Status> DBImpl::MultiGet(
Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options, Status DBImpl::CreateColumnFamily(const ColumnFamilyOptions& options,
const std::string& column_family_name, const std::string& column_family_name,
ColumnFamilyHandle* handle) { ColumnFamilyHandle* handle) {
VersionEdit edit(0); VersionEdit edit;
edit.AddColumnFamily(column_family_name); edit.AddColumnFamily(column_family_name);
MutexLock l(&mutex_); MutexLock l(&mutex_);
++versions_->max_column_family_; ++versions_->max_column_family_;
@ -2920,7 +2948,7 @@ Status DBImpl::DropColumnFamily(const ColumnFamilyHandle& column_family) {
if (column_family.id == 0) { if (column_family.id == 0) {
return Status::InvalidArgument("Can't drop default column family"); return Status::InvalidArgument("Can't drop default column family");
} }
VersionEdit edit(0); VersionEdit edit;
edit.DropColumnFamily(); edit.DropColumnFamily();
edit.SetColumnFamily(column_family.id); edit.SetColumnFamily(column_family.id);
MutexLock l(&mutex_); MutexLock l(&mutex_);
@ -3045,12 +3073,8 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
uint64_t last_sequence = versions_->LastSequence(); uint64_t last_sequence = versions_->LastSequence();
Writer* last_writer = &w; Writer* last_writer = &w;
if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions if (status.ok() && my_batch != nullptr) { // nullptr batch is for compactions
// TODO: BuildBatchGroup physically concatenate/copy all write batches into autovector<WriteBatch*> write_batch_group;
// a new one. Mem copy is done with the lock held. Ideally, we only need BuildBatchGroup(&last_writer, &write_batch_group);
// the lock to obtain the last_writer and the references to all batches.
// Creation (copy) of the merged batch could have been done outside of the
// lock protected region.
WriteBatch* updates = BuildBatchGroup(&last_writer);
// Add to log and apply to memtable. We can release the lock // Add to log and apply to memtable. We can release the lock
// during this phase since &w is currently responsible for logging // during this phase since &w is currently responsible for logging
@ -3058,6 +3082,16 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
// into mem_. // into mem_.
{ {
mutex_.Unlock(); mutex_.Unlock();
WriteBatch* updates = nullptr;
if (write_batch_group.size() == 1) {
updates = write_batch_group[0];
} else {
updates = &tmp_batch_;
for (size_t i = 0; i < write_batch_group.size(); ++i) {
WriteBatchInternal::Append(updates, write_batch_group[i]);
}
}
const SequenceNumber current_sequence = last_sequence + 1; const SequenceNumber current_sequence = last_sequence + 1;
WriteBatchInternal::SetSequence(updates, current_sequence); WriteBatchInternal::SetSequence(updates, current_sequence);
int my_batch_count = WriteBatchInternal::Count(updates); int my_batch_count = WriteBatchInternal::Count(updates);
@ -3100,15 +3134,15 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
// have succeeded in memtable but Status reports error for all writes. // have succeeded in memtable but Status reports error for all writes.
throw std::runtime_error("In memory WriteBatch corruption!"); throw std::runtime_error("In memory WriteBatch corruption!");
} }
SetTickerCount(options_.statistics.get(), SetTickerCount(options_.statistics.get(), SEQUENCE_NUMBER,
SEQUENCE_NUMBER, last_sequence); last_sequence);
} }
if (updates == &tmp_batch_) tmp_batch_.Clear();
mutex_.Lock(); mutex_.Lock();
if (status.ok()) { if (status.ok()) {
versions_->SetLastSequence(last_sequence); versions_->SetLastSequence(last_sequence);
} }
} }
if (updates == &tmp_batch_) tmp_batch_.Clear();
} }
if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) { if (options_.paranoid_checks && !status.ok() && bg_error_.ok()) {
bg_error_ = status; // stop compaction & fail any further writes bg_error_ = status; // stop compaction & fail any further writes
@ -3136,13 +3170,14 @@ Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
// REQUIRES: Writer list must be non-empty // REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-nullptr batch // REQUIRES: First writer must have a non-nullptr batch
WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) { void DBImpl::BuildBatchGroup(Writer** last_writer,
autovector<WriteBatch*>* write_batch_group) {
assert(!writers_.empty()); assert(!writers_.empty());
Writer* first = writers_.front(); Writer* first = writers_.front();
WriteBatch* result = first->batch; assert(first->batch != nullptr);
assert(result != nullptr);
size_t size = WriteBatchInternal::ByteSize(first->batch); size_t size = WriteBatchInternal::ByteSize(first->batch);
write_batch_group->push_back(first->batch);
// Allow the group to grow up to a maximum size, but if the // Allow the group to grow up to a maximum size, but if the
// original write is small, limit the growth so we do not slow // original write is small, limit the growth so we do not slow
@ -3175,18 +3210,10 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
break; break;
} }
// Append to *reuslt write_batch_group->push_back(w->batch);
if (result == first->batch) {
// Switch to temporary batch instead of disturbing caller's batch
result = &tmp_batch_;
assert(WriteBatchInternal::Count(result) == 0);
WriteBatchInternal::Append(result, first->batch);
}
WriteBatchInternal::Append(result, w->batch);
} }
*last_writer = w; *last_writer = w;
} }
return result;
} }
// This function computes the amount of time in microseconds by which a write // This function computes the amount of time in microseconds by which a write
@ -3200,7 +3227,7 @@ WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
// The goal of this formula is to gradually increase the rate at which writes // The goal of this formula is to gradually increase the rate at which writes
// are slowed. We also tried linear delay (r * 1000), but it seemed to do // are slowed. We also tried linear delay (r * 1000), but it seemed to do
// slightly worse. There is no other particular reason for choosing quadratic. // slightly worse. There is no other particular reason for choosing quadratic.
uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) { uint64_t DBImpl::SlowdownAmount(int n, double bottom, double top) {
uint64_t delay; uint64_t delay;
if (n >= top) { if (n >= top) {
delay = 1000; delay = 1000;
@ -3212,10 +3239,10 @@ uint64_t DBImpl::SlowdownAmount(int n, int top, int bottom) {
// If we are here, we know that: // If we are here, we know that:
// level0_start_slowdown <= n < level0_slowdown // level0_start_slowdown <= n < level0_slowdown
// since the previous two conditions are false. // since the previous two conditions are false.
float how_much = double how_much =
(float) (n - bottom) / (double) (n - bottom) /
(top - bottom); (top - bottom);
delay = how_much * how_much * 1000; delay = std::max(how_much * how_much * 1000, 100.0);
} }
assert(delay <= 1000); assert(delay <= 1000);
return delay; return delay;
@ -3240,25 +3267,22 @@ Status DBImpl::MakeRoomForWrite(bool force,
// Yield previous error // Yield previous error
s = bg_error_; s = bg_error_;
break; break;
} else if ( } else if (allow_delay && versions_->NeedSlowdownForNumLevel0Files()) {
allow_delay &&
versions_->NumLevelFiles(0) >=
options_.level0_slowdown_writes_trigger) {
// We are getting close to hitting a hard limit on the number of // We are getting close to hitting a hard limit on the number of
// L0 files. Rather than delaying a single write by several // L0 files. Rather than delaying a single write by several
// seconds when we hit the hard limit, start delaying each // seconds when we hit the hard limit, start delaying each
// individual write by 0-1ms to reduce latency variance. Also, // individual write by 0-1ms to reduce latency variance. Also,
// this delay hands over some CPU to the compaction thread in // this delay hands over some CPU to the compaction thread in
// case it is sharing the same core as the writer. // case it is sharing the same core as the writer.
uint64_t slowdown =
SlowdownAmount(versions_->current()->NumLevelFiles(0),
options_.level0_slowdown_writes_trigger,
options_.level0_stop_writes_trigger);
mutex_.Unlock(); mutex_.Unlock();
uint64_t delayed; uint64_t delayed;
{ {
StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT); StopWatch sw(env_, options_.statistics.get(), STALL_L0_SLOWDOWN_COUNT);
env_->SleepForMicroseconds( env_->SleepForMicroseconds(slowdown);
SlowdownAmount(versions_->NumLevelFiles(0),
options_.level0_slowdown_writes_trigger,
options_.level0_stop_writes_trigger)
);
delayed = sw.ElapsedMicros(); delayed = sw.ElapsedMicros();
} }
RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed); RecordTick(options_.statistics.get(), STALL_L0_SLOWDOWN_MICROS, delayed);
@ -3290,7 +3314,7 @@ Status DBImpl::MakeRoomForWrite(bool force,
STALL_MEMTABLE_COMPACTION_MICROS, stall); STALL_MEMTABLE_COMPACTION_MICROS, stall);
stall_memtable_compaction_ += stall; stall_memtable_compaction_ += stall;
stall_memtable_compaction_count_++; stall_memtable_compaction_count_++;
} else if (versions_->NumLevelFiles(0) >= } else if (versions_->current()->NumLevelFiles(0) >=
options_.level0_stop_writes_trigger) { options_.level0_stop_writes_trigger) {
// There are too many level-0 files. // There are too many level-0 files.
DelayLoggingAndReset(); DelayLoggingAndReset();
@ -3366,17 +3390,13 @@ Status DBImpl::MakeRoomForWrite(bool force,
EnvOptions soptions(storage_options_); EnvOptions soptions(storage_options_);
soptions.use_mmap_writes = false; soptions.use_mmap_writes = false;
DelayLoggingAndReset(); DelayLoggingAndReset();
s = env_->NewWritableFile( s = env_->NewWritableFile(LogFileName(options_.wal_dir, new_log_number),
LogFileName(options_.wal_dir, new_log_number), &lfile, soptions);
&lfile,
soptions
);
if (s.ok()) { if (s.ok()) {
// Our final size should be less than write_buffer_size // Our final size should be less than write_buffer_size
// (compression, etc) but err on the side of caution. // (compression, etc) but err on the side of caution.
lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size); lfile->SetPreallocationBlockSize(1.1 * options_.write_buffer_size);
memtmp = new MemTable( memtmp = new MemTable(internal_comparator_, options_);
internal_comparator_, mem_rep_factory_, NumberLevels(), options_);
new_superversion = new SuperVersion(options_.max_write_buffer_number); new_superversion = new SuperVersion(options_.max_write_buffer_number);
} }
} }
@ -3426,6 +3446,7 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family,
value->clear(); value->clear();
MutexLock l(&mutex_); MutexLock l(&mutex_);
Version* current = versions_->current();
Slice in = property; Slice in = property;
Slice prefix("rocksdb."); Slice prefix("rocksdb.");
if (!in.starts_with(prefix)) return false; if (!in.starts_with(prefix)) return false;
@ -3440,7 +3461,7 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family,
} else { } else {
char buf[100]; char buf[100];
snprintf(buf, sizeof(buf), "%d", snprintf(buf, sizeof(buf), "%d",
versions_->NumLevelFiles(static_cast<int>(level))); current->NumLevelFiles(static_cast<int>(level)));
*value = buf; *value = buf;
return true; return true;
} }
@ -3455,8 +3476,8 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family,
snprintf(buf, sizeof(buf), snprintf(buf, sizeof(buf),
"%3d %8d %8.0f\n", "%3d %8d %8.0f\n",
level, level,
versions_->NumLevelFiles(level), current->NumLevelFiles(level),
versions_->NumLevelBytes(level) / 1048576.0); current->NumLevelBytes(level) / 1048576.0);
value->append(buf); value->append(buf);
} }
return true; return true;
@ -3499,8 +3520,8 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family,
"--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n" "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n"
); );
value->append(buf); value->append(buf);
for (int level = 0; level < NumberLevels(); level++) { for (int level = 0; level < current->NumberLevels(); level++) {
int files = versions_->NumLevelFiles(level); int files = current->NumLevelFiles(level);
if (stats_[level].micros > 0 || files > 0) { if (stats_[level].micros > 0 || files > 0) {
int64_t bytes_read = stats_[level].bytes_readn + int64_t bytes_read = stats_[level].bytes_readn +
stats_[level].bytes_readnp1; stats_[level].bytes_readnp1;
@ -3521,8 +3542,8 @@ bool DBImpl::GetProperty(const ColumnFamilyHandle& column_family,
"%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f %10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %9.1f %9lu\n", "%3d %8d %8.0f %5.1f %9.0f %9.0f %9.0f %9.0f %9.0f %9.0f %10.1f %9.1f %11.1f %8d %8d %8d %8d %8d %9.1f %9lu\n",
level, level,
files, files,
versions_->NumLevelBytes(level) / 1048576.0, current->NumLevelBytes(level) / 1048576.0,
versions_->NumLevelBytes(level) / current->NumLevelBytes(level) /
versions_->MaxBytesForLevel(level), versions_->MaxBytesForLevel(level),
stats_[level].micros / 1e6, stats_[level].micros / 1e6,
bytes_read / 1048576.0, bytes_read / 1048576.0,
@ -3758,7 +3779,7 @@ Status DBImpl::DeleteFile(std::string name) {
int level; int level;
FileMetaData metadata; FileMetaData metadata;
int maxlevel = NumberLevels(); int maxlevel = NumberLevels();
VersionEdit edit(maxlevel); VersionEdit edit;
DeletionState deletion_state(0, true); DeletionState deletion_state(0, true);
{ {
MutexLock l(&mutex_); MutexLock l(&mutex_);
@ -3781,7 +3802,7 @@ Status DBImpl::DeleteFile(std::string name) {
// This is to make sure that any deletion tombstones are not // This is to make sure that any deletion tombstones are not
// lost. Check that the level passed is the last level. // lost. Check that the level passed is the last level.
for (int i = level + 1; i < maxlevel; i++) { for (int i = level + 1; i < maxlevel; i++) {
if (versions_->NumLevelFiles(i) != 0) { if (versions_->current()->NumLevelFiles(i) != 0) {
Log(options_.info_log, Log(options_.info_log,
"DeleteFile %s FAILED. File not in last level\n", name.c_str()); "DeleteFile %s FAILED. File not in last level\n", name.c_str());
return Status::InvalidArgument("File not in last level"); return Status::InvalidArgument("File not in last level");
@ -3836,7 +3857,10 @@ Status DBImpl::GetDbIdentity(std::string& identity) {
// can call if they wish // can call if they wish
Status DB::Put(const WriteOptions& opt, const ColumnFamilyHandle& column_family, Status DB::Put(const WriteOptions& opt, const ColumnFamilyHandle& column_family,
const Slice& key, const Slice& value) { const Slice& key, const Slice& value) {
WriteBatch batch; // Pre-allocate size of write batch conservatively.
// 8 bytes are taken by header, 4 bytes for count, 1 byte for type,
// and we allocate 11 extra bytes for key length, as well as value length.
WriteBatch batch(key.size() + value.size() + 24);
batch.Put(column_family.id, key, value); batch.Put(column_family.id, key, value);
return Write(opt, &batch); return Write(opt, &batch);
} }
@ -3915,20 +3939,20 @@ Status DB::OpenWithColumnFamilies(
return s; return s;
} }
impl->mutex_.Lock(); impl->mutex_.Lock();
VersionEdit edit(impl->NumberLevels()); VersionEdit edit;
// Handles create_if_missing, error_if_exists // Handles create_if_missing, error_if_exists
s = impl->Recover(&edit, column_families); s = impl->Recover(&edit, column_families);
if (s.ok()) { if (s.ok()) {
uint64_t new_log_number = impl->versions_->NewFileNumber(); uint64_t new_log_number = impl->versions_->NewFileNumber();
unique_ptr<WritableFile> lfile; unique_ptr<WritableFile> lfile;
soptions.use_mmap_writes = false; soptions.use_mmap_writes = false;
s = options.env->NewWritableFile( s = impl->options_.env->NewWritableFile(
LogFileName(impl->options_.wal_dir, new_log_number), LogFileName(impl->options_.wal_dir, new_log_number),
&lfile, &lfile,
soptions soptions
); );
if (s.ok()) { if (s.ok()) {
lfile->SetPreallocationBlockSize(1.1 * options.write_buffer_size); lfile->SetPreallocationBlockSize(1.1 * impl->options_.write_buffer_size);
edit.SetLogNumber(new_log_number); edit.SetLogNumber(new_log_number);
impl->logfile_number_ = new_log_number; impl->logfile_number_ = new_log_number;
impl->log_.reset(new log::Writer(std::move(lfile))); impl->log_.reset(new log::Writer(std::move(lfile)));
@ -3949,12 +3973,11 @@ Status DB::OpenWithColumnFamilies(
impl->MaybeScheduleLogDBDeployStats(); impl->MaybeScheduleLogDBDeployStats();
} }
} }
impl->mutex_.Unlock();
if (s.ok() && options.compaction_style == kCompactionStyleUniversal) { if (s.ok() && impl->options_.compaction_style == kCompactionStyleUniversal) {
int num_files; Version* current = impl->versions_->current();
for (int i = 1; i < impl->NumberLevels(); i++) { for (int i = 1; i < impl->NumberLevels(); i++) {
num_files = impl->versions_->NumLevelFiles(i); int num_files = current->NumLevelFiles(i);
if (num_files > 0) { if (num_files > 0) {
s = Status::InvalidArgument("Not all files are at level 0. Cannot " s = Status::InvalidArgument("Not all files are at level 0. Cannot "
"open with universal compaction style."); "open with universal compaction style.");
@ -3963,6 +3986,8 @@ Status DB::OpenWithColumnFamilies(
} }
} }
impl->mutex_.Unlock();
if (s.ok()) { if (s.ok()) {
*dbptr = impl; *dbptr = impl;
} else { } else {

@ -22,6 +22,7 @@
#include "port/port.h" #include "port/port.h"
#include "util/stats_logger.h" #include "util/stats_logger.h"
#include "memtablelist.h" #include "memtablelist.h"
#include "util/autovector.h"
namespace rocksdb { namespace rocksdb {
@ -125,10 +126,17 @@ class DBImpl : public DB {
virtual Status GetDbIdentity(std::string& identity); virtual Status GetDbIdentity(std::string& identity);
// Runs a manual compaction from input_level to output_level over the key
// range bounded by begin and end.
// NOTE(review): a nullptr begin/end presumably means "unbounded on that
// side" -- confirm against the definition in db_impl.cc.
void RunManualCompaction(int input_level,
int output_level,
const Slice* begin,
const Slice* end);
// Extra methods (for testing) that are not in the public DB interface // Extra methods (for testing) that are not in the public DB interface
// Compact any files in the named level that overlap [*begin, *end] // Compact any files in the named level that overlap [*begin, *end]
void TEST_CompactRange(int level, const Slice* begin, const Slice* end); void TEST_CompactRange(int level,
const Slice* begin,
const Slice* end);
// Force current memtable contents to be flushed. // Force current memtable contents to be flushed.
Status TEST_FlushMemTable(); Status TEST_FlushMemTable();
@ -158,7 +166,7 @@ class DBImpl : public DB {
void TEST_PurgeObsoleteteWAL(); void TEST_PurgeObsoleteteWAL();
// get total level0 file size. Only for testing. // get total level0 file size. Only for testing.
uint64_t TEST_GetLevel0TotalSize() { return versions_->NumLevelBytes(0);} uint64_t TEST_GetLevel0TotalSize();
void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL) void TEST_SetDefaultTimeToCheck(uint64_t default_interval_to_delete_obsolete_WAL)
{ {
@ -324,13 +332,14 @@ class DBImpl : public DB {
Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit, Status WriteLevel0Table(std::vector<MemTable*> &mems, VersionEdit* edit,
uint64_t* filenumber); uint64_t* filenumber);
uint64_t SlowdownAmount(int n, int top, int bottom); uint64_t SlowdownAmount(int n, double bottom, double top);
// MakeRoomForWrite will return superversion_to_free through an argument, // MakeRoomForWrite will return superversion_to_free through an argument,
// which the caller needs to delete. We do it because caller can delete // which the caller needs to delete. We do it because caller can delete
// the superversion outside of mutex // the superversion outside of mutex
Status MakeRoomForWrite(bool force /* compact even if there is room? */, Status MakeRoomForWrite(bool force /* compact even if there is room? */,
SuperVersion** superversion_to_free); SuperVersion** superversion_to_free);
WriteBatch* BuildBatchGroup(Writer** last_writer); void BuildBatchGroup(Writer** last_writer,
autovector<WriteBatch*>* write_batch_group);
// Force current memtable contents to be flushed. // Force current memtable contents to be flushed.
Status FlushMemTable(const FlushOptions& options); Status FlushMemTable(const FlushOptions& options);
@ -443,7 +452,8 @@ class DBImpl : public DB {
// Information for a manual compaction // Information for a manual compaction
struct ManualCompaction { struct ManualCompaction {
int level; int input_level;
int output_level;
bool done; bool done;
bool in_progress; // compaction request being processed? bool in_progress; // compaction request being processed?
const InternalKey* begin; // nullptr means beginning of key range const InternalKey* begin; // nullptr means beginning of key range

@ -85,7 +85,7 @@ Status DB::OpenForReadOnly(const Options& options, const std::string& dbname,
DBImplReadOnly* impl = new DBImplReadOnly(options, dbname); DBImplReadOnly* impl = new DBImplReadOnly(options, dbname);
impl->mutex_.Lock(); impl->mutex_.Lock();
VersionEdit edit(impl->NumberLevels()); VersionEdit edit;
DBOptions db_options(options); DBOptions db_options(options);
ColumnFamilyOptions cf_options(options); ColumnFamilyOptions cf_options(options);
std::vector<ColumnFamilyDescriptor> column_families; std::vector<ColumnFamilyDescriptor> column_families;

@ -1,14 +0,0 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
#include "db/db_statistics.h"
namespace rocksdb {
// Factory for the default statistics collector: builds a fresh DBStatistics
// and hands it back through the abstract Statistics interface.
std::shared_ptr<Statistics> CreateDBStatistics() {
  std::shared_ptr<Statistics> stats = std::make_shared<DBStatistics>();
  return stats;
}
} // namespace rocksdb

@ -1,63 +0,0 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
#pragma once
#include <cassert>
#include <stdlib.h>
#include <vector>
#include <memory>
#include "rocksdb/statistics.h"
#include "util/histogram.h"
#include "port/port.h"
#include "util/mutexlock.h"
namespace rocksdb {
// Statistics implementation backed by two vectors: one Ticker per Tickers
// enum value and one HistogramImpl per Histograms enum value. Every accessor
// asserts that the enum value is in range and then forwards to the slot
// stored at that index.
class DBStatistics: public Statistics {
 public:
  // Sizes both vectors so that every ticker / histogram enum value below the
  // respective *_ENUM_MAX has its own slot.
  DBStatistics()
      : allTickers_(TICKER_ENUM_MAX),
        allHistograms_(HISTOGRAM_ENUM_MAX) {}

  virtual ~DBStatistics() {}

  // Returns whatever the ticker slot for `tickerType` reports as its count.
  virtual long getTickerCount(Tickers tickerType) {
    assert(tickerType < TICKER_ENUM_MAX);
    auto& ticker = allTickers_[tickerType];
    return ticker.getCount();
  }

  // Forwards `count` to setTickerCount() on the slot for `tickerType`.
  virtual void setTickerCount(Tickers tickerType, uint64_t count) {
    assert(tickerType < TICKER_ENUM_MAX);
    auto& ticker = allTickers_[tickerType];
    ticker.setTickerCount(count);
  }

  // Forwards `count` to recordTick() on the slot for `tickerType`.
  virtual void recordTick(Tickers tickerType, uint64_t count) {
    assert(tickerType < TICKER_ENUM_MAX);
    auto& ticker = allTickers_[tickerType];
    ticker.recordTick(count);
  }

  // Adds the sample `value` to the histogram slot for `histogramType`.
  virtual void measureTime(Histograms histogramType, uint64_t value) {
    assert(histogramType < HISTOGRAM_ENUM_MAX);
    auto& histogram = allHistograms_[histogramType];
    histogram.Add(value);
  }

  // Asks the histogram slot for `histogramType` to fill in `*data`.
  virtual void histogramData(Histograms histogramType,
                             HistogramData * const data) {
    assert(histogramType < HISTOGRAM_ENUM_MAX);
    auto& histogram = allHistograms_[histogramType];
    histogram.Data(data);
  }

  // Indexed by Tickers enum value.
  std::vector<Ticker> allTickers_;
  // Indexed by Histograms enum value.
  std::vector<HistogramImpl> allHistograms_;
};
std::shared_ptr<Statistics> CreateDBStatistics();
} // namespace rocksdb

@ -65,13 +65,14 @@ void DBImpl::LogDBDeployStats() {
uint64_t file_total_size = 0; uint64_t file_total_size = 0;
uint32_t file_total_num = 0; uint32_t file_total_num = 0;
for (int i = 0; i < versions_->NumberLevels(); i++) { Version* current = versions_->current();
file_total_num += versions_->NumLevelFiles(i); for (int i = 0; i < current->NumberLevels(); i++) {
file_total_size += versions_->NumLevelBytes(i); file_total_num += current->NumLevelFiles(i);
file_total_size += current->NumLevelBytes(i);
} }
VersionSet::LevelSummaryStorage scratch; Version::LevelSummaryStorage scratch;
const char* file_num_summary = versions_->LevelSummary(&scratch); const char* file_num_summary = current->LevelSummary(&scratch);
std::string file_num_per_level(file_num_summary); std::string file_num_per_level(file_num_summary);
std::string data_size_per_level(file_num_summary); std::string data_size_per_level(file_num_summary);

@ -17,7 +17,6 @@
#include "db/filename.h" #include "db/filename.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "db/db_statistics.h"
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"
@ -27,6 +26,7 @@
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/testharness.h" #include "util/testharness.h"
#include "util/testutil.h" #include "util/testutil.h"
#include "util/statistics.h"
#include "utilities/merge_operators.h" #include "utilities/merge_operators.h"
namespace rocksdb { namespace rocksdb {
@ -680,6 +680,10 @@ static std::string Key(int i) {
return std::string(buf); return std::string(buf);
} }
// Test helper: reads the current value of `ticker_type` from the statistics
// object attached to `options`.
static long TestGetTickerCount(const Options& options, Tickers ticker_type) {
  const auto& stats = options.statistics;
  return stats->getTickerCount(ticker_type);
}
TEST(DBTest, Empty) { TEST(DBTest, Empty) {
do { do {
ASSERT_TRUE(db_ != nullptr); ASSERT_TRUE(db_ != nullptr);
@ -713,14 +717,11 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
dbfull()->Flush(FlushOptions()); dbfull()->Flush(FlushOptions());
// index/filter blocks added to block cache right after table creation. // index/filter blocks added to block cache right after table creation.
ASSERT_EQ(1, ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_INDEX_MISS));
options.statistics.get()->getTickerCount(BLOCK_CACHE_INDEX_MISS)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
ASSERT_EQ(1,
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS));
ASSERT_EQ(2, /* only index/filter were added */ ASSERT_EQ(2, /* only index/filter were added */
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD)); TestGetTickerCount(options, BLOCK_CACHE_ADD));
ASSERT_EQ(0, ASSERT_EQ(0, TestGetTickerCount(options, BLOCK_CACHE_DATA_MISS));
options.statistics.get()->getTickerCount(BLOCK_CACHE_DATA_MISS));
// Make sure filter block is in cache. // Make sure filter block is in cache.
std::string value; std::string value;
@ -728,31 +729,24 @@ TEST(DBTest, IndexAndFilterBlocksOfNewTableAddedToCache) {
db_->KeyMayExist(ReadOptions(), "key", &value); db_->KeyMayExist(ReadOptions(), "key", &value);
// Miss count should remain the same. // Miss count should remain the same.
ASSERT_EQ(1, ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
ASSERT_EQ(1,
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT));
db_->KeyMayExist(ReadOptions(), "key", &value); db_->KeyMayExist(ReadOptions(), "key", &value);
ASSERT_EQ(1, ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS)); ASSERT_EQ(2, TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
ASSERT_EQ(2,
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT));
// Make sure index block is in cache. // Make sure index block is in cache.
auto index_block_hit = auto index_block_hit = TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT);
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT);
value = Get("key"); value = Get("key");
ASSERT_EQ(1, ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS));
ASSERT_EQ(index_block_hit + 1, ASSERT_EQ(index_block_hit + 1,
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
value = Get("key"); value = Get("key");
ASSERT_EQ(1, ASSERT_EQ(1, TestGetTickerCount(options, BLOCK_CACHE_FILTER_MISS));
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_MISS));
ASSERT_EQ(index_block_hit + 2, ASSERT_EQ(index_block_hit + 2,
options.statistics.get()->getTickerCount(BLOCK_CACHE_FILTER_HIT)); TestGetTickerCount(options, BLOCK_CACHE_FILTER_HIT));
} }
TEST(DBTest, LevelLimitReopen) { TEST(DBTest, LevelLimitReopen) {
@ -768,10 +762,9 @@ TEST(DBTest, LevelLimitReopen) {
options.num_levels = 1; options.num_levels = 1;
options.max_bytes_for_level_multiplier_additional.resize(1, 1); options.max_bytes_for_level_multiplier_additional.resize(1, 1);
Status s = TryReopen(&options); Status s = TryReopen(&options);
ASSERT_EQ(s.IsCorruption(), true); ASSERT_EQ(s.IsInvalidArgument(), true);
ASSERT_EQ(s.ToString(), ASSERT_EQ(s.ToString(),
"Corruption: VersionEdit: column family already has " "Invalid argument: db has more levels than options.num_levels");
"more levels than specified");
options.num_levels = 10; options.num_levels = 10;
options.max_bytes_for_level_multiplier_additional.resize(10, 1); options.max_bytes_for_level_multiplier_additional.resize(10, 1);
@ -968,47 +961,39 @@ TEST(DBTest, KeyMayExist) {
dbfull()->Flush(FlushOptions()); dbfull()->Flush(FlushOptions());
value.clear(); value.clear();
long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); long numopen = TestGetTickerCount(options, NO_FILE_OPENS);
long cache_added = long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found)); ASSERT_TRUE(db_->KeyMayExist(ropts, "a", &value, &value_found));
ASSERT_TRUE(!value_found); ASSERT_TRUE(!value_found);
// assert that no new files were opened and no new blocks were // assert that no new files were opened and no new blocks were
// read into block cache. // read into block cache.
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
ASSERT_OK(db_->Delete(WriteOptions(), "a")); ASSERT_OK(db_->Delete(WriteOptions(), "a"));
numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); numopen = TestGetTickerCount(options, NO_FILE_OPENS);
cache_added = cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value));
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
dbfull()->Flush(FlushOptions()); dbfull()->Flush(FlushOptions());
dbfull()->CompactRange(nullptr, nullptr); dbfull()->CompactRange(nullptr, nullptr);
numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); numopen = TestGetTickerCount(options, NO_FILE_OPENS);
cache_added = cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value)); ASSERT_TRUE(!db_->KeyMayExist(ropts, "a", &value));
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
ASSERT_OK(db_->Delete(WriteOptions(), "c")); ASSERT_OK(db_->Delete(WriteOptions(), "c"));
numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); numopen = TestGetTickerCount(options, NO_FILE_OPENS);
cache_added = cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value)); ASSERT_TRUE(!db_->KeyMayExist(ropts, "c", &value));
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
delete options.filter_policy; delete options.filter_policy;
} while (ChangeOptions()); } while (ChangeOptions());
@ -1041,9 +1026,8 @@ TEST(DBTest, NonBlockingIteration) {
// verify that a non-blocking iterator does not find any // verify that a non-blocking iterator does not find any
// kvs. Neither does it do any IOs to storage. // kvs. Neither does it do any IOs to storage.
long numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); long numopen = TestGetTickerCount(options, NO_FILE_OPENS);
long cache_added = long cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
iter = db_->NewIterator(non_blocking_opts); iter = db_->NewIterator(non_blocking_opts);
count = 0; count = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
@ -1051,18 +1035,16 @@ TEST(DBTest, NonBlockingIteration) {
} }
ASSERT_EQ(count, 0); ASSERT_EQ(count, 0);
ASSERT_TRUE(iter->status().IsIncomplete()); ASSERT_TRUE(iter->status().IsIncomplete());
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
delete iter; delete iter;
// read in the specified block via a regular get // read in the specified block via a regular get
ASSERT_EQ(Get("a"), "b"); ASSERT_EQ(Get("a"), "b");
// verify that we can find it via a non-blocking scan // verify that we can find it via a non-blocking scan
numopen = options.statistics.get()->getTickerCount(NO_FILE_OPENS); numopen = TestGetTickerCount(options, NO_FILE_OPENS);
cache_added = cache_added = TestGetTickerCount(options, BLOCK_CACHE_ADD);
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD);
iter = db_->NewIterator(non_blocking_opts); iter = db_->NewIterator(non_blocking_opts);
count = 0; count = 0;
for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { for (iter->SeekToFirst(); iter->Valid(); iter->Next()) {
@ -1070,9 +1052,8 @@ TEST(DBTest, NonBlockingIteration) {
count++; count++;
} }
ASSERT_EQ(count, 1); ASSERT_EQ(count, 1);
ASSERT_EQ(numopen, options.statistics.get()->getTickerCount(NO_FILE_OPENS)); ASSERT_EQ(numopen, TestGetTickerCount(options, NO_FILE_OPENS));
ASSERT_EQ(cache_added, ASSERT_EQ(cache_added, TestGetTickerCount(options, BLOCK_CACHE_ADD));
options.statistics.get()->getTickerCount(BLOCK_CACHE_ADD));
delete iter; delete iter;
} while (ChangeOptions()); } while (ChangeOptions());
@ -1277,12 +1258,10 @@ TEST(DBTest, IterReseek) {
ASSERT_OK(Put("b", "bone")); ASSERT_OK(Put("b", "bone"));
Iterator* iter = db_->NewIterator(ReadOptions()); Iterator* iter = db_->NewIterator(ReadOptions());
iter->SeekToFirst(); iter->SeekToFirst();
ASSERT_EQ(options.statistics.get()->getTickerCount( ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
ASSERT_EQ(IterStatus(iter), "a->two"); ASSERT_EQ(IterStatus(iter), "a->two");
iter->Next(); iter->Next();
ASSERT_EQ(options.statistics.get()->getTickerCount( ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
ASSERT_EQ(IterStatus(iter), "b->bone"); ASSERT_EQ(IterStatus(iter), "b->bone");
delete iter; delete iter;
@ -1293,8 +1272,7 @@ TEST(DBTest, IterReseek) {
iter->SeekToFirst(); iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->three"); ASSERT_EQ(IterStatus(iter), "a->three");
iter->Next(); iter->Next();
ASSERT_EQ(options.statistics.get()->getTickerCount( ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
ASSERT_EQ(IterStatus(iter), "b->bone"); ASSERT_EQ(IterStatus(iter), "b->bone");
delete iter; delete iter;
@ -1304,30 +1282,28 @@ TEST(DBTest, IterReseek) {
iter = db_->NewIterator(ReadOptions()); iter = db_->NewIterator(ReadOptions());
iter->SeekToFirst(); iter->SeekToFirst();
ASSERT_EQ(IterStatus(iter), "a->four"); ASSERT_EQ(IterStatus(iter), "a->four");
ASSERT_EQ(options.statistics.get()->getTickerCount( ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 0);
NUMBER_OF_RESEEKS_IN_ITERATION), 0);
iter->Next(); iter->Next();
ASSERT_EQ(options.statistics.get()->getTickerCount( ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION), 1);
NUMBER_OF_RESEEKS_IN_ITERATION), 1);
ASSERT_EQ(IterStatus(iter), "b->bone"); ASSERT_EQ(IterStatus(iter), "b->bone");
delete iter; delete iter;
// Testing reverse iterator // Testing reverse iterator
// At this point, we have three versions of "a" and one version of "b". // At this point, we have three versions of "a" and one version of "b".
// The reseek statistics is already at 1. // The reseek statistics is already at 1.
int num_reseeks = (int)options.statistics.get()->getTickerCount( int num_reseeks =
NUMBER_OF_RESEEKS_IN_ITERATION); (int)TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION);
// Insert another version of b and assert that reseek is not invoked // Insert another version of b and assert that reseek is not invoked
ASSERT_OK(Put("b", "btwo")); ASSERT_OK(Put("b", "btwo"));
iter = db_->NewIterator(ReadOptions()); iter = db_->NewIterator(ReadOptions());
iter->SeekToLast(); iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "b->btwo"); ASSERT_EQ(IterStatus(iter), "b->btwo");
ASSERT_EQ(options.statistics.get()->getTickerCount( ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks); num_reseeks);
iter->Prev(); iter->Prev();
ASSERT_EQ(options.statistics.get()->getTickerCount( ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks+1); num_reseeks + 1);
ASSERT_EQ(IterStatus(iter), "a->four"); ASSERT_EQ(IterStatus(iter), "a->four");
delete iter; delete iter;
@ -1338,13 +1314,13 @@ TEST(DBTest, IterReseek) {
iter = db_->NewIterator(ReadOptions()); iter = db_->NewIterator(ReadOptions());
iter->SeekToLast(); iter->SeekToLast();
ASSERT_EQ(IterStatus(iter), "b->bfour"); ASSERT_EQ(IterStatus(iter), "b->bfour");
ASSERT_EQ(options.statistics.get()->getTickerCount( ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 2); num_reseeks + 2);
iter->Prev(); iter->Prev();
// the previous Prev call should have invoked reseek // the previous Prev call should have invoked reseek
ASSERT_EQ(options.statistics.get()->getTickerCount( ASSERT_EQ(TestGetTickerCount(options, NUMBER_OF_RESEEKS_IN_ITERATION),
NUMBER_OF_RESEEKS_IN_ITERATION), num_reseeks + 3); num_reseeks + 3);
ASSERT_EQ(IterStatus(iter), "a->four"); ASSERT_EQ(IterStatus(iter), "a->four");
delete iter; delete iter;
} }
@ -2107,24 +2083,18 @@ TEST(DBTest, CompressedCache) {
switch (iter) { switch (iter) {
case 0: case 0:
// only uncompressed block cache // only uncompressed block cache
ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
0); ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
ASSERT_EQ(options.statistics.get()->getTickerCount
(BLOCK_CACHE_COMPRESSED_MISS), 0);
break; break;
case 1: case 1:
// no block cache, only compressed cache // no block cache, only compressed cache
ASSERT_EQ(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), ASSERT_EQ(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
0); ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
ASSERT_GT(options.statistics.get()->getTickerCount
(BLOCK_CACHE_COMPRESSED_MISS), 0);
break; break;
case 2: case 2:
// both compressed and uncompressed block cache // both compressed and uncompressed block cache
ASSERT_GT(options.statistics.get()->getTickerCount(BLOCK_CACHE_MISS), ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_MISS), 0);
0); ASSERT_GT(TestGetTickerCount(options, BLOCK_CACHE_COMPRESSED_MISS), 0);
ASSERT_GT(options.statistics.get()->getTickerCount
(BLOCK_CACHE_COMPRESSED_MISS), 0);
break; break;
default: default:
ASSERT_TRUE(false); ASSERT_TRUE(false);
@ -3313,34 +3283,46 @@ TEST(DBTest, ManualCompaction) {
ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2) ASSERT_EQ(dbfull()->MaxMemCompactionLevel(), 2)
<< "Need to update this test to match kMaxMemCompactLevel"; << "Need to update this test to match kMaxMemCompactLevel";
MakeTables(3, "p", "q"); // iter - 0 with 7 levels
ASSERT_EQ("1,1,1", FilesPerLevel()); // iter - 1 with 3 levels
for (int iter = 0; iter < 2; ++iter) {
MakeTables(3, "p", "q");
ASSERT_EQ("1,1,1", FilesPerLevel());
// Compaction range falls before files
Compact("", "c");
ASSERT_EQ("1,1,1", FilesPerLevel());
// Compaction range falls before files // Compaction range falls after files
Compact("", "c"); Compact("r", "z");
ASSERT_EQ("1,1,1", FilesPerLevel()); ASSERT_EQ("1,1,1", FilesPerLevel());
// Compaction range falls after files // Compaction range overlaps files
Compact("r", "z"); Compact("p1", "p9");
ASSERT_EQ("1,1,1", FilesPerLevel()); ASSERT_EQ("0,0,1", FilesPerLevel());
// Compaction range overlaps files // Populate a different range
Compact("p1", "p9"); MakeTables(3, "c", "e");
ASSERT_EQ("0,0,1", FilesPerLevel()); ASSERT_EQ("1,1,2", FilesPerLevel());
// Populate a different range // Compact just the new range
MakeTables(3, "c", "e"); Compact("b", "f");
ASSERT_EQ("1,1,2", FilesPerLevel()); ASSERT_EQ("0,0,2", FilesPerLevel());
// Compact just the new range // Compact all
Compact("b", "f"); MakeTables(1, "a", "z");
ASSERT_EQ("0,0,2", FilesPerLevel()); ASSERT_EQ("0,1,2", FilesPerLevel());
db_->CompactRange(nullptr, nullptr);
ASSERT_EQ("0,0,1", FilesPerLevel());
if (iter == 0) {
Options options = CurrentOptions();
options.num_levels = 3;
options.create_if_missing = true;
DestroyAndReopen(&options);
}
}
// Compact all
MakeTables(1, "a", "z");
ASSERT_EQ("0,1,2", FilesPerLevel());
db_->CompactRange(nullptr, nullptr);
ASSERT_EQ("0,0,1", FilesPerLevel());
} }
TEST(DBTest, DBOpen_Options) { TEST(DBTest, DBOpen_Options) {
@ -3401,7 +3383,7 @@ TEST(DBTest, DBOpen_Change_NumLevels) {
opts.create_if_missing = false; opts.create_if_missing = false;
opts.num_levels = 2; opts.num_levels = 2;
s = DB::Open(opts, dbname, &db); s = DB::Open(opts, dbname, &db);
ASSERT_TRUE(strstr(s.ToString().c_str(), "Corruption") != nullptr); ASSERT_TRUE(strstr(s.ToString().c_str(), "Invalid argument") != nullptr);
ASSERT_TRUE(db == nullptr); ASSERT_TRUE(db == nullptr);
} }
@ -4336,6 +4318,70 @@ TEST(DBTest, MultiThreaded) {
} while (ChangeOptions()); } while (ChangeOptions());
} }
// Group commit test:
namespace {
static const int kGCNumThreads = 4;
static const int kGCNumKeys = 1000;
struct GCThread {
DB* db;
int id;
std::atomic<bool> done;
};
static void GCThreadBody(void* arg) {
GCThread* t = reinterpret_cast<GCThread*>(arg);
int id = t->id;
DB* db = t->db;
WriteOptions wo;
for (int i = 0; i < kGCNumKeys; ++i) {
std::string kv(std::to_string(i + id * kGCNumKeys));
ASSERT_OK(db->Put(wo, kv, kv));
}
t->done = true;
}
} // namespace
// Runs kGCNumThreads concurrent writers against the DB and then checks, via a
// full scan, that the DB holds exactly the expected key/value pairs. Repeated
// for every option configuration ChangeOptions() cycles through.
TEST(DBTest, GroupCommitTest) {
  do {
    // Launch the writer threads.
    GCThread workers[kGCNumThreads];
    for (int i = 0; i < kGCNumThreads; ++i) {
      GCThread& worker = workers[i];
      worker.id = i;
      worker.db = db_;
      worker.done = false;
      env_->StartThread(GCThreadBody, &worker);
    }

    // Poll each worker's completion flag, sleeping between checks.
    for (int i = 0; i < kGCNumThreads; ++i) {
      while (!workers[i].done) {
        env_->SleepForMicroseconds(100000);
      }
    }

    // Every integer in [0, kGCNumThreads * kGCNumKeys) as a decimal string,
    // sorted as strings so it can be compared against the scan order below.
    std::vector<std::string> expected_db;
    const int total_keys = kGCNumThreads * kGCNumKeys;
    for (int n = 0; n < total_keys; ++n) {
      expected_db.push_back(std::to_string(n));
    }
    std::sort(expected_db.begin(), expected_db.end());

    // Walk the DB from the first entry and compare it, position by position,
    // with the expected list; key and value must both equal the expected
    // string.
    Iterator* itr = db_->NewIterator(ReadOptions());
    itr->SeekToFirst();
    for (size_t pos = 0; pos < expected_db.size(); ++pos) {
      const std::string& want = expected_db[pos];
      ASSERT_TRUE(itr->Valid());
      ASSERT_EQ(itr->key().ToString(), want);
      ASSERT_EQ(itr->value().ToString(), want);
      itr->Next();
    }
    // Nothing may be left in the DB beyond the expected entries.
    ASSERT_TRUE(!itr->Valid());
    delete itr;

  } while (ChangeOptions());
}
namespace { namespace {
typedef std::map<std::string, std::string> KVMap; typedef std::map<std::string, std::string> KVMap;
} }
@ -4903,7 +4949,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
EnvOptions sopt; EnvOptions sopt;
VersionSet vset(dbname, &options, sopt, nullptr, &cmp); VersionSet vset(dbname, &options, sopt, nullptr, &cmp);
ASSERT_OK(vset.Recover()); ASSERT_OK(vset.Recover());
VersionEdit vbase(vset.NumberLevels()); VersionEdit vbase;
uint64_t fnum = 1; uint64_t fnum = 1;
for (int i = 0; i < num_base_files; i++) { for (int i = 0; i < num_base_files; i++) {
InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
@ -4915,7 +4961,7 @@ void BM_LogAndApply(int iters, int num_base_files) {
uint64_t start_micros = env->NowMicros(); uint64_t start_micros = env->NowMicros();
for (int i = 0; i < iters; i++) { for (int i = 0; i < iters; i++) {
VersionEdit vedit(vset.NumberLevels()); VersionEdit vedit;
vedit.DeleteFile(2, fnum); vedit.DeleteFile(2, fnum);
InternalKey start(MakeKey(2*fnum), 1, kTypeValue); InternalKey start(MakeKey(2*fnum), 1, kTypeValue);
InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion); InternalKey limit(MakeKey(2*fnum+1), 1, kTypeDeletion);

@ -20,7 +20,7 @@
#include "util/coding.h" #include "util/coding.h"
#include "util/mutexlock.h" #include "util/mutexlock.h"
#include "util/murmurhash.h" #include "util/murmurhash.h"
#include "util/statistics_imp.h" #include "util/statistics.h"
namespace std { namespace std {
template <> template <>
@ -33,24 +33,20 @@ struct hash<rocksdb::Slice> {
namespace rocksdb { namespace rocksdb {
MemTable::MemTable(const InternalKeyComparator& cmp, MemTable::MemTable(const InternalKeyComparator& cmp, const Options& options)
MemTableRepFactory* table_factory,
int numlevel,
const Options& options)
: comparator_(cmp), : comparator_(cmp),
refs_(0), refs_(0),
arena_impl_(options.arena_block_size), arena_impl_(options.arena_block_size),
table_(table_factory->CreateMemTableRep(comparator_, &arena_impl_)), table_(options.memtable_factory->CreateMemTableRep(comparator_,
&arena_impl_)),
flush_in_progress_(false), flush_in_progress_(false),
flush_completed_(false), flush_completed_(false),
file_number_(0), file_number_(0),
edit_(numlevel),
first_seqno_(0), first_seqno_(0),
mem_next_logfile_number_(0), mem_next_logfile_number_(0),
mem_logfile_number_(0), mem_logfile_number_(0),
locks_(options.inplace_update_support locks_(options.inplace_update_support ? options.inplace_update_num_locks
? options.inplace_update_num_locks : 0) {}
: 0) { }
MemTable::~MemTable() { MemTable::~MemTable() {
assert(refs_ == 0); assert(refs_ == 0);
@ -58,7 +54,7 @@ MemTable::~MemTable() {
size_t MemTable::ApproximateMemoryUsage() { size_t MemTable::ApproximateMemoryUsage() {
return arena_impl_.ApproximateMemoryUsage() + return arena_impl_.ApproximateMemoryUsage() +
table_->ApproximateMemoryUsage(); table_->ApproximateMemoryUsage();
} }
int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr) int MemTable::KeyComparator::operator()(const char* aptr, const char* bptr)
@ -89,11 +85,11 @@ class MemTableIterator: public Iterator {
MemTableIterator(MemTableRep* table, const ReadOptions& options) MemTableIterator(MemTableRep* table, const ReadOptions& options)
: iter_() { : iter_() {
if (options.prefix) { if (options.prefix) {
iter_ = table->GetPrefixIterator(*options.prefix); iter_.reset(table->GetPrefixIterator(*options.prefix));
} else if (options.prefix_seek) { } else if (options.prefix_seek) {
iter_ = table->GetDynamicPrefixIterator(); iter_.reset(table->GetDynamicPrefixIterator());
} else { } else {
iter_ = table->GetIterator(); iter_.reset(table->GetIterator());
} }
} }
@ -114,7 +110,7 @@ class MemTableIterator: public Iterator {
virtual Status status() const { return Status::OK(); } virtual Status status() const { return Status::OK(); }
private: private:
std::shared_ptr<MemTableRep::Iterator> iter_; std::unique_ptr<MemTableRep::Iterator> iter_;
std::string tmp_; // For passing to EncodeKey std::string tmp_; // For passing to EncodeKey
// No copying allowed // No copying allowed
@ -165,8 +161,8 @@ void MemTable::Add(SequenceNumber s, ValueType type,
bool MemTable::Get(const LookupKey& key, std::string* value, Status* s, bool MemTable::Get(const LookupKey& key, std::string* value, Status* s,
MergeContext& merge_context, const Options& options) { MergeContext& merge_context, const Options& options) {
Slice memkey = key.memtable_key(); Slice memkey = key.memtable_key();
std::shared_ptr<MemTableRep::Iterator> iter( std::unique_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(key.user_key())); table_->GetIterator(key.user_key()));
iter->Seek(memkey.data()); iter->Seek(memkey.data());
bool merge_in_progress = s->IsMergeInProgress(); bool merge_in_progress = s->IsMergeInProgress();
@ -274,8 +270,8 @@ bool MemTable::Update(SequenceNumber seq, ValueType type,
LookupKey lkey(key, seq); LookupKey lkey(key, seq);
Slice memkey = lkey.memtable_key(); Slice memkey = lkey.memtable_key();
std::shared_ptr<MemTableRep::Iterator> iter( std::unique_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(lkey.user_key())); table_->GetIterator(lkey.user_key()));
iter->Seek(memkey.data()); iter->Seek(memkey.data());
if (iter->Valid()) { if (iter->Valid()) {
@ -336,8 +332,8 @@ size_t MemTable::CountSuccessiveMergeEntries(const LookupKey& key) {
// A total ordered iterator is costly for some memtablerep (prefix aware // A total ordered iterator is costly for some memtablerep (prefix aware
// reps). By passing in the user key, we allow efficient iterator creation. // reps). By passing in the user key, we allow efficient iterator creation.
// The iterator only needs to be ordered within the same user key. // The iterator only needs to be ordered within the same user key.
std::shared_ptr<MemTableRep::Iterator> iter( std::unique_ptr<MemTableRep::Iterator> iter(
table_->GetIterator(key.user_key())); table_->GetIterator(key.user_key()));
iter->Seek(memkey.data()); iter->Seek(memkey.data());
size_t num_successive_merges = 0; size_t num_successive_merges = 0;

@ -34,11 +34,8 @@ class MemTable {
// MemTables are reference counted. The initial reference count // MemTables are reference counted. The initial reference count
// is zero and the caller must call Ref() at least once. // is zero and the caller must call Ref() at least once.
explicit MemTable( explicit MemTable(const InternalKeyComparator& comparator,
const InternalKeyComparator& comparator, const Options& options = Options());
MemTableRepFactory* table_factory,
int numlevel = 7,
const Options& options = Options());
~MemTable(); ~MemTable();
@ -146,7 +143,7 @@ class MemTable {
KeyComparator comparator_; KeyComparator comparator_;
int refs_; int refs_;
ArenaImpl arena_impl_; ArenaImpl arena_impl_;
shared_ptr<MemTableRep> table_; unique_ptr<MemTableRep> table_;
// These are used to manage memtable flushes to storage // These are used to manage memtable flushes to storage
bool flush_in_progress_; // started the flush bool flush_in_progress_; // started the flush

@ -8,7 +8,7 @@
#include "rocksdb/comparator.h" #include "rocksdb/comparator.h"
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/merge_operator.h" #include "rocksdb/merge_operator.h"
#include "util/statistics_imp.h" #include "util/statistics.h"
#include <string> #include <string>
#include <stdio.h> #include <stdio.h>

@ -58,7 +58,7 @@ class Repairer {
next_file_number_(1) { next_file_number_(1) {
// TableCache can be small since we expect each table to be opened once. // TableCache can be small since we expect each table to be opened once.
table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10); table_cache_ = new TableCache(dbname_, &options_, storage_options_, 10);
edit_ = new VersionEdit(options.num_levels); edit_ = new VersionEdit();
} }
~Repairer() { ~Repairer() {
@ -196,8 +196,7 @@ class Repairer {
std::string scratch; std::string scratch;
Slice record; Slice record;
WriteBatch batch; WriteBatch batch;
MemTable* mem = new MemTable(icmp_, options_.memtable_factory.get(), MemTable* mem = new MemTable(icmp_, options_);
options_.num_levels);
mem->Ref(); mem->Ref();
int counter = 0; int counter = 0;
while (reader.ReadRecord(&record, &scratch)) { while (reader.ReadRecord(&record, &scratch)) {

@ -17,7 +17,7 @@
#include "db/filename.h" #include "db/filename.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "db/db_statistics.h" #include "rocksdb/statistics.h"
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "rocksdb/compaction_filter.h" #include "rocksdb/compaction_filter.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"

@ -38,6 +38,7 @@ enum Tag {
void VersionEdit::Clear() { void VersionEdit::Clear() {
comparator_.clear(); comparator_.clear();
max_level_ = 0;
log_number_ = 0; log_number_ = 0;
prev_log_number_ = 0; prev_log_number_ = 0;
last_sequence_ = 0; last_sequence_ = 0;
@ -77,12 +78,6 @@ void VersionEdit::EncodeTo(std::string* dst) const {
PutVarint64(dst, last_sequence_); PutVarint64(dst, last_sequence_);
} }
for (size_t i = 0; i < compact_pointers_.size(); i++) {
PutVarint32(dst, kCompactPointer);
PutVarint32(dst, compact_pointers_[i].first); // level
PutLengthPrefixedSlice(dst, compact_pointers_[i].second.Encode());
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end(); iter != deleted_files_.end();
++iter) { ++iter) {
@ -131,14 +126,13 @@ static bool GetInternalKey(Slice* input, InternalKey* dst) {
bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) { bool VersionEdit::GetLevel(Slice* input, int* level, const char** msg) {
uint32_t v; uint32_t v;
if (GetVarint32(input, &v) && if (GetVarint32(input, &v)) {
(int)v < number_levels_) {
*level = v; *level = v;
if (max_level_ < *level) {
max_level_ = *level;
}
return true; return true;
} else { } else {
if ((int)v >= number_levels_) {
*msg = "column family already has more levels than specified";
}
return false; return false;
} }
} }
@ -202,7 +196,9 @@ Status VersionEdit::DecodeFrom(const Slice& src) {
case kCompactPointer: case kCompactPointer:
if (GetLevel(&input, &level, &msg) && if (GetLevel(&input, &level, &msg) &&
GetInternalKey(&input, &key)) { GetInternalKey(&input, &key)) {
compact_pointers_.push_back(std::make_pair(level, key)); // we don't use compact pointers anymore,
// but we should not fail if they are still
// in manifest
} else { } else {
if (!msg) { if (!msg) {
msg = "compaction pointer"; msg = "compaction pointer";
@ -314,12 +310,6 @@ std::string VersionEdit::DebugString(bool hex_key) const {
r.append("\n LastSeq: "); r.append("\n LastSeq: ");
AppendNumberTo(&r, last_sequence_); AppendNumberTo(&r, last_sequence_);
} }
for (size_t i = 0; i < compact_pointers_.size(); i++) {
r.append("\n CompactPointer: ");
AppendNumberTo(&r, compact_pointers_[i].first);
r.append(" ");
r.append(compact_pointers_[i].second.DebugString(hex_key));
}
for (DeletedFileSet::const_iterator iter = deleted_files_.begin(); for (DeletedFileSet::const_iterator iter = deleted_files_.begin();
iter != deleted_files_.end(); iter != deleted_files_.end();
++iter) { ++iter) {

@ -34,9 +34,7 @@ struct FileMetaData {
class VersionEdit { class VersionEdit {
public: public:
explicit VersionEdit(int number_levels) : number_levels_(number_levels) { VersionEdit() { Clear(); }
Clear();
}
~VersionEdit() { } ~VersionEdit() { }
void Clear(); void Clear();
@ -61,9 +59,6 @@ class VersionEdit {
has_last_sequence_ = true; has_last_sequence_ = true;
last_sequence_ = seq; last_sequence_ = seq;
} }
void SetCompactPointer(int level, const InternalKey& key) {
compact_pointers_.push_back(std::make_pair(level, key));
}
// Add the specified file at the specified number. // Add the specified file at the specified number.
// REQUIRES: This version has not been saved (see VersionSet::SaveTo) // REQUIRES: This version has not been saved (see VersionSet::SaveTo)
@ -128,7 +123,7 @@ class VersionEdit {
bool GetLevel(Slice* input, int* level, const char** msg); bool GetLevel(Slice* input, int* level, const char** msg);
int number_levels_; int max_level_;
std::string comparator_; std::string comparator_;
uint64_t log_number_; uint64_t log_number_;
uint64_t prev_log_number_; uint64_t prev_log_number_;
@ -140,7 +135,6 @@ class VersionEdit {
bool has_next_file_number_; bool has_next_file_number_;
bool has_last_sequence_; bool has_last_sequence_;
std::vector< std::pair<int, InternalKey> > compact_pointers_;
DeletedFileSet deleted_files_; DeletedFileSet deleted_files_;
std::vector< std::pair<int, FileMetaData> > new_files_; std::vector< std::pair<int, FileMetaData> > new_files_;

@ -15,7 +15,7 @@ namespace rocksdb {
static void TestEncodeDecode(const VersionEdit& edit) { static void TestEncodeDecode(const VersionEdit& edit) {
std::string encoded, encoded2; std::string encoded, encoded2;
edit.EncodeTo(&encoded); edit.EncodeTo(&encoded);
VersionEdit parsed(7); VersionEdit parsed;
Status s = parsed.DecodeFrom(encoded); Status s = parsed.DecodeFrom(encoded);
ASSERT_TRUE(s.ok()) << s.ToString(); ASSERT_TRUE(s.ok()) << s.ToString();
parsed.EncodeTo(&encoded2); parsed.EncodeTo(&encoded2);
@ -27,7 +27,7 @@ class VersionEditTest { };
TEST(VersionEditTest, EncodeDecode) { TEST(VersionEditTest, EncodeDecode) {
static const uint64_t kBig = 1ull << 50; static const uint64_t kBig = 1ull << 50;
VersionEdit edit(7); VersionEdit edit;
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
TestEncodeDecode(edit); TestEncodeDecode(edit);
edit.AddFile(3, kBig + 300 + i, kBig + 400 + i, edit.AddFile(3, kBig + 300 + i, kBig + 400 + i,
@ -36,7 +36,6 @@ TEST(VersionEditTest, EncodeDecode) {
kBig + 500 + i, kBig + 500 + i,
kBig + 600 + i); kBig + 600 + i);
edit.DeleteFile(4, kBig + 700 + i); edit.DeleteFile(4, kBig + 700 + i);
edit.SetCompactPointer(i, InternalKey("x", kBig + 900 + i, kTypeValue));
} }
edit.SetComparatorName("foo"); edit.SetComparatorName("foo");
@ -47,7 +46,7 @@ TEST(VersionEditTest, EncodeDecode) {
} }
TEST(VersionEditTest, ColumnFamilyTest) { TEST(VersionEditTest, ColumnFamilyTest) {
VersionEdit edit(7); VersionEdit edit;
edit.SetColumnFamily(2); edit.SetColumnFamily(2);
edit.AddColumnFamily("column_family"); edit.AddColumnFamily("column_family");
TestEncodeDecode(edit); TestEncodeDecode(edit);

File diff suppressed because it is too large Load Diff

@ -27,12 +27,15 @@
#include "db/version_edit.h" #include "db/version_edit.h"
#include "port/port.h" #include "port/port.h"
#include "db/table_cache.h" #include "db/table_cache.h"
#include "db/compaction.h"
#include "db/compaction_picker.h"
namespace rocksdb { namespace rocksdb {
namespace log { class Writer; } namespace log { class Writer; }
class Compaction; class Compaction;
class CompactionPicker;
class Iterator; class Iterator;
class MemTable; class MemTable;
class TableCache; class TableCache;
@ -86,6 +89,11 @@ class Version {
// REQUIRES: lock is held // REQUIRES: lock is held
bool UpdateStats(const GetStats& stats); bool UpdateStats(const GetStats& stats);
// Updates internal structures that keep track of compaction scores
// We use compaction scores to figure out which compaction to do next
// Also pre-sorts level0 files for Get()
void Finalize(std::vector<uint64_t>& size_being_compacted);
// Reference count management (so Versions do not disappear out from // Reference count management (so Versions do not disappear out from
// under live iterators) // under live iterators)
void Ref(); void Ref();
@ -135,21 +143,54 @@ class Version {
int PickLevelForMemTableOutput(const Slice& smallest_user_key, int PickLevelForMemTableOutput(const Slice& smallest_user_key,
const Slice& largest_user_key); const Slice& largest_user_key);
int NumFiles(int level) const { return files_[level].size(); } int NumberLevels() const { return num_levels_; }
// REQUIRES: lock is held
int NumLevelFiles(int level) const { return files_[level].size(); }
// Return the combined file size of all files at the specified level.
int64_t NumLevelBytes(int level) const;
// Return a human-readable short (single-line) summary of the number
// of files per level. Uses *scratch as backing store.
struct LevelSummaryStorage {
char buffer[100];
};
struct FileSummaryStorage {
char buffer[1000];
};
const char* LevelSummary(LevelSummaryStorage* scratch) const;
// Return a human-readable short (single-line) summary of files
// in a specified level. Uses *scratch as backing store.
const char* LevelFileSummary(FileSummaryStorage* scratch, int level) const;
// Return the maximum overlapping data (in bytes) at next level for any
// file at a level >= 1.
int64_t MaxNextLevelOverlappingBytes();
// Add all files listed in the current version to *live.
void AddLiveFiles(std::set<uint64_t>* live);
// Return a human readable string that describes this version's contents. // Return a human readable string that describes this version's contents.
std::string DebugString(bool hex = false) const; std::string DebugString(bool hex = false) const;
// Returns the version number of this version // Returns the version number of this version
uint64_t GetVersionNumber() { uint64_t GetVersionNumber() const { return version_number_; }
return version_number_;
} // used to sort files by size
struct Fsize {
int index;
FileMetaData* file;
};
private: private:
friend class Compaction; friend class Compaction;
friend class VersionSet; friend class VersionSet;
friend class DBImpl; friend class DBImpl;
friend struct ColumnFamilyData; friend struct ColumnFamilyData;
friend class CompactionPicker;
friend class LevelCompactionPicker;
friend class UniversalCompactionPicker;
class LevelFileNumIterator; class LevelFileNumIterator;
Iterator* NewConcatenatingIterator(const ReadOptions&, Iterator* NewConcatenatingIterator(const ReadOptions&,
@ -158,10 +199,15 @@ class Version {
bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions, bool PrefixMayMatch(const ReadOptions& options, const EnvOptions& soptions,
const Slice& internal_prefix, Iterator* level_iter) const; const Slice& internal_prefix, Iterator* level_iter) const;
// Sort all files for this version based on their file size and
// record results in files_by_size_. The largest files are listed first.
void UpdateFilesBySize();
VersionSet* vset_; // VersionSet to which this Version belongs VersionSet* vset_; // VersionSet to which this Version belongs
Version* next_; // Next version in linked list Version* next_; // Next version in linked list
Version* prev_; // Previous version in linked list Version* prev_; // Previous version in linked list
int refs_; // Number of live refs to this version int refs_; // Number of live refs to this version
int num_levels_; // Number of levels
// List of files per level, files in each level are arranged // List of files per level, files in each level are arranged
// in increasing order of keys // in increasing order of keys
@ -251,10 +297,8 @@ struct ColumnFamilyData {
class VersionSet { class VersionSet {
public: public:
VersionSet(const std::string& dbname, VersionSet(const std::string& dbname, const Options* options,
const Options* options, const EnvOptions& storage_options, TableCache* table_cache,
const EnvOptions& storage_options,
TableCache* table_cache,
const InternalKeyComparator*); const InternalKeyComparator*);
~VersionSet(); ~VersionSet();
@ -292,6 +336,12 @@ class VersionSet {
return column_family_data_.find(0)->second->current; return column_family_data_.find(0)->second->current;
} }
// A flag indicating whether writes need to slow down because there are
// too many level0 files.
bool NeedSlowdownForNumLevel0Files() const {
return need_slowdown_for_num_level0_files_;
}
// Return the current manifest file number // Return the current manifest file number
uint64_t ManifestFileNumber() const { return manifest_file_number_; } uint64_t ManifestFileNumber() const { return manifest_file_number_; }
@ -307,12 +357,6 @@ class VersionSet {
} }
} }
// Return the number of Table files at the specified level.
int NumLevelFiles(int level) const;
// Return the combined file size of all files at the specified level.
int64_t NumLevelBytes(int level) const;
// Return the last sequence number. // Return the last sequence number.
uint64_t LastSequence() const { uint64_t LastSequence() const {
return last_sequence_.load(std::memory_order_acquire); return last_sequence_.load(std::memory_order_acquire);
@ -346,14 +390,18 @@ class VersionSet {
// the specified level. Returns nullptr if there is nothing in that // the specified level. Returns nullptr if there is nothing in that
// level that overlaps the specified range. Caller should delete // level that overlaps the specified range. Caller should delete
// the result. // the result.
Compaction* CompactRange( //
int level, // The returned Compaction might not include the whole requested range.
const InternalKey* begin, // In that case, compaction_end will be set to the next key that needs
const InternalKey* end); // compacting. In case the compaction will compact the whole range,
// compaction_end will be set to nullptr.
// Return the maximum overlapping data (in bytes) at next level for any // Client is responsible for compaction_end storage -- when called,
// file at a level >= 1. // *compaction_end should point to valid InternalKey!
int64_t MaxNextLevelOverlappingBytes(); Compaction* CompactRange(int input_level,
int output_level,
const InternalKey* begin,
const InternalKey* end,
InternalKey** compaction_end);
// Create an iterator that reads over the compaction inputs for "*c". // Create an iterator that reads over the compaction inputs for "*c".
// The caller should delete the iterator when no longer needed. // The caller should delete the iterator when no longer needed.
@ -405,58 +453,16 @@ class VersionSet {
// Add all files listed in any live version to *live. // Add all files listed in any live version to *live.
void AddLiveFiles(std::vector<uint64_t>* live_list); void AddLiveFiles(std::vector<uint64_t>* live_list);
// Add all files listed in the current version to *live.
void AddLiveFilesCurrentVersion(std::set<uint64_t>* live);
// Return the approximate offset in the database of the data for // Return the approximate offset in the database of the data for
// "key" as of version "v". // "key" as of version "v".
uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key);
// Return a human-readable short (single-line) summary of the number
// of files per level. Uses *scratch as backing store.
struct LevelSummaryStorage {
char buffer[100];
};
struct FileSummaryStorage {
char buffer[1000];
};
const char* LevelSummary(LevelSummaryStorage* scratch) const;
// printf contents (for debugging) // printf contents (for debugging)
Status DumpManifest(Options& options, std::string& manifestFileName, Status DumpManifest(Options& options, std::string& manifestFileName,
bool verbose, bool hex = false); bool verbose, bool hex = false);
// Return a human-readable short (single-line) summary of the data size
// of files per level. Uses *scratch as backing store.
const char* LevelDataSizeSummary(LevelSummaryStorage* scratch) const;
// Return a human-readable short (single-line) summary of files
// in a specified level. Uses *scratch as backing store.
const char* LevelFileSummary(Version* version,
FileSummaryStorage* scratch,
int level) const;
// Return the size of the current manifest file // Return the size of the current manifest file
const uint64_t ManifestFileSize() { return manifest_file_size_; } uint64_t ManifestFileSize() const { return manifest_file_size_; }
// For the specified level, pick a compaction.
// Returns nullptr if there is no compaction to be done.
// If level is 0 and there is already a compaction on that level, this
// function will return nullptr.
Compaction* PickCompactionBySize(int level, double score);
// Pick files to compact in Universal mode
Compaction* PickCompactionUniversal(int level, double score);
// Pick Universal compaction to limit read amplification
Compaction* PickCompactionUniversalReadAmp(int level, double score,
unsigned int ratio, unsigned int num_files);
// Pick Universal compaction to limit space amplification.
Compaction* PickCompactionUniversalSizeAmp(int level, double score);
// Free up the files that participated in a compaction
void ReleaseCompactionFiles(Compaction* c, Status status);
// verify that the files that we started with for a compaction // verify that the files that we started with for a compaction
// still exist in the current version and in the same original level. // still exist in the current version and in the same original level.
@ -464,20 +470,12 @@ class VersionSet {
// pick the same files to compact. // pick the same files to compact.
bool VerifyCompactionFileConsistency(Compaction* c); bool VerifyCompactionFileConsistency(Compaction* c);
// used to sort files by size double MaxBytesForLevel(int level);
typedef struct fsize {
int index;
FileMetaData* file;
} Fsize;
// Sort all files for this version based on their file size and
// record results in files_by_size_. The largest files are listed first.
void UpdateFilesBySize(Version *v);
// Get the max file size in a given level. // Get the max file size in a given level.
uint64_t MaxFileSizeForLevel(int level); uint64_t MaxFileSizeForLevel(int level);
double MaxBytesForLevel(int level); void ReleaseCompactionFiles(Compaction* c, Status status);
Status GetMetadataForFile( Status GetMetadataForFile(
uint64_t number, int *filelevel, FileMetaData *metadata); uint64_t number, int *filelevel, FileMetaData *metadata);
@ -503,23 +501,6 @@ class VersionSet {
friend class Compaction; friend class Compaction;
friend class Version; friend class Version;
void Init(int num_levels);
void Finalize(Version* v, std::vector<uint64_t>&);
void GetRange(const std::vector<FileMetaData*>& inputs,
InternalKey* smallest,
InternalKey* largest);
void GetRange2(const std::vector<FileMetaData*>& inputs1,
const std::vector<FileMetaData*>& inputs2,
InternalKey* smallest,
InternalKey* largest);
void ExpandWhileOverlapping(Compaction* c);
void SetupOtherInputs(Compaction* c);
// Save current contents to *log // Save current contents to *log
Status WriteSnapshot(log::Writer* log); Status WriteSnapshot(log::Writer* log);
@ -527,10 +508,6 @@ class VersionSet {
bool ManifestContains(const std::string& record) const; bool ManifestContains(const std::string& record) const;
uint64_t ExpandedCompactionByteSizeLimit(int level);
uint64_t MaxGrandParentOverlapBytes(int level);
Env* const env_; Env* const env_;
const std::string dbname_; const std::string dbname_;
const Options* const options_; const Options* const options_;
@ -547,18 +524,13 @@ class VersionSet {
// Opened lazily // Opened lazily
unique_ptr<log::Writer> descriptor_log_; unique_ptr<log::Writer> descriptor_log_;
// Per-level key at which the next compaction at that level should start. // A flag indicating whether we should delay writes because
// Either an empty string, or a valid InternalKey. // we have too many level 0 files
std::string* compact_pointer_; bool need_slowdown_for_num_level0_files_;
// Per-level target file size.
uint64_t* max_file_size_;
// Per-level max bytes // An object that keeps all the compaction stats
uint64_t* level_max_bytes_; // and picks the next compaction
std::unique_ptr<CompactionPicker> compaction_picker_;
// record all the ongoing compactions for all levels
std::vector<std::set<Compaction*> > compactions_in_progress_;
// generates an increasing version number for every new version // generates an increasing version number for every new version
uint64_t current_version_number_; uint64_t current_version_number_;
@ -566,7 +538,7 @@ class VersionSet {
// Queue of writers to the manifest file // Queue of writers to the manifest file
std::deque<ManifestWriter*> manifest_writers_; std::deque<ManifestWriter*> manifest_writers_;
// size of manifest file // Current size of manifest file
uint64_t manifest_file_size_; uint64_t manifest_file_size_;
std::vector<FileMetaData*> obsolete_files_; std::vector<FileMetaData*> obsolete_files_;
@ -582,138 +554,8 @@ class VersionSet {
VersionSet(const VersionSet&); VersionSet(const VersionSet&);
void operator=(const VersionSet&); void operator=(const VersionSet&);
// Return the total amount of data that is undergoing
// compactions per level
void SizeBeingCompacted(std::vector<uint64_t>&);
// Returns true if any one of the parent files are being compacted
bool ParentRangeInCompaction(const InternalKey* smallest,
const InternalKey* largest, int level, int* index);
// Returns true if any one of the specified files are being compacted
bool FilesInCompaction(std::vector<FileMetaData*>& files);
void LogAndApplyHelper(Builder*b, Version* v, void LogAndApplyHelper(Builder*b, Version* v,
VersionEdit* edit, port::Mutex* mu); VersionEdit* edit, port::Mutex* mu);
}; };
// A Compaction encapsulates information about a compaction.
class Compaction {
public:
~Compaction();
// Return the level that is being compacted. Inputs from "level"
// will be merged.
int level() const { return level_; }
// Outputs will go to this level
int output_level() const { return out_level_; }
// Return the object that holds the edits to the descriptor done
// by this compaction.
VersionEdit* edit() { return edit_; }
// "which" must be either 0 or 1
int num_input_files(int which) const { return inputs_[which].size(); }
// Return the ith input file at "level()+which" ("which" must be 0 or 1).
FileMetaData* input(int which, int i) const { return inputs_[which][i]; }
// Maximum size of files to build during this compaction.
uint64_t MaxOutputFileSize() const { return max_output_file_size_; }
// Whether compression will be enabled for compaction outputs
bool enable_compression() const { return enable_compression_; }
// Is this a trivial compaction that can be implemented by just
// moving a single input file to the next level (no merging or splitting)
bool IsTrivialMove() const;
// Add all inputs to this compaction as delete operations to *edit.
void AddInputDeletions(VersionEdit* edit);
// Returns true if the information we have available guarantees that
// the compaction is producing data in "level+1" for which no data exists
// in levels greater than "level+1".
bool IsBaseLevelForKey(const Slice& user_key);
// Returns true iff we should stop building the current output
// before processing "internal_key".
bool ShouldStopBefore(const Slice& internal_key);
// Release the input version for the compaction, once the compaction
// is successful.
void ReleaseInputs();
void Summary(char* output, int len);
// Return the score that was used to pick this compaction run.
double score() const { return score_; }
// Is this compaction creating a file in the bottom most level?
bool BottomMostLevel() { return bottommost_level_; }
// Does this compaction include all sst files?
bool IsFullCompaction() { return is_full_compaction_; }
private:
friend class Version;
friend class VersionSet;
Compaction(int level,
int out_level,
uint64_t target_file_size,
uint64_t max_grandparent_overlap_bytes,
int number_levels,
Version* input_version,
bool seek_compaction = false,
bool enable_compression = true);
int level_;
int out_level_; // levels to which output files are stored
uint64_t max_output_file_size_;
uint64_t maxGrandParentOverlapBytes_;
Version* input_version_;
VersionEdit* edit_;
int number_levels_;
bool seek_compaction_;
bool enable_compression_;
// Each compaction reads inputs from "level_" and "level_+1"
std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs
// State used to check for number of overlapping grandparent files
// (parent == level_ + 1, grandparent == level_ + 2)
std::vector<FileMetaData*> grandparents_;
size_t grandparent_index_; // Index in grandparent_starts_
bool seen_key_; // Some output key has been seen
uint64_t overlapped_bytes_; // Bytes of overlap between current output
// and grandparent files
int base_index_; // index of the file in files_[level_]
int parent_index_; // index of some file with same range in files_[level_+1]
double score_; // score that was used to pick this compaction.
// Is this compaction creating a file in the bottom most level?
bool bottommost_level_;
// Does this compaction include all sst files?
bool is_full_compaction_;
// level_ptrs_ holds indices into input_version_->levels_: our state
// is that we are positioned at one of the file ranges for each
// higher level than the ones involved in this compaction (i.e. for
// all L >= level_ + 2).
std::vector<size_t> level_ptrs_;
// mark (or clear) all files that are being compacted
void MarkFilesBeingCompacted(bool);
// Initialize whether compaction producing files at the bottommost level
void SetupBottomMostLevel(bool isManual);
// In case of compaction error, reset the nextIndex that is used
// to pick up the next file to be compacted from files_by_size_
void ResetNextCompactionIndex();
};
} // namespace rocksdb } // namespace rocksdb

@ -26,7 +26,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) {
// TODO this only works for default column family now // TODO this only works for default column family now
Version* current_version = column_family_data_.find(0)->second->current; Version* current_version = column_family_data_.find(0)->second->current;
int current_levels = NumberLevels(); int current_levels = current_version->NumberLevels();
if (current_levels <= new_levels) { if (current_levels <= new_levels) {
return Status::OK(); return Status::OK();
@ -37,7 +37,7 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) {
int first_nonempty_level = -1; int first_nonempty_level = -1;
int first_nonempty_level_filenum = 0; int first_nonempty_level_filenum = 0;
for (int i = new_levels - 1; i < current_levels; i++) { for (int i = new_levels - 1; i < current_levels; i++) {
int file_num = NumLevelFiles(i); int file_num = current_version->NumLevelFiles(i);
if (file_num != 0) { if (file_num != 0) {
if (first_nonempty_level < 0) { if (first_nonempty_level < 0) {
first_nonempty_level = i; first_nonempty_level = i;
@ -66,15 +66,12 @@ Status VersionSet::ReduceNumberOfLevels(int new_levels, port::Mutex* mu) {
delete[] current_version->files_; delete[] current_version->files_;
current_version->files_ = new_files_list; current_version->files_ = new_files_list;
current_version->num_levels_ = new_levels;
delete[] compact_pointer_;
delete[] max_file_size_;
delete[] level_max_bytes_;
num_levels_ = new_levels; num_levels_ = new_levels;
compact_pointer_ = new std::string[new_levels]; compaction_picker_->ReduceNumberOfLevels(new_levels);
Init(new_levels); VersionEdit ve;
VersionEdit ve(new_levels); st = LogAndApply(&ve, mu, true);
st = LogAndApply(&ve , mu, true);
return st; return st;
} }

@ -31,7 +31,7 @@
#include "db/snapshot.h" #include "db/snapshot.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "util/coding.h" #include "util/coding.h"
#include "util/statistics_imp.h" #include "util/statistics.h"
#include <stdexcept> #include <stdexcept>
namespace rocksdb { namespace rocksdb {
@ -39,7 +39,8 @@ namespace rocksdb {
// WriteBatch header has an 8-byte sequence number followed by a 4-byte count. // WriteBatch header has an 8-byte sequence number followed by a 4-byte count.
static const size_t kHeader = 12; static const size_t kHeader = 12;
WriteBatch::WriteBatch() { WriteBatch::WriteBatch(size_t reserved_bytes) {
rep_.reserve((reserved_bytes > kHeader) ? reserved_bytes : kHeader);
Clear(); Clear();
} }

@ -22,10 +22,11 @@ namespace rocksdb {
static std::string PrintContents(WriteBatch* b) { static std::string PrintContents(WriteBatch* b) {
InternalKeyComparator cmp(BytewiseComparator()); InternalKeyComparator cmp(BytewiseComparator());
auto factory = std::make_shared<SkipListFactory>(); auto factory = std::make_shared<SkipListFactory>();
MemTable* mem = new MemTable(cmp, factory.get()); Options options;
options.memtable_factory = factory;
MemTable* mem = new MemTable(cmp, options);
mem->Ref(); mem->Ref();
std::string state; std::string state;
Options options;
Status s = WriteBatchInternal::InsertInto(b, mem, &options); Status s = WriteBatchInternal::InsertInto(b, mem, &options);
int count = 0; int count = 0;
Iterator* iter = mem->NewIterator(); Iterator* iter = mem->NewIterator();

@ -291,6 +291,7 @@ class DB {
} }
// Compact the underlying storage for the key range [*begin,*end]. // Compact the underlying storage for the key range [*begin,*end].
// The actual compaction interval might be a superset of [*begin, *end].
// In particular, deleted and overwritten versions are discarded, // In particular, deleted and overwritten versions are discarded,
// and the data is rearranged to reduce the cost of operations // and the data is rearranged to reduce the cost of operations
// needed to access the data. This operation should typically only // needed to access the data. This operation should typically only

@ -111,27 +111,23 @@ class MemTableRep {
}; };
// Return an iterator over the keys in this representation. // Return an iterator over the keys in this representation.
virtual std::shared_ptr<Iterator> GetIterator() = 0; virtual Iterator* GetIterator() = 0;
// Return an iterator over at least the keys with the specified user key. The // Return an iterator over at least the keys with the specified user key. The
// iterator may also allow access to other keys, but doesn't have to. Default: // iterator may also allow access to other keys, but doesn't have to. Default:
// GetIterator(). // GetIterator().
virtual std::shared_ptr<Iterator> GetIterator(const Slice& user_key) { virtual Iterator* GetIterator(const Slice& user_key) { return GetIterator(); }
return GetIterator();
}
// Return an iterator over at least the keys with the specified prefix. The // Return an iterator over at least the keys with the specified prefix. The
// iterator may also allow access to other keys, but doesn't have to. Default: // iterator may also allow access to other keys, but doesn't have to. Default:
// GetIterator(). // GetIterator().
virtual std::shared_ptr<Iterator> GetPrefixIterator(const Slice& prefix) { virtual Iterator* GetPrefixIterator(const Slice& prefix) {
return GetIterator(); return GetIterator();
} }
// Return an iterator that has a special Seek semantics. The result of // Return an iterator that has a special Seek semantics. The result of
// a Seek might only include keys with the same prefix as the target key. // a Seek might only include keys with the same prefix as the target key.
virtual std::shared_ptr<Iterator> GetDynamicPrefixIterator() { virtual Iterator* GetDynamicPrefixIterator() { return GetIterator(); }
return GetIterator();
}
protected: protected:
// When *key is an internal key concatenated with the value, returns the // When *key is an internal key concatenated with the value, returns the
@ -144,8 +140,8 @@ class MemTableRep {
class MemTableRepFactory { class MemTableRepFactory {
public: public:
virtual ~MemTableRepFactory() { }; virtual ~MemTableRepFactory() { };
virtual std::shared_ptr<MemTableRep> CreateMemTableRep( virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&,
MemTableRep::KeyComparator&, Arena*) = 0; Arena*) = 0;
virtual const char* Name() const = 0; virtual const char* Name() const = 0;
}; };
@ -161,8 +157,8 @@ class VectorRepFactory : public MemTableRepFactory {
const size_t count_; const size_t count_;
public: public:
explicit VectorRepFactory(size_t count = 0) : count_(count) { } explicit VectorRepFactory(size_t count = 0) : count_(count) { }
virtual std::shared_ptr<MemTableRep> CreateMemTableRep( virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&,
MemTableRep::KeyComparator&, Arena*) override; Arena*) override;
virtual const char* Name() const override { virtual const char* Name() const override {
return "VectorRepFactory"; return "VectorRepFactory";
} }
@ -171,8 +167,8 @@ public:
// This uses a skip list to store keys. It is the default. // This uses a skip list to store keys. It is the default.
class SkipListFactory : public MemTableRepFactory { class SkipListFactory : public MemTableRepFactory {
public: public:
virtual std::shared_ptr<MemTableRep> CreateMemTableRep( virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator&,
MemTableRep::KeyComparator&, Arena*) override; Arena*) override;
virtual const char* Name() const override { virtual const char* Name() const override {
return "SkipListFactory"; return "SkipListFactory";
} }

@ -242,53 +242,10 @@ struct HistogramData {
double standard_deviation; double standard_deviation;
}; };
class Histogram {
public:
// clear's the histogram
virtual void Clear() = 0;
virtual ~Histogram();
// Add a value to be recorded in the histogram.
virtual void Add(uint64_t value) = 0;
virtual std::string ToString() const = 0;
// Get statistics
virtual double Median() const = 0;
virtual double Percentile(double p) const = 0;
virtual double Average() const = 0;
virtual double StandardDeviation() const = 0;
virtual void Data(HistogramData * const data) const = 0;
};
/**
* A dumb ticker which keeps incrementing through its life time.
* Thread safe. Locking managed by implementation of this interface.
*/
class Ticker {
public:
Ticker() : count_(0) { }
inline void setTickerCount(uint64_t count) {
count_ = count;
}
inline void recordTick(int count = 1) {
count_ += count;
}
inline uint64_t getCount() {
return count_;
}
private:
std::atomic_uint_fast64_t count_;
};
// Analyze the performance of a db // Analyze the performance of a db
class Statistics { class Statistics {
public: public:
virtual ~Statistics() {}
virtual long getTickerCount(Tickers tickerType) = 0; virtual long getTickerCount(Tickers tickerType) = 0;
virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0; virtual void recordTick(Tickers tickerType, uint64_t count = 0) = 0;

@ -36,7 +36,7 @@ struct SliceParts;
class WriteBatch { class WriteBatch {
public: public:
WriteBatch(); explicit WriteBatch(size_t reserved_bytes = 0);
~WriteBatch(); ~WriteBatch();
// Store the mapping "key->value" in the database. // Store the mapping "key->value" in the database.
@ -122,7 +122,10 @@ class WriteBatch {
Status Iterate(Handler* handler) const; Status Iterate(Handler* handler) const;
// Retrieve the serialized version of this batch. // Retrieve the serialized version of this batch.
std::string Data() { return rep_; } const std::string& Data() const { return rep_; }
// Retrieve data size of the batch.
size_t GetDataSize() const { return rep_.size(); }
// Returns the number of updates in the batch // Returns the number of updates in the batch
int Count() const; int Count() const;

@ -12,7 +12,8 @@
#include <vector> #include <vector>
#include "db/dbformat.h" #include "db/dbformat.h"
#include "db/db_statistics.h" #include "rocksdb/statistics.h"
#include "util/statistics.h"
#include "db/memtable.h" #include "db/memtable.h"
#include "db/write_batch_internal.h" #include "db/write_batch_internal.h"
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
@ -370,7 +371,9 @@ class MemTableConstructor: public Constructor {
: Constructor(cmp), : Constructor(cmp),
internal_comparator_(cmp), internal_comparator_(cmp),
table_factory_(new SkipListFactory) { table_factory_(new SkipListFactory) {
memtable_ = new MemTable(internal_comparator_, table_factory_.get()); Options options;
options.memtable_factory = table_factory_;
memtable_ = new MemTable(internal_comparator_, options);
memtable_->Ref(); memtable_->Ref();
} }
~MemTableConstructor() { ~MemTableConstructor() {
@ -378,7 +381,9 @@ class MemTableConstructor: public Constructor {
} }
virtual Status FinishImpl(const Options& options, const KVMap& data) { virtual Status FinishImpl(const Options& options, const KVMap& data) {
delete memtable_->Unref(); delete memtable_->Unref();
memtable_ = new MemTable(internal_comparator_, table_factory_.get()); Options memtable_options;
memtable_options.memtable_factory = table_factory_;
memtable_ = new MemTable(internal_comparator_, memtable_options);
memtable_->Ref(); memtable_->Ref();
int seq = 1; int seq = 1;
for (KVMap::const_iterator it = data.begin(); for (KVMap::const_iterator it = data.begin();
@ -931,18 +936,12 @@ TEST(TableTest, NumBlockStat) {
class BlockCacheProperties { class BlockCacheProperties {
public: public:
explicit BlockCacheProperties(Statistics* statistics) { explicit BlockCacheProperties(Statistics* statistics) {
block_cache_miss = block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_MISS);
statistics->getTickerCount(BLOCK_CACHE_MISS); block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_HIT);
block_cache_hit = index_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS);
statistics->getTickerCount(BLOCK_CACHE_HIT); index_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT);
index_block_cache_miss = data_block_cache_miss = statistics->getTickerCount(BLOCK_CACHE_DATA_MISS);
statistics->getTickerCount(BLOCK_CACHE_INDEX_MISS); data_block_cache_hit = statistics->getTickerCount(BLOCK_CACHE_DATA_HIT);
index_block_cache_hit =
statistics->getTickerCount(BLOCK_CACHE_INDEX_HIT);
data_block_cache_miss =
statistics->getTickerCount(BLOCK_CACHE_DATA_MISS);
data_block_cache_hit =
statistics->getTickerCount(BLOCK_CACHE_DATA_HIT);
} }
// Check if the fetched props matches the expected ones. // Check if the fetched props matches the expected ones.
@ -1268,10 +1267,11 @@ class MemTableTest { };
TEST(MemTableTest, Simple) { TEST(MemTableTest, Simple) {
InternalKeyComparator cmp(BytewiseComparator()); InternalKeyComparator cmp(BytewiseComparator());
auto table_factory = std::make_shared<SkipListFactory>(); auto table_factory = std::make_shared<SkipListFactory>();
MemTable* memtable = new MemTable(cmp, table_factory.get()); Options options;
options.memtable_factory = table_factory;
MemTable* memtable = new MemTable(cmp, options);
memtable->Ref(); memtable->Ref();
WriteBatch batch; WriteBatch batch;
Options options;
WriteBatchInternal::SetSequence(&batch, 100); WriteBatchInternal::SetSequence(&batch, 100);
batch.Put(std::string("k1"), std::string("v1")); batch.Put(std::string("k1"), std::string("v1"));
batch.Put(std::string("k2"), std::string("v2")); batch.Put(std::string("k2"), std::string("v2"));

@ -26,7 +26,7 @@
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include "db/db_impl.h" #include "db/db_impl.h"
#include "db/version_set.h" #include "db/version_set.h"
#include "db/db_statistics.h" #include "rocksdb/statistics.h"
#include "rocksdb/cache.h" #include "rocksdb/cache.h"
#include "utilities/utility_db.h" #include "utilities/utility_db.h"
#include "rocksdb/env.h" #include "rocksdb/env.h"

@ -31,17 +31,15 @@ class HashSkipListRep : public MemTableRep {
virtual ~HashSkipListRep(); virtual ~HashSkipListRep();
virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override; virtual MemTableRep::Iterator* GetIterator() override;
virtual std::shared_ptr<MemTableRep::Iterator> GetIterator( virtual MemTableRep::Iterator* GetIterator(const Slice& slice) override;
const Slice& slice) override;
virtual std::shared_ptr<MemTableRep::Iterator> GetPrefixIterator( virtual MemTableRep::Iterator* GetPrefixIterator(const Slice& prefix)
const Slice& prefix) override;
virtual std::shared_ptr<MemTableRep::Iterator> GetDynamicPrefixIterator()
override; override;
virtual MemTableRep::Iterator* GetDynamicPrefixIterator() override;
private: private:
friend class DynamicIterator; friend class DynamicIterator;
typedef SkipList<const char*, MemTableRep::KeyComparator&> Bucket; typedef SkipList<const char*, MemTableRep::KeyComparator&> Bucket;
@ -208,18 +206,15 @@ class HashSkipListRep : public MemTableRep {
virtual void SeekToLast() { } virtual void SeekToLast() { }
private: private:
}; };
std::shared_ptr<EmptyIterator> empty_iterator_;
}; };
HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare, HashSkipListRep::HashSkipListRep(MemTableRep::KeyComparator& compare,
Arena* arena, const SliceTransform* transform, size_t bucket_size) Arena* arena, const SliceTransform* transform,
: bucket_size_(bucket_size), size_t bucket_size)
transform_(transform), : bucket_size_(bucket_size),
compare_(compare), transform_(transform),
arena_(arena), compare_(compare),
empty_iterator_(std::make_shared<EmptyIterator>()) { arena_(arena) {
buckets_ = new port::AtomicPointer[bucket_size]; buckets_ = new port::AtomicPointer[bucket_size];
for (size_t i = 0; i < bucket_size_; ++i) { for (size_t i = 0; i < bucket_size_; ++i) {
@ -263,7 +258,7 @@ size_t HashSkipListRep::ApproximateMemoryUsage() {
return sizeof(buckets_); return sizeof(buckets_);
} }
std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator() { MemTableRep::Iterator* HashSkipListRep::GetIterator() {
auto list = new Bucket(compare_, arena_); auto list = new Bucket(compare_, arena_);
for (size_t i = 0; i < bucket_size_; ++i) { for (size_t i = 0; i < bucket_size_; ++i) {
auto bucket = GetBucket(i); auto bucket = GetBucket(i);
@ -274,35 +269,30 @@ std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator() {
} }
} }
} }
return std::make_shared<Iterator>(list); return new Iterator(list);
} }
std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetPrefixIterator( MemTableRep::Iterator* HashSkipListRep::GetPrefixIterator(const Slice& prefix) {
const Slice& prefix) {
auto bucket = GetBucket(prefix); auto bucket = GetBucket(prefix);
if (bucket == nullptr) { if (bucket == nullptr) {
return empty_iterator_; return new EmptyIterator();
} }
return std::make_shared<Iterator>(bucket, false); return new Iterator(bucket, false);
} }
std::shared_ptr<MemTableRep::Iterator> HashSkipListRep::GetIterator( MemTableRep::Iterator* HashSkipListRep::GetIterator(const Slice& slice) {
const Slice& slice) {
return GetPrefixIterator(transform_->Transform(slice)); return GetPrefixIterator(transform_->Transform(slice));
} }
std::shared_ptr<MemTableRep::Iterator> MemTableRep::Iterator* HashSkipListRep::GetDynamicPrefixIterator() {
HashSkipListRep::GetDynamicPrefixIterator() { return new DynamicIterator(*this);
return std::make_shared<DynamicIterator>(*this);
} }
} // anon namespace } // anon namespace
std::shared_ptr<MemTableRep> MemTableRep* HashSkipListRepFactory::CreateMemTableRep(
HashSkipListRepFactory::CreateMemTableRep(MemTableRep::KeyComparator &compare, MemTableRep::KeyComparator& compare, Arena* arena) {
Arena *arena) { return new HashSkipListRep(compare, arena, transform_, bucket_count_);
return std::make_shared<HashSkipListRep>(compare, arena, transform_,
bucket_count_);
} }
MemTableRepFactory* NewHashSkipListRepFactory( MemTableRepFactory* NewHashSkipListRepFactory(

@ -21,8 +21,8 @@ class HashSkipListRepFactory : public MemTableRepFactory {
virtual ~HashSkipListRepFactory() { delete transform_; } virtual ~HashSkipListRepFactory() { delete transform_; }
virtual std::shared_ptr<MemTableRep> CreateMemTableRep( virtual MemTableRep* CreateMemTableRep(MemTableRep::KeyComparator& compare,
MemTableRep::KeyComparator& compare, Arena* arena) override; Arena* arena) override;
virtual const char* Name() const override { virtual const char* Name() const override {
return "HashSkipListRepFactory"; return "HashSkipListRepFactory";

@ -16,27 +16,38 @@
namespace rocksdb { namespace rocksdb {
HistogramBucketMapper::HistogramBucketMapper() : HistogramBucketMapper::HistogramBucketMapper()
// Add newer bucket index here. :
// Should be alwyas added in sorted order. // Add newer bucket index here.
bucketValues_({ // Should be alwyas added in sorted order.
1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 25, 30, 35, 40, 45, // If you change this, you also need to change
50, 60, 70, 80, 90, 100, 120, 140, 160, 180, 200, 250, 300, 350, 400, 450, // size of array buckets_ in HistogramImpl
500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800, 2000, 2500, 3000, bucketValues_(
3500, 4000, 4500, 5000, 6000, 7000, 8000, 9000, 10000, 12000, 14000, {1, 2, 3, 4, 5, 6,
16000, 18000, 20000, 25000, 30000, 35000, 40000, 45000, 50000, 60000, 7, 8, 9, 10, 12, 14,
70000, 80000, 90000, 100000, 120000, 140000, 160000, 180000, 200000, 16, 18, 20, 25, 30, 35,
250000, 300000, 350000, 400000, 450000, 500000, 600000, 700000, 800000, 40, 45, 50, 60, 70, 80,
900000, 1000000, 1200000, 1400000, 1600000, 1800000, 2000000, 2500000, 90, 100, 120, 140, 160, 180,
3000000, 3500000, 4000000, 4500000, 5000000, 6000000, 7000000, 8000000, 200, 250, 300, 350, 400, 450,
9000000, 10000000, 12000000, 14000000, 16000000, 18000000, 20000000, 500, 600, 700, 800, 900, 1000,
25000000, 30000000, 35000000, 40000000, 45000000, 50000000, 60000000, 1200, 1400, 1600, 1800, 2000, 2500,
70000000, 80000000, 90000000, 100000000, 120000000, 140000000, 160000000, 3000, 3500, 4000, 4500, 5000, 6000,
180000000, 200000000, 250000000, 300000000, 350000000, 400000000, 7000, 8000, 9000, 10000, 12000, 14000,
450000000, 500000000, 600000000, 700000000, 800000000, 900000000, 16000, 18000, 20000, 25000, 30000, 35000,
1000000000}), 40000, 45000, 50000, 60000, 70000, 80000,
maxBucketValue_(bucketValues_.back()), 90000, 100000, 120000, 140000, 160000, 180000,
minBucketValue_(bucketValues_.front()) { 200000, 250000, 300000, 350000, 400000, 450000,
500000, 600000, 700000, 800000, 900000, 1000000,
1200000, 1400000, 1600000, 1800000, 2000000, 2500000,
3000000, 3500000, 4000000, 4500000, 5000000, 6000000,
7000000, 8000000, 9000000, 10000000, 12000000, 14000000,
16000000, 18000000, 20000000, 25000000, 30000000, 35000000,
40000000, 45000000, 50000000, 60000000, 70000000, 80000000,
90000000, 100000000, 120000000, 140000000, 160000000, 180000000,
200000000, 250000000, 300000000, 350000000, 400000000, 450000000,
500000000, 600000000, 700000000, 800000000, 900000000, 1000000000}),
maxBucketValue_(bucketValues_.back()),
minBucketValue_(bucketValues_.front()) {
for (size_t i =0; i < bucketValues_.size(); ++i) { for (size_t i =0; i < bucketValues_.size(); ++i) {
valueIndexMap_[bucketValues_[i]] = i; valueIndexMap_[bucketValues_[i]] = i;
} }
@ -62,24 +73,17 @@ namespace {
const HistogramBucketMapper bucketMapper; const HistogramBucketMapper bucketMapper;
} }
HistogramImpl::HistogramImpl() :
min_(bucketMapper.LastValue()),
max_(0),
num_(0),
sum_(0),
sum_squares_(0),
buckets_(std::vector<uint64_t>(bucketMapper.BucketCount(), 0)) {}
void HistogramImpl::Clear() { void HistogramImpl::Clear() {
min_ = bucketMapper.LastValue(); min_ = bucketMapper.LastValue();
max_ = 0; max_ = 0;
num_ = 0; num_ = 0;
sum_ = 0; sum_ = 0;
sum_squares_ = 0; sum_squares_ = 0;
buckets_.resize(bucketMapper.BucketCount(), 0); memset(buckets_, 0, sizeof buckets_);
} }
bool HistogramImpl::Empty() { return sum_squares_ == 0; }
void HistogramImpl::Add(uint64_t value) { void HistogramImpl::Add(uint64_t value) {
const size_t index = bucketMapper.IndexForValue(value); const size_t index = bucketMapper.IndexForValue(value);
buckets_[index] += 1; buckets_[index] += 1;

@ -52,9 +52,8 @@ class HistogramBucketMapper {
class HistogramImpl { class HistogramImpl {
public: public:
HistogramImpl();
virtual ~HistogramImpl() {}
virtual void Clear(); virtual void Clear();
virtual bool Empty();
virtual void Add(uint64_t value); virtual void Add(uint64_t value);
void Merge(const HistogramImpl& other); void Merge(const HistogramImpl& other);
@ -67,13 +66,14 @@ class HistogramImpl {
virtual void Data(HistogramData * const data) const; virtual void Data(HistogramData * const data) const;
private: private:
double min_; // To be able to use HistogramImpl as thread local variable, its constructor
double max_; // has to be static. That's why we're using manually values from BucketMapper
double num_; double min_ = 1000000000; // this is BucketMapper:LastValue()
double sum_; double max_ = 0;
double sum_squares_; double num_ = 0;
std::vector<uint64_t> buckets_; double sum_ = 0;
double sum_squares_ = 0;
uint64_t buckets_[138] = {0}; // this is BucketMapper::BucketCount()
}; };
} // namespace rocksdb } // namespace rocksdb

@ -1024,7 +1024,7 @@ Status ReduceDBLevelsCommand::GetOldNumOfLevels(Options& opt,
} }
int max = -1; int max = -1;
for (int i = 0; i < versions.NumberLevels(); i++) { for (int i = 0; i < versions.NumberLevels(); i++) {
if (versions.NumLevelFiles(i)) { if (versions.current()->NumLevelFiles(i)) {
max = i; max = i;
} }
} }

@ -9,9 +9,13 @@
#include <cstdlib> #include <cstdlib>
#include "rocksdb/db.h" #include "rocksdb/db.h"
#include "rocksdb/compaction_filter.h"
#include "rocksdb/slice.h"
#include "rocksdb/write_batch.h" #include "rocksdb/write_batch.h"
#include "util/testharness.h" #include "util/testharness.h"
using namespace rocksdb;
namespace { namespace {
const int kNumKeys = 1100000; const int kNumKeys = 1100000;
@ -26,12 +30,71 @@ std::string Key2(int i) {
return Key1(i) + "_xxx"; return Key1(i) + "_xxx";
} }
class ManualCompactionTest { }; class ManualCompactionTest {
public:
ManualCompactionTest() {
// Get rid of any state from an old run.
dbname_ = rocksdb::test::TmpDir() + "/rocksdb_cbug_test";
DestroyDB(dbname_, rocksdb::Options());
}
std::string dbname_;
};
class DestroyAllCompactionFilter : public CompactionFilter {
public:
DestroyAllCompactionFilter() {}
virtual bool Filter(int level,
const Slice& key,
const Slice& existing_value,
std::string* new_value,
bool* value_changed) const {
return existing_value.ToString() == "destroy";
}
virtual const char* Name() const {
return "DestroyAllCompactionFilter";
}
};
TEST(ManualCompactionTest, CompactTouchesAllKeys) {
for (int iter = 0; iter < 2; ++iter) {
DB* db;
Options options;
if (iter == 0) { // level compaction
options.num_levels = 3;
options.compaction_style = kCompactionStyleLevel;
} else { // universal compaction
options.compaction_style = kCompactionStyleUniversal;
}
options.create_if_missing = true;
options.compression = rocksdb::kNoCompression;
options.compaction_filter = new DestroyAllCompactionFilter();
ASSERT_OK(DB::Open(options, dbname_, &db));
db->Put(WriteOptions(), Slice("key1"), Slice("destroy"));
db->Put(WriteOptions(), Slice("key2"), Slice("destroy"));
db->Put(WriteOptions(), Slice("key3"), Slice("value3"));
db->Put(WriteOptions(), Slice("key4"), Slice("destroy"));
Slice key4("key4");
db->CompactRange(nullptr, &key4);
Iterator* itr = db->NewIterator(ReadOptions());
itr->SeekToFirst();
ASSERT_TRUE(itr->Valid());
ASSERT_EQ("key3", itr->key().ToString());
itr->Next();
ASSERT_TRUE(!itr->Valid());
delete itr;
delete options.compaction_filter;
delete db;
DestroyDB(dbname_, options);
}
}
TEST(ManualCompactionTest, Test) { TEST(ManualCompactionTest, Test) {
// Get rid of any state from an old run.
std::string dbpath = rocksdb::test::TmpDir() + "/rocksdb_cbug_test";
DestroyDB(dbpath, rocksdb::Options());
// Open database. Disable compression since it affects the creation // Open database. Disable compression since it affects the creation
// of layers and the code below is trying to test against a very // of layers and the code below is trying to test against a very
@ -40,7 +103,7 @@ TEST(ManualCompactionTest, Test) {
rocksdb::Options db_options; rocksdb::Options db_options;
db_options.create_if_missing = true; db_options.create_if_missing = true;
db_options.compression = rocksdb::kNoCompression; db_options.compression = rocksdb::kNoCompression;
ASSERT_OK(rocksdb::DB::Open(db_options, dbpath, &db)); ASSERT_OK(rocksdb::DB::Open(db_options, dbname_, &db));
// create first key range // create first key range
rocksdb::WriteBatch batch; rocksdb::WriteBatch batch;
@ -83,7 +146,7 @@ TEST(ManualCompactionTest, Test) {
// close database // close database
delete db; delete db;
DestroyDB(dbpath, rocksdb::Options()); DestroyDB(dbname_, rocksdb::Options());
} }
} // anonymous namespace } // anonymous namespace

@ -90,15 +90,15 @@ public:
// Unhide default implementations of GetIterator // Unhide default implementations of GetIterator
using MemTableRep::GetIterator; using MemTableRep::GetIterator;
virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override { virtual MemTableRep::Iterator* GetIterator() override {
return std::make_shared<SkipListRep::Iterator>(&skip_list_); return new SkipListRep::Iterator(&skip_list_);
} }
}; };
} }
std::shared_ptr<MemTableRep> SkipListFactory::CreateMemTableRep ( MemTableRep* SkipListFactory::CreateMemTableRep(
MemTableRep::KeyComparator& compare, Arena* arena) { MemTableRep::KeyComparator& compare, Arena* arena) {
return std::shared_ptr<MemTableRep>(new SkipListRep(compare, arena)); return new SkipListRep(compare, arena);
} }
} // namespace rocksdb } // namespace rocksdb

@ -3,12 +3,48 @@
// LICENSE file in the root directory of this source tree. An additional grant // LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory. // of patent rights can be found in the PATENTS file in the same directory.
// //
#include "util/statistics.h"
#include "rocksdb/statistics.h" #include "rocksdb/statistics.h"
#include <cstdio> #include <cstdio>
namespace rocksdb { namespace rocksdb {
std::shared_ptr<Statistics> CreateDBStatistics() {
return std::make_shared<StatisticsImpl>();
}
StatisticsImpl::StatisticsImpl() {}
StatisticsImpl::~StatisticsImpl() {}
long StatisticsImpl::getTickerCount(Tickers tickerType) {
assert(tickerType < TICKER_ENUM_MAX);
return tickers_[tickerType];
}
void StatisticsImpl::setTickerCount(Tickers tickerType, uint64_t count) {
assert(tickerType < TICKER_ENUM_MAX);
tickers_[tickerType] = count;
}
void StatisticsImpl::recordTick(Tickers tickerType, uint64_t count) {
assert(tickerType < TICKER_ENUM_MAX);
tickers_[tickerType] += count;
}
void StatisticsImpl::measureTime(Histograms histogramType, uint64_t value) {
assert(histogramType < HISTOGRAM_ENUM_MAX);
histograms_[histogramType].Add(value);
}
void StatisticsImpl::histogramData(Histograms histogramType,
HistogramData* const data) {
assert(histogramType < HISTOGRAM_ENUM_MAX);
histograms_[histogramType].Data(data);
}
namespace { namespace {
// a buffer size used for temp string buffers // a buffer size used for temp string buffers
const int kBufferSize = 200; const int kBufferSize = 200;
@ -32,11 +68,8 @@ std::string HistogramToString (
return std::string(buffer); return std::string(buffer);
}; };
std::string TickerToString ( std::string TickerToString(Statistics* dbstats, const Tickers& ticker,
Statistics* dbstats, const std::string& name) {
const Tickers& ticker,
const std::string& name) {
char buffer[kBufferSize]; char buffer[kBufferSize];
snprintf(buffer, kBufferSize, "%s COUNT : %ld\n", snprintf(buffer, kBufferSize, "%s COUNT : %ld\n",
name.c_str(), dbstats->getTickerCount(ticker)); name.c_str(), dbstats->getTickerCount(ticker));

@ -0,0 +1,53 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include "rocksdb/statistics.h"
#include "util/histogram.h"
#include "util/mutexlock.h"
#define UNLIKELY(val) (__builtin_expect((val), 0))
namespace rocksdb {
class StatisticsImpl : public Statistics {
public:
StatisticsImpl();
virtual ~StatisticsImpl();
virtual long getTickerCount(Tickers tickerType);
virtual void setTickerCount(Tickers tickerType, uint64_t count);
virtual void recordTick(Tickers tickerType, uint64_t count);
virtual void measureTime(Histograms histogramType, uint64_t value);
virtual void histogramData(Histograms histogramType,
HistogramData* const data);
private:
std::atomic_uint_fast64_t tickers_[TICKER_ENUM_MAX];
HistogramImpl histograms_[HISTOGRAM_ENUM_MAX];
};
// Utility functions
inline void MeasureTime(Statistics* statistics, Histograms histogramType,
uint64_t value) {
if (statistics) {
statistics->measureTime(histogramType, value);
}
}
inline void RecordTick(Statistics* statistics, Tickers ticker,
uint64_t count = 1) {
if (statistics) {
statistics->recordTick(ticker, count);
}
}
inline void SetTickerCount(Statistics* statistics, Tickers ticker,
uint64_t count) {
if (statistics) {
statistics->setTickerCount(ticker, count);
}
}
}

@ -1,32 +0,0 @@
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree. An additional grant
// of patent rights can be found in the PATENTS file in the same directory.
//
#pragma once
#include "rocksdb/statistics.h"
namespace rocksdb {
// Utility functions
inline void RecordTick(Statistics* statistics,
Tickers ticker,
uint64_t count = 1) {
assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
assert(TickersNameMap.size() == TICKER_ENUM_MAX);
if (statistics) {
statistics->recordTick(ticker, count);
}
}
inline void SetTickerCount(Statistics* statistics,
Tickers ticker,
uint64_t count) {
assert(HistogramsNameMap.size() == HISTOGRAM_ENUM_MAX);
assert(TickersNameMap.size() == TICKER_ENUM_MAX);
if (statistics) {
statistics->setTickerCount(ticker, count);
}
}
}

@ -5,7 +5,7 @@
// //
#pragma once #pragma once
#include "rocksdb/env.h" #include "rocksdb/env.h"
#include "util/statistics_imp.h" #include "util/statistics.h"
namespace rocksdb { namespace rocksdb {
// Auto-scoped. // Auto-scoped.
@ -28,11 +28,7 @@ class StopWatch {
return env_->NowMicros() - start_time_; return env_->NowMicros() - start_time_;
} }
~StopWatch() { ~StopWatch() { MeasureTime(statistics_, histogram_name_, ElapsedMicros()); }
if (statistics_) {
statistics_->measureTime(histogram_name_, ElapsedMicros());
}
}
private: private:
Env* const env_; Env* const env_;

@ -88,7 +88,7 @@ class VectorRep : public MemTableRep {
using MemTableRep::GetIterator; using MemTableRep::GetIterator;
// Return an iterator over the keys in this representation. // Return an iterator over the keys in this representation.
virtual std::shared_ptr<MemTableRep::Iterator> GetIterator() override; virtual MemTableRep::Iterator* GetIterator() override;
private: private:
friend class Iterator; friend class Iterator;
@ -228,22 +228,22 @@ void VectorRep::Iterator::SeekToLast() {
} }
} }
std::shared_ptr<MemTableRep::Iterator> VectorRep::GetIterator() { MemTableRep::Iterator* VectorRep::GetIterator() {
ReadLock l(&rwlock_); ReadLock l(&rwlock_);
// Do not sort here. The sorting would be done the first time // Do not sort here. The sorting would be done the first time
// a Seek is performed on the iterator. // a Seek is performed on the iterator.
if (immutable_) { if (immutable_) {
return std::make_shared<Iterator>(this, bucket_, compare_); return new Iterator(this, bucket_, compare_);
} else { } else {
std::shared_ptr<Bucket> tmp; std::shared_ptr<Bucket> tmp;
tmp.reset(new Bucket(*bucket_)); // make a copy tmp.reset(new Bucket(*bucket_)); // make a copy
return std::make_shared<Iterator>(nullptr, tmp, compare_); return new Iterator(nullptr, tmp, compare_);
} }
} }
} // anon namespace } // anon namespace
std::shared_ptr<MemTableRep> VectorRepFactory::CreateMemTableRep( MemTableRep* VectorRepFactory::CreateMemTableRep(
MemTableRep::KeyComparator& compare, Arena* arena) { MemTableRep::KeyComparator& compare, Arena* arena) {
return std::make_shared<VectorRep>(compare, arena, count_); return new VectorRep(compare, arena, count_);
} }
} // namespace rocksdb } // namespace rocksdb

Loading…
Cancel
Save