From e6de02103a4baebe68c1df22c8735435f40add4b Mon Sep 17 00:00:00 2001
From: sdong
Date: Fri, 16 May 2014 10:35:41 -0700
Subject: [PATCH] Add a utility function to guess optimized options based on
 constraints

Summary:
Add a function GetOptions(), which, based on four parameters users give
(read amplification threshold, write amplification threshold, memory budget
for mem tables, and target DB size), picks a compaction style and parameters
for it. Background threads are not touched yet.

One limitation of this algorithm: since compression rate and key/value size
are hard to predict, it is hard to predict level 0 file size from write
buffer size. Simply assume a 1:1 ratio here. Sample results:
https://reviews.facebook.net/P477

Test Plan: Will add a unit test where some sample scenarios are given and
verify that they pick results that make sense

Reviewers: yhchiang, dhruba, haobo, igor, ljin

Reviewed By: ljin

Subscribers: leveldb

Differential Revision: https://reviews.facebook.net/D18741
---
 Makefile                  |  11 ++-
 include/rocksdb/options.h |  12 ++-
 util/options_builder.cc   | 196 ++++++++++++++++++++++++++++++++++++++
 util/options_test.cc      |  80 ++++++++++++++++
 4 files changed, 294 insertions(+), 5 deletions(-)
 create mode 100644 util/options_builder.cc
 create mode 100644 util/options_test.cc

diff --git a/Makefile b/Makefile
index a63dc381e..dcd24e776 100644
--- a/Makefile
+++ b/Makefile
@@ -114,9 +114,10 @@ TESTS = \
 	deletefile_test \
 	table_test \
 	thread_local_test \
-	geodb_test \
-	rate_limiter_test \
-	cuckoo_table_builder_test
+	geodb_test \
+	rate_limiter_test \
+	cuckoo_table_builder_test \
+	options_test
 
 TOOLS = \
 	sst_dump \
@@ -124,6 +125,7 @@ TOOLS = \
 	db_stress \
 	ldb \
 	db_repl_stress \
+	options_test \
 	blob_store_bench
 
 PROGRAMS = db_bench signal_test table_reader_bench log_and_apply_bench $(TOOLS)
@@ -414,6 +416,9 @@ geodb_test: utilities/geodb/geodb_test.o $(LIBOBJECTS) $(TESTHARNESS)
 cuckoo_table_builder_test: table/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS)
 	$(CXX) table/cuckoo_table_builder_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
 
+options_test: util/options_test.o $(LIBOBJECTS) $(TESTHARNESS)
+	$(CXX) util/options_test.o $(LIBOBJECTS) $(TESTHARNESS) $(EXEC_LDFLAGS) -o $@ $(LDFLAGS) $(COVERAGEFLAGS)
+
 $(MEMENVLIBRARY) : $(MEMENVOBJECTS)
 	rm -f $@
 	$(AR) -rs $@ $(MEMENVOBJECTS)

diff --git a/include/rocksdb/options.h b/include/rocksdb/options.h
index f73faac7a..ac8eaae4b 100644
--- a/include/rocksdb/options.h
+++ b/include/rocksdb/options.h
@@ -1027,7 +1027,6 @@ struct FlushOptions {
   FlushOptions() : wait(true) {}
 };
 
-
 // Create a RateLimiter object, which can be shared among RocksDB instances to
 // control write rate of flush and compaction.
 // @rate_bytes_per_sec: this is the only parameter you want to set most of the
@@ -1051,7 +1050,16 @@ extern RateLimiter* NewRateLimiter(
     int64_t refill_period_us = 100 * 1000,
     int32_t fairness = 10);
 
-
+// Get options based on some guidelines. For now it only tunes parameters
+// related to flush/compaction and fills other parameters with defaults.
+// total_write_buffer_limit: budget for memory spent on mem tables
+// read_amplification_threshold: comfortable value of read amplification
+// write_amplification_threshold: comfortable value of write amplification.
+// target_db_size: estimated total DB size.
+extern Options GetOptions(size_t total_write_buffer_limit,
+                          int read_amplification_threshold = 8,
+                          int write_amplification_threshold = 32,
+                          uint64_t target_db_size = 68719476736 /* 64GB */);
 }  // namespace rocksdb
 
 #endif  // STORAGE_ROCKSDB_INCLUDE_OPTIONS_H_
diff --git a/util/options_builder.cc b/util/options_builder.cc
new file mode 100644
index 000000000..5796c9856
--- /dev/null
+++ b/util/options_builder.cc
@@ -0,0 +1,196 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+
+#include <math.h>
+#include <algorithm>
+#include "rocksdb/options.h"
+
+namespace rocksdb {
+
+namespace {
+
+// For now, always use 10 as the level bytes multiplier.
+const int kBytesForLevelMultiplier = 10;
+const size_t kBytesForOneMb = 1024 * 1024;
+
+// Pick compaction style
+CompactionStyle PickCompactionStyle(size_t write_buffer_size,
+                                    int read_amp_threshold,
+                                    int write_amp_threshold,
+                                    uint64_t target_db_size) {
+  // Estimate the read and write amplification of the two compaction styles.
+  // If a hard limit forces a choice, make that choice. Otherwise, calculate
+  // a score based on the thresholds and the expected values of the two
+  // styles, weighting reads as 4X more important than writes.
+  int expected_levels = static_cast<int>(ceil(
+      log(target_db_size / write_buffer_size) / log(kBytesForLevelMultiplier)));
+
+  int expected_max_files_universal =
+      static_cast<int>(ceil(log2(target_db_size / write_buffer_size)));
+
+  const int kEstimatedLevel0FilesInLevelStyle = 2;
+  // Estimate write amplification:
+  // (1) 1 for every L0 file
+  // (2) 2 for L1
+  // (3) kBytesForLevelMultiplier for the last level. It's really hard to
+  //     predict.
+  // (4) kBytesForLevelMultiplier for each of the other levels.
+  int expected_write_amp_level = kEstimatedLevel0FilesInLevelStyle + 2 +
+      (expected_levels - 2) * kBytesForLevelMultiplier +
+      kBytesForLevelMultiplier;
+  int expected_read_amp_level =
+      kEstimatedLevel0FilesInLevelStyle + expected_levels;
+
+  int max_read_amp_uni = expected_max_files_universal;
+  if (read_amp_threshold <= max_read_amp_uni) {
+    return kCompactionStyleLevel;
+  } else if (write_amp_threshold <= expected_write_amp_level) {
+    return kCompactionStyleUniversal;
+  }
+
+  const double kReadWriteWeight = 4;
+
+  double level_ratio =
+      static_cast<double>(read_amp_threshold) / expected_read_amp_level *
+          kReadWriteWeight +
+      static_cast<double>(write_amp_threshold) / expected_write_amp_level;
+
+  int expected_write_amp_uni = expected_max_files_universal / 2 + 2;
+  int expected_read_amp_uni = expected_max_files_universal / 2 + 1;
+
+  double uni_ratio =
+      static_cast<double>(read_amp_threshold) / expected_read_amp_uni *
+          kReadWriteWeight +
+      static_cast<double>(write_amp_threshold) / expected_write_amp_uni;
+
+  if (level_ratio > uni_ratio) {
+    return kCompactionStyleLevel;
+  } else {
+    return kCompactionStyleUniversal;
+  }
+}
+
+// Pick mem table size
+void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) {
+  const size_t kMaxWriteBufferSize = 128 * kBytesForOneMb;
+  const size_t kMinWriteBufferSize = 4 * kBytesForOneMb;
+
+  // Try to pick a write buffer size between 4MB and 128MB.
+  // Also try to keep the total number of write buffers at 4.
+  size_t write_buffer_size = total_write_buffer_limit / 4;
+  if (write_buffer_size > kMaxWriteBufferSize) {
+    write_buffer_size = kMaxWriteBufferSize;
+  } else if (write_buffer_size < kMinWriteBufferSize) {
+    write_buffer_size = std::min(static_cast<size_t>(kMinWriteBufferSize),
+                                 total_write_buffer_limit / 2);
+  }
+
+  // Round up to a multiple of 1MB.
+  if (write_buffer_size % kBytesForOneMb != 0) {
+    write_buffer_size =
+        (write_buffer_size / kBytesForOneMb + 1) * kBytesForOneMb;
+  }
+
+  options->write_buffer_size = write_buffer_size;
+  options->max_write_buffer_number =
+      total_write_buffer_limit / write_buffer_size;
+  options->min_write_buffer_number_to_merge = 1;
+}
+
+void OptimizeForUniversal(Options* options) {
+  options->level0_file_num_compaction_trigger = 2;
+  options->level0_slowdown_writes_trigger = 30;
+  options->level0_stop_writes_trigger = 40;
+  options->max_open_files = -1;
+}
+
+// Optimize parameters for level-based compaction
+void OptimizeForLevel(int read_amplification_threshold,
+                      int write_amplification_threshold,
+                      uint64_t target_db_size, Options* options) {
+  int expected_levels_one_level0_file =
+      static_cast<int>(ceil(log(target_db_size / options->write_buffer_size) /
+                            log(kBytesForLevelMultiplier)));
+
+  int level0_stop_writes_trigger =
+      read_amplification_threshold - expected_levels_one_level0_file;
+
+  const size_t kInitialLevel0TotalSize = 128 * kBytesForOneMb;
+  const int kMaxFileNumCompactionTrigger = 4;
+  const int kMinLevel0StopTrigger = 3;
+
+  int file_num_buffer =
+      kInitialLevel0TotalSize / options->write_buffer_size + 1;
+
+  if (level0_stop_writes_trigger > file_num_buffer) {
+    // Have sufficient room for multiple level 0 files.
+    // Try to enlarge the number of level 0 files to compact together (up to
+    // roughly 1GB of level 0 data in total), as long as there is still
+    // sufficient headroom below the stop trigger.
+    file_num_buffer *=
+        std::pow(2, std::max(0, std::min(3, level0_stop_writes_trigger -
+                                               file_num_buffer - 2)));
+
+    options->level0_stop_writes_trigger = level0_stop_writes_trigger;
+    options->level0_slowdown_writes_trigger = level0_stop_writes_trigger - 2;
+    options->level0_file_num_compaction_trigger =
+        std::min(kMaxFileNumCompactionTrigger, file_num_buffer / 2);
+  } else {
+    options->level0_stop_writes_trigger =
+        std::max(kMinLevel0StopTrigger, file_num_buffer);
+    options->level0_slowdown_writes_trigger =
+        options->level0_stop_writes_trigger - 1;
+    options->level0_file_num_compaction_trigger = 1;
+  }
+
+  // This doesn't consider compaction or mem table overheads, but it is
+  // usually in the same order of magnitude.
+  int expected_level0_compaction_size =
+      options->level0_file_num_compaction_trigger * options->write_buffer_size;
+  // Enlarge level1 target total size if level0 compaction size is larger.
+  int max_bytes_for_level_base = 10 * kBytesForOneMb;
+  if (expected_level0_compaction_size > max_bytes_for_level_base) {
+    max_bytes_for_level_base = expected_level0_compaction_size;
+  }
+  options->max_bytes_for_level_base = max_bytes_for_level_base;
+  // For now, always set the level size multiplier to 10.
+  options->max_bytes_for_level_multiplier = kBytesForLevelMultiplier;
+
+  const int kMinFileSize = 2 * kBytesForOneMb;
+  // Allow at least 3-way parallelism for compaction between level 1 and 2.
+  int max_file_size = max_bytes_for_level_base / 3;
+  if (max_file_size < kMinFileSize) {
+    options->target_file_size_base = kMinFileSize;
+  } else {
+    if (max_file_size % kBytesForOneMb != 0) {
+      max_file_size = (max_file_size / kBytesForOneMb + 1) * kBytesForOneMb;
+    }
+    options->target_file_size_base = max_file_size;
+  }
+
+  // TODO: consider tuning num_levels too.
+}
+
+}  // namespace
+
+Options GetOptions(size_t total_write_buffer_limit,
+                   int read_amplification_threshold,
+                   int write_amplification_threshold, uint64_t target_db_size) {
+  Options options;
+  PickWriteBufferSize(total_write_buffer_limit, &options);
+  size_t write_buffer_size = options.write_buffer_size;
+  options.compaction_style =
+      PickCompactionStyle(write_buffer_size, read_amplification_threshold,
+                          write_amplification_threshold, target_db_size);
+  if (options.compaction_style == kCompactionStyleUniversal) {
+    OptimizeForUniversal(&options);
+  } else {
+    OptimizeForLevel(read_amplification_threshold,
+                     write_amplification_threshold, target_db_size, &options);
+  }
+  return options;
+}
+
+}  // namespace rocksdb
diff --git a/util/options_test.cc b/util/options_test.cc
new file mode 100644
index 000000000..be07a83f5
--- /dev/null
+++ b/util/options_test.cc
@@ -0,0 +1,80 @@
+// Copyright (c) 2013, Facebook, Inc. All rights reserved.
+// This source code is licensed under the BSD-style license found in the
+// LICENSE file in the root directory of this source tree. An additional grant
+// of patent rights can be found in the PATENTS file in the same directory.
+//
+// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file. See the AUTHORS file for names of contributors.
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+#include <gflags/gflags.h>
+
+#include "rocksdb/options.h"
+#include "util/testharness.h"
+
+using GFLAGS::ParseCommandLineFlags;
+DEFINE_bool(enable_print, false, "Print options generated to console.");
+
+namespace rocksdb {
+
+class OptionsTest {};
+
+class StderrLogger : public Logger {
+ public:
+  virtual void Logv(const char* format, va_list ap) override {
+    vprintf(format, ap);
+    printf("\n");
+  }
+};
+
+Options PrintAndGetOptions(size_t total_write_buffer_limit,
+                           int read_amplification_threshold,
+                           int write_amplification_threshold,
+                           uint64_t target_db_size = 68719476736) {
+  StderrLogger logger;
+
+  if (FLAGS_enable_print) {
+    printf(
+        "---- total_write_buffer_limit: %zu "
+        "read_amplification_threshold: %d write_amplification_threshold: %d "
+        "target_db_size %" PRIu64 " ----\n",
+        total_write_buffer_limit, read_amplification_threshold,
+        write_amplification_threshold, target_db_size);
+  }
+
+  Options options =
+      GetOptions(total_write_buffer_limit, read_amplification_threshold,
+                 write_amplification_threshold, target_db_size);
+  if (FLAGS_enable_print) {
+    options.Dump(&logger);
+    printf("-------------------------------------\n\n\n");
+  }
+  return options;
+}
+
+TEST(OptionsTest, LooseCondition) {
+  Options options;
+  PrintAndGetOptions(static_cast<size_t>(10) * 1024 * 1024 * 1024, 100, 100);
+
+  // Less mem table memory budget
+  PrintAndGetOptions(32 * 1024 * 1024, 100, 100);
+
+  // Tight read amplification
+  options = PrintAndGetOptions(128 * 1024 * 1024, 8, 100);
+  ASSERT_EQ(options.compaction_style, kCompactionStyleLevel);
+
+  // Tight write amplification
+  options = PrintAndGetOptions(128 * 1024 * 1024, 64, 10);
+  ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);
+
+  // Both tight amplifications
+  PrintAndGetOptions(128 * 1024 * 1024, 4, 8);
+}
+}  // namespace rocksdb
+
+int main(int argc, char** argv) {
+  ParseCommandLineFlags(&argc, &argv, true);
+  return rocksdb::test::RunAllTests();
+}
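
The following is a minimal usage sketch of the new GetOptions() entry point; it is not part of the patch. The memory budget, thresholds, target size, and the DB path are made-up illustrative values; only the GetOptions() signature itself comes from the change above.

  #include "rocksdb/db.h"
  #include "rocksdb/options.h"

  int main() {
    // Budget 512MB for mem tables, accept read amplification up to 10 and
    // write amplification up to 20, and estimate the DB at roughly 256GB.
    rocksdb::Options options = rocksdb::GetOptions(
        512 * 1024 * 1024 /* total_write_buffer_limit */,
        10 /* read_amplification_threshold */,
        20 /* write_amplification_threshold */,
        256ULL * 1024 * 1024 * 1024 /* target_db_size */);
    options.create_if_missing = true;

    rocksdb::DB* db = nullptr;
    rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/options_demo", &db);
    if (!s.ok()) {
      return 1;
    }
    delete db;
    return 0;
  }

As the summary notes, background thread settings are not tuned by GetOptions() yet, so they still need to be configured separately.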
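
For reference, the arithmetic behind PickCompactionStyle() for the 128MB budget used in options_test.cc works out as follows. This is only a sketch: it assumes the 32MB write buffer that PickWriteBufferSize() derives from a 128MB limit and the default 64GB target_db_size, and simply re-evaluates the estimates from the patch with those numbers plugged in.

  #include <cmath>
  #include <cstdio>

  int main() {
    const double db_size = 64.0 * 1024 * 1024 * 1024;  // target_db_size
    const double buf = 32.0 * 1024 * 1024;              // write_buffer_size
    // Level style: expected number of levels and amplification estimates.
    int levels = static_cast<int>(
        std::ceil(std::log(db_size / buf) / std::log(10.0)));              // 4
    int write_amp_level = 2 + 2 + (levels - 2) * 10 + 10;                  // 34
    int read_amp_level = 2 + levels;                                       // 6
    // Universal style: expected maximum number of files.
    int uni_files = static_cast<int>(std::ceil(std::log2(db_size / buf))); // 11
    std::printf("level: read ~%d, write ~%d; universal: up to %d files\n",
                read_amp_level, write_amp_level, uni_files);
    return 0;
  }

With these numbers, a read_amplification_threshold of 8 is at or below the estimated universal file count of 11, so level style is chosen, while a threshold of 64 is not; in that case a write_amplification_threshold of 10 is at or below the estimated level-style write amplification of 34, so universal style is chosen. That matches the two ASSERT_EQ checks in the test.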