Summary: Add a function GetOptions() which, based on four parameters the user gives (read amplification threshold, write amplification threshold, memory budget for mem tables, and target DB size), picks a compaction style and the parameters for it. Background threads are not touched yet. One limitation of this algorithm: since compression rate and key/value size are hard to predict, it is hard to predict the level 0 file size from the write buffer size, so we simply assume a 1:1 ratio here. Sample results: https://reviews.facebook.net/P477 Test Plan: Will add a unit test where some sample scenarios are given, to check that the picked results make sense. Reviewers: yhchiang, dhruba, haobo, igor, ljin Reviewed By: ljin Subscribers: leveldb Differential Revision: https://reviews.facebook.net/D18741 main
parent
250f035782
commit
e6de02103a
@ -0,0 +1,196 @@ |
|||||||
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
|
||||||
|
#include <math.h> |
||||||
|
#include <algorithm> |
||||||
|
#include "rocksdb/options.h" |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
namespace { |
||||||
|
|
||||||
|
// Size ratio between adjacent levels. For now this is always 10.
constexpr int kBytesForLevelMultiplier = 10;
// Number of bytes in one megabyte (2^20).
constexpr size_t kBytesForOneMb = static_cast<size_t>(1) << 20;
||||||
|
|
||||||
|
// Pick compaction style
|
||||||
|
CompactionStyle PickCompactionStyle(size_t write_buffer_size, |
||||||
|
int read_amp_threshold, |
||||||
|
int write_amp_threshold, |
||||||
|
uint64_t target_db_size) { |
||||||
|
// Estimate read amplification and write amplification of two compaction
|
||||||
|
// styles. If there is hard limit to force a choice, make the choice.
|
||||||
|
// Otherwise, calculate a score based on threshold and expected value of
|
||||||
|
// two styles, weighing reads 4X important than writes.
|
||||||
|
int expected_levels = static_cast<int>(ceil( |
||||||
|
log(target_db_size / write_buffer_size) / log(kBytesForLevelMultiplier))); |
||||||
|
|
||||||
|
int expected_max_files_universal = |
||||||
|
static_cast<int>(ceil(log2(target_db_size / write_buffer_size))); |
||||||
|
|
||||||
|
const int kEstimatedLevel0FilesInLevelStyle = 2; |
||||||
|
// Estimate write amplification:
|
||||||
|
// (1) 1 for every L0 file
|
||||||
|
// (2) 2 for L1
|
||||||
|
// (3) kBytesForLevelMultiplier for the last level. It's really hard to
|
||||||
|
// predict.
|
||||||
|
// (3) kBytesForLevelMultiplier for other levels.
|
||||||
|
int expected_write_amp_level = kEstimatedLevel0FilesInLevelStyle + 2 |
||||||
|
+ (expected_levels - 2) * kBytesForLevelMultiplier |
||||||
|
+ kBytesForLevelMultiplier; |
||||||
|
int expected_read_amp_level = |
||||||
|
kEstimatedLevel0FilesInLevelStyle + expected_levels; |
||||||
|
|
||||||
|
int max_read_amp_uni = expected_max_files_universal; |
||||||
|
if (read_amp_threshold <= max_read_amp_uni) { |
||||||
|
return kCompactionStyleLevel; |
||||||
|
} else if (write_amp_threshold <= expected_write_amp_level) { |
||||||
|
return kCompactionStyleUniversal; |
||||||
|
} |
||||||
|
|
||||||
|
const double kReadWriteWeight = 4; |
||||||
|
|
||||||
|
double level_ratio = |
||||||
|
static_cast<double>(read_amp_threshold) / expected_read_amp_level * |
||||||
|
kReadWriteWeight + |
||||||
|
static_cast<double>(write_amp_threshold) / expected_write_amp_level; |
||||||
|
|
||||||
|
int expected_write_amp_uni = expected_max_files_universal / 2 + 2; |
||||||
|
int expected_read_amp_uni = expected_max_files_universal / 2 + 1; |
||||||
|
|
||||||
|
double uni_ratio = |
||||||
|
static_cast<double>(read_amp_threshold) / expected_read_amp_uni * |
||||||
|
kReadWriteWeight + |
||||||
|
static_cast<double>(write_amp_threshold) / expected_write_amp_uni; |
||||||
|
|
||||||
|
if (level_ratio > uni_ratio) { |
||||||
|
return kCompactionStyleLevel; |
||||||
|
} else { |
||||||
|
return kCompactionStyleUniversal; |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
// Pick mem table size
|
||||||
|
void PickWriteBufferSize(size_t total_write_buffer_limit, Options* options) { |
||||||
|
const size_t kMaxWriteBufferSize = 128 * kBytesForOneMb; |
||||||
|
const size_t kMinWriteBufferSize = 4 * kBytesForOneMb; |
||||||
|
|
||||||
|
// Try to pick up a buffer size between 4MB and 128MB.
|
||||||
|
// And try to pick 4 as the total number of write buffers.
|
||||||
|
size_t write_buffer_size = total_write_buffer_limit / 4; |
||||||
|
if (write_buffer_size > kMaxWriteBufferSize) { |
||||||
|
write_buffer_size = kMaxWriteBufferSize; |
||||||
|
} else if (write_buffer_size < kMinWriteBufferSize) { |
||||||
|
write_buffer_size = std::min(static_cast<size_t>(kMinWriteBufferSize), |
||||||
|
total_write_buffer_limit / 2); |
||||||
|
} |
||||||
|
|
||||||
|
// Truncate to multiple of 1MB.
|
||||||
|
if (write_buffer_size % kBytesForOneMb != 0) { |
||||||
|
write_buffer_size = |
||||||
|
(write_buffer_size / kBytesForOneMb + 1) * kBytesForOneMb; |
||||||
|
} |
||||||
|
|
||||||
|
options->write_buffer_size = write_buffer_size; |
||||||
|
options->max_write_buffer_number = |
||||||
|
total_write_buffer_limit / write_buffer_size; |
||||||
|
options->min_write_buffer_number_to_merge = 1; |
||||||
|
} |
||||||
|
|
||||||
|
// Fills in the fixed settings used when universal compaction was picked:
// the three level-0 file-count triggers and max_open_files.
void OptimizeForUniversal(Options* options) {
  Options& opts = *options;

  // NOTE(review): -1 presumably means "no limit on open files" -- confirm
  // against the Options documentation.
  opts.max_open_files = -1;

  // Level-0 file-count triggers: compaction at 2, write slowdown at 30,
  // write stop at 40.
  opts.level0_stop_writes_trigger = 40;
  opts.level0_slowdown_writes_trigger = 30;
  opts.level0_file_num_compaction_trigger = 2;
}
||||||
|
|
||||||
|
// Optimize parameters for level-based compaction
|
||||||
|
void OptimizeForLevel(int read_amplification_threshold, |
||||||
|
int write_amplification_threshold, |
||||||
|
uint64_t target_db_size, Options* options) { |
||||||
|
int expected_levels_one_level0_file = |
||||||
|
static_cast<int>(ceil(log(target_db_size / options->write_buffer_size) / |
||||||
|
log(kBytesForLevelMultiplier))); |
||||||
|
|
||||||
|
int level0_stop_writes_trigger = |
||||||
|
read_amplification_threshold - expected_levels_one_level0_file; |
||||||
|
|
||||||
|
const size_t kInitialLevel0TotalSize = 128 * kBytesForOneMb; |
||||||
|
const int kMaxFileNumCompactionTrigger = 4; |
||||||
|
const int kMinLevel0StopTrigger = 3; |
||||||
|
|
||||||
|
int file_num_buffer = |
||||||
|
kInitialLevel0TotalSize / options->write_buffer_size + 1; |
||||||
|
|
||||||
|
if (level0_stop_writes_trigger > file_num_buffer) { |
||||||
|
// Have sufficient room for multiple level 0 files
|
||||||
|
// Try enlarge the buffer up to 1GB
|
||||||
|
|
||||||
|
// Try to enlarge the buffer up to 1GB, if still have sufficient headroom.
|
||||||
|
file_num_buffer *= |
||||||
|
std::pow(2, std::max(0, std::min(3, level0_stop_writes_trigger - |
||||||
|
file_num_buffer - 2))); |
||||||
|
|
||||||
|
options->level0_stop_writes_trigger = level0_stop_writes_trigger; |
||||||
|
options->level0_slowdown_writes_trigger = level0_stop_writes_trigger - 2; |
||||||
|
options->level0_file_num_compaction_trigger = |
||||||
|
std::min(kMaxFileNumCompactionTrigger, file_num_buffer / 2); |
||||||
|
} else { |
||||||
|
options->level0_stop_writes_trigger = |
||||||
|
std::max(kMinLevel0StopTrigger, file_num_buffer); |
||||||
|
options->level0_slowdown_writes_trigger = |
||||||
|
options->level0_stop_writes_trigger - 1; |
||||||
|
options->level0_file_num_compaction_trigger = 1; |
||||||
|
} |
||||||
|
|
||||||
|
// This doesn't consider compaction and overheads of mem tables. But usually
|
||||||
|
// it is in the same order of magnitude.
|
||||||
|
int expected_level0_compaction_size = |
||||||
|
options->level0_file_num_compaction_trigger * options->write_buffer_size; |
||||||
|
// Enlarge level1 target file size if level0 compaction size is larger.
|
||||||
|
int max_bytes_for_level_base = 10 * kBytesForOneMb; |
||||||
|
if (expected_level0_compaction_size > max_bytes_for_level_base) { |
||||||
|
max_bytes_for_level_base = expected_level0_compaction_size; |
||||||
|
} |
||||||
|
options->max_bytes_for_level_base = max_bytes_for_level_base; |
||||||
|
// Now always set level multiplier to be 10
|
||||||
|
options->max_bytes_for_level_multiplier = kBytesForLevelMultiplier; |
||||||
|
|
||||||
|
const int kMinFileSize = 2 * kBytesForOneMb; |
||||||
|
// Allow at least 3-way parallelism for compaction between level 1 and 2.
|
||||||
|
int max_file_size = max_bytes_for_level_base / 3; |
||||||
|
if (max_file_size < kMinFileSize) { |
||||||
|
options->target_file_size_base = kMinFileSize; |
||||||
|
} else { |
||||||
|
if (max_file_size % kBytesForOneMb != 0) { |
||||||
|
max_file_size = (max_file_size / kBytesForOneMb + 1) * kBytesForOneMb; |
||||||
|
} |
||||||
|
options->target_file_size_base = max_file_size; |
||||||
|
} |
||||||
|
|
||||||
|
// TODO: consider to tune num_levels too.
|
||||||
|
} |
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// Builds an Options object from four user-supplied targets.
//
//   total_write_buffer_limit      - memory budget for all mem tables, bytes.
//   read_amplification_threshold  - acceptable read amplification.
//   write_amplification_threshold - acceptable write amplification.
//   target_db_size                - expected total DB size, bytes.
//
// The mem table size is picked first, then the compaction style, and finally
// the style-specific parameters are filled in.
Options GetOptions(size_t total_write_buffer_limit,
                   int read_amplification_threshold,
                   int write_amplification_threshold, uint64_t target_db_size) {
  Options tuned;
  PickWriteBufferSize(total_write_buffer_limit, &tuned);

  tuned.compaction_style = PickCompactionStyle(
      tuned.write_buffer_size, read_amplification_threshold,
      write_amplification_threshold, target_db_size);

  const bool use_universal =
      (tuned.compaction_style == kCompactionStyleUniversal);
  if (!use_universal) {
    OptimizeForLevel(read_amplification_threshold,
                     write_amplification_threshold, target_db_size, &tuned);
  } else {
    OptimizeForUniversal(&tuned);
  }
  return tuned;
}
||||||
|
|
||||||
|
} // namespace rocksdb
|
@ -0,0 +1,80 @@ |
|||||||
|
// Copyright (c) 2013, Facebook, Inc. All rights reserved.
|
||||||
|
// This source code is licensed under the BSD-style license found in the
|
||||||
|
// LICENSE file in the root directory of this source tree. An additional grant
|
||||||
|
// of patent rights can be found in the PATENTS file in the same directory.
|
||||||
|
//
|
||||||
|
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
|
||||||
|
// Use of this source code is governed by a BSD-style license that can be
|
||||||
|
// found in the LICENSE file. See the AUTHORS file for names of contributors.
|
||||||
|
|
||||||
|
#define __STDC_FORMAT_MACROS |
||||||
|
#include <inttypes.h> |
||||||
|
#include <gflags/gflags.h> |
||||||
|
|
||||||
|
#include "rocksdb/options.h" |
||||||
|
#include "util/testharness.h" |
||||||
|
|
||||||
|
using GFLAGS::ParseCommandLineFlags; |
||||||
|
DEFINE_bool(enable_print, false, "Print options generated to console."); |
||||||
|
|
||||||
|
namespace rocksdb { |
||||||
|
|
||||||
|
// Empty fixture type; it only names the test group used by TEST() below.
class OptionsTest {};
||||||
|
|
||||||
|
class StderrLogger : public Logger { |
||||||
|
public: |
||||||
|
virtual void Logv(const char* format, va_list ap) override { |
||||||
|
vprintf(format, ap); |
||||||
|
printf("\n"); |
||||||
|
} |
||||||
|
}; |
||||||
|
|
||||||
|
// Calls GetOptions() with the given targets and returns its result. When the
// --enable_print flag is set, the inputs are printed before the call and the
// generated options are dumped after it.
Options PrintAndGetOptions(size_t total_write_buffer_limit,
                           int read_amplification_threshold,
                           int write_amplification_threshold,
                           uint64_t target_db_size = 68719476736) {
  StderrLogger logger;
  const bool verbose = FLAGS_enable_print;

  if (verbose) {
    printf(
        "---- total_write_buffer_limit: %zu "
        "read_amplification_threshold: %d write_amplification_threshold: %d "
        "target_db_size %" PRIu64 " ----\n",
        total_write_buffer_limit, read_amplification_threshold,
        write_amplification_threshold, target_db_size);
  }

  Options generated =
      GetOptions(total_write_buffer_limit, read_amplification_threshold,
                 write_amplification_threshold, target_db_size);

  if (verbose) {
    generated.Dump(&logger);
    printf("-------------------------------------\n\n\n");
  }
  return generated;
}
||||||
|
|
||||||
|
// Runs GetOptions() over a handful of sample scenarios. Only the two
// scenarios with a single tight threshold assert on the chosen compaction
// style; the others just exercise the code (and print, if enabled).
TEST(OptionsTest, LooseCondition) {
  const size_t kMb = 1024 * 1024;
  Options options;

  // Loose thresholds with a 10GB mem table budget.
  PrintAndGetOptions(10 * 1024 * kMb, 100, 100);

  // Less mem table memory budget
  PrintAndGetOptions(32 * kMb, 100, 100);

  // Tight read amplification
  options = PrintAndGetOptions(128 * kMb, 8, 100);
  ASSERT_EQ(options.compaction_style, kCompactionStyleLevel);

  // Tight write amplification
  options = PrintAndGetOptions(128 * kMb, 64, 10);
  ASSERT_EQ(options.compaction_style, kCompactionStyleUniversal);

  // Both tight amplifications
  PrintAndGetOptions(128 * kMb, 4, 8);
}
||||||
|
} // namespace rocksdb
|
||||||
|
|
||||||
|
int main(int argc, char** argv) { |
||||||
|
ParseCommandLineFlags(&argc, &argv, true); |
||||||
|
return rocksdb::test::RunAllTests(); |
||||||
|
} |
Loading…
Reference in new issue