From 4b296512060e692842e45cbb09408956c530a737 Mon Sep 17 00:00:00 2001 From: heyongqiang Date: Wed, 15 May 2013 10:34:02 -0700 Subject: [PATCH] add block deviation option to terminate a block before it exceeds block_size Summary: a new option block_size_deviation is added. Test Plan: run db_test and db_bench Reviewers: dhruba, haobo Reviewed By: haobo Differential Revision: https://reviews.facebook.net/D10821 --- db/db_impl.cc | 3 +++ include/leveldb/options.h | 8 ++++++++ table/block_builder.cc | 15 +++++++++++++++ table/block_builder.h | 3 +++ table/table_builder.cc | 21 ++++++++++++++++----- util/options.cc | 8 +++++++- 6 files changed, 52 insertions(+), 6 deletions(-) diff --git a/db/db_impl.cc b/db/db_impl.cc index 563ed99eb..77956409a 100644 --- a/db/db_impl.cc +++ b/db/db_impl.cc @@ -138,6 +138,9 @@ Options SanitizeOptions(const std::string& dbname, result.block_cache = NewLRUCache(8 << 20); } result.compression_per_level = src.compression_per_level; + if (result.block_size_deviation < 0 || result.block_size_deviation > 100) { + result.block_size_deviation = 0; + } return result; } diff --git a/include/leveldb/options.h b/include/leveldb/options.h index f5fcdbf99..206547102 100644 --- a/include/leveldb/options.h +++ b/include/leveldb/options.h @@ -433,6 +433,14 @@ struct Options { // if not zero, dump leveldb.stats to LOG every stats_dump_period_sec // Default: 3600 (1 hour) unsigned int stats_dump_period_sec; + + // This is used to close a block before it reaches the configured + // 'block_size'. If the percentage of free space in the current block is less + // than this specified number and adding a new record to the block will + // exceed the configured block size, then this block will be closed and the + // new record will be written to the next block. + // Default is 10. 
+ int block_size_deviation; }; // Options that control read operations diff --git a/table/block_builder.cc b/table/block_builder.cc index db660cd07..2e195223c 100644 --- a/table/block_builder.cc +++ b/table/block_builder.cc @@ -60,6 +60,21 @@ size_t BlockBuilder::CurrentSizeEstimate() const { sizeof(uint32_t)); // Restart array length } +size_t BlockBuilder::EstimateSizeAfterKV(const Slice& key, const Slice& value) + const { + size_t estimate = CurrentSizeEstimate(); + estimate += key.size() + value.size(); + if (counter_ >= options_->block_restart_interval) { + estimate += sizeof(uint32_t); // a new restart entry. + } + + estimate += sizeof(int32_t); // varint for shared prefix length. + estimate += VarintLength(key.size()); // varint for key length. + estimate += VarintLength(value.size()); // varint for value length. + + return estimate; +} + Slice BlockBuilder::Finish() { // Append restart array for (size_t i = 0; i < restarts_.size(); i++) { diff --git a/table/block_builder.h b/table/block_builder.h index 5b545bd1a..bfab3a394 100644 --- a/table/block_builder.h +++ b/table/block_builder.h @@ -34,6 +34,9 @@ class BlockBuilder { // we are building. size_t CurrentSizeEstimate() const; + // Returns an estimated block size after appending key and value. 
+ size_t EstimateSizeAfterKV(const Slice& key, const Slice& value) const; + // Return true iff no entries have been added since the last Reset() bool empty() const { return buffer_.empty(); diff --git a/table/table_builder.cc b/table/table_builder.cc index 77f145456..8b81d2374 100644 --- a/table/table_builder.cc +++ b/table/table_builder.cc @@ -98,6 +98,22 @@ void TableBuilder::Add(const Slice& key, const Slice& value) { assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0); } + const size_t curr_size = r->data_block.CurrentSizeEstimate(); + const size_t estimated_size_after = r->data_block.EstimateSizeAfterKV(key, + value); + // Do flush if one of the below two conditions is true: + // 1) if the current estimated size already exceeds the block size, + // 2) block_size_deviation is set and the estimated size after appending + // the kv will exceed the block size and the free space left in the + // current block is under the deviation. + if (curr_size >= r->options.block_size || + (estimated_size_after > r->options.block_size && + r->options.block_size_deviation > 0 && + (curr_size * 100) > + r->options.block_size * (100 - r->options.block_size_deviation))) { + Flush(); + } + if (r->pending_index_entry) { assert(r->data_block.empty()); r->options.comparator->FindShortestSeparator(&r->last_key, key); @@ -114,11 +130,6 @@ void TableBuilder::Add(const Slice& key, const Slice& value) { r->last_key.assign(key.data(), key.size()); r->num_entries++; r->data_block.Add(key, value); - - const size_t estimated_block_size = r->data_block.CurrentSizeEstimate(); - if (estimated_block_size >= r->options.block_size) { - Flush(); - } } void TableBuilder::Flush() { diff --git a/util/options.cc b/util/options.cc index 2c4bd6370..2671f4611 100644 --- a/util/options.cc +++ b/util/options.cc @@ -70,7 +70,8 @@ Options::Options() allow_mmap_writes(true), is_fd_close_on_exec(true), skip_log_error_on_recovery(false), - stats_dump_period_sec(3600) { + stats_dump_period_sec(3600), + 
block_size_deviation (10) { } void @@ -191,10 +192,12 @@ Options::Dump(Logger* log) const allow_mmap_writes); Log(log," Options.is_fd_close_on_exec: %d", is_fd_close_on_exec); Log(log," Options.skip_log_error_on_recovery: %d", skip_log_error_on_recovery); Log(log," Options.stats_dump_period_sec: %d", stats_dump_period_sec); + Log(log," Options.block_size_deviation: %d", + block_size_deviation); } // Options::Dump //