Merge pull request #803 from SherlockNoMad/SkipFlush

Add Option to Skip Flushing in TableBuilder
Islam AbdelRahman 9 years ago
commit f31442fb5c
14 changed files (lines changed in parentheses):
  1. db/c.cc (5)
  2. db/db_bench.cc (9)
  3. include/rocksdb/c.h (3)
  4. include/rocksdb/env.h (3)
  5. include/rocksdb/options.h (8)
  6. include/rocksdb/table.h (14)
  7. table/block_based_table_builder.cc (2)
  8. table/block_based_table_factory.cc (4)
  9. util/env.cc (2)
  10. util/file_reader_writer.cc (13)
  11. util/file_reader_writer.h (2)
  12. util/options.cc (6)
  13. util/options_helper.h (6)
  14. util/options_test.cc (6)

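Taken together, the diff adds two knobs: BlockBasedTableOptions::skip_table_builder_flush and DBOptions::writable_file_max_buffer_size. A minimal end-to-end sketch of enabling both (path and values are illustrative, not from the PR):

#include "rocksdb/db.h"
#include "rocksdb/options.h"
#include "rocksdb/table.h"

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;
  // Cap WritableFileWriter's internal buffer at 1 MB (the new DBOptions knob).
  options.writable_file_max_buffer_size = 1024 * 1024;

  rocksdb::BlockBasedTableOptions table_options;
  // Skip the explicit per-block flush; let the writer buffer absorb writes.
  table_options.skip_table_builder_flush = true;
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));

  rocksdb::DB* db = nullptr;
  rocksdb::Status s = rocksdb::DB::Open(options, "/tmp/skipflush_demo", &db);
  if (s.ok()) {
    delete db;
  }
  return 0;
}
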
@@ -1288,6 +1288,11 @@ void rocksdb_block_based_options_set_cache_index_and_filter_blocks(
options->rep.cache_index_and_filter_blocks = v;
}
void rocksdb_block_based_options_set_skip_table_builder_flush(
rocksdb_block_based_table_options_t* options, unsigned char v) {
options->rep.skip_table_builder_flush = v;
}
void rocksdb_options_set_block_based_table_factory(
rocksdb_options_t *opt,
rocksdb_block_based_table_options_t* table_options) {

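The C API gains a matching setter. A small usage sketch, callable from C or C++ (the helper function name is ours, not part of the PR):

#include "rocksdb/c.h"

void enable_skip_flush(rocksdb_options_t* opts) {
  rocksdb_block_based_table_options_t* table_opts =
      rocksdb_block_based_options_create();
  /* unsigned char toggle: 1 == true */
  rocksdb_block_based_options_set_skip_table_builder_flush(table_opts, 1);
  /* the factory copies the table options, so they can be destroyed after */
  rocksdb_options_set_block_based_table_factory(opts, table_opts);
  rocksdb_block_based_options_destroy(table_opts);
}
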
@@ -376,6 +376,12 @@ DEFINE_int32(compaction_readahead_size, 0, "Compaction readahead size");
DEFINE_int32(random_access_max_buffer_size, 1024 * 1024,
"Maximum Windows random access buffer size");
DEFINE_int32(writable_file_max_buffer_size, 1024 * 1024,
"Maximum write buffer for Writable File");
DEFINE_bool(skip_table_builder_flush, false,
"Skip flushing block in table builder");
DEFINE_int32(bloom_bits, -1, "Bloom filter bits per key. Negative means"
" use default settings.");
DEFINE_int32(memtable_bloom_bits, 0, "Bloom filter bits per key for memtable. "
@@ -2299,6 +2305,7 @@ class Benchmark {
FLAGS_new_table_reader_for_compaction_inputs;
options.compaction_readahead_size = FLAGS_compaction_readahead_size;
options.random_access_max_buffer_size = FLAGS_random_access_max_buffer_size;
options.writable_file_max_buffer_size = FLAGS_writable_file_max_buffer_size;
options.statistics = dbstats;
if (FLAGS_enable_io_prio) {
FLAGS_env->LowerThreadPoolIOPriority(Env::LOW);
@@ -2441,6 +2448,8 @@ class Benchmark {
block_based_options.block_size = FLAGS_block_size;
block_based_options.block_restart_interval = FLAGS_block_restart_interval;
block_based_options.filter_policy = filter_policy_;
block_based_options.skip_table_builder_flush =
FLAGS_skip_table_builder_flush;
block_based_options.format_version = 2;
options.table_factory.reset(
NewBlockBasedTableFactory(block_based_options));

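With the flags wired into db_bench above, the change can be exercised directly; an illustrative invocation (other flags as needed):

./db_bench --benchmarks=fillrandom --skip_table_builder_flush=true --writable_file_max_buffer_size=1048576
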
@@ -450,6 +450,9 @@ rocksdb_block_based_options_set_hash_index_allow_collision(
extern ROCKSDB_LIBRARY_API void
rocksdb_block_based_options_set_cache_index_and_filter_blocks(
rocksdb_block_based_table_options_t*, unsigned char);
extern ROCKSDB_LIBRARY_API void
rocksdb_block_based_options_set_skip_table_builder_flush(
rocksdb_block_based_table_options_t* options, unsigned char);
extern ROCKSDB_LIBRARY_API void rocksdb_options_set_block_based_table_factory(
rocksdb_options_t* opt, rocksdb_block_based_table_options_t* table_options);

@@ -94,6 +94,9 @@ struct EnvOptions {
// See DBOptions doc
size_t random_access_max_buffer_size;
// See DBOptions doc
size_t writable_file_max_buffer_size = 1024 * 1024;
// If not nullptr, write rate limiting is enabled for flush and compaction
RateLimiter* rate_limiter = nullptr;
};

@@ -1089,6 +1089,14 @@ struct DBOptions {
// Default: 1 MB
size_t random_access_max_buffer_size;
// This is the maximum buffer size that is used by WritableFileWriter.
// On Windows, we need to maintain an aligned buffer for writes.
// We allow the buffer to grow until its size hits the limit.
//
// Default: 1024 * 1024 (1 MB)
size_t writable_file_max_buffer_size;
// Use adaptive mutex, which spins in the user space before resorting
// to kernel. This could reduce context switch when the mutex is not
// heavily contended. However, if the mutex is hot, we could end up

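Note that the new DBOptions field is a cap, not a fixed size: the writer's buffer still starts small and doubles on demand (see the util/file_reader_writer.cc hunks below). A sketch of raising the cap for workloads that favor fewer, larger writes (the 4 MB figure is illustrative):

#include "rocksdb/options.h"

rocksdb::Options MakeWriteTunedOptions() {
  rocksdb::Options options;
  // Raise the cap to 4 MB: fewer, larger file writes at the cost of memory.
  options.writable_file_max_buffer_size = 4 * 1024 * 1024;
  return options;
}
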
@@ -128,6 +128,20 @@ struct BlockBasedTableOptions {
// This must generally be true for gets to be efficient.
bool whole_key_filtering = true;
// If true, blocks will not be explicitly flushed to disk while building
// an SST table. Instead, the buffer in WritableFileWriter will take
// care of flushing when it is full.
//
// On Windows, this option helps a lot when unbuffered I/O
// (allow_os_buffer = false) is used, since it avoids small
// unbuffered disk writes.
//
// Users may also adjust writable_file_max_buffer_size to optimize disk
// I/O size.
//
// Default: false
bool skip_table_builder_flush = false;
// We currently have three versions:
// 0 -- This version is currently written out by all RocksDB's versions by
// default. Can be read by really old RocksDB's. Doesn't support changing

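As the comment says, the win is largest with unbuffered I/O on Windows. A combined configuration sketch; allow_os_buffer is the pre-existing DBOptions field referenced in the comment (later RocksDB versions replace it with direct-I/O options):

#include "rocksdb/options.h"
#include "rocksdb/table.h"

rocksdb::Options MakeUnbufferedOptions() {
  rocksdb::Options options;
  options.allow_os_buffer = false;  // unbuffered I/O (Windows); pre-5.x field
  options.writable_file_max_buffer_size = 1024 * 1024;

  rocksdb::BlockBasedTableOptions table_options;
  table_options.skip_table_builder_flush = true;  // avoid small unbuffered writes
  options.table_factory.reset(
      rocksdb::NewBlockBasedTableFactory(table_options));
  return options;
}
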
@@ -592,7 +592,7 @@ void BlockBasedTableBuilder::Flush() {
if (!ok()) return;
if (r->data_block.empty()) return;
WriteBlock(&r->data_block, &r->pending_handle);
- if (ok()) {
+ if (ok() && !r->table_options.skip_table_builder_flush) {
r->status = r->file->Flush();
}
if (r->filter_block != nullptr) {

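Note that the guard above skips only the explicit file flush: WriteBlock() still hands each block to the WritableFileWriter, so with skip_table_builder_flush enabled the data simply accumulates in the writer's buffer until it fills (up to writable_file_max_buffer_size) or the file is synced or closed.
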
@@ -152,6 +152,10 @@ std::string BlockBasedTableFactory::GetPrintableTableOptions() const {
ret.append(buffer);
snprintf(buffer, kBufferSize, " whole_key_filtering: %d\n",
table_options_.whole_key_filtering);
ret.append(buffer);
snprintf(buffer, kBufferSize, " skip_table_builder_flush: %d\n",
table_options_.skip_table_builder_flush);
ret.append(buffer);
snprintf(buffer, kBufferSize, " format_version: %d\n",
table_options_.format_version);
ret.append(buffer);

@@ -296,6 +296,8 @@ void AssignEnvOptions(EnvOptions* env_options, const DBOptions& options) {
env_options->random_access_max_buffer_size =
options.random_access_max_buffer_size;
env_options->rate_limiter = options.rate_limiter.get();
env_options->writable_file_max_buffer_size =
options.writable_file_max_buffer_size;
env_options->allow_fallocate = options.allow_fallocate;
}

@@ -21,10 +21,6 @@
namespace rocksdb {
- namespace {
- const size_t c_OneMb = (1 << 20);
- }
Status SequentialFileReader::Read(size_t n, Slice* result, char* scratch) {
Status s = file_->Read(n, result, scratch);
IOSTATS_ADD(bytes_read, result->size());
@@ -76,9 +72,9 @@ Status WritableFileWriter::Append(const Slice& data) {
}
}
- if (buf_.Capacity() < c_OneMb) {
+ if (buf_.Capacity() < max_buffer_size_) {
size_t desiredCapacity = buf_.Capacity() * 2;
- desiredCapacity = std::min(desiredCapacity, c_OneMb);
+ desiredCapacity = std::min(desiredCapacity, max_buffer_size_);
buf_.AllocateNewBuffer(desiredCapacity);
}
assert(buf_.CurrentSize() == 0);
@@ -102,9 +98,9 @@ Status WritableFileWriter::Append(const Slice& data) {
// We double the buffer here because
// Flush calls do not keep up with the incoming bytes.
// This is the only place where the buffer is resized with unbuffered I/O.
- if (buf_.Capacity() < c_OneMb) {
+ if (buf_.Capacity() < max_buffer_size_) {
size_t desiredCapacity = buf_.Capacity() * 2;
- desiredCapacity = std::min(desiredCapacity, c_OneMb);
+ desiredCapacity = std::min(desiredCapacity, max_buffer_size_);
buf_.AllocateNewBuffer(desiredCapacity);
}
}
@@ -156,7 +152,6 @@ Status WritableFileWriter::Close() {
return s;
}
// write out the cached data to the OS cache
Status WritableFileWriter::Flush() {
Status s;

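The two Append() hunks above apply the same growth rule, now capped by the configurable limit instead of the hard-coded c_OneMb. Restated as a standalone helper (hypothetical, for illustration only):

#include <algorithm>
#include <cstddef>

// Double the capacity on each growth step, never exceeding the cap
// (EnvOptions::writable_file_max_buffer_size).
size_t NextBufferCapacity(size_t current, size_t max_buffer_size) {
  size_t desired = current * 2;
  return std::min(desired, max_buffer_size);
}
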
@@ -93,6 +93,7 @@ class WritableFileWriter {
private:
std::unique_ptr<WritableFile> writable_file_;
AlignedBuffer buf_;
size_t max_buffer_size_;
// Size of the data actually written, excluding padding;
// can be used for truncation.
uint64_t filesize_;
@@ -113,6 +114,7 @@
const EnvOptions& options)
: writable_file_(std::move(file)),
buf_(),
max_buffer_size_(options.writable_file_max_buffer_size),
filesize_(0),
next_write_offset_(0),
pending_sync_(false),

@@ -251,6 +251,7 @@ DBOptions::DBOptions()
new_table_reader_for_compaction_inputs(false),
compaction_readahead_size(0),
random_access_max_buffer_size(1024 * 1024),
writable_file_max_buffer_size(1024 * 1024),
use_adaptive_mutex(false),
bytes_per_sync(0),
wal_bytes_per_sync(0),
@@ -313,6 +314,7 @@ DBOptions::DBOptions(const Options& options)
options.new_table_reader_for_compaction_inputs),
compaction_readahead_size(options.compaction_readahead_size),
random_access_max_buffer_size(options.random_access_max_buffer_size),
writable_file_max_buffer_size(options.writable_file_max_buffer_size),
use_adaptive_mutex(options.use_adaptive_mutex),
bytes_per_sync(options.bytes_per_sync),
wal_bytes_per_sync(options.wal_bytes_per_sync),
@@ -412,6 +414,10 @@ void DBOptions::Dump(Logger* log) const {
" Options.random_access_max_buffer_size: %" ROCKSDB_PRIszt,
random_access_max_buffer_size);
Header(log,
" Options.writable_file_max_buffer_size: %" ROCKSDB_PRIszt,
writable_file_max_buffer_size);
Header(log, " Options.use_adaptive_mutex: %d",
use_adaptive_mutex);
Header(log, " Options.rate_limiter: %p",

@@ -184,6 +184,9 @@ static std::unordered_map<std::string, OptionTypeInfo> db_options_type_info = {
{"random_access_max_buffer_size",
{offsetof(struct DBOptions, random_access_max_buffer_size),
OptionType::kSizeT, OptionVerificationType::kNormal}},
{"writable_file_max_buffer_size",
{offsetof(struct DBOptions, writable_file_max_buffer_size),
OptionType::kSizeT, OptionVerificationType::kNormal}},
{"use_adaptive_mutex",
{offsetof(struct DBOptions, use_adaptive_mutex), OptionType::kBoolean,
OptionVerificationType::kNormal}},
@@ -460,6 +463,9 @@ static std::unordered_map<std::string,
{"whole_key_filtering",
{offsetof(struct BlockBasedTableOptions, whole_key_filtering),
OptionType::kBoolean, OptionVerificationType::kNormal}},
{"skip_table_builder_flush",
{offsetof(struct BlockBasedTableOptions, skip_table_builder_flush),
OptionType::kBoolean, OptionVerificationType::kNormal}},
{"format_version",
{offsetof(struct BlockBasedTableOptions, format_version),
OptionType::kUInt32T, OptionVerificationType::kNormal}}};

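Registering the fields in these maps makes them settable from option strings, which is what the test below exercises. A usage sketch (the convenience header's location has moved between RocksDB releases):

#include <cassert>
#include "rocksdb/convenience.h"  // older releases: rocksdb/utilities/convenience.h
#include "rocksdb/table.h"

void ParseExample() {
  rocksdb::BlockBasedTableOptions base, parsed;
  rocksdb::Status s = rocksdb::GetBlockBasedTableOptionsFromString(
      base, "skip_table_builder_flush=1", &parsed);
  assert(s.ok() && parsed.skip_table_builder_flush);
}
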
@@ -341,6 +341,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
{"new_table_reader_for_compaction_inputs", "true"},
{"compaction_readahead_size", "100"},
{"random_access_max_buffer_size", "3145728"},
{"writable_file_max_buffer_size", "314159"},
{"bytes_per_sync", "47"},
{"wal_bytes_per_sync", "48"},
};
@@ -452,6 +453,7 @@ TEST_F(OptionsTest, GetOptionsFromMapTest) {
ASSERT_EQ(new_db_opt.new_table_reader_for_compaction_inputs, true);
ASSERT_EQ(new_db_opt.compaction_readahead_size, 100);
ASSERT_EQ(new_db_opt.random_access_max_buffer_size, 3145728);
ASSERT_EQ(new_db_opt.writable_file_max_buffer_size, 314159);
ASSERT_EQ(new_db_opt.bytes_per_sync, static_cast<uint64_t>(47));
ASSERT_EQ(new_db_opt.wal_bytes_per_sync, static_cast<uint64_t>(48));
}
@@ -621,7 +623,8 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
"checksum=kxxHash;hash_index_allow_collision=1;no_block_cache=1;"
"block_cache=1M;block_cache_compressed=1k;block_size=1024;"
"block_size_deviation=8;block_restart_interval=4;"
"filter_policy=bloomfilter:4:true;whole_key_filtering=1",
"filter_policy=bloomfilter:4:true;whole_key_filtering=1;"
"skip_table_builder_flush=1",
&new_opt));
ASSERT_TRUE(new_opt.cache_index_and_filter_blocks);
ASSERT_EQ(new_opt.index_type, BlockBasedTableOptions::kHashSearch);
@@ -636,6 +639,7 @@ TEST_F(OptionsTest, GetBlockBasedTableOptionsFromString) {
ASSERT_EQ(new_opt.block_size_deviation, 8);
ASSERT_EQ(new_opt.block_restart_interval, 4);
ASSERT_TRUE(new_opt.filter_policy != nullptr);
ASSERT_TRUE(new_opt.skip_table_builder_flush);
// unknown option
ASSERT_NOK(GetBlockBasedTableOptionsFromString(table_opt,
