Apply use_direct_io_for_flush_and_compaction to writes only

Summary:
Previously `DBOptions::use_direct_io_for_flush_and_compaction=true` combined with `DBOptions::use_direct_reads=false` could cause RocksDB to simultaneously read from two file descriptors for the same file, where background reads used direct I/O and foreground reads used buffered I/O. Our measurements found this mixed-mode I/O negatively impacted foreground read performance, compared to when only buffered I/O was used.

This PR makes the mixed-mode I/O situation impossible by repurposing `DBOptions::use_direct_io_for_flush_and_compaction` to only apply to background writes, and `DBOptions::use_direct_reads` to apply to all reads. There is no risk of direct background writes happening simultaneously with buffered reads since we never read from and write to the same file simultaneously.
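To make the new split concrete, here is a minimal sketch of opting into direct I/O on both paths under the post-PR semantics (the database path and error handling are illustrative, not part of this change):

```cpp
#include <rocksdb/db.h>
#include <rocksdb/options.h>

int main() {
  rocksdb::Options options;
  options.create_if_missing = true;

  // After this PR: applies to all reads, both user reads and background
  // compaction reads (previously compaction reads followed
  // use_direct_io_for_flush_and_compaction).
  options.use_direct_reads = true;

  // After this PR: applies only to background writes, i.e. flush and
  // compaction output files.
  options.use_direct_io_for_flush_and_compaction = true;

  // Side effects applied by SanitizeOptions() when use_direct_reads=true
  // (see the db/db_impl_open.cc hunk below): compaction_readahead_size
  // defaults to 2MB if unset, and new_table_reader_for_compaction_inputs
  // is forced to true.

  rocksdb::DB* db = nullptr;
  rocksdb::Status s =
      rocksdb::DB::Open(options, "/tmp/direct_io_demo" /* illustrative */, &db);
  if (!s.ok()) {
    return 1;  // O_DIRECT can fail on filesystems that don't support it.
  }
  delete db;
  return 0;
}
```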
Closes https://github.com/facebook/rocksdb/pull/3829

Differential Revision: D7915443

Pulled By: ajkr

fbshipit-source-id: 78bcbf276449b7e7766ab6b0db246f789fb1b279
Author: Andrew Kryczka (committed by Facebook GitHub Bot)
Commit: 072ae671a7
Parent: d19f568abf
Files changed:
  HISTORY.md (1 line changed)
  db/db_compaction_test.cc (4 lines changed)
  db/db_impl_open.cc (4 lines changed)
  db/db_test2.cc (5 lines changed)
  env/env.cc (3 lines changed)
  include/rocksdb/options.h (6 lines changed)
  tools/db_bench_tool.cc (2 lines changed)
  util/file_reader_writer.cc (2 lines changed)

HISTORY.md
@@ -6,6 +6,7 @@
 * Add a new ticker stat rocksdb.number.multiget.keys.found to count number of keys successfully read in MultiGet calls
 * Touch-up to write-related counters in PerfContext. New counters added: write_scheduling_flushes_compactions_time, write_thread_wait_nanos. Counters whose behavior was fixed or modified: write_memtable_time, write_pre_and_post_process_time, write_delay_time.
 * Posix Env's NewRandomRWFile() will fail if the file doesn't exist.
+* Now, `DBOptions::use_direct_io_for_flush_and_compaction` only applies to background writes, and `DBOptions::use_direct_reads` applies to both user reads and background reads. This conforms with Linux's `open(2)` manpage, which advises against simultaneously reading a file in buffered and direct modes, due to possibly undefined behavior and degraded performance.
 
 ### New Features
 * Introduce TTL for level compaction so that all files older than ttl go through the compaction process to get rid of old data.

db/db_compaction_test.cc
@@ -3679,7 +3679,7 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) {
       "TableCache::NewIterator:for_compaction", [&](void* arg) {
         bool* use_direct_reads = static_cast<bool*>(arg);
         ASSERT_EQ(*use_direct_reads,
-                  options.use_direct_io_for_flush_and_compaction);
+                  options.use_direct_reads);
       });
   SyncPoint::GetInstance()->SetCallBack(
       "CompactionJob::OpenCompactionOutputFile", [&](void* arg) {
@@ -3698,7 +3698,7 @@ TEST_P(DBCompactionDirectIOTest, DirectIO) {
   MakeTables(3, "p", "q", 1);
   ASSERT_EQ("1,1,1", FilesPerLevel(1));
   Compact(1, "p1", "p9");
-  ASSERT_FALSE(readahead ^ options.use_direct_io_for_flush_and_compaction);
+  ASSERT_EQ(readahead, options.use_direct_reads);
   ASSERT_EQ("0,0,1", FilesPerLevel(1));
   Destroy(options);
   delete options.env;

db/db_impl_open.cc
@@ -106,14 +106,14 @@ DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src) {
     result.db_paths.emplace_back(dbname, std::numeric_limits<uint64_t>::max());
   }
-  if (result.use_direct_io_for_flush_and_compaction &&
+  if (result.use_direct_reads &&
       result.compaction_readahead_size == 0) {
     TEST_SYNC_POINT_CALLBACK("SanitizeOptions:direct_io", nullptr);
     result.compaction_readahead_size = 1024 * 1024 * 2;
   }
   if (result.compaction_readahead_size > 0 ||
-      result.use_direct_io_for_flush_and_compaction) {
+      result.use_direct_reads) {
     result.new_table_reader_for_compaction_inputs = true;
   }

db/db_test2.cc
@@ -2296,7 +2296,8 @@ TEST_F(DBTest2, RateLimitedCompactionReads) {
                              kBytesPerKey) /* rate_bytes_per_sec */,
         10 * 1000 /* refill_period_us */, 10 /* fairness */,
         RateLimiter::Mode::kReadsOnly));
-    options.use_direct_io_for_flush_and_compaction = use_direct_io;
+    options.use_direct_reads = options.use_direct_io_for_flush_and_compaction =
+        use_direct_io;
     BlockBasedTableOptions bbto;
     bbto.block_size = 16384;
     bbto.no_block_cache = true;
@@ -2318,7 +2319,7 @@ TEST_F(DBTest2, RateLimitedCompactionReads) {
     // chose 1MB as the upper bound on the total bytes read.
     size_t rate_limited_bytes =
         options.rate_limiter->GetTotalBytesThrough(Env::IO_LOW);
-    // Include the explict prefetch of the footer in direct I/O case.
+    // Include the explicit prefetch of the footer in direct I/O case.
     size_t direct_io_extra = use_direct_io ? 512 * 1024 : 0;
    ASSERT_GE(rate_limited_bytes,
              static_cast<size_t>(kNumKeysPerFile * kBytesPerKey * kNumL0Files +

env/env.cc
@@ -404,8 +404,7 @@ EnvOptions Env::OptimizeForCompactionTableWrite(
 EnvOptions Env::OptimizeForCompactionTableRead(
     const EnvOptions& env_options, const ImmutableDBOptions& db_options) const {
   EnvOptions optimized_env_options(env_options);
-  optimized_env_options.use_direct_reads =
-      db_options.use_direct_io_for_flush_and_compaction;
+  optimized_env_options.use_direct_reads = db_options.use_direct_reads;
   return optimized_env_options;
 }

include/rocksdb/options.h
@@ -604,13 +604,13 @@ struct DBOptions {
   // buffered. The hardware buffer of the devices may however still
   // be used. Memory mapped files are not impacted by these parameters.
 
-  // Use O_DIRECT for user reads
+  // Use O_DIRECT for user and compaction reads.
+  // When true, we also force new_table_reader_for_compaction_inputs to true.
   // Default: false
   // Not supported in ROCKSDB_LITE mode!
   bool use_direct_reads = false;
 
-  // Use O_DIRECT for both reads and writes in background flush and compactions
-  // When true, we also force new_table_reader_for_compaction_inputs to true.
+  // Use O_DIRECT for writes in background flush and compactions.
   // Default: false
   // Not supported in ROCKSDB_LITE mode!
   bool use_direct_io_for_flush_and_compaction = false;

tools/db_bench_tool.cc
@@ -920,7 +920,7 @@ DEFINE_bool(use_direct_reads, rocksdb::Options().use_direct_reads,
 DEFINE_bool(use_direct_io_for_flush_and_compaction,
             rocksdb::Options().use_direct_io_for_flush_and_compaction,
-            "Use O_DIRECT for background flush and compaction I/O");
+            "Use O_DIRECT for background flush and compaction writes");
 DEFINE_bool(advise_random_on_open, rocksdb::Options().advise_random_on_open,
             "Advise random access on table file open");

util/file_reader_writer.cc
@@ -88,7 +88,7 @@ Status RandomAccessFileReader::Read(uint64_t offset, size_t n, Slice* result,
       buf.AllocateNewBuffer(read_size);
       while (buf.CurrentSize() < read_size) {
         size_t allowed;
-        if (rate_limiter_ != nullptr) {
+        if (for_compaction_ && rate_limiter_ != nullptr) {
           allowed = rate_limiter_->RequestToken(
               buf.Capacity() - buf.CurrentSize(), buf.Alignment(),
               Env::IOPriority::IO_LOW, stats_, RateLimiter::OpType::kRead);
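
This last hunk confines read rate limiting in `RandomAccessFileReader::Read()` to compaction reads (`for_compaction_`). Since `use_direct_reads` now routes user reads through this direct-I/O path too, the guard keeps a reads-only rate limiter from throttling them. A minimal sketch of attaching such a limiter, mirroring the `db_test2.cc` test above (the 4MB/s budget and function name are illustrative, not from this PR):

```cpp
#include <rocksdb/options.h>
#include <rocksdb/rate_limiter.h>

// Throttle background compaction reads only; with this PR, user reads
// through RandomAccessFileReader bypass the limiter because
// for_compaction_ is false for them.
void LimitCompactionReads(rocksdb::Options* options) {
  options->rate_limiter.reset(rocksdb::NewGenericRateLimiter(
      4 << 20 /* rate_bytes_per_sec: 4MB/s, illustrative */,
      100 * 1000 /* refill_period_us */, 10 /* fairness */,
      rocksdb::RateLimiter::Mode::kReadsOnly));
}
```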
